Index: src/lookslike.c ================================================================== --- src/lookslike.c +++ src/lookslike.c @@ -50,10 +50,41 @@ #define LOOK_INVALID ((int)0x00000200) /* Invalid sequence was found. */ #define LOOK_BINARY (LOOK_NUL | LOOK_LONG | LOOK_SHORT) /* May be binary. */ #define LOOK_EOL (LOOK_LONE_CR | LOOK_LONE_LF | LOOK_CRLF) /* Line seps. */ #endif /* INTERFACE */ +/* Valid ranges for the byte that follows a UTF-8 lead byte, each encoded + * as a pair: first valid value, then the number of valid values. */ +#define US2A 0x80, 0x01 /* for lead byte 0xC0: only 0x80 may follow */ +#define US2B 0x80, 0x40 /* for lead bytes 0xC2-0xDF */ +#define US3A 0xA0, 0x20 /* for lead byte 0xE0 */ +#define US3B 0x80, 0x40 /* for lead bytes 0xE1-0xEF */ +#define US4A 0x90, 0x30 /* for lead byte 0xF0 */ +#define US4B 0x80, 0x40 /* for lead bytes 0xF1-0xF3 */ +#define US4C 0x80, 0x10 /* for lead byte 0xF4 */ +#define US0A 0x00, 0x00 /* for any other lead byte: size 0 rejects every following byte */ + +/* A table used for quick lookup of the definition that goes with a + * particular lead byte. It is indexed by the lead byte shifted left by one bit and truncated to 8 bits, so its first half covers 0x80-0xBF and its second half covers 0xC0-0xFF. */ +static const unsigned char lb_tab[] = { + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, + US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B, + US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, + US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, + US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, + US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B, + US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, + US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A, + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A +}; /* ** This function attempts to scan each logical line within the blob to ** determine the type 
of content it appears to contain. The return value ** is a combination of one or more of the LOOK_XXX flags (see above): @@ -135,72 +166,46 @@ } /* ** Checks for proper UTF-8. It uses the method described in: ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences -** except for the "overlong form" of \u0000 which is not considered invalid -** here: Some languages like Java and Tcl use it. This function also -** considers valid the derivatives CESU-8 & WTF-8 (as described in the -** same wikipedia article referenced previously). For UTF-8 characters -** > 7f, the variable 'c2' not necessary means the previous character. -** It's number of higher 1-bits indicate the number of continuation bytes -** that are expected to be followed. E.g. when 'c2' has a value in the range -** 0xc0..0xdf it means that 'c' is expected to contain the last continuation -** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one -** more continuation byte is expected. +** except for the "overlong form" of \u0000 which is not considered +** invalid here: Some languages like Java and Tcl use it. This function +** also considers valid the derivatives CESU-8 & WTF-8 (as described in +** the same Wikipedia article referenced previously). For UTF-8 characters +** > 0x7f, the variable 'c' does not necessarily hold the real lead byte. +** The number of its leading 1-bits determines how many continuation +** bytes are expected to follow. E.g. when 'c' has a value +** in the range 0xc0..0xdf it means that after 'c' a single continuation +** byte is expected. A value 0xe0..0xef means that after 'c' two more +** continuation bytes are expected. 
*/ -/* definitions for various UTF-8 sequence lengths */ -#define US2A 0x80, 0x80 /* for lead byte 0xC0 */ -#define US2B 0x80, 0xBF /* for lead bytes 0xC2-0xDF */ -#define US3A 0xA0, 0xBF /* for lead byte 0xE0 */ -#define US3B 0x80, 0xBF /* for lead bytes 0xE1-0xEF */ -#define US4A 0x90, 0xBF /* for lead byte 0xF0 */ -#define US4B 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */ -#define US4C 0x80, 0x8F /* for lead byte 0xF4 */ -#define US0A 0xFF, 0x00 /* for any other lead byte */ - -/* a table used for quick lookup of the definition that goes with a - * particular lead byte */ -static const unsigned char lb_tab[] = { - US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B, - US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, - US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, - US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, - US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B, - US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, - US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A, - US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A -}; - int invalid_utf8( const Blob *pContent ){ const unsigned char *z = (unsigned char *) blob_buffer(pContent); unsigned int n = blob_size(pContent); - unsigned char c, c2; + unsigned char c; /* Current lead byte to be handled: either read from the input or synthesized below. */ if( n==0 ) return 0; /* Empty file -> OK */ c = *z; while( --n>0 ){ - c2 = c; - c = *++z; - if( c2>=0xC0 ){ - const unsigned char *def = &lb_tab[(2*c2)-0x180]; - if( (c<*def) || (c>*++def) ){ + if( c>=0x80 ){ + const unsigned char *def; /* pointer to the start/size pair in lb_tab */ + + c <<= 1; /* unsigned char shift: doubles c and drops its highest bit, giving the lb_tab index */ + def = &lb_tab[c]; /* look up the valid range for the byte that follows c */ + if( (unsigned int)(*++z-def[0])>=def[1] ){ return LOOK_INVALID; /* Invalid UTF-8 */ } - if( c2>=0xe0 ){ - c = (c2<<1)|3; - }else{ - c = ' '; - } - }else if( c2>=0x80 ){ - return LOOK_INVALID; + c = (c>=0xC0) ? (c|3) : ' '; /* More bytes expected: the shifted value with its two low bits set acts as the lead byte for the rest. Otherwise use a blank so the next byte is read as a fresh lead byte. */ + } else { + c = *++z; } } - return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. 
*/ + return (c>=0x80) ? LOOK_INVALID : 0; /* Final lead byte must be ASCII, otherwise the data ends with an invalid or unfinished sequence. */ } /* ** Define the type needed to represent a Unicode (UTF-16) character. */