27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
|
#if INTERFACE
/*
** looks_like_binary(blob) evaluates to non-zero when looks_like_utf8(),
** scanning the blob with LOOK_BINARY as its stop flags, reports at least
** one of the LOOK_BINARY flags - meaning the content MAY be binary.  It
** evaluates to zero when none of those flags were reported.
*/
#define looks_like_binary(blob) \
(LOOK_NONE != (LOOK_BINARY & looks_like_utf8((blob), LOOK_BINARY)))
/*
** Output flags for the looks_like_utf8() and looks_like_utf16() routines used
** to convey status information about the blob content.  The routines start
** from LOOK_NONE and OR individual flags into a single int result as the
** corresponding conditions are found.
*/
#define LOOK_NONE ((int)0x00000000) /* Nothing special was found. */
#define LOOK_NUL ((int)0x00000001) /* One or more NUL chars were found. */
|
|
|
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
|
#if INTERFACE
/*
** looks_like_binary(blob) evaluates to non-zero when looks_like_utf8(),
** scanning the blob with LOOK_BINARY as its stop flags and with verbose
** output disabled (third argument 0), reports at least one of the
** LOOK_BINARY flags - meaning the content MAY be binary.  It evaluates to
** zero when none of those flags were reported.
*/
#define looks_like_binary(blob) \
(LOOK_NONE != (LOOK_BINARY & looks_like_utf8((blob), LOOK_BINARY, 0)))
/*
** Output flags for the looks_like_utf8() and looks_like_utf16() routines used
** to convey status information about the blob content.  The routines start
** from LOOK_NONE and OR individual flags into a single int result as the
** corresponding conditions are found.
*/
#define LOOK_NONE ((int)0x00000000) /* Nothing special was found. */
#define LOOK_NUL ((int)0x00000001) /* One or more NUL chars were found. */
|
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
|
** carriage-return, and line-feed.
**
** This function examines the contents of the blob until one of the flags
** specified in "stopFlags" is set.
**
************************************ WARNING **********************************
*/
int looks_like_utf8(const Blob *pContent, int stopFlags){
const char *z = blob_buffer(pContent);
unsigned int n = blob_size(pContent);
int j, c, flags = LOOK_NONE; /* Assume UTF-8 text, prove otherwise */
if( n==0 ) return flags; /* Empty file -> text */
c = *z;
if( c==0 ){
flags |= LOOK_NUL; /* NUL character in a file -> binary */
}else if( c=='\r' ){
flags |= LOOK_CR;
if( n<=1 || z[1]!='\n' ){
flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */
}
}
j = (c!='\n');
if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */
while( !(flags&stopFlags) && --n>0 ){
int c2 = c;
c = *++z; ++j;
if( c==0 ){
flags |= LOOK_NUL; /* NUL character in a file -> binary */
}else if( c=='\n' ){
flags |= LOOK_LF;
if( c2=='\r' ){
flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */
}else{
flags |= LOOK_LONE_LF;
}
if( j>LENGTH_MASK ){
flags |= LOOK_LONG; /* Very long line -> binary */
}
j = 0;
}else if( c=='\r' ){
flags |= LOOK_CR;
if( n<=1 || z[1]!='\n' ){
flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */
}
}
}
if( n ){
flags |= LOOK_SHORT; /* The whole blob was not examined */
}
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
|
** carriage-return, and line-feed.
**
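** This function examines the contents of the blob until one of the flags
** specified in "stopFlags" is set.  When "fVerbose" is non-zero,
** fossil_print() is used to report where conditions such as NUL, CRLF,
** lone LF, lone CR, and overlong lines are encountered.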
**
************************************ WARNING **********************************
*/
int looks_like_utf8(const Blob *pContent, int stopFlags, int fVerbose){
const char *z = blob_buffer(pContent);
unsigned int n = blob_size(pContent);
int j, c, flags = LOOK_NONE; /* Assume UTF-8 text, prove otherwise */
int nLine = 1;
if( n==0 ) return flags; /* Empty file -> text */
c = *z;
if( c==0 ){
flags |= LOOK_NUL; /* NUL character in a file -> binary */
if( fVerbose ) fossil_print("NUL at start\n");
}else if( c=='\r' ){
flags |= LOOK_CR;
if( fVerbose ) fossil_print("CR at start\n");
if( n<=1 || z[1]!='\n' ){
flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */
if( fVerbose ) fossil_print("Lone CR at start\n");
}
}
j = (c!='\n');
if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */
while( !(flags&stopFlags) && --n>0 ){
int c2 = c;
c = *++z; ++j;
if( c==0 ){
if( fVerbose && !(flags&LOOK_NUL) ){
fossil_print("NUL on line %d\n", nLine);
}
flags |= LOOK_NUL; /* NUL character in a file -> binary */
}else if( c=='\n' ){
flags |= LOOK_LF;
if( c2=='\r' ){
if( fVerbose && !(flags&LOOK_CRLF) ){
fossil_print("CRLF on line %d\n", nLine);
}
flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */
}else{
if( fVerbose && !(flags&LOOK_LONE_LF) ){
fossil_print("Lone LF on line %d\n", nLine);
}
flags |= LOOK_LONE_LF;
}
if( j>LENGTH_MASK ){
if( fVerbose && !(flags&LOOK_LONG) ){
fossil_print("Line %d is longer than %d bytes\n", nLine, j);
}
flags |= LOOK_LONG; /* Very long line -> binary */
}
++nLine;
j = 0;
}else if( c=='\r' ){
flags |= LOOK_CR;
if( n<=1 || z[1]!='\n' ){
if( fVerbose && !(flags&LOOK_LONE_CR) ){
fossil_print("Lone CR on line %d\n", nLine);
}
flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */
}
}
}
if( n ){
flags |= LOOK_SHORT; /* The whole blob was not examined */
}
|
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
|
** Usage: %fossil test-looks-like-utf FILENAME
**
** Options:
** -n|--limit N Repeat looks-like function N times, for
** performance measurement. Default = 1
** --utf8 Ignoring BOM and file size, force UTF-8 checking
** --utf16 Ignoring BOM and file size, force UTF-16 checking
**
** FILENAME is the name of a file to check for textual content in the UTF-8
** and/or UTF-16 encodings.
*/
void looks_like_utf_test_cmd(void){
Blob blob; /* the contents of the specified file */
int fUtf8 = 0; /* return value of starts_with_utf8_bom() */
int fUtf16 = 0; /* return value of starts_with_utf16_bom() */
int fUnicode = 0; /* return value of could_be_utf16() */
int lookFlags = 0; /* output flags from looks_like_utf8/utf16() */
int bRevUtf16 = 0; /* non-zero -> UTF-16 byte order reversed */
int fForceUtf8 = find_option("utf8",0,0)!=0;
int fForceUtf16 = find_option("utf16",0,0)!=0;
const char *zCount = find_option("limit","n",1);
int nRepeat = 1;
if( g.argc!=3 ) usage("FILENAME");
if( zCount ){
nRepeat = atoi(zCount);
}
blob_read_from_file(&blob, g.argv[2], ExtFILE);
while( --nRepeat >= 0 ){
fUtf8 = starts_with_utf8_bom(&blob, 0);
fUtf16 = starts_with_utf16_bom(&blob, 0, &bRevUtf16);
if( fForceUtf8 ){
fUnicode = 0;
}else{
fUnicode = could_be_utf16(&blob, 0) || fForceUtf16;
}
if( fUnicode ){
lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
}else{
lookFlags = looks_like_utf8(&blob, 0) | invalid_utf8(&blob);
}
}
fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
fossil_print("Starts with UTF-16 BOM: %s\n",
fUtf16?(bRevUtf16?"reversed":"yes"):"no");
fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8",
|
>
>
|
|
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
|
** Usage: %fossil test-looks-like-utf FILENAME
**
** Options:
** -n|--limit N Repeat looks-like function N times, for
** performance measurement. Default = 1
** --utf8 Ignoring BOM and file size, force UTF-8 checking
** --utf16 Ignoring BOM and file size, force UTF-16 checking
** -v|--verbose Report the line numbers where each flag is first set
**
** FILENAME is the name of a file to check for textual content in the UTF-8
** and/or UTF-16 encodings.
*/
void looks_like_utf_test_cmd(void){
Blob blob; /* the contents of the specified file */
int fUtf8 = 0; /* return value of starts_with_utf8_bom() */
int fUtf16 = 0; /* return value of starts_with_utf16_bom() */
int fUnicode = 0; /* return value of could_be_utf16() */
int lookFlags = 0; /* output flags from looks_like_utf8/utf16() */
int bRevUtf16 = 0; /* non-zero -> UTF-16 byte order reversed */
int fForceUtf8 = find_option("utf8",0,0)!=0;
int fForceUtf16 = find_option("utf16",0,0)!=0;
const char *zCount = find_option("limit","n",1);
int fVerbose = find_option("verbose","v",0)!=0;
int nRepeat = 1;
if( g.argc!=3 ) usage("FILENAME");
if( zCount ){
nRepeat = atoi(zCount);
}
blob_read_from_file(&blob, g.argv[2], ExtFILE);
while( --nRepeat >= 0 ){
fUtf8 = starts_with_utf8_bom(&blob, 0);
fUtf16 = starts_with_utf16_bom(&blob, 0, &bRevUtf16);
if( fForceUtf8 ){
fUnicode = 0;
}else{
fUnicode = could_be_utf16(&blob, 0) || fForceUtf16;
}
if( fUnicode ){
lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
}else{
lookFlags = looks_like_utf8(&blob, 0, fVerbose) | invalid_utf8(&blob);
}
}
fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
fossil_print("Starts with UTF-16 BOM: %s\n",
fUtf16?(bRevUtf16?"reversed":"yes"):"no");
fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8",
|