Fossil

Check-in [7d881d8280]
Login

Check-in [7d881d8280]

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Adjustments to looks_like_utf16 to handle wchar_t being missing or not 2 bytes.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 7d881d82802ec8cf3f6fc38a35a1ed1fd1423560
User & Date: mistachkin 2012-11-02 17:22:37.542
Context
2012-11-02
17:37
Allow commit warning for binary data to be disabled via the 'binary-glob' setting. ... (check-in: d25f6ddf35 user: mistachkin tags: trunk)
17:22
Adjustments to looks_like_utf16 to handle wchar_t being missing or not 2 bytes. ... (check-in: 7d881d8280 user: mistachkin tags: trunk)
08:31
Speed up mimetype_from_content() by using a 256-byte array. Mark VT and Ctrl-Z as text bytes, not binary. Decrease maximum UTF-16 line length to 2731. Check for FFFF in addition to 0 in UTF-16/binary detection. ... (check-in: d804902f23 user: jan.nijtmans tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/diff.c.
185
186
187
188
189
190
191











192
193
194
195
196
197
198
**         does not understand UTF-16, it may falsely consider UTF-16 text
**         to be binary.
**
** (-1) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs; however, the
**         encoding may not be UTF-8.
**











*/
int looks_like_utf8(const Blob *pContent){
  const char *z = blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
  int j, c;
  int result = 1;  /* Assume UTF-8 text with no CR/NL */








>
>
>
>
>
>
>
>
>
>
>







185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
**         does not understand UTF-16, it may falsely consider UTF-16 text
**         to be binary.
**
** (-1) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs; however, the
**         encoding may not be UTF-8.
**
************************************ WARNING **********************************
**
** This function does not validate that the blob content is properly formed
** UTF-8.  It assumes that all code points are the same size.  It does not
** validate any code points.  It makes no attempt to detect if any [invalid]
** switches between UTF-8 and other encodings occur.
**
** The only code points that this function cares about are the NUL character,
** carriage-return, and line-feed.
**
************************************ WARNING **********************************
*/
int looks_like_utf8(const Blob *pContent){
  const char *z = blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
  int j, c;
  int result = 1;  /* Assume UTF-8 text with no CR/NL */

219
220
221
222
223
224
225











226
227
228
229
230

231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260











261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
  if( j>LENGTH_MASK ){
    return 0;  /* Very long line -> binary */
  }
  return result;  /* No problems seen -> not binary */
}

/*











** Maximum length of a line in a text file, in UTF-16 characters.  (2731)
** The number of bytes represented by this value after conversion to
** UTF-8 (which can increase the size by 50%) cannot exceed LENGTH_MASK
** bytes, because that is the line buffer size used by the diff engine.
*/

/* Line-length limit in UTF-16 characters: LENGTH_MASK (a byte count) divided
** by 3, presumably budgeting up to 3 bytes of UTF-8 per UTF-16 character
** (TODO confirm against the definition of LENGTH_MASK). */
#define UTF16_LENGTH_MASK     (LENGTH_MASK/3)

/*
** The carriage-return and line-feed characters, and the U+FFFF
** noncharacter, expressed as 16-bit UTF-16 code units.
**
** CR and LF each come in two forms: the plain value (UTF16BE_xx) and the
** same value with its two bytes swapped (UTF16LE_xx).  looks_like_utf16()
** compares against both forms, so a line ending is recognized whichever
** byte order the text was stored in.
**
** The swap distance is fixed at 8 bits because a UTF-16 code unit is always
** two bytes wide, independent of sizeof(wchar_t).  Deriving the shift from
** sizeof(wchar_t) yields 0x000D0000 / 0x000A0000 on platforms where wchar_t
** is 4 bytes.  For the same reason UTF16_FFFF is spelled as 0xFFFF rather
** than (wchar_t)-1, which is not 0xFFFF when wchar_t is wider than 16 bits.
**
** NOTE(review): code that walks the buffer through a wchar_t pointer still
** assumes a 2-byte wchar_t -- confirm on platforms where that does not hold.
*/
#define UTF16BE_CR  ((wchar_t)'\r')
#define UTF16BE_LF  ((wchar_t)'\n')
#define UTF16LE_CR  (((wchar_t)'\r')<<8)
#define UTF16LE_LF  (((wchar_t)'\n')<<8)
#define UTF16_FFFF  ((wchar_t)0xFFFF)

/*
** This function attempts to scan each logical line within the blob to
** determine the type of content it appears to contain.  Possible return
** values are:
**
**  (1) -- The content appears to consist entirely of text, with lines
**         delimited by line-feed characters; however, the encoding may
**         not be UTF-16.
**
**  (0) -- The content appears to be binary because it contains embedded
**         NUL characters or an extremely long line.  Since this function
**         does not understand UTF-8, it may falsely consider UTF-8 text
**         to be binary.
**
** (-1) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs; however, the
**         encoding may not be UTF-16.
**











*/
int looks_like_utf16(const Blob *pContent){
  const wchar_t *z = (wchar_t *)blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
  int j, c;
  int result = 1;  /* Assume UTF-16 text with no CR/NL */

  /* Check individual lines.
  */
  if( n==0 ) return result;  /* Empty file -> text */
  if( n%2 ) return 0;  /* Odd number of bytes -> binary (or UTF-8) */
  c = *z;
  if( c==0 ) return 0;  /* NUL character in a file -> binary */
  j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
  while( (n-=2)>0 ){
    c = *++z; ++j;
    if( c==0 || c==UTF16_FFFF ) return 0;  /* NUL/FFFF character in a file -> binary */
    if( c==UTF16BE_LF || c==UTF16LE_LF ){
      int c2 = z[-1];
      if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
        result = -1;  /* Contains CR/NL, continue */
      }
      if( j>UTF16_LENGTH_MASK ){
        return 0;  /* Very long line -> binary */







>
>
>
>
>
>
>
>
>
>
>
|
|
<


>
|





|
|
|
|
<



















>
>
>
>
>
>
>
>
>
>
>


|













|







230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249

250
251
252
253
254
255
256
257
258
259
260
261
262

263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
  if( j>LENGTH_MASK ){
    return 0;  /* Very long line -> binary */
  }
  return result;  /* No problems seen -> not binary */
}

/*
** WCHAR_T is the integer type used to hold one Unicode (UTF-16) character.
** A WCHAR_T macro that is already defined is left untouched.  Otherwise
** Windows builds (_WIN32) use wchar_t and every other build uses
** unsigned short.
*/
#if !defined(WCHAR_T)
#  if defined(_WIN32)
#    define WCHAR_T wchar_t
#  else
#    define WCHAR_T unsigned short
#  endif
#endif

/*
** Maximum length of a line in a text file, in UTF-16 characters.
**
** UTF16_LENGTH_MASK_SZ is LENGTH_MASK_SZ reduced by the number of extra
** bytes a WCHAR_T occupies compared to a char (one bit less for a 2-byte
** WCHAR_T).  UTF16_LENGTH_MASK is then (1<<UTF16_LENGTH_MASK_SZ)-1, i.e.
** one less than a power of two; with a 2-byte WCHAR_T and a LENGTH_MASK_SZ
** of 13 that would be 4095 (TODO confirm -- LENGTH_MASK_SZ is defined
** elsewhere).
**
** The number of bytes represented by this value cannot exceed LENGTH_MASK
** bytes, because that is the line buffer size used by the diff engine.
*/
#define UTF16_LENGTH_MASK_SZ  (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
#define UTF16_LENGTH_MASK     ((1<<UTF16_LENGTH_MASK_SZ)-1)

/*
** Carriage-return and line-feed as UTF-16 code units, in both byte orders.
** The UTF16BE_xx macros hold the plain character value converted to WCHAR_T;
** the UTF16LE_xx macros hold that same value moved up by the width of one
** char (8 bits), i.e. with its two bytes swapped.
*/
#define UTF16BE_CR  ((WCHAR_T)'\r')
#define UTF16BE_LF  ((WCHAR_T)'\n')
#define UTF16LE_CR  (UTF16BE_CR << (8*sizeof(char)))
#define UTF16LE_LF  (UTF16BE_LF << (8*sizeof(char)))


/*
** This function attempts to scan each logical line within the blob to
** determine the type of content it appears to contain.  Possible return
** values are:
**
**  (1) -- The content appears to consist entirely of text, with lines
**         delimited by line-feed characters; however, the encoding may
**         not be UTF-16.
**
**  (0) -- The content appears to be binary because it contains embedded
**         NUL characters or an extremely long line.  Since this function
**         does not understand UTF-8, it may falsely consider UTF-8 text
**         to be binary.
**
** (-1) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs; however, the
**         encoding may not be UTF-16.
**
************************************ WARNING **********************************
**
** This function does not validate that the blob content is properly formed
** UTF-16.  It assumes that all code points are the same size.  It does not
** validate any code points.  It makes no attempt to detect if any [invalid]
** switches between the UTF-16be and UTF-16le encodings occur.
**
** The only code points that this function cares about are the NUL character,
** carriage-return, and line-feed.
**
************************************ WARNING **********************************
*/
int looks_like_utf16(const Blob *pContent){
  const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
  int j, c;
  int result = 1;  /* Assume UTF-16 text with no CR/NL */

  /* Check individual lines.
  */
  if( n==0 ) return result;  /* Empty file -> text */
  if( n%2 ) return 0;  /* Odd number of bytes -> binary (or UTF-8) */
  c = *z;
  if( c==0 ) return 0;  /* NUL character in a file -> binary */
  j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
  while( (n-=2)>0 ){
    c = *++z; ++j;
    if( c==0 ) return 0;  /* NUL character in a file -> binary */
    if( c==UTF16BE_LF || c==UTF16LE_LF ){
      int c2 = z[-1];
      if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
        result = -1;  /* Contains CR/NL, continue */
      }
      if( j>UTF16_LENGTH_MASK ){
        return 0;  /* Very long line -> binary */