Fossil

Check-in [d804902f23]
Login

Check-in [d804902f23]

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: speedup mimetype_from_content() by using a 256 byte array. Mark VT and Ctrl-Z as text bytes, not binary. Decrease maximum UTF-16 line length to 2731. Check for FFFF in addition to 0, in UTF-16/binary detection.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: d804902f2333e4198223063c27cbbc17ec81f5ac
User & Date: jan.nijtmans 2012-11-02 08:31:20.275
Context
2012-11-02
17:22
Adjustments to looks_like_utf16 to handle wchar_t being missing or not 2 bytes. ... (check-in: 7d881d8280 user: mistachkin tags: trunk)
10:55
Generate warning when to-be-committed file contains invalid UTF-8 ... (check-in: 4e86b06a9f user: jan.nijtmans tags: improve_commit_warning)
08:31
speedup mimetype_from_content() by using a 256 byte array. Mark VT and Ctrl-Z as text bytes, not binary. Decrease maximum UTF-16 line length to 2731. Check for FFFF in addition to 0, in UTF-16/binary detection. ... (check-in: d804902f23 user: jan.nijtmans tags: trunk)
03:30
Add the new moderation permissions to the list maintained by the JSON code. ... (check-in: 1cc7e8ce29 user: mistachkin tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/diff.c.
219
220
221
222
223
224
225
226
227

228
229
230
231
232
233
234
235
236
237
238
239
240

241
242
243
244
245
246
247
  if( j>LENGTH_MASK ){
    return 0;  /* Very long line -> binary */
  }
  return result;  /* No problems seen -> not binary */
}

/*
** Maximum length of a line in a text file, in UTF-16 characters.  (4096)
** The number of bytes represented by this value cannot exceed LENGTH_MASK

** bytes, because that is the line buffer size used by the diff engine.
*/
#define UTF16_LENGTH_MASK_SZ  (LENGTH_MASK_SZ-1)
#define UTF16_LENGTH_MASK     ((1<<UTF16_LENGTH_MASK_SZ)-1)

/*
** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
** encodings.
*/
#define UTF16BE_CR  ((wchar_t)'\r')
#define UTF16BE_LF  ((wchar_t)'\n')
#define UTF16LE_CR  (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
#define UTF16LE_LF  (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))


/*
** This function attempts to scan each logical line within the blob to
** determine the type of content it appears to contain.  Possible return
** values are:
**
**  (1) -- The content appears to consist entirely of text, with lines







|
|
>


<
|









>







219
220
221
222
223
224
225
226
227
228
229
230

231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
  if( j>LENGTH_MASK ){
    return 0;  /* Very long line -> binary */
  }
  return result;  /* No problems seen -> not binary */
}

/*
** Maximum length of a line in a text file, in UTF-16 characters.  (2731)
** The number of bytes represented by this value after conversion to
** UTF-8 (which can increase the size by 50%) cannot exceed LENGTH_MASK
** bytes, because that is the line buffer size used by the diff engine.
*/

#define UTF16_LENGTH_MASK     (LENGTH_MASK/3)

/*
** The carriage-return / line-feed characters in the UTF-16be and UTF-16le
** encodings.
*/
#define UTF16BE_CR  ((wchar_t)'\r')
#define UTF16BE_LF  ((wchar_t)'\n')
#define UTF16LE_CR  (((wchar_t)'\r')<<(sizeof(wchar_t)<<2))
#define UTF16LE_LF  (((wchar_t)'\n')<<(sizeof(wchar_t)<<2))
#define UTF16_FFFF  ((wchar_t)-1)

/*
** This function attempts to scan each logical line within the blob to
** determine the type of content it appears to contain.  Possible return
** values are:
**
**  (1) -- The content appears to consist entirely of text, with lines
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
  if( n==0 ) return result;  /* Empty file -> text */
  if( n%2 ) return 0;  /* Odd number of bytes -> binary (or UTF-8) */
  c = *z;
  if( c==0 ) return 0;  /* NUL character in a file -> binary */
  j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
  while( (n-=2)>0 ){
    c = *++z; ++j;
    if( c==0 ) return 0;  /* NUL character in a file -> binary */
    if( c==UTF16BE_LF || c==UTF16LE_LF ){
      int c2 = z[-1];
      if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
        result = -1;  /* Contains CR/NL, continue */
      }
      if( j>UTF16_LENGTH_MASK ){
        return 0;  /* Very long line -> binary */







|







270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
  if( n==0 ) return result;  /* Empty file -> text */
  if( n%2 ) return 0;  /* Odd number of bytes -> binary (or UTF-8) */
  c = *z;
  if( c==0 ) return 0;  /* NUL character in a file -> binary */
  j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
  while( (n-=2)>0 ){
    c = *++z; ++j;
    if( c==0 || c==UTF16_FFFF ) return 0;  /* NUL/FFFF character in a file -> binary */
    if( c==UTF16BE_LF || c==UTF16LE_LF ){
      int c2 = z[-1];
      if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
        result = -1;  /* Contains CR/NL, continue */
      }
      if( j>UTF16_LENGTH_MASK ){
        return 0;  /* Very long line -> binary */
Changes to src/doc.c.
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
** For any other binary type, return "unknown/unknown".
*/
const char *mimetype_from_content(Blob *pBlob){
  int i;
  int n;
  const unsigned char *x;

  static const char isBinary[] = {
     1, 1, 1, 1,  1, 1, 1, 1,    1, 0, 0, 1,  0, 0, 1, 1,
     1, 1, 1, 1,  1, 1, 1, 1,    1, 1, 1, 0,  1, 1, 1, 1,
  };

  /* A table of mimetypes based on file content prefixes
  */
  static const struct {
    const char *zPrefix;       /* The file prefix */
    int size;                  /* Length of the prefix */
    const char *zMimetype;     /* The corresponding mimetype */
  } aMime[] = {
    { "GIF87a",                  6, "image/gif"  },
    { "GIF89a",                  6, "image/gif"  },
    { "\211PNG\r\n\032\n",       8, "image/png"  },
    { "\377\332\377",            3, "image/jpeg" },
    { "\377\330\377",            3, "image/jpeg" },
  };

  x = (const unsigned char*)blob_buffer(pBlob);
  n = blob_size(pBlob);
  for(i=0; i<n; i++){
    unsigned char c = x[i];
    if( c<=0x1f && isBinary[c] ){
      break;
    }
  }
  if( i>=n ){
    return 0;   /* Plain text */
  }
  for(i=0; i<sizeof(aMime)/sizeof(aMime[0]); i++){







|
|
|




















|







33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
** For any other binary type, return "unknown/unknown".
*/
const char *mimetype_from_content(Blob *pBlob){
  int i;
  int n;
  const unsigned char *x;

  static const char isBinary[256] = {
     1, 1, 1, 1,  1, 1, 1, 1,    1, 0, 0, 0,  0, 0, 1, 1,
     1, 1, 1, 1,  1, 1, 1, 1,    1, 1, 0, 0,  1, 1, 1, 1
  };

  /* A table of mimetypes based on file content prefixes
  */
  static const struct {
    const char *zPrefix;       /* The file prefix */
    int size;                  /* Length of the prefix */
    const char *zMimetype;     /* The corresponding mimetype */
  } aMime[] = {
    { "GIF87a",                  6, "image/gif"  },
    { "GIF89a",                  6, "image/gif"  },
    { "\211PNG\r\n\032\n",       8, "image/png"  },
    { "\377\332\377",            3, "image/jpeg" },
    { "\377\330\377",            3, "image/jpeg" },
  };

  x = (const unsigned char*)blob_buffer(pBlob);
  n = blob_size(pBlob);
  for(i=0; i<n; i++){
    unsigned char c = x[i];
    if( isBinary[c] ){
      break;
    }
  }
  if( i>=n ){
    return 0;   /* Plain text */
  }
  for(i=0; i<sizeof(aMime)/sizeof(aMime[0]); i++){