Changes On Branch simplify-starts-with
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Changes In Branch simplify-starts-with Excluding Merge-Ins

This is equivalent to a diff from 43c4522623 to c209105f0f

2013-02-07
15:28
Divide blob length check (even number of bytes) and UTF-32 check in the 3 versions of the UTF-16 BOM functions. check-in: be6756e26b user: jan.nijtmans tags: trunk
09:39
merge trunk check-in: 8994f3680a user: jan.nijtmans tags: improve_commit_warning
09:19
If file starts with UTF-32 BOM, always consider it binary without warning. Closed-Leaf check-in: c209105f0f user: jan.nijtmans tags: simplify-starts-with
08:47
Combine 4 "starts_with_utf??_bom" functions to a single - easier to use - function "starts_with_bom". In addition, it only checks for an UTF-16 BOM if the blob has an even number of bytes. check-in: 6c417d8bf5 user: jan.nijtmans tags: simplify-starts-with
02:08
Add the test-ssh-far-side command that can be used in place of a shell for the remote side of an ssh: sync. check-in: 43c4522623 user: drh tags: trunk
00:24
Add the shell= query parameter to the ssh: scheme for cloning and syncing. check-in: 2163cd9666 user: drh tags: trunk

Changes to src/blob.c.

1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117

1118
1119
1120
1121
1122
1123
1124

1125
1126
1127
1128
1129
1130
1131
** Strip a possible byte-order-mark (BOM) from the blob. On Windows, if there
** is either no BOM at all or an (le/be) UTF-16 BOM, a conversion to UTF-8 is
** done.  If useMbcs is false and there is no BOM, the input string is assumed
** to be UTF-8 already, so no conversion is done.
*/
void blob_to_utf8_no_bom(Blob *pBlob, int useMbcs){
  char *zUtf8;
  int bomSize = 0;
  if( starts_with_utf8_bom(pBlob, &bomSize) ){
    struct Blob temp;
    zUtf8 = blob_str(pBlob) + bomSize;
    blob_zero(&temp);
    blob_append(&temp, zUtf8, -1);
    blob_swap(pBlob, &temp);
    blob_reset(&temp);
#ifdef _WIN32
  }else if( starts_with_utf16le_bom(pBlob, &bomSize) ){
    /* Make sure the blob contains two terminating 0-bytes */
    blob_append(pBlob, "", 1);
    zUtf8 = blob_str(pBlob) + bomSize;
    zUtf8 = fossil_unicode_to_utf8(zUtf8);
    blob_zero(pBlob);
    blob_append(pBlob, zUtf8, -1);
    fossil_unicode_free(zUtf8);
  }else if( starts_with_utf16be_bom(pBlob, &bomSize) ){

    unsigned int i = blob_size(pBlob);
    zUtf8 = blob_buffer(pBlob);
    while( i > 0 ){
      /* swap bytes of unicode representation */
      char zTemp = zUtf8[--i];
      zUtf8[i] = zUtf8[i-1];
      zUtf8[--i] = zTemp;

    }
    /* Make sure the blob contains two terminating 0-bytes */
    blob_append(pBlob, "", 1);
    zUtf8 = blob_str(pBlob) + bomSize;
    zUtf8 = fossil_unicode_to_utf8(zUtf8);
    blob_zero(pBlob);
    blob_append(pBlob, zUtf8, -1);







|
|







|
<
<
|
<
<
<
|
<
>
|
<
|
|
|
|
|
>







1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109


1110



1111

1112
1113

1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
** Strip a possible byte-order-mark (BOM) from the blob. On Windows, if there
** is either no BOM at all or an (le/be) UTF-16 BOM, a conversion to UTF-8 is
** done.  If useMbcs is false and there is no BOM, the input string is assumed
** to be UTF-8 already, so no conversion is done.
*/
void blob_to_utf8_no_bom(Blob *pBlob, int useMbcs){
  char *zUtf8;
  int bomSize = starts_with_bom(pBlob);
  if( bomSize == 3 ){
    struct Blob temp;
    zUtf8 = blob_str(pBlob) + bomSize;
    blob_zero(&temp);
    blob_append(&temp, zUtf8, -1);
    blob_swap(pBlob, &temp);
    blob_reset(&temp);
#ifdef _WIN32
  }else if( bomSize == 2 ){


    zUtf8 = blob_buffer(pBlob);



    if (*((unsigned short *)zUtf8) == 0xfffe) {

      /* Found BOM, but with reversed bytes */
      unsigned int i = blob_size(pBlob);

      while( i > 0 ){
        /* swap bytes of unicode representation */
        char zTemp = zUtf8[--i];
        zUtf8[i] = zUtf8[i-1];
        zUtf8[--i] = zTemp;
      }
    }
    /* Make sure the blob contains two terminating 0-bytes */
    blob_append(pBlob, "", 1);
    zUtf8 = blob_str(pBlob) + bomSize;
    zUtf8 = fossil_unicode_to_utf8(zUtf8);
    blob_zero(pBlob);
    blob_append(pBlob, zUtf8, -1);

Changes to src/checkin.c.

897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
  Blob *p,              /* The content of the file being committed. */
  int crnlOk,           /* Non-zero if CR/NL warnings should be disabled. */
  int binOk,            /* Non-zero if binary warnings should be disabled. */
  int encodingOk,        /* Non-zero if encoding warnings should be disabled. */
  const char *zFilename /* The full name of the file being committed. */
){
  int eType;              /* return value of looks_like_utf8/utf16() */
  int fUnicode;           /* return value of starts_with_utf16_bom() */
  char *zMsg;             /* Warning message */
  Blob fname;             /* Relative pathname of the file */
  static int allOk = 0;   /* Set to true to disable this routine */

  if( allOk ) return 0;
  fUnicode = starts_with_utf16_bom(p, 0);
  eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
  if( eType==0 || eType==-1 || fUnicode ){
    const char *zWarning;
    const char *zDisable;
    const char *zConvert = "c=convert/";
    Blob ans;
    char cReply;







|





|







897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
  Blob *p,              /* The content of the file being committed. */
  int crnlOk,           /* Non-zero if CR/NL warnings should be disabled. */
  int binOk,            /* Non-zero if binary warnings should be disabled. */
  int encodingOk,        /* Non-zero if encoding warnings should be disabled. */
  const char *zFilename /* The full name of the file being committed. */
){
  int eType;              /* return value of looks_like_utf8/utf16() */
  int fUnicode;           /* 1 if  blob starts with UTF-16 BOM */
  char *zMsg;             /* Warning message */
  Blob fname;             /* Relative pathname of the file */
  static int allOk = 0;   /* Set to true to disable this routine */

  if( allOk ) return 0;
  fUnicode = (starts_with_bom(p) == 2);
  eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
  if( eType==0 || eType==-1 || fUnicode ){
    const char *zWarning;
    const char *zDisable;
    const char *zConvert = "c=convert/";
    Blob ans;
    char cReply;

Changes to src/diff.c.

338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
    0xEF, 0xBB, 0xBF, 0x00, 0x00, 0x00
  };
  if( pnByte ) *pnByte = 3;
  return bom;
}

/*
** This function returns non-zero if the blob starts with a UTF-8
** byte-order-mark (BOM).
*/
int starts_with_utf8_bom(const Blob *pContent, int *pnByte){
  const char *z = blob_buffer(pContent);
  int bomSize = 0;
  const unsigned char *bom = get_utf8_bom(&bomSize);

  if( pnByte ) *pnByte = bomSize;
  if( blob_size(pContent)<bomSize ) return 0;
  return memcmp(z, bom, bomSize)==0;
}

/*
** This function returns non-zero if the blob starts with a UTF-16le or
** UTF-16be byte-order-mark (BOM).
*/
int starts_with_utf16_bom(const Blob *pContent, int *pnByte){
  const char *z = blob_buffer(pContent);
  int c1, c2;

  if( pnByte ) *pnByte = 2;
  if( blob_size(pContent)<2 ) return 0;
  c1 = z[0]; c2 = z[1];
  if( (c1==(char)0xff) && (c2==(char)0xfe) ){
    return 1;
  }else if( (c1==(char)0xfe) && (c2==(char)0xff) ){
    return 1;
  }
  return 0;
}

/*
** This function returns non-zero if the blob starts with a UTF-16le
** byte-order-mark (BOM).
*/
int starts_with_utf16le_bom(const Blob *pContent, int *pnByte){
  const char *z = blob_buffer(pContent);
  int c1, c2;

  if( pnByte ) *pnByte = 2;
  if( blob_size(pContent)<2 ) return 0;
  c1 = z[0]; c2 = z[1];
  if( (c1==(char)0xff) && (c2==(char)0xfe) ){
    return 1;
  }
  return 0;
}

/*
** This function returns non-zero if the blob starts with a UTF-16be
** byte-order-mark (BOM).
*/
int starts_with_utf16be_bom(const Blob *pContent, int *pnByte){
  const char *z = blob_buffer(pContent);
  int c1, c2;

  if( pnByte ) *pnByte = 2;
  if( blob_size(pContent)<2 ) return 0;
  c1 = z[0]; c2 = z[1];
  if( (c1==(char)0xfe) && (c2==(char)0xff) ){
    return 1;
  }
  return 0;
}

/*
** Return true if two DLine elements are identical.
*/







|
|

|

|


<
|
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
<
<

<
<
|
<
<
<
<
<
|
<
|
|
|
<
<
<
<
|
<
|
<
<
<
<
<
<
<
|
<
<
<
<
|







338
339
340
341
342
343
344
345
346
347
348
349
350
351
352

353
354














355


356


357





358

359
360
361




362

363







364




365
366
367
368
369
370
371
372
    0xEF, 0xBB, 0xBF, 0x00, 0x00, 0x00
  };
  if( pnByte ) *pnByte = 3;
  return bom;
}

/*
** This function returns detected BOM size if the blob starts with
** a UTF-8, UTF-16le or UTF-16be byte-order-mark (BOM).
*/
int starts_with_bom(const Blob *pContent){
  const char *z = blob_buffer(pContent);
  int c1, bomSize = 0;
  const unsigned char *bom = get_utf8_bom(&bomSize);


  if( (blob_size(pContent)>=bomSize)
      && (memcmp(z, bom, bomSize)==0) ){














    return bomSize;


  }


  /* Only accept UTF-16 BOM if the blob has an even number of bytes */





  if( (blob_size(pContent)<2) || (blob_size(pContent)&1) ) return 0;

  c1 = *((unsigned short *)z);
  if( (c1==0xfffe) || (c1==0xfeff) ){
    if( blob_size(pContent)>=4 ){




      /* For UTF-32 BOM, always return 0. */

      if( ((unsigned short *)z)[1] == 0 ) return 0;







    }




    return 2;
  }
  return 0;
}

/*
** Return true if two DLine elements are identical.
*/
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395

  zLimit = find_option("limit",0,1);
  if( zLimit==0 || zLimit[0]==0 ) zLimit = "-1";
  iLimit = atoi(zLimit);
  showLog = find_option("log",0,0)!=0;
  fileVers = find_option("filevers",0,0)!=0;
  db_must_be_within_tree();
  if (g.argc<3) {
    usage("FILENAME");
  }
  file_tree_name(g.argv[2], &treename, 1);
  zFilename = blob_str(&treename);
  fnid = db_int(0, "SELECT fnid FROM filename WHERE name=%Q", zFilename);
  if( fnid==0 ){
    fossil_fatal("no such file: %s", zFilename);
  }
  fid = db_int(0, "SELECT rid FROM vfile WHERE pathname=%Q", zFilename);
  if( fid==0 ){
    fossil_fatal("not part of current checkout: %s", zFilename);
  }
  cid = db_lget_int("checkout", 0);
  if (cid == 0){
    fossil_fatal("Not in a checkout");
  }
  if( iLimit<=0 ) iLimit = 1000000000;
  compute_direct_ancestors(cid, iLimit);
  mid = db_int(0, "SELECT mlink.mid FROM mlink, ancestor "
          " WHERE mlink.fid=%d AND mlink.fnid=%d AND mlink.mid=ancestor.rid"
          " ORDER BY ancestor.generation ASC LIMIT 1",







|













|







2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354

  zLimit = find_option("limit",0,1);
  if( zLimit==0 || zLimit[0]==0 ) zLimit = "-1";
  iLimit = atoi(zLimit);
  showLog = find_option("log",0,0)!=0;
  fileVers = find_option("filevers",0,0)!=0;
  db_must_be_within_tree();
  if( g.argc<3 ){
    usage("FILENAME");
  }
  file_tree_name(g.argv[2], &treename, 1);
  zFilename = blob_str(&treename);
  fnid = db_int(0, "SELECT fnid FROM filename WHERE name=%Q", zFilename);
  if( fnid==0 ){
    fossil_fatal("no such file: %s", zFilename);
  }
  fid = db_int(0, "SELECT rid FROM vfile WHERE pathname=%Q", zFilename);
  if( fid==0 ){
    fossil_fatal("not part of current checkout: %s", zFilename);
  }
  cid = db_lget_int("checkout", 0);
  if( cid == 0 ){
    fossil_fatal("Not in a checkout");
  }
  if( iLimit<=0 ) iLimit = 1000000000;
  compute_direct_ancestors(cid, iLimit);
  mid = db_int(0, "SELECT mlink.mid FROM mlink, ancestor "
          " WHERE mlink.fid=%d AND mlink.fnid=%d AND mlink.mid=ancestor.rid"
          " ORDER BY ancestor.generation ASC LIMIT 1",