Index: src/blob.c ================================================================== --- src/blob.c +++ src/blob.c @@ -1092,15 +1092,17 @@ /* ** Strip a possible byte-order-mark (BOM) from the blob. On Windows, if there ** is either no BOM at all or an (le/be) UTF-16 BOM, a conversion to UTF-8 is ** done. If useMbcs is false and there is no BOM, the input string is assumed ** to be UTF-8 already, so no conversion is done. +** If useMbcs is 2, any BOM is replaced by the UTF-8 BOM */ void blob_to_utf8_no_bom(Blob *pBlob, int useMbcs){ char *zUtf8; int bomSize = 0; int bomReverse = 0; + if( starts_with_utf8_bom(pBlob, &bomSize) ){ struct Blob temp; zUtf8 = blob_str(pBlob) + bomSize; blob_zero(&temp); blob_append(&temp, zUtf8, -1); @@ -1122,10 +1124,14 @@ /* Make sure the blob contains two terminating 0-bytes */ blob_append(pBlob, "", 1); zUtf8 = blob_str(pBlob) + bomSize; zUtf8 = fossil_unicode_to_utf8(zUtf8); blob_zero(pBlob); + if( useMbcs>1 ){ + const unsigned char *bom = get_utf8_bom(&bomSize); + blob_append(pBlob, (char*)bom, bomSize); + } blob_append(pBlob, zUtf8, -1); fossil_unicode_free(zUtf8); #endif /* _WIN32 || __CYGWIN__ */ #if defined(_WIN32) }else if( useMbcs ){ Index: src/checkin.c ================================================================== --- src/checkin.c +++ src/checkin.c @@ -919,24 +919,24 @@ if( fUnicode ){ lookFlags = looks_like_utf16(p, bReverse); }else{ lookFlags = looks_like_utf8(p); } - if( lookFlags&(LOOK_BINARY|LOOK_LONG|LOOK_LONE_CR|LOOK_CRLF) || fUnicode ){ + if( !lookFlags || lookFlags&(LOOK_LONG|LOOK_LONE_CR|LOOK_CRLF|LOOK_UNICODE) ){ const char *zWarning; const char *zDisable; const char *zConvert = "c=convert/"; Blob ans; char cReply; - if( lookFlags&(LOOK_BINARY|LOOK_LONG) ){ + if( !lookFlags || (lookFlags&LOOK_LONG) ){ if( binOk ){ return 0; /* We don't want binary warnings for this file. 
*/ } - if( (lookFlags&LOOK_LONE_CR) && !(lookFlags&LOOK_NUL) ){ + if( lookFlags&LOOK_LONE_CR ){ zWarning = "CR line endings (would be handled as binary)"; - }else if( (lookFlags&LOOK_LONG) && !(lookFlags&LOOK_NUL) ){ + }else if( lookFlags&LOOK_LONG ){ zWarning = "long lines"; zConvert = ""; /* We cannot convert binary files. */ }else{ zWarning = "binary data"; zConvert = ""; /* We cannot convert binary files. */ Index: src/diff.c ================================================================== --- src/diff.c +++ src/diff.c @@ -48,10 +48,13 @@ ** here for consistency. */ #define DIFF_CANNOT_COMPUTE_BINARY \ "cannot compute difference between binary files\n" +#define DIFF_CANNOT_COMPUTE_ENCODING \ + "cannot compute difference between files with different encodings\n" + #define DIFF_CANNOT_COMPUTE_SYMLINK \ "cannot compute difference between symlink and regular file\n" #define DIFF_TOO_MANY_CHANGES_TXT \ "more than 10,000 changes\n" @@ -58,29 +61,38 @@ #define DIFF_TOO_MANY_CHANGES_HTML \ "

More than 10,000 changes

\n" /* -** This macro is designed to return non-zero if the specified blob contains -** data that MAY be binary in nature; otherwise, zero will be returned. +** This macro is designed to set lookFlags to zero if the specified blob is +** binary in nature (contains NUL bytes). */ -#define looks_like_binary(blob) ((looks_like_utf8(blob)&LOOK_BINARY)!=LOOK_NONE) +#define looks_like_text(lookFlags, blob) \ + do { \ + int result = 0; \ + if (could_be_utf16(blob, &result)) { \ + result = looks_like_utf16(blob, result); \ + }else{ \ + result = looks_like_utf8(blob); \ + } \ + (lookFlags) = result&(LOOK_TEXT|LOOK_LONG); \ + }while(0) /* ** Output flags for the looks_like_utf8() and looks_like_utf16() routines used ** to convey status information about the blob content. */ #define LOOK_NONE ((int)0x00000000) /* Nothing special was found. */ -#define LOOK_NUL ((int)0x00000001) /* One or more NUL chars were found. */ -#define LOOK_CR ((int)0x00000002) /* One or more CR chars were found. */ -#define LOOK_LONE_CR ((int)0x00000004) /* An unpaired CR char was found. */ -#define LOOK_LF ((int)0x00000008) /* One or more LF chars were found. */ -#define LOOK_LONE_LF ((int)0x00000010) /* An unpaired CR char was found. */ -#define LOOK_CRLF ((int)0x00000020) /* One or more CR/LF pairs were found. */ -#define LOOK_LONG ((int)0x00000040) /* An over length line was found. */ -#define LOOK_ODD ((int)0x00000080) /* An odd number of bytes was found. */ -#define LOOK_BINARY (LOOK_NUL | LOOK_LONG) /* Binary. */ +#define LOOK_UNICODE ((int)0x00000002) /* Might contain valid Unicode. */ +#define LOOK_TEXT ((int)0x00000003) /* 0=binary,1=text, 2=UTF16, 3=reversed-UTF16 */ +#define LOOK_CR ((int)0x00000004) /* One or more CR chars were found. */ +#define LOOK_LONE_CR ((int)0x00000008) /* An unpaired CR char was found. */ +#define LOOK_LF ((int)0x00000010) /* One or more LF chars were found. */ +#define LOOK_LONE_LF ((int)0x00000020) /* An unpaired LF char was found. 
*/ +#define LOOK_CRLF ((int)0x00000040) /* One or more CR/LF pairs were found. */ +#define LOOK_LONG ((int)0x00000080) /* An over length line was found. */ +#define LOOK_ODD ((int)0x00000100) /* An odd number of bytes was found. */ #endif /* INTERFACE */ /* ** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes) */ @@ -235,16 +247,16 @@ ************************************ WARNING ********************************** */ int looks_like_utf8(const Blob *pContent){ const char *z = blob_buffer(pContent); unsigned int n = blob_size(pContent); - int j, c, flags = LOOK_NONE; /* Assume UTF-8 text, prove otherwise */ + int j, c, flags = 1; /* Assume UTF-8 text, prove otherwise */ if( n==0 ) return flags; /* Empty file -> text */ c = *z; if( c==0 ){ - flags |= LOOK_NUL; /* NUL character in a file -> binary */ + return 0; /* NUL character in a file -> binary */ }else if( c=='\r' ){ flags |= LOOK_CR; if( n<=1 || z[1]!='\n' ){ flags |= LOOK_LONE_CR; /* More chars, next char is not LF */ } @@ -253,11 +265,11 @@ if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */ while( --n>0 ){ int c2 = c; c = *++z; ++j; if( c==0 ){ - flags |= LOOK_NUL; /* NUL character in a file -> binary */ + return 0; /* NUL character in a file -> binary */ }else if( c=='\n' ){ flags |= LOOK_LF; if( c2=='\r' ){ flags |= LOOK_CRLF; /* Found LF preceded by CR */ }else{ @@ -290,16 +302,16 @@ # define WCHAR_T unsigned short # endif #endif /* -** Maximum length of a line in a text file, in UTF-16 characters. (4096) -** The number of bytes represented by this value cannot exceed LENGTH_MASK -** bytes, because that is the line buffer size used by the diff engine. +** Maximum length of a line in a text file, in UTF-16 characters. (2731) +** The number of characters represented by this value cannot exceed +** LENGTH_UTF16_LENGTH_MASK characters, because when converting UTF-16 +** to UTF-8 it could overflow the line buffer used by the diff engine. 
*/ -#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char))) -#define UTF16_LENGTH_MASK ((1< text */ if( n%sizeof(WCHAR_T) ){ flags |= LOOK_ODD; /* Odd number of bytes -> binary (UTF-8?) */ if( n binary (UTF-8?) */ } c = *z; - if( c==0 ){ - flags |= LOOK_NUL; /* NUL character in a file -> binary */ - }else if( bReverse ){ + if( c==0 ) + return 0; /* NUL character in a file -> binary */ + if( bReverse ){ c = UTF16_SWAP(c); } j = (c!='\n'); if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */ while( 1 ){ @@ -359,13 +371,13 @@ int c2 = c; n -= sizeof(WCHAR_T); if( n binary */ - }else if( bReverse ){ + if( c==0 ) + return 0; /* NUL character in a file -> binary */ + if( bReverse ){ c = UTF16_SWAP(c); } if( c=='\n' ){ if( c2=='\r' ){ flags |= (LOOK_CRLF | LOOK_CR | LOOK_LF); @@ -2536,12 +2548,12 @@ fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob)); fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no"); fossil_print("Starts with UTF-16 BOM: %s\n", fUtf16?(bReverse?"reversed":"yes"):"no"); fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8", - (lookFlags&LOOK_BINARY)?"no":"yes"); - fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no"); + ((lookFlags==0) || (lookFlags&LOOK_LONG))?"no":"yes"); + fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags==0)?"yes":"no"); fossil_print("Has flag LOOK_CR: %s\n",(lookFlags&LOOK_CR)?"yes":"no"); fossil_print("Has flag LOOK_LONE_CR: %s\n", (lookFlags&LOOK_LONE_CR)?"yes":"no"); fossil_print("Has flag LOOK_LF: %s\n",(lookFlags&LOOK_LF)?"yes":"no"); fossil_print("Has flag LOOK_LONE_LF: %s\n", Index: src/diffcmd.c ================================================================== --- src/diffcmd.c +++ src/diffcmd.c @@ -76,11 +76,11 @@ ** for file names to treat as binary. If fIncludeBinary is zero, these files ** will be skipped in addition to files that may contain binary content. 
*/ void diff_file( Blob *pFile1, /* In memory content to compare from */ - int isBin1, /* Does the 'from' content appear to be binary */ + int eType1, /* Does the 'from' content appear to be text */ const char *zFile2, /* On disk content to compare to */ const char *zName, /* Display name of the file */ const char *zDiffCmd, /* Command for comparison */ const char *zBinGlob, /* Treat file names matching this as binary */ int fIncludeBinary, /* Include binary files for external diff */ @@ -88,10 +88,11 @@ ){ if( zDiffCmd==0 ){ Blob out; /* Diff output text */ Blob file2; /* Content of zFile2 */ const char *zName2; /* Name of zFile2 for display */ + int eType2 = 0; /* Read content of zFile2 into memory */ blob_zero(&file2); if( file_wd_size(zFile2)<0 ){ zName2 = NULL_DEVICE; @@ -101,17 +102,23 @@ }else{ blob_read_from_file(&file2, zFile2); } zName2 = zName; } - + if( !fIncludeBinary ){ + looks_like_text(eType2, &file2); + } /* Compute and output the differences */ if( diffFlags & DIFF_BRIEF ){ if( blob_compare(pFile1, &file2) ){ fossil_print("CHANGED %s\n", zName); } + }else if( eType1!=eType2 ){ + fossil_print(DIFF_CANNOT_COMPUTE_ENCODING); }else{ + blob_to_utf8_no_bom(pFile1, 2); + blob_to_utf8_no_bom(&file2, 2); blob_zero(&out); text_diff(pFile1, &file2, &out, 0, diffFlags); if( blob_size(&out) ){ diff_print_filenames(zName, zName2, diffFlags); fossil_print("%s\n", blob_str(&out)); @@ -126,11 +133,12 @@ Blob nameFile1; /* Name of temporary file to old pFile1 content */ Blob cmd; /* Text of command to run */ if( !fIncludeBinary ){ Blob file2; - if( isBin1 ){ + int eType2; + if( eType1!=1 ){ fossil_print(DIFF_CANNOT_COMPUTE_BINARY); return; } if( zBinGlob ){ Glob *pBinary = glob_create(zBinGlob); @@ -147,11 +155,12 @@ blob_read_link(&file2, zFile2); }else{ blob_read_from_file(&file2, zFile2); } } - if( looks_like_binary(&file2) ){ + looks_like_text(eType2, &file2); + if( (eType2&3)!=1 ){ fossil_print(DIFF_CANNOT_COMPUTE_BINARY); blob_reset(&file2); return; } 
blob_reset(&file2); @@ -197,12 +206,11 @@ ** will be skipped in addition to files that may contain binary content. */ void diff_file_mem( Blob *pFile1, /* In memory content to compare from */ Blob *pFile2, /* In memory content to compare to */ - int isBin1, /* Does the 'from' content appear to be binary */ - int isBin2, /* Does the 'to' content appear to be binary */ + int eType, /* Does the content appear to be text */ const char *zName, /* Display name of the file */ const char *zDiffCmd, /* Command for comparison */ const char *zBinGlob, /* Treat file names matching this as binary */ int fIncludeBinary, /* Include binary files for external diff */ u64 diffFlags /* Diff flags */ @@ -210,10 +218,12 @@ if( diffFlags & DIFF_BRIEF ) return; if( zDiffCmd==0 ){ Blob out; /* Diff output text */ blob_zero(&out); + blob_to_utf8_no_bom(pFile1, 2); + blob_to_utf8_no_bom(pFile2, 2); text_diff(pFile1, pFile2, &out, 0, diffFlags); diff_print_filenames(zName, zName, diffFlags); fossil_print("%s\n", blob_str(&out)); /* Release memory resources */ @@ -222,11 +232,11 @@ Blob cmd; char zTemp1[300]; char zTemp2[300]; if( !fIncludeBinary ){ - if( isBin1 || isBin2 ){ + if( eType==0 ){ fossil_print(DIFF_CANNOT_COMPUTE_BINARY); return; } if( zBinGlob ){ Glob *pBinary = glob_create(zBinGlob); @@ -282,18 +292,18 @@ const char *zFileTreeName ){ Blob fname; Blob content; int isLink; - int isBin; + int eType = 0; file_tree_name(zFileTreeName, &fname, 1); historical_version_of_file(zFrom, blob_str(&fname), &content, &isLink, 0, - fIncludeBinary ? 0 : &isBin, 0); + fIncludeBinary ? 
0 : &eType, 0); if( !isLink != !file_wd_islink(zFrom) ){ fossil_print(DIFF_CANNOT_COMPUTE_SYMLINK); }else{ - diff_file(&content, isBin, zFileTreeName, zFileTreeName, + diff_file(&content, eType, zFileTreeName, zFileTreeName, zDiffCmd, zBinGlob, fIncludeBinary, diffFlags); } blob_reset(&content); blob_reset(&fname); } @@ -389,11 +399,11 @@ srcid = 0; if( !asNewFile ){ showDiff = 0; } } if( showDiff ){ Blob content; - int isBin; + int eType = 0; if( !isLink != !file_wd_islink(zFullName) ){ diff_print_index(zPathname, diffFlags); diff_print_filenames(zPathname, zPathname, diffFlags); fossil_print(DIFF_CANNOT_COMPUTE_SYMLINK); continue; @@ -401,13 +411,15 @@ if( srcid>0 ){ content_get(srcid, &content); }else{ blob_zero(&content); } - isBin = fIncludeBinary ? 0 : looks_like_binary(&content); + if( !fIncludeBinary ){ + looks_like_text(eType, &content); + } diff_print_index(zPathname, diffFlags); - diff_file(&content, isBin, zFullName, zPathname, zDiffCmd, + diff_file(&content, eType, zFullName, zPathname, zDiffCmd, zBinGlob, fIncludeBinary, diffFlags); blob_reset(&content); } free(zToFree); } @@ -437,23 +449,26 @@ ){ char *zName; Blob fname; Blob v1, v2; int isLink1, isLink2; - int isBin1, isBin2; + int eType = 0, eType2 = 0; if( diffFlags & DIFF_BRIEF ) return; file_tree_name(zFileTreeName, &fname, 1); zName = blob_str(&fname); historical_version_of_file(zFrom, zName, &v1, &isLink1, 0, - fIncludeBinary ? 0 : &isBin1, 0); + fIncludeBinary ? 0 : &eType, 0); historical_version_of_file(zTo, zName, &v2, &isLink2, 0, - fIncludeBinary ? 0 : &isBin2, 0); + fIncludeBinary ? 
0 : &eType2, 0); if( isLink1 != isLink2 ){ diff_print_filenames(zName, zName, diffFlags); fossil_print(DIFF_CANNOT_COMPUTE_SYMLINK); + }else if( eType!=eType2 ){ + diff_print_filenames(zName, zName, diffFlags); + fossil_print(DIFF_CANNOT_COMPUTE_ENCODING); }else{ - diff_file_mem(&v1, &v2, isBin1, isBin2, zName, zDiffCmd, + diff_file_mem(&v1, &v2, eType, zName, zDiffCmd, zBinGlob, fIncludeBinary, diffFlags); } blob_reset(&v1); blob_reset(&v2); blob_reset(&fname); @@ -477,11 +492,11 @@ const char *zBinGlob, int fIncludeBinary, u64 diffFlags ){ Blob f1, f2; - int isBin1, isBin2; + int eType = 0, eType2 = 0; int rid; const char *zName = pFrom ? pFrom->zName : pTo->zName; if( diffFlags & DIFF_BRIEF ) return; diff_print_index(zName, diffFlags); if( pFrom ){ @@ -494,14 +509,21 @@ rid = uuid_to_rid(pTo->zUuid, 0); content_get(rid, &f2); }else{ blob_zero(&f2); } - isBin1 = fIncludeBinary ? 0 : looks_like_binary(&f1); - isBin2 = fIncludeBinary ? 0 : looks_like_binary(&f2); - diff_file_mem(&f1, &f2, isBin1, isBin2, zName, zDiffCmd, - zBinGlob, fIncludeBinary, diffFlags); + if ( !fIncludeBinary ){ + looks_like_text(eType, &f1); + looks_like_text(eType2, &f2); + } + if( eType!=eType2 ){ + diff_print_filenames(zName, zName, diffFlags); + fossil_print(DIFF_CANNOT_COMPUTE_ENCODING); + }else{ + diff_file_mem(&f1, &f2, eType, zName, zDiffCmd, + zBinGlob, fIncludeBinary, diffFlags); + } blob_reset(&f1); blob_reset(&f2); } /* Index: src/stash.c ================================================================== --- src/stash.c +++ src/stash.c @@ -307,22 +307,23 @@ ); while( db_step(&q)==SQLITE_ROW ){ int rid = db_column_int(&q, 0); int isRemoved = db_column_int(&q, 1); int isLink = db_column_int(&q, 3); - int isBin1, isBin2; + int eType = 0; const char *zOrig = db_column_text(&q, 4); const char *zNew = db_column_text(&q, 5); char *zOPath = mprintf("%s%s", g.zLocalRoot, zOrig); Blob delta, a, b, disk; if( rid==0 ){ db_ephemeral_blob(&q, 6, &a); fossil_print("ADDED %s\n", zNew); 
diff_print_index(zNew, diffFlags); - isBin1 = 0; - isBin2 = fIncludeBinary ? 0 : looks_like_binary(&a); - diff_file_mem(&empty, &a, isBin1, isBin2, zNew, zDiffCmd, + if( !fIncludeBinary ){ + looks_like_text(eType, &a); + } + diff_file_mem(&empty, &a, eType, zNew, zDiffCmd, zBinGlob, fIncludeBinary, diffFlags); }else if( isRemoved ){ fossil_print("DELETE %s\n", zOrig); if( fBaseline==0 ){ if( file_wd_islink(zOPath) ){ @@ -332,13 +333,14 @@ } }else{ content_get(rid, &a); } diff_print_index(zNew, diffFlags); - isBin1 = fIncludeBinary ? 0 : looks_like_binary(&a); - isBin2 = 0; - diff_file_mem(&a, &empty, isBin1, isBin2, zOrig, zDiffCmd, + if( !fIncludeBinary){ + looks_like_text(eType, &a); + } + diff_file_mem(&a, &empty, eType, zOrig, zDiffCmd, zBinGlob, fIncludeBinary, diffFlags); }else{ int isOrigLink = file_wd_islink(zOPath); db_ephemeral_blob(&q, 6, &delta); if( fBaseline==0 ){ @@ -355,21 +357,29 @@ printf(DIFF_CANNOT_COMPUTE_SYMLINK); }else{ Blob *pBase = fBaseline ? &a : &disk; content_get(rid, &a); blob_delta_apply(&a, &delta, &b); - isBin1 = fIncludeBinary ? 0 : looks_like_binary(pBase); - isBin2 = fIncludeBinary ? 0 : looks_like_binary(&b); - diff_file_mem(fBaseline? 
&a : &disk, &b, isBin1, isBin2, zNew, - zDiffCmd, zBinGlob, fIncludeBinary, diffFlags); + int eType2 = 0; + if( !fIncludeBinary ){ + looks_like_text(eType, pBase); + looks_like_text(eType2, &b); + } + if( eType!=eType2 ){ + diff_print_filenames(zOrig, zNew, diffFlags); + printf(DIFF_CANNOT_COMPUTE_ENCODING); + }else{ + diff_file_mem(pBase, &b, eType, zNew, zDiffCmd, + zBinGlob, fIncludeBinary, diffFlags); + } blob_reset(&a); blob_reset(&b); } if( !fBaseline ) blob_reset(&disk); } blob_reset(&delta); - } + } db_finalize(&q); } /* ** Drop the indicated stash Index: src/update.c ================================================================== --- src/update.c +++ src/update.c @@ -613,11 +613,11 @@ const char *revision, /* The checkin containing the file */ const char *file, /* Full treename of the file */ Blob *content, /* Put the content here */ int *pIsLink, /* Set to true if file is link. */ int *pIsExe, /* Set to true if file is executable */ - int *pIsBin, /* Set to true if file is binary */ + int *pEType, /* Set to file type, looks_like_text() */ int errCode /* Error code if file not found. Panic if 0. */ ){ Manifest *pManifest; ManifestFile *pFile; int rid=0; @@ -640,12 +640,12 @@ rid = uuid_to_rid(pFile->zUuid, 0); if( pIsExe ) *pIsExe = ( manifest_file_mperm(pFile)==PERM_EXE ); if( pIsLink ) *pIsLink = ( manifest_file_mperm(pFile)==PERM_LNK ); manifest_destroy(pManifest); rc = content_get(rid, content); - if( rc && pIsBin ){ - *pIsBin = looks_like_binary(content); + if( rc && pEType ){ + looks_like_text(*pEType, content); } return rc; } manifest_destroy(pManifest); if( errCode<=0 ){ Index: win/Makefile.mingw ================================================================== --- win/Makefile.mingw +++ win/Makefile.mingw @@ -13,14 +13,14 @@ # #### Select one of MinGW, MinGW-w64 (32-bit) or MinGW-w64 (64-bit) compilers. # By default, this is an empty string (i.e. use the native compiler). 
# -PREFIX = +# PREFIX = # PREFIX = mingw32- # PREFIX = i686-pc-mingw32- -# PREFIX = i686-w64-mingw32- +PREFIX = i686-w64-mingw32- # PREFIX = x86_64-w64-mingw32- #### The toplevel directory of the source tree. Fossil can be built # in a directory that is separate from the source tree. Just change # the following to point from the build directory to the src/ folder.