Index: src/blob.c
==================================================================
--- src/blob.c
+++ src/blob.c
@@ -1004,10 +1004,80 @@
     else if( z[i+1]!='\n' ) z[j++] = '\n';
   }
   z[j] = 0;
   p->nUsed = j;
 }
+
+/*
+** Translation table for the cp1252 bytes 0x80..0x9F, indexed by
+** (byte & 0x1f). Each entry is the Unicode code point for that byte.
+** Bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D, which cp1252 leaves
+** unassigned, map to the code point with the same value.
+*/
+static const unsigned short cp1252[32] = {
+  0x20ac,   0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
+  0x02C6, 0x2030, 0x0160, 0x2039, 0x0152,   0x8D, 0x017D,   0x8F,
+    0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+   0x2DC, 0x2122, 0x0161, 0x203A, 0x0153,   0x9D, 0x017E, 0x0178
+};
+
+/*
+** Convert blob from cp1252 to utf-8, in place. As cp1252 is a superset
+** of iso8859-1, this is useful on UNIX as well.
+**
+** Every byte >=0x80 is replaced by the 2- or 3-byte UTF-8 encoding of
+** its code point; bytes below 0x80 are kept unchanged. The blob is
+** resized when the converted text does not fit in the current
+** allocation, and the result is zero-terminated.
+*/
+void blob_cp1252_to_utf8(Blob *p){
+  unsigned char *z = (unsigned char *)p->aData;
+  int j = p->nUsed;
+  int i, n;
+
+  /* Pass 1: count the extra bytes the conversion will add. */
+  for(i=n=0; i<j; i++){
+    if( z[i]>=0x80 ){
+      if( (z[i]<0xa0) && (cp1252[z[i]&0x1f]>=0x800) ){
+        n++;  /* Needs 3 bytes rather than 2 */
+      }
+      n++;
+    }
+  }
+  j += n;
+  if( j>=p->nAlloc ){
+    blob_resize(p, j);
+    z = (unsigned char *)p->aData;
+  }
+  p->nUsed = j;
+  z[j] = 0;
+
+  /* Pass 2: expand back to front so no unread input is overwritten.
+  ** Once j==i the remaining prefix needs no expansion and is already
+  ** in its final position. */
+  while( j>i ){
+    if( z[--i]>=0x80 ){
+      if( z[i]<0xa0 ){
+        unsigned short sym = cp1252[z[i]&0x1f];
+        if( sym>=0x800 ){
+          z[--j] = 0x80 | (sym&0x3f);
+          z[--j] = 0x80 | ((sym>>6)&0x3f);
+          z[--j] = 0xe0 | (sym>>12);
+        }else{
+          z[--j] = 0x80 | (sym&0x3f);
+          z[--j] = 0xc0 | (sym>>6);
+        }
+      }else{
+        /* 0xA0..0xFF: the code point equals the byte value */
+        z[--j] = 0x80 | (z[i]&0x3f);
+        z[--j] = 0xC0 | (z[i]>>6);
+      }
+    }else{
+      z[--j] = z[i];
+    }
+  }
+}
 
 /*
 ** Shell-escape the given string. Append the result to a blob.
 */
 void shell_escape(Blob *pBlob, const char *zIn){
Index: src/checkin.c
==================================================================
--- src/checkin.c
+++ src/checkin.c
@@ -1296,11 +1296,10 @@
   }else if( fHasInvalidUtf8 ){
     if( encodingOk ){
       return 0; /* We don't want encoding warnings for this file. */
     }
     zWarning = "invalid UTF-8";
-    zConvert = ""; /* Possible conversion to UTF-8 not yet implemented. */
     zDisable = "\"encoding-glob\" setting";
   }else if( fHasAnyCr ){
     if( crnlOk ){
       return 0; /* We don't want CR/NL warnings for this file. */
     }
@@ -1341,10 +1340,12 @@
       if( fUnicode ) {
         int bomSize;
         const unsigned char *bom = get_utf8_bom(&bomSize);
         fwrite(bom, 1, bomSize, f);
         blob_to_utf8_no_bom(p, 0);
+      }else if( fHasInvalidUtf8 ){
+        blob_cp1252_to_utf8(p);
       }
       if( fHasAnyCr ){
         blob_to_lf_only(p);
       }
       fwrite(blob_buffer(p), 1, blob_size(p), f);