Check-in [77d9664918]
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:First attempt to mix TIP's #657 and #671
Timelines: family | ancestors | descendants | both | tip-671
Files: files | file ages | folders
SHA3-256: 77d9664918778822d6b7eb0c4b4644bd75b0e7ffa7e258ca3e3be93496bbf296
User & Date: jan.nijtmans 2023-07-14 14:51:27.755
Context
2023-07-16
21:29
Remove non-supported testcases check-in: 5d54238fee user: jan.nijtmans tags: tip-671
2023-07-14
14:51
First attempt to mix TIP's #657 and #671 check-in: 77d9664918 user: jan.nijtmans tags: tip-671
11:13
Use "strict" in almost all commands. Only "glob" and environment variables are left out. (Experiment... check-in: da536b9803 user: jan.nijtmans tags: tip-657
2023-07-13
05:43
Merge trunk check-in: 3f0de542ee user: apnadkarni tags: tip-671
Changes
Unified Diff Ignore Whitespace Patch
Changes to generic/tcl.h.
2014
2015
2016
2017
2018
2019
2020

2021
2022
2023
2024
2025
2026
2027
 * Reserve top byte for profile values (disjoint, not a mask). In case of
 * changes, ensure ENCODING_PROFILE_* macros in tclInt.h are modified if
 * necessary.
 */
#define TCL_ENCODING_PROFILE_STRICT   TCL_ENCODING_STOPONERROR
#define TCL_ENCODING_PROFILE_TCL8     0x01000000
#define TCL_ENCODING_PROFILE_REPLACE  0x02000000


/*
 * The following definitions are the error codes returned by the conversion
 * routines:
 *
 * TCL_OK -			All characters were converted.
 * TCL_CONVERT_NOSPACE -	The output buffer would not have been large







>







2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
 * Reserve top byte for profile values (disjoint, not a mask). In case of
 * changes, ensure ENCODING_PROFILE_* macros in tclInt.h are modified if
 * necessary.
 */
#define TCL_ENCODING_PROFILE_STRICT   TCL_ENCODING_STOPONERROR
#define TCL_ENCODING_PROFILE_TCL8     0x01000000
#define TCL_ENCODING_PROFILE_REPLACE  0x02000000
#define TCL_ENCODING_PROFILE_LOSSLESS 0x03000000

/*
 * The following definitions are the error codes returned by the conversion
 * routines:
 *
 * TCL_OK -			All characters were converted.
 * TCL_CONVERT_NOSPACE -	The output buffer would not have been large
Changes to generic/tclEncoding.c.
79
80
81
82
83
84
85


86
87
88
89
90
91
92
				/* Two dimensional sparse matrix to map
				 * characters from Unicode to the encoding.
				 * Each element of the fromUnicode array
				 * points to an array of 256 shorts. If there
				 * is no corresponding character the encoding,
				 * the value in the matrix is 0x0000.
				 * malloc'd. */


} TableEncodingData;

/*
 * Each of the following structures is the clientData for a dynamically-loaded
 * escape-driven encoding that is itself comprised of other simpler encodings.
 * An example is "iso-2022-jp", which uses escape sequences to switch between
 * ascii, jis0208, jis0212, gb2312, and ksc5601. Note that "escape-driven"







>
>







79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
				/* Two dimensional sparse matrix to map
				 * characters from Unicode to the encoding.
				 * Each element of the fromUnicode array
				 * points to an array of 256 shorts. If there
				 * is no corresponding character in the encoding,
				 * the value in the matrix is 0x0000.
				 * malloc'd. */
    int flags;			/* Miscellaneous flags */
#define ENCODING_ASCII_COMPATIBLE 0x1
} TableEncodingData;

/*
 * Each of the following structures is the clientData for a dynamically-loaded
 * escape-driven encoding that is itself comprised of other simpler encodings.
 * An example is "iso-2022-jp", which uses escape sequences to switch between
 * ascii, jis0208, jis0212, gb2312, and ksc5601. Note that "escape-driven"
191
192
193
194
195
196
197

198
199
200
201
202
203
204
205
206
207
208
209
210



211
212
213
214
215
216
217
 * Names of encoding profiles and corresponding integer values.
 * Keep alphabetical order for error messages.
 */
static struct TclEncodingProfiles {
    const char *name;
    int value;
} encodingProfiles[] = {

    {"replace", TCL_ENCODING_PROFILE_REPLACE},
    {"strict", TCL_ENCODING_PROFILE_STRICT},
    {"tcl8", TCL_ENCODING_PROFILE_TCL8},
};
#define PROFILE_TCL8(flags_)                                           \
    (ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_TCL8)

#define PROFILE_STRICT(flags_)                                         \
    (!PROFILE_TCL8(flags_) && !PROFILE_REPLACE(flags_))

#define PROFILE_REPLACE(flags_)                                         \
    (ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_REPLACE)




#define UNICODE_REPLACE_CHAR ((Tcl_UniChar)0xFFFD)
#define SURROGATE(c_)      (((c_) & ~0x7FF) == 0xD800)
#define HIGH_SURROGATE(c_) (((c_) & ~0x3FF) == 0xD800)
#define LOW_SURROGATE(c_)  (((c_) & ~0x3FF) == 0xDC00)

/*
 * The following variable is used in the sparse matrix code for a







>







|
|

|
|

>
>
>







193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
 * Names of encoding profiles and corresponding integer values.
 * Keep alphabetical order for error messages.
 */
static struct TclEncodingProfiles {
    const char *name;	/* Textual name of the profile */
    int value;		/* Corresponding TCL_ENCODING_PROFILE_* value */
} encodingProfiles[] = {
    /* Entries are listed in alphabetical order of name */
    {"lossless", TCL_ENCODING_PROFILE_LOSSLESS},
    {"replace", TCL_ENCODING_PROFILE_REPLACE},
    {"strict", TCL_ENCODING_PROFILE_STRICT},
    {"tcl8", TCL_ENCODING_PROFILE_TCL8},
};
/*
 * Predicates on the profile carried in flags_ (extracted with
 * ENCODING_PROFILE_GET). TCL8, REPLACE and LOSSLESS each test for an exact
 * match against their TCL_ENCODING_PROFILE_* value. STRICT is defined by
 * exclusion, so it also holds for any profile value that is none of the
 * other three.
 */
#define PROFILE_TCL8(flags_)                                           \
    (ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_TCL8)

#define PROFILE_REPLACE(flags_)                                           \
    (ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_REPLACE)

#define PROFILE_LOSSLESS(flags_)                                           \
    (ENCODING_PROFILE_GET(flags_) == TCL_ENCODING_PROFILE_LOSSLESS)

/* True when no other profile matches; see the note above */
#define PROFILE_STRICT(flags_)                                         \
    (!PROFILE_TCL8(flags_) && !PROFILE_REPLACE(flags_) && !PROFILE_LOSSLESS(flags_))

/* U+FFFD, the Unicode replacement character */
#define UNICODE_REPLACE_CHAR ((Tcl_UniChar)0xFFFD)
/* Any surrogate code point: U+D800..U+DFFF */
#define SURROGATE(c_)      (((c_) & ~0x7FF) == 0xD800)
/* High (leading) surrogate: U+D800..U+DBFF */
#define HIGH_SURROGATE(c_) (((c_) & ~0x3FF) == 0xD800)
/* Low (trailing) surrogate: U+DC00..U+DFFF */
#define LOW_SURROGATE(c_)  (((c_) & ~0x3FF) == 0xDC00)

/*
 * The following variable is used in the sparse matrix code for a
252
253
254
255
256
257
258














259
260
261
262
263
264
265
static Tcl_EncodingConvertProc	Utf16ToUtfProc;
static Tcl_EncodingConvertProc	UtfToUtf16Proc;
static Tcl_EncodingConvertProc	UtfToUcs2Proc;
static Tcl_EncodingConvertProc	UtfToUtfProc;
static Tcl_EncodingConvertProc	Iso88591FromUtfProc;
static Tcl_EncodingConvertProc	Iso88591ToUtfProc;
















/*
 * A Tcl_ObjType for holding a cached Tcl_Encoding in the twoPtrValue.ptr1 field
 * of the internalrep. This should help the lifetime of encodings be more useful.
 * See concerns raised in [Bug 1077262].
 */








>
>
>
>
>
>
>
>
>
>
>
>
>
>







258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
static Tcl_EncodingConvertProc	Utf16ToUtfProc;
static Tcl_EncodingConvertProc	UtfToUtf16Proc;
static Tcl_EncodingConvertProc	UtfToUcs2Proc;
static Tcl_EncodingConvertProc	UtfToUtfProc;
static Tcl_EncodingConvertProc	Iso88591FromUtfProc;
static Tcl_EncodingConvertProc	Iso88591ToUtfProc;

/*
 * Return 1 if unich is a lossless wrapper, i.e. a code point that
 * ToLossless() can produce for a raw byte, and 0 otherwise.
 *
 * ToLossless() only wraps bytes that have the high bit set, so the valid
 * wrappers are exactly U+DC80..U+DCFF. U+DC00..U+DC7F must NOT be accepted:
 * unwrapping them on output would emit raw ASCII bytes that never came from
 * an input byte, which defeats the "non-ASCII only" security rule.
 */
static inline int IsLosslessWrapper(Tcl_UniChar unich) {
    return (unich >= 0xDC80 && unich <= 0xDCFF);
}
/*
 * Map a raw byte onto the code point used to carry it internally in the
 * lossless profile. A byte with the high bit set becomes 0xDC00 plus the
 * byte value; any other byte yields UNICODE_REPLACE_CHAR.
 */
static inline Tcl_UniChar ToLossless(char ch) {
    /* ASCII bytes are never wrapped, for security reasons. See TIP */
    if ((ch & 0x80) == 0) {
	return UNICODE_REPLACE_CHAR;
    }
    return 0xDC00 + UCHAR(ch);
}
/*
 * Recover the raw byte carried by an internal lossless code point. The
 * argument is expected to satisfy IsLosslessWrapper(); this is only checked
 * by an assert.
 */
static inline unsigned char FromLossless(Tcl_UniChar unich) {
    const unsigned char rawByte = (unsigned char)(unich - 0xDC00);

    assert(IsLosslessWrapper(unich));
    return rawByte;
}

/*
 * A Tcl_ObjType for holding a cached Tcl_Encoding in the twoPtrValue.ptr1 field
 * of the internalrep. This should help the lifetime of encodings be more useful.
 * See concerns raised in [Bug 1077262].
 */

284
285
286
287
288
289
290




































291
292
293
294
295
296
297
    do {								\
	const Tcl_ObjInternalRep *irPtr;					\
	irPtr = TclFetchInternalRep ((objPtr), &encodingType);		\
	(encoding) = irPtr ? (Tcl_Encoding)irPtr->twoPtrValue.ptr1 : NULL;		\
    } while (0)






































/*
 *----------------------------------------------------------------------
 *
 * Tcl_GetEncodingFromObj --
 *
 *	Writes to (*encodingPtr) the Tcl_Encoding value of (*objPtr), if
 *	possible, and returns TCL_OK. If no such encoding exists, TCL_ERROR is







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
    do {								\
	const Tcl_ObjInternalRep *irPtr;					\
	irPtr = TclFetchInternalRep ((objPtr), &encodingType);		\
	(encoding) = irPtr ? (Tcl_Encoding)irPtr->twoPtrValue.ptr1 : NULL;		\
    } while (0)


/*
 *------------------------------------------------------------------------
 *
 * ToLosslessUtf8 --
 *
 *    Emits, for every byte of the source, the utf-8 form of the code point
 *    that ToLossless() assigns to that byte. The caller must already have
 *    decided that the whole source range is to be treated as an invalid
 *    encoding.
 *
 * Results:
 *    Count of bytes stored in dst, or a negative value when dst cannot
 *    hold three bytes for each source byte.
 *
 * Side effects:
 *    The converted bytes are written to dst. No terminating nul is added.
 *
 *------------------------------------------------------------------------
 */
static Tcl_Size
ToLosslessUtf8(
	const char *src, /* Source bytes */
	Tcl_Size srcLen, /* Number of source bytes */
	char *dst,       /* Destination buffer */
	Tcl_Size dstLen) /* Size of destination buffer */
{
    Tcl_Size i;
    char *out = dst;

    /* Budget three output bytes per input byte; refuse if dst is smaller */
    if (srcLen > (dstLen / 3)) {
	return -1;
    }
    for (i = 0; i < srcLen; i++) {
	out += Tcl_UniCharToUtf(ToLossless(UCHAR(src[i])), out);
    }
    return (out - dst);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_GetEncodingFromObj --
 *
 *	Writes to (*encodingPtr) the Tcl_Encoding value of (*objPtr), if
 *	possible, and returns TCL_OK. If no such encoding exists, TCL_ERROR is
1223
1224
1225
1226
1227
1228
1229

1230
1231
1232
1233
1234
1235
1236
	/* TODO - what other flags are illegal? - See TIP 656 */
	Tcl_SetObjResult(
	    interp,
	    Tcl_NewStringObj(
		"Parameter error: TCL_ENCODING_{START,STOP} bits set in flags.",
		TCL_INDEX_NONE));
	Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALFLAGS", NULL);

	return TCL_ERROR;
    }

    dst = Tcl_DStringValue(dstPtr);
    dstLen = dstPtr->spaceAvl - 1;

    if (encoding == NULL) {







>







1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
	/* TODO - what other flags are illegal? - See TIP 656 */
	Tcl_SetObjResult(
	    interp,
	    Tcl_NewStringObj(
		"Parameter error: TCL_ENCODING_{START,STOP} bits set in flags.",
		TCL_INDEX_NONE));
	Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALFLAGS", NULL);
	errno = EINVAL;
	return TCL_ERROR;
    }

    dst = Tcl_DStringValue(dstPtr);
    dstLen = dstPtr->spaceAvl - 1;

    if (encoding == NULL) {
1298
1299
1300
1301
1302
1303
1304



1305
1306
1307
1308
1309
1310
1311
				      TCL_Z_MODIFIER "u: '\\x%02X'",
				      nBytesProcessed,
				      UCHAR(srcStart[nBytesProcessed])));
		    Tcl_SetErrorCode(
			interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", buf, NULL);
		}
	    }



	    return result;
	}

	flags &= ~TCL_ENCODING_START;
	srcLen -= srcChunkRead;

	if (Tcl_DStringLength(dstPtr) == 0) {







>
>
>







1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
				      TCL_Z_MODIFIER "u: '\\x%02X'",
				      nBytesProcessed,
				      UCHAR(srcStart[nBytesProcessed])));
		    Tcl_SetErrorCode(
			interp, "TCL", "ENCODING", "ILLEGALSEQUENCE", buf, NULL);
		}
	    }
	    if (result != TCL_OK) {
		errno = (result == TCL_CONVERT_NOSPACE) ? ENOMEM : EILSEQ;
	    }
	    return result;
	}

	flags &= ~TCL_ENCODING_START;
	srcLen -= srcChunkRead;

	if (Tcl_DStringLength(dstPtr) == 0) {
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
 *
 * Tcl_UtfToExternalDStringEx --
 *
 *	Convert a source buffer from UTF-8 to the specified encoding.
 *	The parameter flags controls the behavior, if any of the bytes in
 *	the source buffer are invalid or cannot be represented in the
 *	target encoding. It should be composed by OR-ing the following:
 *	- *At most one* of TCL_ENCODING_PROFILE{DEFAULT,TCL8,STRICT}
 *
 * Results:
 *      The return value is one of
 *        TCL_OK: success. Converted string in *dstPtr
 *        TCL_ERROR: error in passed parameters. Error message in interp
 *        TCL_CONVERT_MULTIBYTE: source ends in truncated multibyte sequence
 *        TCL_CONVERT_SYNTAX: source is not conformant to encoding definition







|







1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
 *
 * Tcl_UtfToExternalDStringEx --
 *
 *	Convert a source buffer from UTF-8 to the specified encoding.
 *	The parameter flags controls the behavior, if any of the bytes in
 *	the source buffer are invalid or cannot be represented in the
 *	target encoding. It should be composed by OR-ing the following:
 *	- *At most one* of TCL_ENCODING_PROFILE_*
 *
 * Results:
 *      The return value is one of
 *        TCL_OK: success. Converted string in *dstPtr
 *        TCL_ERROR: error in passed parameters. Error message in interp
 *        TCL_CONVERT_MULTIBYTE: source ends in truncated multibyte sequence
 *        TCL_CONVERT_SYNTAX: source is not conformant to encoding definition
1549
1550
1551
1552
1553
1554
1555

1556
1557
1558
1559
1560
1561
1562
	/* TODO - what other flags are illegal? - See TIP 656 */
	Tcl_SetObjResult(
	    interp,
	    Tcl_NewStringObj(
		"Parameter error: TCL_ENCODING_{START,STOP} bits set in flags.",
		TCL_INDEX_NONE));
	Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALFLAGS", NULL);

	return TCL_ERROR;
    }

    dst = Tcl_DStringValue(dstPtr);
    dstLen = dstPtr->spaceAvl - 1;

    if (encoding == NULL) {







>







1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
	/* TODO - what other flags are illegal? - See TIP 656 */
	Tcl_SetObjResult(
	    interp,
	    Tcl_NewStringObj(
		"Parameter error: TCL_ENCODING_{START,STOP} bits set in flags.",
		TCL_INDEX_NONE));
	Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALFLAGS", NULL);
	errno = EINVAL;
	return TCL_ERROR;
    }

    dst = Tcl_DStringValue(dstPtr);
    dstLen = dstPtr->spaceAvl - 1;

    if (encoding == NULL) {
1628
1629
1630
1631
1632
1633
1634



1635
1636
1637
1638
1639
1640
1641
			    "u: 'U+%06X'",
			    pos,
			    ucs4));
		    Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALSEQUENCE",
				     buf, NULL);
		}
	    }



	    return result;
	}

	flags &= ~TCL_ENCODING_START;
	srcLen -= srcChunkRead;

	if (Tcl_DStringLength(dstPtr) == 0) {







>
>
>







1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
			    "u: 'U+%06X'",
			    pos,
			    ucs4));
		    Tcl_SetErrorCode(interp, "TCL", "ENCODING", "ILLEGALSEQUENCE",
				     buf, NULL);
		}
	    }
	    if (result != TCL_OK) {
		errno = (result == TCL_CONVERT_NOSPACE) ? ENOMEM : EILSEQ;
	    }
	    return result;
	}

	flags &= ~TCL_ENCODING_START;
	srcLen -= srcChunkRead;

	if (Tcl_DStringLength(dstPtr) == 0) {
2093
2094
2095
2096
2097
2098
2099

2100
2101

2102
2103
2104
















2105
2106
2107
2108
2109
2110
2111
	    pageMemPtr++;
	    p += 4;
	}
    }
    TclDecrRefCount(objPtr);

    if (type == ENCODING_DOUBLEBYTE) {

	memset(dataPtr->prefixBytes, 1, sizeof(dataPtr->prefixBytes));
    } else {

	for (hi = 1; hi < 256; hi++) {
	    if (dataPtr->toUnicode[hi] != NULL) {
		dataPtr->prefixBytes[hi] = 1;
















	    }
	}
    }

    /*
     * Invert the toUnicode array to produce the fromUnicode array. Performs a
     * single malloc to get the memory for the array and all the pages needed







>


>



>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
	    pageMemPtr++;
	    p += 4;
	}
    }
    TclDecrRefCount(objPtr);

    if (type == ENCODING_DOUBLEBYTE) {
	/* DBCS never ascii compatible so no need to set dataPtr->flags */
	memset(dataPtr->prefixBytes, 1, sizeof(dataPtr->prefixBytes));
    } else {
	int asciiCompatible = 1;
	for (hi = 1; hi < 256; hi++) {
	    if (dataPtr->toUnicode[hi] != NULL) {
		dataPtr->prefixBytes[hi] = 1;
		if (hi < 128) {
		    /* any byte < 128 is a prefix => not ASCII compatible */
		    asciiCompatible = 0;
		}
	    }
	}
	if (asciiCompatible) {
	    for (lo = 1; lo < 128; ++lo) {
		if (dataPtr->toUnicode[0][lo] != lo) {
		    /* any byte < 128 does not map to itself => not ASCII compatible */
		    asciiCompatible = 0;
		    break;
		}
	    }
	    if (asciiCompatible) {
		dataPtr->flags |= ENCODING_ASCII_COMPATIBLE;
	    }
	}
    }

    /*
     * Invert the toUnicode array to produce the fromUnicode array. Performs a
     * single malloc to get the memory for the array and all the pages needed
2550
2551
2552
2553
2554
2555
2556
2557











2558
2559
2560


2561
2562
2563
2564
2565



2566
2567
2568



2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581

2582
2583
2584
2585

2586

2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597

2598
2599
2600
2601
2602

2603


2604


2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619






2620





2621
2622
2623
2624
2625
2626
2627
	    /*
	     * Copy 7bit characters, but skip null-bytes when we are in input
	     * mode, so that they get converted to \xC0\x80.
	     */
	    *dst++ = *src++;
	} else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) &&
		 (UCHAR(src[1]) == 0x80) &&
		 (!(flags & ENCODING_INPUT) || PROFILE_STRICT(profile) ||











		  PROFILE_REPLACE(profile))) {
	    /* Special sequence \xC0\x80 */
	    if ((PROFILE_STRICT(profile) || PROFILE_REPLACE(profile)) && (flags & ENCODING_INPUT)) {


		if (PROFILE_REPLACE(profile)) {
		   dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
		   src += 2;
		} else {
		   /* PROFILE_STRICT */



		   result = TCL_CONVERT_SYNTAX;
		   break;
		}



	    } else {
		/*
		 * For output convert 0xC080 to a real null.
		 */
		*dst++ = 0;
		src += 2;
	    }

	} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
	    /*
	     * Incomplete byte sequence.
		 * Always check before using TclUtfToUCS4. Not doing so can cause it
		 * run beyond the end of the buffer! If we happen on such an incomplete

		 * char its bytes are made to represent themselves unless the user has
		 * explicitly asked to be told.
	     */


	    if (flags & ENCODING_INPUT) {

		/* Incomplete bytes for modified UTF-8 target */
		if (PROFILE_STRICT(profile)) {
		    result = (flags & TCL_ENCODING_CHAR_LIMIT)
			       ? TCL_CONVERT_MULTIBYTE
			       : TCL_CONVERT_SYNTAX;
		    break;
		}
	    }
	    if (PROFILE_REPLACE(profile)) {
		ch = UNICODE_REPLACE_CHAR;
		++src;

	    } else {
		/* TCL_ENCODING_PROFILE_TCL8 */
		char chbuf[2];
		chbuf[0] = UCHAR(*src++); chbuf[1] = 0;
		TclUtfToUCS4(chbuf, &ch);

	    }


	    dst += Tcl_UniCharToUtf(ch, dst);


	} else {
	    int isInvalid = 0;
	    size_t len = TclUtfToUCS4(src, &ch);
	    if (flags & ENCODING_INPUT) {
		if ((len < 2) && (ch != 0)) {
		    isInvalid = 1;
		} else if ((ch > 0xFFFF) && !(flags & ENCODING_UTF)) {
		    isInvalid = 1;
		}
		if (isInvalid) {
		    if (PROFILE_STRICT(profile)) {
			result = TCL_CONVERT_SYNTAX;
			break;
		    } else if (PROFILE_REPLACE(profile)) {
			ch = UNICODE_REPLACE_CHAR;






		    }





		}
	    }

	    const char *saveSrc = src;
	    src += len;
	    if (!(flags & ENCODING_UTF) && !(flags & ENCODING_INPUT) && (ch > 0x3FF)) {
		if (ch > 0xFFFF) {







|
>
>
>
>
>
>
>
>
>
>
>
|
<
|
>
>
|



|
>
>
>
|
|
|
>
>
>

<
|
<



<


|
|
|
>
|
|

|
>

>


|
|
|
|





>
|
<



>
|
>
>
|
>
>















>
>
>
>
>
>
|
>
>
>
>
>







2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651

2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669

2670

2671
2672
2673

2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699

2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
	    /*
	     * Copy 7bit characters, but skip null-bytes when we are in input
	     * mode, so that they get converted to \xC0\x80.
	     */
	    *dst++ = *src++;
	} else if ((UCHAR(*src) == 0xC0) && (src + 1 < srcEnd) &&
		 (UCHAR(src[1]) == 0x80) &&
		 (!(flags & ENCODING_INPUT) || ! PROFILE_TCL8(profile))) {
	    /* 
	     * \xC0\x80 is handled specially for either of the following:
	     *   1. We are doing (internal) modified utf-8 to (external)
	     *   conformant utf-8. C080 is valid internal utf-8 so we
	     *   simply output a \0. Note this overrides case 2.
	     *   2. The profile in use is not TCL8, in which case we have to
	     *   take a profile-dependent action.
	     * Note the remaining case of external->internal with a TCL8
	     * profile is handled in the default if clause later. (TODO - why not here?)
	     */
	    if (flags & ENCODING_INPUT) {
		assert(!PROFILE_TCL8(profile));

		if (PROFILE_STRICT(profile)) {
		   result = TCL_CONVERT_SYNTAX;
		   break;
		} else if (PROFILE_REPLACE(profile)) {
		   dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
		   src += 2;
		} else {
		   assert(PROFILE_LOSSLESS(profile));
		   Tcl_Size len =
		       ToLosslessUtf8(src, 2, dst, dstLen - (dst - dstStart));
		   if (len < 0) {
		       result = TCL_CONVERT_NOSPACE;
		       break;
		   }
		   dst += len;
		   src += 2;
		}
	    } else {

		/* For output convert 0xC080 to a real null. */

		*dst++ = 0;
		src += 2;
	    }

	} else if (!Tcl_UtfCharComplete(src, srcEnd - src)) {
	    /*
	     * Incomplete byte sequence. We need to do this check before the
	     * TclUtfToUCS4 checks in the next sibling if clause. Not doing
	     * so can cause it to run beyond the end of the buffer! If we
	     * happen on such an incomplete char its bytes are made to
	     * represent themselves unless the user has explicitly asked to
	     * be told.
	     */
	    assert(flags & TCL_ENCODING_END); /* Else break earlier would
	    					 trigger (srcClose compare) */
	    if (flags & ENCODING_INPUT) {
		/* TODO - why is this inside a ENCODING_INPUT check? */
		/* Incomplete bytes for modified UTF-8 target */
		if (PROFILE_STRICT(profile)) {
		   result = (flags & TCL_ENCODING_CHAR_LIMIT)
			      ? TCL_CONVERT_MULTIBYTE
			      : TCL_CONVERT_SYNTAX;
		   break;
		}
	    }
	    if (PROFILE_REPLACE(profile)) {
		ch = UNICODE_REPLACE_CHAR;
		++src;
		dst += Tcl_UniCharToUtf(ch, dst);
	    } else if (PROFILE_TCL8(profile)) {

		char chbuf[2];
		chbuf[0] = UCHAR(*src++); chbuf[1] = 0;
		TclUtfToUCS4(chbuf, &ch);
		dst += Tcl_UniCharToUtf(ch, dst);
	    } else {
		assert(PROFILE_LOSSLESS(profile));
                ch = ToLossless(UCHAR(*src));
		dst += Tcl_UniCharToUtf(ch, dst);
		src += 1;
	    }
	} else {
	    int isInvalid = 0;
	    size_t len = TclUtfToUCS4(src, &ch);
	    if (flags & ENCODING_INPUT) {
		if ((len < 2) && (ch != 0)) {
		    isInvalid = 1;
		} else if ((ch > 0xFFFF) && !(flags & ENCODING_UTF)) {
		    isInvalid = 1;
		}
		if (isInvalid) {
		    if (PROFILE_STRICT(profile)) {
			result = TCL_CONVERT_SYNTAX;
			break;
		    } else if (PROFILE_REPLACE(profile)) {
			ch = UNICODE_REPLACE_CHAR;
		    } else if (PROFILE_LOSSLESS(profile)) {
			Tcl_Size n = ToLosslessUtf8(
			    src, len, dst, dstLen - (dst - dstStart));
			if (n < 0) {
			    result = TCL_CONVERT_NOSPACE;
			    break;
			}
			dst += n;
			src += len;
			continue;
		    }
		    /* else PROFILE_TCL8 - treat as normal char below */
		}
	    }

	    const char *saveSrc = src;
	    src += len;
	    if (!(flags & ENCODING_UTF) && !(flags & ENCODING_INPUT) && (ch > 0x3FF)) {
		if (ch > 0xFFFF) {
2635
2636
2637
2638
2639
2640
2641
2642
2643

2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683




2684
2685
2686
2687
2688
2689
2690
2691
#if TCL_UTF_MAX < 4
	    cesu8:
#endif
		*dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
		*dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
		*dst++ = (char) ((ch | 0x80) & 0xBF);
		continue;
#if TCL_UTF_MAX < 4
	    } else if (SURROGATE(ch)) {

		/*
		 * A surrogate character is detected, handle especially.
		 */
		if (PROFILE_STRICT(profile) && (flags & ENCODING_UTF)) {
		    result = TCL_CONVERT_UNKNOWN;
		    src = saveSrc;
		    break;
		}
		if (PROFILE_REPLACE(profile)) {
		    /* TODO - is this right for cesu8 or should we fall through below? */
		    ch = UNICODE_REPLACE_CHAR;
		} else {
		    int low = ch;
		    len = (src <= srcEnd - 3) ? TclUtfToUCS4(src, &low) : 0;

		    if ((!LOW_SURROGATE(low)) || (ch & 0x400)) {

			if (PROFILE_STRICT(profile)) {
			    result = TCL_CONVERT_UNKNOWN;
			    src = saveSrc;
			    break;
			}
			goto cesu8;
		    }
		    src += len;
		    dst += Tcl_UniCharToUtf(ch, dst);
		    ch = low;
		}
#endif
	    } else if (PROFILE_STRICT(profile) &&
		       (!(flags & ENCODING_INPUT)) &&
		       SURROGATE(ch)) {
		result = TCL_CONVERT_UNKNOWN;
		src = saveSrc;
		break;
	    } else if (PROFILE_STRICT(profile) &&
		       (flags & ENCODING_INPUT) &&
		       SURROGATE(ch)) {
		result = TCL_CONVERT_SYNTAX;
		src = saveSrc;




		break;
	    }
	    dst += Tcl_UniCharToUtf(ch, dst);
	}
    }

    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;







<

>







<
|



















|
|
|
<
|
|
|
|
<
|
<
<
>
>
>
>
|







2751
2752
2753
2754
2755
2756
2757

2758
2759
2760
2761
2762
2763
2764
2765
2766

2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789

2790
2791
2792
2793

2794


2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
#if TCL_UTF_MAX < 4
	    cesu8:
#endif
		*dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF);
		*dst++ = (char) (((ch >> 6) | 0x80) & 0xBF);
		*dst++ = (char) ((ch | 0x80) & 0xBF);
		continue;

	    } else if (SURROGATE(ch)) {
#if TCL_UTF_MAX < 4
		/*
		 * A surrogate character is detected, handle especially.
		 */
		if (PROFILE_STRICT(profile) && (flags & ENCODING_UTF)) {
		    result = TCL_CONVERT_UNKNOWN;
		    src = saveSrc;
		    break;

		} else if (PROFILE_REPLACE(profile)) {
		    /* TODO - is this right for cesu8 or should we fall through below? */
		    ch = UNICODE_REPLACE_CHAR;
		} else {
		    int low = ch;
		    len = (src <= srcEnd - 3) ? TclUtfToUCS4(src, &low) : 0;

		    if ((!LOW_SURROGATE(low)) || (ch & 0x400)) {

			if (PROFILE_STRICT(profile)) {
			    result = TCL_CONVERT_UNKNOWN;
			    src = saveSrc;
			    break;
			}
			goto cesu8;
		    }
		    src += len;
		    dst += Tcl_UniCharToUtf(ch, dst);
		    ch = low;
		}
#else /* TCL_UTF_MAX */
		if (PROFILE_STRICT(profile)) {
		    result = (flags & ENCODING_INPUT) ? TCL_CONVERT_SYNTAX

						      : TCL_CONVERT_UNKNOWN;
		    src = saveSrc;
		    break;
		} else if (PROFILE_LOSSLESS(profile)) {

		    if (IsLosslessWrapper(ch)) {


			*dst++ = FromLossless(ch); /* Invalid UTF8 by design! */
			continue;
		    }
		}
#endif
	    }
	    dst += Tcl_UniCharToUtf(ch, dst);
	}
    }

    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
2811
2812
2813
2814
2815
2816
2817
2818

2819
2820
2821
2822
2823
2824
2825
	    }
	} else if (PROFILE_STRICT(flags) && SURROGATE(ch)) {
	    result = TCL_CONVERT_SYNTAX;
#if TCL_UTF_MAX < 4
	    ch = 0;
#endif
	    break;
	} else if (PROFILE_REPLACE(flags) && SURROGATE(ch)) {

	    ch = UNICODE_REPLACE_CHAR;
	}

	/*
	 * Special case for 1-byte utf chars for speed. Make sure we work with
	 * unsigned short-size data.
	 */







|
>







2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
	    }
	} else if (PROFILE_STRICT(flags) && SURROGATE(ch)) {
	    result = TCL_CONVERT_SYNTAX;
#if TCL_UTF_MAX < 4
	    ch = 0;
#endif
	    break;
	} else if ((! PROFILE_TCL8(flags)) && SURROGATE(ch)) {
            /* PROFILE_REPLACE | PROFILE_LOSSLESS */
	    ch = UNICODE_REPLACE_CHAR;
	}

	/*
	 * Special case for 1-byte utf chars for speed. Make sure we work with
	 * unsigned short-size data.
	 */
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
    if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	} else {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_SYNTAX;
	    } else {
		/* PROFILE_REPLACE or PROFILE_TCL8 */
		result = TCL_OK;
		dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
		numChars++;
		src += bytesLeft; /* Go past truncated code unit */
	    }
	}
    }







|







2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
    if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	} else {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_SYNTAX;
	    } else {
		/* PROFILE_REPLACE | PROFILE_LOSSLESS | PROFILE_TCL8 */
		result = TCL_OK;
		dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
		numChars++;
		src += bytesLeft; /* Go past truncated code unit */
	    }
	}
    }
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
	    break;
	}
	len = TclUtfToUCS4(src, &ch);
	if (SURROGATE(ch)) {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }
	    if (PROFILE_REPLACE(flags)) {
		ch = UNICODE_REPLACE_CHAR;
	    }
	}
	src += len;
	if (flags & TCL_ENCODING_LE) {
	    *dst++ = (ch & 0xFF);
	    *dst++ = ((ch >> 8) & 0xFF);







|
|







3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
	    break;
	}
	len = TclUtfToUCS4(src, &ch);
	if (SURROGATE(ch)) {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    } else if (! PROFILE_TCL8(flags)) {
                /* PROFILE_REPLACE | PROFILE_LOSSLESS */
		ch = UNICODE_REPLACE_CHAR;
	    }
	}
	src += len;
	if (flags & TCL_ENCODING_LE) {
	    *dst++ = (ch & 0xFF);
	    *dst++ = ((ch >> 8) & 0xFF);
3069
3070
3071
3072
3073
3074
3075
3076



3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108


3109

3110


3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127

3128
3129
3130
3131
3132
3133
3134
3135

3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
	if (HIGH_SURROGATE(prev) && !LOW_SURROGATE(ch)) {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_SYNTAX;
		src -= 2; /* Go back to beginning of high surrogate */
		dst--; /* Also undo writing a single byte too much */
		numChars--;
		break;
	    } else if (PROFILE_REPLACE(flags)) {



		/*
		 * Previous loop wrote a single byte to mark the high surrogate.
		 * Replace it with the replacement character. Further, restart
		 * current loop iteration since need to recheck destination space
		 * and reset processing of current character.
		 */
		ch = UNICODE_REPLACE_CHAR;
		dst--;
		dst += Tcl_UniCharToUtf(ch, dst);
		src -= 2;
		numChars--;
		continue;
	    } else {
	    /* Bug [10c2c17c32]. If Hi surrogate not followed by Lo surrogate, finish 3-byte UTF-8 */
		dst += Tcl_UniCharToUtf(-1, dst);
	    }
	}

	/*
	 * Special case for 1-byte utf chars for speed. Make sure we work with
	 * unsigned short-size data.
	 */

	if ((unsigned)ch - 1 < 0x7F) {
	    *dst++ = (ch & 0xFF);
	} else if (HIGH_SURROGATE(prev) || HIGH_SURROGATE(ch)) {
	    dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst);
	} else if (LOW_SURROGATE(ch) && !PROFILE_TCL8(flags)) {
	    /* Lo surrogate not preceded by Hi surrogate and not tcl8 profile */
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_UNKNOWN;
		break;


	    } else {

		/* PROFILE_REPLACE */


		dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
	    }
	} else {
	    dst += Tcl_UniCharToUtf(ch, dst);
	}
    }

    if (HIGH_SURROGATE(ch)) {
	if (PROFILE_STRICT(flags)) {
	    result = TCL_CONVERT_SYNTAX;
	    src -= 2;
	    dst--;
	    numChars--;
	} else if (PROFILE_REPLACE(flags)) {
	    dst--;
	    dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
	} else {

	    /* Bug [10c2c17c32]. If Hi surrogate, finish 3-byte UTF-8 */
	    dst += Tcl_UniCharToUtf(-1, dst);
	}
    }

    /*
     * If we had a truncated code unit at the end AND this is the last
     * fragment AND profile is not "strict", stick FFFD in its place.

     */
    if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	} else {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_SYNTAX;
	    } else {
		/* PROFILE_REPLACE or PROFILE_TCL8 */
		result = TCL_OK;
		dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
		numChars++;
		src++; /* Go past truncated code unit */
	    }
	}
    }







|
>
>
>












<
<
<












|




>
>

>
|
>
>

|











|
<
|

>
|
|





|
>








|







3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207



3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245

3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
	if (HIGH_SURROGATE(prev) && !LOW_SURROGATE(ch)) {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_SYNTAX;
		src -= 2; /* Go back to beginning of high surrogate */
		dst--; /* Also undo writing a single byte too much */
		numChars--;
		break;
	    } else if (PROFILE_TCL8(flags)) {
                /* Bug [10c2c17c32]. Hi surrogate not followed by Lo: finish 3-byte UTF-8 */
		dst += Tcl_UniCharToUtf(-1, dst);
	    } else {
		/*
		 * Previous loop wrote a single byte to mark the high surrogate.
		 * Replace it with the replacement character. Further, restart
		 * current loop iteration since need to recheck destination space
		 * and reset processing of current character.
		 */
		ch = UNICODE_REPLACE_CHAR;
		dst--;
		dst += Tcl_UniCharToUtf(ch, dst);
		src -= 2;
		numChars--;
		continue;



	    }
	}

	/*
	 * Special case for 1-byte utf chars for speed. Make sure we work with
	 * unsigned short-size data.
	 */

	if ((unsigned)ch - 1 < 0x7F) {
	    *dst++ = (ch & 0xFF);
	} else if (HIGH_SURROGATE(prev) || HIGH_SURROGATE(ch)) {
	    dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst);
	} else if (LOW_SURROGATE(ch)) {
	    /* Lo surrogate not preceded by Hi surrogate; handling depends on the profile */
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    } else if (PROFILE_TCL8(flags)) {
                dst += Tcl_UniCharToUtf(ch, dst);
	    } else {
                /*
                 * PROFILE_REPLACE | PROFILE_LOSSLESS. LOSSLESS treated like
                 * REPLACE for UTF16 - see TIP 671
                 */
		dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
            }
	} else {
	    dst += Tcl_UniCharToUtf(ch, dst);
	}
    }

    if (HIGH_SURROGATE(ch)) {
	if (PROFILE_STRICT(flags)) {
	    result = TCL_CONVERT_SYNTAX;
	    src -= 2;
	    dst--;
	    numChars--;
	} else if (PROFILE_TCL8(flags)) {

	    dst += Tcl_UniCharToUtf(-1, dst);
	} else {
            /* PROFILE_REPLACE | PROFILE_LOSSLESS */
	    dst--;
	    dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
	}
    }

    /*
     * If we had a truncated code unit at the end AND this is the last
     * fragment AND profile is not "strict", use the appropriate replacement
     * strategy.
     */
    if ((flags & TCL_ENCODING_END) && (result == TCL_CONVERT_MULTIBYTE)) {
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	} else {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_SYNTAX;
	    } else {
		/* PROFILE_REPLACE | PROFILE_TCL8 | PROFILE_LOSSLESS */
		result = TCL_OK;
		dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
		numChars++;
		src++; /* Go past truncated code unit */
	    }
	}
    }
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
	    break;
	}
	len = TclUtfToUCS4(src, &ch);
	if (SURROGATE(ch)) {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }
	    if (PROFILE_REPLACE(flags)) {
		ch = UNICODE_REPLACE_CHAR;
	    }
	}
	src += len;
	if (flags & TCL_ENCODING_LE) {
	    if (ch <= 0xFFFF) {
		*dst++ = (ch & 0xFF);







|
|







3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
	    break;
	}
	len = TclUtfToUCS4(src, &ch);
	if (SURROGATE(ch)) {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    } else if (! PROFILE_TCL8(flags)) {
                /* PROFILE_REPLACE | PROFILE_LOSSLESS */
		ch = UNICODE_REPLACE_CHAR;
	    }
	}
	src += len;
	if (flags & TCL_ENCODING_LE) {
	    if (ch <= 0xFFFF) {
		*dst++ = (ch & 0xFF);
3350
3351
3352
3353
3354
3355
3356

3357
3358
3359




3360
3361
3362
3363
3364
3365
3366
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }
	    ch = UNICODE_REPLACE_CHAR;
	}
#endif

	if (PROFILE_STRICT(flags) && SURROGATE(ch)) {
	    result = TCL_CONVERT_SYNTAX;
	    break;




	}

	src += len;

	/*
	 * Need to handle this in a way that won't cause misalignment by
	 * casting dst to a Tcl_UniChar. [Bug 1122671]







>
|
|
|
>
>
>
>







3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }
	    ch = UNICODE_REPLACE_CHAR;
	}
#endif
	if (SURROGATE(ch)) {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_SYNTAX;
		break;
	    } else if (!PROFILE_TCL8(flags)) {
                /* PROFILE_REPLACE | PROFILE_LOSSLESS */
                ch = UNICODE_REPLACE_CHAR;
            }
	}

	src += len;

	/*
	 * Need to handle this in a way that won't cause misalignment by
	 * casting dst to a Tcl_UniChar. [Bug 1122671]
3457
3458
3459
3460
3461
3462
3463

3464
3465
3466
3467
3468
3469
3470
3471
3472






3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492


3493
3494
3495
3496
3497
3498
3499
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}
	byte = *((unsigned char *) src);
	if (prefixBytes[byte]) {
	    if (src >= srcEnd-1) {
		/* Prefix byte but nothing after it */

		if (!(flags & TCL_ENCODING_END)) {
		    /* More data to come */
		    result = TCL_CONVERT_MULTIBYTE;
		    break;
		} else if (PROFILE_STRICT(flags)) {
		    result = TCL_CONVERT_SYNTAX;
		    break;
		} else if (PROFILE_REPLACE(flags)) {
		    ch = UNICODE_REPLACE_CHAR;






		} else {
		    ch = (Tcl_UniChar)byte;
		}
	    } else {
		ch = toUnicode[byte][*((unsigned char *)++src)];
	    }
	} else {
	    ch = pageZero[byte];
	}
	if ((ch == 0) && (byte != 0)) {
	    /* Prefix+suffix pair is invalid */
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_SYNTAX;
		break;
	    }
	    if (prefixBytes[byte]) {
		src--;
	    }
	    if (PROFILE_REPLACE(flags)) {
		ch = UNICODE_REPLACE_CHAR;


	    } else {
		ch = (Tcl_UniChar)byte;
	    }
	}

	/*
	 * Special case for 1-byte Utf chars for speed.







>









>
>
>
>
>
>




















>
>







3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}
	byte = *((unsigned char *) src);
	if (prefixBytes[byte]) {
	    if (src >= srcEnd-1) {
		/* Prefix byte but nothing after it */
		/* Truncated sequence ... */
		if (!(flags & TCL_ENCODING_END)) {
		    /* More data to come */
		    result = TCL_CONVERT_MULTIBYTE;
		    break;
		} else if (PROFILE_STRICT(flags)) {
		    result = TCL_CONVERT_SYNTAX;
		    break;
		} else if (PROFILE_REPLACE(flags)) {
		    ch = UNICODE_REPLACE_CHAR;
		} else if (PROFILE_LOSSLESS(flags)) {
		    if (dataPtr->flags & ENCODING_ASCII_COMPATIBLE) {
			ch = ToLossless(byte);
		    } else {
			ch = UNICODE_REPLACE_CHAR;
		    }
		} else {
		    ch = (Tcl_UniChar)byte;
		}
	    } else {
		ch = toUnicode[byte][*((unsigned char *)++src)];
	    }
	} else {
	    ch = pageZero[byte];
	}
	if ((ch == 0) && (byte != 0)) {
	    /* Prefix+suffix pair is invalid */
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_SYNTAX;
		break;
	    }
	    if (prefixBytes[byte]) {
		src--;
	    }
	    if (PROFILE_REPLACE(flags)) {
		ch = UNICODE_REPLACE_CHAR;
	    } else if (PROFILE_LOSSLESS(flags)) {
		ch = ToLossless(byte);
	    } else {
		ch = (Tcl_UniChar)byte;
	    }
	}

	/*
	 * Special case for 1-byte Utf chars for speed.
3556
3557
3558
3559
3560
3561
3562

3563
3564
3565
3566
3567
3568
3569
				 * output buffer. */
{
    const char *srcStart, *srcEnd, *srcClose;
    const char *dstStart, *dstEnd, *prefixBytes;
    Tcl_UniChar ch = 0;
    int result, len, word, numChars;
    TableEncodingData *dataPtr = (TableEncodingData *)clientData;

    const unsigned short *const *fromUnicode;

    result = TCL_OK;

    prefixBytes = dataPtr->prefixBytes;
    fromUnicode = (const unsigned short *const *) dataPtr->fromUnicode;








>







3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
				 * output buffer. */
{
    const char *srcStart, *srcEnd, *srcClose;
    const char *dstStart, *dstEnd, *prefixBytes;
    Tcl_UniChar ch = 0;
    int result, len, word, numChars;
    TableEncodingData *dataPtr = (TableEncodingData *)clientData;
    int asciiCompatible = dataPtr->flags & ENCODING_ASCII_COMPATIBLE;
    const unsigned short *const *fromUnicode;

    result = TCL_OK;

    prefixBytes = dataPtr->prefixBytes;
    fromUnicode = (const unsigned short *const *) dataPtr->fromUnicode;

3595
3596
3597
3598
3599
3600
3601
3602
3603

3604
3605
3606
3607
3608






3609
3610

3611
3612
3613
3614
3615
3616
3617
3618
	    word = 0;
	} else
#else
	if (!len) {
	    word = 0;
	} else
#endif
	    word = fromUnicode[(ch >> 8)][ch & 0xFF];


	if ((word == 0) && (ch != 0)) {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }






	    word = dataPtr->fallback; /* Both profiles REPLACE and TCL8 */
	}

	if (prefixBytes[(word >> 8)] != 0) {
	    if (dst + 1 > dstEnd) {
		result = TCL_CONVERT_NOSPACE;
		break;
	    }
	    dst[0] = (char) (word >> 8);
	    dst[1] = (char) word;
	    dst += 2;







|

>





>
>
>
>
>
>
|
|
>
|







3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
	    word = 0;
	} else
#else
	if (!len) {
	    word = 0;
	} else
#endif
	word = fromUnicode[(ch >> 8)][ch & 0xFF];

	int isWrappedLossless = 0;
	if ((word == 0) && (ch != 0)) {
	    if (PROFILE_STRICT(flags)) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }
	    if (PROFILE_LOSSLESS(flags) && IsLosslessWrapper(ch) &&
		asciiCompatible) {
		word = FromLossless(ch);
		isWrappedLossless = 1;
	    }
	    else {
		word = dataPtr->fallback; /* Both profiles REPLACE and TCL8 */
	    }
	}
	if (prefixBytes[(word >> 8)] != 0 && !isWrappedLossless) {
	    if (dst + 1 > dstEnd) {
		result = TCL_CONVERT_NOSPACE;
		break;
	    }
	    dst[0] = (char) (word >> 8);
	    dst[1] = (char) word;
	    dst += 2;
3689
3690
3691
3692
3693
3694
3695





3696
3697
3698
3699
3700
3701
3702
    /* Initialize the buffer so that some random data doesn't trick
     * Tcl_UniCharToUtf() into thinking it should combine surrogate pairs.
     * Once TCL_UTF_MAX == 3 is removed and Tcl_UniCharToUtf restored to its
     * prior non-stateful nature, this call to memset can also be removed.
     */
    memset(dst, 0xff, dstLen);
#endif






    result = TCL_OK;
    for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
	Tcl_UniChar ch = 0;

	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;







>
>
>
>
>







3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
    /* Initialize the buffer so that some random data doesn't trick
     * Tcl_UniCharToUtf() into thinking it should combine surrogate pairs.
     * Once TCL_UTF_MAX == 3 is removed and Tcl_UniCharToUtf restored to its
     * prior non-stateful nature, this call to memset can also be removed.
     */
    memset(dst, 0xff, dstLen);
#endif
    /*
     * Note with respect to profiles: all byte values are mapped
     * to Unicode characters on input so there is no question of invalid
     * 8859-1 characters.
     */

    result = TCL_OK;
    for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) {
	Tcl_UniChar ch = 0;

	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
3804
3805
3806
3807
3808
3809
3810
3811

3812
3813
3814
3815

3816
3817
3818
3819
3820
3821
3822
		break;
	    }
#if TCL_UTF_MAX < 4
	    if ((ch >= 0xD800) && (len < 3)) {
		len = 4;
	    }
#endif
	    /*

	     * Plunge on, using '?' as a fallback character.
	     */

	    ch = (Tcl_UniChar) '?'; /* Profiles TCL8 and REPLACE */

	}

	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}
	*(dst++) = (char) ch;







<
>
|
<
|
|
>







3954
3955
3956
3957
3958
3959
3960

3961
3962

3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
		break;
	    }
#if TCL_UTF_MAX < 4
	    if ((ch >= 0xD800) && (len < 3)) {
		len = 4;
	    }
#endif

	    if (PROFILE_LOSSLESS(flags) && IsLosslessWrapper(ch)) {
		ch = FromLossless(ch);

	    } else {
		ch = (Tcl_UniChar)'?';
	    }
	}

	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}
	*(dst++) = (char) ch;
4031
4032
4033
4034
4035
4036
4037
4038



4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
	     * We have a split-up or unrecognized escape sequence. If we
	     * checked all the sequences, then it's a syntax error, otherwise
	     * we need more bytes to determine a match.
	     */

	    if ((checked == dataPtr->numSubTables + 2)
		    || (flags & TCL_ENCODING_END)) {
		if (!PROFILE_STRICT(flags)) {



		    /* Unknown escape sequence */
		    dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
		    src += longest;
		    continue;
		}
		result = TCL_CONVERT_SYNTAX;
	    } else {
		result = TCL_CONVERT_MULTIBYTE;
	    }
	    break;
	}

	if (encodingPtr == NULL) {







|
>
>
>


|


<







4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196

4197
4198
4199
4200
4201
4202
4203
	     * We have a split-up or unrecognized escape sequence. If we
	     * checked all the sequences, then it's a syntax error, otherwise
	     * we need more bytes to determine a match.
	     */

	    if ((checked == dataPtr->numSubTables + 2)
		    || (flags & TCL_ENCODING_END)) {
		if (PROFILE_STRICT(flags)) {
		    result = TCL_CONVERT_SYNTAX;
		} else {
                    Tcl_Size skip = longest > left ? left : longest;
		    /* Unknown escape sequence */
		    dst += Tcl_UniCharToUtf(UNICODE_REPLACE_CHAR, dst);
		    src += skip;
		    continue;
		}

	    } else {
		result = TCL_CONVERT_MULTIBYTE;
	    }
	    break;
	}

	if (encodingPtr == NULL) {
4209
4210
4211
4212
4213
4214
4215

4216
4217
4218
4219
4220
4221
4222
4223
	    if (word == 0) {
		state = oldState;
		if (PROFILE_STRICT(flags)) {
		    result = TCL_CONVERT_UNKNOWN;
		    break;
		}
		encodingPtr = GetTableEncoding(dataPtr, state);

		tableDataPtr = (const TableEncodingData *)encodingPtr->clientData;
		word = tableDataPtr->fallback;
	    }

	    tablePrefixBytes = (const char *) tableDataPtr->prefixBytes;
	    tableFromUnicode = (const unsigned short *const *)
		    tableDataPtr->fromUnicode;








>
|







4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
	    if (word == 0) {
		state = oldState;
		if (PROFILE_STRICT(flags)) {
		    result = TCL_CONVERT_UNKNOWN;
		    break;
		}
		encodingPtr = GetTableEncoding(dataPtr, state);
		tableDataPtr =
		    (const TableEncodingData *)encodingPtr->clientData;
		word = tableDataPtr->fallback;
	    }

	    tablePrefixBytes = (const char *) tableDataPtr->prefixBytes;
	    tableFromUnicode = (const unsigned short *const *)
		    tableDataPtr->fromUnicode;

4614
4615
4616
4617
4618
4619
4620
4621
4622




















































































4623
4624
4625
4626
4627
4628
    objPtr = Tcl_NewListObj(n, NULL);
    for (i = 0; i < n; ++i) {
	Tcl_ListObjAppendElement(
	    interp, objPtr, Tcl_NewStringObj(encodingProfiles[i].name, TCL_INDEX_NONE));
    }
    Tcl_SetObjResult(interp, objPtr);
}

/*




















































































 * Local Variables:
 * mode: c
 * c-basic-offset: 4
 * fill-column: 78
 * End:
 */









>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>






4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
    objPtr = Tcl_NewListObj(n, NULL);
    for (i = 0; i < n; ++i) {
	Tcl_ListObjAppendElement(
	    interp, objPtr, Tcl_NewStringObj(encodingProfiles[i].name, TCL_INDEX_NONE));
    }
    Tcl_SetObjResult(interp, objPtr);
}

/*
 *------------------------------------------------------------------------
 *
 * TclSystemToInternalEncoding --
 *
 *    Converts a string encoded in the system encoding to Tcl's internal UTF8
 *    using the lossless profile.
 *
 * Results:
 *    TCL_OK on success, TCL_ERROR on any failure (TCL_CONVERT_* codes
 *    returned by the underlying conversion are mapped to TCL_ERROR).
 *
 * Side effects:
 *    On success *dsPtr holds the converted string and is owned by the
 *    caller. On error, *dsPtr is freed, an error message is stored in
 *    interp (if not NULL), and a POSIX error code stored in errno.
 *
 *------------------------------------------------------------------------
 */
int
TclSystemToInternalEncoding(
    Tcl_Interp *interp, /* For error messages, may be NULL */
    const char *src,    /* String in system encoding */
    Tcl_Size srcLen,    /* Number of bytes passed in */
    Tcl_DString *dsPtr) /* Pointer to uninitialized or cleared Tcl_DString. */
{
    int ret;

    /*
     * The error-location argument is deliberately NULL. When a location
     * pointer is supplied, the conversion routine reports conversion
     * failures only through that location and leaves interp untouched,
     * which would break the "error message in interp" contract above.
     */
    ret = Tcl_ExternalToUtfDStringEx(interp,
				     NULL,
				     src,
				     srcLen,
				     TCL_ENCODING_PROFILE_LOSSLESS,
				     dsPtr,
				     NULL);
    /* On TCL_OK, caller owns *dsPtr. On failure we have to free it. */
    if (ret != TCL_OK) {
	Tcl_DStringFree(dsPtr);
	ret = TCL_ERROR; /* Map TCL_CONVERT_* to TCL_ERROR */
    }
    return ret;
}

/*
 *------------------------------------------------------------------------
 *
 * TclInternalToSystemEncoding --
 *
 *    Converts a string in Tcl's internal UTF8 to the system encoding using
 *    the lossless profile.
 *
 * Results:
 *    TCL_OK on success, TCL_ERROR on any failure (TCL_CONVERT_* codes
 *    returned by the underlying conversion are mapped to TCL_ERROR).
 *
 * Side effects:
 *    On success *dsPtr holds the converted string and is owned by the
 *    caller. On error, *dsPtr is freed, an error message is stored in
 *    interp (if not NULL), and a POSIX error code stored in errno.
 *
 *------------------------------------------------------------------------
 */
int
TclInternalToSystemEncoding(
    Tcl_Interp *interp, /* For error messages, may be NULL */
    const char *src,    /* String in Tcl's internal UTF8 encoding */
    Tcl_Size srcLen,    /* Number of bytes passed in */
    Tcl_DString *dsPtr) /* Pointer to uninitialized or cleared Tcl_DString. */
{
    int ret;

    /*
     * The error-location argument is deliberately NULL. When a location
     * pointer is supplied, the conversion routine reports conversion
     * failures only through that location and leaves interp untouched,
     * which would break the "error message in interp" contract above.
     */
    ret = Tcl_UtfToExternalDStringEx(interp,
				     NULL,
				     src,
				     srcLen,
				     TCL_ENCODING_PROFILE_LOSSLESS,
				     dsPtr,
				     NULL);
    /* On TCL_OK, caller owns *dsPtr. On failure we have to free it. */
    if (ret != TCL_OK) {
	Tcl_DStringFree(dsPtr);
	ret = TCL_ERROR; /* Map TCL_CONVERT_* to TCL_ERROR */
    }
    return ret;
}


/*
 * Local Variables:
 * mode: c
 * c-basic-offset: 4
 * fill-column: 78
 * End:
 */
Changes to generic/tclEnv.c.
15
16
17
18
19
20
21

22
23


24
25

26
27
28
29
30
31
32

33



34
35





36
37
38
39
40
41
42

#include "tclInt.h"

TCL_DECLARE_MUTEX(envMutex)	/* To serialize access to environ. */

#if defined(_WIN32)
#  define tenviron _wenviron

#  define tenviron2utfdstr(string, len, dsPtr) (Tcl_DStringInit(dsPtr), \
		(char *)Tcl_Char16ToUtfDString((const unsigned short *)(string), ((((len) + 2) >> 1) - 1), (dsPtr)))


#  define utf2tenvirondstr(string, len, dsPtr) (Tcl_DStringInit(dsPtr), \
		(const WCHAR *)Tcl_UtfToChar16DString((string), (len), (dsPtr)))

#  define techar WCHAR
#  ifdef USE_PUTENV
#    define putenv(env) _wputenv((const wchar_t *)env)
#  endif
#else
#  define tenviron environ
#  define tenviron2utfdstr(tenvstr, len, dstr) \

		Tcl_ExternalToUtfDString(NULL, tenvstr, len, dstr)



#  define utf2tenvirondstr(str, len, dstr) \
		Tcl_UtfToExternalDString(NULL, str, len, dstr)





#  define techar char
#endif


/* MODULE_SCOPE */
size_t TclEnvEpoch = 0;	/* Epoch of the tcl environment
				 * (if changed with tcl-env). */







>
|
|
>
>
|
|
>






|
>
|
>
>
>
|
|
>
>
>
>
>







15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55

#include "tclInt.h"

TCL_DECLARE_MUTEX(envMutex)	/* To serialize access to environ. */

#if defined(_WIN32)
#  define tenviron _wenviron
/*
 * Windows: initialize *dsPtr and fill it with the UTF-8 form of the wide
 * environment string str. The length argument of -1 presumably means "up to
 * the terminating NUL" - confirm against Tcl_Char16ToUtfDString.
 */
static inline char *
tenviron2utfdstr(
    const WCHAR *str,
    Tcl_DString *dsPtr)
{
    char *utf;

    Tcl_DStringInit(dsPtr);
    utf = Tcl_Char16ToUtfDString(str, -1, dsPtr);
    return utf;
}
/*
 * Windows: initialize *dsPtr and fill it with the wide-character form of
 * the UTF-8 string str. The length argument of -1 presumably means "up to
 * the terminating NUL" - confirm against Tcl_UtfToChar16DString.
 */
static inline WCHAR *
utf2tenvirondstr(
    const char *str,
    Tcl_DString *dsPtr)
{
    WCHAR *native;

    Tcl_DStringInit(dsPtr);
    native = Tcl_UtfToChar16DString(str, -1, dsPtr);
    return native;
}
#  define techar WCHAR
#  ifdef USE_PUTENV
#    define putenv(env) _wputenv((const wchar_t *)env)
#  endif
#else
#  define tenviron environ
/*
 * Non-Windows: convert the native environment string str through
 * TclSystemToInternalEncoding. Returns the converted string held in *dsPtr
 * on TCL_OK, or NULL for any other status.
 */
static inline char *
tenviron2utfdstr(
    const char *str,
    Tcl_DString *dsPtr)
{
    int status = TclSystemToInternalEncoding(NULL, str, -1, dsPtr);

    return (status == TCL_OK) ? Tcl_DStringValue(dsPtr) : NULL;
}
/*
 * Non-Windows: initialize *dsPtr, then convert str through
 * TclInternalToSystemEncoding. Returns the converted string held in *dsPtr
 * on TCL_OK, or NULL for any other status.
 */
static inline char *
utf2tenvirondstr(
    const char *str,
    Tcl_DString *dsPtr)
{
    Tcl_DStringInit(dsPtr);
    if (TclInternalToSystemEncoding(NULL, str, -1, dsPtr) != TCL_OK) {
	return NULL;
    }
    return Tcl_DStringValue(dsPtr);
}
#  define techar char
#endif


/* MODULE_SCOPE */
size_t TclEnvEpoch = 0;	/* Epoch of the tcl environment
				 * (if changed with tcl-env). */
155
156
157
158
159
160
161
162




163
164
165
166
167
168
169

	Tcl_MutexLock(&envMutex);
	for (i = 0; tenviron[i] != NULL; i++) {
	    Tcl_Obj *obj1, *obj2;
	    const char *p1;
	    char *p2;

	    p1 = tenviron2utfdstr(tenviron[i], -1, &envString);




	    p2 = (char *)strchr(p1, '=');
	    if (p2 == NULL) {
		/*
		 * This condition seem to happen occasionally under some
		 * versions of Solaris, or when encoding accidents swallow the
		 * '='; ignore the entry.
		 */







|
>
>
>
>







168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186

	Tcl_MutexLock(&envMutex);
	for (i = 0; tenviron[i] != NULL; i++) {
	    Tcl_Obj *obj1, *obj2;
	    const char *p1;
	    char *p2;

	    p1 = tenviron2utfdstr(tenviron[i], &envString);
            if (p1 == NULL) {
                /* Ignore what cannot be decoded (should not happen) */
                continue;
            }
	    p2 = (char *)strchr(p1, '=');
	    if (p2 == NULL) {
		/*
		 * This condition seem to happen occasionally under some
		 * versions of Solaris, or when encoding accidents swallow the
		 * '='; ignore the entry.
		 */
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327






328
329
330
331
332
333
334
	 * Compare the new value to the existing value. If they're the same
	 * then quit immediately (e.g. don't rewrite the value or propagate it
	 * to other interpreters). Otherwise, when there are N interpreters
	 * there will be N! propagations of the same value among the
	 * interpreters.
	 */

	oldEnv = tenviron2utfdstr(tenviron[index], -1, &envString);
	if (strcmp(value, oldEnv + (length + 1)) == 0) {
	    Tcl_DStringFree(&envString);
	    Tcl_MutexUnlock(&envMutex);
	    return;
	}
	Tcl_DStringFree(&envString);

	oldValue = (char *)tenviron[index];
	nameLength = length;
    }

    /*
     * Create a new entry. Build a complete UTF string that contains a
     * "name=value" pattern. Then convert the string to the native encoding,
     * and set the environ array value.
     */

    valueLength = strlen(value);
    p = (char *)Tcl_Alloc(nameLength + valueLength + 2);
    memcpy(p, name, nameLength);
    p[nameLength] = '=';
    memcpy(p+nameLength+1, value, valueLength+1);
    p2 = utf2tenvirondstr(p, -1, &envString);







    /*
     * Copy the native string to heap memory.
     */

    p = (char *)Tcl_Realloc(p, Tcl_DStringLength(&envString) + tNTL);
    memcpy(p, p2, Tcl_DStringLength(&envString) + tNTL);







|
|
|




















|
>
>
>
>
>
>







314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
	 * Compare the new value to the existing value. If they're the same
	 * then quit immediately (e.g. don't rewrite the value or propagate it
	 * to other interpreters). Otherwise, when there are N interpreters
	 * there will be N! propagations of the same value among the
	 * interpreters.
	 */

	oldEnv = tenviron2utfdstr(tenviron[index], &envString);
	if (oldEnv == NULL || strcmp(value, oldEnv + (length + 1)) == 0) {
	    Tcl_DStringFree(&envString); /* OK even if oldEnv is NULL */
	    Tcl_MutexUnlock(&envMutex);
	    return;
	}
	Tcl_DStringFree(&envString);

	oldValue = (char *)tenviron[index];
	nameLength = length;
    }

    /*
     * Create a new entry. Build a complete UTF string that contains a
     * "name=value" pattern. Then convert the string to the native encoding,
     * and set the environ array value.
     */

    valueLength = strlen(value);
    p = (char *)Tcl_Alloc(nameLength + valueLength + 2);
    memcpy(p, name, nameLength);
    p[nameLength] = '=';
    memcpy(p+nameLength+1, value, valueLength+1);
    p2 = utf2tenvirondstr(p, &envString);
    if (p2 == NULL) {
        /* No way to signal error from here :-( but should not happen */
        Tcl_Free(p);
        Tcl_MutexUnlock(&envMutex);
        return;
    }

    /*
     * Copy the native string to heap memory.
     */

    p = (char *)Tcl_Realloc(p, Tcl_DStringLength(&envString) + tNTL);
    memcpy(p, p2, Tcl_DStringLength(&envString) + tNTL);
498
499
500
501
502
503
504
505




506
507
508
509
510
511
512
    string[length+1] = '\0';
#else
    string = (char *)Tcl_Alloc(length + 1);
    memcpy(string, name, length);
    string[length] = '\0';
#endif /* _WIN32 */

    utf2tenvirondstr(string, -1, &envString);




    string = (char *)Tcl_Realloc(string, Tcl_DStringLength(&envString) + tNTL);
    memcpy(string, Tcl_DStringValue(&envString),
	    Tcl_DStringLength(&envString) + tNTL);
    Tcl_DStringFree(&envString);

    putenv(string);








|
>
>
>
>







521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
    string[length+1] = '\0';
#else
    string = (char *)Tcl_Alloc(length + 1);
    memcpy(string, name, length);
    string[length] = '\0';
#endif /* _WIN32 */

    if (utf2tenvirondstr(string, &envString) == NULL) {
        /* Should not happen except memory alloc fail. */
        Tcl_MutexUnlock(&envMutex);
        return;
    }
    string = (char *)Tcl_Realloc(string, Tcl_DStringLength(&envString) + tNTL);
    memcpy(string, Tcl_DStringValue(&envString),
	    Tcl_DStringLength(&envString) + tNTL);
    Tcl_DStringFree(&envString);

    putenv(string);

573
574
575
576
577
578
579
580

581
582
583
584
585
586
587
588
589
590

591
592
593
594
595
596
597

    Tcl_MutexLock(&envMutex);
    index = TclpFindVariable(name, &length);
    result = NULL;
    if (index != -1) {
	Tcl_DString envStr;

	result = tenviron2utfdstr(tenviron[index], -1, &envStr);

	result += length;
	if (*result == '=') {
	    result++;
	    Tcl_DStringInit(valuePtr);
	    Tcl_DStringAppend(valuePtr, result, -1);
	    result = Tcl_DStringValue(valuePtr);
	} else {
	    result = NULL;
	}
	Tcl_DStringFree(&envStr);

    }
    Tcl_MutexUnlock(&envMutex);
    return result;
}

/*
 *----------------------------------------------------------------------







|
>
|
|
|
|
|
|
|
|
|
|
>







600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626

    Tcl_MutexLock(&envMutex);
    index = TclpFindVariable(name, &length);
    result = NULL;
    if (index != -1) {
	Tcl_DString envStr;

	result = tenviron2utfdstr(tenviron[index], &envStr);
        if (result) {
            result += length;
            if (*result == '=') {
                result++;
                Tcl_DStringInit(valuePtr);
                Tcl_DStringAppend(valuePtr, result, -1);
                result = Tcl_DStringValue(valuePtr);
            } else {
                result = NULL;
            }
            Tcl_DStringFree(&envStr);
        }
    }
    Tcl_MutexUnlock(&envMutex);
    return result;
}

/*
 *----------------------------------------------------------------------
Changes to generic/tclInt.h.
3032
3033
3034
3035
3036
3037
3038




3039
3040
3041
3042
3043
3044
3045
MODULE_SCOPE int
TclEncodingProfileNameToId(Tcl_Interp *interp,
			   const char *profileName,
			   int *profilePtr);
MODULE_SCOPE const char *TclEncodingProfileIdToName(Tcl_Interp *interp,
						    int profileId);
MODULE_SCOPE void TclGetEncodingProfiles(Tcl_Interp *interp);





/*
 * TIP #233 (Virtualized Time)
 * Data for the time hooks, if any.
 */

MODULE_SCOPE Tcl_GetTimeProc *tclGetTimeProcPtr;







>
>
>
>







3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
MODULE_SCOPE int
TclEncodingProfileNameToId(Tcl_Interp *interp,
			   const char *profileName,
			   int *profilePtr);
MODULE_SCOPE const char *TclEncodingProfileIdToName(Tcl_Interp *interp,
						    int profileId);
MODULE_SCOPE void TclGetEncodingProfiles(Tcl_Interp *interp);
/*
 * Conversion helpers between Tcl's internal string form and the system
 * encoding. Each takes a source buffer (src, srcLen) and a Tcl_DString that
 * receives the converted text.
 * NOTE(review): judging by the names, "InternalToSystem" converts outbound
 * and "SystemToInternal" converts inbound; the meaning of the int result and
 * whether dsPtr is initialized on failure are not visible from these
 * declarations -- confirm against the implementation.
 */
MODULE_SCOPE int TclInternalToSystemEncoding(Tcl_Interp *interp,
			const char *src, Tcl_Size srcLen, Tcl_DString *dsPtr);
MODULE_SCOPE int TclSystemToInternalEncoding(Tcl_Interp *interp,
			const char *src, Tcl_Size srcLen, Tcl_DString *dsPtr);

/*
 * TIP #233 (Virtualized Time)
 * Data for the time hooks, if any.
 */

MODULE_SCOPE Tcl_GetTimeProc *tclGetTimeProcPtr;
Changes to generic/tclMain.c.
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
{
    Tcl_DString ds;

#ifdef UNICODE
    Tcl_DStringInit(&ds);
    Tcl_WCharToUtfDString(string, -1, &ds);
#else
    (void)Tcl_ExternalToUtfDString(NULL, (char *)string, -1, &ds);
#endif
    return Tcl_DStringToObj(&ds);
}

/*
 * Declarations for various library functions and variables (don't want to
 * include tclPort.h here, because people might copy this file out of the Tcl







|







49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
{
    Tcl_DString ds;

#ifdef UNICODE
    Tcl_DStringInit(&ds);
    Tcl_WCharToUtfDString(string, -1, &ds);
#else
    (void) TclSystemToInternalEncoding(NULL, string, -1, &ds);
#endif
    return Tcl_DStringToObj(&ds);
}

/*
 * Declarations for various library functions and variables (don't want to
 * include tclPort.h here, because people might copy this file out of the Tcl
Changes to generic/tclTest.c.
2124
2125
2126
2127
2128
2129
2130

2131
2132
2133
2134
2135
2136
2137
	{"end", TCL_ENCODING_END},
	{"stoponerror", TCL_ENCODING_STOPONERROR},
	{"noterminate", TCL_ENCODING_NO_TERMINATE},
	{"charlimit", TCL_ENCODING_CHAR_LIMIT},
	{"profiletcl8", TCL_ENCODING_PROFILE_TCL8},
	{"profilestrict", TCL_ENCODING_PROFILE_STRICT},
	{"profilereplace", TCL_ENCODING_PROFILE_REPLACE},

	{NULL, 0}
    };
    Tcl_Size i;
    Tcl_WideInt wide;

    if (objc < 7 || objc > 10) {
        Tcl_WrongNumArgs(interp,







>







2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
	{"end", TCL_ENCODING_END},
	{"stoponerror", TCL_ENCODING_STOPONERROR},
	{"noterminate", TCL_ENCODING_NO_TERMINATE},
	{"charlimit", TCL_ENCODING_CHAR_LIMIT},
	{"profiletcl8", TCL_ENCODING_PROFILE_TCL8},
	{"profilestrict", TCL_ENCODING_PROFILE_STRICT},
	{"profilereplace", TCL_ENCODING_PROFILE_REPLACE},
	{"profilelossless", TCL_ENCODING_PROFILE_LOSSLESS},
	{NULL, 0}
    };
    Tcl_Size i;
    Tcl_WideInt wide;

    if (objc < 7 || objc > 10) {
        Tcl_WrongNumArgs(interp,
Changes to generic/tclUtil.c.
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
	     */

	    Tcl_DString native, newValue;

	    Tcl_MutexLock(&pgvPtr->mutex);
	    epoch = ++pgvPtr->epoch;
	    Tcl_UtfToExternalDStringEx(NULL, pgvPtr->encoding, pgvPtr->value,
		pgvPtr->numBytes, TCL_ENCODING_PROFILE_TCL8, &native, NULL);
	    Tcl_ExternalToUtfDStringEx(NULL, current, Tcl_DStringValue(&native),
		Tcl_DStringLength(&native), TCL_ENCODING_PROFILE_TCL8,
		&newValue, NULL);
	    Tcl_DStringFree(&native);
	    Tcl_Free(pgvPtr->value);
	    pgvPtr->value = (char *)Tcl_Alloc(Tcl_DStringLength(&newValue) + 1);
	    memcpy(pgvPtr->value, Tcl_DStringValue(&newValue),
		    Tcl_DStringLength(&newValue) + 1);
	    Tcl_DStringFree(&newValue);







|

|







4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
	     */

	    Tcl_DString native, newValue;

	    Tcl_MutexLock(&pgvPtr->mutex);
	    epoch = ++pgvPtr->epoch;
	    Tcl_UtfToExternalDStringEx(NULL, pgvPtr->encoding, pgvPtr->value,
		pgvPtr->numBytes, TCL_ENCODING_PROFILE_LOSSLESS, &native, NULL);
	    Tcl_ExternalToUtfDStringEx(NULL, current, Tcl_DStringValue(&native),
		Tcl_DStringLength(&native), TCL_ENCODING_PROFILE_LOSSLESS,
		&newValue, NULL);
	    Tcl_DStringFree(&native);
	    Tcl_Free(pgvPtr->value);
	    pgvPtr->value = (char *)Tcl_Alloc(Tcl_DStringLength(&newValue) + 1);
	    memcpy(pgvPtr->value, Tcl_DStringValue(&newValue),
		    Tcl_DStringLength(&newValue) + 1);
	    Tcl_DStringFree(&newValue);
Changes to macosx/tclMacOSXFCmd.c.
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
    const char *string;
    int result = TCL_OK;
    Tcl_DString ds;
    Tcl_Encoding encoding = Tcl_GetEncoding(NULL, "macRoman");
    Tcl_Size length;

    string = Tcl_GetStringFromObj(objPtr, &length);
    Tcl_UtfToExternalDStringEx(NULL, encoding, string, length, TCL_ENCODING_PROFILE_TCL8, &ds, NULL);

    if (Tcl_DStringLength(&ds) > 4) {
	if (interp) {
	    Tcl_SetObjResult(interp, Tcl_ObjPrintf(
		    "expected Macintosh OS type but got \"%s\": ", string));
	    Tcl_SetErrorCode(interp, "TCL", "VALUE", "MAC_OSTYPE", NULL);
	}







|







639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
    const char *string;
    int result = TCL_OK;
    Tcl_DString ds;
    Tcl_Encoding encoding = Tcl_GetEncoding(NULL, "macRoman");
    Tcl_Size length;

    string = Tcl_GetStringFromObj(objPtr, &length);
    Tcl_UtfToExternalDStringEx(NULL, encoding, string, length, TCL_ENCODING_PROFILE_LOSSLESS, &ds, NULL);

    if (Tcl_DStringLength(&ds) > 4) {
	if (interp) {
	    Tcl_SetObjResult(interp, Tcl_ObjPrintf(
		    "expected Macintosh OS type but got \"%s\": ", string));
	    Tcl_SetErrorCode(interp, "TCL", "VALUE", "MAC_OSTYPE", NULL);
	}
Changes to tests/cmdAH.test.
642
643
644
645
646
647
648
649





































































































650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
            # Failure expected
            set result $prefix
            incr expected_failidx $prefixLen
        }
        testfailindex cmdAH-4.4.14.$printable.middle convertto $enc $prefix$str$suffix $result $expected_failidx $profile
    }
}






































































































test cmdAH-4.4.xx {convertto -profile strict} -constraints {testbytestring knownBug} -body {
    # TODO - what does testbytestring even test? Invalid UTF8 in the Tcl_Obj bytes field
    encoding convertto -profile strict utf-8 A[testbytestring \x80]B
} -returnCodes error -result {unexpected byte sequence starting at index 1: '\x80'}

#
# encoding names 4.5.*
badnumargs cmdAH-4.5.1 {encoding names} {foo}
test cmdAH-4.5.2 {encoding names should include at least utf-8 and iso8859-1 and at least one more} -body {
    set names [encoding names]
    list [expr {"utf-8" in $names}] [expr {"iso8859-1" in $names}] [expr {[llength $names] > 2}]
} -result {1 1 1}

#
# encoding profiles 4.6.*
badnumargs cmdAH-4.6.1 {encoding profiles} {foo}
test cmdAH-4.6.2 {encoding profiles} -body {
    lsort [encoding profiles]
} -result {replace strict tcl8}

#
# file command

test cmdAH-5.1 {Tcl_FileObjCmd} -returnCodes error -body {
    file
} -result {wrong # args: should be "file subcommand ?arg ...?"}








>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

|
















|







642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
            # Failure expected
            set result $prefix
            incr expected_failidx $prefixLen
        }
        testfailindex cmdAH-4.4.14.$printable.middle convertto $enc $prefix$str$suffix $result $expected_failidx $profile
    }
}

proc ascii_compatible {enc} {
    # Returns 1 when every byte value 0..127 decodes, under the strict
    # profile, to the same character in $enc as it does in the ascii
    # encoding. Returns 0 as soon as one byte fails to decode or decodes
    # to a different character.
    set code 0
    while {$code < 128} {
        set byte [binary format c $code]
        if {[catch {encoding convertfrom -profile strict $enc $byte} decoded]} {
            return 0
        }
        set reference [encoding convertfrom -profile strict ascii $byte]
        if {$decoded ne $reference} {
            return 0
        }
        incr code
    }
    return 1
}

#
# Roundtrip tests for lossless profile
#
# For every row of encInvalidBytes that is tagged with the lossless profile
# and is not marked knownBug, decode the invalid bytes with -profile lossless
# and encode the result again with the same profile. The output must equal
# the original byte sequence. The ctrl field of the row selects where the
# invalid bytes are placed (solo, lead, tail, middle); an empty ctrl field
# selects all four placements.
foreach {enc hex profile str failidx ctrl comment} $encInvalidBytes {
    if {"knownBug" in $ctrl} continue
    if {$profile ne "lossless"} continue
    # There are multiple test cases based on location of invalid bytes
    set bytes [binary decode hex $hex]
    set prefix A
    set suffix B
    # Encoded forms of the valid characters placed before and after the
    # invalid bytes.
    set prefix_bytes [encoding convertto $enc $prefix]
    set suffix_bytes [encoding convertto $enc $suffix]
    # NOTE(review): prefixLen is not referenced again inside this loop.
    set prefixLen [string length $prefix_bytes]
    if {![ascii_compatible $enc]} {
        # These do not implement lossless behaviors, so the only
        # expectation is decoded output that begins with U+FFFD.
        test cmdAH-4.4.15.$hex.solo.$enc "Invalid byte under 128 for lossless profile" -body {
            encoding convertfrom -profile lossless $enc $bytes
        } -result \uFFFD* -match glob
        continue
    }
    # Invalid bytes on their own.
    if {$ctrl eq {} || "solo" in $ctrl} {
        test cmdAH-4.4.15.$hex.solo.$enc "Lossless roundtrip $hex for $enc" -body {
            set decoded [encoding convertfrom -profile lossless $enc $bytes]
            string equal $bytes [encoding convertto -profile lossless $enc $decoded]
        } -result 1
    }
    # Invalid bytes followed by a valid suffix.
    if {$ctrl eq {} || "lead" in $ctrl} {
        test cmdAH-4.4.15.$hex.lead.$enc "Lossless roundtrip $hex for $enc" -body {
            set decoded [encoding convertfrom -profile lossless $enc $bytes$suffix_bytes]
            string equal $bytes$suffix_bytes [encoding convertto -profile lossless $enc $decoded]
        } -result 1
    }
    # Invalid bytes preceded by a valid prefix.
    if {$ctrl eq {} || "tail" in $ctrl} {
        test cmdAH-4.4.15.$hex.tail.$enc "Lossless roundtrip $hex for $enc" -body {
            set decoded [encoding convertfrom -profile lossless $enc $prefix_bytes$bytes]
            string equal $prefix_bytes$bytes [encoding convertto -profile lossless $enc $decoded]
        } -result 1
    }
    # Invalid bytes between a valid prefix and a valid suffix.
    if {$ctrl eq {} || "middle" in $ctrl} {
        test cmdAH-4.4.15.$hex.middle.$enc "Lossless roundtrip $hex for $enc" -body {
            set decoded [encoding convertfrom -profile lossless $enc $prefix_bytes$bytes$suffix_bytes]
            string equal $prefix_bytes$bytes$suffix_bytes [encoding convertto -profile lossless $enc $decoded]
        } -result 1
    }
}

#
# Non-ascii encoding should not output lossless wrappers
#
# Encode the lone surrogate U+DC41 with -profile lossless in every known
# encoding and check the result by category:
#   cesu-8            - the three bytes ED B1 81
#   ascii-compatible  - the single character A
#   anything else     - the same output as encoding U+FFFD with -profile tcl8
# NOTE(review): U+DC41 is presumably the lossless wrapper for byte 0x41,
# by analogy with the U+DC80 / byte 80 pairing in the test vectors -- confirm.
foreach enc [encoding names] {
    if {$enc eq "cesu-8"} {
        test cmdAH-4.4.16.$enc "Lossless output for CESU-8" -body {
            encoding convertto -profile lossless $enc \uDC41
        } -result \xED\xB1\x81
    } elseif {[ascii_compatible $enc]} {
        test cmdAH-4.4.16.$enc "Lossless output for ascii-compatible encodings" -body {
            encoding convertto -profile lossless $enc \uDC41
        } -result A
    } else {
        test cmdAH-4.4.16.$enc "Lossless output for ascii-incompatible encodings" -body {
            encoding convertto -profile lossless $enc \uDC41
        } -result [encoding convertto -profile tcl8 $enc \uFFFD]
    }
}

#
# Invalid bytes < 128 should map to FFFD for lossless profile

# Find an invalid byte within a range for the given encoding
proc find_invalid_byte {enc {lo 0} {hi 127}} {
    # Scan the byte values lo..hi (inclusive) in ascending order and return,
    # as a one-byte binary string, the first value that the strict profile
    # refuses to decode in the given encoding.
    set value $lo
    while {$value <= $hi} {
        set candidate [binary format c $value]
        if {[catch {encoding convertfrom -profile strict $enc $candidate}]} {
            return $candidate
        }
        incr value
    }
    # Every byte in the scanned range decoded cleanly
    return ""
}
# For each encoding in which find_invalid_byte reports an undecodable byte
# within its default range of 0..127, decoding that single byte with
# -profile lossless must yield U+FFFD. Encodings with no such byte produce
# no test.
foreach enc [encoding names] {
    set byte [find_invalid_byte $enc]
    if {$byte ne ""} {
        test cmdAH-4.4.17.$enc "Invalid byte under 128 for lossless profile" -body {
            encoding convertfrom -profile lossless $enc $byte
        } -result \uFFFD
    }
}


test cmdAH-4.4.xx {convertto -profile strict} -constraints {testbytestring knownBug} -body {
    # TODO - what does testbytestring even test? Invalid UTF8 in the Tcl_Obj bytes field?
    encoding convertto -profile strict utf-8 A[testbytestring \x80]B
} -returnCodes error -result {unexpected byte sequence starting at index 1: '\x80'}

#
# encoding names 4.5.*
badnumargs cmdAH-4.5.1 {encoding names} {foo}
test cmdAH-4.5.2 {encoding names should include at least utf-8 and iso8859-1 and at least one more} -body {
    set names [encoding names]
    list [expr {"utf-8" in $names}] [expr {"iso8859-1" in $names}] [expr {[llength $names] > 2}]
} -result {1 1 1}

#
# encoding profiles 4.6.*
badnumargs cmdAH-4.6.1 {encoding profiles} {foo}
test cmdAH-4.6.2 {encoding profiles} -body {
    lsort [encoding profiles]
} -result {lossless replace strict tcl8}

#
# file command

test cmdAH-5.1 {Tcl_FileObjCmd} -returnCodes error -body {
    file
} -result {wrong # args: should be "file subcommand ?arg ...?"}
Changes to tests/encoding.test.
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
test encoding-24.15.tcl8 {Parse invalid utf-8, -profile tcl8} -body {
    encoding convertfrom -profile tcl8 utf-8 "Z\xE0\x80"
} -result Z\xE0\u20AC
test encoding-24.16 {Parse valid or invalid utf-8} -constraints testbytestring -body {
    encoding convertto utf-8 [testbytestring "Z\u4343\x80"]
} -returnCodes 1 -result {expected byte sequence but character 1 was '䍃€' (U+004343)}
test encoding-24.17 {Parse valid or invalid utf-8} -constraints testbytestring -body {
    encoding convertto utf-8 [testbytestring "Z\xE0\x80"]
} -result "Z\xC3\xA0\xE2\x82\xAC"
test encoding-24.18 {Parse valid or invalid utf-8} -constraints testbytestring -body {
    encoding convertto utf-8 [testbytestring "Z\xE0\x80xxxxxx"]
} -result "Z\xC3\xA0\xE2\x82\xACxxxxxx"
test encoding-24.19.1 {Parse valid or invalid utf-8} -body {
    encoding convertto -profile tcl8 utf-8 "ZX\uD800"
} -result ZX\xED\xA0\x80







|







809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
test encoding-24.15.tcl8 {Parse invalid utf-8, -profile tcl8} -body {
    encoding convertfrom -profile tcl8 utf-8 "Z\xE0\x80"
} -result Z\xE0\u20AC
test encoding-24.16 {Parse valid or invalid utf-8} -constraints testbytestring -body {
    encoding convertto utf-8 [testbytestring "Z\u4343\x80"]
} -returnCodes 1 -result {expected byte sequence but character 1 was '䍃€' (U+004343)}
test encoding-24.17 {Parse valid or invalid utf-8} -constraints testbytestring -body {
    encoding convertto -profile tcl8 utf-8 [testbytestring "Z\xE0\x80"]
} -result "Z\xC3\xA0\xE2\x82\xAC"
test encoding-24.18 {Parse valid or invalid utf-8} -constraints testbytestring -body {
    encoding convertto utf-8 [testbytestring "Z\xE0\x80xxxxxx"]
} -result "Z\xC3\xA0\xE2\x82\xACxxxxxx"
test encoding-24.19.1 {Parse valid or invalid utf-8} -body {
    encoding convertto -profile tcl8 utf-8 "ZX\uD800"
} -result ZX\xED\xA0\x80
1164
1165
1166
1167
1168
1169
1170








1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182








1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
    encoding convertfrom -profile tcl8 gb12345 x
} -result x
test encoding-bug-66ffafd309-1-strict {Bug [66ffafd309] - truncated DBCS} -body {
    encoding convertfrom -profile strict gb12345 x
} -result {unexpected byte sequence starting at index 0: '\x78'} -returnCodes error
test encoding-bug-66ffafd309-1-replace {Bug [66ffafd309] - truncated DBCS} -body {
    encoding convertfrom -profile replace gb12345 x








} -result \uFFFD
test encoding-bug-66ffafd309-2-tcl8 {Bug [66ffafd309] - invalid DBCS} -body {
    # Not truncated but invalid
    encoding convertfrom -profile tcl8 jis0208 \x78\x79
} -result \x78\x79
test encoding-bug-66ffafd309-2-strict {Bug [66ffafd309] - invalid DBCS} -body {
    # Not truncated but invalid
    encoding convertfrom -profile strict jis0208 \x78\x79
} -result {unexpected byte sequence starting at index 1: '\x79'} -returnCodes error
test encoding-bug-66ffafd309-2-replace {Bug [66ffafd309] - invalid DBCS} -body {
    # Not truncated but invalid
    encoding convertfrom -profile replace jis0208 \x78\x79








} -result \uFFFD\uFFFD

# cleanup
namespace delete ::tcl::test::encoding
::tcltest::cleanupTests
return

# Local Variables:
# mode: tcl
# End:







>
>
>
>
>
>
>
>












>
>
>
>
>
>
>
>










1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
    encoding convertfrom -profile tcl8 gb12345 x
} -result x
test encoding-bug-66ffafd309-1-strict {Bug [66ffafd309] - truncated DBCS} -body {
    encoding convertfrom -profile strict gb12345 x
} -result {unexpected byte sequence starting at index 0: '\x78'} -returnCodes error
test encoding-bug-66ffafd309-1-replace {Bug [66ffafd309] - truncated DBCS} -body {
    encoding convertfrom -profile replace gb12345 x
} -result \uFFFD
test encoding-bug-66ffafd309-1-lossless-a {Bug [66ffafd309] - truncated DBCS} -body {
    # lossless - byte < 128
    encoding convertfrom -profile lossless gb12345 x
} -result \uFFFD
test encoding-bug-66ffafd309-1-lossless-b {Bug [66ffafd309] - truncated DBCS} -body {
    # lossless - byte > 128
    encoding convertfrom -profile lossless gb12345 \x82
} -result \uFFFD
test encoding-bug-66ffafd309-2-tcl8 {Bug [66ffafd309] - invalid DBCS} -body {
    # Not truncated but invalid
    encoding convertfrom -profile tcl8 jis0208 \x78\x79
} -result \x78\x79
test encoding-bug-66ffafd309-2-strict {Bug [66ffafd309] - invalid DBCS} -body {
    # Not truncated but invalid
    encoding convertfrom -profile strict jis0208 \x78\x79
} -result {unexpected byte sequence starting at index 1: '\x79'} -returnCodes error
test encoding-bug-66ffafd309-2-replace {Bug [66ffafd309] - invalid DBCS} -body {
    # Not truncated but invalid
    encoding convertfrom -profile replace jis0208 \x78\x79
} -result \uFFFD\uFFFD
test encoding-bug-66ffafd309-2-lossless-a {Bug [66ffafd309] - invalid DBCS} -body {
    # Not truncated but invalid. \x78 is invalid prefix
    encoding convertfrom -profile lossless jis0208 \x78\x79
} -result \uFFFD\uFFFD
test encoding-bug-66ffafd309-2-lossless-b {Bug [66ffafd309] - invalid DBCS} -body {
    # Not truncated but invalid. \x21 is valid prefix but FF is not valid suffix
    encoding convertfrom -profile lossless jis0208 \x21\xFF
} -result \uFFFD\uFFFD

# cleanup
namespace delete ::tcl::test::encoding
::tcltest::cleanupTests
return

# Local Variables:
# mode: tcl
# End:
Changes to tests/encodingVectors.tcl.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# This file contains test vectors for verifying various encodings. They are
# stored in a common file so that they can be sourced into the various test
# modules that are dependent on encodings. This file contains statically defined
# test vectors. In addition, it sources the ICU-generated test vectors from
# icuUcmTests.tcl.
#
# Note that sourcing the file will reinitialize any existing encoding test
# vectors.
#

# List of defined encoding profiles
set encProfiles {tcl8 strict replace}
set encDefaultProfile strict; # Should reflect the default from implementation

# encValidStrings - Table of valid strings.
#
# Each row is <ENCODING STR BYTES CTRL COMMENT>
# The pair <ENCODING,STR> should be unique for generated test ids to be unique.
# STR is a string that can be encoded in the encoding ENCODING resulting











|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# This file contains test vectors for verifying various encodings. They are
# stored in a common file so that they can be sourced into the various test
# modules that are dependent on encodings. This file contains statically defined
# test vectors. In addition, it sources the ICU-generated test vectors from
# icuUcmTests.tcl.
#
# Note that sourcing the file will reinitialize any existing encoding test
# vectors.
#

# List of defined encoding profiles
set encProfiles {tcl8 strict replace lossless}
set encDefaultProfile strict; # Should reflect the default from implementation

# encValidStrings - Table of valid strings.
#
# Each row is <ENCODING STR BYTES CTRL COMMENT>
# The pair <ENCODING,STR> should be unique for generated test ids to be unique.
# STR is a string that can be encoded in the encoding ENCODING resulting
107
108
109
110
111
112
113

114
115
116
117
118
119
120
# ascii - Any byte above 127 is invalid and is mapped
# to the same numeric code point except for the range
# 80-9F which is treated as cp1252.
# This tests the TableToUtfProc code path.
lappend encInvalidBytes {*}{
    ascii 80 tcl8    \u20AC -1 {knownBug} {map to cp1252}
    ascii 80 replace \uFFFD -1 {} {Smallest invalid byte}

    ascii 80 strict  {}      0 {} {Smallest invalid byte}
    ascii 81 tcl8    \u0081 -1 {knownBug} {map to cp1252}
    ascii 82 tcl8    \u201A -1 {knownBug} {map to cp1252}
    ascii 83 tcl8    \u0192 -1 {knownBug} {map to cp1252}
    ascii 84 tcl8    \u201E -1 {knownBug} {map to cp1252}
    ascii 85 tcl8    \u2026 -1 {knownBug} {map to cp1252}
    ascii 86 tcl8    \u2020 -1 {knownBug} {map to cp1252}







>







107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# ascii - Any byte above 127 is invalid and is mapped
# to the same numeric code point except for the range
# 80-9F which is treated as cp1252.
# This tests the TableToUtfProc code path.
lappend encInvalidBytes {*}{
    ascii 80 tcl8    \u20AC -1 {knownBug} {map to cp1252}
    ascii 80 replace \uFFFD -1 {} {Smallest invalid byte}
    ascii 80 lossless \uDC80 -1 {} {Smallest invalid byte}
    ascii 80 strict  {}      0 {} {Smallest invalid byte}
    ascii 81 tcl8    \u0081 -1 {knownBug} {map to cp1252}
    ascii 82 tcl8    \u201A -1 {knownBug} {map to cp1252}
    ascii 83 tcl8    \u0192 -1 {knownBug} {map to cp1252}
    ascii 84 tcl8    \u201E -1 {knownBug} {map to cp1252}
    ascii 85 tcl8    \u2026 -1 {knownBug} {map to cp1252}
    ascii 86 tcl8    \u2020 -1 {knownBug} {map to cp1252}
142
143
144
145
146
147
148

149
150
151
152
153
154
155
    ascii 9C tcl8    \u0153 -1 {knownBug} {map to cp1252}
    ascii 9D tcl8    \u009D -1 {knownBug} {map to cp1252}
    ascii 9E tcl8    \u017E -1 {knownBug} {map to cp1252}
    ascii 9F tcl8    \u0178 -1 {knownBug} {map to cp1252}

    ascii FF tcl8    \u00FF -1 {} {Largest invalid byte}
    ascii FF replace \uFFFD -1 {} {Largest invalid byte}

    ascii FF strict  {}      0 {} {Largest invalid byte}
}

# utf-8 - valid sequences based on Table 3.7 in the Unicode
# standard.
#
# Code Points        First   Second  Third   Fourth Byte







>







143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
    ascii 9C tcl8    \u0153 -1 {knownBug} {map to cp1252}
    ascii 9D tcl8    \u009D -1 {knownBug} {map to cp1252}
    ascii 9E tcl8    \u017E -1 {knownBug} {map to cp1252}
    ascii 9F tcl8    \u0178 -1 {knownBug} {map to cp1252}

    ascii FF tcl8    \u00FF -1 {} {Largest invalid byte}
    ascii FF replace \uFFFD -1 {} {Largest invalid byte}
    ascii FF lossless \uDCFF -1 {} {Largest invalid byte}
    ascii FF strict  {}      0 {} {Largest invalid byte}
}

# utf-8 - valid sequences based on Table 3.7 in the Unicode
# standard.
#
# Code Points        First   Second  Third   Fourth Byte
166
167
168
169
170
171
172

173
174
175
176
177
178
179
# Tests below are based on the "gaps" in the above table. Note ascii test
# values are repeated because internally a different code path is used
# (UtfToUtfProc).
# Note C0, C1, F5:FF are invalid bytes ANYWHERE. Exception is C080
lappend encInvalidBytes {*}{
    utf-8 80 tcl8    \u20AC -1 {} {map to cp1252}
    utf-8 80 replace \uFFFD -1 {} {Smallest invalid byte}

    utf-8 80 strict  {}      0 {} {Smallest invalid byte}
    utf-8 81 tcl8    \u0081 -1 {} {map to cp1252}
    utf-8 82 tcl8    \u201A -1 {} {map to cp1252}
    utf-8 83 tcl8    \u0192 -1 {} {map to cp1252}
    utf-8 84 tcl8    \u201E -1 {} {map to cp1252}
    utf-8 85 tcl8    \u2026 -1 {} {map to cp1252}
    utf-8 86 tcl8    \u2020 -1 {} {map to cp1252}







>







168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# Tests below are based on the "gaps" in the above table. Note ascii test
# values are repeated because internally a different code path is used
# (UtfToUtfProc).
# Note C0, C1, F5:FF are invalid bytes ANYWHERE. Exception is C080
lappend encInvalidBytes {*}{
    utf-8 80 tcl8    \u20AC -1 {} {map to cp1252}
    utf-8 80 replace \uFFFD -1 {} {Smallest invalid byte}
    utf-8 80 lossless \uDC80 -1 {} {Smallest invalid byte}
    utf-8 80 strict  {}      0 {} {Smallest invalid byte}
    utf-8 81 tcl8    \u0081 -1 {} {map to cp1252}
    utf-8 82 tcl8    \u201A -1 {} {map to cp1252}
    utf-8 83 tcl8    \u0192 -1 {} {map to cp1252}
    utf-8 84 tcl8    \u201E -1 {} {map to cp1252}
    utf-8 85 tcl8    \u2026 -1 {} {map to cp1252}
    utf-8 86 tcl8    \u2020 -1 {} {map to cp1252}
202
203
204
205
206
207
208

209
210
211

212
213

214
215
216

217
218
219

220
221
222

223
224
225
226

227
228
229

230
231
232

233
234
235
236

237
238
239

240
241
242

243
244
245

246
247
248

249
250
251
252

253
254
255

256
257
258

259
260
261

262
263
264

265
266
267

268
269
270

271
272
273

274
275
276
277

278
279
280

281
282
283

284
285
286

287
288
289

290
291
292

293
294
295

296
297
298

299
300
301

302
303
304

305
306
307

308
309
310

311
312
313
314
315
316
317

318
319
320

321
322
323

324
325
326

327
328
329

330
331
332

333
334
335

336
337
338

339
340
341

342
343
344

345
346
347

348
349
350

351
352
353
354

355
356
357

358
359
360

361
362
363

364
365
366

367
368
369

370
371
372

373
374
375

376
377
378

379
380
381

382
383
384

385
386
387

388
389
390

391
392
393

394
395
396
397

398
399
400

401
402
403

404
405
406

407
408
409

410
411
412

413
414
415

416
417
418

419
420
421

422
423
424

425
426
427

428
429
430

431
432
433
434

435
436
437

438
439
440

441
442
443

444
445
446

447
448
449

450
451
452

453
454
455

456
457
458

459
460
461

462
463
464

465
466
467

468
469
470

471
472
473

474
475
476

477
478
479

480
481
482

483
484
485

486
487
488

489
490
491

492
493
494

495
496
497

498
499
500
501

502
503
504

505
506
507

508
509
510

511
512
513

514
515
516

517
518
519

520
521
522

523
524
525

526
527
528

529
530
531

532
533
534
535

536
537
538

539
540
541

542

543

544

545
546
547
548
549
550
551
552
553
554


555
556

557
558
559

560
561
562
563
564


565
566

567
568
569

570
571
572
573
574
575
576
577
578
579

580
581
582

583
584
585

586
587
588

589
590
591

592
593
594

595
596

597
598
599

600
601
602
603
604

605
606
607

608
609
610

611
612
613

614
615
616

617
618
619

620
621

622
623
624

625
626
627





628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654

655
    utf-8 9D tcl8    \u009D -1 {} {map to cp1252}
    utf-8 9E tcl8    \u017E -1 {} {map to cp1252}
    utf-8 9F tcl8    \u0178 -1 {} {map to cp1252}

    utf-8 C0 tcl8    \u00C0 -1 {} {C0 is invalid anywhere}
    utf-8 C0 strict  {}      0 {} {C0 is invalid anywhere}
    utf-8 C0 replace \uFFFD -1 {} {C0 is invalid anywhere}

    utf-8 C080 tcl8    \u0000 -1 {} {C080 -> U+0 in Tcl's internal modified UTF8}
    utf-8 C080 strict  {}      0 {} {C080 -> invalid}
    utf-8 C080 replace \uFFFD -1 {} {C080 -> single replacement char}

    utf-8 C0A2 tcl8    \u00C0\u00A2 -1 {} {websec.github.io - A}
    utf-8 C0A2 replace \uFFFD\uFFFD -1 {} {websec.github.io - A}

    utf-8 C0A2 strict  {}            0 {} {websec.github.io - A}
    utf-8 C0A7 tcl8    \u00C0\u00A7 -1 {} {websec.github.io - double quote}
    utf-8 C0A7 replace \uFFFD\uFFFD -1 {} {websec.github.io - double quote}

    utf-8 C0A7 strict  {}            0 {} {websec.github.io - double quote}
    utf-8 C0AE tcl8    \u00C0\u00AE -1 {} {websec.github.io - full stop}
    utf-8 C0AE replace \uFFFD\uFFFD -1 {} {websec.github.io - full stop}

    utf-8 C0AE strict  {}            0 {} {websec.github.io - full stop}
    utf-8 C0AF tcl8    \u00C0\u00AF -1 {} {websec.github.io - solidus}
    utf-8 C0AF replace \uFFFD\uFFFD -1 {} {websec.github.io - solidus}

    utf-8 C0AF strict  {}            0 {} {websec.github.io - solidus}

    utf-8 C1 tcl8    \u00C1 -1 {} {C1 is invalid everywhere}
    utf-8 C1 replace \uFFFD -1 {} {C1 is invalid everywhere}

    utf-8 C1 strict  {}      0 {} {C1 is invalid everywhere}
    utf-8 C181 tcl8    \u00C1\u0081 -1 {} {websec.github.io - base test (A)}
    utf-8 C181 replace \uFFFD\uFFFD -1 {} {websec.github.io - base test (A)}

    utf-8 C181 strict  {}            0 {} {websec.github.io - base test (A)}
    utf-8 C19C tcl8    \u00C1\u0153 -1 {} {websec.github.io - reverse solidus}
    utf-8 C19C replace \uFFFD\uFFFD -1 {} {websec.github.io - reverse solidus}

    utf-8 C19C strict  {}            0 {} {websec.github.io - reverse solidus}

    utf-8 C2 tcl8      \u00C2     -1 {} {Missing trail byte}
    utf-8 C2 replace   \uFFFD     -1 {} {Missing trail byte}

    utf-8 C2 strict    {}          0 {} {Missing trail byte}
    utf-8 C27F tcl8    \u00C2\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 C27F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF}

    utf-8 C27F strict  {}          0 {} {Trail byte must be 80:BF}
    utf-8 DF tcl8      \u00DF     -1 {} {Missing trail byte}
    utf-8 DF replace   \uFFFD     -1 {} {Missing trail byte}

    utf-8 DF strict    {}          0 {} {Missing trail byte}
    utf-8 DF7F tcl8    \u00DF\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 DF7F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF}

    utf-8 DF7F strict  {}          0 {} {Trail byte must be 80:BF}
    utf-8 DFE0A080 tcl8    \u00DF\u0800 -1 {} {Invalid trail byte is start of valid sequence}
    utf-8 DFE0A080 replace \uFFFD\u0800 -1 {} {Invalid trail byte is start of valid sequence}

    utf-8 DFE0A080 strict  {}            0 {} {Invalid trail byte is start of valid sequence}

    utf-8 E0 tcl8      \u00E0     -1 {} {Missing trail byte}
    utf-8 E0 replace   \uFFFD     -1 {} {Missing trail byte}

    utf-8 E0 strict    {}          0 {} {Missing trail byte}
    utf-8 E080 tcl8      \u00E0\u20AC   -1 {} {First trail byte must be A0:BF}
    utf-8 E080 replace   \uFFFD\uFFFD   -1 {} {First trail byte must be A0:BF}

    utf-8 E080 strict    {}              0 {} {First trail byte must be A0:BF}
    utf-8 E0819C tcl8    \u00E0\u0081\u0153 -1 {} {websec.github.io - reverse solidus}
    utf-8 E0819C replace \uFFFD\uFFFD\uFFFD -1 {} {websec.github.io - reverse solidus}

    utf-8 E0819C strict  {}                  0 {} {websec.github.io - reverse solidus}
    utf-8 E09F tcl8      \u00E0\u0178   -1 {} {First trail byte must be A0:BF}
    utf-8 E09F replace   \uFFFD\uFFFD   -1 {} {First trail byte must be A0:BF}

    utf-8 E09F strict    {}              0 {} {First trail byte must be A0:BF}
    utf-8 E0A0 tcl8      \u00E0\u00A0   -1 {} {Missing second trail byte}
    utf-8 E0A0 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 E0A0 strict    {}              0 {} {Missing second trail byte}
    utf-8 E0BF tcl8      \u00E0\u00BF   -1 {} {Missing second trail byte}
    utf-8 E0BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 E0BF strict    {}              0 {} {Missing second trail byte}
    utf-8 E0A07F tcl8    \u00E0\u00A0\x7F   -1 {}     {Second trail byte must be 80:BF}
    utf-8 E0A07F replace \uFFFD\u7F         -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 E0A07F strict  {}                  0 {}         {Second trail byte must be 80:BF}
    utf-8 E0BF7F tcl8    \u00E0\u00BF\x7F   -1 {}         {Second trail byte must be 80:BF}
    utf-8 E0BF7F replace \uFFFD\u7F         -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 E0BF7F strict  {}                  0 {}         {Second trail byte must be 80:BF}

    utf-8 E1 tcl8      \u00E1     -1 {} {Missing trail byte}
    utf-8 E1 replace   \uFFFD     -1 {} {Missing trail byte}

    utf-8 E1 strict    {}          0 {} {Missing trail byte}
    utf-8 E17F tcl8    \u00E1\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 E17F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF}

    utf-8 E17F strict  {}          0 {} {Trail byte must be 80:BF}
    utf-8 E181 tcl8      \u00E1\u0081   -1 {} {Missing second trail byte}
    utf-8 E181 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 E181 strict    {}              0 {} {Missing second trail byte}
    utf-8 E1BF tcl8      \u00E1\u00BF   -1 {} {Missing second trail byte}
    utf-8 E1BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 E1BF strict    {}              0 {} {Missing second trail byte}
    utf-8 E1807F tcl8    \u00E1\u20AC\x7F   -1 {} {Second trail byte must be 80:BF}
    utf-8 E1807F replace \uFFFD\u7F         -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 E1807F strict  {}                  0 {}         {Second trail byte must be 80:BF}
    utf-8 E1BF7F tcl8    \u00E1\u00BF\x7F   -1 {}         {Second trail byte must be 80:BF}
    utf-8 E1BF7F replace \uFFFD\u7F         -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 E1BF7F strict  {}                  0 {}         {Second trail byte must be 80:BF}
    utf-8 EC tcl8      \u00EC     -1 {} {Missing trail byte}
    utf-8 EC replace   \uFFFD     -1 {} {Missing trail byte}

    utf-8 EC strict    {}          0 {} {Missing trail byte}
    utf-8 EC7F tcl8    \u00EC\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 EC7F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF}

    utf-8 EC7F strict  {}          0 {} {Trail byte must be 80:BF}
    utf-8 EC81 tcl8      \u00EC\u0081   -1 {} {Missing second trail byte}
    utf-8 EC81 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 EC81 strict    {}              0 {} {Missing second trail byte}
    utf-8 ECBF tcl8      \u00EC\u00BF   -1 {} {Missing second trail byte}
    utf-8 ECBF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 ECBF strict    {}              0 {} {Missing second trail byte}
    utf-8 EC807F tcl8    \u00EC\u20AC\x7F   -1 {} {Second trail byte must be 80:BF}
    utf-8 EC807F replace \uFFFD\u7F         -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 EC807F strict  {}                  0 {}         {Second trail byte must be 80:BF}
    utf-8 ECBF7F tcl8    \u00EC\u00BF\x7F   -1 {}         {Second trail byte must be 80:BF}
    utf-8 ECBF7F replace \uFFFD\u7F         -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 ECBF7F strict  {}                  0 {}         {Second trail byte must be 80:BF}

    utf-8 ED tcl8       \u00ED        -1 {} {Missing trail byte}
    utf-8 ED replace    \uFFFD        -1 {} {Missing trail byte}
    utf-8 ED strict     {}             0 {} {Missing trail byte}
    utf-8 ED7F tcl8     \u00ED\u7F    -1 {} {First trail byte must be 80:9F}
    utf-8 ED7F replace  \uFFFD\u7F    -1 {} {First trail byte must be 80:9F}

    utf-8 ED7F strict   {}             0 {} {First trail byte must be 80:9F}
    utf-8 EDA0 tcl8     \u00ED\u00A0  -1 {} {First trail byte must be 80:9F}
    utf-8 EDA0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:9F}

    utf-8 EDA0 strict   {}             0 {} {First trail byte must be 80:9F}
    utf-8 ED81 tcl8      \u00ED\u0081   -1 {} {Missing second trail byte}
    utf-8 ED81 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 ED81 strict    {}              0 {} {Missing second trail byte}
    utf-8 EDBF tcl8      \u00ED\u00BF   -1 {} {Missing second trail byte}
    utf-8 EDBF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 EDBF strict    {}              0 {} {Missing second trail byte}
    utf-8 ED807F tcl8      \u00ED\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 ED807F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 ED807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 ED9F7F tcl8      \u00ED\u0178\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 ED9F7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 ED9F7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 EDA080 tcl8       \uD800          -1 {}  {High surrogate}
    utf-8 EDA080 replace    \uFFFD          -1 {knownBug}  {High surrogate}

    utf-8 EDA080 strict     {}               0 {}  {High surrogate}
    utf-8 EDAFBF tcl8       \uDBFF          -1 {}  {High surrogate}
    utf-8 EDAFBF replace    \uFFFD          -1 {knownBug}  {High surrogate}

    utf-8 EDAFBF strict     {}               0 {}  {High surrogate}
    utf-8 EDB080 tcl8       \uDC00          -1 {}  {Low surrogate}
    utf-8 EDB080 replace    \uFFFD          -1 {knownBug}  {Low surrogate}

    utf-8 EDB080 strict     {}               0 {}  {Low surrogate}
    utf-8 EDBFBF tcl8       \uDFFF          -1 {knownBug}  {Low surrogate}
    utf-8 EDBFBF replace    \uFFFD          -1 {knownBug}  {Low surrogate}

    utf-8 EDBFBF strict     {}               0 {}  {Low surrogate}
    utf-8 EDA080EDB080 tcl8 \U00010000      -1 {knownBug}  {High low surrogate pair}
    utf-8 EDA080EDB080 replace \uFFFD\uFFFD -1 {knownBug}  {High low surrogate pair}

    utf-8 EDA080EDB080 strict {}             0 {}  {High low surrogate pair}
    utf-8 EDAFBFEDBFBF tcl8 \U0010FFFF      -1 {knownBug}  {High low surrogate pair}
    utf-8 EDAFBFEDBFBF replace \uFFFD\uFFFD -1 {knownBug}  {High low surrogate pair}

    utf-8 EDAFBFEDBFBF strict {}             0 {}  {High low surrogate pair}

    utf-8 EE tcl8       \u00EE        -1 {} {Missing trail byte}
    utf-8 EE replace    \uFFFD        -1 {} {Missing trail byte}

    utf-8 EE strict     {}             0 {} {Missing trail byte}
    utf-8 EE7F tcl8     \u00EE\u7F    -1 {} {First trail byte must be 80:BF}
    utf-8 EE7F replace  \uFFFD\u7F    -1 {} {First trail byte must be 80:BF}

    utf-8 EE7F strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 EED0 tcl8     \u00EE\u00D0  -1 {} {First trail byte must be 80:BF}
    utf-8 EED0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:BF}

    utf-8 EED0 strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 EE81 tcl8      \u00EE\u0081   -1 {} {Missing second trail byte}
    utf-8 EE81 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 EE81 strict    {}              0 {} {Missing second trail byte}
    utf-8 EEBF tcl8      \u00EE\u00BF   -1 {} {Missing second trail byte}
    utf-8 EEBF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 EEBF strict    {}              0 {} {Missing second trail byte}
    utf-8 EE807F tcl8      \u00EE\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EE807F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 EE807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 EEBF7F tcl8      \u00EE\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EEBF7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 EEBF7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 EF tcl8       \u00EF        -1 {} {Missing trail byte}
    utf-8 EF replace    \uFFFD        -1 {} {Missing trail byte}

    utf-8 EF strict     {}             0 {} {Missing trail byte}
    utf-8 EF7F tcl8     \u00EF\u7F    -1 {} {First trail byte must be 80:BF}
    utf-8 EF7F replace  \uFFFD\u7F    -1 {} {First trail byte must be 80:BF}

    utf-8 EF7F strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 EFD0 tcl8     \u00EF\u00D0  -1 {} {First trail byte must be 80:BF}
    utf-8 EFD0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:BF}

    utf-8 EFD0 strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 EF81 tcl8      \u00EF\u0081   -1 {} {Missing second trail byte}
    utf-8 EF81 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 EF81 strict    {}              0 {} {Missing second trail byte}
    utf-8 EFBF tcl8      \u00EF\u00BF   -1 {} {Missing second trail byte}
    utf-8 EFBF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 EFBF strict    {}              0 {} {Missing second trail byte}
    utf-8 EF807F tcl8      \u00EF\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EF807F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 EF807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 EFBF7F tcl8      \u00EF\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EFBF7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 EFBF7F strict    {}                0 {}  {Second trail byte must be 80:BF}

    utf-8 F0 tcl8       \u00F0        -1 {} {Missing trail byte}
    utf-8 F0 replace    \uFFFD        -1 {} {Missing trail byte}

    utf-8 F0 strict     {}             0 {} {Missing trail byte}
    utf-8 F080 tcl8     \u00F0\u20AC  -1 {} {First trail byte must be 90:BF}
    utf-8 F080 replace  \uFFFD        -1 {knownW3C} {First trail byte must be 90:BF}

    utf-8 F080 strict   {}             0 {} {First trail byte must be 90:BF}
    utf-8 F08F tcl8     \u00F0\u8F    -1 {} {First trail byte must be 90:BF}
    utf-8 F08F replace  \uFFFD        -1 {knownW3C} {First trail byte must be 90:BF}

    utf-8 F08F strict   {}             0 {} {First trail byte must be 90:BF}
    utf-8 F0D0 tcl8     \u00F0\u00D0  -1 {} {First trail byte must be 90:BF}
    utf-8 F0D0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 90:BF}

    utf-8 F0D0 strict   {}             0 {} {First trail byte must be 90:BF}
    utf-8 F090 tcl8      \u00F0\u0090   -1 {} {Missing second trail byte}
    utf-8 F090 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 F090 strict    {}              0 {} {Missing second trail byte}
    utf-8 F0BF tcl8      \u00F0\u00BF   -1 {} {Missing second trail byte}
    utf-8 F0BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 F0BF strict    {}              0 {} {Missing second trail byte}
    utf-8 F0907F tcl8      \u00F0\u0090\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F0907F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 F0907F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F0BF7F tcl8      \u00F0\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F0BF7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 F0BF7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F090BF tcl8      \u00F0\u0090\u00BF   -1 {} {Missing third trail byte}
    utf-8 F090BF replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}

    utf-8 F090BF strict    {}              0 {} {Missing third trail byte}
    utf-8 F0BF81 tcl8      \u00F0\u00BF\u0081   -1 {} {Missing third trail byte}
    utf-8 F0BF81 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}

    utf-8 F0BF81 strict    {}              0 {} {Missing third trail byte}
    utf-8 F0BF807F tcl8      \u00F0\u00BF\u20AC\x7F   -1 {} {Third trail byte must be 80:BF}
    utf-8 F0BF817F replace   \uFFFD\x7F           -1 {knownW3C} {Third trail byte must be 80:BF}

    utf-8 F0BF817F strict    {}              0 {} {Third trail byte must be 80:BF}
    utf-8 F090BFD0 tcl8      \u00F0\u0090\u00BF\u00D0   -1 {} {Third trail byte must be 80:BF}
    utf-8 F090BFD0 replace   \uFFFD         -1 {knownW3C} {Third trail byte must be 80:BF}

    utf-8 F090BFD0 strict    {}              0 {} {Third trail byte must be 80:BF}

    utf-8 F1 tcl8       \u00F1        -1 {} {Missing trail byte}
    utf-8 F1 replace    \uFFFD        -1 {} {Missing trail byte}

    utf-8 F1 strict     {}             0 {} {Missing trail byte}
    utf-8 F17F tcl8     \u00F1\u7F    -1 {} {First trail byte must be 80:BF}
    utf-8 F17F replace  \uFFFD        -1 {knownW3C} {First trail byte must be 80:BF}

    utf-8 F17F strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 F1D0 tcl8     \u00F1\u00D0  -1 {} {First trail byte must be 80:BF}
    utf-8 F1D0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:BF}

    utf-8 F1D0 strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 F180 tcl8      \u00F1\u20AC   -1 {} {Missing second trail byte}
    utf-8 F180 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 F180 strict    {}              0 {} {Missing second trail byte}
    utf-8 F1BF tcl8      \u00F1\u00BF   -1 {} {Missing second trail byte}
    utf-8 F1BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 F1BF strict    {}              0 {} {Missing second trail byte}
    utf-8 F1807F tcl8      \u00F1\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F1807F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 F1807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F1BF7F tcl8      \u00F1\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F1BF7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 F1BF7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F180BF tcl8      \u00F1\u20AC\u00BF   -1 {} {Missing third trail byte}
    utf-8 F180BF replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}

    utf-8 F180BF strict    {}              0 {} {Missing third trail byte}
    utf-8 F1BF81 tcl8      \u00F1\u00BF\u0081   -1 {} {Missing third trail byte}
    utf-8 F1BF81 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}

    utf-8 F1BF81 strict    {}              0 {} {Missing third trail byte}
    utf-8 F1BF807F tcl8      \u00F1\u00BF\u20AC\x7F   -1 {} {Third trail byte must be 80:BF}
    utf-8 F1BF817F replace   \uFFFD\x7F           -1 {knownW3C} {Third trail byte must be 80:BF}

    utf-8 F1BF817F strict    {}              0 {} {Third trail byte must be 80:BF}
    utf-8 F180BFD0 tcl8      \u00F1\u20AC\u00BF\u00D0   -1 {} {Third trail byte must be 80:BF}
    utf-8 F180BFD0 replace   \uFFFD         -1 {knownW3C} {Third trail byte must be 80:BF}

    utf-8 F180BFD0 strict    {}              0 {} {Third trail byte must be 80:BF}
    utf-8 F3 tcl8       \u00F3        -1 {} {Missing trail byte}
    utf-8 F3 replace    \uFFFD        -1 {} {Missing trail byte}

    utf-8 F3 strict     {}             0 {} {Missing trail byte}
    utf-8 F37F tcl8     \u00F3\x7F    -1 {} {First trail byte must be 80:BF}
    utf-8 F37F replace  \uFFFD        -1 {knownW3C} {First trail byte must be 80:BF}

    utf-8 F37F strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 F3D0 tcl8     \u00F3\u00D0  -1 {} {First trail byte must be 80:BF}
    utf-8 F3D0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:BF}

    utf-8 F3D0 strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 F380 tcl8      \u00F3\u20AC   -1 {} {Missing second trail byte}
    utf-8 F380 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 F380 strict    {}              0 {} {Missing second trail byte}
    utf-8 F3BF tcl8      \u00F3\u00BF   -1 {} {Missing second trail byte}
    utf-8 F3BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 F3BF strict    {}              0 {} {Missing second trail byte}
    utf-8 F3807F tcl8      \u00F3\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F3807F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 F3807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F3BF7F tcl8      \u00F3\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F3BF7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 F3BF7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F380BF tcl8      \u00F3\u20AC\u00BF   -1 {} {Missing third trail byte}
    utf-8 F380BF replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}

    utf-8 F380BF strict    {}              0 {} {Missing third trail byte}
    utf-8 F3BF81 tcl8      \u00F3\u00BF\u0081   -1 {} {Missing third trail byte}
    utf-8 F3BF81 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}

    utf-8 F3BF81 strict    {}              0 {} {Missing third trail byte}
    utf-8 F3BF807F tcl8      \u00F3\u00BF\u20AC\x7F   -1 {} {Third trail byte must be 80:BF}
    utf-8 F3BF817F replace   \uFFFD\x7F           -1 {knownW3C} {Third trail byte must be 80:BF}

    utf-8 F3BF817F strict    {}              0 {} {Third trail byte must be 80:BF}
    utf-8 F380BFD0 tcl8      \u00F3\u20AC\u00BF\u00D0   -1 {} {Third trail byte must be 80:BF}
    utf-8 F380BFD0 replace   \uFFFD         -1 {knownW3C} {Third trail byte must be 80:BF}

    utf-8 F380BFD0 strict    {}              0 {} {Third trail byte must be 80:BF}

    utf-8 F4 tcl8       \u00F4        -1 {} {Missing trail byte}
    utf-8 F4 replace    \uFFFD        -1 {} {Missing trail byte}

    utf-8 F4 strict     {}             0 {} {Missing trail byte}
    utf-8 F47F tcl8     \u00F4\u7F    -1 {} {First trail byte must be 80:8F}
    utf-8 F47F replace  \uFFFD\u7F    -1 {knownW3C} {First trail byte must be 80:8F}

    utf-8 F47F strict   {}             0 {} {First trail byte must be 80:8F}
    utf-8 F490 tcl8     \u00F4\u0090  -1 {} {First trail byte must be 80:8F}
    utf-8 F490 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:8F}

    utf-8 F490 strict   {}             0 {} {First trail byte must be 80:8F}
    utf-8 F480 tcl8      \u00F4\u20AC   -1 {} {Missing second trail byte}
    utf-8 F480 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 F480 strict    {}              0 {} {Missing second trail byte}
    utf-8 F48F tcl8      \u00F4\u008F   -1 {} {Missing second trail byte}
    utf-8 F48F replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}

    utf-8 F48F strict    {}              0 {} {Missing second trail byte}
    utf-8 F4807F tcl8      \u00F4\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F4807F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 F4807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F48F7F tcl8      \u00F4\u008F\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F48F7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}

    utf-8 F48F7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F48081 tcl8      \u00F4\u20AC\u0081   -1 {} {Missing third trail byte}
    utf-8 F48081 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}

    utf-8 F48081 strict    {}              0 {} {Missing third trail byte}
    utf-8 F48F81 tcl8      \u00F4\u008F\u0081   -1 {} {Missing third trail byte}
    utf-8 F48F81 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}

    utf-8 F48F81 strict    {}              0 {} {Missing third trail byte}
    utf-8 F481817F tcl8      \u00F4\u0081\u0081\x7F   -1 {} {Third trail byte must be 80:BF}
    utf-8 F480817F replace   \uFFFD\x7F           -1 {knownW3C} {Third trail byte must be 80:BF}

    utf-8 F480817F strict    {}              0 {} {Third trail byte must be 80:BF}
    utf-8 F48FBFD0 tcl8      \u00F4\u008F\u00BF\u00D0   -1 {} {Third trail byte must be 80:BF}
    utf-8 F48FBFD0 replace   \uFFFD         -1 {knownW3C} {Third trail byte must be 80:BF}

    utf-8 F48FBFD0 strict    {}              0 {} {Third trail byte must be 80:BF}

    utf-8 F5 tcl8    \u00F5 -1 {} {F5:FF are invalid everywhere}
    utf-8 F5 replace \uFFFD -1 {} {F5:FF are invalid everywhere}

    utf-8 F5 strict  {}      0 {} {F5:FF are invalid everywhere}
    utf-8 FF tcl8    \u00FF -1 {} {F5:FF are invalid everywhere}
    utf-8 FF replace \uFFFD -1 {} {F5:FF are invalid everywhere}

    utf-8 FF strict  {}      0 {} {F5:FF are invalid everywhere}

    utf-8 C0AFE080BFF0818130 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {} {Unicode Table 3-8}

    utf-8 EDA080EDBFBFEDAF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {knownW3C} {Unicode Table 3-9}

    utf-8 F4919293FF4180BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u0041\uFFFD\uFFFD\x30 -1 {} {Unicode Table 3-10}

    utf-8 E180E2F09192F1BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\x30                         -1 {knownW3C} {Unicode Table 3.11}

}

# UTF-16LE and UTF-16BE test vectors. The plain utf-16 cases are not listed
# here; they are generated automatically from these entries depending on
# platform endianness. A truncated sequence can only occur at the end of the
# input (which includes the sequence standing by itself), hence the
# {solo tail} CTRL value on some entries.
lappend encInvalidBytes {*}{
    utf-16le  41    tcl8     \uFFFD  -1  {solo tail}  {Truncated}
    utf-16le  41    replace  \uFFFD  -1  {solo tail}  {Truncated}
    utf-16le  41    strict   {}       0  {solo tail}  {Truncated}

    utf-16le  00D8  tcl8     \uD800  -1  {}           {Missing low surrogate}
    utf-16le  00D8  replace  \uFFFD  -1  {}           {Missing low surrogate}
    utf-16le  00D8  strict   {}       0  {knownBug}   {Missing low surrogate}

    utf-16le  00DC  tcl8     \uDC00  -1  {}           {Missing high surrogate}
    utf-16le  00DC  replace  \uFFFD  -1  {}           {Missing high surrogate}
    utf-16le  00DC  strict   {}       0  {knownBug}   {Missing high surrogate}

    utf-16be  41    tcl8     \uFFFD  -1  {solo tail}  {Truncated}
    utf-16be  41    replace  \uFFFD  -1  {solo tail}  {Truncated}
    utf-16be  41    strict   {}       0  {solo tail}  {Truncated}

    utf-16be  D800  tcl8     \uD800  -1  {}           {Missing low surrogate}
    utf-16be  D800  replace  \uFFFD  -1  {knownBug}   {Missing low surrogate}
    utf-16be  D800  strict   {}       0  {knownBug}   {Missing low surrogate}

    utf-16be  DC00  tcl8     \uDC00  -1  {}           {Missing high surrogate}
    utf-16be  DC00  replace  \uFFFD  -1  {knownBug}   {Missing high surrogate}
    utf-16be  DC00  strict   {}       0  {knownBug}   {Missing high surrogate}
}

# UTF-32LE and UTF-32BE test vectors. The plain utf-32 cases are not listed
# here; they are generated automatically from these entries depending on
# platform endianness. A truncated sequence can only occur at the end of the
# input (which includes the sequence standing by itself), hence the
# {solo tail} CTRL value on some entries.
lappend encInvalidBytes {*}{
    utf-32le  41                tcl8     \uFFFD        -1  {solo tail}  {Truncated}
    utf-32le  41                replace  \uFFFD        -1  {solo}       {Truncated}
    utf-32le  41                strict   {}             0  {solo tail}  {Truncated}

    utf-32le  4100              tcl8     \uFFFD        -1  {solo tail}  {Truncated}
    utf-32le  4100              replace  \uFFFD        -1  {solo}       {Truncated}
    utf-32le  4100              strict   {}             0  {solo tail}  {Truncated}

    utf-32le  410000            tcl8     \uFFFD        -1  {solo tail}  {Truncated}
    utf-32le  410000            replace  \uFFFD        -1  {solo}       {Truncated}
    utf-32le  410000            strict   {}             0  {solo tail}  {Truncated}

    utf-32le  00D80000          tcl8     \uD800        -1  {}           {High-surrogate}
    utf-32le  00D80000          replace  \uFFFD        -1  {}           {High-surrogate}
    utf-32le  00D80000          strict   {}             0  {}           {High-surrogate}

    utf-32le  00DC0000          tcl8     \uDC00        -1  {}           {Low-surrogate}
    utf-32le  00DC0000          replace  \uFFFD        -1  {}           {Low-surrogate}
    utf-32le  00DC0000          strict   {}             0  {}           {Low-surrogate}

    utf-32le  00D8000000DC0000  tcl8     \uD800\uDC00  -1  {}           {High-low-surrogate-pair}
    utf-32le  00D8000000DC0000  replace  \uFFFD\uFFFD  -1  {}           {High-low-surrogate-pair}
    utf-32le  00D8000000DC0000  strict   {}             0  {}           {High-low-surrogate-pair}

    utf-32le  00001100          tcl8     \uFFFD        -1  {}           {Out of range}
    utf-32le  00001100          replace  \uFFFD        -1  {}           {Out of range}
    utf-32le  00001100          strict   {}             0  {}           {Out of range}

    utf-32le  FFFFFFFF          tcl8     \uFFFD        -1  {}           {Out of range}
    utf-32le  FFFFFFFF          replace  \uFFFD        -1  {}           {Out of range}
    utf-32le  FFFFFFFF          strict   {}             0  {}           {Out of range}

    utf-32be  41                tcl8     \uFFFD        -1  {solo tail}  {Truncated}
    utf-32be  41                replace  \uFFFD        -1  {solo tail}  {Truncated}
    utf-32be  41                strict   {}             0  {solo tail}  {Truncated}

    utf-32be  0041              tcl8     \uFFFD        -1  {solo tail}  {Truncated}
    utf-32be  0041              replace  \uFFFD        -1  {solo}       {Truncated}
    utf-32be  0041              strict   {}             0  {solo tail}  {Truncated}

    utf-32be  000041            tcl8     \uFFFD        -1  {solo tail}  {Truncated}
    utf-32be  000041            replace  \uFFFD        -1  {solo}       {Truncated}
    utf-32be  000041            strict   {}             0  {solo tail}  {Truncated}

    utf-32be  0000D800          tcl8     \uD800        -1  {}           {High-surrogate}
    utf-32be  0000D800          replace  \uFFFD        -1  {}           {High-surrogate}
    utf-32be  0000D800          strict   {}             0  {}           {High-surrogate}

    utf-32be  0000DC00          tcl8     \uDC00        -1  {}           {Low-surrogate}
    utf-32be  0000DC00          replace  \uFFFD        -1  {}           {Low-surrogate}
    utf-32be  0000DC00          strict   {}             0  {}           {Low-surrogate}

    utf-32be  0000D8000000DC00  tcl8     \uD800\uDC00  -1  {}           {High-low-surrogate-pair}
    utf-32be  0000D8000000DC00  replace  \uFFFD\uFFFD  -1  {}           {High-low-surrogate-pair}
    utf-32be  0000D8000000DC00  strict   {}             0  {}           {High-low-surrogate-pair}

    utf-32be  00110000          tcl8     \uFFFD        -1  {}           {Out of range}
    utf-32be  00110000          replace  \uFFFD        -1  {}           {Out of range}
    utf-32be  00110000          strict   {}             0  {}           {Out of range}

    utf-32be  FFFFFFFF          tcl8     \uFFFD        -1  {}           {Out of range}
    utf-32be  FFFFFFFF          replace  \uFFFD        -1  {}           {Out of range}
    utf-32be  FFFFFFFF          strict   {}             0  {}           {Out of range}
}






# Strings that cannot be encoded for specific encoding / profiles.
# Each record consists of seven fields:
# <ENCODING STRING PROFILE EXPECTEDRESULT EXPECTEDFAILINDEX CTRL COMMENT>
# <ENCODING,STRING,PROFILE> should be unique for test ids to be unique.
# See earlier comments about CTRL field.
#
# Note utf-16, utf-32 missing because they are automatically
# generated based on le/be versions.
# TODO - out of range code point (note cannot be generated by \U notation)
lappend encUnencodableStrings {*}{
    ascii \u00e0 tcl8    3f -1 {} {unencodable}
    ascii \u00e0 strict  {}  0 {} {unencodable}

    iso8859-1 \u0141 tcl8    3f -1 {} unencodable
    iso8859-1 \u0141 strict  {}  0 {} unencodable

    utf-8 \uD800 tcl8    eda080 -1 {} High-surrogate
    utf-8 \uD800 strict  {}      0 {} High-surrogate
    utf-8 \uDC00 tcl8    edb080 -1 {} Low-surrogate
    utf-8 \uDC00 strict  {}      0 {} Low-surrogate
}


# icuUcmTests.tcl is generated by the tools/ucm2tests.tcl script
# and generates test vectors for the above tables for various encodings
# based on ICU UCM files.
# TODO - commented out for now because it generates a lot of mismatches.

# source [file join [file dirname [info script]] icuUcmTests.tcl]







>



>


>



>



>



>




>



>



>




>



>



>



>



>




>



>



>



>



>



>


|
>


|
>




>



>



>



>


|
>


|
>



>



>



>



>


|
>


|
>





|
|
>



>



>



>


|
>


|
>



>



>



>



>



>



>




>

|
|
>



>



>



>


|
>


|
>



>



>



>



>



>


|
>


|
>




>



>



>



>



>



>



>



>



>



>



>



>




>



>



>



>



>


|
>


|
>



>



>

|

>



>



>



>



>



>



>

|
|
>



>



>



>



>



>




>



>



>



>



>



>



>



>



>



>



>




>



>



>

>

>

>










>
>


>



>





>
>


>



>










>



>



>



>



>


|
>


>
|


>
|




>



>



>



>



>


|
>


>
|


>
|


>
>
>
>
>






<
<


















|
>

205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776


777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
    utf-8 9D tcl8    \u009D -1 {} {map to cp1252}
    utf-8 9E tcl8    \u017E -1 {} {map to cp1252}
    utf-8 9F tcl8    \u0178 -1 {} {map to cp1252}

    utf-8 C0 tcl8    \u00C0 -1 {} {C0 is invalid anywhere}
    utf-8 C0 strict  {}      0 {} {C0 is invalid anywhere}
    utf-8 C0 replace \uFFFD -1 {} {C0 is invalid anywhere}
    utf-8 C0 lossless \uDCC0 -1 {} {C0 is invalid anywhere}
    utf-8 C080 tcl8    \u0000 -1 {} {C080 -> U+0 in Tcl's internal modified UTF8}
    utf-8 C080 strict  {}      0 {} {C080 -> invalid}
    utf-8 C080 replace \uFFFD -1 {} {C080 -> single replacement char}
    utf-8 C080 lossless \uDCC0\uDC80 -1 {} {C080 -> two lossless wrappers}
    utf-8 C0A2 tcl8    \u00C0\u00A2 -1 {} {websec.github.io - A}
    utf-8 C0A2 replace \uFFFD\uFFFD -1 {} {websec.github.io - A}
    utf-8 C0A2 lossless \uDCC0\uDCA2 -1 {} {websec.github.io - A}
    utf-8 C0A2 strict  {}            0 {} {websec.github.io - A}
    utf-8 C0A7 tcl8    \u00C0\u00A7 -1 {} {websec.github.io - double quote}
    utf-8 C0A7 replace \uFFFD\uFFFD -1 {} {websec.github.io - double quote}
    utf-8 C0A7 lossless \uDCC0\uDCA7 -1 {} {websec.github.io - double quote}
    utf-8 C0A7 strict  {}            0 {} {websec.github.io - double quote}
    utf-8 C0AE tcl8    \u00C0\u00AE -1 {} {websec.github.io - full stop}
    utf-8 C0AE replace \uFFFD\uFFFD -1 {} {websec.github.io - full stop}
    utf-8 C0AE lossless \uDCC0\uDCAE -1 {} {websec.github.io - full stop}
    utf-8 C0AE strict  {}            0 {} {websec.github.io - full stop}
    utf-8 C0AF tcl8    \u00C0\u00AF -1 {} {websec.github.io - solidus}
    utf-8 C0AF replace \uFFFD\uFFFD -1 {} {websec.github.io - solidus}
    utf-8 C0AF lossless \uDCC0\uDCAF -1 {} {websec.github.io - solidus}
    utf-8 C0AF strict  {}            0 {} {websec.github.io - solidus}

    utf-8 C1 tcl8    \u00C1 -1 {} {C1 is invalid everywhere}
    utf-8 C1 replace \uFFFD -1 {} {C1 is invalid everywhere}
    utf-8 C1 lossless \uDCC1 -1 {} {C1 is invalid everywhere}
    utf-8 C1 strict  {}      0 {} {C1 is invalid everywhere}
    utf-8 C181 tcl8    \u00C1\u0081 -1 {} {websec.github.io - base test (A)}
    utf-8 C181 replace \uFFFD\uFFFD -1 {} {websec.github.io - base test (A)}
    utf-8 C181 lossless \uDCC1\uDC81 -1 {} {websec.github.io - base test (A)}
    utf-8 C181 strict  {}            0 {} {websec.github.io - base test (A)}
    utf-8 C19C tcl8    \u00C1\u0153 -1 {} {websec.github.io - reverse solidus}
    utf-8 C19C replace \uFFFD\uFFFD -1 {} {websec.github.io - reverse solidus}
    utf-8 C19C lossless \uDCC1\uDC9C -1 {} {websec.github.io - reverse solidus}
    utf-8 C19C strict  {}            0 {} {websec.github.io - reverse solidus}

    utf-8 C2 tcl8      \u00C2     -1 {} {Missing trail byte}
    utf-8 C2 replace   \uFFFD     -1 {} {Missing trail byte}
    utf-8 C2 lossless   \uDCC2     -1 {} {Missing trail byte}
    utf-8 C2 strict    {}          0 {} {Missing trail byte}
    utf-8 C27F tcl8    \u00C2\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 C27F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 C27F lossless \uDCC2\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 C27F strict  {}          0 {} {Trail byte must be 80:BF}
    utf-8 DF tcl8      \u00DF     -1 {} {Missing trail byte}
    utf-8 DF replace   \uFFFD     -1 {} {Missing trail byte}
    utf-8 DF lossless   \uDCDF     -1 {} {Missing trail byte}
    utf-8 DF strict    {}          0 {} {Missing trail byte}
    utf-8 DF7F tcl8    \u00DF\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 DF7F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 DF7F lossless \uDCDF\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 DF7F strict  {}          0 {} {Trail byte must be 80:BF}
    utf-8 DFE0A080 tcl8    \u00DF\u0800 -1 {} {Invalid trail byte is start of valid sequence}
    utf-8 DFE0A080 replace \uFFFD\u0800 -1 {} {Invalid trail byte is start of valid sequence}
    utf-8 DFE0A080 lossless \uDCDF\u0800 -1 {} {Invalid trail byte is start of valid sequence}
    utf-8 DFE0A080 strict  {}            0 {} {Invalid trail byte is start of valid sequence}

    utf-8 E0 tcl8      \u00E0     -1 {} {Missing trail byte}
    utf-8 E0 replace   \uFFFD     -1 {} {Missing trail byte}
    utf-8 E0 lossless  \uDCE0    -1 {} {Missing trail byte}
    utf-8 E0 strict    {}          0 {} {Missing trail byte}
    utf-8 E080 tcl8      \u00E0\u20AC   -1 {} {First trail byte must be A0:BF}
    utf-8 E080 replace   \uFFFD\uFFFD   -1 {} {First trail byte must be A0:BF}
    utf-8 E080 lossless   \uDCE0\uDC80  -1 {} {First trail byte must be A0:BF}
    utf-8 E080 strict    {}              0 {} {First trail byte must be A0:BF}
    utf-8 E0819C tcl8    \u00E0\u0081\u0153 -1 {} {websec.github.io - reverse solidus}
    utf-8 E0819C replace \uFFFD\uFFFD\uFFFD -1 {} {websec.github.io - reverse solidus}
    utf-8 E0819C lossless \uDCE0\uDC81\uDC9C -1 {} {websec.github.io - reverse solidus}
    utf-8 E0819C strict  {}                  0 {} {websec.github.io - reverse solidus}
    utf-8 E09F tcl8      \u00E0\u0178   -1 {} {First trail byte must be A0:BF}
    utf-8 E09F replace   \uFFFD\uFFFD   -1 {} {First trail byte must be A0:BF}
    utf-8 E09F lossless  \uDCE0\uDC9F   -1 {} {First trail byte must be A0:BF}
    utf-8 E09F strict    {}              0 {} {First trail byte must be A0:BF}
    utf-8 E0A0 tcl8      \u00E0\u00A0   -1 {} {Missing second trail byte}
    utf-8 E0A0 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 E0A0 lossless  \uDCE0\uDCA0   -1 {} {Missing second trail byte}
    utf-8 E0A0 strict    {}              0 {} {Missing second trail byte}
    utf-8 E0BF tcl8      \u00E0\u00BF   -1 {} {Missing second trail byte}
    utf-8 E0BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 E0BF lossless  \uDCE0\uDCBF   -1 {} {Missing second trail byte}
    utf-8 E0BF strict    {}              0 {} {Missing second trail byte}
    utf-8 E0A07F tcl8    \u00E0\u00A0\x7F   -1 {}     {Second trail byte must be 80:BF}
    utf-8 E0A07F replace \uFFFD\x7F         -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 E0A07F lossless \uDCE0\uDCA0\x7F  -1 {} {Second trail byte must be 80:BF}
    utf-8 E0A07F strict  {}                  0 {}         {Second trail byte must be 80:BF}
    utf-8 E0BF7F tcl8    \u00E0\u00BF\x7F   -1 {}         {Second trail byte must be 80:BF}
    utf-8 E0BF7F replace \uFFFD\x7F         -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 E0BF7F lossless \uDCE0\uDCBF\x7F  -1 {} {Second trail byte must be 80:BF}
    utf-8 E0BF7F strict  {}                  0 {}         {Second trail byte must be 80:BF}

    utf-8 E1 tcl8      \u00E1     -1 {} {Missing trail byte}
    utf-8 E1 replace   \uFFFD     -1 {} {Missing trail byte}
    utf-8 E1 lossless  \uDCE1     -1 {} {Missing trail byte}
    utf-8 E1 strict    {}          0 {} {Missing trail byte}
    utf-8 E17F tcl8    \u00E1\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 E17F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 E17F lossless \uDCE1\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 E17F strict  {}          0 {} {Trail byte must be 80:BF}
    utf-8 E181 tcl8      \u00E1\u0081   -1 {} {Missing second trail byte}
    utf-8 E181 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 E181 lossless  \uDCE1\uDC81   -1 {} {Missing second trail byte}
    utf-8 E181 strict    {}              0 {} {Missing second trail byte}
    utf-8 E1BF tcl8      \u00E1\u00BF   -1 {} {Missing second trail byte}
    utf-8 E1BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 E1BF lossless   \uDCE1\uDCBF  -1 {} {Missing second trail byte}
    utf-8 E1BF strict    {}              0 {} {Missing second trail byte}
    utf-8 E1807F tcl8    \u00E1\u20AC\x7F   -1 {} {Second trail byte must be 80:BF}
    utf-8 E1807F replace \uFFFD\x7F         -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 E1807F lossless \uDCE1\uDC80\x7F  -1 {} {Second trail byte must be 80:BF}
    utf-8 E1807F strict  {}                  0 {}         {Second trail byte must be 80:BF}
    utf-8 E1BF7F tcl8    \u00E1\u00BF\x7F   -1 {}         {Second trail byte must be 80:BF}
    utf-8 E1BF7F replace \uFFFD\x7F         -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 E1BF7F lossless \uDCE1\uDCBF\x7F  -1 {} {Second trail byte must be 80:BF}
    utf-8 E1BF7F strict  {}                  0 {}         {Second trail byte must be 80:BF}
    utf-8 EC tcl8      \u00EC     -1 {} {Missing trail byte}
    utf-8 EC replace   \uFFFD     -1 {} {Missing trail byte}
    utf-8 EC lossless   \uDCEC    -1 {} {Missing trail byte}
    utf-8 EC strict    {}          0 {} {Missing trail byte}
    utf-8 EC7F tcl8    \u00EC\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 EC7F replace \uFFFD\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 EC7F lossless \uDCEC\x7F -1 {} {Trail byte must be 80:BF}
    utf-8 EC7F strict  {}          0 {} {Trail byte must be 80:BF}
    utf-8 EC81 tcl8      \u00EC\u0081   -1 {} {Missing second trail byte}
    utf-8 EC81 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 EC81 lossless   \uDCEC\uDC81  -1 {} {Missing second trail byte}
    utf-8 EC81 strict    {}              0 {} {Missing second trail byte}
    utf-8 ECBF tcl8      \u00EC\u00BF   -1 {} {Missing second trail byte}
    utf-8 ECBF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 ECBF lossless   \uDCEC\uDCBF  -1 {} {Missing second trail byte}
    utf-8 ECBF strict    {}              0 {} {Missing second trail byte}
    utf-8 EC807F tcl8    \u00EC\u20AC\x7F   -1 {} {Second trail byte must be 80:BF}
    utf-8 EC807F replace \uFFFD\x7F         -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 EC807F lossless \uDCEC\uDC80\x7F  -1 {} {Second trail byte must be 80:BF}
    utf-8 EC807F strict  {}                  0 {}         {Second trail byte must be 80:BF}
    utf-8 ECBF7F tcl8    \u00EC\u00BF\x7F   -1 {}         {Second trail byte must be 80:BF}
    utf-8 ECBF7F replace \uFFFD\x7F         -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 ECBF7F lossless \uDCEC\uDCBF\x7F  -1 {} {Second trail byte must be 80:BF}
    utf-8 ECBF7F strict  {}                  0 {}         {Second trail byte must be 80:BF}

    utf-8 ED tcl8       \u00ED        -1 {} {Missing trail byte}
    utf-8 ED replace    \uFFFD        -1 {} {Missing trail byte}
    utf-8 ED strict     {}             0 {} {Missing trail byte}
    utf-8 ED7F tcl8     \u00ED\x7F    -1 {} {First trail byte must be 80:9F}
    utf-8 ED7F replace  \uFFFD\x7F    -1 {} {First trail byte must be 80:9F}
    utf-8 ED7F lossless  \uDCED\x7F    -1 {} {First trail byte must be 80:9F}
    utf-8 ED7F strict   {}             0 {} {First trail byte must be 80:9F}
    utf-8 EDA0 tcl8     \u00ED\u00A0  -1 {} {First trail byte must be 80:9F}
    utf-8 EDA0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:9F}
    utf-8 EDA0 lossless  \uDCED\uDCA0  -1 {} {First trail byte must be 80:9F}
    utf-8 EDA0 strict   {}             0 {} {First trail byte must be 80:9F}
    utf-8 ED81 tcl8      \u00ED\u0081   -1 {} {Missing second trail byte}
    utf-8 ED81 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 ED81 lossless   \uDCED\uDC81  -1 {} {Missing second trail byte}
    utf-8 ED81 strict    {}              0 {} {Missing second trail byte}
    utf-8 EDBF tcl8      \u00ED\u00BF   -1 {} {Missing second trail byte}
    utf-8 EDBF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 EDBF lossless   \uDCED\uDCBF  -1 {} {Missing second trail byte}
    utf-8 EDBF strict    {}              0 {} {Missing second trail byte}
    utf-8 ED807F tcl8      \u00ED\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 ED807F replace   \uFFFD\x7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 ED807F lossless   \uDCED\uDC80\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 ED807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 ED9F7F tcl8      \u00ED\u0178\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 ED9F7F replace   \uFFFD\x7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 ED9F7F lossless   \uDCED\uDC9F\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 ED9F7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 EDA080 tcl8       \uD800          -1 {}  {High surrogate}
    utf-8 EDA080 replace    \uFFFD          -1 {knownBug}  {High surrogate}
    utf-8 EDA080 lossless    \uDCED\uDCA0\uDC80 -1 {knownBug}  {High surrogate}
    utf-8 EDA080 strict     {}               0 {}  {High surrogate}
    utf-8 EDAFBF tcl8       \uDBFF          -1 {}  {High surrogate}
    utf-8 EDAFBF replace    \uFFFD          -1 {knownBug}  {High surrogate}
    utf-8 EDAFBF lossless    \uDCED\uDCAF\uDCBF -1 {knownBug}  {High surrogate}
    utf-8 EDAFBF strict     {}               0 {}  {High surrogate}
    utf-8 EDB080 tcl8       \uDC00          -1 {}  {Low surrogate}
    utf-8 EDB080 replace    \uFFFD          -1 {knownBug}  {Low surrogate}
    utf-8 EDB080 lossless    \uDCED\uDCB0\uDC80 -1 {knownBug}  {Low surrogate}
    utf-8 EDB080 strict     {}               0 {}  {Low surrogate}
    utf-8 EDBFBF tcl8       \uDFFF          -1 {knownBug}  {Low surrogate}
    utf-8 EDBFBF replace    \uFFFD          -1 {knownBug}  {Low surrogate}
    utf-8 EDBFBF lossless    \uDCED\uDCBF\uDCBF -1 {knownBug}  {Low surrogate}
    utf-8 EDBFBF strict     {}               0 {}  {Low surrogate}
    utf-8 EDA080EDB080 tcl8 \U00010000      -1 {knownBug}  {High low surrogate pair}
    utf-8 EDA080EDB080 replace \uFFFD\uFFFD -1 {knownBug}  {High low surrogate pair}
    utf-8 EDA080EDB080 lossless \uDCED\uDCA0\uDC80\uDCED\uDCB0\uDC80  -1 {knownBug}  {High low surrogate pair}
    utf-8 EDA080EDB080 strict {}             0 {}  {High low surrogate pair}
    utf-8 EDAFBFEDBFBF tcl8 \U0010FFFF      -1 {knownBug}  {High low surrogate pair}
    utf-8 EDAFBFEDBFBF replace \uFFFD\uFFFD -1 {knownBug}  {High low surrogate pair}
    utf-8 EDAFBFEDBFBF lossless \uDCED\uDCAF\uDCBF\uDCED\uDCBF\uDCBF  -1 {knownBug}  {High low surrogate pair}
    utf-8 EDAFBFEDBFBF strict {}             0 {}  {High low surrogate pair}

    utf-8 EE tcl8       \u00EE        -1 {} {Missing trail byte}
    utf-8 EE replace    \uFFFD        -1 {} {Missing trail byte}
    utf-8 EE lossless    \uDCEE       -1 {} {Missing trail byte}
    utf-8 EE strict     {}             0 {} {Missing trail byte}
    utf-8 EE7F tcl8     \u00EE\x7F    -1 {} {First trail byte must be 80:BF}
    utf-8 EE7F replace  \uFFFD\x7F    -1 {} {First trail byte must be 80:BF}
    utf-8 EE7F lossless  \uDCEE\x7F     -1 {} {First trail byte must be 80:BF}
    utf-8 EE7F strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 EED0 tcl8     \u00EE\u00D0  -1 {} {First trail byte must be 80:BF}
    utf-8 EED0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:BF}
    utf-8 EED0 lossless  \uDCEE\uDCD0   -1 {} {First trail byte must be 80:BF}
    utf-8 EED0 strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 EE81 tcl8      \u00EE\u0081   -1 {} {Missing second trail byte}
    utf-8 EE81 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 EE81 lossless   \uDCEE\uDC81  -1 {} {Missing second trail byte}
    utf-8 EE81 strict    {}              0 {} {Missing second trail byte}
    utf-8 EEBF tcl8      \u00EE\u00BF   -1 {} {Missing second trail byte}
    utf-8 EEBF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 EEBF lossless   \uDCEE\uDCBF  -1 {} {Missing second trail byte}
    utf-8 EEBF strict    {}              0 {} {Missing second trail byte}
    utf-8 EE807F tcl8      \u00EE\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EE807F replace   \uFFFD\x7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 EE807F lossless  \uDCEE\uDC80\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EE807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 EEBF7F tcl8      \u00EE\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EEBF7F replace   \uFFFD\x7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 EEBF7F lossless  \uDCEE\uDCBF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EEBF7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 EF tcl8       \u00EF        -1 {} {Missing trail byte}
    utf-8 EF replace    \uFFFD        -1 {} {Missing trail byte}
    utf-8 EF lossless    \uDCEF       -1 {} {Missing trail byte}
    utf-8 EF strict     {}             0 {} {Missing trail byte}
    utf-8 EF7F tcl8     \u00EF\u7F    -1 {} {First trail byte must be 80:BF}
    utf-8 EF7F replace  \uFFFD\u7F    -1 {} {First trail byte must be 80:BF}
    utf-8 EF7F lossless  \uDCEF\x7F -1 {} {First trail byte must be 80:BF}
    utf-8 EF7F strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 EFD0 tcl8     \u00EF\u00D0  -1 {} {First trail byte must be 80:BF}
    utf-8 EFD0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:BF}
    utf-8 EFD0 lossless  \uDCEF\uDCD0 -1 {} {First trail byte must be 80:BF}
    utf-8 EFD0 strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 EF81 tcl8      \u00EF\u0081   -1 {} {Missing second trail byte}
    utf-8 EF81 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 EF81 lossless  \uDCEF\uDC81   -1 {} {Missing second trail byte}
    utf-8 EF81 strict    {}              0 {} {Missing second trail byte}
    utf-8 EFBF tcl8      \u00EF\u00BF   -1 {} {Missing second trail byte}
    utf-8 EFBF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 EFBF lossless  \uDCEF\uDCBF   -1 {} {Missing second trail byte}
    utf-8 EFBF strict    {}              0 {} {Missing second trail byte}
    utf-8 EF807F tcl8      \u00EF\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EF807F replace   \uFFFD\x7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 EF807F lossless  \uDCEF\uDC80\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EF807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 EFBF7F tcl8      \u00EF\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EFBF7F replace   \uFFFD\x7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 EFBF7F lossless  \uDCEF\uDCBF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 EFBF7F strict    {}                0 {}  {Second trail byte must be 80:BF}

    utf-8 F0 tcl8       \u00F0        -1 {} {Missing trail byte}
    utf-8 F0 replace    \uFFFD        -1 {} {Missing trail byte}
    utf-8 F0 lossless   \uDCF0       -1 {} {Missing trail byte}
    utf-8 F0 strict     {}             0 {} {Missing trail byte}
    utf-8 F080 tcl8     \u00F0\u20AC  -1 {} {First trail byte must be 90:BF}
    utf-8 F080 replace  \uFFFD        -1 {knownW3C} {First trail byte must be 90:BF}
    utf-8 F080 lossless \uDCF0\uDC80 -1 {} {First trail byte must be 90:BF}
    utf-8 F080 strict   {}             0 {} {First trail byte must be 90:BF}
    utf-8 F08F tcl8     \u00F0\u8F    -1 {} {First trail byte must be 90:BF}
    utf-8 F08F replace  \uFFFD        -1 {knownW3C} {First trail byte must be 90:BF}
    utf-8 F08F lossless \uDCF0\uDC8F -1 {} {First trail byte must be 90:BF}
    utf-8 F08F strict   {}             0 {} {First trail byte must be 90:BF}
    utf-8 F0D0 tcl8     \u00F0\u00D0  -1 {} {First trail byte must be 90:BF}
    utf-8 F0D0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 90:BF}
    utf-8 F0D0 lossless \uDCF0\uDCD0 -1 {} {First trail byte must be 90:BF}
    utf-8 F0D0 strict   {}             0 {} {First trail byte must be 90:BF}
    utf-8 F090 tcl8      \u00F0\u0090   -1 {} {Missing second trail byte}
    utf-8 F090 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 F090 lossless  \uDCF0\uDC90   -1 {} {Missing second trail byte}
    utf-8 F090 strict    {}              0 {} {Missing second trail byte}
    utf-8 F0BF tcl8      \u00F0\u00BF   -1 {} {Missing second trail byte}
    utf-8 F0BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 F0BF lossless   \uDCF0\uDCBF  -1 {} {Missing second trail byte}
    utf-8 F0BF strict    {}              0 {} {Missing second trail byte}
    utf-8 F0907F tcl8      \u00F0\u0090\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F0907F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 F0907F lossless   \uDCF0\uDC90\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F0907F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F0BF7F tcl8      \u00F0\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F0BF7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 F0BF7F lossless  \uDCF0\uDCBF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F0BF7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F090BF tcl8      \u00F0\u0090\u00BF   -1 {} {Missing third trail byte}
    utf-8 F090BF replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
    utf-8 F090BF lossless  \uDCF0\uDC90\uDCBF  -1 {} {Missing third trail byte}
    utf-8 F090BF strict    {}              0 {} {Missing third trail byte}
    utf-8 F0BF81 tcl8      \u00F0\u00BF\u0081   -1 {} {Missing third trail byte}
    utf-8 F0BF81 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
    utf-8 F0BF81 lossless  \uDCF0\uDCBF\uDC81  -1 {} {Missing third trail byte}
    utf-8 F0BF81 strict    {}              0 {} {Missing third trail byte}
    utf-8 F0BF807F tcl8      \u00F0\u00BF\u20AC\x7F   -1 {} {Third trail byte must be 80:BF}
    utf-8 F0BF817F replace   \uFFFD\x7F           -1 {knownW3C} {Third trail byte must be 80:BF}
    utf-8 F0BF817F lossless   \uDCF0\uDCBF\uDC81\x7F -1 {} {Third trail byte must be 80:BF}
    utf-8 F0BF817F strict    {}              0 {} {Third trail byte must be 80:BF}
    utf-8 F090BFD0 tcl8      \u00F0\u0090\u00BF\u00D0   -1 {} {Third trail byte must be 80:BF}
    utf-8 F090BFD0 replace   \uFFFD         -1 {knownW3C} {Third trail byte must be 80:BF}
    utf-8 F090BFD0 lossless   \uDCF0\uDC90\uDCBF\uDCD0 -1 {} {Third trail byte must be 80:BF}
    utf-8 F090BFD0 strict    {}              0 {} {Third trail byte must be 80:BF}

    utf-8 F1 tcl8       \u00F1        -1 {} {Missing trail byte}
    utf-8 F1 replace    \uFFFD        -1 {} {Missing trail byte}
    utf-8 F1 lossless    \uDCF1       -1 {} {Missing trail byte}
    utf-8 F1 strict     {}             0 {} {Missing trail byte}
    utf-8 F17F tcl8     \u00F1\u7F    -1 {} {First trail byte must be 80:BF}
    utf-8 F17F replace  \uFFFD        -1 {knownW3C} {First trail byte must be 80:BF}
    utf-8 F17F lossless  \uDCF1\x7F -1 {} {First trail byte must be 80:BF}
    utf-8 F17F strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 F1D0 tcl8     \u00F1\u00D0  -1 {} {First trail byte must be 80:BF}
    utf-8 F1D0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:BF}
    utf-8 F1D0 lossless  \uDCF1\uDCD0 -1 {} {First trail byte must be 80:BF}
    utf-8 F1D0 strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 F180 tcl8      \u00F1\u20AC   -1 {} {Missing second trail byte}
    utf-8 F180 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 F180 lossless   \uDCF1\uDC80  -1 {} {Missing second trail byte}
    utf-8 F180 strict    {}              0 {} {Missing second trail byte}
    utf-8 F1BF tcl8      \u00F1\u00BF   -1 {} {Missing second trail byte}
    utf-8 F1BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 F1BF lossless   \uDCF1\uDCBF  -1 {} {Missing second trail byte}
    utf-8 F1BF strict    {}              0 {} {Missing second trail byte}
    utf-8 F1807F tcl8      \u00F1\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F1807F replace   \uFFFD\x7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 F1807F lossless   \uDCF1\uDC80\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F1807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F1BF7F tcl8      \u00F1\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F1BF7F replace   \uFFFD\x7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 F1BF7F lossless   \uDCF1\uDCBF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F1BF7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F180BF tcl8      \u00F1\u20AC\u00BF   -1 {} {Missing third trail byte}
    utf-8 F180BF replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
    utf-8 F180BF lossless   \uDCF1\uDC80\uDCBF -1 {} {Missing third trail byte}
    utf-8 F180BF strict    {}              0 {} {Missing third trail byte}
    utf-8 F1BF81 tcl8      \u00F1\u00BF\u0081   -1 {} {Missing third trail byte}
    utf-8 F1BF81 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
    utf-8 F1BF81 lossless   \uDCF1\uDCBF\uDC81  -1 {} {Missing third trail byte}
    utf-8 F1BF81 strict    {}              0 {} {Missing third trail byte}
    utf-8 F1BF807F tcl8      \u00F1\u00BF\u20AC\x7F  -1 {} {Third trail byte must be 80:BF}
    utf-8 F1BF817F replace   \uFFFD\x7F           -1 {knownW3C} {Third trail byte must be 80:BF}
    utf-8 F1BF817F lossless   \uDCF1\uDCBF\uDC81\x7F -1 {} {Third trail byte must be 80:BF}
    utf-8 F1BF817F strict    {}              0 {} {Third trail byte must be 80:BF}
    utf-8 F180BFD0 tcl8      \u00F1\u20AC\u00BF\u00D0   -1 {} {Third trail byte must be 80:BF}
    utf-8 F180BFD0 replace   \uFFFD         -1 {knownW3C} {Third trail byte must be 80:BF}
    utf-8 F180BFD0 lossless   \uDCF1\uDC80\uDCBF\uDCD0 -1 {} {Third trail byte must be 80:BF}
    utf-8 F180BFD0 strict    {}              0 {} {Third trail byte must be 80:BF}
    utf-8 F3 tcl8       \u00F3        -1 {} {Missing trail byte}
    utf-8 F3 replace    \uFFFD        -1 {} {Missing trail byte}
    utf-8 F3 lossless    \uDCF3       -1 {} {Missing trail byte}
    utf-8 F3 strict     {}             0 {} {Missing trail byte}
    utf-8 F37F tcl8     \u00F3\x7F    -1 {} {First trail byte must be 80:BF}
    utf-8 F37F replace  \uFFFD        -1 {knownW3C} {First trail byte must be 80:BF}
    utf-8 F37F lossless  \uDCF3\x7F   -1 {} {First trail byte must be 80:BF}
    utf-8 F37F strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 F3D0 tcl8     \u00F3\u00D0  -1 {} {First trail byte must be 80:BF}
    utf-8 F3D0 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:BF}
    utf-8 F3D0 lossless  \uDCF3\uDCD0 -1 {} {First trail byte must be 80:BF}
    utf-8 F3D0 strict   {}             0 {} {First trail byte must be 80:BF}
    utf-8 F380 tcl8      \u00F3\u20AC   -1 {} {Missing second trail byte}
    utf-8 F380 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 F380 lossless   \uDCF3\uDC80  -1 {} {Missing second trail byte}
    utf-8 F380 strict    {}              0 {} {Missing second trail byte}
    utf-8 F3BF tcl8      \u00F3\u00BF   -1 {} {Missing second trail byte}
    utf-8 F3BF replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 F3BF lossless   \uDCF3\uDCBF  -1 {} {Missing second trail byte}
    utf-8 F3BF strict    {}              0 {} {Missing second trail byte}
    utf-8 F3807F tcl8     \u00F3\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F3807F replace  \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 F3807F lossless  \uDCF3\uDC80\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F3807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F3BF7F tcl8      \u00F3\u00BF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F3BF7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 F3BF7F lossless   \uDCF3\uDCBF\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F3BF7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F380BF tcl8      \u00F3\u20AC\u00BF   -1 {} {Missing third trail byte}
    utf-8 F380BF replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
    utf-8 F380BF lossless   \uDCF3\uDC80\uDCBF -1 {} {Missing third trail byte}
    utf-8 F380BF strict    {}              0 {} {Missing third trail byte}
    utf-8 F3BF81 tcl8      \u00F3\u00BF\u0081   -1 {} {Missing third trail byte}
    utf-8 F3BF81 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
    utf-8 F3BF81 lossless   \uDCF3\uDCBF\uDC81 -1 {} {Missing third trail byte}
    utf-8 F3BF81 strict    {}              0 {} {Missing third trail byte}
    utf-8 F3BF807F tcl8      \u00F3\u00BF\u20AC\x7F   -1 {} {Third trail byte must be 80:BF}
    utf-8 F3BF817F replace   \uFFFD\x7F           -1 {knownW3C} {Third trail byte must be 80:BF}
    utf-8 F3BF817F lossless   \uDCF3\uDCBF\uDC81\x7F -1 {} {Third trail byte must be 80:BF}
    utf-8 F3BF817F strict    {}              0 {} {Third trail byte must be 80:BF}
    utf-8 F380BFD0 tcl8      \u00F3\u20AC\u00BF\u00D0   -1 {} {Third trail byte must be 80:BF}
    utf-8 F380BFD0 replace   \uFFFD         -1 {knownW3C} {Third trail byte must be 80:BF}
    utf-8 F380BFD0 lossless   \uDCF3\uDC80\uDCBF\uDCD0 -1 {} {Third trail byte must be 80:BF}
    utf-8 F380BFD0 strict    {}              0 {} {Third trail byte must be 80:BF}

    utf-8 F4 tcl8       \u00F4        -1 {} {Missing trail byte}
    utf-8 F4 replace    \uFFFD        -1 {} {Missing trail byte}
    utf-8 F4 lossless    \uDCF4       -1 {} {Missing trail byte}
    utf-8 F4 strict     {}             0 {} {Missing trail byte}
    utf-8 F47F tcl8     \u00F4\u7F    -1 {} {First trail byte must be 80:8F}
    utf-8 F47F replace  \uFFFD\u7F    -1 {knownW3C} {First trail byte must be 80:8F}
    utf-8 F47F lossless  \uDCF4\x7F -1 {} {First trail byte must be 80:8F}
    utf-8 F47F strict   {}             0 {} {First trail byte must be 80:8F}
    utf-8 F490 tcl8     \u00F4\u0090  -1 {} {First trail byte must be 80:8F}
    utf-8 F490 replace  \uFFFD\uFFFD  -1 {} {First trail byte must be 80:8F}
    utf-8 F490 lossless  \uDCF4\uDC90 -1 {} {First trail byte must be 80:8F}
    utf-8 F490 strict   {}             0 {} {First trail byte must be 80:8F}
    utf-8 F480 tcl8      \u00F4\u20AC   -1 {} {Missing second trail byte}
    utf-8 F480 replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 F480 lossless   \uDCF4\uDC80  -1 {} {Missing second trail byte}
    utf-8 F480 strict    {}              0 {} {Missing second trail byte}
    utf-8 F48F tcl8      \u00F4\u008F   -1 {} {Missing second trail byte}
    utf-8 F48F replace   \uFFFD         -1 {knownW3C} {Missing second trail byte}
    utf-8 F48F lossless   \uDCF4\uDC8F  -1 {} {Missing second trail byte}
    utf-8 F48F strict    {}              0 {} {Missing second trail byte}
    utf-8 F4807F tcl8      \u00F4\u20AC\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F4807F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 F4807F lossless   \uDCF4\uDC80\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F4807F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F48F7F tcl8      \u00F4\u008F\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F48F7F replace   \uFFFD\u7F       -1 {knownW3C} {Second trail byte must be 80:BF}
    utf-8 F48F7F lossless   \uDCF4\uDC8F\x7F -1 {} {Second trail byte must be 80:BF}
    utf-8 F48F7F strict    {}                0 {}  {Second trail byte must be 80:BF}
    utf-8 F48081 tcl8      \u00F4\u20AC\u0081   -1 {} {Missing third trail byte}
    utf-8 F48081 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
    utf-8 F48081 lossless   \uDCF4\uDC80\uDC81  -1 {} {Missing third trail byte}
    utf-8 F48081 strict    {}              0 {} {Missing third trail byte}
    utf-8 F48F81 tcl8      \u00F4\u008F\u0081   -1 {} {Missing third trail byte}
    utf-8 F48F81 replace   \uFFFD         -1 {knownW3C} {Missing third trail byte}
    utf-8 F48F81 lossless   \uDCF4\uDC8F\uDC81  -1 {} {Missing third trail byte}
    utf-8 F48F81 strict    {}              0 {} {Missing third trail byte}
    utf-8 F481817F tcl8      \u00F4\u0081\u0081\x7F   -1 {} {Third trail byte must be 80:BF}
    utf-8 F480817F replace   \uFFFD\x7F           -1 {knownW3C} {Third trail byte must be 80:BF}
    utf-8 F480817F lossless   \uDCF4\uDC80\uDC81\x7F -1 {} {Third trail byte must be 80:BF}
    utf-8 F480817F strict    {}              0 {} {Third trail byte must be 80:BF}
    utf-8 F48FBFD0 tcl8      \u00F4\u008F\u00BF\u00D0   -1 {} {Third trail byte must be 80:BF}
    utf-8 F48FBFD0 replace   \uFFFD         -1 {knownW3C} {Third trail byte must be 80:BF}
    utf-8 F48FBFD0 lossless   \uDCF4\uDC8F\uDCBF\uDCD0 -1 {} {Third trail byte must be 80:BF}
    utf-8 F48FBFD0 strict    {}              0 {} {Third trail byte must be 80:BF}

    utf-8 F5 tcl8    \u00F5 -1 {} {F5:FF are invalid everywhere}
    utf-8 F5 replace \uFFFD -1 {} {F5:FF are invalid everywhere}
    utf-8 F5 lossless \uDCF5 -1 {} {F5:FF are invalid everywhere}
    utf-8 F5 strict  {}      0 {} {F5:FF are invalid everywhere}
    utf-8 FF tcl8    \u00FF -1 {} {F5:FF are invalid everywhere}
    utf-8 FF replace \uFFFD -1 {} {F5:FF are invalid everywhere}
    utf-8 FF lossless \uDCFF -1 {} {F5:FF are invalid everywhere}
    utf-8 FF strict  {}      0 {} {F5:FF are invalid everywhere}

    utf-8 C0AFE080BFF0818130 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {} {Unicode Table 3-8}
    utf-8 C0AFE080BFF0818130 lossless \uDCC0\uDCAF\uDCE0\uDC80\uDCBF\uDCF0\uDC81\uDC81\x30 -1 {} {Unicode Table 3-8}
    utf-8 EDA080EDBFBFEDAF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x30 -1 {knownW3C} {Unicode Table 3-9}
    utf-8 EDA080EDBFBFEDAF30 lossless \uD800\uDFFF\uDCED\uDCAF0 -1 {} {Unicode Table 3-9 - TODO assumes surrogates permitted in utf-8 lossless}
    utf-8 F4919293FF4180BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u0041\uFFFD\uFFFD\x30 -1 {} {Unicode Table 3-10}
    utf-8 F4919293FF4180BF30 lossless \uDCF4\uDC91\uDC92\uDC93\uDCFF\u0041\uDC80\uDCBF\x30 -1 {} {Unicode Table 3-10}
    utf-8 E180E2F09192F1BF30 replace \uFFFD\uFFFD\uFFFD\uFFFD\x30                         -1 {knownW3C} {Unicode Table 3.11}
    utf-8 E180E2F09192F1BF30 lossless \uDCE1\uDC80\uDCE2\uDCF0\uDC91\uDC92\uDCF1\uDCBF\x30 -1 {} {Unicode Table 3.11}
}

# utf-16le and utf-16be test cases. Note utf-16 cases are automatically
# generated based on these depending on platform endianness. Note truncated
# tests can only happen when the sequence is at the end (including by itself).
# Thus {solo tail} in some cases.
lappend encInvalidBytes {*}{
    utf-16le 41      tcl8      \uFFFD -1 {solo tail} {Truncated}
    utf-16le 41      replace   \uFFFD -1 {solo tail} {Truncated}
    utf-16le 41      strict    {}      0 {solo tail} {Truncated}
    utf-16le 41      lossless  \uFFFD -1 {solo tail} {Truncated - byte < 0x80}
    utf-16le 80      lossless  \uFFFD -1 {solo tail} {Truncated - byte >= 0x80}
    utf-16le 00D8    tcl8      \uD800 -1 {} {Missing low surrogate}
    utf-16le 00D8    replace   \uFFFD -1 {} {Missing low surrogate}
    utf-16le 00D8    lossless  \uFFFD -1 {} {Missing low surrogate}
    utf-16le 00D8    strict    {}      0 {knownBug} {Missing low surrogate}
    utf-16le 00DC    tcl8      \uDC00 -1 {} {Missing high surrogate}
    utf-16le 00DC    replace   \uFFFD -1 {} {Missing high surrogate}
    utf-16le 00DC    lossless  \uFFFD -1 {} {Missing high surrogate}
    utf-16le 00DC    strict    {}      0 {knownBug} {Missing high surrogate}

    utf-16be 41      tcl8      \uFFFD -1 {solo tail} {Truncated}
    utf-16be 41      replace   \uFFFD -1 {solo tail} {Truncated}
    utf-16be 41      strict    {}      0 {solo tail} {Truncated}
    utf-16be 41      lossless  \uFFFD -1 {solo tail} {Truncated - byte < 0x80}
    utf-16be 80      lossless  \uFFFD -1 {solo tail} {Truncated - byte >= 0x80}
    utf-16be D800    tcl8      \uD800 -1 {} {Missing low surrogate}
    utf-16be D800    replace   \uFFFD -1 {knownBug} {Missing low surrogate}
    utf-16be D800    lossless  \uFFFD -1 {knownBug} {Missing low surrogate}
    utf-16be D800    strict    {}      0 {knownBug} {Missing low surrogate}
    utf-16be DC00    tcl8      \uDC00 -1 {} {Missing high surrogate}
    utf-16be DC00    replace   \uFFFD -1 {knownBug} {Missing high surrogate}
    utf-16be DC00    lossless  \uFFFD -1 {knownBug} {Missing high surrogate}
    utf-16be DC00    strict    {}      0 {knownBug} {Missing high surrogate}
}

# utf-32le and utf-32be test cases. Note utf-32 cases are automatically
# generated based on these depending on platform endianness. Note truncated
# tests can only happen when the sequence is at the end (including by itself).
# Thus {solo tail} in some cases.
lappend encInvalidBytes {*}{
    utf-32le 41      tcl8      \uFFFD  -1 {solo tail} {Truncated}
    utf-32le 41      replace   \uFFFD  -1 {solo} {Truncated}
    utf-32le 41      lossless  \uFFFD  -1 {solo} {Truncated}
    utf-32le 41      strict    {}   0 {solo tail} {Truncated}
    utf-32le 4100    tcl8      \uFFFD  -1 {solo tail} {Truncated}
    utf-32le 4100    replace   \uFFFD  -1 {solo} {Truncated}
    utf-32le 4100    lossless  \uFFFD  -1 {solo} {Truncated}
    utf-32le 4100    strict    {}   0 {solo tail} {Truncated}
    utf-32le 410000  tcl8      \uFFFD  -1 {solo tail} {Truncated}
    utf-32le 410000  replace   \uFFFD  -1 {solo} {Truncated}
    utf-32le 410000  lossless  \uFFFD  -1 {solo} {Truncated}
    utf-32le 410000  strict    {}       0 {solo tail} {Truncated}
    utf-32le 00D80000 tcl8     \uD800   -1 {} {High-surrogate}
    utf-32le 00D80000 replace  \uFFFD   -1 {} {High-surrogate}
    utf-32le 00D80000 lossless \uFFFD   -1 {} {High-surrogate}
    utf-32le 00D80000 strict   {}        0 {} {High-surrogate}
    utf-32le 00DC0000 tcl8     \uDC00   -1 {} {Low-surrogate}
    utf-32le 00DC0000 replace  \uFFFD   -1 {} {Low-surrogate}
    utf-32le 00DC0000 lossless \uFFFD   -1 {} {Low-surrogate}
    utf-32le 00DC0000 strict   {}        0 {} {Low-surrogate}
    utf-32le 00D8000000DC0000 tcl8 \uD800\uDC00    -1 {} {High-low-surrogate-pair}
    utf-32le 00D8000000DC0000 replace  \uFFFD\uFFFD -1 {} {High-low-surrogate-pair}
    utf-32le 00D8000000DC0000 lossless \uFFFD\uFFFD -1 {} {High-low-surrogate-pair}
    utf-32le 00D8000000DC0000 strict  {}            0 {} {High-low-surrogate-pair}
    utf-32le 00001100 tcl8 \uFFFD    -1 {} {Out of range}
    utf-32le 00001100 replace  \uFFFD -1 {} {Out of range}
    utf-32le 00001100 lossless \uFFFD -1 {} {Out of range}
    utf-32le 00001100 strict {}       0 {} {Out of range}
    utf-32le FFFFFFFF tcl8 \uFFFD    -1 {} {Out of range}
    utf-32le FFFFFFFF replace  \uFFFD -1 {} {Out of range}
    utf-32le FFFFFFFF lossless \uFFFD -1 {} {Out of range}
    utf-32le FFFFFFFF strict {}       0 {} {Out of range}

    utf-32be 41      tcl8      \uFFFD  -1 {solo tail} {Truncated}
    utf-32be 41      replace   \uFFFD  -1 {solo tail} {Truncated}
    utf-32be 41      lossless  \uFFFD  -1 {solo tail} {Truncated}
    utf-32be 41      strict    {}       0 {solo tail} {Truncated}
    utf-32be 0041    tcl8      \uFFFD  -1 {solo tail} {Truncated}
    utf-32be 0041    replace   \uFFFD  -1 {solo} {Truncated}
    utf-32be 0041    lossless  \uFFFD  -1 {solo} {Truncated}
    utf-32be 0041    strict    {}   0 {solo tail} {Truncated}
    utf-32be 000041  tcl8      \uFFFD  -1 {solo tail} {Truncated}
    utf-32be 000041  replace   \uFFFD  -1 {solo} {Truncated}
    utf-32be 000041  lossless  \uFFFD  -1 {solo} {Truncated}
    utf-32be 000041  strict    {}       0 {solo tail} {Truncated}
    utf-32be 0000D800 tcl8     \uD800   -1 {} {High-surrogate}
    utf-32be 0000D800 replace  \uFFFD   -1 {} {High-surrogate}
    utf-32be 0000D800 lossless \uFFFD   -1 {} {High-surrogate}
    utf-32be 0000D800 strict   {}        0 {} {High-surrogate}
    utf-32be 0000DC00 tcl8     \uDC00   -1 {} {Low-surrogate}
    utf-32be 0000DC00 replace  \uFFFD   -1 {} {Low-surrogate}
    utf-32be 0000DC00 lossless \uFFFD   -1 {} {Low-surrogate}
    utf-32be 0000DC00 strict   {}        0 {} {Low-surrogate}
    utf-32be 0000D8000000DC00 tcl8 \uD800\uDC00    -1 {} {High-low-surrogate-pair}
    utf-32be 0000D8000000DC00 replace  \uFFFD\uFFFD -1 {} {High-low-surrogate-pair}
    utf-32be 0000D8000000DC00 lossless \uFFFD\uFFFD -1 {} {High-low-surrogate-pair}
    utf-32be 0000D8000000DC00 strict  {}            0 {} {High-low-surrogate-pair}
    utf-32be 00110000 tcl8 \uFFFD    -1 {} {Out of range}
    utf-32be 00110000 replace  \uFFFD -1 {} {Out of range}
    utf-32be 00110000 lossless \uFFFD -1 {} {Out of range}
    utf-32be 00110000 strict {}       0 {} {Out of range}
    utf-32be FFFFFFFF tcl8 \uFFFD    -1 {} {Out of range}
    utf-32be FFFFFFFF replace  \uFFFD -1 {} {Out of range}
    utf-32be FFFFFFFF lossless \uFFFD -1 {} {Out of range}
    utf-32be FFFFFFFF strict {}       0 {} {Out of range}
}

# escape tables - TODO
# This tests the EscapeToUtf code path.
lappend encInvalidBytes {*}{
}

# Strings that cannot be encoded for specific encoding / profiles
# <ENCODING STRING PROFILE EXPECTEDRESULT EXPECTEDFAILINDEX CTRL COMMENT>
# <ENCODING,STRING,PROFILE> should be unique for test ids to be unique.
# See earlier comments about CTRL field.
#


# TODO - out of range code point (note cannot be generated by \U notation)
#
# Each row has seven fields, in the order given in the format comment above:
# encoding, string to encode, profile, expected hex result, expected failure
# index, control flags, and a free-text description. Rows come in pairs: the
# tcl8 profile produces a fallback encoding (fail index -1) while the strict
# profile produces nothing and fails at index 0.
lappend encUnencodableStrings {*}{
    ascii \u00e0 tcl8    3f -1 {} {unencodable}
    ascii \u00e0 strict  {}  0 {} {unencodable}

    iso8859-1 \u0141 tcl8    3f -1 {} unencodable
    iso8859-1 \u0141 strict  {}  0 {} unencodable

    utf-8 \uD800 tcl8    eda080 -1 {} High-surrogate
    utf-8 \uD800 strict  {}      0 {} High-surrogate
    utf-8 \uDC00 tcl8    edb080 -1 {} Low-surrogate
    utf-8 \uDC00 strict  {}      0 {} Low-surrogate
}


# The icuUcmTests.tcl is generated by the tools/ucm2tests.tcl script
# and generates test vectors for the above tables for various encodings
# based on ICU UCM files.
# TODO - commented out for now as generating a lot of mismatches, mainly
# due to Tcl using ? for replacement char and ICU often using ^Z.
# source [file join [file dirname [info script]] icuUcmTests.tcl]
Changes to tests/env.test.
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
if {"::tcltest" ni [namespace children]} {
    package require tcltest 2.5
    namespace import -force ::tcltest::*
}

source [file join [file dirname [info script]] tcltests.tcl]

testConstraint utf8system [string equal [encoding system] utf-8]
if {[llength [auto_execok bash]]} {
    testConstraint haveBash 1
}

# [exec] is required here to see the actual environment received by child
# processes.
proc getenv {} {







<







14
15
16
17
18
19
20

21
22
23
24
25
26
27
if {"::tcltest" ni [namespace children]} {
    package require tcltest 2.5
    namespace import -force ::tcltest::*
}

source [file join [file dirname [info script]] tcltests.tcl]


if {[llength [auto_execok bash]]} {
    testConstraint haveBash 1
}

# [exec] is required here to see the actual environment received by child
# processes.
proc getenv {} {
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527




528
529
530
531










































532
533
534
535
536
537
538
    set result [gets $pipe]
    close $pipe
    if {$result ne $::env(USERPROFILE)} {
	list ERROR $result ne $::env(USERPROFILE)
    }
} -result {}

test env-10.0 {
    Unequal environment strings test should test unequal
} -constraints {unix haveBash utf8system knownBug} -setup {
    set tclScript [makeFile {
        puts [string equal $env(XX) $env(YY)]
    } tclScript]
    set shellCode {
        export XX=$'\351'
        export YY=$'\303\251'
    }
    append shellCode "[info nameofexecutable] $tclScript\n"
    set shScript [makeFile $shellCode shScript]




} -body {
    exec {*}[auto_execok bash] $shScript
} -result 0













































# cleanup
rename getenv {}
rename envrestore {}
rename envprep {}
rename encodingrestore {}







|
|
|









>
>
>
>




>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
    set result [gets $pipe]
    close $pipe
    if {$result ne $::env(USERPROFILE)} {
	list ERROR $result ne $::env(USERPROFILE)
    }
} -result {}

# A child tclsh is started from a bash script that exports two different byte
# sequences: XX is the single byte 0xE9 (octal 351) and YY is the two bytes
# 0xC3 0xA9 (octal 303 251). The child must see them as unequal strings.
test env-10.1 {
    Unequal environment strings test should test unequal (failed pre-TIP 671)
} -constraints {unix haveBash} -setup {
    # Script run by the child interpreter: prints 1 if XX equals YY, else 0.
    set tclScript [makeFile {
        puts [string equal $env(XX) $env(YY)]
    } tclScript]
    # Shell wrapper: export both variables, then launch this same executable
    # on the child script. The $'...' quoting is bash syntax.
    set shellCode {
        export XX=$'\351'
        export YY=$'\303\251'
    }
    append shellCode "[info nameofexecutable] $tclScript\n"
    set shScript [makeFile $shellCode shScript]
    # NOTE(review): this switches the system encoding of the test process;
    # the exec'ed child is a separate process - confirm it also ends up
    # using utf-8.
    set oldEnc [encoding system]
    encoding system utf-8
} -cleanup {
    encoding system $oldEnc
} -body {
    exec {*}[auto_execok bash] $shScript
} -result 0

# The lone byte 0xE9 is exported in XX by bash; the child script prints the
# code point of the first character it sees in env(XX). The expected value
# dce9 means the byte arrived as the single character U+DCE9.
test env-10.2 {
    Read invalidly encoded bytes in environment value - TIP 671
} -constraints {unix haveBash} -setup {
    # Script run by the child interpreter: one character of XX, as 4 hex digits.
    set tclScript [makeFile {
	puts [format {%04x} [scan $env(XX) %c]]
    } tclScript]
    # Note: the $'...' quoting used below is bash syntax, not plain sh.
    set shellCode {
        export XX=$'\xe9'
    }
    append shellCode "[info nameofexecutable] $tclScript\n"
    set shScript [makeFile $shellCode shScript]
    # Saved here, restored in -cleanup.
    set oldEnc [encoding system]
    encoding system utf-8
} -cleanup {
    encoding system $oldEnc
} -body {
    exec {*}[auto_execok bash] $shScript
} -result dce9

# Round trip: the child reads the lone byte 0xE9 from XX, wraps it as A...B,
# stores that in YY, and asks a grandchild bash to hex-dump YY. The expected
# dump 41 e9 42 means the original byte was written back unchanged.
test env-10.3 {
    Write invalidly encoded bytes to environment value - TIP 671
} -constraints {unix haveBash} -setup {
    # Script run by the child interpreter. It keeps only the first od output
    # line and prints fields 1 to 3 of it, skipping the leading offset column.
    set tclScript [makeFile {
        set env(YY) A$env(XX)B
        set line [lindex [split [exec {*}[auto_execok bash] -c {echo $YY | od -t x1}] \n] 0]
        puts [lrange $line 1 3]
    } tclScript]
    # Note: the $'...' quoting used below is bash syntax, not plain sh.
    set shellCode {
        export XX=$'\xe9'
    }
    append shellCode "[info nameofexecutable] $tclScript\n"
    set shScript [makeFile $shellCode shScript]
    # Saved here, restored in -cleanup.
    set oldEnc [encoding system]
    encoding system utf-8
} -cleanup {
    encoding system $oldEnc
} -body {
    exec {*}[auto_execok bash] $shScript
} -result {41 e9 42}



# cleanup
rename getenv {}
rename envrestore {}
rename envprep {}
rename encodingrestore {}
Changes to tests/fileName.test.
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
    # Create a file name that is invalid if interpreted as utf-8
    encoding system iso8859-1
    close [open \xe9 w]
} -cleanup {
    encoding system $prevEnc
    cd $prevDir
    file delete -force $testDir
} -constraints {unix knownBug} -body {
    set result [file exists [lindex [glob *] 0]]
    encoding system utf-8
    lappend result [file exists [lindex [glob *] 0]]
} -result {1 1}

apply [list {} {
    test fileName-6d4e9d1af5bf5b7d {







|







1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
    # Create a file name that is invalid if interpreted as utf-8
    encoding system iso8859-1
    close [open \xe9 w]
} -cleanup {
    encoding system $prevEnc
    cd $prevDir
    file delete -force $testDir
} -constraints {unix} -body {
    set result [file exists [lindex [glob *] 0]]
    encoding system utf-8
    lappend result [file exists [lindex [glob *] 0]]
} -result {1 1}

apply [list {} {
    test fileName-6d4e9d1af5bf5b7d {
1681
1682
1683
1684
1685
1686
1687













































































































































































































































1688
1689
1690
1691
1692
1693
1694
			}
		} [namespace current]]
	    }
	    interp delete $interp
	}
    } -result 0
} [namespace current]]














































































































































































































































# cleanup
catch {file delete -force C:/globTest}
cd [temporaryDirectory]
file delete -force globTest
cd $oldpwd
catch {removeDirectory tcl[pid]}







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
			}
		} [namespace current]]
	    }
	    interp delete $interp
	}
    } -result 0
} [namespace current]]

# Helpers for TIP 671 tests for invalid encoding in system API.
# Specifically use /tmp for tests because on WSL [temporaryDirectory]
# on NTFS prevents creation of arbitrary byte sequences in names.

# The helpers shell out to bash, so record whether bash can be found. The
# haveBash constraint is only defined when the lookup returns something.
if {[llength [auto_execok bash]]} {
    testConstraint haveBash 1
}

# Scratch directory used by the helper procs and tests below.
set dir671 /tmp/tip-671-test
# clean671 --
#	Remove the TIP 671 scratch directory together with anything inside it.
#	Takes no arguments; the result is that of [file delete].
proc clean671 {} {
    set scratch $::dir671
    file delete -force $scratch
}
# mkfile671 ?content? --
#	Create, through bash, a regular file in the scratch directory whose
#	name is the raw bytes E8 2E E9 (not valid UTF-8), holding the given
#	content without a trailing newline.
#
# Arguments:
#	content		Text written to the file. It is placed between single
#			quotes on the bash command line, so it must not itself
#			contain a single quote.
#
# Results:
#	The path of the file as reported back by [glob], i.e. spelled the way
#	Tcl decodes the name from the file system.
proc mkfile671 {{content ""}} {
    file mkdir $::dir671
    exec {*}[auto_execok bash] -c "echo -n '$content' > [file join $::dir671 {$'\xe8'.$'\xe9'}]"
    # Return the path as retrieved from the system. Only regular files are
    # considered: [glob] guarantees no particular order, and the scratch
    # directory may also hold the directory made by mkdir671.
    return [lindex [glob -types f $::dir671/*] 0]
}
# mkdir671 ?name ...? --
#	Create, through bash, a subdirectory of the scratch directory whose
#	name is the single raw byte E7 (not valid UTF-8), and optionally a set
#	of empty files inside it.
#
# Arguments:
#	args		Names of empty files to create inside the new
#			directory.
#
# Results:
#	With no arguments, the path of the directory as reported back by
#	[glob]. Otherwise a list with the path of each created file, in
#	argument order.
proc mkdir671 {args} {
    file mkdir $::dir671
    exec {*}[auto_execok bash] -c "mkdir [file join $::dir671 {$'\xe7'}]"
    # Return the path as retrieved from the system. Only directories are
    # considered: [glob] guarantees no particular order, and the scratch
    # directory may also hold the file made by mkfile671.
    set dir [lindex [glob -types d $::dir671/*] 0]
    if {[llength $args] == 0} {
        return $dir
    }
    return [lmap arg $args {
        set fn [file join $dir $arg]
        close [open $fn w]
        set fn
    }]
}
# test-tip671 id comment body result ?option value ...? --
#	Wrapper around [test] for the TIP 671 file-name tests. It supplies the
#	common setup (clean scratch directory, utf-8 system encoding), the
#	matching cleanup, and the unix and haveBash constraints.
#
# Arguments:
#	id		Suffix of the test name; the test is named tip671-$id.
#	comment		Description, prefixed with "TIP 671 ".
#	body		Test script.
#	result		Expected result.
#	args		Further [test] options. -setup is run after the common
#			setup, -cleanup before the common cleanup, and
#			-constraints are added to the default ones. Anything
#			else is handed to [test] untouched.
proc test-tip671 {id comment body result args} {
    set setup {
        clean671; # In case left over from manual testing
        set oldEnc [encoding system]
        # The invalid bytes file name assumes utf-8
        encoding system utf-8
    }
    set cleanup {
        clean671
        encoding system $oldEnc
    }
    set constraints {unix haveBash}

    # Fold the caller's additions into the defaults and take each handled
    # option out of args so that it is not passed to [test] a second time.
    foreach opt {-setup -cleanup -constraints} {
        if {![dict exists $args $opt]} {
            continue
        }
        set extra [dict get $args $opt]
        dict unset args $opt
        switch -- $opt {
            -setup {
                append setup "\n$extra"
            }
            -cleanup {
                set cleanup "$extra\n$cleanup"
            }
            -constraints {
                lappend constraints {*}$extra
            }
        }
    }

    test tip671-$id "TIP 671 $comment" \
        -constraints $constraints \
        -setup $setup \
        -cleanup $cleanup \
        -body $body \
        -result $result \
        {*}$args
}

# These tests are really only intended to verify that path names
# containing TIP 671 mappings are passed correctly to the system
# and don't raise a file not found error. They are not intended
# to really test the commands themselves.

# The helpers create names from the raw bytes E8 2E E9 (file) and E7
# (directory); the expected results spell each such byte as U+DCxx.
test-tip671 basic-file "Mapping of bytes" {mkfile671} $::dir671/\uDCE8.\uDCE9
test-tip671 basic-dir "Mapping of bytes" {mkdir671} $::dir671/\uDCE7
# NOTE(review): only an upper bound is checked, so any access time that is
# not more than one second in the future passes.
test-tip671 file-atime "file atime" {
    expr {[file atime [mkfile671]] - [clock seconds] <= 1}
} 1
# Only the attribute names are compared, not their values.
test-tip671 file-attributes "file attributes" {
    lsort [dict keys [file attributes [mkfile671]]]
} {-group -owner -permissions}
# Copy in both directions between an ordinary makeFile path and the path
# with the invalid bytes, then read the destination back.
test-tip671 file-copy-from "file copy source" {
    set tofile [makeFile "" file-copy-671]
    file copy -force [mkfile671 abc] $tofile
    viewFile $tofile
} abc
test-tip671 file-copy-to "file copy destination" {
    set tofile [mkfile671]
    file copy -force [makeFile "xyz" file-copy-671] $tofile
    viewFile $tofile
} xyz
test-tip671 file-delete "file delete" {
    set fn [mkfile671]
    list [file exists $fn] [file delete $fn] [file exists $fn]
} {1 {} 0}
# mkdir671 x returns a one-element list; its first element is the path of the
# file x inside the invalid-byte directory.
test-tip671 file-dirname "file dirname" {
    file isdirectory [file dirname [lindex [mkdir671 x] 0]]
} 1
test-tip671 file-executable "file executable" {file executable [mkfile671]} 0
test-tip671 file-exists "file exists" {file exists [mkfile671]} 1
test-tip671 file-extension "file extension" {file extension [mkfile671]} .\uDCE9
# HOME is pointed at the invalid-byte directory for the duration of the test
# and put back in -cleanup, which runs before the common cleanup.
test-tip671 file-home "file home" {
    file home
} $::dir671/\uDCE7 -setup {
    set oldHome $::env(HOME)
    set ::env(HOME) [mkdir671]
} -cleanup {
    set ::env(HOME) $oldHome
}
test-tip671 file-isdirectory "file isdirectory" {file isdirectory [mkdir671]} 1
test-tip671 file-isfile "file isfile" {file isfile [mkfile671]} 1
# The link is created inside the invalid-byte directory and points at the
# invalid-byte file, so both the link path and its target contain mapped
# bytes. This relies on mkfile671 returning the file rather than the
# directory that mkdir671 created just before it.
test-tip671 file-link-symlink "file link (symbolic)" {
    set dir [mkdir671]
    set target [mkfile671 abc]
    set lnk [file join $dir lnk]
    file link -symbolic $lnk $target
    list [viewFile $lnk] [file readlink $lnk]
} [list abc $::dir671/\uDCE8.\uDCE9]
if 0 { # TODO: Still need to find out why this fails
test-tip671 file-link-hard "file link (hard)" {
    set dir [mkdir671]
    set target [mkfile671 abc]
    set lnk [file join $dir lnk]
    file link -hard $lnk $target
    viewFile $lnk
} abc
}
# Same arrangement as file-link-symlink; lstat must describe the link itself.
test-tip671 file-lstat "file lstat (symbolic)" {
    set dir [mkdir671]
    set target [mkfile671 abc]
    set lnk [file join $dir lnk]
    file link -symbolic $lnk $target
    dict get [file lstat $lnk] type
} link
# Remove the directory that bash created, then recreate it from Tcl using the
# name that glob reported.
test-tip671 file-mkdir "file mkdir" {
    set dir [mkdir671]
    file delete $dir
    list [file exists $dir] [file mkdir $dir] [file exists $dir]
} {0 {} 1}
# NOTE(review): only an upper bound is checked, so any modification time that
# is not more than one second in the future passes.
test-tip671 file-mtime "file mtime" {
    expr {[file mtime [mkfile671]] - [clock seconds] <= 1}
} 1
# NOTE(review): mkdir671 f returns a one-element list, which is used directly
# as a path here and in file-normalize below.
test-tip671 file-nativename "file nativename" {
    file nativename [mkdir671 f]
} $::dir671/\uDCE7/f
# Normalize a relative name from inside the invalid-byte directory; the
# working directory is restored before the comparison.
test-tip671 file-normalize "file normalize" {
    set path [mkdir671 f]
    set cwd [pwd]
    cd [file dirname $path]
    set norm [file normalize f]
    cd $cwd
    string equal $path $norm
} 1
test-tip671 file-pathtype "file pathtype" {file pathtype [mkfile671]} absolute
test-tip671 file-owned "file owned" {file owned [mkfile671]} 1
test-tip671 file-readable "file readable" {file readable [mkfile671]} 1
# Rename in both directions between an ordinary makeFile path and the path
# with the invalid bytes, then read the destination back.
test-tip671 file-rename-from "file rename source" {
    set tofile [makeFile "" file-rename-671]
    file rename -force [mkfile671 abc] $tofile
    viewFile $tofile
} abc
# NOTE(review): the temporary file below is named file-copy-671 although this
# is the rename test.
test-tip671 file-rename-to "file rename destination" {
    set tofile [mkfile671]
    file rename -force [makeFile "xyz" file-copy-671] $tofile
    viewFile $tofile
} xyz
test-tip671 file-rootname "file rootname" {file rootname [mkfile671]} $::dir671/\uDCE8
test-tip671 file-size "file size" {file size [mkfile671 abc]} 3
# The expected components are written out literally and so must be kept in
# step with the value of dir671.
test-tip671 file-split "file split" {
    file split [mkfile671]
} [list / tmp tip-671-test \uDCE8.\uDCE9]
test-tip671 file-stat "file stat" {dict get [file stat [mkfile671 abc]] size} 3
test-tip671 file-system "file system" {file system [mkfile671 abc]} native
test-tip671 file-tail "file tail" {file tail [mkfile671]} \uDCE8.\uDCE9
# HOME is pointed at the invalid-byte directory for the duration of the test
# and put back in -cleanup.
test-tip671 file-tildeexpand "file tildeexpand" {
    file tildeexpand ~/f
} $::dir671/\uDCE7/f -setup {
    set oldHome $::env(HOME)
    set ::env(HOME) [mkdir671]
} -cleanup {
    set ::env(HOME) $oldHome
}
test-tip671 file-type "file type" {file type [mkfile671]} file
test-tip671 file-writable "file writable" {file writable [mkfile671]} 1
# Read back through a channel what bash wrote into the invalid-byte file.
test-tip671 open "open" {
    set fd [open [mkfile671 abc]]
    set content [read $fd]
    close $fd
    set content
} abc
# Everything up to the matching close brace is switched off by the constant
# false condition: the open-pipe, cd,pwd and exec tests.
if 0 { # Not supported
# The generated shell script hex-dumps its first argument; the expected
# pattern looks for the bytes e8 2e e9 followed by the newline from echo.
test-tip671 open-pipe "open pipe" {
    # Tests both program name as well as argument being invalid chars
    set script "#!/bin/sh\necho \$1 | od -t x1\n"
    set exe [mkfile671 $script]
    file attributes $exe -permissions u+x
    set fd [open |[list $exe [file tail $exe]]]
    set result [read $fd]
    close $fd
    set result
} {* e8 2e e9 0a*} -match glob
# NOTE(review): if re-enabled, this test changes the working directory and
# does not change it back before the common cleanup removes the scratch
# directory.
test-tip671 cd,pwd "cd and pwd" {
    set dir [mkdir671]
    cd $dir
    pwd
} $::dir671/\uDCE7
# Same script as open-pipe, run through exec instead of a command pipeline.
test-tip671 exec "exec" {
    # Tests both program name as well as argument being invalid chars
    set script "#!/bin/sh\necho \$1 | od -t x1\n"
    set exe [mkfile671 $script]
    file attributes $exe -permissions u+x
    exec $exe [file tail $exe]
} {* e8 2e e9 0a*} -match glob
}

# The load test needs the pkgua shared library from the dltest directory that
# sits next to the running executable; havePkgua records whether that file
# can be read.
set dltestDir [file join [file dirname [info nameofexecutable]] dltest]
testConstraint havePkgua [file readable [file join $dltestDir \
	tcl9pkgua[info sharedlibextension]]]
# Disabled by the constant false condition. The -setup script copies the
# pkgua shared library into the scratch directory under a name consisting of
# the raw byte E8 plus the shared library extension; the body then loads it,
# checks that it shows up in the list of loaded libraries, unloads it again
# and checks that it is gone.
# NOTE(review): the body spells out /tmp/tip-671-test instead of using
# dir671, and -setup computes dltestDir a second time.
if 0 { # Should not be supported
test-tip671 load "load" {
    set dll [lindex [glob -types f /tmp/tip-671-test/*] 0]
    load $dll Pkgua
    set loaded [info loaded]
    set idx [lsearch -index 1 $loaded Pkgua]
    set result [list [expr {$idx >= 0}]]
    if {$idx >= 0} {
        unload [lindex $loaded $idx 0]
        lappend result [lsearch -index 1 [info loaded] Pkgua]
    }
    set result
} {1 -1} -setup {
    set ext [info sharedlibextension]
    set dltestDir [file join [file dirname [info nameofexecutable]] dltest]
    file mkdir $::dir671
    set shellCmd [string cat "cp " [file join $dltestDir tcl9pkgua$ext] " " [file join $::dir671 {$'\xe8'}] $ext]
    exec {*}[auto_execok bash] -c $shellCmd
} -constraints {
    havePkgua
}
}

# cleanup
catch {file delete -force C:/globTest}
cd [temporaryDirectory]
file delete -force globTest
cd $oldpwd
catch {removeDirectory tcl[pid]}
Changes to tests/ioCmd.test.
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
    fconfigure $console -blah blih
} -returnCodes error -result [expectedOpts "-blah" {-inputmode}]
# TODO: Test parsing of serial channel options (nonPortable, since requires an
# open channel to work with).

test iocmd-8.23 {fconfigure -profile badprofile} -body {
    fconfigure stdin -profile froboz
} -returnCodes error -result {bad profile name "froboz": must be replace, strict, or tcl8}

test iocmd-9.1 {eof command} {
    list [catch {eof} msg] $msg $::errorCode
} {1 {wrong # args: should be "eof channelId"} {TCL WRONGARGS}}
test iocmd-9.2 {eof command} {
    list [catch {eof a b} msg] $msg $::errorCode
} {1 {wrong # args: should be "eof channelId"} {TCL WRONGARGS}}







|







367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
    fconfigure $console -blah blih
} -returnCodes error -result [expectedOpts "-blah" {-inputmode}]
# TODO: Test parsing of serial channel options (nonPortable, since requires an
# open channel to work with).

# An unknown -profile value must be rejected with a message that lists every
# accepted profile name.
test iocmd-8.23 {fconfigure -profile badprofile} -body {
    fconfigure stdin -profile froboz
} -returnCodes error -result {bad profile name "froboz": must be lossless, replace, strict, or tcl8}

# eof called with no argument (9.1) and with two arguments (9.2): both must
# fail with the same usage message and the TCL WRONGARGS error code.
test iocmd-9.1 {eof command} {
    list [catch {eof} msg] $msg $::errorCode
} {1 {wrong # args: should be "eof channelId"} {TCL WRONGARGS}}
test iocmd-9.2 {eof command} {
    list [catch {eof a b} msg] $msg $::errorCode
} {1 {wrong # args: should be "eof channelId"} {TCL WRONGARGS}}
Changes to tests/main.test.
1
2
3
4
5
6
7
8
9
10
11
12




13
14
15
16
17
18
19
# This file contains a collection of tests for generic/tclMain.c.

if {"::tcltest" ni [namespace children]} {
    package require tcltest 2.5
    namespace import -force ::tcltest::*
}

namespace eval ::tcl::test::main {
    namespace import ::tcltest::*

    # Is [exec] defined?
    testConstraint exec [llength [info commands exec]]





    # Is the tcl::test package loaded?
    testConstraint tcl::test [expr {
	[llength [package provide tcl::test]]
	&& [package vsatisfies [package provide tcl::test] 8.5-]}]

    # Procedure to simulate interactive typing of commands, line by line












>
>
>
>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# This file contains a collection of tests for generic/tclMain.c.

if {"::tcltest" ni [namespace children]} {
    package require tcltest 2.5
    namespace import -force ::tcltest::*
}

namespace eval ::tcl::test::main {
    namespace import ::tcltest::*

    # Is [exec] defined?
    testConstraint exec [llength [info commands exec]]

    if {[llength [auto_execok bash]]} {
        testConstraint haveBash 1
    }

    # Is the tcl::test package loaded?
    testConstraint tcl::test [expr {
	[llength [package provide tcl::test]]
	&& [package vsatisfies [package provide tcl::test] 8.5-]}]

    # Procedure to simulate interactive typing of commands, line by line
1279
1280
1281
1282
1283
1284
1285



















1286
1287
1288
1289
1290
1291
1292
1293
		set tcl_interactive 1} >& result
	set f [open result]
	read $f
    } -cleanup {
	close $f
	file delete result
    } -result "1\nfoo\n"




















    cd [workingDirectory]

    cleanupTests
}

namespace delete ::tcl::test::main
return







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>








1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
		set tcl_interactive 1} >& result
	set f [open result]
	read $f
    } -cleanup {
	close $f
	file delete result
    } -result "1\nfoo\n"

    # A child tclsh is started from a bash script with two command line
    # arguments: the lone byte 0xE9 (octal 351) and the two bytes 0xC3 0xA9
    # (octal 303 251). The child prints the code point of the first character
    # of each; the expected output is dce9 for the first and e9 for the second.
    test Tcl_Main-10.1 {
        Invalidly encoded bytes in arguments - TIP 671
    } -constraints {unix haveBash} -setup {
        set tclScript [makeFile {
            lassign $::argv a b
            # a should be \udce9 (lossless map ofe9) b should be \ue9
            puts [list [format %x [scan $a %c]] [format %x [scan $b %c]]]
        } tclScript]
        # Note: the $'...' quoting used below is bash syntax, not plain sh.
        set shellCode [string cat "[info nameofexecutable] $tclScript" { $'\351'} { $'\303\251'} \n]
        set shScript [makeFile $shellCode shScript]
        # Saved here, restored in -cleanup.
        set oldEnc [encoding system]
        encoding system utf-8
    } -cleanup {
        encoding system $oldEnc
    } -body {
        exec {*}[auto_execok bash] $shScript
    } -result {dce9 e9}

    cd [workingDirectory]

    cleanupTests
}

namespace delete ::tcl::test::main
return
Changes to unix/tclUnixChan.c.
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
    if (len==0 || (len>1 && strncmp(optionName, "-xchar", len)==0)) {
	Tcl_DString ds;

	valid = 1;
	tcgetattr(fsPtr->fileState.fd, &iostate);
	Tcl_DStringInit(&ds);

	Tcl_ExternalToUtfDStringEx(NULL, NULL, (char *) &iostate.c_cc[VSTART], 1, TCL_ENCODING_PROFILE_TCL8, &ds, NULL);
	Tcl_DStringAppendElement(dsPtr, Tcl_DStringValue(&ds));
	TclDStringClear(&ds);

	Tcl_ExternalToUtfDStringEx(NULL, NULL, (char *) &iostate.c_cc[VSTOP], 1, TCL_ENCODING_PROFILE_TCL8, &ds, NULL);
	Tcl_DStringAppendElement(dsPtr, Tcl_DStringValue(&ds));
	Tcl_DStringFree(&ds);
    }
    if (len == 0) {
	Tcl_DStringEndSublist(dsPtr);
    }








|



|







1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
    if (len==0 || (len>1 && strncmp(optionName, "-xchar", len)==0)) {
	Tcl_DString ds;

	valid = 1;
	tcgetattr(fsPtr->fileState.fd, &iostate);
	Tcl_DStringInit(&ds);

	TclSystemToInternalEncoding(NULL, (char *) &iostate.c_cc[VSTART], 1, &ds);
	Tcl_DStringAppendElement(dsPtr, Tcl_DStringValue(&ds));
	TclDStringClear(&ds);

	TclSystemToInternalEncoding(NULL, (char *) &iostate.c_cc[VSTOP], 1, &ds);
	Tcl_DStringAppendElement(dsPtr, Tcl_DStringValue(&ds));
	Tcl_DStringFree(&ds);
    }
    if (len == 0) {
	Tcl_DStringEndSublist(dsPtr);
    }

Changes to unix/tclUnixFile.c.
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#ifdef DJGPP
    if (name[1] == ':')
#else
    if (name[0] == '/')
#endif
    {
	encoding = Tcl_GetEncoding(NULL, NULL);
	Tcl_ExternalToUtfDStringEx(NULL, encoding, name, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_TCL8, &utfName, NULL);
	TclSetObjNameOfExecutable(
		Tcl_NewStringObj(Tcl_DStringValue(&utfName), TCL_INDEX_NONE), encoding);
	Tcl_DStringFree(&utfName);
	goto done;
    }

    if (TclpGetCwd(NULL, &cwd) == NULL) {







|







151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#ifdef DJGPP
    if (name[1] == ':')
#else
    if (name[0] == '/')
#endif
    {
	encoding = Tcl_GetEncoding(NULL, NULL);
	Tcl_ExternalToUtfDStringEx(NULL, encoding, name, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_LOSSLESS, &utfName, NULL);
	TclSetObjNameOfExecutable(
		Tcl_NewStringObj(Tcl_DStringValue(&utfName), TCL_INDEX_NONE), encoding);
	Tcl_DStringFree(&utfName);
	goto done;
    }

    if (TclpGetCwd(NULL, &cwd) == NULL) {
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
    }

    Tcl_DStringInit(&nameString);
    Tcl_DStringAppend(&nameString, name, TCL_INDEX_NONE);

    Tcl_DStringFree(&buffer);
    Tcl_UtfToExternalDStringEx(NULL, NULL, Tcl_DStringValue(&cwd),
	    Tcl_DStringLength(&cwd), TCL_ENCODING_PROFILE_TCL8, &buffer, NULL);
    if (Tcl_DStringValue(&cwd)[Tcl_DStringLength(&cwd) -1] != '/') {
	TclDStringAppendLiteral(&buffer, "/");
    }
    Tcl_DStringFree(&cwd);
    TclDStringAppendDString(&buffer, &nameString);
    Tcl_DStringFree(&nameString);

    encoding = Tcl_GetEncoding(NULL, NULL);
    Tcl_ExternalToUtfDStringEx(NULL, encoding, Tcl_DStringValue(&buffer), TCL_INDEX_NONE,
	    TCL_ENCODING_PROFILE_TCL8, &utfName, NULL);
    TclSetObjNameOfExecutable(
	    Tcl_NewStringObj(Tcl_DStringValue(&utfName), TCL_INDEX_NONE), encoding);
    Tcl_DStringFree(&utfName);

  done:
    Tcl_DStringFree(&buffer);
}







|









|







179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
    }

    Tcl_DStringInit(&nameString);
    Tcl_DStringAppend(&nameString, name, TCL_INDEX_NONE);

    Tcl_DStringFree(&buffer);
    Tcl_UtfToExternalDStringEx(NULL, NULL, Tcl_DStringValue(&cwd),
	    Tcl_DStringLength(&cwd), TCL_ENCODING_PROFILE_LOSSLESS, &buffer, NULL);
    if (Tcl_DStringValue(&cwd)[Tcl_DStringLength(&cwd) -1] != '/') {
	TclDStringAppendLiteral(&buffer, "/");
    }
    Tcl_DStringFree(&cwd);
    TclDStringAppendDString(&buffer, &nameString);
    Tcl_DStringFree(&nameString);

    encoding = Tcl_GetEncoding(NULL, NULL);
    Tcl_ExternalToUtfDStringEx(NULL, encoding, Tcl_DStringValue(&buffer), TCL_INDEX_NONE,
	    TCL_ENCODING_PROFILE_LOSSLESS, &utfName, NULL);
    TclSetObjNameOfExecutable(
	    Tcl_NewStringObj(Tcl_DStringValue(&utfName), TCL_INDEX_NONE), encoding);
    Tcl_DStringFree(&utfName);

  done:
    Tcl_DStringFree(&buffer);
}
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
	    }
	}

	/*
	 * Now open the directory for reading and iterate over the contents.
	 */

	if (Tcl_UtfToExternalDStringEx(interp, NULL, dirName, TCL_INDEX_NONE, 0, &ds, NULL) != TCL_OK) {
	    Tcl_DStringFree(&dsOrig);
	    Tcl_DStringFree(&ds);
	    Tcl_DecrRefCount(fileNamePtr);
	    return TCL_ERROR;
	}
	native = Tcl_DStringValue(&ds);








|







304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
	    }
	}

	/*
	 * Now open the directory for reading and iterate over the contents.
	 */

	if (Tcl_UtfToExternalDStringEx(interp, NULL, dirName, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_LOSSLESS, &ds, NULL) != TCL_OK) {
	    Tcl_DStringFree(&dsOrig);
	    Tcl_DStringFree(&ds);
	    Tcl_DecrRefCount(fileNamePtr);
	    return TCL_ERROR;
	}
	native = Tcl_DStringValue(&ds);

375
376
377
378
379
380
381
382
383
384
385
386
387
388
389

	    /*
	     * Now check to see if the file matches, according to both type
	     * and pattern. If so, add the file to the result.
	     */

	    if (Tcl_ExternalToUtfDStringEx(interp, NULL, entryPtr->d_name, TCL_INDEX_NONE,
		    0, &utfDs, NULL) != TCL_OK) {
		matchResult = -1;
		break;
	    }
	    utfname = Tcl_DStringValue(&utfDs);
	    if (Tcl_StringCaseMatch(utfname, pattern, 0)) {
		int typeOk = 1;








|







375
376
377
378
379
380
381
382
383
384
385
386
387
388
389

	    /*
	     * Now check to see if the file matches, according to both type
	     * and pattern. If so, add the file to the result.
	     */

	    if (Tcl_ExternalToUtfDStringEx(interp, NULL, entryPtr->d_name, TCL_INDEX_NONE,
		    TCL_ENCODING_PROFILE_LOSSLESS, &utfDs, NULL) != TCL_OK) {
		matchResult = -1;
		break;
	    }
	    utfname = Tcl_DStringValue(&utfDs);
	    if (Tcl_StringCaseMatch(utfname, pattern, 0)) {
		int typeOk = 1;

607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
    Tcl_DString *bufferPtr)	/* Uninitialized or free DString filled with
				 * name of user's home directory. */
{
    struct passwd *pwPtr;
    Tcl_DString ds;
    const char *native;

    if (Tcl_UtfToExternalDStringEx(NULL, NULL, name, TCL_INDEX_NONE, 0, &ds, NULL) != TCL_OK) {
	Tcl_DStringFree(&ds);
	return NULL;
    }
    native = Tcl_DStringValue(&ds);

    pwPtr = TclpGetPwNam(native);			/* INTL: Native. */
    Tcl_DStringFree(&ds);

    if (pwPtr == NULL) {
	return NULL;
    }
    if (Tcl_ExternalToUtfDStringEx(NULL, NULL, pwPtr->pw_dir, TCL_INDEX_NONE, 0, bufferPtr, NULL) != TCL_OK) {
	return NULL;
    } else {
	return Tcl_DStringValue(bufferPtr);
    }
}

/*







|











|







607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
    Tcl_DString *bufferPtr)	/* Uninitialized or free DString filled with
				 * name of user's home directory. */
{
    struct passwd *pwPtr;
    Tcl_DString ds;
    const char *native;

    if (Tcl_UtfToExternalDStringEx(NULL, NULL, name, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_LOSSLESS, &ds, NULL) != TCL_OK) {
	Tcl_DStringFree(&ds);
	return NULL;
    }
    native = Tcl_DStringValue(&ds);

    pwPtr = TclpGetPwNam(native);			/* INTL: Native. */
    Tcl_DStringFree(&ds);

    if (pwPtr == NULL) {
	return NULL;
    }
    if (Tcl_ExternalToUtfDStringEx(NULL, NULL, pwPtr->pw_dir, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_LOSSLESS, bufferPtr, NULL) != TCL_OK) {
	return NULL;
    } else {
	return Tcl_DStringValue(bufferPtr);
    }
}

/*
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
	if (interp != NULL) {
	    Tcl_SetObjResult(interp, Tcl_ObjPrintf(
		    "error getting working directory name: %s",
		    Tcl_PosixError(interp)));
	}
	return NULL;
    }
    if (Tcl_ExternalToUtfDStringEx(interp, NULL, buffer, TCL_INDEX_NONE, 0, bufferPtr, NULL) != TCL_OK) {
	return NULL;
    }
    return Tcl_DStringValue(bufferPtr);
}

/*
 *---------------------------------------------------------------------------







|







801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
	if (interp != NULL) {
	    Tcl_SetObjResult(interp, Tcl_ObjPrintf(
		    "error getting working directory name: %s",
		    Tcl_PosixError(interp)));
	}
	return NULL;
    }
    if (Tcl_ExternalToUtfDStringEx(interp, NULL, buffer, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_LOSSLESS, bufferPtr, NULL) != TCL_OK) {
	return NULL;
    }
    return Tcl_DStringValue(bufferPtr);
}

/*
 *---------------------------------------------------------------------------
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
{
#ifndef DJGPP
    char link[MAXPATHLEN];
    Tcl_Size length;
    const char *native;
    Tcl_DString ds;

    if (Tcl_UtfToExternalDStringEx(NULL, NULL, path, TCL_INDEX_NONE, 0, &ds, NULL) != TCL_OK) {
	Tcl_DStringFree(&ds);
	return NULL;
    }
    native = Tcl_DStringValue(&ds);
    length = readlink(native, link, sizeof(link));	/* INTL: Native. */
    Tcl_DStringFree(&ds);

    if (length < 0) {
	return NULL;
    }

    if (Tcl_ExternalToUtfDStringEx(NULL, NULL, link, length, 0, linkPtr, NULL) == TCL_OK) {
	return Tcl_DStringValue(linkPtr);
    }
#endif /* !DJGPP */
    
    return NULL;
}








|











|







839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
{
#ifndef DJGPP
    char link[MAXPATHLEN];
    Tcl_Size length;
    const char *native;
    Tcl_DString ds;

    if (Tcl_UtfToExternalDStringEx(NULL, NULL, path, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_LOSSLESS, &ds, NULL) != TCL_OK) {
	Tcl_DStringFree(&ds);
	return NULL;
    }
    native = Tcl_DStringValue(&ds);
    length = readlink(native, link, sizeof(link));	/* INTL: Native. */
    Tcl_DStringFree(&ds);

    if (length < 0) {
	return NULL;
    }

    if (Tcl_ExternalToUtfDStringEx(NULL, NULL, link, length, TCL_ENCODING_PROFILE_LOSSLESS, linkPtr, NULL) == TCL_OK) {
	return Tcl_DStringValue(linkPtr);
    }
#endif /* !DJGPP */
    
    return NULL;
}

986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
	     */

	    transPtr = Tcl_FSGetTranslatedPath(NULL, toPtr);
	    if (transPtr == NULL) {
		return NULL;
	    }
	    target = Tcl_GetStringFromObj(transPtr, &length);
	    if (Tcl_UtfToExternalDStringEx(NULL, NULL, target, length, 0, &ds, NULL) != TCL_OK) {
		Tcl_DStringFree(&ds);
		return NULL;
	    }
	    target = Tcl_DStringValue(&ds);
	    Tcl_DecrRefCount(transPtr);

	    if (symlink(target, src) != 0) {







|







986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
	     */

	    transPtr = Tcl_FSGetTranslatedPath(NULL, toPtr);
	    if (transPtr == NULL) {
		return NULL;
	    }
	    target = Tcl_GetStringFromObj(transPtr, &length);
	    if (Tcl_UtfToExternalDStringEx(NULL, NULL, target, length, TCL_ENCODING_PROFILE_LOSSLESS, &ds, NULL) != TCL_OK) {
		Tcl_DStringFree(&ds);
		return NULL;
	    }
	    target = Tcl_DStringValue(&ds);
	    Tcl_DecrRefCount(transPtr);

	    if (symlink(target, src) != 0) {
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
	Tcl_DecrRefCount(transPtr);

	length = readlink((const char *)Tcl_FSGetNativePath(pathPtr), link, sizeof(link));
	if (length < 0) {
	    return NULL;
	}

	if (Tcl_ExternalToUtfDStringEx(NULL, NULL, link, (size_t)length, 0, &ds, NULL) != TCL_OK) {
	    return NULL;
	}
	linkPtr = Tcl_DStringToObj(&ds);
	Tcl_IncrRefCount(linkPtr);
	return linkPtr;
    }
}







|







1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
	Tcl_DecrRefCount(transPtr);

	length = readlink((const char *)Tcl_FSGetNativePath(pathPtr), link, sizeof(link));
	if (length < 0) {
	    return NULL;
	}

	if (Tcl_ExternalToUtfDStringEx(NULL, NULL, link, (size_t)length, TCL_ENCODING_PROFILE_LOSSLESS, &ds, NULL) != TCL_OK) {
	    return NULL;
	}
	linkPtr = Tcl_DStringToObj(&ds);
	Tcl_IncrRefCount(linkPtr);
	return linkPtr;
    }
}
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106

Tcl_Obj *
TclpNativeToNormalized(
    void *clientData)
{
    Tcl_DString ds;

    /*
     * clientData is treated as a NUL-terminated native string (length is
     * given as TCL_INDEX_NONE). No explicit encoding and no profile flags
     * are passed, and the conversion status is not checked: whatever ends
     * up in ds is handed back to the caller as a Tcl_Obj.
     */
    Tcl_ExternalToUtfDStringEx(NULL, NULL, (const char *) clientData, TCL_INDEX_NONE, 0, &ds, NULL);
    return Tcl_DStringToObj(&ds);
}

/*
 *---------------------------------------------------------------------------
 *
 * TclNativeCreateNativeRep --







|







1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106

Tcl_Obj *
TclpNativeToNormalized(
    void *clientData)
{
    const char *nativePath = (const char *) clientData;
    Tcl_DString utfPath;

    /*
     * Convert the NUL-terminated native string using the lossless profile.
     * The conversion status is deliberately not inspected; the contents of
     * the DString are returned to the caller as a Tcl_Obj either way.
     */

    (void) Tcl_ExternalToUtfDStringEx(NULL, NULL, nativePath, TCL_INDEX_NONE,
	    TCL_ENCODING_PROFILE_LOSSLESS, &utfPath, NULL);
    return Tcl_DStringToObj(&utfPath);
}

/*
 *---------------------------------------------------------------------------
 *
 * TclNativeCreateNativeRep --
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
	if (validPathPtr == NULL) {
	    return NULL;
	}
	Tcl_IncrRefCount(validPathPtr);
    }

    str = Tcl_GetStringFromObj(validPathPtr, &len);
    if (Tcl_UtfToExternalDStringEx(NULL, NULL, str, len, 0, &ds, NULL) != TCL_OK) {
	Tcl_DecrRefCount(validPathPtr);
	Tcl_DStringFree(&ds);
	return NULL;
    }
    len = Tcl_DStringLength(&ds) + sizeof(char);
    if (strlen(Tcl_DStringValue(&ds)) < len - sizeof(char)) {
	/* See bug [3118489]: NUL in filenames */







|







1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
	if (validPathPtr == NULL) {
	    return NULL;
	}
	Tcl_IncrRefCount(validPathPtr);
    }

    str = Tcl_GetStringFromObj(validPathPtr, &len);
    if (Tcl_UtfToExternalDStringEx(NULL, NULL, str, len, TCL_ENCODING_PROFILE_LOSSLESS, &ds, NULL) != TCL_OK) {
	Tcl_DecrRefCount(validPathPtr);
	Tcl_DStringFree(&ds);
	return NULL;
    }
    len = Tcl_DStringLength(&ds) + sizeof(char);
    if (strlen(Tcl_DStringValue(&ds)) < len - sizeof(char)) {
	/* See bug [3118489]: NUL in filenames */
Changes to unix/tclUnixInit.c.
469
470
471
472
473
474
475
476
477




478
479
480
481
482
483
484
     * Look for the library relative to the TCL_LIBRARY env variable. If the
     * last dirname in the TCL_LIBRARY path does not match the last dirname in
     * the installLib variable, use the last dir name of installLib in
     * addition to the original TCL_LIBRARY path.
     */

    str = getenv("TCL_LIBRARY");			/* INTL: Native. */
    Tcl_ExternalToUtfDStringEx(NULL, NULL, str, TCL_INDEX_NONE, TCL_ENCODING_PROFILE_TCL8, &buffer, NULL);
    str = Tcl_DStringValue(&buffer);





    if ((str != NULL) && (str[0] != '\0')) {
	Tcl_DString ds;
	Tcl_Size pathc;
	const char **pathv;
	char installLib[LIBRARY_SIZE];








|
|
>
>
>
>







469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
     * Look for the library relative to the TCL_LIBRARY env variable. If the
     * last dirname in the TCL_LIBRARY path does not match the last dirname in
     * the installLib variable, use the last dir name of installLib in
     * addition to the original TCL_LIBRARY path.
     */

    str = getenv("TCL_LIBRARY");			/* INTL: Native. */
    if (TclSystemToInternalEncoding(NULL, str, -1, &buffer) == TCL_OK) {
        str = Tcl_DStringValue(&buffer);
    } else {
        /* Note: buffer is initialized even on error, so it can still be cleared later. */
        str = NULL;
    }

    if ((str != NULL) && (str[0] != '\0')) {
	Tcl_DString ds;
	Tcl_Size pathc;
	const char **pathv;
	char installLib[LIBRARY_SIZE];

Changes to win/tclWinSock.c.
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
	Tcl_DString inDs;

	Tcl_DStringInit(&inDs);
	Tcl_DStringSetLength(&inDs, 256);
	if (gethostname(Tcl_DStringValue(&inDs),
		Tcl_DStringLength(&inDs)) == 0) {
	    Tcl_ExternalToUtfDStringEx(NULL, NULL, Tcl_DStringValue(&inDs),
		    TCL_INDEX_NONE, TCL_ENCODING_PROFILE_TCL8, &ds, NULL);
	}
	Tcl_DStringFree(&inDs);
    }

    *encodingPtr = Tcl_GetEncoding(NULL, "utf-8");
    *lengthPtr = Tcl_DStringLength(&ds);
    *valuePtr = (char *)Tcl_Alloc(*lengthPtr + 1);







|







370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
	Tcl_DString inDs;

	Tcl_DStringInit(&inDs);
	Tcl_DStringSetLength(&inDs, 256);
	if (gethostname(Tcl_DStringValue(&inDs),
		Tcl_DStringLength(&inDs)) == 0) {
	    Tcl_ExternalToUtfDStringEx(NULL, NULL, Tcl_DStringValue(&inDs),
                -1, TCL_ENCODING_PROFILE_LOSSLESS, &ds, NULL);
	}
	Tcl_DStringFree(&inDs);
    }

    *encodingPtr = Tcl_GetEncoding(NULL, "utf-8");
    *lengthPtr = Tcl_DStringLength(&ds);
    *valuePtr = (char *)Tcl_Alloc(*lengthPtr + 1);