Changes On Branch 36b40699fe3fc9f3
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Changes In Branch bug-ed29806baf-8.7 Excluding Merge-Ins

This is equivalent to a diff from 7dccfe91e3 to 36b40699fe

2020-05-07
21:59
Merge 8.6 check-in: 6f4a6b90ef user: dgp tags: core-8-branch
17:02
merge 8.7 Leaf check-in: 36b40699fe user: dgp tags: bug-ed29806baf-8.7
14:47
Merge 8.7 check-in: 63344df549 user: jan.nijtmans tags: trunk
14:38
Merge 8.6 check-in: 7dccfe91e3 user: jan.nijtmans tags: core-8-branch
14:15
For TCL_UTF_MAX==4: Make sure that Tcl_UtfNext()/Tcl_UtfPrev() never move more than 3 bytes. This is... check-in: 116d4a8943 user: jan.nijtmans tags: core-8-6-branch
11:17
Fix [fad64a857e76f98e]: "lsearch" provides wrong errorCode with bad -stride option check-in: a368336356 user: jan.nijtmans tags: core-8-branch
2020-05-01
23:45
Encoding improvements, handling Unicode > 0xFFFF check-in: 02e76e4aa7 user: jan.nijtmans tags: bug-ed29806baf-8.7

Changes to generic/tclEncoding.c.
2500
2501
2502
2503
2504
2505
2506
2507

2508
2509
2510
2511
2512
2513
2514
2500
2501
2502
2503
2504
2505
2506

2507
2508
2509
2510
2511
2512
2513
2514







-
+








static int
UtfToUtf16Proc(
    ClientData clientData,	/* != NULL means LE, == NUL means BE */
    const char *src,		/* Source string in UTF-8. */
    int srcLen,			/* Source string length in bytes. */
    int flags,			/* Conversion control flags. */
    Tcl_EncodingState *statePtr,/* Place for conversion routine to store state
    TCL_UNUSED(Tcl_EncodingState *),/* Place for conversion routine to store state
				 * information used during a piecewise
				 * conversion. Contents of statePtr are
				 * initialized and/or reset by conversion
				 * routine under control of flags argument. */
    char *dst,			/* Output buffer in which converted string is
				 * stored. */
    int dstLen,			/* The maximum length of output buffer in
2523
2524
2525
2526
2527
2528
2529
2530
2531

2532
2533
2534

2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547

2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560

2561
2562
2563
2564
2565
2566



2567
2568
2569
2570
2571




2572
2573
2574
2575
2576
2577
2578
2579
2580
2581



2582
2583
2584
2585
2586




2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2523
2524
2525
2526
2527
2528
2529


2530



2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543

2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556

2557
2558
2559




2560
2561
2562
2563




2564
2565
2566
2567
2568




2569




2570
2571
2572
2573




2574
2575
2576
2577
2578




2579
2580
2581
2582
2583
2584
2585







-
-
+
-
-
-
+












-
+












-
+


-
-
-
-
+
+
+

-
-
-
-
+
+
+
+

-
-
-
-

-
-
-
-
+
+
+

-
-
-
-
+
+
+
+

-
-
-
-







				 * the conversion. */
    int *dstCharsPtr)		/* Filled with the number of characters that
				 * correspond to the bytes stored in the
				 * output buffer. */
{
    const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
    int result, numChars;
    Tcl_UniChar *chPtr = (Tcl_UniChar *) statePtr;

    int ch;
    if (flags & TCL_ENCODING_START) {
    	*statePtr = 0;
    }

    srcStart = src;
    srcEnd = src + srcLen;
    srcClose = srcEnd;
    if ((flags & TCL_ENCODING_END) == 0) {
	srcClose -= TCL_UTF_MAX;
    }

    dstStart = dst;
    dstEnd   = dst + dstLen - sizeof(Tcl_UniChar);

    result = TCL_OK;
    for (numChars = 0; src < srcEnd; numChars++) {
	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
	if ((src > srcClose) && (!TclUCS4Complete(src, srcEnd - src))) {
	    /*
	     * If there is more string to follow, this will ensure that the
	     * last UTF-8 character in the source buffer hasn't been cut off.
	     */

	    result = TCL_CONVERT_MULTIBYTE;
	    break;
	}
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}
	src += TclUtfToUniChar(src, chPtr);
	src += TclUtfToUCS4(src, &ch);

	if (clientData) {
#if TCL_UTF_MAX > 3
	    if (*chPtr <= 0xFFFF) {
		*dst++ = (*chPtr & 0xFF);
		*dst++ = (*chPtr >> 8);
	    if (ch <= 0xFFFF) {
		*dst++ = (ch & 0xFF);
		*dst++ = (ch >> 8);
	    } else {
		*dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
		*dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
		*dst++ = (*chPtr & 0xFF);
		*dst++ = ((*chPtr & 0x3) >> 8) | 0xDC;
		*dst++ = (((ch - 0x10000) >> 10) & 0xFF);
		*dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8;
		*dst++ = (ch & 0xFF);
		*dst++ = ((ch & 0x3) >> 8) | 0xDC;
	    }
#else
	    *dst++ = (*chPtr & 0xFF);
	    *dst++ = (*chPtr >> 8);
#endif
	} else {
#if TCL_UTF_MAX > 3
	    if (*chPtr <= 0xFFFF) {
		*dst++ = (*chPtr >> 8);
		*dst++ = (*chPtr & 0xFF);
	    if (ch <= 0xFFFF) {
		*dst++ = (ch >> 8);
		*dst++ = (ch & 0xFF);
	    } else {
		*dst++ = ((*chPtr & 0x3) >> 8) | 0xDC;
		*dst++ = (*chPtr & 0xFF);
		*dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8;
		*dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF);
		*dst++ = ((ch & 0x3) >> 8) | 0xDC;
		*dst++ = (ch & 0xFF);
		*dst++ = (((ch - 0x10000) >> 18) & 0x3) | 0xD8;
		*dst++ = (((ch - 0x10000) >> 10) & 0xFF);
	    }
#else
	    *dst++ = (*chPtr >> 8);
	    *dst++ = (*chPtr & 0xFF);
#endif
	}
    }
    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
    return result;
}
2634
2635
2636
2637
2638
2639
2640
2641
2642

2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658

2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676






2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2621
2622
2623
2624
2625
2626
2627


2628


2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641

2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654






2655
2656
2657
2658
2659
2660



2661
2662

2663
2664
2665
2666
2667
2668
2669







-
-
+
-
-













-
+












-
-
-
-
-
-
+
+
+
+
+
+
-
-
-


-







				 * the conversion. */
    int *dstCharsPtr)		/* Filled with the number of characters that
				 * correspond to the bytes stored in the
				 * output buffer. */
{
    const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd;
    int result, numChars;
#if TCL_UTF_MAX <= 3
    int len;
    int ch;
#endif
    Tcl_UniChar ch = 0;

    srcStart = src;
    srcEnd = src + srcLen;
    srcClose = srcEnd;
    if ((flags & TCL_ENCODING_END) == 0) {
	srcClose -= TCL_UTF_MAX;
    }

    dstStart = dst;
    dstEnd   = dst + dstLen - sizeof(Tcl_UniChar);

    result = TCL_OK;
    for (numChars = 0; src < srcEnd; numChars++) {
	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
	if ((src > srcClose) && (!TclUCS4Complete(src, srcEnd - src))) {
	    /*
	     * If there is more string to follow, this will ensure that the
	     * last UTF-8 character in the source buffer hasn't been cut off.
	     */

	    result = TCL_CONVERT_MULTIBYTE;
	    break;
	}
	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}
#if TCL_UTF_MAX <= 3
	src += (len = TclUtfToUniChar(src, &ch));
	if ((ch >= 0xD800) && (len < 3)) {
	    src += TclUtfToUniChar(src, &ch);
	    ch = 0xFFFD;
	}
	src += TclUtfToUCS4(src, &ch);
	if (ch & ~0xFFFF) {
	    if (flags & TCL_ENCODING_STOPONERROR) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }
#else
	src += TclUtfToUniChar(src, &ch);
	if (ch > 0xFFFF) {
	    ch = 0xFFFD;
	}
#endif

	/*
	 * Need to handle this in a way that won't cause misalignment by
	 * casting dst to a Tcl_UniChar. [Bug 1122671]
	 */

	if (clientData) {
2848
2849
2850
2851
2852
2853
2854
2855

2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876

2877
2878
2879
2880
2881
2882
2883
2884
2885

2886
2887
2888
2889
2890
2891
2892
2893

2894
2895

2896
2897
2898
2899
2900
2901

2902
2903
2904
2905
2906
2907
2908
2828
2829
2830
2831
2832
2833
2834

2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855

2856
2857
2858
2859
2860
2861
2862
2863
2864

2865
2866







2867
2868

2869





2870
2871
2872
2873
2874
2875
2876
2877
2878







-
+




















-
+








-
+

-
-
-
-
-
-
-
+

-
+
-
-
-
-
-

+







				 * the conversion. */
    int *dstCharsPtr)		/* Filled with the number of characters that
				 * correspond to the bytes stored in the
				 * output buffer. */
{
    const char *srcStart, *srcEnd, *srcClose;
    const char *dstStart, *dstEnd, *prefixBytes;
    Tcl_UniChar ch = 0;
    int ch;
    int result, len, word, numChars;
    TableEncodingData *dataPtr = (TableEncodingData *)clientData;
    const unsigned short *const *fromUnicode;

    result = TCL_OK;

    prefixBytes = dataPtr->prefixBytes;
    fromUnicode = (const unsigned short *const *) dataPtr->fromUnicode;

    srcStart = src;
    srcEnd = src + srcLen;
    srcClose = srcEnd;
    if ((flags & TCL_ENCODING_END) == 0) {
	srcClose -= TCL_UTF_MAX;
    }

    dstStart = dst;
    dstEnd = dst + dstLen - 1;

    for (numChars = 0; src < srcEnd; numChars++) {
	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
	if ((src > srcClose) && (!TclUCS4Complete(src, srcEnd - src))) {
	    /*
	     * If there is more string to follow, this will ensure that the
	     * last UTF-8 character in the source buffer hasn't been cut off.
	     */

	    result = TCL_CONVERT_MULTIBYTE;
	    break;
	}
	len = TclUtfToUniChar(src, &ch);
	len = TclUtfToUCS4(src, &ch);

#if TCL_UTF_MAX > 3
	/*
	 * This prevents a crash condition. More evaluation is required for
	 * full support of int Tcl_UniChar. [Bug 1004065]
	 */

	if (ch & 0xFFFF0000) {
	if (ch & ~0xFFFF) {
	    word = 0;
	} else
	} else {
#else
	if (!len) {
	    word = 0;
	} else
#endif
	    word = fromUnicode[(ch >> 8)][ch & 0xFF];
	}

	if ((word == 0) && (ch != 0)) {
	    if (flags & TCL_ENCODING_STOPONERROR) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }
	    word = dataPtr->fallback;
3050
3051
3052
3053
3054
3055
3056
3057

3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072

3073
3074
3075
3076
3077
3078
3079
3080
3081

3082
3083
3084
3085
3086
3087

3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103

3104
3105
3106
3107
3108
3109
3110
3020
3021
3022
3023
3024
3025
3026

3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041

3042
3043
3044
3045
3046
3047
3048
3049
3050

3051
3052
3053
3054
3055
3056

3057




3058
3059
3060
3061



3062
3063
3064
3065

3066
3067
3068
3069
3070
3071
3072
3073







-
+














-
+








-
+





-
+
-
-
-
-




-
-
-




-
+







    int *dstCharsPtr)		/* Filled with the number of characters that
				 * correspond to the bytes stored in the
				 * output buffer. */
{
    const char *srcStart, *srcEnd, *srcClose;
    const char *dstStart, *dstEnd;
    int result = TCL_OK, numChars;
    Tcl_UniChar ch = 0;
    int ch;

    srcStart = src;
    srcEnd = src + srcLen;
    srcClose = srcEnd;
    if ((flags & TCL_ENCODING_END) == 0) {
	srcClose -= TCL_UTF_MAX;
    }

    dstStart = dst;
    dstEnd = dst + dstLen - 1;

    for (numChars = 0; src < srcEnd; numChars++) {
	int len;

	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
	if ((src > srcClose) && (!TclUCS4Complete(src, srcEnd - src))) {
	    /*
	     * If there is more string to follow, this will ensure that the
	     * last UTF-8 character in the source buffer hasn't been cut off.
	     */

	    result = TCL_CONVERT_MULTIBYTE;
	    break;
	}
	len = TclUtfToUniChar(src, &ch);
	len = TclUtfToUCS4(src, &ch);

	/*
	 * Check for illegal characters.
	 */

	if (ch > 0xFF
	if (ch > 0xFF) {
#if TCL_UTF_MAX <= 3
		|| ((ch >= 0xD800) && (len < 3))
#endif
		) {
	    if (flags & TCL_ENCODING_STOPONERROR) {
		result = TCL_CONVERT_UNKNOWN;
		break;
	    }
#if TCL_UTF_MAX <= 3
	    if ((ch >= 0xD800) && (len < 3)) len = 4;
#endif
	    /*
	     * Plunge on, using '?' as a fallback character.
	     */

	    ch = (Tcl_UniChar) '?';
	    ch = '?';
	}

	if (dst > dstEnd) {
	    result = TCL_CONVERT_NOSPACE;
	    break;
	}
	*(dst++) = (char) ch;
3415
3416
3417
3418
3419
3420
3421
3422

3423
3424
3425
3426
3427
3428
3429
3378
3379
3380
3381
3382
3383
3384

3385
3386
3387
3388
3389
3390
3391
3392







-
+







    const Encoding *encodingPtr;
    const char *srcStart, *srcEnd, *srcClose;
    const char *dstStart, *dstEnd;
    int state, result, numChars;
    const TableEncodingData *tableDataPtr;
    const char *tablePrefixBytes;
    const unsigned short *const *tableFromUnicode;
    Tcl_UniChar ch = 0;
    int ch;

    result = TCL_OK;

    srcStart = src;
    srcEnd = src + srcLen;
    srcClose = srcEnd;
    if ((flags & TCL_ENCODING_END) == 0) {
3457
3458
3459
3460
3461
3462
3463
3464

3465
3466
3467
3468
3469
3470
3471
3472
3473
3474






3475
3476
3477
3478
3479
3480
3481
3482
3483



3484


3485
3486
3487
3488
3489
3490
3491
3420
3421
3422
3423
3424
3425
3426

3427
3428
3429
3430
3431
3432
3433
3434
3435


3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453

3454
3455
3456
3457
3458
3459
3460
3461
3462







-
+








-
-
+
+
+
+
+
+









+
+
+
-
+
+







    tableFromUnicode = (const unsigned short *const *)
	    tableDataPtr->fromUnicode;

    for (numChars = 0; src < srcEnd; numChars++) {
	unsigned len;
	int word;

	if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) {
	if ((src > srcClose) && (!TclUCS4Complete(src, srcEnd - src))) {
	    /*
	     * If there is more string to follow, this will ensure that the
	     * last UTF-8 character in the source buffer hasn't been cut off.
	     */

	    result = TCL_CONVERT_MULTIBYTE;
	    break;
	}
	len = TclUtfToUniChar(src, &ch);
	word = tableFromUnicode[(ch >> 8)][ch & 0xFF];
	len = TclUtfToUCS4(src, &ch);
	if (ch & ~0xFFFF) {
	    word = 0;
	} else {
	    word = tableFromUnicode[(ch >> 8)][ch & 0xFF];
	}

	if ((word == 0) && (ch != 0)) {
	    int oldState;
	    const EscapeSubTable *subTablePtr;

	    oldState = state;
	    for (state = 0; state < dataPtr->numSubTables; state++) {
		encodingPtr = GetTableEncoding(dataPtr, state);
		tableDataPtr = (const TableEncodingData *)encodingPtr->clientData;
		if (ch & ~0xFFFF) {
		    word = 0;
		} else {
		word = tableDataPtr->fromUnicode[(ch >> 8)][ch & 0xFF];
		    word = tableDataPtr->fromUnicode[(ch >> 8)][ch & 0xFF];
		}
		if (word != 0) {
		    break;
		}
	    }

	    if (word == 0) {
		state = oldState;