Diff
Not logged in

Differences From Artifact [d71ee01f31]:

To Artifact [385e9d7fe5]:


62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83

84
85
86
87
88
89
90
static CONST unsigned char totalBytes[256] = {
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
#if TCL_UTF_MAX > 3
    4,4,4,4,4,4,4,4,
#else
    1,1,1,1,1,1,1,1,
#endif
    1,1,1,1,1,1,1,1
};

/*
 * Functions used only in this module.
 */

static int		UtfCount(int ch);


/*
 *---------------------------------------------------------------------------
 *
 * UtfCount --
 *
 *	Find the number of bytes in the Utf character "ch".







|














>







62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
static CONST unsigned char totalBytes[256] = {
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
#if TCL_UTF_MAX > 3
    4,4,4,4,4,4,4,4,
#else
    1,1,1,1,1,1,1,1,
#endif
    1,1,1,1,1,1,1,1
};

/*
 * Functions used only in this module.
 */

static int		UtfCount(int ch);
static int		Overlong(unsigned char *src);

/*
 *---------------------------------------------------------------------------
 *
 * UtfCount --
 *
 *	Find the number of bytes in the Utf character "ch".
111
112
113
114
115
116
117
118





















































119
120
121
122
123
124
125
#if TCL_UTF_MAX > 3
    if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) {
	return 4;
    }
#endif
    return 3;
}






















































/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UniCharToUtf --
 *
 *	Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
 *	provided buffer. Equivalent to Plan 9 runetochar().







|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#if TCL_UTF_MAX > 3
    if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) {
	return 4;
    }
#endif
    return 3;
}

/*
 *---------------------------------------------------------------------------
 *
 * Overlong --
 *
 *	Utility routine to report whether /src/ points to the start of an
 *	overlong byte sequence that should be rejected. Caller guarantees
 *	that src[0] and src[1] are readable, and
 *
 *	(src[0] >= 0xC0) && (src[0] != 0xC1)
 * 	(src[1] >= 0x80) && (src[1] < 0xC0)
 *	(src[0] < ((TCL_UTF_MAX > 3) ? 0xF8 : 0xF0))
 *
 * Results:
 *	A boolean.
 *---------------------------------------------------------------------------
 */

static CONST unsigned char overlong[3] = {
    0x80,	/* \xD0 -- all sequences valid */
    0xA0,	/* \xE0\x80 through \xE0\x9F are invalid prefixes */
#if TCL_UTF_MAX > 3
    0x90	/* \xF0\x80 through \xF0\x8F are invalid prefixes */
#else
    0xC0	/* Not used, but reject all again for safety. */
#endif
};

INLINE static int
Overlong(
    unsigned char *src)	/* Points to lead byte of a UTF-8 byte sequence */
{
    unsigned char byte = *src;

    if (byte % 0x10) {
	/* Only lead bytes 0xC0, 0xE0, 0xF0 need examination */
	return 0;
    }
    if (byte == 0xC0) {
	if (src[1] == 0x80) {
	    /* Valid sequence: \xC0\x80 for \u0000 */
	    return 0;
	}
	/* Reject overlong: \xC0\x81 - \xC0\xBF */
	return 1;
    }
    if (src[1] < overlong[(byte >> 4) - 0x0D]) {
	/* Reject overlong */
	return 1;
    }
    return 0;
}

/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UniCharToUtf --
 *
 *	Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
 *	provided buffer. Equivalent to Plan 9 runetochar().
657
658
659
660
661
662
663

664
665




666
667
668
669
670
671
672
673
674
675

676

677






678
679
680










681










682








683















684
685


686

687






688
689
690
691
692
693
694
695
 */

CONST char *
Tcl_UtfPrev(
    CONST char *src,		/* A location in a UTF-8 string. */
    CONST char *start)		/* Pointer to the beginning of the string */
{

    CONST char *look;
    int i, byte;





    src--;
    look = src;
    for (i = 0; i < TCL_UTF_MAX; i++) {
	if (look < start) {
	    if (src < start) {
		src = start;
	    }
	    break;
	}

	byte = *((unsigned char *) look);

	if (byte < 0x80) {






	    break;
	}
	if (byte >= 0xC0) {










	    if (totalBytes[byte] <= i) {










		break;








	    }















	    return look;
	}


	look--;

    }






    return src;
}

/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UniCharAtIndex --
 *







>
|
|
>
>
>
>

<
|
<
|
<
|
|
<
|
>
|
>

>
>
>
>
>
>
|


>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|

>
>

>
|
>
>
>
>
>
>
|







711
712
713
714
715
716
717
718
719
720
721
722
723
724
725

726

727

728
729

730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
 */

CONST char *
Tcl_UtfPrev(
    CONST char *src,		/* A location in a UTF-8 string. */
    CONST char *start)		/* Pointer to the beginning of the string */
{
    int trailBytesSeen = 0;	/* How many trail bytes have been verified? */
    CONST char *fallback = src - 1;
				/* If we cannot find a lead byte that might
				 * start a prefix of a valid UTF byte sequence,
				 * we will fallback to a one-byte back step */
    unsigned char *look = (unsigned char *)fallback;
				/* Start search at the fallback position */


    /* Quick boundary case exit. */

    if (fallback <= start) {

	return start;
    }


    do {
	unsigned char byte = look[0];

	if (byte < 0x80) {
	    /*
	     * Single byte character. Either this is a correct previous
	     * character, or it is followed by at least one trail byte
	     * which indicates a malformed sequence. In either case the
	     * correct result is to return the fallback.
	     */
	    return fallback;
	}
	if (byte >= 0xC0) {
	    /* Non-trail byte; May be multibyte lead. */

	    if ((trailBytesSeen == 0)
		/*
		 * We've seen no trailing context to use to check
		 * anything. From what we know, this non-trail byte
		 * is a prefix of a previous character, and accepting
		 * it (the fallback) is correct.
		 */

		    || (trailBytesSeen >= totalBytes[byte])) {
		/*
		 * That is, (1 + trailBytesSeen > needed).
		 * We've examined more bytes than needed to complete
		 * this lead byte. No matter about well-formedness or
		 * validity, the sequence starting with this lead byte
		 * will never include the fallback location, so we must
		 * return the fallback location. See test utf-7.17
		 */
		return fallback;
	    }

	    /*
	     * trailBytesSeen > 0, so we can examine look[1] safely.
	     * Use that capability to screen out overlong sequences.
	     */

	    if (Overlong(look)) {
		/* Reject */
		return fallback;
	    }
	    return (CONST char *)look;
	}

	/* We saw a trail byte. */
	trailBytesSeen++;

	if ((CONST char *)look == start) {
	    /*
	     * Do not read before the start of the string
	     *
	     * If we get here, we've examined bytes at every location
	     * >= start and < src and all of them are trail bytes,
	     * including (*start).  We need to return our fallback
	     * and exit this loop before we run past the start of the string.
	     */
	    return fallback;
	}

	/* Continue the search backwards... */
	look--;
    } while (trailBytesSeen < TCL_UTF_MAX);

    /*
     * We've seen TCL_UTF_MAX trail bytes, so we know there will not be a
     * properly formed byte sequence to find, and we can stop looking,
     * accepting the fallback.
     */

    return fallback;
}

/*
 *---------------------------------------------------------------------------
 *
 * Tcl_UniCharAtIndex --
 *