Differences From Artifact [d71ee01f31]:
- File generic/tclUtf.c — part of check-in [4227206dc3] at 2020-04-20 04:40:38 on branch dgp-utf-explore — Apply first fix of [c61818e4c9] and adjust tests. (user: dgp size: 48435)
To Artifact [385e9d7fe5]:
- File generic/tclUtf.c — part of check-in [32bc8e9f0c] at 2020-04-20 04:58:52 on branch dgp-utf-explore — Apply fix for [5e6346a252] and adjust tests. (user: dgp size: 51958)
| ︙ | ︙ | |||
62 63 64 65 66 67 68 |
static CONST unsigned char totalBytes[256] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
| | > | 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
/*
 * totalBytes[b] is the total length, in bytes, of the UTF-8 sequence that
 * the byte b is taken to start. Bytes that cannot start a multi-byte
 * sequence are given a length of 1.
 */
static CONST unsigned char totalBytes[256] = {
/* 0x00 - 0xBF: every entry is 1 */
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
/* 0xC0 - 0xDF: 2-byte lead bytes, except 0xC1 which is given length 1 */
2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
/* 0xE0 - 0xEF: 3-byte lead bytes */
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
/* 0xF0 - 0xF7: 4-byte lead bytes, honored only when TCL_UTF_MAX > 3 */
#if TCL_UTF_MAX > 3
4,4,4,4,4,4,4,4,
#else
1,1,1,1,1,1,1,1,
#endif
/* 0xF8 - 0xFF: every entry is 1 */
1,1,1,1,1,1,1,1
};
/*
* Functions used only in this module.
*/
static int UtfCount(int ch);
static int Overlong(unsigned char *src);
/*
*---------------------------------------------------------------------------
*
* UtfCount --
*
* Find the number of bytes in the Utf character "ch".
|
| ︙ | ︙ | |||
111 112 113 114 115 116 117 |
#if TCL_UTF_MAX > 3
if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) {
return 4;
}
#endif
return 3;
}
| | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 |
#if TCL_UTF_MAX > 3
if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) {
return 4;
}
#endif
return 3;
}
/*
*---------------------------------------------------------------------------
*
* Overlong --
*
* Utility routine to report whether /src/ points to the start of an
* overlong byte sequence that should be rejected. Caller guarantees
* that src[0] and src[1] are readable, and
*
* (src[0] >= 0xC0) && (src[0] != 0xC1)
* (src[1] >= 0x80) && (src[1] < 0xC0)
* (src[0] < ((TCL_UTF_MAX > 3) ? 0xF8 : 0xF0))
*
* Results:
* A boolean: 1 if the sequence is overlong and must be rejected,
* 0 otherwise.
*---------------------------------------------------------------------------
*/
/*
 * Lower bounds for the first trail byte, indexed by (lead byte >> 4) - 0x0D,
 * giving one entry each for the lead bytes 0xD0, 0xE0 and 0xF0. Overlong()
 * reports a sequence as overlong when src[1] is smaller than the entry.
 */
static CONST unsigned char overlong[3] = {
0x80, /* Lead \xD0: no trail byte is below 0x80, so all sequences are valid */
0xA0, /* Lead \xE0: \xE0\x80 through \xE0\x9F are invalid (overlong) prefixes */
#if TCL_UTF_MAX > 3
0x90 /* Lead \xF0: \xF0\x80 through \xF0\x8F are invalid (overlong) prefixes */
#else
0xC0 /* Lead \xF0 is excluded by the caller here; 0xC0 exceeds every trail
      * byte, so any sequence that did arrive would be rejected for safety. */
#endif
};
INLINE static int
Overlong(
    unsigned char *src)		/* Points to lead byte of a UTF-8 byte sequence */
{
    unsigned char lead = src[0];

    /*
     * Only the lead bytes whose low nibble is zero (0xC0, 0xD0, 0xE0 and
     * 0xF0 under the caller's guarantees) need any examination; everything
     * else is accepted immediately without looking at the trail byte.
     */

    if ((lead & 0x0F) != 0) {
        return 0;
    }

    /*
     * \xC0\x80 is the one accepted two-byte form starting with \xC0 (it
     * stands for \u0000). \xC0\x81 through \xC0\xBF are overlong.
     */

    if (lead == 0xC0) {
        return (src[1] != 0x80);
    }

    /*
     * For the remaining lead bytes the table holds the smallest first trail
     * byte that is acceptable; anything below it is an overlong prefix.
     */

    return (src[1] < overlong[(lead >> 4) - 0x0D]);
}
/*
*---------------------------------------------------------------------------
*
* Tcl_UniCharToUtf --
*
* Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
* provided buffer. Equivalent to Plan 9 runetochar().
|
| ︙ | ︙ | |||
657 658 659 660 661 662 663 |
*/
CONST char *
Tcl_UtfPrev(
CONST char *src, /* A location in a UTF-8 string. */
CONST char *start) /* Pointer to the beginning of the string */
{
| > | | > > > > < | < | < | | < | > | > > > > > > > | > > > > > > > > > > | > > > > > > > > > > | > > > > > > > > > > > > > > > > > > > > > > > | > > > | > > > > > > | | 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 |
*/
CONST char *
Tcl_UtfPrev(
    CONST char *src,		/* A location in a UTF-8 string. */
    CONST char *start)		/* Pointer to the beginning of the string */
{
    int trailBytesSeen = 0;	/* How many trail bytes have been verified? */
    CONST char *fallback;	/* If we cannot find a lead byte that might
				 * start a prefix of a valid UTF byte
				 * sequence, we will fall back to a one-byte
				 * back step. */
    unsigned char *look;	/* The byte currently being examined. */

    /*
     * Quick boundary case exits. The (src <= start) test is made before
     * (src - 1) is computed, so that no pointer to a location before the
     * start of the string is ever formed; creating such a pointer is
     * undefined behavior even when it is never dereferenced.
     */

    if (src <= start) {
        return start;
    }
    fallback = src - 1;
    if (fallback == start) {
        return start;
    }

    /* Start search at the fallback position */
    look = (unsigned char *) fallback;

    do {
        unsigned char byte = look[0];

        if (byte < 0x80) {
            /*
             * Single byte character. Either this is a correct previous
             * character, or it is followed by at least one trail byte
             * which indicates a malformed sequence. In either case the
             * correct result is to return the fallback.
             */

            return fallback;
        }
        if (byte >= 0xC0) {
            /* Non-trail byte; May be multibyte lead. */

            if (trailBytesSeen == 0) {
                /*
                 * We've seen no trailing context to use to check
                 * anything. From what we know, this non-trail byte
                 * is a prefix of a previous character, and accepting
                 * it (the fallback) is correct.
                 */

                return fallback;
            }
            if (trailBytesSeen >= totalBytes[byte]) {
                /*
                 * That is, (1 + trailBytesSeen > needed).
                 * We've examined more bytes than needed to complete
                 * this lead byte. No matter about well-formedness or
                 * validity, the sequence starting with this lead byte
                 * will never include the fallback location, so we must
                 * return the fallback location. See test utf-7.17
                 */

                return fallback;
            }

            /*
             * trailBytesSeen > 0, so we can examine look[1] safely.
             * Use that capability to screen out overlong sequences.
             */

            if (Overlong(look)) {
                /* Reject */
                return fallback;
            }
            return (CONST char *) look;
        }

        /* We saw a trail byte. */
        trailBytesSeen++;

        if ((CONST char *) look == start) {
            /*
             * Do not read before the start of the string
             *
             * If we get here, we've examined bytes at every location
             * >= start and < src and all of them are trail bytes,
             * including (*start). We need to return our fallback
             * and exit this loop before we run past the start of the
             * string.
             */

            return fallback;
        }

        /* Continue the search backwards... */
        look--;
    } while (trailBytesSeen < TCL_UTF_MAX);

    /*
     * We've seen TCL_UTF_MAX trail bytes, so we know there will not be a
     * properly formed byte sequence to find, and we can stop looking,
     * accepting the fallback.
     */

    return fallback;
}
/*
*---------------------------------------------------------------------------
*
* Tcl_UniCharAtIndex --
*
|
| ︙ | ︙ |