Fossil

Diff
Login

Differences From Artifact [c01cb3663d]:

To Artifact [cdbc67e459]:


142
143
144
145
146
147
148
149

150
151
152

153
154
155

156
157
158

159
160
161

162
163
164

165
166
167

168
169
170
171
172

173
174
175
176
177
178
179
142
143
144
145
146
147
148

149
150
151

152
153
154

155
156
157

158
159
160

161
162
163

164
165
166

167
168
169
170
171

172
173
174
175
176
177
178
179







-
+


-
+


-
+


-
+


-
+


-
+


-
+




-
+







** Java and Tcl use it. This function also treats the derivatives
** CESU-8 and WTF-8 as valid (as described in the same Wikipedia
** article referenced previously).
*/

/* definitions for various UTF-8 sequence lengths */
/* Two-byte sequence whose lead byte is 0xC0 and whose trailing byte is 0x80,
 * i.e. the overlong two-byte form of U+0000.
 * Row 1 layout: length, lead-byte low/high, then low/high per trailing byte.
 * Row 2 layout: length, then low/high per trailing byte only.
 * NOTE(review): this text is a diff listing, so the two rows look like the
 * before/after forms of one initializer rather than two rows of one array --
 * confirm against the real file. */
static const unsigned char us2a[] = {
  2, 0xC0, 0xC0, 0x80, 0x80
  2, 0x80, 0x80
};
/* Two-byte sequences with lead byte 0xC2-0xDF and trailing byte 0x80-0xBF
 * (code points U+0080 through U+07FF).
 * Row 1 layout: length, lead-byte low/high, then low/high per trailing byte.
 * Row 2 layout: length, then low/high per trailing byte only.
 * NOTE(review): this text is a diff listing, so the two rows look like the
 * before/after forms of one initializer rather than two rows of one array --
 * confirm against the real file. */
static const unsigned char us2b[] = {
  2, 0xC2, 0xDF, 0x80, 0xBF
  2, 0x80, 0xBF
};
/* Three-byte sequences with lead byte 0xE0; the second byte is limited to
 * 0xA0-0xBF, which rules out overlong forms (code points U+0800 through
 * U+0FFF).
 * Row 1 layout: length, lead-byte low/high, then low/high per trailing byte.
 * Row 2 layout: length, then low/high per trailing byte only.
 * NOTE(review): this text is a diff listing, so the two rows look like the
 * before/after forms of one initializer rather than two rows of one array --
 * confirm against the real file. */
static const unsigned char us3a[] = {
  3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF
  3, 0xA0, 0xBF, 0x80, 0xBF
};
/* Three-byte sequences with lead byte 0xE1-0xEF and both trailing bytes in
 * 0x80-0xBF (code points U+1000 through U+FFFF).  The range does not carve
 * out lead 0xED with second byte 0xA0-0xBF, so surrogate code points pass.
 * Row 1 layout: length, lead-byte low/high, then low/high per trailing byte.
 * Row 2 layout: length, then low/high per trailing byte only.
 * NOTE(review): this text is a diff listing, so the two rows look like the
 * before/after forms of one initializer rather than two rows of one array --
 * confirm against the real file. */
static const unsigned char us3b[] = {
  3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF
  3, 0x80, 0xBF, 0x80, 0xBF
};
/* Four-byte sequences with lead byte 0xF0; the second byte is limited to
 * 0x90-0xBF, which rules out overlong forms (code points U+10000 through
 * U+3FFFF).
 * Row 1 layout: length, lead-byte low/high, then low/high per trailing byte.
 * Row 2 layout: length, then low/high per trailing byte only.
 * NOTE(review): this text is a diff listing, so the two rows look like the
 * before/after forms of one initializer rather than two rows of one array --
 * confirm against the real file. */
static const unsigned char us4a[] = {
  4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF
  4, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF
};
/* Four-byte sequences with lead byte 0xF1-0xF3 and all trailing bytes in
 * 0x80-0xBF (code points U+40000 through U+FFFFF).
 * Row 1 layout: length, lead-byte low/high, then low/high per trailing byte.
 * Row 2 layout: length, then low/high per trailing byte only.
 * NOTE(review): this text is a diff listing, so the two rows look like the
 * before/after forms of one initializer rather than two rows of one array --
 * confirm against the real file. */
static const unsigned char us4b[] = {
  4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF
  4, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF
};
/* Four-byte sequences with lead byte 0xF4; the second byte is limited to
 * 0x80-0x8F, which caps the range at U+10FFFF (code points U+100000 through
 * U+10FFFF).
 * Row 1 layout: length, lead-byte low/high, then low/high per trailing byte.
 * Row 2 layout: length, then low/high per trailing byte only.
 * NOTE(review): this text is a diff listing, so the two rows look like the
 * before/after forms of one initializer rather than two rows of one array --
 * confirm against the real file. */
static const unsigned char us4c[] = {
  4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF
  4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF
};

/* a table used for quick lookup of the definition that goes with a
 * particular lead byte */
static const unsigned char* lb_tab[] = {
static const unsigned char* const lb_tab[] = {
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
199
200
201
202
203
204
205
206

207
208
209
210
211
212

213
214
215
216
217
218
219
220
221
222
223
199
200
201
202
203
204
205

206
207
208
209
210
211

212
213
214


215
216
217
218
219
220
221







-
+





-
+


-
-







  while( n>0 ){
    /* ascii is trivial */
    if( *z<0x80 ){
      ++z;
      --n;
    }else{
      /* get the definition for this lead byte */
      unsigned char* def = lb_tab[(*z++)-0x80];
      const unsigned char* def = lb_tab[(*z++)-0x80];
      unsigned char i, len;

      /* if the definition doesn't exist, return invalid */
      if( !def ) return LOOK_INVALID;
      /* get the expected sequence length */
      len = *def;
      len = *def++;
      /* if there aren't enough bytes left, return invalid */
      if( n<len ) return LOOK_INVALID;
      /* skip the length & lead byte range */
      def += 3;
      /* we already know byte #0 is good, so check the remaining bytes */
      for(i=1; i<len; ++i){
        /* if the byte is outside the allowed range for this definition,
         * return invalid */
        if( (*z<*def++) || (*z++>*def++) ){
          return LOOK_INVALID;
        }