Fossil

Check-in [c84051f38d]
Login

Check-in [c84051f38d]

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Rename re_execute() to re_match(). Other fixes to regex matcher bugs that were introduced when porting the code from SQLite.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: c84051f38db377f5bd7843b9f72fb0b5ca38f5c6
User & Date: drh 2013-01-03 20:02:22.308
Context
2013-01-03
22:33
4-byte utf-8 for re_compile, fix 3-byte utf-8 there, and fix a compiler warning. ... (check-in: f86304fefa user: jan.nijtmans tags: trunk)
20:02
Rename re_execute() to re_match(). Other fixes to regex matcher bugs that were introduced when porting the code from SQLite. ... (check-in: c84051f38d user: drh tags: trunk)
13:38
regexp.c (re_next_char): 3 off-by-one errors, and disallow characters > 0x10ffff unicode.c/diff.c: fix some gcc warnings ... (check-in: a13e0a20a8 user: jan.nijtmans tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/diff.c.
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
*/
static int re_dline_match(
  ReCompiled *pRe,    /* The regular expression to be matched */
  DLine *aDLine,      /* First of N DLines to compare against */
  int N               /* Number of DLines to check */
){
  while( N-- ){
    if( re_execute(pRe, (const unsigned char *)aDLine->z, LENGTH(aDLine)) ){
      return 1;
    }
    aDLine++;
  }
  return 0;
}








|







421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
*/
static int re_dline_match(
  ReCompiled *pRe,    /* The regular expression to be matched */
  DLine *aDLine,      /* First of N DLines to compare against */
  int N               /* Number of DLines to check */
){
  while( N-- ){
    if( re_match(pRe, (const unsigned char *)aDLine->z, LENGTH(aDLine)) ){
      return 1;
    }
    aDLine++;
  }
  return 0;
}

Changes to src/regexp.c.
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
static int re_space_char(int c){
  return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f' ;
}

/* Run a compiled regular expression on the zero-terminated input
** string zIn[].  Return true on a match and false if there is no match.
*/
int re_execute(ReCompiled *pRe, const unsigned char *zIn, int nIn){
  ReStateSet aStateSet[2], *pThis, *pNext;
  ReStateNumber aSpace[100];
  ReStateNumber *pToFree;
  unsigned int i = 0;
  unsigned int iSwap = 0;
  int c = RE_EOF+1;
  int cPrev = 0;
  int rc = 0;
  ReInput in;

  in.z = zIn;
  in.i = 0;
  in.mx = nIn>=0 ? nIn : strlen((char const*)zIn);
  if( pRe->nInit ){
    unsigned char x = pRe->zInit[0];
    while( in.i+pRe->nInit<in.mx 
        && (zIn[in.i]!=x || memcmp(zIn+in.i, pRe->zInit, pRe->nInit)!=0)
    ){
      in.i++;
    }
    if( in.i+pRe->nInit>=in.mx ) return 0;
  }
  if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){
    pToFree = 0;
    aStateSet[0].aState = aSpace;
  }else{
    pToFree = fossil_malloc( sizeof(ReStateNumber)*2*pRe->nState );
    if( pToFree==0 ) return -1;







|















|




|







176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
static int re_space_char(int c){
  return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f' ;
}

/* Run a compiled regular expression on the zero-terminated input
** string zIn[].  Return true on a match and false if there is no match.
*/
int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){
  ReStateSet aStateSet[2], *pThis, *pNext;
  ReStateNumber aSpace[100];
  ReStateNumber *pToFree;
  unsigned int i = 0;
  unsigned int iSwap = 0;
  int c = RE_EOF+1;
  int cPrev = 0;
  int rc = 0;
  ReInput in;

  in.z = zIn;
  in.i = 0;
  in.mx = nIn>=0 ? nIn : strlen((char const*)zIn);
  if( pRe->nInit ){
    unsigned char x = pRe->zInit[0];
    while( in.i+pRe->nInit<=in.mx 
        && (zIn[in.i]!=x || memcmp(zIn+in.i, pRe->zInit, pRe->nInit)!=0)
    ){
      in.i++;
    }
    if( in.i+pRe->nInit>in.mx ) return 0;
  }
  if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){
    pToFree = 0;
    aStateSet[0].aState = aSpace;
  }else{
    pToFree = fossil_malloc( sizeof(ReStateNumber)*2*pRe->nState );
    if( pToFree==0 ) return -1;
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
        }
        case RE_OP_GOTO: {
          re_add_state(pThis, x+pRe->aArg[x]);
          break;
        }
        case RE_OP_ACCEPT: {
          rc = 1;
          goto re_execute_end;
        }
        case RE_OP_CC_INC:
        case RE_OP_CC_EXC: {
          int j = 1;
          int n = pRe->aArg[x];
          int hit = 0;
          for(j=1; j>0 && j<n; j++){







|







273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
        }
        case RE_OP_GOTO: {
          re_add_state(pThis, x+pRe->aArg[x]);
          break;
        }
        case RE_OP_ACCEPT: {
          rc = 1;
          goto re_match_end;
        }
        case RE_OP_CC_INC:
        case RE_OP_CC_EXC: {
          int j = 1;
          int n = pRe->aArg[x];
          int hit = 0;
          for(j=1; j>0 && j<n; j++){
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
        }
      }
    }
  }
  for(i=0; i<pNext->nState; i++){
    if( pRe->aOp[pNext->aState[i]]==RE_OP_ACCEPT ){ rc = 1; break; }
  }
re_execute_end:
  fossil_free(pToFree);
  return rc;
}

/* Resize the opcode and argument arrays for an RE under construction.
*/
static int re_resize(ReCompiled *p, int N){







|







305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
        }
      }
    }
  }
  for(i=0; i<pNext->nState; i++){
    if( pRe->aOp[pNext->aState[i]]==RE_OP_ACCEPT ){ rc = 1; break; }
  }
re_match_end:
  fossil_free(pToFree);
  return rc;
}

/* Resize the opcode and argument arrays for an RE under construction.
*/
static int re_resize(ReCompiled *p, int N){
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
*/
static unsigned re_esc_char(ReCompiled *p){
  static const char zEsc[] = "afnrtv\\()*.+?[$^{|}]";
  static const char zTrans[] = "\a\f\n\r\t\v";
  int i, v = 0;
  char c;
  if( p->sIn.i>=p->sIn.mx ) return 0;
  c = p->sIn.z[0];
  if( c=='u' && p->sIn.i+5<p->sIn.mx ){
    v = 0;
    const unsigned char *zIn = p->sIn.z + p->sIn.i;
    if( re_hex(zIn[1],&v)
     && re_hex(zIn[2],&v)
     && re_hex(zIn[3],&v)
     && re_hex(zIn[4],&v)







|







384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
*/
static unsigned re_esc_char(ReCompiled *p){
  static const char zEsc[] = "afnrtv\\()*.+?[$^{|}]";
  static const char zTrans[] = "\a\f\n\r\t\v";
  int i, v = 0;
  char c;
  if( p->sIn.i>=p->sIn.mx ) return 0;
  c = p->sIn.z[p->sIn.i];
  if( c=='u' && p->sIn.i+5<p->sIn.mx ){
    v = 0;
    const unsigned char *zIn = p->sIn.z + p->sIn.i;
    if( re_hex(zIn[1],&v)
     && re_hex(zIn[2],&v)
     && re_hex(zIn[3],&v)
     && re_hex(zIn[4],&v)
592
593
594
595
596
597
598

599
600
601
602
603
604
605
606
607
608
609
610
611
** regular expression.  Applications should invoke this routine once
** for every call to re_compile() to avoid memory leaks.
*/
void re_free(ReCompiled *pRe){
  if( pRe ){
    fossil_free(pRe->aOp);
    fossil_free(pRe->aArg);

  }
}

/*
** Compile a textual regular expression in zIn[] into a compiled regular
** expression suitable for us by re_execute() and return a pointer to the
** compiled regular expression in *ppRe.  Return NULL on success or an
** error message if something goes wrong.
*/
const char *re_compile(ReCompiled **ppRe, const char *zIn, int noCase){
  ReCompiled *pRe;
  const char *zErr;
  int i, j;







>





|







592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
** regular expression.  Applications should invoke this routine once
** for every call to re_compile() to avoid memory leaks.
*/
void re_free(ReCompiled *pRe){
  if( pRe ){
    fossil_free(pRe->aOp);
    fossil_free(pRe->aArg);
    fossil_free(pRe);
  }
}

/*
** Compile a textual regular expression in zIn[] into a compiled regular
** expression suitable for us by re_match() and return a pointer to the
** compiled regular expression in *ppRe.  Return NULL on success or an
** error message if something goes wrong.
*/
const char *re_compile(ReCompiled **ppRe, const char *zIn, int noCase){
  ReCompiled *pRe;
  const char *zErr;
  int i, j;
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
  if( zIn[0]=='^' ){
    zIn++;
  }else{
    re_append(pRe, RE_OP_ANYSTAR, 0);
  }
  pRe->sIn.z = (unsigned char*)zIn;
  pRe->sIn.i = 0;
  pRe->sIn.mx = strlen(zIn);
  zErr = re_subcompile_re(pRe);
  if( zErr ){
    re_free(pRe);
    return zErr;
  }
  if( rePeek(pRe)=='$' && pRe->sIn.i+1==pRe->sIn.mx ){
    re_append(pRe, RE_OP_MATCH, RE_EOF);
    re_append(pRe, RE_OP_ACCEPT, 0);
    *ppRe = pRe;
  }else if( pRe->sIn.i>=pRe->sIn.mx ){
    re_append(pRe, RE_OP_ACCEPT, 0);
    *ppRe = pRe;
  }else{







|





|







625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
  if( zIn[0]=='^' ){
    zIn++;
  }else{
    re_append(pRe, RE_OP_ANYSTAR, 0);
  }
  pRe->sIn.z = (unsigned char*)zIn;
  pRe->sIn.i = 0;
  pRe->sIn.mx = strlen(pRe->sIn.z);
  zErr = re_subcompile_re(pRe);
  if( zErr ){
    re_free(pRe);
    return zErr;
  }
  if( rePeek(pRe)=='$' && pRe->sIn.i+1>=pRe->sIn.mx ){
    re_append(pRe, RE_OP_MATCH, RE_EOF);
    re_append(pRe, RE_OP_ACCEPT, 0);
    *ppRe = pRe;
  }else if( pRe->sIn.i>=pRe->sIn.mx ){
    re_append(pRe, RE_OP_ACCEPT, 0);
    *ppRe = pRe;
  }else{
657
658
659
660
661
662
663

664
665
666
667
668
669
670
        pRe->zInit[j++] = 0xd0 | (x>>12);
        pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
        pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
      }else{
        break;
      }
    }

    pRe->nInit = j;
  }
  return pRe->zErr;
}

/*
** Implementation of the regexp() SQL function.  This function implements







>







658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
        pRe->zInit[j++] = 0xd0 | (x>>12);
        pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
        pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
      }else{
        break;
      }
    }
    if( j>0 && pRe->zInit[j-1]==0 ) j--;
    pRe->nInit = j;
  }
  return pRe->zErr;
}

/*
** Implementation of the regexp() SQL function.  This function implements
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
      sqlite3_result_error_nomem(context);
      return;
    }
    sqlite3_set_auxdata(context, 0, pRe, (void(*)(void*))re_free);
  }
  zStr = (const unsigned char*)sqlite3_value_text(argv[1]);
  if( zStr!=0 ){
    sqlite3_result_int(context, re_execute(pRe, zStr, -1));
  }
}

/*
** Invoke this routine in order to install the REGEXP function in an
** SQLite database connection.
**







|







700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
      sqlite3_result_error_nomem(context);
      return;
    }
    sqlite3_set_auxdata(context, 0, pRe, (void(*)(void*))re_free);
  }
  zStr = (const unsigned char*)sqlite3_value_text(argv[1]);
  if( zStr!=0 ){
    sqlite3_result_int(context, re_match(pRe, zStr, -1));
  }
}

/*
** Invoke this routine in order to install the REGEXP function in an
** SQLite database connection.
**
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
  int ln = 0;
  int n;
  char zLine[2000];
  while( fgets(zLine, sizeof(zLine), in) ){
    ln++;
    n = (int)strlen(zLine);
    while( n && (zLine[n-1]=='\n' || zLine[n-1]=='\r') ) n--;
    if( re_execute(pRe, (const unsigned char*)zLine, n) ){
      printf("%s:%d:%.*s\n", zFile, ln, n, zLine);
    }
  }
}

/*
** COMMAND: test-grep







|







731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
  int ln = 0;
  int n;
  char zLine[2000];
  while( fgets(zLine, sizeof(zLine), in) ){
    ln++;
    n = (int)strlen(zLine);
    while( n && (zLine[n-1]=='\n' || zLine[n-1]=='\r') ) n--;
    if( re_match(pRe, (const unsigned char*)zLine, n) ){
      printf("%s:%d:%.*s\n", zFile, ln, n, zLine);
    }
  }
}

/*
** COMMAND: test-grep