Fossil

Check-in [48e1e18304]
Login

Check-in [48e1e18304]

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Initial work on the search_stext() function used to extract searchable text from formatted files.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 48e1e18304a2ec18173e4985977a28f7b0a87c5f
User & Date: drh 2015-01-31 22:13:39.480
Context
2015-02-01
00:15
The /search page now covers wiki and check-in comments. And the formatting of snippets is improved. The search is still done by full-scan but the infrastructure is coming into place to handle the search using an index. ... (check-in: 8e02c26ad2 user: drh tags: trunk)
2015-01-31
22:13
Initial work on the search_stext() function used to extract searchable text from formatted files. ... (check-in: 48e1e18304 user: drh tags: trunk)
19:58
Add a routine that attempts to strip all markup off of HTML text. The intended use is in the search logic. ... (check-in: cbd8e67f73 user: drh tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/doc.c.
348
349
350
351
352
353
354




































355
356
357
358
359
360
361
*/
void mimetype_test_cmd(void){
  int iArg = 2;   /* Index into g.argv; printing starts at element 2 */
  while( iArg<g.argc ){
    const char *zFilename = g.argv[iArg];
    /* One output line per argument: the name, then the mimetype that
    ** mimetype_from_name() reports for it. */
    fossil_print("%-20s -> %s\n", zFilename, mimetype_from_name(zFilename));
    iArg++;
  }
}





































/*
** WEBPAGE: doc
** URL: /doc?name=CHECKIN/FILE
** URL: /doc/CHECKIN/FILE
**
** CHECKIN can be either tag or SHA1 hash or timestamp identifying a







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
*/
void mimetype_test_cmd(void){
  int iArg = 2;   /* Index into g.argv; printing starts at element 2 */
  while( iArg<g.argc ){
    const char *zFilename = g.argv[iArg];
    /* One output line per argument: the name, then the mimetype that
    ** mimetype_from_name() reports for it. */
    fossil_print("%-20s -> %s\n", zFilename, mimetype_from_name(zFilename));
    iArg++;
  }
}

/*
** Find the file called zName in the check-in whose RID is vid and load
** the content of that file into pContent.
**
** The lookup goes through the "vcache" table, which maps (vid,fname)
** pairs to artifact RIDs.  The table is created on demand.  Whenever it
** holds no rows for the requested vid it is emptied and then refilled
** from the files_of_checkin virtual table for that one check-in.
**
** Return the RID of the file.  Return 0 if there is no such file in the
** check-in or if its content could not be loaded.
*/
int doc_load_content(int vid, const char *zName, Blob *pContent){
  int fileRid;        /* RID of the artifact holding the file content */
  int haveCache;      /* True if the vcache table already exists */

  haveCache = db_table_exists("repository","vcache");
  if( haveCache==0 ){
    db_multi_exec(
      "CREATE TABLE IF NOT EXISTS vcache(\n"
      "  vid INTEGER,         -- checkin ID\n"
      "  fname TEXT,          -- filename\n"
      "  rid INTEGER,         -- artifact ID\n"
      "  PRIMARY KEY(vid,fname)\n"
      ") WITHOUT ROWID"
    );
  }

  /* No rows for this check-in: discard whatever is cached and load the
  ** file list of check-in vid instead. */
  if( db_exists("SELECT 1 FROM vcache WHERE vid=%d", vid)==0 ){
    db_multi_exec(
      "DELETE FROM vcache;\n"
      "CREATE VIRTUAL TABLE IF NOT EXISTS temp.foci USING files_of_checkin;\n"
      "INSERT INTO vcache(vid,fname,rid)"
      "  SELECT checkinID, filename, blob.rid FROM foci, blob"
      "   WHERE blob.uuid=foci.uuid"
      "     AND foci.checkinID=%d;",
      vid
    );
  }

  fileRid = db_int(0, "SELECT rid FROM vcache"
                      " WHERE vid=%d AND fname=%Q", vid, zName);
  if( fileRid==0 ){
    return 0;                       /* File is not part of the check-in */
  }
  /* A zero result from content_get() is reported to the caller as RID 0 */
  return content_get(fileRid, pContent) ? fileRid : 0;
}

/*
** WEBPAGE: doc
** URL: /doc?name=CHECKIN/FILE
** URL: /doc/CHECKIN/FILE
**
** CHECKIN can be either tag or SHA1 hash or timestamp identifying a
395
396
397
398
399
400
401

402
403
404
405
406
407
408
  int nMiss = (-1);                 /* Failed attempts to find the document */
  static const char *const azSuffix[] = {
     "index.html", "index.wiki", "index.md"
  };

  login_check_credentials();
  if( !g.perm.Read ){ login_needed(); return; }

  while( rid==0 && (++nMiss)<=ArraySize(azSuffix) ){
    zName = PD("name", "tip/index.wiki");
    for(i=0; zName[i] && zName[i]!='/'; i++){}
    zCheckin = mprintf("%.*s", i, zName);
    if( fossil_strcmp(zCheckin,"ckout")==0 && db_open_local(0)==0 ){
      zCheckin = "tip";
    }







>







431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
  int nMiss = (-1);                 /* Failed attempts to find the document */
  static const char *const azSuffix[] = {
     "index.html", "index.wiki", "index.md"
  };

  login_check_credentials();
  if( !g.perm.Read ){ login_needed(); return; }
  db_begin_transaction();
  while( rid==0 && (++nMiss)<=ArraySize(azSuffix) ){
    zName = PD("name", "tip/index.wiki");
    for(i=0; zName[i] && zName[i]!='/'; i++){}
    zCheckin = mprintf("%.*s", i, zName);
    if( fossil_strcmp(zCheckin,"ckout")==0 && db_open_local(0)==0 ){
      zCheckin = "tip";
    }
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
      zFullpath = mprintf("%s/%s", g.zLocalRoot, zName);
      if( file_isfile(zFullpath)
       && blob_read_from_file(&filebody, zFullpath)>0 ){
        rid = 1;  /* Fake RID just to get the loop to end */
      }
      fossil_free(zFullpath);
    }else{
      db_begin_transaction();
      vid = name_to_typed_rid(zCheckin, "ci");
      db_multi_exec(
        "CREATE TABLE IF NOT EXISTS vcache(\n"
        "  vid INTEGER,         -- checkin ID\n"
        "  fname TEXT,          -- filename\n"
        "  rid INTEGER,         -- artifact ID\n"
        "  PRIMARY KEY(vid,fname)\n"
        ") WITHOUT ROWID"
      );
      if( !db_exists("SELECT 1 FROM vcache WHERE vid=%d", vid) ){
        db_multi_exec(
          "DELETE FROM vcache;\n"
          "CREATE VIRTUAL TABLE temp.foci USING files_of_checkin;\n"
          "INSERT INTO vcache(vid,fname,rid)"
          "  SELECT checkinID, filename, blob.rid FROM foci, blob"
          "   WHERE blob.uuid=foci.uuid"
          "     AND foci.checkinID=%d;",
          vid
        );
      }
      rid = db_int(0, "SELECT rid FROM vcache"
                      " WHERE vid=%d AND fname=%Q", vid, zName);
      if( rid==0 || content_get(rid, &filebody)==0 ){
        goto doc_not_found;
      }
      db_end_transaction(0);
    }
  }
  if( rid==0 ) goto doc_not_found;
  blob_to_utf8_no_bom(&filebody, 0);

  /* The file is now contained in the filebody blob.  Deliver the
  ** file to the user







<

<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
<
<
<
<







472
473
474
475
476
477
478

479




















480




481
482
483
484
485
486
487
      zFullpath = mprintf("%s/%s", g.zLocalRoot, zName);
      if( file_isfile(zFullpath)
       && blob_read_from_file(&filebody, zFullpath)>0 ){
        rid = 1;  /* Fake RID just to get the loop to end */
      }
      fossil_free(zFullpath);
    }else{

      vid = name_to_typed_rid(zCheckin, "ci");




















      rid = doc_load_content(vid, zName, &filebody);




    }
  }
  if( rid==0 ) goto doc_not_found;
  blob_to_utf8_no_bom(&filebody, 0);

  /* The file is now contained in the filebody blob.  Deliver the
  ** file to the user
520
521
522
523
524
525
526

527
528
529
530
531
532
533
534
535
536
537
538

539
540
541
542
543
544
545
    style_footer();
#endif
  }else{
    cgi_set_content_type(zMime);
    cgi_set_content(&filebody);
  }
  if( nMiss>=ArraySize(azSuffix) ) cgi_set_status(404, "Not Found");

  return;

  /* Jump here when unable to locate the document */
doc_not_found:
  db_end_transaction(0);
  cgi_set_status(404, "Not Found");
  style_header("Not Found");
  @ <p>Document %h(zOrigName) not found
  if( fossil_strcmp(zCheckin,"ckout")!=0 ){
    @ in %z(href("%R/tree?ci=%T",zCheckin))%h(zCheckin)</a>
  }
  style_footer();

  return;
}

/*
** The default logo.
*/
static const unsigned char aLogo[] = {







>












>







532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
    style_footer();
#endif
  }else{
    cgi_set_content_type(zMime);
    cgi_set_content(&filebody);
  }
  if( nMiss>=ArraySize(azSuffix) ) cgi_set_status(404, "Not Found");
  db_end_transaction(0);
  return;

  /* Jump here when unable to locate the document */
doc_not_found:
  db_end_transaction(0);
  cgi_set_status(404, "Not Found");
  style_header("Not Found");
  @ <p>Document %h(zOrigName) not found
  if( fossil_strcmp(zCheckin,"ckout")!=0 ){
    @ in %z(href("%R/tree?ci=%T",zCheckin))%h(zCheckin)</a>
  }
  style_footer();
  db_end_transaction(0);
  return;
}

/*
** The default logo.
*/
static const unsigned char aLogo[] = {
Changes to src/search.c.
561
562
563
564
565
566
567




























































































      @ <li><p>%s(href("%s",zUrl))%h(zUrl)</a><br>%s(zSnippet)</li>
    }
    db_finalize(&q);
    @ </ol>
  }
  style_footer();
}



































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
      @ <li><p>%s(href("%s",zUrl))%h(zUrl)</a><br>%s(zSnippet)</li>
    }
    db_finalize(&q);
    @ </ol>
  }
  style_footer();
}


/*
** This is a helper function for search_stext().  Write into pOut
** the search text obtained from pIn according to zMimetype.
**
**    pIn         The raw document text.  This routine leaves the blob
**                itself untouched, so the caller remains responsible for
**                resetting it.
**    zMimetype   Mimetype of pIn.  NULL is treated as "text/plain".
**    pOut        Receives the search text.  The caller must have
**                initialized this blob.
**
** Fossil-wiki and Markdown input is first rendered to HTML and then
** reduced to plain text.  HTML input is reduced to plain text directly.
** Input of any other mimetype is copied to pOut unchanged.
*/
static void get_stext_by_mimetype(
  Blob *pIn,
  const char *zMimetype,
  Blob *pOut
){
  Blob html, title;
  blob_init(&html, 0, 0);
  blob_init(&title, 0, 0);
  if( zMimetype==0 ) zMimetype = "text/plain";
  if( fossil_strcmp(zMimetype,"text/x-fossil-wiki")==0 ){
    wiki_convert(pIn, &html, 0);
    html_to_plaintext(blob_str(&html), pOut);
  }else if( fossil_strcmp(zMimetype,"text/x-markdown")==0 ){
    markdown_to_html(pIn, &title, &html);
    html_to_plaintext(blob_str(&html), pOut);
  }else if( fossil_strcmp(zMimetype,"text/html")==0 ){
    html_to_plaintext(blob_str(pIn), pOut);
  }else{
    /* Copy the bytes rather than handing the buffer of pIn over to pOut.
    ** pIn might merely reference memory that it does not own (for example
    ** the zWiki text of a Manifest that the caller destroys right after
    ** this call), in which case a shallow struct copy would leave pOut
    ** pointing at freed memory. */
    blob_append(pOut, blob_buffer(pIn), blob_size(pIn));
  }
  blob_reset(&html);
  blob_reset(&title);
}

/*
** Compute the "search text" for a document: a reduced form of the
** document suitable for full text search and/or for building the
** snippet shown with a search result.
**
**    cType:            d      Embedded documentation
**                      s      Source code listing
**                      w      Wiki page
**                      c      Check-in comment
**                      t      Ticket text
**                      e      Event/Blog text
**                      k      Diff of a wiki
**                      f      Diff of a checkin
**
**   zArg1, zArg2:      Description of the document, depending on cType.
**
** Only types 'd', 's' and 'w' are handled by this routine.  For every
** other type pOut is left as an empty blob.
*/
void search_stext(
  char cType,            /* Type of document */
  const char *zArg1,     /* First parameter */
  const char *zArg2,     /* Second parameter */
  Blob *pOut             /* OUT: Initialize to the search text */
){
  blob_init(pOut, 0, 0);
  switch( cType ){
    case 'd':     /* Doc.     zArg1: RID of the file.  zArg2: Filename */
    case 's': {   /* Source.  zArg1: RID of the file.  zArg2: Filename */
      Blob fileText;                   /* Content of the file artifact */
      const char *zMime;               /* Mimetype deduced from zArg2 */
      content_get(atoi(zArg1), &fileText);
      blob_to_utf8_no_bom(&fileText, 0);
      zMime = mimetype_from_name(zArg2);
      get_stext_by_mimetype(&fileText, zMime, pOut);
      blob_reset(&fileText);
      break;
    }
    case 'w': {   /* Wiki.    zArg1: RID of the page.  zArg2: Page name */
      Blob pageText;                   /* Wraps the text of the wiki page */
      Manifest *pPage = manifest_get(atoi(zArg1), CFTYPE_WIKI, 0);
      if( pPage ){
        blob_init(&pageText, pPage->zWiki, -1);
        get_stext_by_mimetype(&pageText,
                              wiki_filter_mimetypes(pPage->zMimetype),
                              pOut);
        blob_reset(&pageText);
        manifest_destroy(pPage);
      }
      break;
    }
    default: {
      /* Nothing to do: pOut stays empty */
      break;
    }
  }
}

/*
** COMMAND: test-search-stext
**
** Usage: fossil test-search-stext TYPE ARG1 ARG2
**
** Run search_stext() using the first character of TYPE as the document
** type, with ARG1 and ARG2 as its two arguments, and print the result.
*/
void test_search_stext(void){
  Blob stext;     /* Search text computed by search_stext() */
  db_find_and_open_repository(0,0);
  if( g.argc!=5 ){
    usage("TYPE ARG1 ARG2");
  }
  search_stext(*g.argv[2], g.argv[3], g.argv[4], &stext);
  fossil_print("%s", blob_str(&stext));
  blob_reset(&stext);
}