/* ** Copyright (c) 2009 D. Richard Hipp ** ** This program is free software; you can redistribute it and/or ** modify it under the terms of the Simplified BSD License (also ** known as the "2-Clause License" or "FreeBSD License".) ** This program is distributed in the hope that it will be useful, ** but without any warranty; without even the implied warranty of ** merchantability or fitness for a particular purpose. ** ** Author contact information: ** drh@hwaci.com ** http://www.hwaci.com/drh/ ** ******************************************************************************* ** ** This file contains code to implement a very simple search function ** against timeline comments, checkin content, wiki pages, and/or tickets. ** ** The search is full-text like in that it is looking for words and ignores ** punctuation and capitalization. But it is more akin to "grep" in that ** it scans the entire corpus for the search, and it does not support the ** full functionality of FTS4. */ #include "config.h" #include "search.h" #include #if INTERFACE /* Maximum number of search terms */ #define SEARCH_MAX_TERM 8 /* ** A compiled search pattern */ struct Search { int nTerm; /* Number of search terms */ struct srchTerm { /* For each search term */ char *z; /* Text */ int n; /* length */ } a[SEARCH_MAX_TERM]; /* Snippet controls */ char *zPattern; /* The search pattern */ char *zMarkBegin; /* Start of a match */ char *zMarkEnd; /* End of a match */ char *zMarkGap; /* A gap between two matches */ unsigned fSrchFlg; /* Flags */ }; #define SRCHFLG_HTML 0x01 /* Escape snippet text for HTML */ #define SRCHFLG_SCORE 0x02 /* Prepend the score to each snippet */ #define SRCHFLG_STATIC 0x04 /* The static gSearch object */ #endif /* ** There is a single global Search object: */ static Search gSearch; /* ** Theses characters constitute a word boundary */ static const char isBoundary[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; #define ISALNUM(x) (!isBoundary[(x)&0xff]) /* ** Destroy a search context. */ void search_end(Search *p){ if( p ){ fossil_free(p->zPattern); fossil_free(p->zMarkBegin); fossil_free(p->zMarkEnd); fossil_free(p->zMarkGap); memset(p, 0, sizeof(*p)); if( p!=&gSearch ) fossil_free(p); } } /* ** Compile a search pattern */ Search *search_init( const char *zPattern, /* The search pattern */ const char *zMarkBegin, /* Start of a match */ const char *zMarkEnd, /* End of a match */ const char *zMarkGap, /* A gap between two matches */ unsigned fSrchFlg /* Flags */ ){ Search *p; char *z; int i; if( fSrchFlg & SRCHFLG_STATIC ){ p = &gSearch; search_end(p); }else{ p = fossil_malloc(sizeof(*p)); memset(p, 0, sizeof(*p)); } p->zPattern = z = mprintf("%s", zPattern); p->zMarkBegin = mprintf("%s", zMarkBegin); p->zMarkEnd = mprintf("%s", zMarkEnd); p->zMarkGap = mprintf("%s", zMarkGap); p->fSrchFlg = fSrchFlg; while( *z && p->nTerma[p->nTerm].z = z; for(i=1; ISALNUM(z[i]); i++){} p->a[p->nTerm].n = i; z += i; p->nTerm++; } return p; } /* ** Append n bytes of text to snippet zTxt. Encode the text appropriately. */ static void snippet_text_append( Search *p, /* The search context */ Blob *pSnip, /* Append to this snippet */ const char *zTxt, /* Text to append */ int n /* How many bytes to append */ ){ if( n>0 ){ if( p->fSrchFlg & SRCHFLG_HTML ){ blob_appendf(pSnip, "%#h", n, zTxt); }else{ blob_append(pSnip, zTxt, n); } } } /* ** Compare a search pattern against one or more input strings which ** collectively comprise a document. Return a match score. Optionally ** also return a "snippet". ** ** Scoring: ** * All terms must match at least once or the score is zero ** * One point for each matching term ** * Extra points if consecutive words of the pattern are consecutive ** in the document */ static int search_score( Search *p, /* Search pattern and flags */ int nDoc, /* Number of strings in this document */ const char **azDoc, /* Text of each string */ Blob *pSnip /* If not NULL: Write a snippet here */ ){ int score; /* Final score */ int i; /* Offset into current document */ int ii; /* Loop counter */ int j; /* Loop over search terms */ int k; /* Loop over prior terms */ int iWord = 0; /* Current word number */ int iDoc; /* Current document number */ int wantGap = 0; /* True if a zMarkGap is wanted */ const char *zDoc; /* Current document text */ const int CTX = 50; /* Amount of snippet context */ int anMatch[SEARCH_MAX_TERM]; /* Number of terms in best match */ int aiBestDoc[SEARCH_MAX_TERM]; /* Document containing best match */ int aiBestOfst[SEARCH_MAX_TERM]; /* Byte offset to start of best match */ int aiLastDoc[SEARCH_MAX_TERM]; /* Document containing most recent match */ int aiLastOfst[SEARCH_MAX_TERM]; /* Byte offset to the most recent match */ int aiWordIdx[SEARCH_MAX_TERM]; /* Word index of most recent match */ memset(anMatch, 0, sizeof(anMatch)); memset(aiWordIdx, 0xff, sizeof(aiWordIdx)); for(iDoc=0; iDocnTerm; j++){ int n = p->a[j].n; if( sqlite3_strnicmp(p->a[j].z, &zDoc[i], n)==0 && (!ISALNUM(zDoc[i+n]) || p->a[j].z[n]=='*') ){ aiWordIdx[j] = iWord; aiLastDoc[j] = iDoc; aiLastOfst[j] = i; for(k=1; j-k>=0 && anMatch[j-k] && aiWordIdx[j-k]==iWord-k; k++){} for(ii=0; iinTerm; j++) score *= anMatch[j]; if( score==0 || pSnip==0 ) return score; /* Prepare a snippet that describes the matching text. */ blob_init(pSnip, 0, 0); if( p->fSrchFlg & SRCHFLG_SCORE ) blob_appendf(pSnip, "%08x", score); while(1){ int iOfst; int iTail; int iBest; for(ii=0; iinTerm && anMatch[ii]==0; ii++){} if( ii>=p->nTerm ) break; /* This is where the loop exits */ iBest = ii; iDoc = aiBestDoc[ii]; iOfst = aiBestOfst[ii]; for(; iinTerm; ii++){ if( anMatch[ii]==0 ) continue; if( aiBestDoc[ii]>iDoc ) continue; if( aiBestOfst[ii]>iOfst ) continue; iDoc = aiBestDoc[ii]; iOfst = aiBestOfst[ii]; iBest = ii; } iTail = iOfst + p->a[iBest].n; anMatch[iBest] = 0; for(ii=0; iinTerm; ii++){ if( anMatch[ii]==0 ) continue; if( aiBestDoc[ii]!=iDoc ) continue; if( aiBestOfst[ii]<=iTail+CTX*2 ){ if( iTaila[ii].n ){ iTail = aiBestOfst[ii]+p->a[ii].n; } anMatch[ii] = 0; ii = -1; continue; } } zDoc = azDoc[iDoc]; iOfst -= CTX; if( iOfst<0 ) iOfst = 0; while( iOfst>0 && ISALNUM(zDoc[iOfst-1]) ) iOfst--; while( zDoc[iOfst] && !ISALNUM(zDoc[iOfst]) ) iOfst++; for(ii=0; ii0 || wantGap ) blob_append(pSnip, p->zMarkGap, -1); wantGap = zDoc[iTail]!=0; zDoc += iOfst; iTail -= iOfst; /* Add a snippet segment using characters iOfst..iOfst+iTail from zDoc */ for(i=0; inTerm; j++){ int n = p->a[j].n; if( sqlite3_strnicmp(p->a[j].z, &zDoc[i], n)==0 && (!ISALNUM(zDoc[i+n]) || p->a[j].z[n]=='*') ){ snippet_text_append(p, pSnip, zDoc, i); zDoc += i; iTail -= i; blob_append(pSnip, p->zMarkBegin, -1); if( p->a[j].z[n]=='*' ){ while( ISALNUM(zDoc[n]) ) n++; } snippet_text_append(p, pSnip, zDoc, n); zDoc += n; iTail -= n; blob_append(pSnip, p->zMarkEnd, -1); i = -1; break; } /* end-if */ } /* end for(j) */ if( jnTerm ){ while( ISALNUM(zDoc[i]) && izMarkGap, -1); return score; } /* ** COMMAND: test-snippet ** ** Usage: fossil test-snippet SEARCHSTRING FILE1 FILE2 ... */ void test_snippet_cmd(void){ Search *p; int i; Blob x; Blob snip; int score; char *zDoc; int flg = 0; char *zBegin = (char*)find_option("begin",0,1); char *zEnd = (char*)find_option("end",0,1); char *zGap = (char*)find_option("gap",0,1); if( find_option("html",0,0)!=0 ) flg |= SRCHFLG_HTML; if( find_option("score",0,0)!=0 ) flg |= SRCHFLG_SCORE; if( find_option("static",0,0)!=0 ) flg |= SRCHFLG_STATIC; verify_all_options(); if( g.argc<4 ) usage("SEARCHSTRING FILE1..."); if( zBegin==0 ) zBegin = "[["; if( zEnd==0 ) zEnd = "]]"; if( zGap==0 ) zGap = " ... "; p = search_init(g.argv[2], zBegin, zEnd, zGap, flg); for(i=3; i20 or 0"); } }else{ width = -1; } db_must_be_within_tree(); if( g.argc<2 ) return; blob_init(&pattern, g.argv[2], -1); for(i=3; i0;", timeline_utc() ); iBest = db_int(0, "SELECT max(x) FROM srch"); blob_append(&sql, "SELECT rid, uuid, date, comment, 0, 0 FROM srch " "WHERE 1 ", -1); if(!fAll){ blob_append_sql(&sql,"AND x>%d ", iBest/3); } blob_append(&sql, "ORDER BY x DESC, date DESC ", -1); db_prepare(&q, "%s", blob_sql_text(&sql)); blob_reset(&sql); print_timeline(&q, nLimit, width, 0); db_finalize(&q); } /* ** WEBPAGE: /search ** ** This is an EXPERIMENTAL page for doing search across a repository. ** ** The current implementation does a full text search over embedded ** documentation files on the tip of the "trunk" branch. Only files ** ending in ".wiki", ".md", ".html", and ".txt" are searched. ** ** The entire text is scanned. There is no full-text index. This is ** experimental. We may change to using a full-text index depending ** on performance. ** ** Other pending enhancements: ** * Search tickets ** * Search wiki */ void search_page(void){ const char *zPattern = PD("s",""); Stmt q; login_check_credentials(); if( !g.perm.Read ){ login_needed(); return; } style_header("Search"); @
@ @ @
while( fossil_isspace(zPattern[0]) ) zPattern++; if( zPattern[0] ){ search_sql_setup(g.db); add_content_sql_commands(g.db); search_init(zPattern, "", "", " ... ", SRCHFLG_STATIC|SRCHFLG_HTML|SRCHFLG_SCORE); db_multi_exec( "CREATE VIRTUAL TABLE temp.foci USING files_of_checkin;" "CREATE TEMP TABLE x(fn TEXT,url TEXT,snip TEXT);" "INSERT INTO x(fn,url,snip)" " SELECT filename, printf('%R/doc/trunk/%%s',filename)," " snippet(content(uuid))" " FROM foci" " WHERE checkinID=symbolic_name_to_rid('trunk')" " AND (filename GLOB '*.wiki' OR" " filename GLOB '*.md' OR" " filename GLOB '*.txt' OR" " filename GLOB '*.html');" ); db_prepare(&q, "SELECT url, substr(snip,8)" " FROM x WHERE snip IS NOT NULL" " ORDER BY substr(snip,1,8) DESC, fn;"); @
    while( db_step(&q)==SQLITE_ROW ){ const char *zUrl = db_column_text(&q, 0); const char *zSnippet = db_column_text(&q, 1); @
  1. %s(href("%s",zUrl))%h(zUrl)
    %s(zSnippet)

  2. } db_finalize(&q); @
} style_footer(); } /* ** This is a helper function for search_stext(). Writing into pOut ** the search text obtained from pIn according to zMimetype. */ static void get_stext_by_mimetype( Blob *pIn, const char *zMimetype, Blob *pOut ){ Blob html, title; blob_init(&html, 0, 0); blob_init(&title, 0, 0); if( zMimetype==0 ) zMimetype = "text/plain"; if( fossil_strcmp(zMimetype,"text/x-fossil-wiki")==0 ){ wiki_convert(pIn, &html, 0); html_to_plaintext(blob_str(&html), pOut); }else if( fossil_strcmp(zMimetype,"text/x-markdown")==0 ){ markdown_to_html(pIn, &title, &html); html_to_plaintext(blob_str(&html), pOut); }else if( fossil_strcmp(zMimetype,"text/html")==0 ){ html_to_plaintext(blob_str(pIn), pOut); }else{ *pOut = *pIn; blob_init(pIn, 0, 0); } blob_reset(&html); blob_reset(&title); } /* ** Return "search text" - a reduced version of a document appropriate for ** full text search and/or for constructing a search result snippet. ** ** cType: d Embedded documentation ** s Source code listing ** w Wiki page ** c Check-in comment ** t Ticket text ** e Event/Blog text ** k Diff of a wiki ** f Diff of a checkin ** ** zArg1, zArg2: Description of the document, depending on cType. */ void search_stext( char cType, /* Type of document */ const char *zArg1, /* First parameter */ const char *zArg2, /* Second parameter */ Blob *pOut /* OUT: Initialize to the search text */ ){ blob_init(pOut, 0, 0); switch( cType ){ case 'd': /* Doc. zArg1: RID of the file. zArg2: Filename */ case 's': { /* Source. zArg1: RID of the file. zArg2: Filename */ int rid = atoi(zArg1); Blob doc; content_get(rid, &doc); blob_to_utf8_no_bom(&doc, 0); get_stext_by_mimetype(&doc, mimetype_from_name(zArg2), pOut); blob_reset(&doc); break; } case 'w': { /* Wiki. zArg1: RID of the page. zArg2: Page name */ int rid = atoi(zArg1); Manifest *pWiki = manifest_get(rid, CFTYPE_WIKI,0); Blob wiki; if( pWiki==0 ) break; blob_init(&wiki, pWiki->zWiki, -1); get_stext_by_mimetype(&wiki, wiki_filter_mimetypes(pWiki->zMimetype), pOut); blob_reset(&wiki); manifest_destroy(pWiki); break; } } } /* ** COMMAND: test-search-stext ** ** Usage: fossil test-search-stext TYPE ARG1 ARG2 */ void test_search_stext(void){ Blob out; db_find_and_open_repository(0,0); if( g.argc!=5 ) usage("TYPE ARG1 ARG2"); search_stext(g.argv[2][0], g.argv[3], g.argv[4], &out); fossil_print("%s",blob_str(&out)); blob_reset(&out); }