Check-in [e180dbb455]
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add fts-config tokenizer unicode61 option. Prompted by [forum:a4bfcff66548a1ff|forum post a4bfcff66548a1ff].
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: e180dbb4559d5c85b6d293ab12dbb64c43504602323d83fed3d19df83afb4f73
User & Date: stephan 2023-08-18 12:17:38.981
Original Comment: Add fts-config tokenizer unicode61 option. Prompted by [forum post a4bfcff66548a1ff|forum:a4bfcff66548a1ff].
Context
2023-08-18
13:03
Added "unicode61" to search setup usage message check-in: 9965e1d86f user: wyoung tags: trunk
12:17
Add fts-config tokenizer unicode61 option. Prompted by [forum:a4bfcff66548a1ff|forum post a4bfcff66548a1ff]. check-in: e180dbb455 user: stephan tags: trunk
2023-08-14
21:09
Make sure the EmailEvent object is completely zeroed whenever it is allocated. check-in: 33877fa50b user: drh tags: trunk
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/search.c.
1549
1550
1551
1552
1553
1554
1555
1556
1557

1558
1559
1560
1561
1562
1563
1564
1565
@ DROP TABLE IF EXISTS repository.ftsdocs;
;

#if INTERFACE
/*
** Values for the search-tokenizer config option.
**
** The setting strings "porter" and "trigram" select the like-named
** values.  An unset setting yields FTS5TOK_NONE.  Any other string
** yields FTS5TOK_PORTER if is_truth() accepts it, else FTS5TOK_NONE.
*/
#define FTS5TOK_NONE     0 /* no FTS stemmer */
#define FTS5TOK_PORTER   1 /* porter stemmer */

#define FTS5TOK_TRIGRAM  3 /* trigram stemmer */
#endif

/*
** Cached FTS5TOK_xyz value for search_tokenizer_type() and
** friends.
*/
static int iFtsTokenizer = -1;







|
|
>
|







1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
@ DROP TABLE IF EXISTS repository.ftsdocs;
;

#if INTERFACE
/*
** Values for the search-tokenizer config option.
**
** The setting strings "porter", "unicode61" and "trigram" select the
** like-named values.  An unset setting yields FTS5TOK_NONE.  Any other
** string yields FTS5TOK_PORTER if is_truth() accepts it, else
** FTS5TOK_NONE.
*/
#define FTS5TOK_NONE      0 /* disabled */
#define FTS5TOK_PORTER    1 /* porter stemmer */
#define FTS5TOK_UNICODE61 2 /* unicode61 tokenizer */
#define FTS5TOK_TRIGRAM   3 /* trigram tokenizer */
#endif

/*
** Cached FTS5TOK_xyz value for search_tokenizer_type() and
** friends.
*/
static int iFtsTokenizer = -1;
1576
1577
1578
1579
1580
1581
1582


1583
1584
1585
1586
1587
1588
1589
    return iFtsTokenizer;
  }
  z = db_get("search-tokenizer",0);
  if( 0==z ){
    iFtsTokenizer = FTS5TOK_NONE;
  }else if(0==fossil_strcmp(z,"porter")){
    iFtsTokenizer = FTS5TOK_PORTER;


  }else if(0==fossil_strcmp(z,"trigram")){
    iFtsTokenizer = FTS5TOK_TRIGRAM;
  }else{
    iFtsTokenizer = is_truth(z) ? FTS5TOK_PORTER : FTS5TOK_NONE;
  }
  fossil_free(z);
  return iFtsTokenizer;







>
>







1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
    return iFtsTokenizer;
  }
  z = db_get("search-tokenizer",0);
  if( 0==z ){
    iFtsTokenizer = FTS5TOK_NONE;
  }else if(0==fossil_strcmp(z,"porter")){
    iFtsTokenizer = FTS5TOK_PORTER;
  }else if(0==fossil_strcmp(z,"unicode61")){
    iFtsTokenizer = FTS5TOK_UNICODE61;
  }else if(0==fossil_strcmp(z,"trigram")){
    iFtsTokenizer = FTS5TOK_TRIGRAM;
  }else{
    iFtsTokenizer = is_truth(z) ? FTS5TOK_PORTER : FTS5TOK_NONE;
  }
  fossil_free(z);
  return iFtsTokenizer;
1604
1605
1606
1607
1608
1609
1610


1611
1612
1613
1614
1615
1616
1617
  if( 0==z ){
    z = zTmp = db_get("search-tokenizer",0);
  }
  if( 0==z ){
    zRc = "off";
  }else if( 0==fossil_strcmp(z,"porter") ){
    zRc = "porter";


  }else if( 0==fossil_strcmp(z,"trigram") ){
    zRc = "trigram";
  }else{
    zRc = is_truth(z) ? "porter" : "off";
  }
  fossil_free(zTmp);
  return zRc;







>
>







1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
  if( 0==z ){
    z = zTmp = db_get("search-tokenizer",0);
  }
  if( 0==z ){
    zRc = "off";
  }else if( 0==fossil_strcmp(z,"porter") ){
    zRc = "porter";
  }else if( 0==fossil_strcmp(z,"unicode61") ){
    zRc = "unicode61";
  }else if( 0==fossil_strcmp(z,"trigram") ){
    zRc = "trigram";
  }else{
    zRc = is_truth(z) ? "porter" : "off";
  }
  fossil_free(zTmp);
  return zRc;
1631
1632
1633
1634
1635
1636
1637

1638
1639
1640
1641
1642
1643
1644
*/
static int searchIdxExists = -1;
void search_create_index(void){
  /* FTS5TOK_xyz value describing the configured tokenizer. */
  const int useTokenizer = search_tokenizer_type(0);
  /* Text substituted for the "%s" in zFtsSchema; empty unless a named
  ** tokenizer is selected. */
  const char *zExtra;
  switch(useTokenizer){
    case FTS5TOK_PORTER: zExtra = ",tokenize=porter"; break;

    case FTS5TOK_TRIGRAM: zExtra = ",tokenize=trigram"; break;
    default: zExtra = ""; break;
  }
  search_sql_setup(g.db);
  db_multi_exec(zFtsSchema/*works-like:"%s"*/, zExtra/*safe-for-%s*/);
  /* NOTE(review): presumably caches that the index now exists - confirm
  ** against the readers of searchIdxExists. */
  searchIdxExists = 1;
}







>







1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
*/
static int searchIdxExists = -1;
void search_create_index(void){
  /* Text substituted for the "%s" in zFtsSchema.  It stays empty unless
  ** one of the named tokenizers is configured. */
  const char *zTokArg = "";
  const int eTok = search_tokenizer_type(0);

  if( eTok==FTS5TOK_PORTER ){
    zTokArg = ",tokenize=porter";
  }else if( eTok==FTS5TOK_UNICODE61 ){
    zTokArg = ",tokenize=unicode61";
  }else if( eTok==FTS5TOK_TRIGRAM ){
    zTokArg = ",tokenize=trigram";
  }
  search_sql_setup(g.db);
  db_multi_exec(zFtsSchema/*works-like:"%s"*/, zTokArg/*safe-for-%s*/);
  /* NOTE(review): presumably caches that the index now exists - confirm
  ** against the readers of searchIdxExists. */
  searchIdxExists = 1;
}
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
**
**     enable cdtwe       Enable various kinds of search. c=Check-ins,
**                        d=Documents, t=Tickets, w=Wiki, e=Tech Notes.
**
**     disable cdtwe      Disable various kinds of search
**
**     tokenizer VALUE    Select a tokenizer for indexed search. VALUE
**                        may be one of (porter, on, off, trigram), and
**                        "on" is equivalent to "porter". Unindexed
**                        search never uses tokenization or stemming.
**
** The current search settings are displayed after any changes are applied.
** Run this command with no arguments to simply see the settings.
*/
void fts_config_cmd(void){
  static const struct {







|
|







1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
**
**     enable cdtwe       Enable various kinds of search. c=Check-ins,
**                        d=Documents, t=Tickets, w=Wiki, e=Tech Notes.
**
**     disable cdtwe      Disable various kinds of search
**
**     tokenizer VALUE    Select a tokenizer for indexed search. VALUE
**                        may be one of (porter, on, off, trigram, unicode61),
**                        and "on" is equivalent to "porter". Unindexed
**                        search never uses tokenization or stemming.
**
** The current search settings are displayed after any changes are applied.
** Run this command with no arguments to simply see the settings.
*/
void fts_config_cmd(void){
  static const struct {
Changes to src/setup.c.
2014
2015
2016
2017
2018
2019
2020

2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
** Renders a selection list of values for the search-tokenizer
** setting, using the form field name "ftstok".
*/
static void select_fts_tokenizer(void){
  /* Flat list of string pairs.  Each entry appears to be a
  ** search-tokenizer setting value followed by the label shown for it. */
  const char *const aTokenizer[] = {
  "off",     "None",
  "porter",  "Porter Stemmer",

  "trigram", "Trigram"
  };
  /* NOTE(review): the literal 3 appears to be the number of pairs in
  ** aTokenizer[]; it must be updated by hand whenever a pair is added. */
  multiple_choice_attribute("FTS Tokenizer", "search-tokenizer",
                            "ftstok", "off", 3, aTokenizer);
}

/*
** WEBPAGE: srchsetup
**
** Configure the search engine.  Requires Admin privilege.
*/







>
|


|







2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
** Renders a selection list of values for the search-tokenizer
** setting, using the form field name "ftstok".
*/
static void select_fts_tokenizer(void){
  /* Flat list of string pairs: a search-tokenizer setting value followed
  ** by the label shown for it in the selection list. */
  const char *const aTokenizer[] = {
  "off",     "None",
  "porter",  "Porter Stemmer",
  "unicode61", "Unicode without stemming",
  "trigram", "Trigram",
  };
  /* Number of (value, label) pairs.  Derived from the array size so that
  ** adding a tokenizer above cannot leave a stale hard-coded count. */
  const int nChoice = (int)(sizeof(aTokenizer)/sizeof(aTokenizer[0]))/2;
  multiple_choice_attribute("FTS Tokenizer", "search-tokenizer",
                            "ftstok", "off", nChoice, aTokenizer);
}

/*
** WEBPAGE: srchsetup
**
** Configure the search engine.  Requires Admin privilege.
*/