Check-in [86b6ef7fe3]
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add the robot-exception setting.
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 86b6ef7fe3424284a38f9706482c58b4e65bd39d2f57ee0bdeb52065440e37a6
User & Date: drh 2025-08-21 14:08:29.274
Context
2025-08-21
14:13
Add /reports to the default robot-restrict setting. check-in: 12d871a00a user: stephan tags: trunk
14:08
Add the robot-exception setting. check-in: 86b6ef7fe3 user: drh tags: trunk
12:02
Change [3710202914] to call the function to load the diff-related JS code even for blocked diffs. By default, the loader function is already a no-op if diffs are blocked, so the behavior intended by [3710202914] is retained. But other branches are patching the loader function because they rely on the JS code even if the diffs are hidden. check-in: 171127fd14 user: florian tags: trunk
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/regexp.c.
848
849
850
851
852
853
854

855
856
857
858

859










860
861
862
863

864
865
866
867
868
869
870
871
872
873
874
875
876
** Usage: %fossil test-grep REGEXP [FILE...]
**
** Run a regular expression match over the named disk files, or against
** standard input if no disk files are named on the command-line.
**
** Options:
**   -i|--ignore-case    Ignore case

*/
void re_test_grep(void){
  ReCompiled *pRe;
  const char *zErr;

  int ignoreCase = find_option("ignore-case","i",0)!=0;










  if( g.argc<3 ){
    usage("REGEXP [FILE...]");
  }
  zErr = re_compile(&pRe, g.argv[2], ignoreCase);

  if( zErr ) fossil_fatal("%s", zErr);
  if( g.argc==3 ){
    grep_file(pRe, "-", stdin);
  }else{
    int i;
    for(i=3; i<g.argc; i++){
      FILE *in = fossil_fopen(g.argv[i], "rb");
      if( in==0 ){
        fossil_warning("cannot open \"%s\"", g.argv[i]);
      }else{
        grep_file(pRe, g.argv[i], in);
        fclose(in);
      }







>




>

>
>
>
>
>
>
>
>
>
>
|
|
|
|
>

|



|







848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
** Usage: %fossil test-grep REGEXP [FILE...]
**
** Run a regular expression match over the named disk files, or against
** standard input if no disk files are named on the command-line.
**
** Options:
**   -i|--ignore-case    Ignore case
**   --robot-exception   Use the robot-exception setting as the REGEXP
*/
void re_test_grep(void){
  ReCompiled *pRe;
  const char *zErr;
  int iFileList = 3;
  int ignoreCase = find_option("ignore-case","i",0)!=0;
  int bRobot = find_option("robot-exception",0,0)!=0;
  if( bRobot ){
    const char *zRe;
    db_find_and_open_repository(0,0);
    verify_all_options();
    zRe = db_get("robot-exception","^$");
    zErr = re_compile(&pRe, zRe, ignoreCase);
    iFileList = 2;
  }else{
    verify_all_options();
    if( g.argc<3 ){
      usage("REGEXP [FILE...]");
    }
    zErr = re_compile(&pRe, g.argv[2], ignoreCase);
  }
  if( zErr ) fossil_fatal("%s", zErr);
  if( g.argc==iFileList ){
    grep_file(pRe, "-", stdin);
  }else{
    int i;
    for(i=iFileList; i<g.argc; i++){
      FILE *in = fossil_fopen(g.argv[i], "rb");
      if( in==0 ){
        fossil_warning("cannot open \"%s\"", g.argv[i]);
      }else{
        grep_file(pRe, g.argv[i], in);
        fclose(in);
      }
Changes to src/robot.c.
262
263
264
265
266
267
268


















269
270
271
272
273
274
275
** also covers /tarball and /sqlar.  If a tag has an "X" character appended,
** then it only applies if query parameters are such that the page is
** particularly difficult to compute. In all other cases, the tag should
** exactly match the page name.
**
** Change this setting to "off" to disable all robot restrictions.
*/



















/*
** Return the default restriction GLOB.
**
** The value is a comma-separated list of page tags.  It is returned
** as a pointer to a string literal, so the caller must neither modify
** nor free it.
*/
const char *robot_restrict_default(void){
  return "timelineX,diff,annotate,zip,fileage,file,finfo";
}







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
** also covers /tarball and /sqlar.  If a tag has an "X" character appended,
** then it only applies if query parameters are such that the page is
** particularly difficult to compute. In all other cases, the tag should
** exactly match the page name.
**
** Change this setting to "off" to disable all robot restrictions.
*/
/*
** SETTING: robot-exception              width=40 block-text
**
** The value of this setting should be a regular expression.
** If the REQUEST_URI without the SCRIPT_NAME prefix
** matches this regular expression, then the request is an exception
** to anti-robot defenses and should be allowed through.  For
** example, to allow robots to download tarballs or ZIP archives
** for named versions and releases, you could use an expression like
** this:
**
**     ^/(tarball|zip)\\b.*\\b(version-|release)\\b
**
** This setting can hold multiple regular expressions, one
** regular expression per line.  The input URL is exempted from
** anti-robot defenses if any of the multiple regular expressions
** matches.
*/

/*
** Return the default restriction GLOB.
**
** The result is a comma-separated list of page tags held in static
** storage.  The caller must not modify or free it.
*/
const char *robot_restrict_default(void){
  static const char zDefault[] =
      "timelineX,diff,annotate,zip,fileage,file,finfo";
  return zDefault;
}
285
286
287
288
289
290
291







































































292
293
294
295
296
297
298
299
300
301
302
303
304
305
306




307
308
309
310
311
312
313
    if( zGlob==0 ) zGlob = "";
  }
  if( zGlob[0]==0 || fossil_strcmp(zGlob, "off")==0 ){
    return 0;
  }
  return glob_multi_match(zGlob,zTag);
}








































































/*
** Check to see if the page named in the argument is on the
** robot-restrict list.  If it is on the list and if the user
** is "nobody" then bring up a captcha to test to make sure that
** client is not a robot.
**
** zTag is the page tag that is looked up via robot_restrict_has_tag().
**
** This routine returns true if a captcha was rendered and if subsequent
** page generation should be aborted.  It returns false if the page
** should not be restricted and should be rendered normally.
*/
int robot_restrict(const char *zTag){
  /* A cached "not a robot" verdict short-circuits every other check */
  if( robot.resultCache==KNOWN_NOT_ROBOT ) return 0;
  /* Pages whose tag is not on the restriction list are never blocked */
  if( !robot_restrict_has_tag(zTag) ) return 0;
  /* Only challenge clients that might be robots */
  if( !client_might_be_a_robot() ) return 0;





  /* Generate the proof-of-work captcha */   
  ask_for_proof_that_client_is_not_robot();
  return 1;
}

/*







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>















>
>
>
>







303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
    if( zGlob==0 ) zGlob = "";
  }
  if( zGlob[0]==0 || fossil_strcmp(zGlob, "off")==0 ){
    return 0;
  }
  return glob_multi_match(zGlob,zTag);
}

/*
** Check the request URI to see if it matches one of the URI
** exceptions listed in the robot-exception setting.  Return true
** if it does.  Return false if it does not.
**
** For the purposes of this routine, the "request URI" means
** the REQUEST_URI value with the SCRIPT_NAME prefix removed and
** with QUERY_STRING appended with a "?" separator if QUERY_STRING
** is not empty.
**
** The robot-exception setting is scanned one line at a time.  Each
** line is one regular expression.  Trailing whitespace on a line is
** ignored and lines that are empty after trimming are skipped.  A line
** that does not compile as a regular expression causes a warning and
** is otherwise ignored.  Scanning stops at the first match.
**
** If the robot-exception setting does not exist or is an empty
** string, then return false.
*/
int robot_exception(void){
  const char *zRE = db_get("robot-exception",0);
  const char *zQS;    /* QUERY_STRING */
  const char *zURI;   /* REQUEST_URI */
  const char *zSN;    /* SCRIPT_NAME */
  char *zRequest;     /* REQUEST_URI w/o SCRIPT_NAME prefix + QUERY_STRING */
  int nRequest;       /* Length of zRequest in bytes */
  size_t nURI, nSN;   /* Length of zURI and zSN */
  int bMatch = 0;     /* True if there is a match */

  if( zRE==0 ) return 0;
  if( zRE[0]==0 ) return 0;
  zURI = PD("REQUEST_URI","");
  nURI = strlen(zURI);
  zSN = PD("SCRIPT_NAME","");
  nSN = strlen(zSN);
  /* Only the lengths are compared here.  NOTE(review): this assumes that
  ** REQUEST_URI really does begin with SCRIPT_NAME - confirm in callers. */
  if( nSN<=nURI ) zURI += nSN;
  zQS = P("QUERY_STRING");
  if( zQS && zQS[0] ){
    zRequest = mprintf("%s?%s", zURI, zQS);
  }else{
    zRequest = fossil_strdup(zURI);
  }
  nRequest = (int)strlen(zRequest);
  while( zRE[0] && bMatch==0 ){
    char *z;              /* NUL-terminated copy of the current line */
    const char *zErr;     /* Error message from re_compile(), or NULL */
    const char *zNL;      /* Newline that ends this line, or NULL if last */
    size_t nLine;         /* Bytes in the line after trimming the tail */
    size_t nStep;         /* Bytes to advance zRE to reach the next line */
    ReCompiled *pRe;

    zNL = strchr(zRE,'\n');
    if( zNL ){
      nLine = (size_t)(zNL - zRE);
      nStep = nLine + 1;       /* Also step over the newline itself */
    }else{
      /* Final line without a terminating newline.  Never do pointer
      ** arithmetic on the NULL returned by strchr(). */
      nLine = strlen(zRE);
      nStep = nLine;
    }
    /* Trim trailing whitespace by length, so that a line consisting of
    ** a single non-space character is not mistaken for a blank line. */
    while( nLine>0 && fossil_isspace(zRE[nLine-1]) ) nLine--;
    if( nLine==0 ){
      zRE += nStep;
      continue;
    }
    z = mprintf("%.*s", (int)nLine, zRE);
    zRE += nStep;
    zErr = re_compile(&pRe, z, 0);
    if( zErr ){
      fossil_warning("robot-exception error \"%s\" in expression \"%s\"\n",
                     zErr, z);
      fossil_free(z);
      continue;
    }
    fossil_free(z);
    bMatch = re_match(pRe, (const unsigned char*)zRequest, nRequest);
    re_free(pRe);
  }
  fossil_free(zRequest);
  return bMatch;
}

/*
** Decide whether the page identified by zTag must be hidden behind
** a captcha for the current client.
**
** A captcha is generated only when all of the following hold, checked
** in this order:
**
**    (1)  The client is not already recorded as KNOWN_NOT_ROBOT.
**    (2)  zTag is on the robot-restrict list.
**    (3)  The client might be a robot.
**    (4)  The request does not match a robot-exception pattern.
**
** If (1) through (3) hold but the request matches an exception, the
** client is recorded as KNOWN_NOT_ROBOT and no captcha is shown.
**
** Return true if a captcha was rendered, in which case the caller
** should abort further page generation.  Return false if the page
** should be rendered normally.
*/
int robot_restrict(const char *zTag){
  int bChallenged = 0;    /* True if the captcha was rendered */

  if( robot.resultCache!=KNOWN_NOT_ROBOT
   && robot_restrict_has_tag(zTag)
   && client_might_be_a_robot()
  ){
    if( robot_exception() ){
      robot.resultCache = KNOWN_NOT_ROBOT;
    }else{
      /* Generate the proof-of-work captcha */
      ask_for_proof_that_client_is_not_robot();
      bChallenged = 1;
    }
  }
  return bChallenged;
}

/*
Changes to src/setup.c.
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503















504
505
506
507
508
509
510
  @ <p>The settings on this page are intended to help site administrators
  @ defend the site against robots.
  @
  @ <form action="%R/setup_robot" method="post"><div>
  login_insert_csrf_secret();
  @ <input type="submit"  name="submit" value="Apply Changes"></p>
  @ <hr>
  @ <p><b>Do not allow robots access to these pages.</b>
  @ <p> If the page name matches the GLOB pattern of this setting, and the
  @ users is "nobody", and the client has not previously passed a captcha
  @ test to show that it is not a robot, then the page is not displayed.
  @ A captcha test is is rendered instead.
  @ The recommended value for this setting is:
  @ <p>
  @ &emsp;&emsp;&emsp;<tt>%h(robot_restrict_default())</tt>
  @ <p>
  @ The "diff" tag covers all diffing pages such as /vdiff, /fdiff, and 
  @ /vpatch.  The "annotate" tag covers /annotate and also /blame and
  @ /praise.  The "zip" covers itself and also /tarball and /sqlar. If a
  @ tag has an "X" character appended, then it only applies if query
  @ parameters are such that the page is particularly difficult to compute.
  @ In all other case, the tag should exactly match the page name.
  @
  @ To disable robot restrictions, change this setting to "off".
  @ (Property: robot-restrict)
  @ <br>
  textarea_attribute("", 2, 80,
      "robot-restrict", "rbrestrict", robot_restrict_default(), 0);
















  @ <hr>
  addAutoHyperlinkSettings();

  @ <hr>
  entry_attribute("Anonymous Login Validity", 11, "anon-cookie-lifespan",
                  "anoncookls", "840", 0);
  @ <p>The number of minutes for which an anonymous login cookie is valid.







|
|




















>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
  @ <p>The settings on this page are intended to help site administrators
  @ defend the site against robots.
  @
  @ <form action="%R/setup_robot" method="post"><div>
  login_insert_csrf_secret();
  @ <input type="submit"  name="submit" value="Apply Changes"></p>
  @ <hr>
  @ <p><b>Do not allow robots access to these pages.</b><br>
  @ If the page name matches the GLOB pattern of this setting, and the
  @ users is "nobody", and the client has not previously passed a captcha
  @ test to show that it is not a robot, then the page is not displayed.
  @ A captcha test is is rendered instead.
  @ The recommended value for this setting is:
  @ <p>
  @ &emsp;&emsp;&emsp;<tt>%h(robot_restrict_default())</tt>
  @ <p>
  @ The "diff" tag covers all diffing pages such as /vdiff, /fdiff, and 
  @ /vpatch.  The "annotate" tag covers /annotate and also /blame and
  @ /praise.  The "zip" covers itself and also /tarball and /sqlar. If a
  @ tag has an "X" character appended, then it only applies if query
  @ parameters are such that the page is particularly difficult to compute.
  @ In all other case, the tag should exactly match the page name.
  @
  @ To disable robot restrictions, change this setting to "off".
  @ (Property: robot-restrict)
  @ <br>
  textarea_attribute("", 2, 80,
      "robot-restrict", "rbrestrict", robot_restrict_default(), 0);

  @ <hr>
  @ <p><b>Exceptions to anti-robot restrictions</b><br>
  @ The entry below is a list of regular expressions, one per line.
  @ If any of these regular expressions match the input URL, then the
  @ request is exempt from anti-robot defenses.  Use this, for example,
  @ to allow scripts to download release tarballs using a pattern
  @ like:</p>
  @ <p>
  @ &emsp;&emsp;<tt>^/tarball\\b*\\b(version-|release)\\b</tt>
  @ <p>The pattern should match against the REQUEST_URI with the
  @ SCRIPT_NAME prefix removed, and with QUERY_STRING appended following
  @ a "?" if QUERY_STRING exists.  (Property: robot-exception)<br>
  textarea_attribute("", 3, 80,
      "robot-exception", "rbexcept", "", 0);

  @ <hr>
  addAutoHyperlinkSettings();

  @ <hr>
  entry_attribute("Anonymous Login Validity", 11, "anon-cookie-lifespan",
                  "anoncookls", "840", 0);
  @ <p>The number of minutes for which an anonymous login cookie is valid.