/* ** Copyright (c) 2025 D. Richard Hipp ** ** This program is free software; you can redistribute it and/or ** modify it under the terms of the Simplified BSD License (also ** known as the "2-Clause License" or "FreeBSD License".) ** This program is distributed in the hope that it will be useful, ** but without any warranty; without even the implied warranty of ** merchantability or fitness for a particular purpose. ** ** Author contact information: ** drh@hwaci.com ** http://www.hwaci.com/drh/ ** ******************************************************************************* ** ** This file contains code that attempts to prevent robots and ** especially bot-nets from consume excess CPU and bandwidth when ** Fossil is run as a service. */ #include "config.h" #include "robot.h" #include #include /* ** The name of the cookie used to demonstrate that the client has been ** tested and is believed to be operated by a human, not by a robot. */ #if INTERFACE #define ROBOT_COOKIE "fossil-client-ok" #endif /* ** Values computed only once and then cached. */ static struct RobotCache { unsigned int h1, h2; /* Proof-of-work hash values */ unsigned int resultCache; /* 0: unknown. 1: human 2: might-be-robot */ } robot = { 0, 0, 0 }; /* ** Allowed values for robot.resultCache. ** ** The names are slightly misleading. KNOWN_NOT_ROBOT might be set even ** if the client is a robot, but only if the robot is an approved robot. ** A better name might be "KNOWN_NOT_UNAUTHORIZED_ROBOT", but that is too ** long of a name. */ #define KNOWN_NOT_ROBOT 1 /* Approved to consume CPU and bandwidth */ #define MIGHT_BE_ROBOT 2 /* Might be an unapproved robot */ /* ** Compute two hashes, robot.h1 and robot.h2, that are used as ** part of determining whether or not the HTTP client is a robot. ** These hashes are based on current time, client IP address, ** and User-Agent. robot.h1 is for the current time slot and ** robot.h2 is the previous. ** ** The hashes are integer values between 100,000,000 and 999,999,999 ** inclusive. */ static void robot_pow_hash(void){ const char *az[2], *z; sqlite3_int64 tm; unsigned int h1, h2, k; if( robot.h1 ) return; /* Already computed */ /* Construct a proof-of-work value based on the IP address of the ** sender and the sender's user-agent string. The current time also ** affects the pow value, so actually compute two values, one for the ** current 900-second interval and one for the previous. Either can ** match. The pow-value is an integer between 100,000,000 and ** 999,999,999. */ az[0] = P("REMOTE_ADDR"); az[1] = P("HTTP_USER_AGENT"); tm = time(0); h1 = (unsigned)(tm/900)&0xffffffff; h2 = h1 - 1; for(k=0; k<2; k++){ z = az[k]; if( z==0 ) continue; while( *z ){ h1 = (h1 + *(unsigned char*)z)*0x9e3779b1; h2 = (h2 + *(unsigned char*)z)*0x9e3779b1; z++; } } robot.h1 = (h1 % 900000000) + 100000000; robot.h2 = (h2 % 900000000) + 100000000; } /* ** Return true if the HTTP client has not demonstrated that it is ** human interactive. Return false is the HTTP client might be ** a non-interactive robot. ** ** For this routine, any of the following is considered proof that ** the HTTP client is not a robot: ** ** 1. There is a valid login, including "anonymous". User "nobody" ** is not a valid login, but every other user is. ** ** 2. There exists a ROBOT_COOKIE with the correct proof-of-work ** value. ** ** 3. There exists a proof=VALUE query parameter where VALUE is ** a correct proof-of-work value. ** ** 4. There exists a valid token=VALUE query parameter. ** ** After being run once, this routine caches its findings and ** returns very quickly on subsequent invocations. */ int client_might_be_a_robot(void){ const char *z; /* Only do this computation once, then cache the results for future ** use */ if( robot.resultCache ){ return robot.resultCache==MIGHT_BE_ROBOT; } /* Condition 1: Is there a valid login? */ if( g.userUid==0 ){ login_check_credentials(); } if( g.zLogin!=0 ){ robot.resultCache = KNOWN_NOT_ROBOT; return 0; } /* Condition 2: If there is already a proof-of-work cookie ** with a correct value, then the user agent has been authenticated. */ z = P(ROBOT_COOKIE); if( z ){ unsigned h = atoi(z); robot_pow_hash(); if( (h==robot.h1 || h==robot.h2) && !cgi_is_qp(ROBOT_COOKIE) ){ robot.resultCache = KNOWN_NOT_ROBOT; return 0; } } /* Condition 3: There is a "proof=VALUE" query parameter with a valid ** VALUE attached. If this is the case, also set the robot cookie ** so that future requests will hit condition 2 above. */ z = P("proof"); if( z ){ unsigned h = atoi(z); robot_pow_hash(); if( h==robot.h1 || h==robot.h2 ){ cgi_set_cookie(ROBOT_COOKIE,z,"/",900); robot.resultCache = KNOWN_NOT_ROBOT; return 0; } cgi_tag_query_parameter("proof"); } /* Condition 4: If there is a "token=VALUE" query parameter with a ** valid VALUE argument, then assume that the request is coming from ** either an interactive human session, or an authorized robot that we ** want to treat as human. Allow it through and also set the robot cookie. */ z = P("token"); if( z!=0 ){ if( db_exists("SELECT 1 FROM config" " WHERE name='token-%q'" " AND json_valid(value,6)" " AND value->>'user' IS NOT NULL", z) ){ char *zVal; robot_pow_hash(); zVal = mprintf("%u", robot.h1); cgi_set_cookie(ROBOT_COOKIE,zVal,"/",900); fossil_free(zVal); robot.resultCache = KNOWN_NOT_ROBOT; return 0; /* There is a valid token= query parameter */ } cgi_tag_query_parameter("token"); } /* We have no proof that the request is coming from an interactive ** human session, so assume the request comes from a robot. */ robot.resultCache = MIGHT_BE_ROBOT; return 1; } /* ** Rewrite the current page with content that attempts ** to prove that the client is not a robot. */ static void ask_for_proof_that_client_is_not_robot(void){ unsigned p1, p2, p3, p4, p5, k2, k3; int k; /* Ask the client to present proof-of-work */ cgi_reset_content(); cgi_set_content_type("text/html"); style_header("Browser Verification"); @

Checking to see if you are a robot

@

@ @ \ @

if( g.zExtra && g.zExtra[0] ) cgi_tag_query_parameter("name"); cgi_query_parameters_to_hidden(); @ @
@ style_finish_page(); } /* ** SETTING: robot-restrict width=40 block-text ** The VALUE of this setting is a list of GLOB patterns that match ** pages for which complex HTTP requests from unauthenticated clients ** should be disallowed. "Unauthenticated" means the user is "nobody". ** The recommended value for this setting is: ** ** timelineX,diff,annotate,fileage,file,finfo,reports,tree,download,hexdump ** ** Usually the tag should exactly match the page name. The "diff" tag ** covers all diffing pages such as /vdiff, /fdiff, and /vpatch. The ** "annotate" tag also covers /blame and /praise. "zip" also covers ** /tarball and /sqlar. If a tag has an "X" character appended then it ** only applies if query parameters are such that the page is particularly ** difficult to compute. Useful "X" tags include "timelineX" and "zipX". ** The "ext" tag matches all extension, but a tag of the form "ext/PATH" ** only matches the extension at PATH. ** ** See the [[robot-zip-leaf]] and [[robot-zip-tag]] settings ** for additional controls associated with the "zipX" restriction. ** ** Change this setting "off" to disable all robot restrictions. */ /* ** SETTING: robot-exception width=40 block-text ** ** The value of this setting should be a regular expression. ** If it matches the REQUEST_URI without the SCRIPT_NAME prefix ** matches this regular expression, then the request is an exception ** to anti-robot defenses and should be allowed through. For ** example, to allow robots to download tarballs or ZIP archives ** for named versions and releases, you could use an expression like ** this: ** ** ^/tarball/(version-[0-9.]+|release)/ ** ** This setting can hold multiple regular expressions, one ** regular expression per line. The input URL is exempted from ** anti-robot defenses if any of the multiple regular expressions ** matches. */ /* ** SETTING: robot-zip-leaf boolean ** ** If this setting is true, the robots are allowed to download tarballs, ** ZIP-archives, and SQL-archives even though "zipX" is found in ** the [[robot-restrict]] setting as long as the specific check-in being ** downloaded is a leaf check-in. */ /* ** SETTING: robot-zip-tag width=40 block-text ** ** If this setting is a list of GLOB patterns matching tags, ** then robots are allowed to download tarballs, ZIP-archives, and ** SQL-archives even though "zipX" appears in [[robot-restrict]], as long as ** the specific check-in being downloaded has a tags that matches ** the GLOB list of this setting. Recommended value: ** "release,robot-access". */ /* ** Return the default restriction GLOB */ const char *robot_restrict_default(void){ return "timelineX,diff,annotate,fileage,file,finfo,reports," "tree,hexdump,download"; } /* ** Return true if zTag matches one of the tags in the robot-restrict ** setting. ** ** A zTag of "*" matches anything. */ static int robot_restrict_has_tag(const char *zTag){ static const char *zGlob = 0; if( zGlob==0 ){ zGlob = db_get("robot-restrict",robot_restrict_default()); if( zGlob==0 ) zGlob = ""; } if( zGlob[0]==0 || fossil_strcmp(zGlob, "off")==0 ){ return 0; } if( zTag==0 || (zTag[0]=='*' && zTag[1]==0) ){ return 1; } return glob_multi_match(zGlob,zTag); } /* ** Check the request URI to see if it matches one of the URI ** exceptions listed in the robot-exception setting. Return true ** if it does. Return false if it does not. ** ** For the purposes of this routine, the "request URI" means ** the REQUEST_URI value with the SCRIPT_NAME prefix removed and ** with QUERY_STRING appended with a "?" separator if QUERY_STRING ** is not empty. ** ** If the robot-exception setting does not exist or is an empty ** string, then return false. */ int robot_exception(void){ const char *zRE = db_get("robot-exception",0); const char *zQS; /* QUERY_STRING */ const char *zURI; /* REQUEST_URI */ const char *zSN; /* SCRIPT_NAME */ const char *zNL; /* Next newline character */ char *zRequest; /* REQUEST_URL w/o SCRIPT_NAME prefix + QUERY_STRING */ int nRequest; /* Length of zRequest in bytes */ size_t nURI, nSN; /* Length of zURI and zSN */ int bMatch = 0; /* True if there is a match */ if( zRE==0 ) return 0; if( zRE[0]==0 ) return 0; zURI = PD("REQUEST_URI",""); nURI = strlen(zURI); zSN = PD("SCRIPT_NAME",""); nSN = strlen(zSN); if( nSN<=nURI ) zURI += nSN; zQS = P("QUERY_STRING"); if( zQS && zQS[0] ){ zRequest = mprintf("%s?%s", zURI, zQS); }else{ zRequest = fossil_strdup(zURI); } nRequest = (int)strlen(zRequest); while( zRE[0] && bMatch==0 ){ char *z; const char *zErr; size_t n; ReCompiled *pRe; zNL = strchr(zRE,'\n'); if( zNL ){ n = (size_t)(zNL - zRE)+1; while( zNL>zRE && fossil_isspace(zNL[0]) ) zNL--; if( zNL==zRE ){ zRE += n; continue; } }else{ n = strlen(zRE); } z = mprintf("%.*s", (int)(zNL - zRE)+1, zRE); zRE += n; zErr = fossil_re_compile(&pRe, z, 0); if( zErr ){ fossil_warning("robot-exception error \"%s\" in expression \"%s\"\n", zErr, z); fossil_free(z); continue; } fossil_free(z); bMatch = re_match(pRe, (const unsigned char*)zRequest, nRequest); re_free(pRe); } fossil_free(zRequest); return bMatch; } /* ** Return true if one or more of the conditions below are true. ** Return false if all of the following are false: ** ** * The zTag is on the robot-restrict list ** ** * The client that submitted the HTTP request might be ** a robot ** ** * The Request URI does not match any of the exceptions ** in the robot-exception setting. ** ** In other words, return true if a call to robot_restrict() would ** return true and false if a call to robot_restrict() would return ** false. ** ** The difference between this routine an robot_restrict() is that ** this routine does not generate a proof-of-work captcha. This ** routine does not change the HTTP reply in any way. It simply ** returns true or false. */ int robot_would_be_restricted(const char *zTag){ if( robot.resultCache==KNOWN_NOT_ROBOT ) return 0; if( !robot_restrict_has_tag(zTag) ) return 0; if( !client_might_be_a_robot() ) return 0; if( robot_exception() ){ robot.resultCache = KNOWN_NOT_ROBOT; return 0; } return 1; } /* ** Check to see if the page named in the argument is on the ** robot-restrict list. If it is on the list and if the user ** is might be a robot, then bring up a captcha to test to make ** sure that client is not a robot. ** ** This routine returns true if a captcha was rendered and if subsequent ** page generation should be aborted. It returns false if the page ** should not be restricted and should be rendered normally. */ int robot_restrict(const char *zTag){ if( robot_would_be_restricted(zTag) ){ /* Generate the proof-of-work captcha */ ask_for_proof_that_client_is_not_robot(); return 1; }else{ return 0; } } /* ** Check to see if a robot is allowed to download a tarball, ZIP archive, ** or SQL Archive for a particular check-in identified by the "rid" ** argument. Return true to block the download. Return false to ** continue. Prior to returning true, a captcha is presented to the user. ** No output is generated when returning false. ** ** The rules: ** ** (1) If "zipX" is missing from the robot-restrict setting, then robots ** are allowed to download any archive. None of the remaining rules ** below are consulted unless "zipX" is on the robot-restrict setting. ** ** (2) If the robot-zip-leaf setting is true, then robots are allowed ** to download archives for any leaf check-in. This allows URL like ** /tarball/trunk/archive.tar.gz to work since branch labels like "trunk" ** always resolve to a leaf. ** ** (3) If the robot-zip-tag setting is a comma-separated tags, then any ** check-in that contains one of the tags on that list is allowed to ** be downloaded. This allows check-ins with tags like "release" or ** "robot-access" to be downloaded by robots. */ int robot_restrict_zip(int rid){ const char *zTag; if( !robot_restrict_has_tag("zipX") || !client_might_be_a_robot() ){ return 0; /* Rule (1) */ } if( db_get_boolean("robot-zip-leaf",0) && is_a_leaf(rid) ){ return 0; /* Rule (2) */ } zTag = db_get("robot-zip-tag",0); if( zTag && zTag[0] && fossil_strcmp(zTag,"off")!=0 ){ int ok = 0; Stmt q; db_prepare(&q, "SELECT substr(tagname,5) FROM tagxref, tag" " WHERE tagxref.rid=%d" " AND tag.tagid=tagxref.tagid" " AND tagxref.tagtype=1" " AND tag.tagname GLOB 'sym-*'", rid ); while( !ok && db_step(&q)==SQLITE_ROW ){ if( glob_multi_match(zTag, db_column_text(&q,0)) ) ok = 1; } db_finalize(&q); if( ok ) return 0; /* Rule (3) */ } /* Generate the proof-of-work captcha */ ask_for_proof_that_client_is_not_robot(); return 1; } /* ** WEBPAGE: test-robotck ** ** Run the robot_restrict() function using the value of the "name=" ** query parameter as an argument. Used for testing the robot_restrict() ** logic. ** ** Whenever this page is successfully rendered (when it doesn't go to ** the captcha) it deletes the proof-of-work cookie. So reloading the ** page will reset the cookie and restart the verification. ** ** If the zip=CHECKIN query parameter is provided, then also invoke ** robot_restrict_archive() on the RID of CHECKIN. */ void robot_restrict_test_page(void){ const char *zName = P("name"); const char *zZip = P("zip"); const char *zP1 = P("proof"); const char *zP2 = P(ROBOT_COOKIE); const char *z; int rid = 0; if( zName==0 || zName[0]==0 ) zName = g.zPath; login_check_credentials(); if( g.zLogin==0 ){ login_needed(1); return; } g.zLogin = 0; if( robot_restrict(zName) ) return; if( zZip && zZip[0] ){ rid = symbolic_name_to_rid(zZip, "ci"); if( rid && robot_restrict_zip(rid) ) return; } style_set_current_feature("test"); style_header("robot_restrict() test"); @

Captcha passed

@ @

if( zP1 && zP1[0] ){ @ proof=%h(zP1)
} if( zP2 && zP2[0] ){ @ %h(ROBOT_COOKIE)=%h(zP2)
cgi_set_cookie(ROBOT_COOKIE,"",0,-1); } if( zZip && zZip[0] ){ @ zip=%h(zZip)
@ rid=%d(rid)
} if( g.perm.Admin ){ z = db_get("robot-restrict",robot_restrict_default()); if( z && z[0] ){ @ robot-restrict=%h(z)
} @ robot.h1=%u(robot.h1)
@ robot.h2=%u(robot.h2)
switch( robot.resultCache ){ case MIGHT_BE_ROBOT: { @ robot.resultCache=MIGHT_BE_ROBOT
break; } case KNOWN_NOT_ROBOT: { @ robot.resultCache=KNOWN_NOT_ROBOT
break; } default: { @ robot.resultCache=OTHER (%d(robot.resultCache))
break; } } } @

@

Retry style_finish_page(); } /* ** WEBPAGE: tokens ** ** Allow users to create, delete, and view their access token. ** ** The access token is a string TOKEN which if included in a query ** parameter like "token=TOKEN" authenticates a request as coming ** from an authorized agent. This can be used, for example, by ** script to access content without running into problems with ** robot defenses. */ void tokens_page(void){ char *zMyToken; login_check_credentials(); style_set_current_feature("tokens"); style_header("Access Tokens"); if( g.zLogin==0 || fossil_strcmp(g.zLogin,"anonymous")==0 ){ @ User "%h(g.zLogin?g.zLogin:"anonymous")" is not allowed to @ own or use access tokens. style_finish_page(); return; } if( g.perm.Admin && P("del")!=0 ){ const char *zDel = P("del"); db_unprotect(PROTECT_CONFIG); db_multi_exec( "DELETE FROM config WHERE name='token-%q'", zDel); db_protect_pop(); } zMyToken = db_text(0, "SELECT substr(name,7) FROM config" " WHERE name GLOB 'token-*'" " AND json_valid(value,6)" " AND value->>'user' = %Q", g.zLogin ); if( zMyToken==0 && P("new") ){ sqlite3_uint64 r; sqlite3_randomness(sizeof(r),&r); zMyToken = mprintf("%016llx", r); db_unprotect(PROTECT_CONFIG); db_multi_exec( "INSERT INTO config(name,value,mtime)" "VALUES('token-%q','{user:%!j}',now())", zMyToken, g.zLogin ); db_protect_pop(); }else if( zMyToken!=0 && P("selfdel") && fossil_strcmp(zMyToken,P("selfdel"))==0 ){ db_unprotect(PROTECT_CONFIG); db_multi_exec( "DELETE FROM config WHERE name='token-%q'", zMyToken); db_protect_pop(); zMyToken = 0; } if( zMyToken==0 ){ @

You do not currently have an access token. @ Create one }else{ @

Your access token is "%h(zMyToken)". @

Use this token as the value of the token= query parameter @ to bypass robot defenses on unauthenticated queries to this @ server (%R). Do not misuse your token. Keep it confidential. @ If you misuse your token, or if somebody else steals your token @ and misuses, that can result in loss of access privileges to this @ server. @

Delete my token } if( g.perm.Admin ){ int nTok = 0; Stmt s; db_prepare(&s, "SELECT substr(name,7), value->>'user', datetime(mtime,'unixepoch')" " FROM config" " WHERE name GLOB 'token-*'" " AND json_valid(value,6)" ); while( db_step(&s)==SQLITE_ROW ){ if( nTok==0 ){ @


@

All tokens

@ @ } nTok++; @ } db_finalize(&s); if( nTok==0 ){ @
@

There are access tokens defined for this repository. }else{ @

User Token Date  
%h(db_column_text(&s,1)) @ %h(db_column_text(&s,0)) @ %h(db_column_text(&s,2)) @ delete @
} } style_finish_page(); }