Fossil

codecheck1.c
Login

File tools/codecheck1.c from the latest check-in


/*
** Copyright (c) 2014 D. Richard Hipp
**
** This program is free software; you can redistribute it and/or
** modify it under the terms of the Simplified BSD License (also
** known as the "2-Clause License" or "FreeBSD License".)
**
** This program is distributed in the hope that it will be useful,
** but without any warranty; without even the implied warranty of
** merchantability or fitness for a particular purpose.
**
** Author contact information:
**   drh@hwaci.com
**   http://www.hwaci.com/drh/
**
*******************************************************************************
**
** This program reads Fossil source code files and tries to verify that
** printf-style format strings are correct.
**
** This program implements a compile-time validation step on the Fossil
** source code.  Running this program is entirely optional.  Its role is
** similar to the -Wall compiler switch on gcc, or the scan-build utility
** of clang, or other static analyzers.  The purpose is to try to identify
** problems in the source code at compile-time.  The difference is that this
** static checker is specifically designed for the particular printf formatter
** implementation used by Fossil.
**
** Checks include:
**
**    *  Verify that vararg formatting routines like blob_printf() or
**       db_multi_exec() have the correct number of arguments for their
**       format string.
**
**    *  For routines designed to generate SQL, warn about the use of %s
**       which might allow SQL injection.
*/
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <assert.h>

/*
** Debugging switch
*/
static int eVerbose = 0;

/*
** Malloc, aborting if it fails.
*/
void *safe_malloc(int nByte){
  void *x = malloc(nByte);
  if( x==0 ){
    fprintf(stderr, "failed to allocate %d bytes\n", nByte);
    exit(1);
  }
  return x;
}
void *safe_realloc(void *pOld, int nByte){
  void *x = realloc(pOld, nByte);
  if( x==0 ){
    fprintf(stderr, "failed to allocate %d bytes\n", nByte);
    exit(1);
  }
  return x;
}

/*
** Read the entire content of the file named zFilename into memory obtained
** from malloc().   Add a zero-terminator to the end.
** Return a pointer to that memory.
*/
static char *read_file(const char *zFilename){
  FILE *in;
  char *z;
  int nByte;
  int got;
  in = fopen(zFilename, "rb");
  if( in==0 ){
    return 0;
  }
  fseek(in, 0, SEEK_END);
  nByte = ftell(in);
  fseek(in, 0, SEEK_SET);
  z = safe_malloc( nByte+1 );
  got = fread(z, 1, nByte, in);
  z[got] = 0;
  fclose(in);
  return z;
}

/*
** When parsing the input file, the following token types are recognized.
*/
#define TK_SPACE      1      /* Whitespace or comments */
#define TK_ID         2      /* An identifier */
#define TK_STR        3      /* A string literal in double-quotes */
#define TK_OTHER      4      /* Any other token */
#define TK_EOF       99      /* End of file */

/*
** Determine the length and type of the token beginning at z[0]
*/
static int token_length(const char *z, int *pType, int *pLN){
  int i;
  if( z[0]==0 ){
    *pType = TK_EOF;
    return 0;
  }
  if( z[0]=='"' || z[0]=='\'' ){
    for(i=1; z[i] && z[i]!=z[0]; i++){
      if( z[i]=='\\' && z[i+1]!=0 ){
        if( z[i+1]=='\n' ) (*pLN)++;
        i++;
      }
    }
    if( z[i]!=0 ) i++;
    *pType = z[0]=='"' ? TK_STR : TK_OTHER;
    return i;
  }
  if( isalnum(z[0]) || z[0]=='_' ){
    for(i=1; isalnum(z[i]) || z[i]=='_'; i++){}
    *pType = isalpha(z[0]) || z[0]=='_' ? TK_ID : TK_OTHER;
    return i;
  }
  if( isspace(z[0]) ){
    if( z[0]=='\n' ) (*pLN)++;
    for(i=1; isspace(z[i]); i++){
      if( z[i]=='\n' ) (*pLN)++;
    }
    *pType = TK_SPACE;
    return i;
  }
  if( z[0]=='/' && z[1]=='*' ){
    for(i=2; z[i] && (z[i]!='*' || z[i+1]!='/'); i++){
      if( z[i]=='\n' ) (*pLN)++;
    }
    if( z[i] ) i += 2;
    *pType = TK_SPACE;
    return i;
  }
  if( z[0]=='/' && z[1]=='/' ){
    for(i=2; z[i] && z[i]!='\n'; i++){}
    if( z[i] ){
      (*pLN)++;
      i++;
    }
    *pType = TK_SPACE;
    return i;
  }
  if( z[0]=='\\' && (z[1]=='\n' || (z[1]=='\r' && z[2]=='\n')) ){
    *pType = TK_SPACE;
    return 1;
  }
  *pType = TK_OTHER;
  return 1;
}

/*
** Return the next non-whitespace token
*/
const char *next_non_whitespace(const char *z, int *pLen, int *pType){
  int len;
  int eType;
  int ln = 0;
  while( (len = token_length(z, &eType, &ln))>0 && eType==TK_SPACE ){
    z += len;
  }
  *pLen = len;
  *pType = eType;
  return z;
}

/*
** Return index into z[] for the first balanced TK_OTHER token with
** value cValue.
*/
static int distance_to(const char *z, char cVal){
  int len;
  int dist = 0;
  int eType;
  int nNest = 0;
  int ln = 0;
  while( z[0] && (len = token_length(z, &eType, &ln))>0 ){
    if( eType==TK_OTHER ){
      if( z[0]==cVal && nNest==0 ){
        break;
      }else if( z[0]=='(' ){
        nNest++;
      }else if( z[0]==')' ){
        nNest--;
      }
    }
    dist += len;
    z += len;
  }
  return dist;
}

/*
** Return the first non-whitespace characters in z[]
*/
static const char *skip_space(const char *z){
  while( isspace(z[0]) ){ z++; }
  return z;
}

/*
** Remove excess whitespace and nested "()" from string z.
*/
static char *simplify_expr(char *z){
  int n = (int)strlen(z);
  while( n>0 ){
    if( isspace(z[0]) ){
      z++;
      n--;
      continue;
    }
    if( z[0]=='(' && z[n-1]==')' ){
      z++;
      n -= 2;
      continue;
    }
    break;
  }
  z[n] = 0;
  return z;
}

/*
** Return true if the input is a string literal.
*/
static int is_string_lit(const char *z){
  int nu1, nu2;
  z = next_non_whitespace(z, &nu1, &nu2);
  if( strcmp(z, "NULL")==0 ) return 1;
  return z[0]=='"';
}

/*
** Return true if the input is an expression of string literals:
**
**      EXPR ? "..." : "..."
*/
static int is_string_expr(const char *z){
  int len = 0, eType;
  const char *zOrig = z;
  len = distance_to(z, '?');
  if( z[len]==0 && skip_space(z)[0]=='(' ){
    z = skip_space(z) + 1;
    len = distance_to(z, '?');
  }
  z += len;
  if( z[0]=='?' ){
    z++;
    z = next_non_whitespace(z, &len, &eType);
    if( eType==TK_STR ){
      z += len;
      z = next_non_whitespace(z, &len, &eType);
      if( eType==TK_OTHER && z[0]==':' ){
        z += len;
        z = next_non_whitespace(z, &len, &eType);
        if( eType==TK_STR ){
          z += len;
          z = next_non_whitespace(z, &len, &eType);
          if( eType==TK_EOF ) return 1;
          if( eType==TK_OTHER && z[0]==')' && skip_space(zOrig)[0]=='(' ){
            z += len;
            z = next_non_whitespace(z, &len, &eType);
            if( eType==TK_EOF ) return 1;
          }
        }
      }
    }
  }
  return 0;
}

/*
** A list of functions that return strings that are safe to insert into
** SQL using %s.
*/
static const char *azSafeFunc[] = {
  "filename_collation",
  "leaf_is_closed_sql",
  "timeline_query_for_www",
  "timeline_query_for_tty",
  "blob_sql_text",
  "glob_expr",
  "fossil_all_reserved_names",
  "configure_inop_rhs",
  "db_setting_inop_rhs",
};

/*
** Return true if the input is an argument that is safe to use with %s
** while building an SQL statement.
*/
static int is_sql_safe(const char *z){
  int len, eType;
  int i;

  /* A string literal is safe for use with %s */
  if( is_string_lit(z) ) return 1;

  /* Certain functions are guaranteed to return a string that is safe
  ** for use with %s */
  z = next_non_whitespace(z, &len, &eType);
  for(i=0; i<sizeof(azSafeFunc)/sizeof(azSafeFunc[0]); i++){
    if( eType==TK_ID
     && strncmp(z, azSafeFunc[i], len)==0
     && strlen(azSafeFunc[i])==len
    ){
      return 1;
    }
  }

  /* Expressions of the form:  EXPR ? "..." : "...." can count as
  ** a string literal. */
  if( is_string_expr(z) ) return 1;

  /* If the "safe-for-%s" comment appears in the argument, then
  ** let it through */
  if( strstr(z, "/*safe-for-%s*/")!=0 ) return 1;

  return 0;
}

/*
** Return true if the input is an argument that is never safe for use
** with %s.
*/
static int never_safe(const char *z){
  if( strstr(z,"/*safe-for-%s*/")!=0 ) return 0;
  if( z[0]=='P' ){
    if( strncmp(z,"PIF(",4)==0 ) return 0;
    if( strncmp(z,"PCK(",4)==0 ) return 0;
    return 1;
  }
  if( strncmp(z,"cgi_param",9)==0 ) return 1;
  return 0;
}

/*
** Processing flags
*/
#define FMT_SQL   0x00001     /* Generator for SQL text */
#define FMT_HTML  0x00002     /* Generator for HTML text */
#define FMT_URL   0x00004     /* Generator for URLs */
#define FMT_SAFE  0x00008     /* Generator for human-readable text */
#define FMT_LIT   0x00010     /* Just verify that a string literal */
#define FMT_PX    0x00020     /* Must have a literal prefix in format string */

/*
** A list of internal Fossil interfaces that take a printf-style format
** string.
*/
struct FmtFunc {
  const char *zFName;    /* Name of the function */
  int iFmtArg;           /* Index of format argument.  Leftmost is 1. */
  unsigned fmtFlags;     /* Processing flags */
} aFmtFunc[] = {
  { "admin_log",                  1, FMT_SAFE },
  { "audit_append",               3, FMT_SAFE },
  { "backofficeTrace",            1, FMT_SAFE },
  { "blob_append_sql",            2, FMT_SQL },
  { "blob_appendf",               2, FMT_SAFE },
  { "cgi_debug",                  1, FMT_SAFE },
  { "cgi_panic",                  1, FMT_SAFE },
  { "cgi_printf",                 1, FMT_HTML },
  { "cgi_printf_header",          1, FMT_HTML },
  { "cgi_redirectf",              1, FMT_URL },
  { "chref",                      2, FMT_URL },
  { "CX",                         1, FMT_HTML },
  { "db_blob",                    2, FMT_SQL },
  { "db_debug",                   1, FMT_SQL },
  { "db_double",                  2, FMT_SQL },
  { "db_err",                     1, FMT_SAFE },
  { "db_exists",                  1, FMT_SQL },
  { "db_get_mprintf",             2, FMT_SAFE },
  { "db_int",                     2, FMT_SQL },
  { "db_int64",                   2, FMT_SQL },
  { "db_lset",                    1, FMT_LIT },
  { "db_lset_int",                1, FMT_LIT },
  { "db_multi_exec",              1, FMT_SQL },
  { "db_optional_sql",            2, FMT_SQL },
  { "db_prepare",                 2, FMT_SQL },
  { "db_prepare_ignore_error",    2, FMT_SQL },
  { "db_set",                     1, FMT_LIT },
  { "db_set_int",                 1, FMT_LIT },
  { "db_set_mprintf",             3, FMT_PX },
  { "db_static_prepare",          2, FMT_SQL },
  { "db_text",                    2, FMT_SQL },
  { "db_unset",                   1, FMT_LIT },
  { "db_unset_mprintf",           2, FMT_PX },
  { "emailerError",               2, FMT_SAFE },
  { "entry_attribute",            4, FMT_LIT },
  { "fileedit_ajax_error",        2, FMT_SAFE },
  { "form_begin",                 2, FMT_URL },
  { "fossil_error",               2, FMT_SAFE },
  { "fossil_errorlog",            1, FMT_SAFE },
  { "fossil_fatal",               1, FMT_SAFE },
  { "fossil_fatal_recursive",     1, FMT_SAFE },
  { "fossil_panic",               1, FMT_SAFE },
  { "fossil_print",               1, FMT_SAFE },
  { "fossil_trace",               1, FMT_SAFE },
  { "fossil_warning",             1, FMT_SAFE },
  { "href",                       1, FMT_URL },
  { "json_new_string_f",          1, FMT_SAFE },
  { "json_set_err",               2, FMT_SAFE },
  { "json_warn",                  2, FMT_SAFE },
  { "mprintf",                    1, FMT_SAFE },
  { "multiple_choice_attribute",  3, FMT_LIT },
  { "onoff_attribute",            3, FMT_LIT },
  { "pop3_print",                 2, FMT_SAFE },
  { "smtp_send_line",             2, FMT_SAFE },
  { "smtp_server_send",           2, FMT_SAFE },
  { "socket_set_errmsg",          1, FMT_SAFE },
  { "ssl_set_errmsg",             1, FMT_SAFE },
  { "style_header",               1, FMT_HTML },
  { "style_set_current_page",     1, FMT_URL },
  { "style_submenu_element",      2, FMT_URL },
  { "style_submenu_sql",          3, FMT_SQL },
  { "textarea_attribute",         5, FMT_LIT },
  { "tktsetup_generic",           1, FMT_LIT },
  { "webpage_error",              1, FMT_SAFE },
  { "xfersetup_generic",          1, FMT_LIT },
  { "xhref",                      2, FMT_URL },
};

/*
** Comparison function for two FmtFunc entries
*/
static int fmtfunc_cmp(const void *pAA, const void *pBB){
  const struct FmtFunc *pA = (const struct FmtFunc*)pAA;
  const struct FmtFunc *pB = (const struct FmtFunc*)pBB;
  return strcmp(pA->zFName, pB->zFName);
}

/*
** Determine if the indentifier zIdent of length nIndent is a Fossil
** internal interface that uses a printf-style argument.  Return zero if not.
** Return the index of the format string if true with the left-most
** argument having an index of 1.
*/
static int isFormatFunc(const char *zIdent, int nIdent, unsigned *pFlags){
  int upr, lwr;
  lwr = 0;
  upr = sizeof(aFmtFunc)/sizeof(aFmtFunc[0]) - 1;
  while( lwr<=upr ){
    unsigned x = (lwr + upr)/2;
    int c = strncmp(zIdent, aFmtFunc[x].zFName, nIdent);
    if( c==0 ){
      if( aFmtFunc[x].zFName[nIdent]==0 ){
        *pFlags = aFmtFunc[x].fmtFlags;
        return aFmtFunc[x].iFmtArg;
      }
      c = -1;
    }
    if( c<0 ){
      upr = x - 1;
    }else{
      lwr = x + 1;
    }
  }
  *pFlags = 0;
  return 0;
}

/*
** Return the expected number of arguments for the format string.
** Return -1 if the value cannot be computed.
**
** For each argument less than nType, store the conversion character
** for that argument in cType[i].
**
** Store the number of initial literal characters of the format string
** in *pInit.
*/
static int formatArgCount(const char *z, int nType, char *cType, int *pInit){
  int nArg = 0;
  int i, k;
  int len;
  int eType;
  int ln = 0;
  *pInit = 0;
  while( z[0] ){
    len = token_length(z, &eType, &ln);
    if( eType==TK_STR ){
      for(i=1; i<len-1 && isalpha(z[i]); i++){}
      *pInit = i-1;
      for(i=1; i<len-1; i++){
        if( z[i]!='%' ) continue;
        if( z[i+1]=='%' ){ i++; continue; }
        for(k=i+1; k<len && !isalpha(z[k]); k++){
          if( z[k]=='*' || z[k]=='#' ){
            if( nArg<nType ) cType[nArg] = z[k];
            nArg++;
          }
        }
        if( z[k]!='R' ){
          if( nArg<nType ) cType[nArg] = z[k];
          nArg++;
        }
      }
    }
    z += len;
  }
  return nArg;
}

/*
** The function call that begins at zFCall[0] (which is on line lnFCall of the
** original file) is a function that uses a printf-style format string
** on argument number fmtArg.  It has processings flags fmtFlags.  Do
** compile-time checking on this function, output any errors, and return
** the number of errors.
*/
static int checkFormatFunc(
  const char *zFilename, /* Name of the file being processed */
  const char *zFCall,    /* Pointer to start of function call */
  int lnFCall,           /* Line number that holds z[0] */
  int fmtArg,            /* Format string should be this argument */
  int fmtFlags           /* Extra processing flags */
){
  int szFName;
  int eToken;
  int ln = lnFCall;
  int len;
  const char *zStart;
  char *z;
  char *zCopy;
  int nArg = 0;
  const char **azArg = 0;
  int i, k;
  int nErr = 0;
  char *acType;
  int nInit = 0;

  szFName = token_length(zFCall, &eToken, &ln);
  zStart = next_non_whitespace(zFCall+szFName, &len, &eToken);
  assert( zStart[0]=='(' && len==1 );
  len = distance_to(zStart+1, ')');
  zCopy = safe_malloc( len + 1 );
  memcpy(zCopy, zStart+1, len);
  zCopy[len] = 0;
  azArg = 0;
  nArg = 0;
  z = zCopy;
  while( z[0] ){
    char cEnd;
    len = distance_to(z, ',');
    cEnd = z[len];
    z[len] = 0;
    azArg = safe_realloc((char*)azArg, (sizeof(azArg[0])+1)*(nArg+1));
    azArg[nArg++] = simplify_expr(z);
    if( cEnd==0 ) break;
    z += len + 1;
  }
  acType = (char*)&azArg[nArg];
  if( fmtArg>nArg ){
    printf("%s:%d: too few arguments to %.*s()\n",
           zFilename, lnFCall, szFName, zFCall);
    nErr++;
  }else{
    const char *zFmt = azArg[fmtArg-1];
    const char *zOverride = strstr(zFmt, "/*works-like:");
    if( zOverride ) zFmt = zOverride + sizeof("/*works-like:")-1;
    if( fmtFlags & FMT_LIT ){
      if( !is_string_lit(zFmt) ){
        printf("%s:%d: argument %d to %.*s() should be a string literal\n",
               zFilename, lnFCall, fmtArg, szFName, zFCall);
        nErr++;
      }
    }else if( !is_string_lit(zFmt) ){
      printf("%s:%d: %.*s() has non-constant format on arg[%d]\n",
             zFilename, lnFCall, szFName, zFCall, fmtArg-1);
      nErr++;
    }else if( (k = formatArgCount(zFmt, nArg, acType, &nInit))>=0
             && nArg!=fmtArg+k ){
      printf("%s:%d: too %s arguments to %.*s() "
             "- got %d and expected %d\n",
             zFilename, lnFCall, (nArg<fmtArg+k ? "few" : "many"),
             szFName, zFCall, nArg, fmtArg+k);
      nErr++;
    }else if( (fmtFlags & FMT_PX)!=0 ){
      if( nInit==0 ){
        printf("%s:%d: format string on %.*s() should have"
               " an ASCII character prefix\n",
          zFilename, lnFCall, szFName, zFCall);
        nErr++;
      }
    }else if( (fmtFlags & FMT_SAFE)==0 ){
      for(i=0; i<nArg && i<k; i++){
        if( (acType[i]=='s' || acType[i]=='z' || acType[i]=='b') ){
          const char *zExpr = azArg[fmtArg+i];
          if( never_safe(zExpr) ){
            printf("%s:%d: Argument %d to %.*s() is not safe for"
                   " a query parameter\n",
               zFilename, lnFCall, i+fmtArg, szFName, zFCall);
             nErr++;
   
          }else if( (fmtFlags & FMT_SQL)!=0 && !is_sql_safe(zExpr) ){
            printf("%s:%d: Argument %d to %.*s() not safe for SQL\n",
               zFilename, lnFCall, i+fmtArg, szFName, zFCall);
             nErr++;
          }
        }
      }
    }
  }
  if( nErr ){
    for(i=0; i<nArg; i++){
      printf("   arg[%d]: %s\n", i, azArg[i]);
    }
  }else if( eVerbose>1 ){
    printf("%s:%d: %.*s() ok for %d arguments\n",
      zFilename, lnFCall, szFName, zFCall, nArg);
  }
  free((char*)azArg);
  free(zCopy);
  return nErr;
}


/*
** Do a design-rule check of format strings for the file named zName
** with content zContent.  Write errors on standard output.  Return
** the number of errors.
*/
static int scan_file(const char *zName, const char *zContent){
  const char *z;
  int ln = 0;
  int szToken;
  int eToken;
  const char *zPrev = 0;
  int ePrev = 0;
  int szPrev = 0;
  int lnPrev = 0;
  int nCurly = 0;
  int x;
  unsigned fmtFlags = 0;
  int nErr = 0;

  if( zContent==0 ){
    printf("cannot read file: %s\n", zName);
    return 1;
  }
  for(z=zContent; z[0]; z += szToken){
    szToken = token_length(z, &eToken, &ln);
    if( eToken==TK_SPACE ) continue;
    if( eToken==TK_OTHER ){
      if( z[0]=='{' ){
        nCurly++;
      }else if( z[0]=='}' ){
        nCurly--;
      }else if( nCurly>0 && z[0]=='(' && ePrev==TK_ID
            && (x = isFormatFunc(zPrev,szPrev,&fmtFlags))>0 ){
        nErr += checkFormatFunc(zName, zPrev, lnPrev, x, fmtFlags);
      }
    }
    zPrev = z;
    ePrev = eToken;
    szPrev = szToken;
    lnPrev = ln;
  }
  return nErr;
}

/*
** Check for format-string design rule violations on all files listed
** on the command-line.
**
** The eVerbose global variable is incremented with each "-v" argument.
*/
int main(int argc, char **argv){
  int i;
  int nErr = 0;
  qsort(aFmtFunc, sizeof(aFmtFunc)/sizeof(aFmtFunc[0]),
        sizeof(aFmtFunc[0]), fmtfunc_cmp);
  for(i=1; i<argc; i++){
    char *zFile;
    if( strcmp(argv[i],"-v")==0 ){
      eVerbose++;
      continue;
    }
    if( eVerbose>0 ) printf("Processing %s...\n", argv[i]);
    zFile = read_file(argv[i]);
    nErr += scan_file(argv[i], zFile);
    free(zFile);
  }
  return nErr;
}