Package web2py :: Package gluon :: Module sanitizer
[hide private]
[frames | no frames]

Source Code for Module web2py.gluon.sanitizer

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3   
  4  """ 
  5  :: 
  6   
  7      # from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/496942 
  8      # Title: Cross-site scripting (XSS) defense 
  9      # Submitter: Josh Goldfoot (other recipes) 
 10      # Last Updated: 2006/08/05 
 11      # Version no: 1.0 
 12   
 13  """ 
 14   
 15   
 16  from htmllib import HTMLParser 
 17  from cgi import escape 
 18  from urlparse import urlparse 
 19  from formatter import AbstractFormatter 
 20  from htmlentitydefs import entitydefs 
 21  from xml.sax.saxutils import quoteattr 
 22   
 23  __all__ = ['sanitize'] 
 24   
 25   
def xssescape(text):
    """
    Returns text with HTML special characters replaced by entities

    The markup characters <, > and & are escaped, double quotes are
    escaped as well (quote=True is passed to escape) and, for good
    measure, every colon is rewritten as the numeric reference &#58;

    @type text: string
    @param text: the text to neutralise
    """

    escaped = escape(text, quote=True)
    # Colons are replaced last; none of the entities produced above
    # contains one, so nothing gets escaped twice.
    return escaped.replace(':', '&#58;')
30 31
class XssCleaner(HTMLParser):
    """
    HTML parser that rebuilds its input as a whitelisted subset of HTML

    Permitted tags are re-emitted with only their allowed attributes;
    everything else is either escaped (default) or dropped, depending on
    strip_disallowed. The sanitized markup accumulates in self.result.
    """

    def __init__(
        self,
        permitted_tags=[
            'a',
            'b',
            'blockquote',
            'br/',
            'i',
            'li',
            'ol',
            'ul',
            'p',
            'cite',
            'code',
            'pre',
            'img/',
            ],
        allowed_attributes={'a': ['href', 'title'], 'img': ['src', 'alt'
                            ], 'blockquote': ['type']},
        fmt=AbstractFormatter,
        strip_disallowed=False
        ):
        """
        @type permitted_tags: list of strings
        @param permitted_tags: tag names to let through; a trailing '/'
            (e.g. 'br/') marks a tag that requires no closing tag
        @type allowed_attributes: dict
        @param allowed_attributes: maps a tag name to the list of
            attribute names that may be kept on that tag
        @param fmt: formatter handed to HTMLParser.__init__
        @type strip_disallowed: boolean
        @param strip_disallowed: if True the content of disallowed tags
            is dropped, otherwise the tags are escaped
        """

        HTMLParser.__init__(self, fmt)
        self.result = ''
        self.open_tags = []
        self.permitted_tags = [i for i in permitted_tags if i[-1] != '/']
        self.requires_no_close = [i[:-1] for i in permitted_tags
                                  if i[-1] == '/']
        self.permitted_tags += self.requires_no_close
        self.allowed_attributes = allowed_attributes

        # The only schemes allowed in URLs (for href and src attributes).
        # Adding "javascript" or "vbscript" to this list would not be smart.

        self.allowed_schemes = ['http', 'https', 'ftp']

        # to strip or escape disallowed tags?
        self.strip_disallowed = strip_disallowed
        self.in_disallowed = False

    def handle_data(self, data):
        """Appends escaped text, unless inside a stripped tag"""
        if data and not self.in_disallowed:
            self.result += xssescape(data)

    def handle_charref(self, ref):
        """
        Passes through short decimal character references (fewer than
        7 digits); anything else is escaped
        """
        if self.in_disallowed:
            return
        elif len(ref) < 7 and ref.isdigit():
            self.result += '&#%s;' % ref
        else:
            self.result += xssescape('&#%s' % ref)

    def handle_entityref(self, ref):
        """Passes through entities listed in entitydefs, escapes the rest"""
        if self.in_disallowed:
            return
        elif ref in entitydefs:
            self.result += '&%s;' % ref
        else:
            self.result += xssescape('&%s' % ref)

    def handle_comment(self, comment):
        """Comments are never emitted as markup, only in escaped form"""
        if self.in_disallowed:
            return
        elif comment:
            self.result += xssescape('<!--%s-->' % comment)

    def handle_starttag(
        self,
        tag,
        method,
        attrs,
        ):
        """
        Emits a permitted start tag with its allowed attributes, or
        escapes/strips a disallowed one

        @param tag: the tag name
        @param method: not used by this implementation
        @param attrs: sequence of (name, value) pairs
        """
        if tag not in self.permitted_tags:
            if self.strip_disallowed:
                self.in_disallowed = True
            else:
                self.result += xssescape('<%s>' % tag)
        else:
            bt = '<' + tag
            if tag in self.allowed_attributes:
                attrs = dict(attrs)
                self.allowed_attributes_here = [x for x in
                        self.allowed_attributes[tag] if x in attrs
                         and len(attrs[x]) > 0]
                for attribute in self.allowed_attributes_here:
                    if attribute in ['href', 'src', 'background']:
                        if self.url_is_acceptable(attrs[attribute]):
                            # The URL must go through quoteattr like any
                            # other value: an acceptable URL may still
                            # contain a double quote, which would close
                            # a hand-written "..." early and let the
                            # rest of the value inject new attributes.
                            bt += ' %s=%s' % (attribute,
                                    quoteattr(attrs[attribute]))
                    else:
                        bt += ' %s=%s' % (xssescape(attribute),
                                quoteattr(attrs[attribute]))
            # An anchor or image left without any attribute is useless,
            # so it is dropped altogether.
            if bt == '<a' or bt == '<img':
                return
            if tag in self.requires_no_close:
                bt += ' /'
            bt += '>'
            self.result += bt
            self.open_tags.insert(0, tag)

    def handle_endtag(self, tag, attrs):
        """
        Emits the closing tag of a permitted tag that is currently open,
        or escapes/strips a disallowed one

        @param tag: the tag name
        @param attrs: not used by this implementation
        """
        bracketed = '</%s>' % tag
        if tag not in self.permitted_tags:
            if self.strip_disallowed:
                self.in_disallowed = False
            else:
                self.result += xssescape(bracketed)
        elif tag in self.open_tags:
            self.result += bracketed
            self.open_tags.remove(tag)

    def unknown_starttag(self, tag, attributes):
        """Routes unknown start tags through handle_starttag"""
        self.handle_starttag(tag, None, attributes)

    def unknown_endtag(self, tag):
        """Routes unknown end tags through handle_endtag"""
        self.handle_endtag(tag, None)

    def url_is_acceptable(self, url):
        """
        Accepts relative and absolute urls

        An absolute url needs a scheme listed in self.allowed_schemes
        and a network location containing a dot; a url without scheme
        is accepted only when its path starts with '/'
        """

        parsed = urlparse(url)
        return (parsed[0] in self.allowed_schemes and '.' in parsed[1]) \
            or (parsed[0] == '' and parsed[2].startswith('/'))

    def strip(self, rawstring, escape=True):
        """
        Returns the argument stripped of potentially harmful
        HTML or Javascript code

        Input that is not a str is converted to one and then sanitized
        like any other input; it is never returned unsanitized.

        @type escape: boolean
        @param escape: If True (default) it escapes the potentially harmful
            content, otherwise remove it. Passing False switches
            self.strip_disallowed on for this instance; it is not
            switched back afterwards.
        """

        if not isinstance(rawstring, str):
            if isinstance(rawstring, type(u'')):
                # Python 2 unicode: encode rather than str(), which
                # would raise on non-ASCII characters
                rawstring = rawstring.encode('utf-8')
            else:
                rawstring = str(rawstring)
        for tag in self.requires_no_close:
            rawstring = rawstring.replace("<%s/>" % tag, "<%s />" % tag)
        if not escape:
            self.strip_disallowed = True
        # Start from a clean slate so that an instance can be reused:
        # drop unprocessed parser data and any tag state left over from
        # a previous call, otherwise stale closing tags get appended.
        self.reset()
        self.result = ''
        self.open_tags = []
        self.in_disallowed = False
        self.feed(rawstring)
        # Close whatever the input left open, innermost tag first
        for endtag in self.open_tags:
            if endtag not in self.requires_no_close:
                self.result += '</%s>' % endtag
        return self.result

    def xtags(self):
        """
        Returns a printable string informing the user which tags are allowed
        """

        tg = ''
        for x in sorted(self.permitted_tags):
            tg += '<' + x
            if x in self.allowed_attributes:
                for y in self.allowed_attributes[x]:
                    tg += ' %s=""' % y
            tg += '> '
        return xssescape(tg.strip())
196 197
def sanitize(text, permitted_tags=[
    'a',
    'b',
    'blockquote',
    'br/',
    'i',
    'li',
    'ol',
    'ul',
    'p',
    'cite',
    'code',
    'pre',
    'img/',
    'h1','h2','h3','h4','h5','h6',
    'table','tr','td','div',
    ],
    allowed_attributes = {
    'a': ['href', 'title'],
    'img': ['src', 'alt'],
    'blockquote': ['type'],
    'td': ['colspan'],
    },
    escape=True):
    """
    Returns text reduced to the permitted HTML tags and attributes

    The work is done by a fresh XssCleaner built for each call. Input
    that is not a str is first converted to one and then sanitized like
    any other input, so it is never returned unsanitized.

    @param text: the markup to sanitize
    @type permitted_tags: list of strings
    @param permitted_tags: tag names to let through; a trailing '/'
        (e.g. 'br/') marks a tag that requires no closing tag
    @type allowed_attributes: dict
    @param allowed_attributes: maps a tag name to the list of attribute
        names that may be kept on that tag
    @type escape: boolean
    @param escape: If True (default) it escapes the potentially harmful
        content, otherwise remove it
    """
    if not isinstance(text, str):
        if isinstance(text, type(u'')):
            # Python 2 unicode: encode rather than str(), which would
            # raise on non-ASCII characters
            text = text.encode('utf-8')
        else:
            text = str(text)
    return XssCleaner(permitted_tags=permitted_tags,
                      allowed_attributes=allowed_attributes).strip(text, escape)
225