1
2
3
4 """
5 ::
6
7 # from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/496942
8 # Title: Cross-site scripting (XSS) defense
9 # Submitter: Josh Goldfoot (other recipes)
10 # Last Updated: 2006/08/05
11 # Version no: 1.0
12
13 """
14
15
16 from htmllib import HTMLParser
17 from cgi import escape
18 from urlparse import urlparse
19 from formatter import AbstractFormatter
20 from htmlentitydefs import entitydefs
21 from xml.sax.saxutils import quoteattr
22
23 __all__ = ['sanitize']
24
25
def xssescape(text):
    """Gets rid of < and > and & and, for good measure, :

    ``text`` is first passed through ``escape(text, quote=True)`` and every
    ``:`` in the escaped result is then replaced by the numeric character
    reference ``&#58;``.

    Returns the escaped string.
    """
    # The colon must be replaced AFTER escape(): done beforehand, escape()
    # would turn the ``&`` of ``&#58;`` into ``&amp;``.
    return escape(text, quote=True).replace(':', '&#58;')
30
31
33
def __init__(
    self,
    permitted_tags=[
        'a',
        'b',
        'blockquote',
        'br/',
        'i',
        'li',
        'ol',
        'ul',
        'p',
        'cite',
        'code',
        'pre',
        'img/',
        ],
    allowed_attributes={'a': ['href', 'title'], 'img': ['src', 'alt'],
                        'blockquote': ['type']},
    fmt=AbstractFormatter,
    strip_disallowed=False
    ):
    """Set up the cleaner's tag/attribute whitelists and output state.

    @param permitted_tags: tag names that may appear in the output. A name
        written with a trailing ``/`` (e.g. ``'br/'``) is recorded, without
        the slash, in ``self.requires_no_close`` as well as in
        ``self.permitted_tags``.
    @param allowed_attributes: mapping of tag name to the attribute names
        kept on that tag. It is copied, so later changes to this instance's
        table never leak into the caller's dict or the shared default.
    @param fmt: passed unchanged to ``HTMLParser.__init__``.
    @param strip_disallowed: stored as ``self.strip_disallowed``; selects
        between dropping and escaping tags that are not permitted.
    """

    HTMLParser.__init__(self, fmt)

    # Output accumulated so far and the permitted tags currently open.
    self.result = ''
    self.open_tags = []

    # endswith() rather than name[-1] so an empty tag name cannot raise
    # IndexError.
    self.requires_no_close = [name[:-1] for name in permitted_tags
                              if name.endswith('/')]
    self.permitted_tags = [name for name in permitted_tags
                           if not name.endswith('/')]
    self.permitted_tags += self.requires_no_close

    # Defensive copy: the default argument is a single dict object shared
    # by every call that does not pass its own.
    self.allowed_attributes = dict(
        (tag, list(names)) for (tag, names) in allowed_attributes.items())

    # The only URL schemes accepted by url_is_acceptable().
    self.allowed_schemes = ['http', 'https', 'ftp']

    # Whether disallowed tags are stripped (True) or escaped (False), and
    # whether the parser is currently inside a stripped tag.
    self.strip_disallowed = strip_disallowed
    self.in_disallowed = False
74
76 if data and not self.in_disallowed:
77 self.result += xssescape(data)
78
80 if self.in_disallowed:
81 return
82 elif len(ref) < 7 and ref.isdigit():
83 self.result += '&#%s;' % ref
84 else:
85 self.result += xssescape('&#%s' % ref)
86
88 if self.in_disallowed:
89 return
90 elif ref in entitydefs:
91 self.result += '&%s;' % ref
92 else:
93 self.result += xssescape('&%s' % ref)
94
100
107 if tag not in self.permitted_tags:
108 if self.strip_disallowed:
109 self.in_disallowed = True
110 else:
111 self.result += xssescape('<%s>' % tag)
112 else:
113 bt = '<' + tag
114 if tag in self.allowed_attributes:
115 attrs = dict(attrs)
116 self.allowed_attributes_here = [x for x in
117 self.allowed_attributes[tag] if x in attrs
118 and len(attrs[x]) > 0]
119 for attribute in self.allowed_attributes_here:
120 if attribute in ['href', 'src', 'background']:
121 if self.url_is_acceptable(attrs[attribute]):
122 bt += ' %s="%s"' % (attribute,
123 attrs[attribute])
124 else:
125 bt += ' %s=%s' % (xssescape(attribute),
126 quoteattr(attrs[attribute]))
127 if bt == '<a' or bt == '<img':
128 return
129 if tag in self.requires_no_close:
130 bt += ' /'
131 bt += '>'
132 self.result += bt
133 self.open_tags.insert(0, tag)
134
136 bracketed = '</%s>' % tag
137 if tag not in self.permitted_tags:
138 if self.strip_disallowed:
139 self.in_disallowed = False
140 else:
141 self.result += xssescape(bracketed)
142 elif tag in self.open_tags:
143 self.result += bracketed
144 self.open_tags.remove(tag)
145
148
151
def url_is_acceptable(self, url):
    """
    Accepts relative and absolute urls

    A URL is accepted when either:

      - its scheme is one of ``self.allowed_schemes`` and its network
        location contains a ``.``; or
      - it has no scheme and its path starts with ``/``.

    Anything else (other schemes, scheme-less paths that do not start
    with ``/``, dot-less hosts) is rejected.

    @type url: string
    @param url: the attribute value to check
    @return: True if the URL may be emitted, False otherwise
    """

    parsed = urlparse(url)
    scheme, netloc, path = parsed[0], parsed[1], parsed[2]
    if scheme in self.allowed_schemes and '.' in netloc:
        return True
    return scheme == '' and path.startswith('/')
160
def strip(self, rawstring, escape=True):
    """
    Returns the argument stripped of potentially harmful
    HTML or Javascript code

    Every call starts from a clean output state, so one instance can be
    reused for several strings without closing tags or the
    "inside a disallowed tag" flag leaking from one call into the next.

    @type rawstring: string
    @param rawstring: the markup to clean
    @type escape: boolean
    @param escape: If True (default) it escapes the potentially harmful
        content, otherwise remove it
    @return: the cleaned markup, with closing tags appended for any
        permitted tags that were left open
    """

    # NOTE(review): non-str input is returned through str() without being
    # fed to the parser, i.e. it is NOT sanitized - confirm callers rely
    # on this before tightening it.
    if not isinstance(rawstring, str):
        return str(rawstring)

    # Rewrite '<br/>' as '<br />' for every tag that needs no closing tag
    # (presumably because the underlying parser mishandles the compact
    # form - TODO confirm).
    for tag in self.requires_no_close:
        rawstring = rawstring.replace("<%s/>" % tag, "<%s />" % tag)

    # escape=False switches this instance to stripping; the flag stays set
    # for later calls, as before.
    if not escape:
        self.strip_disallowed = True

    # Reset all per-call state, not just the output buffer.
    self.result = ''
    self.open_tags = []
    self.in_disallowed = False

    self.feed(rawstring)

    # Close whatever permitted tags the input left open.
    self.result += ''.join('</%s>' % endtag for endtag in self.open_tags
                           if endtag not in self.requires_no_close)
    return self.result
182
196
197
def sanitize(text, permitted_tags=[
    'a',
    'b',
    'blockquote',
    'br/',
    'i',
    'li',
    'ol',
    'ul',
    'p',
    'cite',
    'code',
    'pre',
    'img/',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'table', 'tr', 'td', 'div',
    ],
    allowed_attributes={
    'a': ['href', 'title'],
    'img': ['src', 'alt'],
    'blockquote': ['type'],
    'td': ['colspan'],
    },
    escape=True):
    """Return ``text`` with every tag/attribute outside the whitelists
    escaped or removed.

    @param text: the markup to clean. A value that is not a ``str`` is
        converted with ``str()`` first and then cleaned like any other
        input, so it cannot bypass the sanitizer.
    @param permitted_tags: tag names allowed in the output; a trailing
        ``/`` marks a tag that needs no closing tag.
    @param allowed_attributes: mapping of tag name to the attribute names
        kept on that tag.
    @param escape: If True (default) disallowed content is escaped,
        otherwise it is removed.
    @return: the cleaned markup as a ``str``
    """
    # Coerce instead of returning early: the previous early return handed
    # back str(text) untouched, so non-str input (e.g. unicode on Python 2)
    # skipped sanitization entirely.
    if not isinstance(text, str):
        text = str(text)
    cleaner = XssCleaner(permitted_tags=permitted_tags,
                         allowed_attributes=allowed_attributes)
    return cleaner.strip(text, escape)
225