1
2
3
4
5
6
7 from htmllib import HTMLParser
8 from cgi import escape
9 from urlparse import urlparse
10 from formatter import AbstractFormatter
11 from htmlentitydefs import entitydefs
12 from xml.sax.saxutils import quoteattr
13
14 __all__=['sanitize']
15
17 """Gets rid of < and > and & and, for good measure, :"""
18 return escape(text, quote=True).replace(':',':')
19
def __init__(self,
             permitted_tags = ['a','b','blockquote','br/','i', 'li', 'ol', 'ul', 'p', 'cite','code','pre','img/'],
             allowed_attributes={'a':['href','title'],'img':['src','alt'],'blockquote':['type']},
             fmt = AbstractFormatter):
    """
    Set up the whitelists and the empty output state.

    permitted_tags -- tag names that are let through. A name written
        with a trailing '/' is stored without the slash and is also
        recorded in self.requires_no_close.
    allowed_attributes -- maps a tag name to the attribute names that
        may be kept on that tag.
    fmt -- handed unchanged to HTMLParser.__init__.
    """
    HTMLParser.__init__(self, fmt)

    # Output accumulated so far and the tags opened so far.
    self.result = ""
    self.open_tags = []

    # First pass: names without a trailing slash.
    paired_tags = []
    for name in permitted_tags:
        if name[-1] != '/':
            paired_tags.append(name)

    # Second pass: names with a trailing slash, stored without it.
    no_close_tags = []
    for name in permitted_tags:
        if name[-1] == '/':
            no_close_tags.append(name[:-1])

    self.requires_no_close = no_close_tags
    # Both kinds of tag are permitted; build a separate list so the two
    # attributes never share one list object.
    self.permitted_tags = paired_tags + no_close_tags

    # NOTE: stored by reference, so the default dict above is shared by
    # every instance created without an explicit allowed_attributes.
    self.allowed_attributes = allowed_attributes

    # URL schemes accepted when attribute values are checked.
    self.allowed_schemes = ['http','https','ftp']
38 if len(ref) < 7 and ref.isdigit(): self.result += '&#%s;' % ref
39 else: self.result += xssescape('&#%s' % ref)
41 if ref in entitydefs: self.result += '&%s;' % ref
42 else: self.result += xssescape('&%s' % ref)
46 if tag not in self.permitted_tags:
47 self.result += xssescape("<%s>" % tag)
48 else:
49 bt = "<" + tag
50 if tag in self.allowed_attributes:
51 attrs = dict(attrs)
52 self.allowed_attributes_here = \
53 [x for x in self.allowed_attributes[tag] if x in attrs \
54 and len(attrs[x]) > 0]
55 for attribute in self.allowed_attributes_here:
56 if attribute in ['href', 'src', 'background']:
57 if self.url_is_acceptable(attrs[attribute]):
58 bt += ' %s="%s"' % (attribute, attrs[attribute])
59 else:
60 bt += ' %s=%s' % (xssescape(attribute), quoteattr(attrs[attribute]))
61 if bt == "<a" or bt == "<img": return
62 if tag in self.requires_no_close: bt += "/"
63 bt += ">"
64 self.result += bt
65 self.open_tags.insert(0, tag)
67 bracketed = "</%s>" % tag
68 if tag not in self.permitted_tags:
69 self.result += xssescape(bracketed)
70 elif tag in self.open_tags:
71 self.result += bracketed
72 self.open_tags.remove(tag)
78 """
79 Requires all URLs to be "absolute."
80 """
81 parsed = urlparse(url)
82 return parsed[0] in self.allowed_schemes and '.' in parsed[1]
def strip(self, rawstring):
    """
    Returns the argument stripped of potentially harmful
    HTML or Javascript code

    rawstring is passed to self.feed(); whatever ends up in
    self.result is returned, followed by a closing tag for every
    entry of self.open_tags that is not listed in
    self.requires_no_close.
    """
    # Start every call from a clean slate. Resetting only the result
    # would let tags left open by an earlier call on the same instance
    # be closed again in this call's output.
    self.result = ""
    self.open_tags = []
    self.feed(rawstring)

    # Close whatever the input left open; tags that need no closing
    # tag are skipped.
    closing_tags = ["</%s>" % tag
                    for tag in self.open_tags
                    if tag not in self.requires_no_close]
    self.result += "".join(closing_tags)
    return self.result
107
def sanitize(text,
             permitted_tags = ['a','b','blockquote','br/','i', 'li', 'ol','ul', 'p', 'cite','code','pre','img/'],
             allowed_attributes={'a':['href','title'],'img':['src','alt'],'blockquote':['type']}):
    """
    Return text cleaned by a fresh XssCleaner.

    text -- the markup to clean.
    permitted_tags -- forwarded to XssCleaner as permitted_tags.
    allowed_attributes -- forwarded to XssCleaner as allowed_attributes.

    The default containers are only read here, never modified.
    """
    # The original passed the undefined name 'permitted_tagsa', which
    # raised NameError on every call.
    cleaner = XssCleaner(permitted_tags=permitted_tags,
                         allowed_attributes=allowed_attributes)
    return cleaner.strip(text)
110