| 1 |
"""Exposes several SGMLParser subclasses. |
|---|
| 2 |
|
|---|
| 3 |
This work, including the source code, documentation |
|---|
| 4 |
and related data, is placed into the public domain. |
|---|
| 5 |
|
|---|
| 6 |
The original author is Robert Brewer. |
|---|
| 7 |
|
|---|
| 8 |
THIS SOFTWARE IS PROVIDED AS-IS, WITHOUT WARRANTY |
|---|
| 9 |
OF ANY KIND, NOT EVEN THE IMPLIED WARRANTY OF |
|---|
| 10 |
MERCHANTABILITY. THE AUTHOR OF THIS SOFTWARE |
|---|
| 11 |
ASSUMES _NO_ RESPONSIBILITY FOR ANY CONSEQUENCE |
|---|
| 12 |
RESULTING FROM THE USE, MODIFICATION, OR |
|---|
| 13 |
REDISTRIBUTION OF THIS SOFTWARE. |
|---|
| 14 |
|
|---|
| 15 |
If you don't need thread-safety, you might create a single instance of the |
|---|
| 16 |
parser you want, and feed it yourself. You also might use the classes |
|---|
| 17 |
directly if you need to customize them in some way; for example, you may |
|---|
| 18 |
need to alter the list of unsafe_tags in the Sanitizer class, either |
|---|
| 19 |
per-instance or by subclassing it. |
|---|
| 20 |
|
|---|
| 21 |
If you need thread-safe parsing, you should use the functions provided. |
|---|
| 22 |
They create a new instance each time, so you get a *small* performance |
|---|
| 23 |
hit, but by the same token, each thread can work on its own instance. |
|---|
| 24 |
""" |
|---|
| 25 |
|
|---|
| 26 |
import re |
|---|
| 27 |
import sgmllib |
|---|
| 28 |
import htmlentitydefs |
|---|
| 29 |
from xml.sax.saxutils import quoteattr |
|---|
| 30 |
|
|---|
| 31 |
# Finds the next character that can start markup or a reference: '&' or '<'.
interesting = re.compile(r'[&<]')

# Matches the beginning of a reference, start tag, end tag or declaration
# that has not been terminated yet (it may be completed by later input).
incomplete = re.compile(
    r'&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
    r'<([a-zA-Z][^<>]*|'
    r'/([a-zA-Z][^<>]*)?|'
    r'![^<>]*)?'
)

# Named entity reference: group 1 is the name; the final character of the
# match is whatever terminated the name (not necessarily ';').
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')

# Decimal character reference: group 1 is the digit string; the final
# character of the match is whatever terminated the digits.
charref = re.compile(r'&#([0-9]+)[^0-9]')

# '<' immediately followed by '>' or an ASCII letter.
starttagopen = re.compile(r'<[>a-zA-Z]')
|---|
| 41 |
|
|---|
| 42 |
|
|---|
| 43 |
class MoreReasonableSGMLParser(sgmllib.SGMLParser):
    """An SGMLParser whose handle_entityref receives more information.

    goahead() is overridden so that handle_entityref is called with two
    arguments: the entity name and the single character that terminated
    the name in the input (";" or any other non-alphanumeric character).
    Subclasses must therefore define handle_entityref(name, trailer).
    """

    def goahead(self, end):
        """Consume as much of self.rawdata as possible.

        Text is reported through handle_data, character references through
        handle_charref(name) and entity references through
        handle_entityref(name, trailer); tags, comments, processing
        instructions and declarations are delegated to the parse_* methods.

        If `end` is true, whatever could not be parsed is flushed through
        handle_data; otherwise it is kept in self.rawdata.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Everything that remains is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            # Report the text up to the next '&' or '<' as data.
            match = interesting.search(rawdata, i)
            if match: j = match.start()
            else: j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n: break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # In literal mode a start tag is just text.
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    # A negative result stops the scan here; the unparsed
                    # tail is kept (or flushed as data when `end` is true).
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0: break
                    i = k
                    # An end tag switches literal mode off.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # '<' is the last buffered character: stop and
                        # leave it unconsumed.
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Tested before the generic "<!" branch below so that
                    # comments go to parse_comment.
                    k = self.parse_comment(i)
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    k = self.parse_pi(i)
                    if k < 0: break
                    # Unlike the other parse_* results, this one is added
                    # to i (presumably a length rather than an index --
                    # confirm against sgmllib).
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # Any other "<!..." construct is treated as a
                    # declaration.
                    k = self.parse_declaration(i)
                    if k < 0: break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    # In literal mode '&' is just text.
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The terminator is consumed only when it is ';';
                    # any other character is scanned again.
                    if rawdata[i-1] != ';': i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    i = match.end(0)
                    trailer = rawdata[i-1]
                    self.handle_entityref(name, trailer)
                    # A trailer other than ';' is NOT consumed: it is
                    # scanned again on the next iteration, after the
                    # handler has already seen it.
                    if trailer != ';': i = i-1
                    continue
            else:
                # `interesting` only matches '&' and '<'.
                self.error('neither < nor & ??')
            # Reached only when none of the branches above consumed input.
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                # The construct runs to the end of the buffer: stop here.
                break
            self.handle_data(rawdata[i:j])
            i = j
        # Loop finished: flush the remainder when `end` is true, otherwise
        # keep it in self.rawdata.
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
|---|
| 152 |
|
|---|
| 153 |
|
|---|
| 154 |
|
|---|
| 155 |
class Plaintext(MoreReasonableSGMLParser):
    """Strips all HTML from content.

    Entities are translated to their Unicode equivalents where possible.

    Output fragments are appended to ``self.result``.  This class does not
    create that list, so assign ``result = []`` on the instance before
    feeding it.
    """

    def handle_data(self, data):
        """Keep literal text unchanged."""
        self.result.append(data)

    def handle_charref(self, ref):
        """Emit the character named by a decimal character reference.

        ``ref`` is the digit string that followed ``&#``.  A reference that
        cannot be turned into a character is replaced by ``?``.
        """
        try:
            self.result.append(unichr(int(ref)))
        except (ValueError, OverflowError):
            # unichr() rejects out-of-range code points with ValueError and
            # very large integers with OverflowError; neither may abort the
            # parse of untrusted markup.
            self.result.append(u"?")

    def handle_entityref(self, ref, trailer):
        """Emit the character for a named entity.

        ``ref`` is the entity name and ``trailer`` the character that ended
        it.  Unknown entities are kept as written in the source.

        A trailer other than ``;`` is never emitted here: goahead() leaves
        it in the input and scans it again, so appending it would duplicate
        it (and would leak a ``<`` that starts the following tag).
        """
        try:
            text = unichr(htmlentitydefs.name2codepoint[ref])
        except KeyError:
            text = "&" + ref
            if trailer == ";":
                text += trailer
        except ValueError:
            text = "?"
        self.result.append(text)
|---|
| 180 |
|
|---|
| 181 |
def plaintext(content):
    """Return ``content`` with every HTML tag removed.

    Entities are translated to their Unicode equivalents where possible.
    A new Plaintext parser is created on each call.
    """
    parser = Plaintext()
    # The parser appends its output fragments to this list.
    parser.result = []
    parser.feed(content)
    parser.close()
    return u"".join(parser.result)
|---|
| 189 |
|
|---|
| 190 |
|
|---|
| 191 |
class StripTags(MoreReasonableSGMLParser):
    """Strips HTML tags from content. Entities are retained.

    Output fragments are appended to ``self.result``.  This class does not
    create that list, so assign ``result = []`` on the instance before
    feeding it.
    """

    def handle_data(self, data):
        """Keep literal text unchanged."""
        self.result.append(data)

    def handle_charref(self, ref):
        """Re-emit a decimal character reference, always ``;``-terminated."""
        self.result.append('&#' + ref + ';')

    def handle_entityref(self, ref, trailer):
        """Re-emit a named entity reference as it appeared in the source.

        A trailer other than ``;`` is not emitted here: goahead() leaves it
        in the input and scans it again, so appending it would duplicate it
        (and would leak a ``<`` that starts the following tag).
        """
        text = '&' + ref
        if trailer == ';':
            text += trailer
        self.result.append(text)
|---|
| 202 |
|
|---|
| 203 |
def striptags(content):
    """Return ``content`` with HTML tags removed and entities left in place.

    A new StripTags parser is created on each call.
    """
    parser = StripTags()
    # The parser appends its output fragments to this list.
    parser.result = []
    parser.feed(content)
    parser.close()
    return u"".join(parser.result)
|---|
| 210 |
|
|---|
| 211 |
|
|---|
| 212 |
class Sanitizer(MoreReasonableSGMLParser):
    """Strips specific HTML tags from content. Entities are retained.

    Tags listed in ``unsafe_tags`` are replaced by ``replacement``; event
    handler attributes listed in ``unsafe_attributes`` are dropped from the
    tags that are kept.  The lists may be altered per instance or in a
    subclass.

    Output fragments are appended to ``self.result``.  This class does not
    create that list, so assign ``result = []`` on the instance before
    feeding it.
    """

    # Tag names (plus the "!doctype" declaration) that are replaced.
    unsafe_tags = [u'!doctype', u'applet', u'base', u'basefont', u'bgsound',
                   u'blink', u'body', u'button', u'comment', u'embed',
                   u'fieldset', u'fn', u'form', u'frame', u'frameset',
                   u'head', u'html', u'iframe', u'ilayer', u'input',
                   u'isindex', u'keygen', u'label', u'layer', u'legend',
                   u'link', u'meta', u'noembed', u'noframes', u'noscript',
                   u'object', u'optgroup', u'option', u'param', u'plaintext',
                   u'select', u'script', u'style', u'textarea', u'title',
                   ]
    # Text emitted in place of every prohibited tag or declaration.
    replacement = u"<!-- Prohibited Content -->"
    # NOTE(review): this pattern is not applied anywhere in this class, so
    # javascript: URLs in href values are NOT removed -- confirm whether a
    # caller is expected to use it.
    javascript = r"""(?i)href\w*=['"]javascript:"""
    # Event names; an attribute is dropped when its name is one of these
    # entries or "on" followed by one of them (e.g. "onclick").
    unsafe_attributes = [u'abort', u'blur', u'change', u'click', u'dblclick',
                         u'error', u'focus', u'keydown', u'keypress', u'keyup',
                         u'load', u'mousedown', u'mouseout', u'mouseover',
                         u'mouseup', u'reset', u'resize', u'submit', u'unload',
                         ]
    # Tags written in self-closing form; their end tags are not emitted.
    empty_tags = [u'area', u'base', u'basefont', u'br', u'hr', u'img',
                  u'input', u'link', u'meta', u'param',
                  ]

    def _is_unsafe_attribute(self, name):
        """Return True if attribute ``name`` must be removed from a tag."""
        lowered = name.lower()
        if lowered in self.unsafe_attributes:
            return True
        # The list holds bare event names, whereas the attributes that
        # carry script are spelled "on" + event name.
        return lowered.startswith('on') and lowered[2:] in self.unsafe_attributes

    def handle_data(self, data):
        """Keep literal text unchanged."""
        self.result.append(data)

    def handle_charref(self, ref):
        """Re-emit a decimal character reference, always ``;``-terminated."""
        self.result.append('&#' + ref + ';')

    def handle_entityref(self, ref, trailer):
        """Re-emit a named entity reference as it appeared in the source.

        A trailer other than ``;`` is not emitted here: goahead() leaves it
        in the input and scans it again, so appending it would duplicate it
        (and would leak a ``<`` that starts the following tag).
        """
        text = '&' + ref
        if trailer == ';':
            text += trailer
        self.result.append(text)

    def handle_decl(self, data):
        """Replace prohibited declarations; re-emit all others verbatim."""
        # Split on any whitespace so that a keyword followed by a tab or a
        # newline is still recognised; empty data yields an empty keyword.
        words = data.split(None, 1)
        if words:
            tag = words[0].lower()
        else:
            tag = ''
        if ("!" + tag) in self.unsafe_tags:
            self.result.append(self.replacement)
        else:
            self.result.append(u'<!' + data + '>')

    def unknown_starttag(self, tag, attributes):
        """Re-emit a permitted start tag without its unsafe attributes.

        ``attributes`` is a sequence of ``(name, value)`` pairs.  Prohibited
        tags are replaced by ``replacement``.
        """
        if tag in self.unsafe_tags:
            self.result.append(self.replacement)
            return
        attrs = [' ' + name + '=' + quoteattr(value)
                 for name, value in attributes
                 if not self._is_unsafe_attribute(name)]
        if tag in self.empty_tags:
            tail = ' />'
        else:
            tail = '>'
        self.result.append('<' + tag + ''.join(attrs) + tail)

    def unknown_endtag(self, tag):
        """Re-emit a permitted end tag; replace a prohibited one."""
        if tag in self.unsafe_tags:
            self.result.append(self.replacement)
        elif tag not in self.empty_tags:
            # Empty tags were already written in self-closing form.
            self.result.append('</' + tag + '>')
|---|
| 271 |
|
|---|
| 272 |
def sanitize(content):
    """Return ``content`` with the tags prohibited by Sanitizer replaced.

    Entities are retained.  A new Sanitizer parser is created on each call.
    """
    parser = Sanitizer()
    # The parser appends its output fragments to this list.
    parser.result = []
    parser.feed(content)
    parser.close()
    return u"".join(parser.result)
|---|
| 279 |
|
|---|