Contact: fumanchu@aminus.org

Log in as guest/misc to create tickets

root/cleanhtml.py

Revision 15 (checked in by fumanchu, 8 years ago)

Reorg for multiple projects in single repo.
New pyanalog package.

Line 
1 """Exposes several SGMLParser subclasses.
2
3 This work, including the source code, documentation
4 and related data, is placed into the public domain.
5
6 The original author is Robert Brewer, Amor Ministries.
7
8 THIS SOFTWARE IS PROVIDED AS-IS, WITHOUT WARRANTY
9 OF ANY KIND, NOT EVEN THE IMPLIED WARRANTY OF
10 MERCHANTABILITY. THE AUTHOR OF THIS SOFTWARE
11 ASSUMES _NO_ RESPONSIBILITY FOR ANY CONSEQUENCE
12 RESULTING FROM THE USE, MODIFICATION, OR
13 REDISTRIBUTION OF THIS SOFTWARE.
14
15 If you don't need thread-safety, you might create a single instance of the
16 parser you want, and feed it yourself. You also might use the classes
17 directly if you need to customize them in some way; for example, you may
18 need to alter the list of unsafe_tags in the Sanitizer class, either
19 per-instance or by subclassing it.
20
21 If you need thread-safe parsing, you should use the functions provided.
22 They create a new instance each time, so you get a *small* performance
23 hit, but by the same token, each thread can work on its own instance.
24 """
25
26 import re
27 import sgmllib
28 import htmlentitydefs
29 from xml.sax.saxutils import quoteattr
30
# Next character that ends a run of plain character data.
interesting = re.compile(r'[&<]')

# A reference or tag that may have been cut short by the end of the buffer.
incomplete = re.compile(
    r'&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
    r'<([a-zA-Z][^<>]*|'
    r'/([a-zA-Z][^<>]*)?|'
    r'![^<>]*)?'
)

# Named entity reference (group 1 is the name) plus the one character
# that follows it.
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')

# Decimal character reference (group 1 is the digits) plus the one
# non-digit character that follows it.
charref = re.compile(r'&#([0-9]+)[^0-9]')

# '<' immediately followed by a letter or by '>'.
starttagopen = re.compile(r'<[>a-zA-Z]')
41
42
class MoreReasonableSGMLParser(sgmllib.SGMLParser):
    """An SGMLParser that tells handle_entityref what followed the entity.

    goahead() is overridden so that handle_entityref is called as
    handle_entityref(name, trailer): name is the entity name without
    the ampersand, and trailer is the single character that followed
    the name in the input. handle_charref is still called with the
    digits only.

    When the trailer is not ';', goahead() steps back one character
    after calling handle_entityref, so that character is processed
    again as ordinary input on the next pass through the loop.
    """

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Everything that is left is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            # Emit the plain data in front of the next '&' or '<'.
            match = interesting.search(rawdata, i)
            if match: j = match.start()
            else: j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n: break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # In literal mode a start tag is just text.
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    # A negative result stops the loop; the unparsed
                    # rest of the buffer is kept in self.rawdata below.
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0: break
                    i = k
                    # Any end tag switches literal mode off.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    k = self.parse_comment(i)
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    k = self.parse_pi(i)
                    if k < 0: break
                    # NOTE(review): the result is added to i here but
                    # assigned everywhere else; presumably parse_pi
                    # returns a length, not an index -- confirm against
                    # sgmllib.
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    k = self.parse_declaration(i)
                    if k < 0: break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    # In literal mode references are not interpreted.
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumed one character after the
                    # digits; give it back unless it was the ';'.
                    if rawdata[i-1] != ';': i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    i = match.end(0)
                    # The character right after the entity name.
                    trailer = rawdata[i-1]
                    self.handle_entityref(name, trailer)
                    # A trailer other than ';' is not part of the
                    # reference: step back so it is parsed again as
                    # ordinary input.
                    if trailer != ';': i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            # No more input is coming: flush the remainder as data.
            self.handle_data(rawdata[i:n])
            i = n
        # Keep whatever was not consumed for the next call.
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack
153
154
class Plaintext(MoreReasonableSGMLParser):
    """Strips all HTML from content.

    Entities are translated to their Unicode equivalents where possible.

    Output is appended to self.result, which this class does not create:
    the caller must assign a list to it before feeding any data (the
    plaintext() function does so).
    """

    def handle_data(self, data):
        """Pass character data through unchanged."""
        self.result.append(data)

    def handle_charref(self, ref):
        """Append the character for decimal reference ref, or u"?".

        ref is the digit string of a reference such as "&#38;".
        """
        try:
            self.result.append(unichr(int(ref)))
        except (ValueError, OverflowError):
            # Out of range for unichr (OverflowError for values that do
            # not fit the underlying C integer).
            self.result.append(u"?")

    def handle_entityref(self, ref, trailer):
        """Append the character named by entity ref.

        ref is the entity name without the ampersand; trailer is the
        character that followed it in the input. An unknown entity is
        kept as written; a known one that unichr cannot represent
        becomes "?".

        A trailer other than ";" is never emitted here: the parser hands
        that character back to the input and processes it again, so
        appending it would duplicate it in the output.
        """
        try:
            cp = htmlentitydefs.name2codepoint[ref]
        except KeyError:
            text = "&" + ref
            if trailer == ";":
                text += trailer
            self.result.append(text)
            return
        try:
            self.result.append(unichr(cp))
        except ValueError:
            self.result.append("?")
180
def plaintext(content):
    """Return content with every HTML tag removed.

    Entity and character references are replaced by the Unicode
    characters they stand for where that is possible. A new Plaintext
    parser is created for each call.
    """
    parser = Plaintext()
    parser.result = []
    parser.feed(content)
    parser.close()
    pieces = parser.result
    return u"".join(pieces)
189
190
class StripTags(MoreReasonableSGMLParser):
    """Strips HTML tags from content. Entities are retained.

    Output is appended to self.result, which this class does not create:
    the caller must assign a list to it before feeding any data (the
    striptags() function does so).
    """

    def handle_data(self, data):
        """Pass character data through unchanged."""
        self.result.append(data)

    def handle_charref(self, ref):
        """Re-emit a decimal character reference, always ';'-terminated."""
        self.result.append('&#' + ref + ';')

    def handle_entityref(self, ref, trailer):
        """Re-emit the entity reference as it was written.

        ref is the entity name without the ampersand; trailer is the
        character that followed it in the input.

        A trailer other than ";" is never emitted here: the parser hands
        that character back to the input and processes it again, so
        appending it would duplicate it (and, for "<", would leak a bare
        "<" into the tag-stripped output).
        """
        text = '&' + ref
        if trailer == ';':
            text += trailer
        self.result.append(text)
202
def striptags(content):
    """Return content with every HTML tag removed.

    Entity and character references are kept as references. A new
    StripTags parser is created for each call.
    """
    parser = StripTags()
    parser.result = []
    parser.feed(content)
    parser.close()
    pieces = parser.result
    return u"".join(pieces)
210
211
class Sanitizer(MoreReasonableSGMLParser):
    """Strips specific HTML tags and attributes from content.

    Tags listed in unsafe_tags are replaced by the replacement text;
    other tags are re-emitted with their unsafe attributes removed.
    Entities are retained.

    Output is appended to self.result, which this class does not create:
    the caller must assign a list to it before feeding any data (the
    sanitize() function does so).
    """

    unsafe_tags = [u'!doctype', u'applet', u'base', u'basefont', u'bgsound',
                   u'blink', u'body', u'button', u'comment', u'embed',
                   u'fieldset', u'fn', u'form', u'frame', u'frameset',
                   u'head', u'html', u'iframe', u'ilayer', u'input',
                   u'isindex', u'keygen', u'label', u'layer', u'legend',
                   u'link', u'meta', u'noembed', u'noframes', u'noscript',
                   u'object', u'optgroup', u'option', u'param', u'plaintext',
                   u'select', u'script', u'style', u'textarea', u'title',
                   ]
    replacement = u"<!-- Prohibited Content -->"
    # Not referenced by any method of this class; left unchanged for
    # callers that read it. Script URLs are rejected per attribute value
    # in _is_unsafe_attribute instead.
    javascript = r"""(?i)href\w*=['"]javascript:"""
    # Event names. An attribute is dropped when its name is one of these
    # or is 'on' followed by one of these (e.g. 'onclick').
    unsafe_attributes = [u'abort', u'blur', u'change', u'click', u'dblclick',
                         u'error', u'focus', u'keydown', u'keypress', u'keyup',
                         u'load', u'mousedown', u'mouseout', u'mouseover',
                         u'mouseup', u'reset', u'resize', u'submit', u'unload',
                         ]
    # URL schemes that execute script; an attribute whose value starts
    # with one of these is dropped.
    unsafe_schemes = [u'javascript', u'vbscript']
    empty_tags = [u'area', u'base', u'basefont', u'br', u'hr', u'img',
                  u'input', u'link', u'meta', u'param',
                  ]

    def handle_data(self, data):
        """Pass character data through unchanged."""
        self.result.append(data)

    def handle_charref(self, ref):
        """Re-emit a decimal character reference, always ';'-terminated."""
        self.result.append('&#' + ref + ';')

    def handle_entityref(self, ref, trailer):
        """Re-emit the entity reference as it was written.

        A trailer other than ";" is never emitted here: the parser hands
        that character back to the input and processes it again, so
        appending it would duplicate it (and, for "<", would put a bare
        "<" into the sanitized output).
        """
        text = '&' + ref
        if trailer == ';':
            text += trailer
        self.result.append(text)

    def handle_decl(self, data):
        """Re-emit a <!...> declaration unless its keyword is unsafe."""
        tag = data.split(" ")[0].lower()
        if ("!" + tag) in self.unsafe_tags:
            self.result.append(self.replacement)
        else:
            self.result.append(u'<!' + data + '>')

    def _is_unsafe_attribute(self, name, value):
        """Return True if attribute name=value must not be emitted."""
        lowered = name.lower()
        if lowered in self.unsafe_attributes:
            return True
        # unsafe_attributes holds bare event names ('click'), while the
        # attribute that carries the handler is spelled 'onclick'.
        if lowered.startswith('on') and lowered[2:] in self.unsafe_attributes:
            return True
        # Whitespace and control characters inside a URL scheme (such as
        # 'java\tscript:') are tolerated by browsers, so remove them
        # before comparing.
        compact = re.sub(r'[\x00-\x20]+', '', value).lower()
        for scheme in self.unsafe_schemes:
            if compact.startswith(scheme + ':'):
                return True
        return False

    def unknown_starttag(self, tag, attributes):
        """Re-emit a start tag without unsafe attributes.

        A tag listed in unsafe_tags is replaced by the replacement text.
        Tags listed in empty_tags are written in self-closing form.
        """
        if tag in self.unsafe_tags:
            self.result.append(self.replacement)
            return
        attrs = [' ' + name + '=' + quoteattr(value)
                 for name, value in attributes
                 if not self._is_unsafe_attribute(name, value)]
        if tag in self.empty_tags:
            tail = ' />'
        else:
            tail = '>'
        self.result.append('<' + tag + ''.join(attrs) + tail)

    def unknown_endtag(self, tag):
        """Re-emit an end tag, or the replacement text for an unsafe tag.

        End tags of empty_tags are dropped because their start tag was
        already written in self-closing form.
        """
        if tag in self.unsafe_tags:
            self.result.append(self.replacement)
        else:
            if tag not in self.empty_tags:
                self.result.append('</' + tag + '>')
271
def sanitize(content):
    """Return content with the tags prohibited by Sanitizer removed.

    Entity and character references are kept as references. A new
    Sanitizer parser is created for each call.
    """
    parser = Sanitizer()
    parser.result = []
    parser.feed(content)
    parser.close()
    pieces = parser.result
    return u"".join(pieces)
279
Note: See TracBrowser for help on using the browser.