# -*- coding: utf-8 -*-
"""Minimal HTML-to-plain-text conversion built on the stdlib HTML parser.

Exposes ``html2text(html)`` and an ``init(instance)`` hook that registers
that function under the name ``'html2text'``.
"""

# The parser and entity table moved between Python 2 and Python 3; try the
# Python 3 locations first and fall back to the Python 2 module names.
try:
    from html.parser import HTMLParser
    from html.entities import name2codepoint
except ImportError:
    from HTMLParser import HTMLParser
    from htmlentitydefs import name2codepoint

try:
    _unichr = unichr  # Python 2: build a unicode character from a code point
except NameError:
    _unichr = chr  # Python 3: chr() already returns a unicode character


class DumbHTMLParser(HTMLParser):
    """Collect the text content of an HTML document, ignoring markup.

    The contents of ``<script>`` and ``<style>`` elements are discarded, each
    ``<p>`` start tag emits a newline, and consecutive whitespace-only data
    chunks are reduced to the first one.  The result accumulates in ``text``.
    """

    # Extracted text so far.  Kept as a plain string attribute (not a
    # property) because callers read ``parser.text`` directly.
    text = ""

    # internal state variables
    _skip_data = False   # True while inside a <script> or <style> element
    _last_empty = False  # True when the last chunk seen was whitespace-only

    def handle_data(self, data):
        """Append a chunk of character data, collapsing repeated blank chunks."""
        if self._skip_data:
            # skip data if in script or style block
            return

        if data.strip() == "":
            # reduce multiple blank chunks to 1
            if self._last_empty:
                return
            self._last_empty = True
        else:
            self._last_empty = False

        self.text = self.text + data

    def handle_starttag(self, tag, attrs):
        """Emit a newline for ``<p>``; start skipping at ``<script>``/``<style>``."""
        if tag == "p":
            self.text = self.text + "\n"
        if tag in ("style", "script"):
            self._skip_data = True

    def handle_endtag(self, tag):
        """Stop skipping data when a ``<script>`` or ``<style>`` element closes."""
        if tag in ("style", "script"):
            self._skip_data = False

    def handle_entityref(self, name):
        """Append the character for a named entity such as ``&amp;``.

        An entity name missing from ``name2codepoint`` is kept as the literal
        text ``&name;`` rather than raising ``KeyError``.
        """
        if self._skip_data:
            return
        try:
            char = _unichr(name2codepoint[name])
        except (KeyError, ValueError):
            # Unknown name, or a code point this interpreter cannot represent.
            char = "&" + name + ";"
        self._append_char(char)

    def handle_charref(self, name):
        """Append the character for a numeric reference (``&#8220;``, ``&#x41;``).

        ``name`` is the reference without ``&#`` and ``;``; a leading ``x`` or
        ``X`` marks hexadecimal.  A malformed or unrepresentable reference is
        kept as the literal text ``&#name;``.
        """
        if self._skip_data:
            return
        try:
            if name[:1] in ("x", "X"):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            char = _unichr(codepoint)
        except (ValueError, OverflowError):
            char = "&#" + name + ";"
        self._append_char(char)

    def _append_char(self, char):
        """Append decoded reference text, using a space if it cannot be joined."""
        try:
            self.text = self.text + char
        except UnicodeError:
            # Joining can fail when byte and unicode strings are mixed;
            # add a space as a placeholder instead of aborting.
            self.text = self.text + " "
        # A reference is real content, so a blank chunk that follows it must
        # not be treated as a repeated blank.
        self._last_empty = False


def html2text(html):
    """Return the text extracted from the HTML string ``html``."""
    parser = DumbHTMLParser()
    parser.feed(html)
    parser.close()  # flush any data the parser is still buffering
    return parser.text


def init(instance):
    """Register ``html2text`` on ``instance`` under the name ``'html2text'``.

    ``instance`` must provide ``registerUtil(name, function)``
    (presumably a Roundup tracker instance -- confirm against the caller).
    """
    instance.registerUtil('html2text', html2text)


if __name__ == "__main__":
    html = '''

Home
Download
Docs
Prerequisites

Roundup requires Python 2.5 or newer (but not Python 3) with a functioning anydbm module. Download the latest version from http://www.python.org/. It is highly recommended that users install the latest patch version of python as these contain many fixes to serious bugs.

Some variants of Linux will need an additional “python dev” package installed for Roundup installation to work. Debian and derivatives, are known to require this.

If you’re on windows, you will either need to be using the ActiveState python distribution (at http://www.activestate.com/Products/ActivePython/), or you’ll have to install the win32all package separately (get it from http://starship.python.net/crew/mhammond/win32/).

'''
    print(html2text(html))