Message6037
This is another solution, by twb, that uses lynx with options to set
the encoding. I wanted to record it here for future reference,
as it uses the lynx -dump option I originally suggested.
def strip_html():
    '''Replace every text/html part of the message with lynx-rendered text.

    Parse the global "message_string" variable as a MIME message.
    Look for text/html MIME objects,
    run lynx -dump on them,
    and rewrite them in place as text/plain (UTF-8) MIME objects.

    Returns the edited message as an email.message.Message object.
    NOTE(review): this docstring used to say "as a string", but the
    code returns the object itself; callers that need text must
    serialise it themselves (e.g. with .as_string()).

    Raises whatever subprocess.check_output raises when lynx cannot be
    started or exits non-zero (e.g. subprocess.CalledProcessError).

    NB: this often results in a multipart/alternative branch with
    TWO text/plain leaves,
    but that actually seems to work out pretty well.
    '''

    message_object = email.message_from_string(message_string)

    # NB: this is the earliest point we can extract header fields.
    syslog.syslog('SUBJECT {}'.format(
        message_object.get('Subject', 'No Subject')))
    syslog.syslog('MESSAGE-ID {}'.format(
        message_object.get('Message-ID', 'No Message-ID')))

    # NB: this walk traverses ALL nodes in a flattened tree,
    # so we do not need to manually recurse on branch nodes.
    # (unlike perl).  --twb, Sep 2015
    for part in message_object.walk():
        if 'text/html' == part.get_content_type():
            syslog.syslog('STRIPPED an html part')
            # Pipe it through lynx to render as plain text.
            #
            # NOTE: postfix runs maxwell with C (not C.UTF-8) locale!
            # This breaks non-ASCII for things like open(mode='wt')
            # and check_output(universal_newlines).
            # As a workaround leave everything as b'' bytes and pass
            # encoding hints to lynx and set_payload.
            #
            # NOTE: It is very unintuitive, but
            #   get_payload(decode=False) ⇒ u'…'
            #   get_payload(decode=True)  ⇒ b'…'
            # This is because decode=True decodes only C-T-E: base64
            # (or quoted-printable);
            # the e.g. ISO-8859-1 to Unicode decoding happens later!
            output = subprocess.check_output(
                ['lynx',
                 '--dump',
                 '--stdin',
                 '--assume-charset',
                 part.get_content_charset(failobj='UTF-8')],
                universal_newlines=False,  # KLUDGE — see above
                input=part.get_payload(decode=True),
                # env= replaces the child's whole environment, so
                # LC_ALL is the only variable lynx is given.
                env={'LC_ALL': 'C.UTF-8'})
            # Edit the part to be plain text.
            # FIXME: instead of editing the old MIME object,
            # *delete* it and create a new text/plain object.
            # The only part we might want to keep is inline vs.
            # attachment (disposition)?
            # But how do I *do* that during a .walk()? --- it's not
            # a foldr!
            del part['Content-Type']
            # The old Content-Transfer-Encoding has to go first:
            # set_payload(..., charset) only transfer-encodes the new
            # body (and adds a matching header) when none is present.
            del part['Content-Transfer-Encoding']  # alloc #31444
            part['Content-Type'] = 'text/plain'
            part.set_payload(output, 'UTF-8')
            # FIXME: should append '.txt' to filename=fred.html where
            # present.
            # I can't see how to change it without also overriding
            # the disposition,
            # which is undesirable.  --twb, Sep 2015

    return message_object
|
Date |
User |
Action |
Args |
2017-10-16 00:15:58 | rouilj | set | messageid: <1508112958.5.0.213398074469.issue2550799@psf.upfronthosting.co.za> |
2017-10-16 00:15:58 | rouilj | set | recipients:
+ rouilj, ber, marlowa |
2017-10-16 00:15:58 | rouilj | link | issue2550799 messages |
2017-10-16 00:15:54 | rouilj | create | |
|