How do I generate binary RFC822-style headers in Python 3.2? - unicode

How do I convince email.generator.Generator to use binary in Python 3.2? This seems like precisely the use case for the policy framework that was introduced in Python 3.3, but I would like my code to run in 3.2.
from email.parser import Parser
from email.generator import Generator
from io import BytesIO, StringIO

# A header containing a raw non-ASCII character: Generator will try to
# ASCII-encode it and raise UnicodeEncodeError.
data = "Key: \N{SNOWMAN}\r\n\r\n"
message = Parser().parse(StringIO(data))
with open("/tmp/rfc882test", "w") as out:
    Generator(out, maxheaderlen=0).flatten(message)
Fails with UnicodeEncodeError: 'ascii' codec can't encode character '\u2603' in position 0: ordinal not in range(128).

Your data is not a valid RFC2822 header, which I suspect misleads you. It's a Unicode string, but RFC2822 is always only ASCII. To have non-ASCII characters you need to encode them with a character set and either base64 or quoted-printable encoding.
Hence, valid code would be this:
from email.parser import Parser
from email.generator import Generator
from io import BytesIO, StringIO

# The snowman is RFC 2047-encoded (=?utf8?b?4piD?=), so the header is pure
# ASCII and Generator can flatten it without error.
data = "Key: =?utf8?b?4piD?=\r\n\r\n"
message = Parser().parse(StringIO(data))
with open("/tmp/rfc882test", "w") as out:
    Generator(out, maxheaderlen=0).flatten(message)
Which of course avoids the error completely.
The question is how to generate such headers as =?utf8?b?4piD?= and the answer lies in the email.header module.
I made this example with:
>>> from email import header
>>> header.Header('\N{SNOWMAN}', 'utf8').encode()
'=?utf8?b?4piD?='
To handle files that have a Key: Value format the email module is the wrong solution. Handling such files is easy enough without the email module, and you will not have to work around the restrictions of RFC2822. For example:
# -*- coding: UTF-8 -*-
import io
import sys

if sys.version_info > (3,):
    def u(s):
        """Identity on Python 3: str is already Unicode."""
        return s
else:
    def u(s):
        """Decode a Python 2 byte literal to unicode."""
        return s.decode('unicode-escape')


def parse(infile):
    """Parse 'Key: Value' lines from *infile* into a dict.

    :param infile: iterable of text lines
    :raises ValueError: if the same key appears twice
    :return: dict mapping keys to values
    """
    res = {}
    for line in infile:
        key, value = line.strip().split(': ', 1)
        if key in res:
            raise ValueError(u("Key {0} appears twice").format(key))
        res[key] = value
    return res


def generate(outfile, data):
    """Write *data* to *outfile* as 'Key: Value' lines."""
    for key in data:
        outfile.write(u("{0}: {1}\n").format(key, data[key]))


if __name__ == "__main__":
    # Ensure roundtripping:
    data = {u('Key'): u('Value'), u('Foo'): u('Bar'),
            u('Frötz'): u('Öpöpöp')}
    with io.open('/tmp/outfile.conf', 'wt', encoding='UTF8') as outfile:
        generate(outfile, data)
    with io.open('/tmp/outfile.conf', 'rt', encoding='UTF8') as infile:
        res = parse(infile)
    assert data == res
That code took 15 minutes to write, and works in both Python 2 and Python 3. If you want line continuations etc that's easy to add as well.
Here is a more complete one that supports comments etc.

A useful solution comes from http://mail.python.org/pipermail/python-dev/2010-October/104409.html :
from email.parser import Parser
from email.generator import BytesGenerator

# How do I get surrogateescape from a BytesIO/StringIO?
data = "Key: \N{SNOWMAN}\r\n\r\n"  # write this to headers.txt
# Open with surrogateescape so undecodable bytes survive the str round trip;
# use `with` so the input handle is not leaked.
with open("headers.txt", "r", encoding="ascii", errors="surrogateescape") as headers:
    message = Parser().parse(headers)
with open("/tmp/rfc882test", "wb") as out:
    BytesGenerator(out, maxheaderlen=0).flatten(message)
This is for a program that wants to read and write a binary Key: value file without caring about the encoding. To consume the headers as decoded text without being able to write them back out with Generator(), Parser().parse(open("headers.txt", "r", encoding="utf-8")) should be sufficient.

Related

When using ruamel.yaml and preserve_quotes=True is there a way to force roundtriped yaml to use single quotes?

[Updated question, preserve_quotes works as expected]
How can I force the output to use single quotes whatever the type quotes used in the input, but only if quotes were used?
Note : the (humble) source code of my projects using ruamel.yaml =>
https://github.com/looztra/yamkix (an opinionated yaml formatter)
https://github.com/looztra/kubesplit/tree/v0_scooter (a tool designed to split a set of kubernetes resources in a single stream to a set of files)
Your example output has the single quotes changed, i.e. not preserved
and a minimal example shows that is not how things work.
import sys
import ruamel.yaml

yaml_str = """---
metadata:
  annotations:
    first: '1'
    second: any string
"""

yaml = ruamel.yaml.YAML(typ='rt')
data = list(yaml.load_all(yaml_str))
# NOTE(review): preserve_quotes is set *after* load_all here; it is a loader
# setting, so setting it before loading may be required — confirm with docs.
yaml.preserve_quotes = True
yaml.explicit_start = True
yaml.dump_all(data, sys.stdout)
which gives:
---
metadata:
annotations:
first: '1'
second: any string
Using your format_yaml routine:
import sys
import pathlib
import ruamel.yaml

YAML = ruamel.yaml.YAML

yaml_file = pathlib.Path("temp.yaml")
yaml_file.write_text("""---
metadata:
  annotations:
    first: '1'
    second: any string
""")


def format_yaml(input_file,
                output_file,
                explicit_start=True,
                explicit_end=False,
                default_flow_style=False,
                dash_inwards=True,
                quotes_preserved=True,
                parsing_mode='rt'):
    """
    Load a file and save it formatted.

    :param input_file: the input file (None means read from stdin)
    :param output_file: the output file (None means write to stdout)
    :param explicit_start: write the start of the yaml doc even when there is \
only one doc in the file
    :param explicit_end: write an explicit document end marker
    :param default_flow_style: if False, block style will be used for nested \
arrays/maps
    :param dash_inwards: push dash inwards if True
    :param quotes_preserved: preserve quotes if True
    :param parsing_mode: safe or roundtrip (rt) mode
    """
    yaml = YAML(typ=parsing_mode)
    yaml.explicit_start = explicit_start
    yaml.explicit_end = explicit_end
    yaml.default_flow_style = default_flow_style
    yaml.preserve_quotes = quotes_preserved
    if dash_inwards:
        yaml.indent(mapping=2, sequence=4, offset=2)
    if input_file is not None:
        with open(input_file, 'rt') as f_input:
            parsed = yaml.load_all(f_input.read())
    else:
        parsed = yaml.load_all(sys.stdin.read())
    ready_for_dump = []
    try:
        # Read the parsed content to force the scanner to issue errors if any
        for data in parsed:
            ready_for_dump.append(data)
    # Fix: the original caught a bare `ScannerError`, which is never imported
    # and would itself raise NameError; use the fully qualified name.
    except ruamel.yaml.scanner.ScannerError as e:
        print("Something is wrong in the input file, got error from Scanner")
        print(e)
        return
    if output_file is not None:
        with open(output_file, 'wt') as out:
            yaml.dump_all(ready_for_dump, out)
    else:
        yaml.dump_all(ready_for_dump, sys.stdout)


format_yaml(yaml_file, None)
which also gives:
---
metadata:
annotations:
first: '1'
second: any string
and if you use double quotes in the input, you'll get double quotes in the output.
So please provide us with a minimal full example program that shows the behaviour
that you get, plus the ruamel.yaml and python versions and the platforms you have tested this on (the above was tested with 0.15.97 on Python 3.7.3 and Python 2.7.15 (with pathlib2 installed) on Linux).
BTW, the easiest way of changing all single quoted string to dump as double quoted ones
in round-trip mode is by overriding the represent_single_quoted_scalarstring method
on yaml.representer (e.g by assigning the represent_double_quoted_scalarstring method to it).

Movesense decode SBEM data from LogBook

I'm trying to get the LogBook data over BLE to my App.
This works fine for JSON, the data seems accurate.
But it takes a long time due to the JSON encoding.
Getting the SBEM data is way faster. But I can't find any documentation on the encoding. I found out that the "Content" string is Base64 encoded.
It starts with SBEM which means, it is uncompressed as stated here:
https://bitbucket.org/suunto/movesense-device-lib/src/5bcf0b40644a17d48977cf011ebcf6191650c6f0/MovesenseCoreLib/resources/movesense-api/mem/logbook.yaml?fileviewer=file-view-default#lines-186
But I couldn't find anything else.
Has somebody further information on that or found out what the encoding is like?
Best regards
Alex
First some clarification: When requesting the JSON log from MDS/Logbook/ service the data itself is transferred from Movesense sensor in SBEM format and the conversion is performed on the phone. If you have specific examples where the said conversion is slow (there very well might be) it's a good idea to add a bitbucket issue to movesense-mobile-lib.
About the SBEM format. This is "Suunto Oy internal" binary format for presenting xml (and nowadays json) files. This means that the interpretation of it may change when the format evolves. With that warning aside, here's the format:
Data is encoded in chunks with ID(1-2 bytes), length(1-4 bytes) and content
consists of two separate sections: Descriptors & Data which can be in separate "files" (like in Logbook service)
Descriptors describe the format of the data in data chunks ("format string")
Data chunks contain the binary data in described format.
If you want to learn about the SBEM format that the DataLogger / Logbook services use, see the "generated/sbem-code" folder that is created during the build.
And finally, here is a simple python code for parsing SBEM format:
from __future__ import print_function
import sys
import re
import glob, os
data_path = sys.argv[0]
descriptor_path = sys.argv[1]
ReservedSbemId_e_Escape = b"\255"
ReservedSbemId_e_Descriptor = 0
#print("data_path:",data_path)
print("descriptor_path:",descriptor_path)
# reads sbem ID upto uint16 from file
def readId(f):
byte1 = f.read(1)
id = None
if not byte1:
print("EOF found")
elif byte1 < ReservedSbemId_e_Escape:
id = int.from_bytes(byte1, byteorder='little')
#print("one byte id:", id)
else:
# read 2 following bytes
id_bytes = f.read(2)
id = int.from_bytes(id_bytes, byteorder='little')
#print("two byte id:",id)
return id
# reads sbem length upto uint32 from file
def readLen(f):
byte1 = f.read(1)
if byte1 < ReservedSbemId_e_Escape:
datasize = int.from_bytes(byte1, byteorder='little')
#print("one byte len:", len)
else:
# read 4 following bytes
id_bytes = f.read(4)
datasize = int.from_bytes(id_bytes, byteorder='little')
#print("4 byte len:",len)
return datasize
# read sbem chunkheader from file
def readChunkHeader(f):
id = readId(f)
if id is None:
return (None,None)
datasize = readLen(f)
ret = (id, datasize)
print("SBEM chunk header:", ret)
print(" offset:", f.tell())
return ret
def readHeader(f):
# read header
header_bytes = f.read(8)
print("SBEM Header: ", header_bytes)
def parseDescriptorChunk(data_bytes):
print("parseDescriptorChunk data:", chunk_bytes)
return
def parseDataChunk(data_bytes):
print("parseDataChunk data:", chunk_bytes)
return
# read descriptors
with open(descriptor_path, 'rb') as f_desc:
readHeader(f_desc)
while True:
(id, datasize) = readChunkHeader(f_desc)
if id is None:
# print("None id:",id)
break;
chunk_bytes = f_desc.read(datasize)
if (len(chunk_bytes) != datasize):
print("ERROR: too few bytes returned.")
break
if id == ReservedSbemId_e_Descriptor:
parseDescriptorChunk(chunk_bytes)
else:
print("WARNING: data chunk in descriptor file!")
parseDataChunk(chunk_bytes)
# read data
with open(data_path, 'rb') as f_data:
readHeader(f_data)
while True:
(id, datasize) = readChunkHeader(f_data)
if id is None:
# print("None id:",id)
break;
chunk_bytes = f_data.read(datasize)
if (len(chunk_bytes) != datasize):
print("ERROR: too few bytes returned.")
break
if id == ReservedSbemId_e_Descriptor:
parseDescriptorChunk(chunk_bytes)
else:
parseDataChunk(chunk_bytes)
Full Disclaimer: I work for the Movesense team

How do I print a dictionary vs a defaultdict based dictionary as a yaml file using ruamel.yaml?

Please refer to this trivial block of code shown below. My goal is to use defaultdict to come up with a relatively simple dictionary, and further print the results out as a yaml file.
When I manually define the dictionary, it seems to work just fine and the YAML is displayed exactly the way I want it, but when I use defaultdict to come up with a dictionary, I get an error message and unfortunately I am not able to decipher that.
When I print the dictionary as a JSON, it prints the exact same output. What I am missing?
import sys,ruamel.yaml
import json
from collections import defaultdict
def dict_maker():
return defaultdict(dict_maker)
S = ruamel.yaml.scalarstring.DoubleQuotedScalarString
app = "someapp"
d = {'beats':{'name':S(app), 'udp_address':S('239.1.1.1:10101')}}
foo = dict_maker()
foo["beats"]["name"] = S(app)
foo["beats"]["udp_address"] = S("239.1.1.1:10101")
print "Regular dictionary"
print json.dumps(d, indent=4)
print "defaultdict dictionary"
print json.dumps(foo, indent=4)
print "dictionary as a yaml\n"
ruamel.yaml.dump(d, sys.stdout, Dumper=ruamel.yaml.RoundTripDumper)
print "defaultdict dictionary as a yaml\n"
ruamel.yaml.dump(foo, sys.stdout, Dumper=ruamel.yaml.RoundTripDumper)
Error Message
raise RepresenterError("cannot represent an object: %s" % data)
ruamel.yaml.representer.RepresenterError: cannot represent an object: defaultdict(<function dict_maker at 0x7f1253725a28>, {'beats': defaultdict(<function dict_maker at 0x7f1253725a28>, {'name': u'someapp', 'udp_address': u'239.1.1.1:10101'})})
You seem to be using the word "dictionary" when refering to a Python dict. There is however no such thing as a "defaultdict based dictionary", that would imply that foo after
foo = dict_maker()
would be a dict, and of course it is not: foo is a defaultdict which is dict based (i.e. exactly the other way around from what you write).
That JSON dumps this is not surprising, as it cannot do more than naively dump the key-value pairs as if it were a dict. But when you try to load that JSON back, you see how useless this is, as you cannot continue working with it (at least not in the way expected):
import sys
import json
from collections import defaultdict
import io


def dict_maker():
    # Autovivifying dict: missing keys produce another defaultdict.
    return defaultdict(dict_maker)


app = "someapp"
foo = dict_maker()
foo["beats"]["name"] = app
foo["beats"]["udp_address"] = "239.1.1.1:10101"
# NOTE: rebinding `io` shadows the io module from here on (kept as in the
# original answer).
io = io.StringIO()
json.dump(foo, io, indent=4)
io.seek(0)
bar = json.load(io)
# Raises KeyError: JSON round-tripping loses the defaultdict behaviour,
# so plain dicts no longer autovivify.
bar['otherapp']['name'] = 'some_alt_app'
print(bar['beats']['udp_address'])
The above throws: KeyError: 'otherapp'. And that is because JSON doesn't keep all the information needed.
However, if you use the unsafe YAML dumper, then ruamel.yaml can dump and load this fine:
import sys
from ruamel.yaml import YAML
from collections import defaultdict
import io


def dict_maker():
    # Autovivifying dict: missing keys produce another defaultdict.
    return defaultdict(dict_maker)


app = "someapp"
# The unsafe loader/dumper can represent arbitrary Python objects,
# including defaultdict and its factory function.
yaml = YAML(typ='unsafe')
foo = dict_maker()
foo["beats"]["name"] = app
foo["beats"]["udp_address"] = "239.1.1.1:10101"
io = io.StringIO()
yaml.dump(foo, io)
io.seek(0)
print(io.getvalue())
bar = yaml.load(io)
# Works: bar is again an autovivifying defaultdict after the round trip.
bar['otherapp']['name'] = 'some_alt_app'
print(bar['beats']['udp_address'])
this doesn't throw an error, as bar is again a defaultdict with dict_maker as the function it defaults to. The above prints
239.1.1.1:10101
as you would expect.
That the RoundTripDumper/Loader doesn't support this out-of-the-box, is because it is based on the SafeDumper/Loader, which cannot dump/load arbitrary Python instances like defaultdict and its dict_maker function reference. Enabling that would make the loading unsafe.
So if you need to use the RoundTripDumper you should add a representer for defaultdict or a subclass thereof (and possible one for dict_maker as well). To be able to load that, you need constructor(s) as well. How to do that is described in the documentation (Dumping Python classes)

subprocess.run managing optional stdin and stdout

In python >= 3.5 we can give optional stdout, stdin, stderr to subprocess.run()
per the docs:
Valid values are PIPE, DEVNULL, an existing file descriptor (a positive integer),
an existing file object, and None. PIPE indicates that a new pipe to the child
should be created
I want to support passing through (at least) None or existing file objects whilst managing resources pythonically.
How should I manage the optional file resources in something like:
import subprocess


def wraps_subprocess(args=['ls', '-l'], stdin=None, stdout=None):
    """Run *args* with subprocess.run, passing stdin/stdout through.

    stdin/stdout accept anything subprocess.run does: None, PIPE, DEVNULL,
    a file descriptor, or a file object.
    """
    # ... do important stuff
    subprocess.run(args=args, stdin=stdin, stdout=stdout)
A custom contextmanager (idea taken from this answer) seems to work:
import contextlib


# Fix: the decorator was mangled to a '#' comment in the original paste;
# without @contextlib.contextmanager the function returns a bare generator
# and cannot be used in a `with` statement.
@contextlib.contextmanager
def awesome_open(path_or_file_or_none, mode='rb'):
    """Yield a file object for a path, or pass a file object/None through.

    Only files opened here (i.e. when given a str path) are closed on exit;
    caller-supplied file objects and None are left untouched.
    """
    if isinstance(path_or_file_or_none, str):
        file_ = needs_closed = open(path_or_file_or_none, mode)
    else:
        file_ = path_or_file_or_none
        needs_closed = None
    try:
        yield file_
    finally:
        if needs_closed:
            needs_closed.close()
which would be used used like
import subprocess


def wraps_subprocess(args=['ls', '-l'], stdin=None, stdout=None):
    """Run *args*, accepting paths, file objects, or None for stdin/stdout.

    awesome_open opens (and later closes) paths, and passes file objects
    and None through untouched.
    """
    # ... do important stuff
    with awesome_open(stdin, mode='rb') as fin, \
            awesome_open(stdout, mode='wb') as fout:
        subprocess.run(args=args, stdin=fin, stdout=fout)
But I think there is probably a better way.

Extracting the body of an email from mbox file, decoding it to plain text regardless of Charset and Content Transfer Encoding

I am trying to use Python 3 to extract the body of email messages from a thunderbird mbox file. It is an IMAP account.
I would like to have the text part of the body of the email available to process as a unicode string. It should 'look like' the email does in Thunderbird, and not contain escaped characters such as \r\n =20 etc.
I think that it is the Content Transfer Encodings that I don't know how to decode or remove.
I receive emails with a variety of different Content Types, and different Content Transfer Encodings.
This is my current attempt :
import mailbox
import quopri,base64
def myconvert(encoded, ContentTransferEncoding):
    """Decode a message body per its Content-Transfer-Encoding.

    :param encoded: body as bytes
    :param ContentTransferEncoding: e.g. 'quoted-printable' or 'base64'
    :return: decoded bytes; unknown encodings are passed through unchanged
    """
    if ContentTransferEncoding == 'quoted-printable':
        result = quopri.decodestring(encoded)
    elif ContentTransferEncoding == 'base64':
        result = base64.b64decode(encoded)
    else:
        # e.g. '7bit'/'8bit': body needs no transfer decoding.
        result = encoded
    # Fix: the original computed `result` but never returned it.
    return result
mboxfile = 'C:/Users/Username/Documents/Thunderbird/Data/profile/ImapMail/server.name/INBOX'
for msg in mailbox.mbox(mboxfile):
    if msg.is_multipart():  # Walk through the parts of the email to find the text body.
        for part in msg.walk():
            if part.is_multipart():  # If part is multipart, walk through the subparts.
                for subpart in part.walk():
                    if subpart.get_content_type() == 'text/plain':
                        body = subpart.get_payload()  # Get the subpart payload (i.e the message body)
                        for k, v in subpart.items():
                            if k == 'Content-Transfer-Encoding':
                                cte = v  # Keep the Content Transfer Encoding
            # Fix: the original tested `subpart.get_content_type()` here,
            # but `subpart` belongs to the loop above (NameError on the
            # first non-multipart part); the body uses `part`.
            elif part.get_content_type() == 'text/plain':
                body = part.get_payload()  # part isn't multipart; get the payload
                for k, v in part.items():
                    if k == 'Content-Transfer-Encoding':
                        cte = v  # Keep the Content Transfer Encoding
    print(body)
    print('Body is of type:', type(body))
    body = myconvert(body, cte)
    print(body)
But this fails with :
Body is of type: <class 'str'>
Traceback (most recent call last):
File "C:/Users/David/Documents/Python/test2.py", line 31, in <module>
body = myconvert(body,cte)
File "C:/Users/David/Documents/Python/test2.py", line 6, in myconvert
result = quopri.decodestring(encoded)
File "C:\Python32\lib\quopri.py", line 164, in decodestring
return a2b_qp(s, header=header)
TypeError: 'str' does not support the buffer interface
Here is some code that does the job, it prints errors instead of crashing for those messages where it would fail. I hope that it may be useful. Note that if there is a bug in Python 3, and that is fixed, then the lines .get_payload(decode=True) may then return a str object instead of a bytes object. I ran this code today on 2.7.2 and on Python 3.2.1.
import mailbox
def getcharsets(msg):
    """Return the set of charsets declared anywhere in *msg*, skipping None."""
    return {charset for charset in msg.get_charsets() if charset is not None}
def handleerror(errmsg, emailmsg, cs):
    """Print diagnostics for a body-decode failure on *emailmsg* with charset *cs*."""
    print()
    print(errmsg)
    print(f"This error occurred while decoding with  {cs}  charset.")
    print(f"These charsets were found in the one email. {getcharsets(emailmsg)}")
    print(f"This is the subject: {emailmsg['subject']}")
    print(f"This is the sender: {emailmsg['From']}")
def getbodyfromemail(msg):
    """Return the text/plain body of *msg*, decoded to str when a charset works.

    Walks at most one level of nested multiparts; decode failures are
    reported via handleerror and the body is returned as-is.
    """
    body = None
    # Locate the text/plain payload (transfer-decoded to bytes).
    if msg.is_multipart():
        for part in msg.walk():
            if part.is_multipart():
                # Nested multipart: look one level deeper.
                for sub in part.walk():
                    if sub.get_content_type() == 'text/plain':
                        body = sub.get_payload(decode=True)
            elif part.get_content_type() == 'text/plain':
                body = part.get_payload(decode=True)
    elif msg.get_content_type() == 'text/plain':
        # Not multipart at all: the message itself carries the body.
        body = msg.get_payload(decode=True)
    # No checking done to match the charset with the correct part.
    for charset in getcharsets(msg):
        try:
            body = body.decode(charset)
        except UnicodeDecodeError:
            handleerror("UnicodeDecodeError: encountered.", msg, charset)
        except AttributeError:
            handleerror("AttributeError: encountered", msg, charset)
    return body
# Fix: the assignment was commented out in the original while still being
# used below, which raises NameError. Point this at your own mbox file.
mboxfile = 'C:/Users/Username/Documents/Thunderbird/Data/profile/ImapMail/server.name/INBOX'
print(mboxfile)
for thisemail in mailbox.mbox(mboxfile):
    body = getbodyfromemail(thisemail)
    print(body[0:1000])
This script seems to return all messages correctly:
def getcharsets(msg):
    """Return the set of charsets declared anywhere in *msg*, skipping None."""
    return {cs for cs in msg.get_charsets() if cs is not None}
def getBody(msg):
    """Descend into the first subpart of each multipart level and return its
    payload, decoded with every charset found on that part."""
    while msg.is_multipart():
        msg = msg.get_payload()[0]
    payload = msg.get_payload(decode=True)
    for charset in getcharsets(msg):
        payload = payload.decode(charset)
    return payload
Former answer from acd often returns only some footer of the real message.
(
at least in the GMANE email messagens I am opening for this toolbox:
https://pypi.python.org/pypi/gmane
)
cheers