I've modularized the previous version of the email fetching script,
focusing solely on message retrieval.
I omitted the decoding and content processing,
leaving those tasks to be handled in the main processing script if needed.
I've also added documentation for the module.
Save the following code as "mbox_reader.py" and use it:
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""
Module: mbox_reader.py

This module provides a class for reading messages from an mbox file.

Usage:
    from mbox_reader import MboxReader

    mboxfilename = 'category_social.mbox'
    with MboxReader(mboxfilename) as mbox:
        for i, obj in enumerate(mbox):
            # Your code here

Classes:
    MboxReader(filename)
        A class for reading messages from an mbox file.
"""
import email
from email.policy import default
from email.header import decode_header


class MboxReader:
    """Iterate over the messages stored in an mbox file.

    The file is opened in binary mode and scanned line by line. Every line
    that starts with ``b'From '`` is treated as a message separator; the
    lines collected between two separators are parsed with
    ``email.message_from_bytes`` using the ``default`` policy.

    The reader is both a context manager (the file is closed on exit) and
    its own iterator, so it can only be traversed once.

    Args:
        filename: Path of the mbox file to read.
    """

    def __init__(self, filename):
        # Binary mode: decoding is left to the email parser / the caller.
        self.handle = open(filename, 'rb')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.handle.close()

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next parsed message.

        Returns:
            The message built from the lines preceding the next ``From ``
            separator, or from the remaining lines when the end of the
            file is reached.

        Raises:
            StopIteration: When the file is exhausted and no pending
                message is left to return.
        """
        lines = []
        while True:
            line = self.handle.readline()
            if line and not line.startswith(b'From '):
                # Line endings are normalised to '\n' when re-joined below.
                lines.append(line.rstrip(b'\r\n'))
                continue

            # Reached a separator or EOF: parse whatever was collected.
            message = email.message_from_bytes(b'\n'.join(lines), policy=default)
            # A message without any header has length 0 and is falsy; this
            # skips the empty chunk before the first separator.
            if message:
                return message
            if not line:
                # EOF and nothing pending. Previously EOF always stopped
                # here, silently dropping the last message of the file.
                raise StopIteration
            lines = []


if __name__ == "__main__":
    mboxfilename = 'category_social.mbox'
    with MboxReader(mboxfilename) as mbox:
        for i, obj in enumerate(mbox):
            if i > 100:
                break
            date = obj.get('Date')
            subject = obj.get('Subject')
            print(i, date, subject)
It can be called as follows:
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Example caller: print the date and subject of the first messages
# (indices 0 through 100) found in an mbox file.
from mbox_reader import MboxReader

mboxfilename = 'category_social.mbox'
with MboxReader(mboxfilename) as mbox:
    for i, obj in enumerate(mbox):
        # Stop once index 100 has been printed.
        if i > 100:
            break
        date = obj.get('Date')
        subject = obj.get('Subject')
        print(i, date, subject)