I've modularized the previous version of the email-fetching script
so that it focuses solely on message retrieval.
Decoding and content processing are intentionally omitted;
handle those tasks in the main processing script if needed.
I've also added documentation for the module.
Save the following code as "mbox_reader.py":
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""
Module: mbox_reader.py

This module provides a class for reading messages from an mbox file.

Usage:
    from mbox_reader import MboxReader

    mboxfilename = 'category_social.mbox'
    with MboxReader(mboxfilename) as mbox:
        for i, obj in enumerate(mbox):
            # Your code here

Classes:
    MboxReader(filename)
        A class for reading messages from an mbox file.
"""
import email
from email.policy import default
from email.header import decode_header
class MboxReader:
    """Iterate over the messages stored in an mbox file.

    The file is opened in binary mode and split on lines that start with
    ``b'From '``. Each chunk between two separators is parsed with
    ``email.message_from_bytes`` using ``email.policy.default``. No
    decoding or content processing is done here, and ``>From `` escaped
    body lines are passed through unchanged.

    The object is its own iterator and a context manager; leaving the
    ``with`` block closes the underlying file.
    """

    def __init__(self, filename):
        """Open *filename* for binary reading.

        Args:
            filename: Path of the mbox file to read.

        Raises:
            OSError: If the file cannot be opened.
        """
        self.handle = open(filename, 'rb')

    def __enter__(self):
        """Return the reader itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        """Close the file; exceptions from the ``with`` body propagate."""
        self.handle.close()

    def __iter__(self):
        """Return the reader itself; it is a single-pass iterator."""
        return self

    def __next__(self):
        """Return the next parsed message.

        Returns:
            The message object built from the lines between two
            ``From `` separator lines, or between the last separator and
            the end of the file.

        Raises:
            StopIteration: When the file is exhausted and no message is
                pending.
        """
        lines = []
        while True:
            line = self.handle.readline()
            at_eof = not line
            if not at_eof and not line.startswith(b'From '):
                lines.append(line.rstrip(b'\r\n'))
                continue
            # A separator line or the end of the file ends the current
            # chunk. Flushing at EOF as well is what keeps the final
            # message in the file from being dropped.
            message = email.message_from_bytes(b'\n'.join(lines), policy=default)
            # A message is falsy when it has no header fields, e.g. the
            # empty chunk before the very first separator; skip those.
            if message:
                return message
            if at_eof:
                # Later calls land here again: readline() keeps returning
                # b'' at EOF and there are no pending lines.
                raise StopIteration
            lines = []
if __name__ == "__main__":
    # Demo run: list the index, Date and Subject headers of the leading
    # messages in a sample mbox file.
    mboxfilename = 'category_social.mbox'
    with MboxReader(mboxfilename) as mbox:
        for index, message in enumerate(mbox):
            # Stop once the index passes 100.
            if index > 100:
                break
            print(index, message.get('Date'), message.get('Subject'))
The module can then be imported and called from another script as follows:
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
from mbox_reader import MboxReader
# Print the index, Date and Subject headers of the leading messages.
mboxfilename = 'category_social.mbox'
with MboxReader(mboxfilename) as mbox:
    for position, msg in enumerate(mbox):
        # Stop once the index passes 100.
        if position > 100:
            break
        msg_date = msg.get('Date')
        msg_subject = msg.get('Subject')
        print(position, msg_date, msg_subject)