My Tech Life

Memo by a Japanese Software Developer in his late 50s.

Fetching emails in bulk using Python (further improved version).

I've modularized the previous version of the email fetching script,

focusing solely on message retrieval.

I omitted the decoding and content processing,

leaving those tasks to be handled in the main processing script if needed.

 

I've also added documentation for the module.

 

Save the following code as "mbox_reader.py" and use it:

 

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

"""
Module: mbox_reader.py

This module provides a class for reading messages from an mbox file.

Usage:
    from mbox_reader import MboxReader

    mboxfilename = 'category_social.mbox'
    with MboxReader(mboxfilename) as mbox:
        for i, obj in enumerate(mbox):
            # Your code here

Classes:
    MboxReader(filename)
        A class for reading messages from an mbox file.

"""

import email
from email.policy import default
from email.header import decode_header

class MboxReader:
    """Iterate over the messages stored in an mbox file.

    The file is opened in binary mode and read line by line. Every line
    that starts with ``b'From '`` is treated as a message separator; the
    separator line itself is discarded and the lines between two
    separators (or between the last separator and end of file) are parsed
    with ``email.message_from_bytes`` using the ``default`` policy.

    The reader is both a context manager and its own iterator, so it is
    consumed once, front to back:

        with MboxReader('mail.mbox') as mbox:
            for message in mbox:
                print(message.get('Subject'))

    Args:
        filename: Path of the mbox file to read.

    Raises:
        OSError: If the file cannot be opened.
    """

    def __init__(self, filename):
        # Binary mode: decoding is left to the email parser / the caller.
        self.handle = open(filename, 'rb')

    def __enter__(self):
        """Return the reader itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        """Close the underlying file; exceptions are not suppressed."""
        self.handle.close()

    def __iter__(self):
        """Return the reader itself; iteration state lives in the file position."""
        return self

    def __next__(self):
        """Return the next parsed message from the file.

        Returns:
            The message object produced by ``email.message_from_bytes``.

        Raises:
            StopIteration: When the end of the file has been reached and no
                further message is pending.
        """
        pending = []
        while True:
            line = self.handle.readline()
            at_eof = not line

            if not at_eof and not line.startswith(b'From '):
                # Ordinary content line: collect it without its line ending.
                pending.append(line.rstrip(b'\r\n'))
                continue

            # A separator line or end of file closes the current chunk.
            # Flushing at EOF as well is what keeps the final message of the
            # file from being silently dropped.
            if pending:
                message = email.message_from_bytes(
                    b'\n'.join(pending), policy=default
                )
                # A parsed message with no header fields is falsy and is
                # skipped rather than returned.
                if message:
                    return message
                pending = []

            if at_eof:
                raise StopIteration

if __name__ == "__main__":
    # Demo run: list the index, Date and Subject headers of the leading
    # messages in a sample mbox file.
    mboxfilename = 'category_social.mbox'
    with MboxReader(mboxfilename) as reader:
        for index, message in enumerate(reader):
            # Indices 0 through 100 are printed; stop at the first one beyond.
            if index > 100:
                break
            print(index, message.get('Date'), message.get('Subject'))

 

 

It can be called as follows:

 

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

from mbox_reader import MboxReader

# Print the index, Date and Subject headers of the leading messages in the
# mbox file, stopping once the index exceeds 100.
mboxfilename = 'category_social.mbox'
with MboxReader(mboxfilename) as reader:
    for index, message in enumerate(reader):
        if index > 100:
            break
        print(index, message.get('Date'), message.get('Subject'))