My Tech Life

Memo by a Japanese Software Developer in his late 50s.

Counting the frequency of specific terms in emails with Python.

Assuming there is a file

named "category_social.mbox" in the same folder,

the following Python script

counts the frequency of term occurrences

using CountVectorizer,

only for emails whose titles contain the word "フォロー" ("follow" in Japanese).

 

This serves as a preliminary step

for AI and machine learning processing.

 

While this example focuses on reading emails,

it can be applied in various ways,

such as reading files

or web scraping.

 

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

from sklearn.feature_extraction.text import CountVectorizer
from separatewordsj2 import separatewordsj2
from email.header import decode_header
import mailbox
import re
from bs4 import BeautifulSoup

# Path of the mbox file to scan; expected in the same folder as this script.
mboxfilename = 'category_social.mbox'
# def ----------------------------------------------------
def get_decoded_item(instr):
    """Decode an RFC 2047 encoded header value (e.g. Subject) into a str.

    Unlike a naive decode, this joins *every* decoded fragment, so headers
    that mix encoded words with plain ASCII text come back complete.
    Returns '' when the header is absent (None); on any other decoding
    failure the original input is returned unchanged (best effort).
    """
    if instr is None:
        # A missing header; return '' so callers can search/compare safely.
        return ''
    try:
        parts = []
        for raw, charset in decode_header(instr):
            if isinstance(raw, bytes):
                # charset is None for raw 8-bit data; utf-8 is a
                # best-effort guess, with replacement for bad bytes.
                parts.append(raw.decode(charset or 'utf-8', errors='replace'))
            else:
                parts.append(raw)
        return ''.join(parts)
    except Exception:
        # Malformed header: keep the raw text rather than crash.
        return instr
# def ----------------------------------------------------
def charset_decode(indata):
    """Decode a single (non-multipart) message part to text.

    Uses the part's declared charset, falling back to UTF-8 when
    no charset is declared.
    """
    encoding = indata.get_content_charset() or 'utf-8'
    return indata.get_payload(decode=True).decode(encoding)
# def ----------------------------------------------------
def get_content(payload):
    """Extract the readable text of a message body.

    Accepts a list of message parts (multipart), a plain str, or a
    single message part object; any HTML markup is stripped so only
    the visible text remains.
    """
    if isinstance(payload, list):
        # Multipart: decode every part and stitch them together.
        decoded_parts = [charset_decode(part) for part in payload]
        text = "\n".join(decoded_parts)
    elif isinstance(payload, str):
        text = payload
    else:
        text = charset_decode(payload)

    # Strip HTML tags, keeping only the text content.
    return BeautifulSoup(text, 'html.parser').get_text()
# main ----------------------------------------------------
# Collect the decoded body text of every message whose subject matches.
lines = []

mbox = mailbox.mbox(mboxfilename)
try:
    for message in mbox:
        # Subject may be absent; get_decoded_item may then return a
        # non-str, so guard before the regex search.
        subject = get_decoded_item(message['subject'])
        if subject and re.search(r'フォロー', subject):
            lines.append(get_content(message.get_payload()))
finally:
    # Always release the mbox lock/handle, even if a message is malformed.
    mbox.close()

if lines:
    # Tokenize with the project-specific Japanese word splitter and
    # build a term-frequency matrix (rows: messages, cols: vocabulary).
    wd = separatewordsj2()
    vec = CountVectorizer(analyzer=wd.extract_words)
    X = vec.fit_transform(lines)

    names_list = vec.get_feature_names_out()
    x_list = X.toarray()

    # First row: vocabulary; second row: counts for the first message.
    print(','.join(names_list))
    print(','.join(map(str, x_list[0])))
else:
    # fit_transform / x_list[0] would raise on an empty corpus.
    print('no matching messages found')