My Tech Life

Memo by a Japanese Software Developer in his late 50s.

Counting the frequency of specific terms in emails with Python.

Assuming there is a file

named "category_social.mbox" in the same folder,

the following Python script

counts the frequency of term occurrences

using CountVectorizer,

only for emails whose titles contain the word "フォロー" ("follow" in Japanese).

 

This serves as a preliminary step

for AI and machine learning processing.

 

While this example focuses on reading emails,

it can be applied in various ways,

such as reading files

or web scraping.

 

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

from sklearn.feature_extraction.text import CountVectorizer
from separatewordsj2 import separatewordsj2
from email.header import decode_header
import mailbox
import re
from bs4 import BeautifulSoup

# Path of the mbox file to scan; expected in the same folder as this script.
mboxfilename = 'category_social.mbox'
# def ----------------------------------------------------
def get_decoded_item(instr):
    """Decode an RFC 2047 encoded header value (e.g. Subject) into a str.

    Unlike a naive decode, this joins *every* decoded fragment, so headers
    that mix encoded words with plain ASCII text come back complete.
    Returns '' when the header is absent (None); on any other decoding
    failure the original input is returned unchanged (best effort).
    """
    if instr is None:
        # A missing header; return '' so callers can search/compare safely.
        return ''
    try:
        parts = []
        for raw, charset in decode_header(instr):
            if isinstance(raw, bytes):
                # charset is None for raw 8-bit data; utf-8 is a
                # best-effort guess, with replacement for bad bytes.
                parts.append(raw.decode(charset or 'utf-8', errors='replace'))
            else:
                parts.append(raw)
        return ''.join(parts)
    except Exception:
        # Malformed header: keep the raw text rather than crash.
        return instr
# def ----------------------------------------------------
def charset_decode(indata):
    """Decode a single (non-multipart) message part to text.

    Uses the part's declared charset, falling back to UTF-8 when
    no charset is declared.
    """
    encoding = indata.get_content_charset() or 'utf-8'
    return indata.get_payload(decode=True).decode(encoding)
# def ----------------------------------------------------
def get_content(payload):
    """Extract the readable text of a message body.

    Accepts a list of message parts (multipart), a plain str, or a
    single message part object; any HTML markup is stripped so only
    the visible text remains.
    """
    if isinstance(payload, list):
        # Multipart: decode every part and stitch them together.
        decoded_parts = [charset_decode(part) for part in payload]
        text = "\n".join(decoded_parts)
    elif isinstance(payload, str):
        text = payload
    else:
        text = charset_decode(payload)

    # Strip HTML tags, keeping only the text content.
    return BeautifulSoup(text, 'html.parser').get_text()
# main ----------------------------------------------------
# Collect the decoded body text of every message whose subject matches.
lines = []

mbox = mailbox.mbox(mboxfilename)
try:
    for message in mbox:
        # Subject may be absent; get_decoded_item may then return a
        # non-str, so guard before the regex search.
        subject = get_decoded_item(message['subject'])
        if subject and re.search(r'フォロー', subject):
            lines.append(get_content(message.get_payload()))
finally:
    # Always release the mbox lock/handle, even if a message is malformed.
    mbox.close()

if lines:
    # Tokenize with the project-specific Japanese word splitter and
    # build a term-frequency matrix (rows: messages, cols: vocabulary).
    wd = separatewordsj2()
    vec = CountVectorizer(analyzer=wd.extract_words)
    X = vec.fit_transform(lines)

    names_list = vec.get_feature_names_out()
    x_list = X.toarray()

    # First row: vocabulary; second row: counts for the first message.
    print(','.join(names_list))
    print(','.join(map(str, x_list[0])))
else:
    # fit_transform / x_list[0] would raise on an empty corpus.
    print('no matching messages found')