特定のメールの語句の出現頻度を集計する

同じフォルダに「category_social.mbox」があると仮定して、

件名(Subject)に「フォロー」という語が含まれているメールのみを対象に、

CountVectorizerで語句の出現頻度を集計するPython スクリプトです。

AIや機械学習処理の前準備になる部分です。

今回はメール読み込みですが、それを、

ファイル読み込みにしたり、Webスクレイピングにしたり、

いろいろと応用できます。

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-

from sklearn.feature_extraction.text import CountVectorizer
from separatewordsj2 import separatewordsj2
from email.header import decode_header
import mailbox
import re
from bs4 import BeautifulSoup

mboxfilename = 'category_social.mbox'
# def ----------------------------------------------------
def get_decoded_item(instr):
    """Decode a MIME encoded-word header value (RFC 2047) into text.

    Every fragment returned by ``decode_header`` is decoded and the
    results are concatenated, so headers that mix plain text with
    encoded words, or that use several charsets, are decoded in full
    rather than only their first fragment.

    Args:
        instr: Raw header value as returned by ``message['subject']``.
            May be ``None`` when the header is absent.

    Returns:
        The decoded header text. ``instr`` is returned unchanged when it
        is ``None`` or when it cannot be decoded (malformed encoded word,
        unknown charset name, unsupported input type).
    """
    # Function-scope import so this block does not depend on the file's
    # import section being extended.
    from email.errors import HeaderParseError

    if instr is None:
        # Header missing from the message: nothing to decode.
        return instr

    try:
        pieces = []
        for fragment, charset in decode_header(instr):
            if isinstance(fragment, bytes):
                # Fragments outside encoded words come back with charset
                # None; treat those as ASCII. Undecodable bytes become
                # U+FFFD instead of discarding the whole header.
                fragment = fragment.decode(charset or 'ascii', errors='replace')
            pieces.append(fragment)
    except (HeaderParseError, LookupError, TypeError, ValueError):
        # Best effort: keep the raw value rather than abort the caller.
        return instr

    return ''.join(pieces)
# def ----------------------------------------------------
def charset_decode(indata):
    """Return the text of a message part, decoded with its declared charset.

    The transfer encoding (base64 / quoted-printable) is undone by
    ``get_payload(decode=True)``; the resulting bytes are then decoded
    with the charset from the part's Content-Type header, falling back
    to UTF-8 when none is declared or the declared name is unknown to
    Python.

    Args:
        indata: A message-like object providing ``get_payload`` and
            ``get_content_charset`` (e.g. ``email.message.Message``).

    Returns:
        The decoded text. An empty string is returned when the part has
        no decodable payload (``get_payload(decode=True)`` returns
        ``None``, as it does for multipart containers). Bytes that are
        invalid in the chosen charset are replaced with U+FFFD instead
        of raising.
    """
    # Decode the transfer encoding once and reuse the bytes.
    raw = indata.get_payload(decode=True)
    if raw is None:
        return ''

    charset = indata.get_content_charset() or 'utf-8'
    try:
        return raw.decode(charset, errors='replace')
    except LookupError:
        # Charset label that Python has no codec for.
        return raw.decode('utf-8', errors='replace')
# def ----------------------------------------------------
def get_content(payload):
    """Extract plain text from a message payload, stripping HTML markup.

    Args:
        payload: The value returned by ``message.get_payload()``: either
            a ``str`` (single-part message), a ``list`` of message parts
            (multipart message), or a single message-like object.

    Returns:
        The text of the payload with HTML tags removed by BeautifulSoup.
        For a list, the decoded text parts are joined with newlines.
    """
    if isinstance(payload, str):
        markup = payload
    elif isinstance(payload, list):
        chunks = []
        for part in payload:
            # walk() also yields nested sub-parts, so structures such as
            # multipart/alternative inside multipart/mixed are reached.
            for leaf in part.walk():
                # Containers have no body of their own, and non-text
                # parts (images, attachments) are not meaningful input
                # for word counting.
                if leaf.is_multipart() or leaf.get_content_maintype() != 'text':
                    continue
                chunks.append(charset_decode(leaf))
        markup = "\n".join(chunks)
    else:
        markup = charset_decode(payload)

    return BeautifulSoup(markup, 'html.parser').get_text()
# main ----------------------------------------------------
# Collect the body text of every message whose subject contains the keyword.
lines = []
# create=False: this script only reads, so a missing file should raise
# instead of silently creating an empty mailbox (the default is create=True).
mbox = mailbox.mbox(mboxfilename, create=False)
try:
    for message in mbox:
        subject = get_decoded_item(message['subject'])
        # The subject can be missing (None) or left undecoded as a
        # non-str object; such messages cannot match and are skipped.
        if isinstance(subject, str) and re.search(r'フォロー', subject):
            lines.append(get_content(message.get_payload()))
finally:
    # Release the mailbox file even when a message fails to parse.
    mbox.close()

if not lines:
    # Fitting a vectorizer on zero documents fails with an unrelated-looking
    # error, so stop here with a clear message.
    raise SystemExit(
        'no message with a matching subject was found in ' + mboxfilename
    )

wd = separatewordsj2()
vec = CountVectorizer(analyzer=wd.extract_words)
X = vec.fit_transform(lines)

names_list = vec.get_feature_names_out()
# Only the first matching message is printed, so densify that single row
# rather than the whole document-term matrix.
# NOTE(review): the article describes an aggregate count; confirm whether
# per-message output for the first message only is what is wanted here.
first_row = X[0].toarray()[0]

print(','.join(names_list))
print(','.join(map(str, first_row)))

my tech life

a japanese software developer

特定のメールの語句の出現頻度を集計する