# Assumes a file named "category_social.mbox" exists in the same folder.
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
from sklearn.feature_extraction.text import CountVectorizer
from separatewordsj2 import separatewordsj2
from email.header import decode_header
import mailbox
import re
from bs4 import BeautifulSoup

# Mailbox file exported from Gmail (category: social), expected next to this script.
mboxfilename = 'category_social.mbox'
# def ----------------------------------------------------
def get_decoded_item(instr):
    """Decode a MIME (RFC 2047) encoded header value to str.

    Returns the input unchanged when it is not MIME-encoded or when
    decoding fails (best-effort, mirrors the original silent fallback).
    """
    outstr = instr
    try:
        raw, charset = decode_header(instr)[0]
        if isinstance(raw, bytes):
            # decode_header may report charset=None; fall back to UTF-8.
            outstr = raw.decode(charset or 'utf-8')
    except Exception:
        # Malformed headers: keep the raw input rather than crash.
        pass
    return outstr
# def ----------------------------------------------------
def charset_decode(indata):
    """Decode a message part's payload to str.

    Uses the part's declared content charset when present,
    otherwise falls back to UTF-8.
    """
    charset = indata.get_content_charset()
    return indata.get_payload(decode=True).decode(charset or 'utf-8')
# def ----------------------------------------------------
def get_content(payload):
    """Flatten a message payload to plain text.

    payload may be a str, a single message part, or a list of parts
    (multipart). HTML markup is stripped with BeautifulSoup.
    """
    if isinstance(payload, list):
        # Multipart: decode each part and join with newlines.
        ret = "\n".join(charset_decode(part) for part in payload)
    elif isinstance(payload, str):
        ret = payload
    else:
        ret = charset_decode(payload)
    soup = BeautifulSoup(ret, 'html.parser')
    return soup.get_text()
# main ----------------------------------------------------
lines = []
mbox = mailbox.mbox(mboxfilename)
for i, message in enumerate(mbox):
    # if i > 100: break  # uncomment to limit messages while debugging
    # Subject may be absent -> message['subject'] is None; guard with ''.
    temp_title = get_decoded_item(message['subject'] or '')
    if re.search(r'フォロー', temp_title):
        lines.append(get_content(message.get_payload()))
mbox.close()

# Tokenize (Japanese word segmentation) and build a bag-of-words matrix.
wd = separatewordsj2()
vec = CountVectorizer(analyzer=wd.extract_words)
X = vec.fit_transform(lines)
names_list = vec.get_feature_names_out()
x_list = X.toarray()
print(','.join(names_list))
print(','.join(map(str, x_list[0])))