When doing Japanese natural language processing in Python,
I borrowed wisdom from more experienced bloggers
and managed to put together a tokenization process using MeCab.
I'll paste the source code at the end of this article.
As an FYI for this blog, note that I added "self.tagger.parse("")" to the sample code.
Without this call, undefined analysis results can get mixed in.
You can run the source code.
It tokenizes the phrases in the data list after "if __name__ == "__main__":".
(base) C:\PyTest\jobs_new2>python separatewordsj2.py
When you want to use CountVectorizer with Japanese tokenization,
you would use it like this.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from sklearn.feature_extraction.text import CountVectorizer

from separatewordsj2 import separatewordsj2

# Plug the Japanese tokenizer in as CountVectorizer's analyzer so that
# documents are split by extract_words instead of the default whitespace rule.
wd = separatewordsj2()
vec = CountVectorizer(analyzer=wd.extract_words)
Save the source code below as "separatewordsj2.py"
# -*- coding: utf-8 -*-
import MeCab
class separatewordsj2:
    """Tokenize Japanese text with MeCab, keeping only selected parts of speech.

    Each kept token is reduced to its dictionary (root) form when MeCab
    provides one; otherwise the surface form is used.
    """

    # Indices into the comma-separated feature string MeCab attaches to a node.
    INDEX_CATEGORY = 0    # part-of-speech category
    INDEX_ROOT_FORM = 6   # dictionary (root) form; "*" when unknown
    # Parts of speech to keep: particle, noun, verb, adjective, adverb,
    # adnominal, interjection, auxiliary verb.
    TARGET_CATEGORIES = ["助詞", "名詞", "動詞", "形容詞", "副詞",
                         "連体詞", "感動詞", "助動詞"]

    def __init__(self, dictionary="-Ochasen"):
        """Create a MeCab tagger.

        :param dictionary: option string passed to MeCab.Tagger
                           (default "-Ochasen").
        """
        self.dictionary = dictionary
        self.tagger = MeCab.Tagger(self.dictionary)
        # Work around a known MeCab Python-binding bug: without this dummy
        # parse, the first parseToNode call can yield garbled surfaces.
        self.tagger.parse("")

    def extract_words(self, text):
        """Return a list of root-form tokens extracted from *text*.

        Only tokens whose POS category is in TARGET_CATEGORIES are kept.
        An empty or falsy *text* yields an empty list.
        """
        if not text:
            return []
        words = []
        node = self.tagger.parseToNode(text)
        while node:
            features = node.feature.split(",")
            if features[self.INDEX_CATEGORY] in self.TARGET_CATEGORIES:
                # Guard against short feature vectors (some unknown words
                # carry fewer fields than INDEX_ROOT_FORM + 1).
                if len(features) > self.INDEX_ROOT_FORM:
                    root = features[self.INDEX_ROOT_FORM]
                else:
                    root = "*"
                # Fall back to the surface form when no root form is known.
                words.append(node.surface if root == "*" else root)
            node = node.next
        return words
if __name__ == "__main__":
    # Smoke test: tokenize a few sample phrases and print each token.
    data = [
        "オブジェクト指向言語を使用する",
        "蛙の「子」は変える",
        """私の名前!は
山田田中と申します""",
        "あれは●終わってしまった",
    ]
    wd = separatewordsj2()
    for d in data:
        ret = wd.extract_words(d)
        for r in ret:
            print(r)