import termextract.janome
import termextract.core
from pprint import pprint # Used to pretty-print the processing results in this sample
import dbm
# Flag "n" always creates a new, empty database, opened for reading and writing.
DF = dbm.open("df", "n")
from janome.tokenizer import Tokenizer
# Tokenizer instance used below to tokenize the input texts.
t = Tokenizer()
# Reference Wikipedia entries: 「人工知能」( https://ja.wikipedia.org/wiki/%E4%BA%BA%E5%B7%A5%E7%9F%A5%E8%83%BD ) / 「自然言語処理」( https://ja.wikipedia.org/wiki/%E8%87%AA%E7%84%B6%E8%A8%80%E8%AA%9E%E5%87%A6%E7%90%86 ) / 「専門用語」( https://ja.wikipedia.org/wiki/%E5%B0%82%E9%96%80%E7%94%A8%E8%AA%9E )
# Feed every sample document into the DF database opened above.
input_files = ["jpn_sample_s.txt", "jpn_sample2_s.txt", "jpn_sample3_s.txt"]
try:
    for file in input_files:
        # The context manager closes the file even if reading fails.
        with open(file, "r", encoding="utf-8") as f:
            text = f.read()
        tokenize_text = t.tokenize(text)
        frequency = termextract.janome.cmp_noun_dict(tokenize_text)
        # Record this document's result in the shared dbm handle.
        termextract.core.store_df(frequency, dbm=DF)
finally:
    # Always release the database, even when one of the files fails, so the
    # later read-only reopen of "df" does not find a dangling writer handle.
    DF.close()
# Score the terms of the first sample document against the stored DF data.
# `with` guarantees the file handle is actually closed after reading.
with open("jpn_sample_s.txt", "r", encoding="utf-8") as f:
    text = f.read()
tokenize_text = t.tokenize(text)
frequency = termextract.janome.cmp_noun_dict(tokenize_text)
pprint(frequency)

# TF values derived from `frequency` by termextract.core.frequency2tf.
tf = termextract.core.frequency2tf(frequency)
pprint(tf)

# Reopen the "df" database read-only; the context manager closes it even
# if get_idf raises.
with dbm.open("df", "r") as DF:
    idf = termextract.core.get_idf(frequency, dbm=DF)

# Combine TF and IDF into the final per-term importance.
term_imp = termextract.core.term_importance(tf, idf)
pprint(term_imp)
# 膠着言語の場合は、termextract.core.modify_agglutinative_lang()で整形をする (for agglutinative languages, tidy each term with termextract.core.modify_agglutinative_lang())
import collections

# Wrap the importance mapping in a Counter so most_common() can list every
# (term, score) pair from the highest score to the lowest.
data_collection = collections.Counter(term_imp)
ranked_terms = data_collection.most_common()
for cmp_noun, value in ranked_terms:
    # Tidy the term for display, then emit one tab-separated line per term.
    display_term = termextract.core.modify_agglutinative_lang(cmp_noun)
    print(display_term, value, sep="\t")