import termextract.mecab
import termextract.core
from pprint import pprint # このサンプルでの処理結果の整形表示のため
import dbm
# Accumulate statistics from several MeCab-tagged sample files into a dbm
# database via termextract.core.store_lr().
# The database name must match the one opened for scoring later in this
# script ("termextract"); flag "n" always creates a new, empty database.
# The `with` blocks guarantee the files and the database are really closed
# (and therefore flushed to disk) before the database is reopened.
with dbm.open("termextract", "n") as db:
    input_files = ["mecab_out_sample.txt", "mecab_out_sample2.txt", "mecab_out_sample3.txt"]
    for file in input_files:
        with open(file, "r", encoding="utf-8") as f:
            tagged_text = f.read()
        # Compound-noun frequency table for this file, stored into the db.
        frequency = termextract.mecab.cmp_noun_dict(tagged_text)
        termextract.core.store_lr(frequency, dbm=db)
# Score the compound nouns of a single document against the statistics
# previously stored in the "termextract" dbm database.
with open("mecab_out_sample.txt", "r", encoding="utf-8") as f:
    tagged_text = f.read()
frequency = termextract.mecab.cmp_noun_dict(tagged_text)
pprint(frequency)

# Open the database read-only; the `with` block closes it as soon as the
# scores have been computed.
with dbm.open("termextract", "r") as db:
    LR = termextract.core.score_lr(
        frequency,
        ignore_words=termextract.mecab.IGNORE_WORDS,
        lr_mode=1,
        average_rate=1,
        dbm=db,
    )
pprint(LR)

# Combine the frequency table with the LR scores.
term_imp = termextract.core.term_importance(frequency, LR)
pprint(term_imp)
# For agglutinative languages, tidy each compound noun with
# termextract.core.modify_agglutinative_lang() before printing.
import collections

# Counter.most_common() yields (term, score) pairs from highest to lowest.
data_collection = collections.Counter(term_imp)
for cmp_noun, value in data_collection.most_common():
    print(termextract.core.modify_agglutinative_lang(cmp_noun), value, sep="\t")