import termextract.janome
import termextract.mecab   # provides the IGNORE_WORDS list used below
import termextract.core
from pprint import pprint  # for pretty-printing the results in this sample
# Read the sample text (expects jpn_sample_s.txt in UTF-8)
with open("jpn_sample_s.txt", "r", encoding="utf-8") as f:
    text = f.read()
print(text)
janome has to be installed in advance (e.g. pip install janome).
from janome.tokenizer import Tokenizer
t = Tokenizer()
tokenize_text = t.tokenize(text)
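As an optional check (not part of the original sample), you can peek at what janome produces: each Token exposes surface and part_of_speech attributes. A separate short string is tokenized here, because tokenize() returns a generator in recent janome versions and iterating tokenize_text would consume it before it reaches termextract.

# Optional: inspect a few tokens from a throwaway string
for token in t.tokenize("自然言語処理の勉強"):
    print(token.surface, token.part_of_speech, sep="\t")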
# Count compound-noun candidates and their frequencies from the janome tokens
frequency = termextract.janome.cmp_noun_dict(tokenize_text)
pprint(frequency)
# Alternative: get the compound nouns as a list instead of a frequency dict
# term_list = termextract.janome.cmp_noun_list(tokenize_text)
# pprint(term_list)
# Calculate LR scores from the compound-noun frequencies
lr = termextract.core.score_lr(
    frequency,
    ignore_words=termextract.mecab.IGNORE_WORDS,
    lr_mode=1, average_rate=1)
pprint(lr)
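The LR score follows the idea of the FLR method (Nakagawa and Mori) that termextract builds on: for a compound noun made of nouns N1..NL, it is the geometric mean of (left count + 1) * (right count + 1) over the component nouns, and the importance computed next multiplies this by the compound noun's frequency. A rough illustrative sketch of that calculation, not termextract's internal code:

# Illustrative only: geometric mean of (FL+1)*(FR+1) over the component nouns
def flr_sketch(component_counts):
    product = 1.0
    for fl, fr in component_counts:
        product *= (fl + 1) * (fr + 1)
    return product ** (1.0 / (2 * len(component_counts)))

# e.g. a two-noun compound whose parts have (FL, FR) counts of (3, 1) and (0, 5)
print(flr_sketch([(3, 1), (0, 5)]))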
# Combine the frequencies and the LR scores into term-importance scores
term_imp = termextract.core.term_importance(frequency, lr)
pprint(term_imp)
For agglutinative languages, format the extracted compound nouns with termextract.core.modify_agglutinative_lang().
import collections
data_collection = collections.Counter(term_imp)
for cmp_noun, value in data_collection.most_common():
    print(termextract.core.modify_agglutinative_lang(cmp_noun), value, sep="\t")
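To keep the ranked list, the same loop can write a tab-separated file instead of printing; the output filename below is only an example, not part of pytermextract.

# Optional: write the ranked terms to a tab-separated file
with open("jpn_sample_terms.txt", "w", encoding="utf-8") as out:
    for cmp_noun, value in data_collection.most_common():
        print(termextract.core.modify_agglutinative_lang(cmp_noun),
              value, sep="\t", file=out)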