package TermExtract::ChainesPlainTextGB;

use strict;
use warnings;

use TermExtract::Calc_Imp;
use Exporter ();

our @ISA     = qw(TermExtract::Calc_Imp Exporter);
our @EXPORT  = qw();
our $VERSION = "0.31";

# ========================================================================
# Shared tables
#
# All text handling in this module is byte-oriented: there is no
# "use utf8" and no decoding step.  The character patterns below are the
# GB two-byte and four-byte forms, and the stop words are compared byte
# for byte, so they only match input that is stored in the same encoding
# as this source file.
# ========================================================================

# One multi-byte GB character (two-byte form, then four-byte form).
my $GB_MULTIBYTE =
    qr/[\x81-\xFE][\x40-\xFE]|[\x81-\xEF][\x30-\x39][\x81-\xEF][\x30-\x39]/;

# Stop words.  Each occurrence in the input is replaced by one space, which
# makes it a compound-word boundary.  ASCII punctuation is listed outside
# qw() because ',' and parentheses are awkward inside it.
my @STOP_WORDS = (
    ' ', "\t", ';', ',', ':', '(', ')',
    qw(
        不过难道 不知怎么 关于为了
        有时候 有一天 一会儿 那么样 怎么样 这么样 十二分
        以上 以下 以前 以后 以左 以右 以西 以南 以北 以内 以外
        之上 之下 之前 之后 之左 之右 之西 之南 之北 之内 之外 之中
        上面 下面 前面 后面 左面 右面 西面 南面 北面 内面 外面
        多少 何如 若何 奈何 何以 那儿 那里 那些 不知 早已 早就
        正在 将要 就要 立刻 然后 常常 一再 再三 一直 一向 向来
        忽然 偶然 仍旧 依然 十分 非常 格外 至于 由于 分外 更加
        相当 稍微 略微 有点 有些 十足 万分 千万 一定 必定 必然
        没有 不必 未必 究竟 到底 偏偏 索性 反正 大概 好在 几乎
        明明 向往 沿着 自从 除非 除了 按照 以及 而且 并且 或者
        不是 不但 与其 不如 但是 尽管 可是 如果 假如 即使 只有
        只要 所以 因此 既然 不管 况且 于是 何况 然而 只是 就是
        或是 是故 是以 而已 全部 今天 本月 昨天 上周 后天 明天
        一日 他日 日日 一旦 天天 年年 好久 片刻 之间
        上边 下边 前边 后边 右边 东边 西边 南边 北边 内边 外边 旁边
        东面 上头 下头 前头 后头 外头 里头
        什么 哪儿 哪里 几时 怎样 怎么 多么 这儿 这里 这么 这样
        那么 那样 这些 我们 咱们 你们 他们 甚么 已经 曾经 刚刚
        马上 顿时 终于 时常 时时 经常 始终 永远 暂时 渐渐 连忙
        总共 仅仅 异常 极其 比较 简直 幸亏 多亏 也许 大约 难道
        顺着 对于 为着 要么 宁可 虽然 倘若 哪怕 纵然 因为 不论
        罢了 呜呼 啊呀 啊哟 到处 有时 时而 俄顷 须臾 不下
        两 吨 头 间 种 页 对 边 哪 谁 连 让 为 岂 别 很 极 顶 尔 净
        总 噢 哼 才 顾 则 过 啊 呢 吗 啦 宁 长 你 她 您 朝 到 在 由
        并 没 而 暂 颇 还 我 他 它 吾 余 予 若 乃 其 之 彼 从 当 把
        跟 同 按 用 将 就 正 准 都 全 共 只 光 太 挺 最 更 几 何 又
        再 也 或 除 被 叫 受 比 及 既 亦 然 况 尚 乎 哉 耳 噫 呀 末
        矣 已 焉 久 少 地 得 不 勿 未 莫 休 能 会 可 必 足 着 竟 且
        即 俄 却 倒 上 下 前 后 里 旁 甚 略 今 昔 先 个 因 故 便 但
        惟 件 条 双 打 合 群 来 去 起 住 方 了 次 回 遭 声 口 笑 走
        点 多 的 是 有 等 于 以
        、 。 ” “ 《 》 〈 〉 「 」 『 』 【 】 〔 〕
    ),
);

# Lookup table, plus the distinct stop-word lengths (in bytes) sorted
# longest first so that e.g. a three-character stop word wins over the
# one-character stop word it starts with.
my %IS_STOP_WORD = map { $_ => 1 } @STOP_WORDS;
my @STOP_LENGTHS = do {
    my %seen;
    sort { $b <=> $a } grep { !$seen{$_}++ } map { length } @STOP_WORDS;
};

# ========================================================================
# get_noun_frq -- Get noun frequency.
#
# Overrides TermExtract::Calc_Imp::get_noun_frq.
#
# Arguments:
#   $data  input text, or the name of the file holding it
#   $mode  'var' when $data is the text itself; anything else (default 0)
#          means $data is a file name
#
# Returns a reference to a hash whose keys are compound words (single
# terms joined by one space) and whose values are their frequencies.
#
# Dies with "Can not open input file." when the file cannot be opened.
# ========================================================================
sub get_noun_frq {
    my $self = shift;
    my $data = shift;
    my $mode = shift || 0;

    my %cmp_noun_list;

    $self->IsAgglutinativeLang;        # words are not separated by spaces
    $self->IgnoreWords("和", "与");    # excluded from importance scoring

    # File input: slurp the whole file.  Three-argument open treats $data
    # strictly as a file name (no mode characters or pipes).
    if ($mode ne 'var') {
        open(my $in, '<', $data) or die "Can not open input file. $!";
        local $/ = undef;
        $data = <$in>;
        close $in;
    }
    $data = "" unless defined $data;

    # A newline always ends a compound word; inside a line the stop words
    # (turned into spaces by CutStopWords) are the boundaries.
    foreach my $line (split /\n/, $data) {
        foreach my $word (split /\s+/, CutStopWords($line)) {
            next if $word eq "";
            my $terms = cut_GB($word);

            # Test the list, not its first element, so that a compound
            # starting with the term "0" is still counted.
            $cmp_noun_list{ join ' ', @$terms }++ if @$terms;
        }
    }

    return \%cmp_noun_list;
}

# ========================================================================
# CutStopWords -- replace every stop word in a line with a single space.
#
# The line is scanned left to right.  At each position the longest stop
# word that starts there is consumed and a space is emitted; otherwise one
# character (ASCII byte, two-byte or four-byte GB character) is copied
# through unchanged.  One leading space is removed from the result.
#
# Stop words are matched as literal strings, so entries such as '(' and
# ')' need no regex escaping.  A byte that is not part of any recognised
# character is copied through as-is; the previous implementation consumed
# nothing in that case and never returned.
# ========================================================================
sub CutStopWords {
    my $word = shift;
    return "" unless defined $word;

    my $term = "";
    my $end  = length $word;
    my $pos  = 0;

  CHAR:
    while ($pos < $end) {
        foreach my $len (@STOP_LENGTHS) {
            next if $pos + $len > $end;
            next unless $IS_STOP_WORD{ substr($word, $pos, $len) };
            $term .= " ";
            $pos  += $len;
            next CHAR;
        }

        pos($word) = $pos;
        if ($word =~ /\G([\x00-\x7F]|$GB_MULTIBYTE)/gc) {
            $term .= $1;
            $pos = pos($word);
        }
        else {
            # Undecodable byte: keep it and move on so the scan always
            # makes progress.
            $term .= substr($word, $pos, 1);
            $pos++;
        }
    }

    $term =~ s/^ //;
    return $term;
}

# ========================================================================
# cut_GB -- split a GB-encoded string into single terms.
#
# Every multi-byte GB character becomes one term.  Consecutive ASCII bytes
# are kept together as one term.  Returns a reference to the term list.
#
# An ASCII run at the end of the string is flushed as a term too; before,
# it was pushed only when a multi-byte character followed it, so "XYabc"
# lost "abc" and an all-ASCII word produced no terms at all.  A byte that
# fits no character pattern is skipped without producing a term.
# ========================================================================
sub cut_GB {
    my $word = shift;
    $word = "" unless defined $word;

    my @terms;
    my $ascii = "";
    my $end   = length $word;
    my $pos   = 0;

    while ($pos < $end) {
        pos($word) = $pos;
        if ($word =~ /\G([\x00-\x7F]+)/gc) {
            $ascii .= $1;
            $pos = pos($word);
        }
        elsif ($word =~ /\G($GB_MULTIBYTE)/gc) {
            if ($ascii ne "") {
                push @terms, $ascii;
                $ascii = "";
            }
            push @terms, $1;
            $pos = pos($word);
        }
        else {
            $pos++;    # undecodable byte: drop it
        }
    }
    push @terms, $ascii if $ascii ne "";

    return \@terms;
}

1;

__END__

=head1 NAME

    TermExtract::ChainesPlainTextGB
                -- technical term extraction module (Chinese, GB encoding)

=head1 SYNOPSIS

    use TermExtract::ChainesPlainTextGB;

=head1 DESCRIPTION

    Extracts technical terms directly from plain Chinese text.

    For how to use this module, see the parent class
    (TermExtract::Calc_Imp) or the sample script below.

=head2 Sample Script

    #!/usr/local/bin/perl -w

    #
    #  ex_CPT_GB.pl
    #
    #  Prints technical terms and their importance to standard output.
    #
    #  version 0.07
    #

    use TermExtract::ChainesPlainTextGB;
    #use strict;
    my $data = TermExtract::ChainesPlainTextGB->new;
    my $InputFile = "ChainesPlainText_out.txt";    # input file

    # Handler for abnormal process termination
    # (only needed when a lock directory is used)
    $SIG{INT} = $SIG{QUIT} = $SIG{TERM} = 'sigexit';

    # Output mode
    # 1 -> term + importance, 2 -> term only
    # 3 -> comma separated
    my $output_mode = 1;

    #
    # Whether the frequency of a term in the document is used when
    # calculating importance (default: used, $obj->use_frq)
    #
    #$data->no_frq;     # do not use the in-document frequency
    #$data->use_frq;    # use the in-document frequency

    #
    # Whether importance uses the total count or the distinct count of
    # connected words (default: total count, $obj->use_total)
    #
    #$data->use_total;      # total count
    #$data->use_uniq;       # distinct count

    #
    # Whether the learning function is used when calculating importance
    # (default: used, $obj->use_stat)
    #
    #$data->use_stat;       # use the learning function
    #$data->no_stat;        # do not use the learning function

    #
    # Balance between "frequency of the term in the document" and
    # "importance of the connected words".
    # The default is 1; larger values give more weight to the
    # in-document frequency.
    #
    #$data->average_rate(0.5);

    #
    # Whether data is accumulated in the learning DB.
    # It is safer to enable this when the learning function is used:
    # processing does not work correctly if the input contains words
    # that are not registered in the learning DB.
    # (default: accumulate, $obj->use_storage)
    #
    #$data->use_storage;    # accumulate
    #$data->no_storage;     # do not accumulate

    #
    # Use SDBM_File as the DBM for the learning DB
    # (default: DB_File in BTREE mode)
    #
    #$data->use_SDBM;

    #
    # File names of the databases holding cumulative statistics of
    # past documents (defaults: "stat.db" and "comb.db")
    #
    #$data->stat_db("stat.db");
    #$data->comb_db("comb.db");

    #
    # Temporary directory used for exclusive locking of the databases.
    # With an empty name (the default) no locking is done.
    #
    #$data->lock_dir("lock_dir");

    #
    # Read the data and return the list of technical terms as an array
    # (cumulative statistics DB and in-document frequency enabled)
    #
    #my @noun_list = $data->get_imp_word($str, 'var');     # input is a variable
    my @noun_list = $data->get_imp_word($InputFile);       # input is a file

    #
    # Re-use the previously read text with a different mode and return
    # the term list as an array
    #$data->use_stat->no_frq;
    #my @noun_list2 = $data->get_imp_word();
    # ... and combine that result with the result of another mode
    #@noun_list = $data->result_filter (\@noun_list, \@noun_list2, 30, 1000);

    #
    # Print the term list and the calculated importance
    #
    foreach (@noun_list) {
       # do not show terms that are numbers only
       next if $_->[0] =~ /^\d+$/;

       # do not show terms that are a single (GB) character
       next if $_->[0] =~ /^[\x00-\x7F]$/;
       next if $_->[0] =~ /^[\x81-\xFE][\x40-\xFE]$/;
       next if $_->[0] =~ /^[\x81-\xEF][\x30-\x39][\x81-\xEF][\x30-\x39]$/;

       # print the result in the format selected by $output_mode
       printf "%-60s %16.2f\n", $_->[0], $_->[1] if $output_mode == 1;
       printf "%s\n",           $_->[0]          if $output_mode == 2;
       printf "%s,",            $_->[0]          if $output_mode == 3;
    }

=head1 Methods

    The only method defined in this file is get_noun_frq, which overrides
    the method of the same name in the parent module
    TermExtract::Calc_Imp.  It uses stop words to split the text into
    compound-word units and counts them.

    get_imp_word, the method called in the sample script, is not defined
    in this file; it and all other methods come from
    TermExtract::Calc_Imp.  See the POD of TermExtract::Calc_Imp for them.

=head2 get_imp_word

    Chinese text is turned into compound words by the rules below.  The
    first argument is the data to process and the second argument tells
    what kind of data it is.  By default the first argument is the name
    of a text file with Chinese text.  When the second argument is the
    string 'var', the first argument is taken to be a scalar holding the
    Chinese text itself.

    1. The text is split into compound words as follows

      1) A line break ends a compound word.

      2) A stop word ends a compound word.  Inside a compound word each
         multi-byte character is one term and a run of ASCII characters
         is one term.  The stop words are a space, a tab, the ASCII
         characters  ; , : ( )  and the following:

        不过难道 不知怎么 关于为了
        有时候 有一天 一会儿 那么样 怎么样 这么样 十二分
        以上 以下 以前 以后 以左 以右 以西 以南 以北 以内 以外
        之上 之下 之前 之后 之左 之右 之西 之南 之北 之内 之外 之中
        上面 下面 前面 后面 左面 右面 西面 南面 北面 内面 外面
        多少 何如 若何 奈何 何以 那儿 那里 那些 不知 早已 早就
        正在 将要 就要 立刻 然后 常常 一再 再三 一直 一向 向来
        忽然 偶然 仍旧 依然 十分 非常 格外 至于 由于 分外 更加
        相当 稍微 略微 有点 有些 十足 万分 千万 一定 必定 必然
        没有 不必 未必 究竟 到底 偏偏 索性 反正 大概 好在 几乎
        明明 向往 沿着 自从 除非 除了 按照 以及 而且 并且 或者
        不是 不但 与其 不如 但是 尽管 可是 如果 假如 即使 只有
        只要 所以 因此 既然 不管 况且 于是 何况 然而 只是 就是
        或是 是故 是以 而已 全部 今天 本月 昨天 上周 后天 明天
        一日 他日 日日 一旦 天天 年年 好久 片刻 之间
        上边 下边 前边 后边 右边 东边 西边 南边 北边 内边 外边 旁边
        东面 上头 下头 前头 后头 外头 里头
        什么 哪儿 哪里 几时 怎样 怎么 多么 这儿 这里 这么 这样
        那么 那样 这些 我们 咱们 你们 他们 甚么 已经 曾经 刚刚
        马上 顿时 终于 时常 时时 经常 始终 永远 暂时 渐渐 连忙
        总共 仅仅 异常 极其 比较 简直 幸亏 多亏 也许 大约 难道
        顺着 对于 为着 要么 宁可 虽然 倘若 哪怕 纵然 因为 不论
        罢了 呜呼 啊呀 啊哟 到处 有时 时而 俄顷 须臾 不下
        两 吨 头 间 种 页 对 边 哪 谁 连 让 为 岂 别 很 极 顶 尔 净
        总 噢 哼 才 顾 则 过 啊 呢 吗 啦 宁 长 你 她 您 朝 到 在 由
        并 没 而 暂 颇 还 我 他 它 吾 余 予 若 乃 其 之 彼 从 当 把
        跟 同 按 用 将 就 正 准 都 全 共 只 光 太 挺 最 更 几 何 又
        再 也 或 除 被 叫 受 比 及 既 亦 然 况 尚 乎 哉 耳 噫 呀 末
        矣 已 焉 久 少 地 得 不 勿 未 莫 休 能 会 可 必 足 着 竟 且
        即 俄 却 倒 上 下 前 后 里 旁 甚 略 今 昔 先 个 因 故 便 但
        惟 件 条 双 打 合 群 来 去 起 住 方 了 次 回 遭 声 口 笑 走
        点 多 的 是 有 等 于 以
        、 。 ” “ 《 》 〈 〉 「 」 『 』 【 】 〔 〕

    2. The following words are ignored when importance is calculated

       和 与

=head1 SEE ALSO

    TermExtract::Calc_Imp
    TermExtract::Chasen
    TermExtract::MeCab
    TermExtract::BrillsTagger
    TermExtract::EnglishPlainText
    TermExtract::ChainesPlainTextUC

=head1 COPYRIGHT

    This program was written by Akira Maeda (前田朗,
    maeda@lib.u-tokyo.ac.jp) of the University of Tokyo, based on the
    idea for English technical term extraction by Professor Hiroshi
    Nakagawa (中川裕志) of the University of Tokyo.

    The stop words were tuned by Hiroyuki Kojima (小島浩之,
    kojima@e.u-tokyo.ac.jp) of the University of Tokyo.

    The authors accept no responsibility whatsoever for any result of
    using this program.

=cut