package TermExtract::Chasen;
use TermExtract::Calc_Imp;

use strict;
use Exporter ();
use Encode 'from_to';
use vars qw(@ISA $VERSION @EXPORT);

@ISA = qw(TermExtract::Calc_Imp Exporter);
@EXPORT = qw();
$VERSION = "2.17";

# ========================================================================
# get_noun_frq -- Get noun frequency.
#                 The values of the hash are frequency of the noun.
# ѸȤ٤륵֥롼
#
#  Over-write TermExtract::Calc_Imp::get_noun_frq
#
# ========================================================================
sub get_noun_frq {
    my $self = shift;
    my $data = shift;           # ϥǡ
    my $mode = shift || 0;      # ϥǡե뤫ѿμѥե饰
    my %cmp_noun_list = ();     # ʣپ줿ϥåʴؿ͡
    my @input = ();             # ǲϷ̤
    my $must  = 0;              # θ줬̾ǤʤФʤʤϿ
    my @terms = ();             # ʣꥹȺѤκ
    my @unknown = ();           # ̤θѺѿ
    my @alphabet = ();          # ե٥åѺѿ

    $self->IsAgglutinativeLang; # ñ֣ʤ

    # ѸꥹȤءɲä륵֥롼
    my $add = sub {
        my $terms         = shift;
        my $cmp_noun_list = shift;

        # Ƭפʸκ
        if (defined $terms->[0]) {
            shift @$terms if $terms->[0] eq '';
        }
        # ;ʬʸκ
        if (defined $terms->[0]) {
            my $end = $terms->[$#$terms];
            if ( $end eq 'ʤ'  || $end eq ''   || $end eq ''       || 
                 $end eq ''    || $end eq ''   || $end eq ''       ||
                 $end eq ''    || $end eq ''   || $end eq ''       ||
                 $end =~ /^\s+$/ || $must) 
                { pop @$terms }
        }
        # convert eucjp to utf-8.
        foreach my $elem (@$terms) {
            Encode::from_to($elem, 'EUC-JP', 'UTF-8');
        }
        $cmp_noun_list->{ join ' ', @$terms }++ if defined $terms->[0];
        @$terms  = ();
    };

    # Ϥեξ
    if ($mode ne 'var') {
        local($/) = undef;
        open (IN, $data) || die "Can not open input file. $!";
        $data = <IN>;
        close IN;
    }
    # convert utf-8 to eucjp.
    Encode::from_to($data, 'UTF-8', 'EUC-JP');

    # ñ̾Ϣ
    foreach my $morph ((split "\n", $data)) {
        chomp $morph;
	    my ($noun, $part_of_speach) = (split(/\t/, $morph))[0,3];
        $part_of_speach = "" unless defined $part_of_speach;  # ʻ

        # 桦ͤǶڤ줿̤θפϡĤΤޤȤޤˤƤ
        #     ե٥å   \x41-\x5A, \x61-\x7A
        if ($part_of_speach eq '̤θ' & $noun !~ /^[\(\)\[\]\<\>|\"\'\;\,]/) {
            if (@unknown) {
                # ̤θפ桦ͤǷӤĤʤ
                unless ($unknown[$#unknown] =~ /[\x41-\x5A|\x61-\x7A]$/ &
                       $noun =~ /^[\x41-\x5A|\x61-\x7A]/) {
                    push @unknown, $noun;  # ̤θפҤȤޤȤˤ
                    next;
                }
            }
            else {
                push @unknown, $noun;
                next;
            }
        }
        # ̤θפκǸ夬ʤ
        while (@unknown) {
            if ($unknown[$#unknown] =~ /^[\x21-\x2F]|[{|}:\;\<\>\[\]]$/) {
                pop @unknown;
            }
            else {
            	last;
            }
        }
        push @terms, join "", @unknown  if @unknown;
        @unknown = ();

        # -ե٥åȤϡĤΤޤȤޤˤƤ
        if ($part_of_speach eq '-ե٥å') {
            push @alphabet, $noun;
            next;
        }
        push @terms, join "", @alphabet  if @alphabet;
        @alphabet = ();

        if( $part_of_speach eq '̾-'                               ||
            $part_of_speach eq '̾-³'                           ||
            $part_of_speach eq '̾--'                          ||
            $part_of_speach eq '̾--³'                      ||
            $part_of_speach eq '-ե٥å'                     ||
            $part_of_speach =~ /̾\-ͭ̾/                          ||
            $part_of_speach eq '̤θ' & 
                               $noun !~ /^[\x21-\x2F]|[{|}:\;\<\>\[\]]$/
          ){
            if ($part_of_speach eq '̤θ' & $noun =~ /.,$/) {
                chop $noun;
                push @terms, $noun if $noun ne "";
                &$add(\@terms, \%cmp_noun_list) unless $must;
            }
            else {
                push @terms, $noun;
            }
            $must = 0; next;
        }
        elsif(($part_of_speach eq '̾-ư촴' | 
               $part_of_speach eq '̾-ʥƻ촴')
           ){
            push @terms, $noun;
            $must = 1; next;
        }
        elsif($part_of_speach eq '̾--ư촴' & @terms){
            push @terms, $noun;
            $must = 1; next;
        }
        elsif($part_of_speach =~ /^ư/){
            @terms = ();
        }
        else {
            &$add(\@terms, \%cmp_noun_list) unless $must;
        }
        @terms = () if $must;
        $must = 0;
    }

    return \%cmp_noun_list;
}

1;

__END__

=head1 NAME

    TermExtract::Chasen -- ѸХ⥸塼ʡ䦡)

=head1 SYNOPSIS

    use TermExtract::Chasen;

=head1 DESCRIPTION

    ϥƥȤ򡢡䦡סüؤǺƤܸǲ
  ץˤˤη̤ȤϥƥȤѸФ
  ץࡣ
    ʤ䦡פνϤϥǥեȤΥեޥåȻʥ쥳ɤ
  ɤñ졢裴եɤʻסˤȤƤ롣
    ˡˤĤƤϡƥ饹TermExtract::Calc_Imp)ʲΥץ
  ץȤ򻲾ȤΤȡ

=head2 Sample Script

 #!/opt/local/bin/perl5.30 -w
 
 #
 #  ex_chasen.pl
 #
 #ե뤫䦡פηǲϺѤߤΥǡɤ߼
 #  ɸϤѸȤν٤֤ץ
 #
 #   version 0.32
 #
 #   maeda@lib.u-tokyo.ac.jp
 
 use TermExtract::Chasen;
 #use strict;
 my $data = new TermExtract::Chasen;
 my $InputFile = "chasen_out.txt";    # ϥե
 
 # ץΰ۾ｪλ
 # (åǥ쥯ȥѤΤߡ
 $SIG{INT} = $SIG{QUIT} = $SIG{TERM} = 'sigexit';
 
 # ϥ⡼ɤ
 # 1  Ѹܽ١2  ѸΤ
 # 3  ޶ڤ
 my $output_mode = 1;
 
 #
 # ٷ׻ǡϢܸ"ٿ""ۤʤ""ѡץ쥭ƥ"Τ
 # Ȥ뤫򡣥ѡץ쥭ƥϡֳؽǽפȤʤ
 # ޤ"ϢܸξȤʤ"⤢ꡢξѸи
 # (ꤵƤIDFȤ߹碌ˤǽٷ׻Ԥ
 # ʥǥեȤ"ٿ"Ȥ $obj->use_total)
 #
 #$data->use_total;      # ٿȤ
 #$data->use_uniq;       # ۤʤȤ
 #$data->use_Perplexity; # ѡץ쥭ƥȤ(TermExtract 3.04 ʾ)
 #$data->no_LR;          # ܾȤʤ (TermExtract 4.02 ʾ)
 
 #
 # ٷ׻ǡϢܾ˳ݤ碌Ѹиپ򤹤
 # $data->no_LR; ȤȤ߹碌Ѹи٤Τߤν٤⻻вǽ
 # ʥǥեȤ "Frequency" $data->use_frq)
 # TFϤѸ줬¾Ѹΰ˻ȤƤˤ⥫
 # Frequency Ѹ줬¾Ѹΰ˻ȤƤ˥Ȥʤ
 #
 #$data->use_TF;   # TF (Term Frequency) (TermExtract 4.02 ʾ)
 #$data->use_frq;  # FrequencyˤѸ
 #$data->no_frq;   # پȤʤ
 
 #
 # ٷ׻ǡؽǽȤɤ
 # ʥǥեȤϡѤʤ $obj->no_stat)
 #
 #$data->use_stat; # ؽǽȤ
 #$data->no_stat;  # ؽǽȤʤ
 
 #
 # ٷ׻ǡ֥ɥѸ١פȡϢܸν١
 # ΤɤŤ򤪤ꤹ롣
 # ǥեͤϣ
 # ͤ礭ۤɡ֥ɥѸ١פŤޤ
 #
 #$data->average_rate(0.5);
 
 #
 # ؽǽDB˥ǡѤ뤫ɤ
 # ٷ׻ǡؽǽȤȤϡåȤƤۤ
 # ̵񡣽оݤ˳ؽǽDBϿƤʤ줬ޤޤ
 # ưʤ
 # ʥǥեȤϡѤʤ $obj->no_storage
 #
 #$data->use_storage; # Ѥ
 #$data->no_storage;  # Ѥʤ
 
 #
 # ؽǽDB˻ѤDBMSDBM_File˻
 # ʥǥեȤϡDB_FileBTREE⡼ɡ
 #
 #$data->use_SDBM;
 
 # ΥɥȤפȤΥǡ١
 # ե̾򥻥å
 # ʥǥեȤ "stat.db""comb.db"
 #
 #$data->stat_db("stat.db");
 #$data->comb_db("comb.db");
 
 #
 # ǡ١¾åΤΰǥ쥯ȥ
 # ǥ쥯ȥ̾ʸʥǥեȡˤξϥåʤ
 #
 #$data->lock_dir("lock_dir");
 
 #
 # ǲϺѤߤΥƥȤ顢ǡɤ߹
 # ѸꥹȤ֤
 # DBѡɥٻѤ˥åȡ
 #
 #my @noun_list = $data->get_imp_word($str, 'var');     # Ϥѿ
 my @noun_list = $data->get_imp_word($InputFile); # Ϥե
 
 #
 # ɤ߹ǲϺѤߥƥȥե򸵤
 # ⡼ɤѤơѸꥹȤ֤
 #$data->use_stat->no_frq;
 #my @noun_list2 = $data->get_imp_word();
 # ޤη̤̤Υ⡼ɤˤ̤ȳݤ碌
 #@noun_list = $data->result_filter (\@noun_list, \@noun_list2, 30, 1000);
 
 #
 #  ѸꥹȤȷ׻٤ɸϤ˽Ф
 #
 foreach (@noun_list) {
    # աɽʤ
    next if $_->[0] =~ /^()*(ʿ)*(\d+ǯ)*(\d+)*(\d+)*()*()*(\d+)*(\d+ʬ)*(\d+)*$/;
    # ͤΤߤɽʤ
    next if $_->[0] =~ /^\d+$/;
 
    # ɽ$output_mode˱ơͼѹ
    printf "%-60s %16.2f\n", $_->[0], $_->[1] if $output_mode == 1;
    printf "%s\n",           $_->[0]          if $output_mode == 2;
    printf "%s,",            $_->[0]          if $output_mode == 3;
 }
 
 # ץΰ۾ｪλDBΥå
 # (åǥ쥯ȥѤΤߡ
 sub sigexit {
    $data->unlock_db;
 }

=head1 Methods

    Υ⥸塼Ǥϡget_imp_word Τ߼ʳΥ᥽åɤϿ
  ⥸塼 TermExtract::Calc_Imp ǼƤ롣
    get_imp_word ϷǲϤԤФ줿ñ򡢸ġñθ
  ʻ򸵤ʣƤ롣ʳΥ᥽åɤˤĤƤϡ
  TermExtract::Calc_Imp PODɥȤ򻲾Ȥ뤳ȡ

=head2 get_imp_word

    ǲϤηФ줿ñ򼡤Υ롼ˤʣ롣
  ϡоݤΥǡ裲裱μ̤Ǥ롣ǥե
  Ǥϡ裱ϡǲϺѤߤΥƥȥեȤʤ롣裲ʸ
   'var'åȤ줿ȤˤϡǲϺѤΥƥȥǡ
  ä顼ѿȲ᤹롣

    ʻñ̾줬Ϣ³Ǹ줿ȤϷ礹
       ̾    
       ̾    ³
       ̾                
       ̾                ³
       ̾    ͭ̾
       ̤θ
           ե٥å

        ̤θפξ硢䦡פΥС2.3.3Ǥ . ʥԥꥪ
          ˤ - ʥϥեˤʤɤǤ줬ʬ䤵롣ǡASCIIε
         줿Ȥϡθ礷ƽ褦ˤƤ롣
         εϽ
            ()[]<>|"';,

        䦡פΥС2.3.3ǤϡۤȤɤαʸ̤θפǤ
        ʤֵ-ե٥åȡפȤưñ̤ǰ롣Τᡢ
        ֵ-ե٥åȡפϲǽʸ¤Ϣ뤷ƣȤư褦
        ˤʣñ줬ڤʤǰƤޤΤλ
        СΡ䦡פפǤϤԶ
        ʤ

    ʻñ̾줬줿Ȥϡ³줬嵭̾줫
      ȽꤷۤʤȤʣȤưʤ

       ̾    ư촴
       ̾    ʥƻ촴

    ʻñ̾줬줿Ȥϡ³줬嵭̾줫
      ȽꤷۤʤȤʣȤưʤޤʣƬ
      Ѵ롣

        ̾             ư촴

    ʻ줬ưξϡʣѴ

    ΣʸΡ̤θפϸζڤȤ롣ޤ̤θפ ,  
      ǽȤˤζڤȤ롣

          !"#$%&'()*+,-./{|}:;<>[]

    ʣʤƬñ̾줬ܡפξϡܡפΤߺ롣

    ʣʤñ̾ΤθξϡΤߺ
      롣ޤξ롣

      "ʤ", "", "", "", "", "", "", "" ,""

=head1 SEE ALSO

    TermExtract::Calc_Imp
    TermExtract::MeCab
    TermExtract::BrillsTagger
    TermExtract::EnglishPlainText
    TermExtract::ChainesPlainTextUC
    TermExtract::ChainesPlainTextGB
    TermExtract::ICTCLAS
    TermExtract::JapanesePlainTextEUC
    TermExtract::JapanesePlainTextSJIS

=head1 COPYRIGHT

    Υץϡء͵ֶ͹Ωءä§
  Ѹ켫ưХƥפtermex.pl 򻲹ͤ˥ɤ
  Ū˽ľΤǤ롣
    κȤϡءϯ (maeda@lib.u-tokyo.ac.jp)Ԥä

    ϼΤȤꡣ

    ΩץȤ⥸塼ؽ񤭴¾Υץफ
      Ȥ߹ߤǽȤ

    ǲϺѤߤΥƥȥեǤϤʤѿϲǽ
      ˤˤUNIXĶǤ Text::Chasen ⥸塼ˤб
      ǽˤʤä

    ꥸʥPerlбˤShift-JISEUCˤܸϤܸ
      бѥåƤPerl(Jperl)Ȥ鷺Ȥǽˤʤä

    ˸ͭ̾ȤʸȤ褦ѥ᡼
      ꤷ

    ΣʸΡ̤θפϸζڤȤǧ褦ˤޤ
       ̤θפ , ǽȤˤζڤȤ

        !"#$%&'()*+,-./{|}:;<>[]

    ʣΡ̤θפñ̾åȤ߹
      ʡ䦡ver 2.3.3οСؤб

    ʣΡֵ-ե٥åȡפñåȤ߹
      ʡ䦡ver 2.3.3οСؤб

    γݤΤᡢPerl"strict"⥸塼ڤperl-wץ
      ؤбԤä

    ʤܥץλѤˤʤ̤˴ؤƤǤ
  Ǥʤ

=cut
