package TermExtract::MeCab;
use TermExtract::Calc_Imp;

use strict;
use Exporter ();
use Encode 'from_to';
use vars qw(@ISA $VERSION @EXPORT);

@ISA = qw(TermExtract::Calc_Imp Exporter);
@EXPORT = qw();
$VERSION = "2.16";

# ========================================================================
# get_noun_frq -- Get noun frequency.
#                 The values of the hash are frequency of the noun.
#                 (Collects candidate compound nouns and counts them.)
#
#  Over-write TermExtract::Calc_Imp::get_noun_frq
#
#  Arguments:
#    $data -- MeCab output in UTF-8, one morpheme per line in the form
#             "surface<TAB>pos,class1,class2,...".  By default this is the
#             name of a file holding that text; when $mode is 'var' it is
#             the text itself.
#    $mode -- 'var' if $data already holds the text; any other value
#             (default 0) means $data is a file name.
#
#  Returns a reference to a hash whose keys are compound nouns (component
#  words joined by one space, UTF-8 bytes) and whose values are the number
#  of times each compound was seen.  Dies with "Can not open input file."
#  when the file cannot be opened.
#
#  NOTE(review): the part-of-speech and stop-word literals used to be raw
#  EUC-JP bytes in the source and did not survive re-encoding of this file.
#  They are now built from Unicode code points, so the source is plain
#  ASCII.  The values were reconstructed from the rule table in the POD
#  below -- please confirm them against a pristine upstream copy.
#
#  NOTE(review): the file is now opened with three-argument open, so
#  "magic" names such as "-" or "cmd |" are treated as literal file names.
# ========================================================================
sub get_noun_frq {
    my $self = shift;
    my $data = shift;           # file name, or the text itself in 'var' mode
    my $mode = shift || 0;      # 'var' => $data is text, otherwise a file name
    my %cmp_noun_list = ();     # compound noun => frequency (return value)
    my $must  = 0;              # true while the next word has to be a noun
    my @terms = ();             # words of the compound being assembled

    # Declared by the parent class; going by its name it marks the language
    # as agglutinative (TODO confirm against TermExtract::Calc_Imp).
    $self->IsAgglutinativeLang;

    # The input is converted to EUC-JP below, so every literal it is compared
    # with must be an EUC-JP byte string as well.
    my $euc = sub {
        my $chars = shift;
        return Encode::encode('EUC-JP', $chars);
    };

    # Part-of-speech tags as MeCab emits them.
    my $NOUN     = $euc->("\x{540D}\x{8A5E}");                    # noun
    my $SYMBOL   = $euc->("\x{8A18}\x{53F7}");                    # symbol
    my $VERB     = $euc->("\x{52D5}\x{8A5E}");                    # verb
    my $GENERAL  = $euc->("\x{4E00}\x{822C}");                    # general
    my $SUFFIX   = $euc->("\x{63A5}\x{5C3E}");                    # suffix
    my $SAHEN    = $euc->("\x{30B5}\x{5909}\x{63A5}\x{7D9A}");    # sa-hen connection
    my $PROPER   = $euc->("\x{56FA}\x{6709}\x{540D}\x{8A5E}");    # proper noun
    my $ALPHABET =                                                # alphabet
        $euc->("\x{30A2}\x{30EB}\x{30D5}\x{30A1}\x{30D9}\x{30C3}\x{30C8}");
    my $ADJ_STEM =                                                # na-adjective stem
        $euc->("\x{5F62}\x{5BB9}\x{52D5}\x{8A5E}\x{8A9E}\x{5E79}");
    my $NAI_STEM =                                                # nai-adjective stem
        $euc->("\x{30CA}\x{30A4}\x{5F62}\x{5BB9}\x{8A5E}\x{8A9E}\x{5E79}");

    # Word removed from the head of a compound (U+672C, "hon").
    my $HEAD_STOP = $euc->("\x{672C}");

    # Words removed from the tail of a compound.
    my %tail_stop = map { $euc->($_) => 1 } (
        "\x{306A}\x{3069}",     # nado
        "\x{3089}",             # ra
        "\x{4E0A}", "\x{5185}", "\x{578B}", "\x{9593}",
        "\x{4E2D}", "\x{6BCE}", "\x{7B49}",
    );

    # Tidy the pending compound, count it, and reset the work list.
    my $add = sub {
        my $terms         = shift;
        my $cmp_noun_list = shift;

        # Drop the head stop word, but only when something follows it.
        if (defined $terms->[0] && scalar @{$terms} > 1) {
            shift @$terms if $terms->[0] eq $HEAD_STOP;
        }
        # Drop a trailing stop word or a whitespace-only word.  (The old
        # "|| $must" test was dead code: this is only called when $must
        # is false.)
        if (defined $terms->[0]) {
            my $end = $terms->[$#$terms];
            pop @$terms if $tail_stop{$end} || $end =~ /^\s+$/;
        }
        # convert eucjp to utf-8.
        foreach my $elem (@$terms) {
            Encode::from_to($elem, 'EUC-JP', 'UTF-8');
        }
        $cmp_noun_list->{ join ' ', @$terms }++ if defined $terms->[0];
        @$terms = ();
    };

    # Input is a file: slurp it through a lexical handle.
    if ($mode ne 'var') {
        local $/ = undef;
        open(my $in, '<', $data) || die "Can not open input file. $!";
        $data = <$in>;
        close $in;
    }
    return \%cmp_noun_list unless defined $data;

    # convert utf-8 to eucjp.
    Encode::from_to($data, 'UTF-8', 'EUC-JP');

    # Join runs of simple nouns into compounds.
    foreach my $morph (split "\n", $data) {
        my ($noun, $values) = split /\t/, $morph;

        # Missing fields become '' so the comparisons never see undef.
        my @class = defined $values ? split(/,/, $values) : ();
        my $pos   = defined $class[0] ? $class[0] : '';
        my $cl_1  = defined $class[1] ? $class[1] : '';
        my $cl_2  = defined $class[2] ? $class[2] : '';

        my $is_noun = $pos eq $NOUN;

        # Words that may be appended to the current compound.  Sa-hen
        # nouns that start with an ASCII symbol in 0x21-0x2F, or end with
        # one of {|}:;<>[], are excluded so they break the compound.
        my $joinable =
               $is_noun && $cl_1 eq $GENERAL
            || $is_noun && $cl_1 eq $SUFFIX
                        && ($cl_2 eq $GENERAL || $cl_2 eq $SAHEN)
            || $is_noun && $cl_1 eq $PROPER
            || $pos eq $SYMBOL && $cl_1 eq $ALPHABET
            || $is_noun && $cl_1 eq $SAHEN
                        && $noun !~ /^[\x21-\x2F]|[{|}:\;\<\>\[\]]$/;

        # Adjective stems: kept only if a joinable word follows.
        my $needs_noun =
               $is_noun && ($cl_1 eq $ADJ_STEM || $cl_1 eq $NAI_STEM)
            || $is_noun && $cl_1 eq $SUFFIX && $cl_2 eq $ADJ_STEM;

        if ($joinable) {
            push @terms, $noun;
            $must = 0;
        }
        elsif ($needs_noun) {
            push @terms, $noun;
            $must = 1;
        }
        elsif ($pos eq $VERB) {
            # A verb discards whatever was being assembled.
            @terms = ();
            $must  = 0;
        }
        else {
            # Any other word ends the compound; it is thrown away if it
            # ended on an adjective stem that was still waiting for a noun.
            if ($must) {
                @terms = ();
            }
            else {
                $add->(\@terms, \%cmp_noun_list);
            }
            $must = 0;
        }
    }

    # Count a compound still pending when the input has no terminating
    # non-noun line (e.g. a missing final "EOS").
    $add->(\@terms, \%cmp_noun_list) unless $must;

    return \%cmp_noun_list;
}

1;

__END__

=head1 NAME

    TermExtract::MeCab -- technical term extraction module (MeCab version)

=head1 SYNOPSIS

    use TermExtract::MeCab;

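=head1 DESCRIPTION

    This program takes input text that has been run through MeCab (a
    morphological analyzer developed at the Nara Institute of Science and
    Technology) and, based on the analysis result, extracts technical
    terms from the input text.
      MeCab's output is assumed to be in its default format.
      For how to use this module, see the parent class
    (TermExtract::Calc_Imp) or the sample script below.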

=head2 Sample Script

 #!/opt/local/bin/perl5.32 -w
 
 #
 #  ex_mecab.pl
 #
 #ե뤫פηǲϺѤߤΥǡɤ߼
 #  ɸϤѸȤν٤֤ץ
 #
 #   version 0.32
 #
 #   maeda@lib.u-tokyo.ac.jp
 
 use TermExtract::MeCab;
 #use strict;
 my $data = new TermExtract::MeCab;
 my $InputFile = "mecab_out.txt";    # ϥե
 
 # ץΰ۾ｪλ
 # (åǥ쥯ȥѤΤߡ
 $SIG{INT} = $SIG{QUIT} = $SIG{TERM} = 'sigexit';
 
 # ϥ⡼ɤ
 # 1  Ѹܽ١2  ѸΤ
 # 3  ޶ڤ
 my $output_mode = 1;
 
 #
 # ٷ׻ǡϢܸ"ٿ""ۤʤ""ѡץ쥭ƥ"Τ
 # Ȥ뤫򡣥ѡץ쥭ƥϡֳؽǽפȤʤ
 # ޤ"ϢܸξȤʤ"⤢ꡢξѸи
 # (ꤵƤIDFȤ߹碌ˤǽٷ׻Ԥ
 # ʥǥեȤ"ٿ"Ȥ $obj->use_total)
 #
 #$data->use_total;      # ٿȤ
 #$data->use_uniq;       # ۤʤȤ
 #$data->use_Perplexity; # ѡץ쥭ƥȤ(TermExtract 3.04 ʾ)
 #$data->no_LR;          # ܾȤʤ (TermExtract 4.02 ʾ)
 
 #
 # ٷ׻ǡϢܾ˳ݤ碌Ѹиپ򤹤
 # $data->no_LR; ȤȤ߹碌Ѹи٤Τߤν٤⻻вǽ
 # ʥǥեȤ "Frequency" $data->use_frq)
 # TFϤѸ줬¾Ѹΰ˻ȤƤˤ⥫
 # Frequency Ѹ줬¾Ѹΰ˻ȤƤ˥Ȥʤ
 #
 #$data->use_TF;   # TF (Term Frequency) (TermExtract 4.02 ʾ)
 #$data->use_frq;  # FrequencyˤѸ
 #$data->no_frq;   # پȤʤ
 
 #
 # ٷ׻ǡؽǽȤɤ
 # ʥǥեȤϡѤʤ $obj->no_stat)
 #
 #$data->use_stat; # ؽǽȤ
 #$data->no_stat;  # ؽǽȤʤ
 
 #
 # ٷ׻ǡ֥ɥѸ١פȡϢܸν١
 # ΤɤŤ򤪤ꤹ롣
 # ǥեͤϣ
 # ͤ礭ۤɡ֥ɥѸ١פŤޤ
 #
 #$data->average_rate(0.5);
 
 #
 # ؽǽDB˥ǡѤ뤫ɤ
 # ٷ׻ǡؽǽȤȤϡåȤƤۤ
 # ̵񡣽оݤ˳ؽǽDBϿƤʤ줬ޤޤ
 # ưʤ
 # ʥǥեȤϡѤʤ $obj->no_storage
 #
 #$data->use_storage; # Ѥ
 #$data->no_storage;  # Ѥʤ
 
 #
 # ؽǽDB˻ѤDBMSDBM_File˻
 # ʥǥեȤϡDB_FileBTREE⡼ɡ
 #
 #$data->use_SDBM;
 
 # ΥɥȤפȤΥǡ١
 # ե̾򥻥å
 # ʥǥեȤ "stat.db""comb.db"
 #
 #$data->stat_db("stat.db");
 #$data->comb_db("comb.db");
 
 #
 # ǡ١¾åΤΰǥ쥯ȥ
 # ǥ쥯ȥ̾ʸʥǥեȡˤξϥåʤ
 #
 #$data->lock_dir("lock_dir");
 
 #
 # ַǲϡ׺ѤߤΥƥȥե뤫顢ǡɤ߹
 #  ѸꥹȤ֤
 #  DBѡɥٻѤ˥åȡ
 #
 #my @noun_list = $data->get_imp_word($str,'var');      # Ϥѿ
 my @noun_list = $data->get_imp_word($InputFile);  # Ϥե
 
 # ɤ߹ַǲϡ׺Ѥߥƥȥե򸵤
 # ⡼ɤѤơѸꥹȤ֤
 #$data->use_stat->no_frq;
 #my @noun_list2 = $data->get_imp_word();
 # ޤη̤̤Υ⡼ɤˤ̤ȳݤ碌
 #@noun_list = $data->result_filter (\@noun_list, \@noun_list2, 30, 1000);
 
 #
 #  ѸꥹȤȷ׻٤ɸϤ˽Ф
 #
 foreach (@noun_list) {
    # աɽʤ
    next if $_->[0] =~ /^()*(ʿ)*(\d+ǯ)*(\d+)*(\d+)*()*()*(\d+)*(\d+ʬ)*(\d+)*$/;
    # ͤΤߤɽʤ
    next if $_->[0] =~ /^\d+$/;
 
    # ɽ
    printf "%-60s %16.2f\n", $_->[0], $_->[1] if $output_mode == 1;
    printf "%s\n",           $_->[0]          if $output_mode == 2;
    printf "%s,",            $_->[0]          if $output_mode == 3;
 }
 
 # ץΰ۾ｪλDBΥå
 # (åǥ쥯ȥѤΤߡ
 sub sigexit {
    $data->unlock_db;
 }

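=head1 Methods

    This module implements only get_imp_word; all other methods are
  implemented in the parent module TermExtract::Calc_Imp.
    get_imp_word builds compound terms from the words produced by
  morphological analysis, using the order of the individual words and
  their part-of-speech information.  For the other methods, see the POD
  documentation of TermExtract::Calc_Imp.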

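=head2 get_imp_word

    Compound terms are built from the morphological-analysis result by the
  rules below.  The first argument is the data to process; the second
  argument says what kind of thing the first argument is.  By default the
  first argument is a text file that has already been morphologically
  analyzed.  When the second argument is the string 'var', the first
  argument is interpreted as a scalar variable holding morphologically
  analyzed text data.

    (1) Consecutive simple nouns with the following parts of speech are
        joined together.
       noun      general
       noun      sa-hen connection
       noun      suffix          general
       noun      suffix          sa-hen connection
       noun      proper noun
       symbol    alphabet

          Note: when an ASCII symbol appears, it is joined with the
          adjacent words and processed as part of the compound.  The
          following symbols are excluded:
            ()[]<>|"';,

    (2) When a simple noun with one of the following parts of speech
        appears, the next word is checked against the nouns of rule (1);
        if it is not one of them, the words are not treated as a compound.

       noun      na-adjective stem
       noun      nai-adjective stem

    (3) When a simple noun with the following part of speech appears, the
        next word is checked against the nouns of rule (1); if it is not
        one of them, the words are not treated as a compound.  It is also
        discarded when it would become the head of a compound.

        noun     suffix          na-adjective stem

    (4) When the part of speech is "verb", the compound is discarded.

    (5) An "unknown word" consisting of one of the following single
        characters is treated as a word delimiter.  An "unknown word"
        ending in , is also treated as a word delimiter.

        !"#$%&'()*+,-./{|}:;<>[]

    (6) In a compound, when the leading simple noun is U+672C ("hon"),
        only that word is removed.

    (7) In a compound, when the trailing simple noun is one of the
        following, only that word is removed.  A trailing blank is removed
        as well.

      U+306A U+3069 ("nado"), U+3089 ("ra"), U+4E0A, U+5185, U+578B,
      U+9593, U+4E2D, U+6BCE, U+7B49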

=head1 SEE ALSO

    TermExtract::Calc_Imp
    TermExtract::Chasen
    TermExtract::BrillsTagger
    TermExtract::EnglishPlainText
    TermExtract::ChainesPlainTextUC
    TermExtract::ChainesPlainTextGB
    TermExtract::ICTCLAS
    TermExtract::JapanesePlainTextEUC
    TermExtract::JapanesePlainTextSJIS

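=head1 COPYRIGHT

    This program was newly written with reference to termex.pl from the
  "Automatic Technical Term Extraction System" created by Prof. Hiroshi
  Nakagawa (The University of Tokyo) and Assoc. Prof. Tatsunori Mori
  (Yokohama National University).
     This work was carried out by Akira Maeda, The University of Tokyo
  (maeda@lib.u-tokyo.ac.jp).

    The author accepts no responsibility whatsoever for any result arising
  from the use of this program.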

=cut
