package TermExtract::JapanesePlainTextSJIS;
# Parent class; this module overrides its get_noun_frq method.
use TermExtract::Calc_Imp;

use strict;
use Exporter ();
use vars qw(@ISA $VERSION @EXPORT);

# Inherit from TermExtract::Calc_Imp first, then from Exporter.
@ISA = qw(TermExtract::Calc_Imp Exporter);
# Nothing is exported into the caller's namespace by default.
@EXPORT = qw();
$VERSION = "0.23";


# ========================================================================
# get_noun_frq -- Get noun frequency.
#                 The values of the hash are frequency of the noun.
#                 (Subroutine that collects the technical terms and
#                 their frequencies.)
#
#  Over-write TermExtract::Calc_Imp::get_noun_frq
#
#  Arguments:
#    $data -- name of the input file, or the text itself when $mode is 'var'
#    $mode -- 'var' when $data holds the text; any other value (default 0)
#             means that $data is a file name
#
#  Returns:
#    Reference to a hash whose keys are compound nouns (their units joined
#    by a single space) and whose values are the number of occurrences.
#
#  Dies when the input file cannot be opened.
#
# ========================================================================

sub get_noun_frq {
    my $self = shift;
    my $data = shift;           # input data (file name or text)
    my $mode = shift || 0;      # tells whether $data is a file or a variable
    my %cmp_noun_list = ();     # compound noun => frequency (return value)

    # Translated from the original comment: declare an agglutinative
    # language (words are not separated by a space).
    $self->IsAgglutinativeLang;

    # When the input is a file, read the whole file into $data.
    if ($mode ne 'var') {
        local $/ = undef;
        # Three-argument open: the file name is taken literally, so a name
        # such as "cmd |" or ">file" can neither start a process nor open
        # a file for writing.
        open(my $in, '<', $data) or die "Can not open input file. $!";
        $data = <$in>;
        close $in;
    }

    # Every line is processed on its own, so a compound never spans a line
    # break.  The optional \r keeps the CR of CRLF input out of the terms.
    foreach my $morph (split /\r?\n/, $data) {
        next if $morph eq "";
        my $terms = get_katakana_kanji($morph);
        foreach my $cmp_noun (@$terms) {
            # A single unit is not a compound noun.
            next if @$cmp_noun < 2;
            # Skip compounds with an empty leading unit.  The test is made
            # against the empty string rather than for truth, because a
            # leading unit "0" is a legitimate unit.
            my $head = $$cmp_noun[0];
            next unless defined $head && $head ne "";
            $cmp_noun_list{ join ' ', @$cmp_noun }++;
        }
    }

    return \%cmp_noun_list;
}

# Subroutine that extracts katakana and kanji
# get_katakana_kanji -- Cut one line of Shift_JIS text into compounds.
#
#  The line is consumed one character (1 or 2 bytes) at a time.  Each
#  character belongs to one of these classes:
#    * ASCII byte (0x00-0x7E) or full-width letter (0x82 0x60-0x9A):
#      consecutive characters are pooled into one unit (status 1)
#    * half-width katakana (0xA6-0xDF), full-width katakana
#      (0x83 0x40-0x96) or the long-vowel mark (0x81 0x5B):
#      consecutive characters are pooled into one unit (status 2)
#    * kanji (lead byte 0x88-0x9F or 0xE0-0xEA): each character is a unit
#    * any other 2-byte character: ends the current compound
#    * any other single byte: skipped
#
#  Argument:
#    $word -- one line of text as a Shift_JIS byte string
#
#  Returns:
#    Reference to an array of array references.  Each inner array holds
#    the units of one compound in order of appearance.  An inner array is
#    empty when a terminating character was met with no unit pending.
sub get_katakana_kanji {
    my $word     = shift;
    my @terms    = ();   # finished compounds
    my @cmp_noun = ();   # units of the compound being built
    my $pool     = "";   # characters of the unit being built
    my $status   = 0;    # 0: kanji / nothing pooled, 1: letters and 1-byte
                         # symbols, 2: katakana
    my $iLen     = 0;    # byte length of the character just consumed

    return \@terms unless defined $word;

    # Append $char to the pooled unit of class $class (1 or 2).  A pooled
    # unit of the other class is closed first.
    my $pool_char = sub {
        my ($char, $class) = @_;
        if ($status != 0 && $status != $class) {
            push @cmp_noun, $pool;
            $pool = "";
        }
        $pool  .= $char;
        $status = $class;
    };

    # Close the pooled unit, if any, before a kanji or a terminator.
    my $close_pool = sub {
        if ($status != 0) {
            push @cmp_noun, $pool;
            $pool = "";
        }
        $status = 0;
    };

    for (; $word ne ""; $word = substr($word, $iLen)) {
        # 1-byte character (ASCII)
        if ($word =~ /^([\x00-\x7E])/) {
            my $char = $1;
            $iLen = 1;
            $pool_char->($char, 1);
        }
        # 1-byte character (half-width katakana).  The matched byte has to
        # be kept, otherwise the katakana vanish from the result and an
        # empty unit is produced instead.
        elsif ($word =~ /^([\xA6-\xDF])/) {
            my $char = $1;
            $iLen = 1;
            $pool_char->($char, 2);
        }
        # 2-byte character
        elsif ($word =~ /^([\x81-\x9F\xE0-\xEF][\x40-\xFC])/) {
            my $noun = $1;
            $iLen = 2;
            # full-width letter
            if ($noun =~ /^\x82[\x60-\x9A]/) {
                $pool_char->($noun, 1);
            }
            # full-width katakana or long-vowel mark
            elsif ($noun eq "\x81\x5B" || $noun =~ /^\x83[\x40-\x96]/) {
                $pool_char->($noun, 2);
            }
            # kanji: every character is a unit of its own
            elsif ($noun =~ /^[\x88-\xEA]/) {
                $close_pool->();
                push @cmp_noun, $noun;
            }
            # anything else ends the compound
            else {
                $close_pool->();
                push @terms, [@cmp_noun];
                @cmp_noun = ();
            }
        }
        # byte that starts no recognised character: skip it
        else {
            $iLen = 1;
        }
    }

    # Handle what is left at the end of the line.
    push @cmp_noun, $pool if $status != 0;
    push @terms, [@cmp_noun] if @cmp_noun;
    return \@terms;
}

# Register a compound noun
sub add_cmp_noun {
    # $term_list : array ref collecting the finished compounds
    # $pending   : array ref holding the units of the compound just ended
    my ($term_list, $pending) = @_;

    # Store a copy, so that emptying $pending below leaves the stored
    # compound untouched.
    push @{$term_list}, [ @{$pending} ];

    # Empty the caller's array in place, ready for the next compound.
    @{$pending} = ();
}

1;

__END__


=head1 NAME

    TermExtract::JapanesePlainTextSJIS
     -- Automatic technical term extraction module
        (Japanese text, "katakana/kanji extraction", Shift_JIS version)

=head1 SYNOPSIS

    use TermExtract::JapanesePlainTextSJIS;

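=head1 DESCRIPTION

    A program that extracts technical terms directly from Japanese text
  data (Shift_JIS).

    For how to use this module, see the parent class
  (TermExtract::Calc_Imp) or the sample script below.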

=head2 Sample Script

 #!/opt/local/bin/perl5.34 -w
 
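 #
 #  ex_JPTS.pl
 #
 #  Program that prints technical terms and their importance to
 #  standard output
 #  Japanese text, "katakana/kanji extraction", Shift-JIS version
 #
 #   version 0.04
 #
 #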
 
 use TermExtract::JapanesePlainTextSJIS;
 #use strict;
 my $data = new TermExtract::JapanesePlainTextSJIS;
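 my $InputFile = "JPTS_out.txt";    # input file
 
 # Handling of abnormal process termination
 # (only when a lock directory is used)
 $SIG{INT} = $SIG{QUIT} = $SIG{TERM} = 'sigexit';
 
 # Select the output mode
 # 1 -> technical term + importance, 2 -> technical term only
 # 3 -> comma separated
 my $output_mode = 1;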
 
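 #
 # For the importance calculation, choose which statistic of the
 # adjoining words is used: "total count", "number of distinct words"
 # or "perplexity".  Perplexity cannot be used with the "learning function".
 # It is also possible to choose "do not use adjoining-word information";
 # in that case importance is calculated from the term occurrence count
 # (combined with IDF, if that has been configured).
 # (The default is "total count": $obj->use_total)
 #
 #$data->use_total;      # use the total count
 #$data->use_uniq;       # use the number of distinct words
 #$data->use_Perplexity; # use perplexity (TermExtract 3.04 or later)
 #$data->no_LR;          # do not use adjacency information (TermExtract 4.02 or later)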
 
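 #
 # For the importance calculation, choose the term frequency information
 # that is multiplied with the adjacency information.
 # Combined with $data->no_LR; importance can also be computed from the
 # term frequency alone.
 # (The default is "Frequency": $data->use_frq)
 # TF also counts a term when it is used as part of another term.
 # Frequency does not count a term when it is used as part of another term.
 #
 #$data->use_TF;   # TF (Term Frequency) (TermExtract 4.02 or later)
 #$data->use_frq;  # term frequency by Frequency
 #$data->no_frq;   # do not use frequency information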


 
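 #
 # For the importance calculation, choose whether to use the learning function
 # (The default is not to use it: $obj->no_stat)
 #
 #$data->use_stat; # use the learning function
 #$data->no_stat;  # do not use the learning function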
 
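 #
 # For the importance calculation, set whether more weight is given to the
 # "frequency of the term in the document" or to the "importance of the
 # adjoining words".
 # The default value is 1.
 # The larger the value, the more weight the "frequency of the term in
 # the document" receives.
 #
 #$data->average_rate(0.5);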
 
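 #
 # Choose whether to accumulate data in the DB for the learning function.
 # When the learning function is used for the importance calculation it is
 # safer to turn this on: processing does not work correctly if the input
 # contains words that are not registered in the learning-function DB.
 # (The default is not to accumulate: $obj->no_storage)
 #
 #$data->use_storage; # accumulate
 #$data->no_storage;  # do not accumulate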
 
 #
 # Use SDBM_File as the DBM for the learning-function DB
 # (The default is DB_File in BTREE mode)
 #
 #$data->use_SDBM;
 
 #
 # Set the database file names used when the cumulative statistics of
 # past documents are used
 # (The defaults are "stat.db" and "comb.db")
 #
 #$data->stat_db("statUC.db");
 #$data->comb_db("combUC.db");
 
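 #
 # Specify a temporary directory for exclusive locking of the database.
 # No locking is done when the directory name is the empty string (default).
 #
 #$data->lock_dir("lock_dir");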
 
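 #
 # Read the data and
 # return the list of technical terms as an array
 # (set to use the cumulative statistics DB and the in-document frequency)
 #
 #my @noun_list = $data->get_imp_word($str, 'var');     # input is a variable
 my @noun_list = $data->get_imp_word($InputFile); # input is a file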
 
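 #
 # Based on the text file read last time, change the mode and
 # return the list of technical terms as an array
 #$data->use_stat->no_frq;
 #my @noun_list2 = $data->get_imp_word();
 # The result can also be combined with the result of another mode
 #@noun_list = $data->result_filter (\@noun_list, \@noun_list2, 30, 1000);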
 
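 #
 #  Print the list of technical terms and their calculated importance
 #  to standard output
 #
 foreach (@noun_list) {
    # Do not show dates and times.  The byte escapes are the Shift_JIS
    # codes of, in order: Showa, Heisei, year, month, day, a.m., p.m.,
    # hour, minute, second.
    next if $_->[0] =~ /^(\x8F\xBA\x98\x61)*(\x95\xBD\x90\xAC)*(\d+\x94\x4E)*(\d+\x8C\x8E)*(\d+\x93\xFA)*(\x8C\xDF\x91\x4F)*(\x8C\xDF\x8C\xE3)*(\d+\x8E\x9E)*(\d+\x95\xAA)*(\d+\x95\x62)*$/;
 
    # Do not show entries that consist of digits only
    next if $_->[0] =~ /^\d+$/;
 
    # Show the result (the output format depends on $output_mode)
    printf "%-60s %16.2f\n", $_->[0], $_->[1] if $output_mode == 1;
    printf "%s\n",           $_->[0]          if $output_mode == 2;
    printf "%s,",            $_->[0]          if $output_mode == 3;
 }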
 
 
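=head1 Methods

    This module implements only get_imp_word; all other methods are
  implemented in the parent module TermExtract::Calc_Imp.
    get_imp_word uses stop words to split the text down to compound-word
  units.  For the other methods, see the POD documentation of
  TermExtract::Calc_Imp.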

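=head2 get_imp_word

    Extracts compound words from Japanese text by the following rules.
  The first argument is the data to process and the second argument is
  the kind of the first argument.  By default the first argument is a
  Japanese text file.  When the string 'var' is given as the second
  argument, the first argument is interpreted as a scalar variable
  holding Japanese text data.

    (1) "Kanji", "katakana" and "alphabetic characters and 1-byte
        symbols" are extracted from the text.
    (2) A line break ends the current compound word.
    (3) "Kanji" are handled one character at a time; "katakana" and
        "alphabetic characters and 1-byte symbols" are grouped into words.
    (4) When two or more of the units of (3) occur consecutively, they
        are taken as a technical term.
    (5) Importance is calculated in terms of the units of (3).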

=head1 SEE ALSO

    TermExtract::Calc_Imp
    TermExtract::Chasen
    TermExtract::MeCab
    TermExtract::BrillsTagger
    TermExtract::EnglishPlainText
    TermExtract::ChainesPlainTextUC
    TermExtract::ChainesPlainTextGB
    TermExtract::ICTCLAS
    TermExtract::JapanesePlainTextEUC

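=head1 COPYRIGHT

    This program was written by Akira Maeda (maeda@lib.u-tokyo.ac.jp) of
  the University of Tokyo.  The idea of this technical term extraction is
  based on the theory of stop-word based term importance calculation by
  Hiroshi Nakagawa (University of Tokyo) and on the idea of cutting out
  keywords by katakana/kanji extraction described in "Seiki Hyogen to
  Tekisuto Mainingu" (Regular Expressions and Text Mining; edited by
  Masashi Saraki, written by Yoshihiko Nitta; Akashi Shoten, 2003.10).

    The author accepts no responsibility whatsoever for any result arising
  from the use of this program.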

=cut