/*
 * Decompiled with CFR 0.152.
 */
package com.nexwave.nquindexer;

import com.nexwave.nquindexer.SaxDocFileParser;
import com.nexwave.nquindexer.WordAndScoring;
import com.nexwave.nsidita.DocFileInfo;
import com.nexwave.stemmer.snowball.SnowballStemmer;
import com.nexwave.stemmer.snowball.ext.EnglishStemmer;
import com.nexwave.stemmer.snowball.ext.FrenchStemmer;
import com.nexwave.stemmer.snowball.ext.GermanStemmer;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

/**
 * Parses one document at a time, splits its text into scored words and merges
 * those words into a shared stem dictionary.
 *
 * <p>Each dictionary value is a comma-separated list of
 * {@code <documentIndex>*<score>} entries, where the document index is a
 * counter that this instance increments after every call to
 * {@link #runExtractData(File, String, boolean)}.
 *
 * <p>The instance keeps mutable per-run state (word list, document counter),
 * so it is not written for concurrent use.
 */
public class SaxHTMLIndex
extends SaxDocFileParser {
    /** Delimiter that surrounds an element marker glued to a word. */
    private static final String MARKER = "@@@";
    /** Start of a marker that names the element a word came from. */
    private static final String ELEMENT_PREFIX = "@@@elem_";

    private static final int SCORING_FOR_H1 = 50;
    private static final int SCORING_FOR_H2 = 45;
    private static final int SCORING_FOR_H3 = 40;
    private static final int SCORING_FOR_H4 = 35;
    private static final int SCORING_FOR_H5 = 30;
    private static final int SCORING_FOR_H6 = 25;
    private static final int SCORING_FOR_BOLD = 5;
    private static final int SCORING_FOR_ITALIC = 3;
    private static final int SCORING_FOR_NORMAL_TEXT = 1;
    private static final int SCORING_FOR_KEYWORD = 100;
    private static final int SCORING_FOR_INDEXTERM = 75;

    /** Shared stem dictionary supplied through {@link #init(Map)}. */
    private Map<String, String> tempDico;
    /** Index of the document currently being processed; starts at 0. */
    private int i = 0;
    /** Optional caller-supplied stop words (inserted into a regex as-is). */
    private ArrayList<String> cleanUpList = null;
    /** Optional caller-supplied punctuation patterns (inserted into a regex as-is). */
    private ArrayList<String> cleanUpPunctuation = null;
    /** Words of the most recently processed document, one entry per distinct stem. */
    private List<WordAndScoring> wsList = null;

    /**
     * Returns the scored words of the most recently processed document.
     *
     * @return the live internal list, or {@code null} if no document has been processed yet
     */
    public List<WordAndScoring> getWsList() {
        return this.wsList;
    }

    /** Creates an indexer that uses the built-in English stop-word list. */
    public SaxHTMLIndex() {
    }

    /**
     * Creates an indexer with a custom stop-word list.
     *
     * @param arrayList stop words; when {@code null} or empty the built-in list is used
     */
    public SaxHTMLIndex(ArrayList<String> arrayList) {
        this.cleanUpList = arrayList;
    }

    /**
     * Creates an indexer with custom stop words and punctuation patterns.
     *
     * @param arrayList  stop words; when {@code null} or empty the built-in list is used
     * @param arrayList2 punctuation patterns; when {@code null} or empty none are passed on
     */
    public SaxHTMLIndex(ArrayList<String> arrayList, ArrayList<String> arrayList2) {
        this.cleanUpList = arrayList;
        this.cleanUpPunctuation = arrayList2;
    }

    /**
     * Sets the dictionary that subsequent runs write into.
     *
     * @param map stem dictionary, modified in place by {@link #runExtractData}
     * @return always 0
     */
    public int init(Map<String, String> map) {
        this.tempDico = map;
        return 0;
    }

    /**
     * Parses {@code file}, scores its words and merges them into the dictionary
     * given to {@link #init(Map)}, then advances the document counter.
     *
     * <p>{@link #init(Map)} must have been called first; otherwise the dictionary
     * merge fails with a {@code NullPointerException}.
     *
     * @param file   document to parse
     * @param string language code; "ja", "zh" and "ko" use the CJK analyzer, "en", "de"
     *               and "fr" select a stemmer, anything else (including {@code null})
     *               is indexed without stemming
     * @param bl     whether to apply the stemmer, when one is available
     * @return the file descriptor created for {@code file}
     */
    public DocFileInfo runExtractData(File file, String string, boolean bl) {
        this.fileDesc = new DocFileInfo(file);
        this.strbf = new StringBuffer("");
        this.parseDocument(file);
        String text = this.cleanBuffer(this.strbf).replaceAll("\\s+", " ");

        this.wsList = new ArrayList<WordAndScoring>();
        // Stem -> entry lookup so duplicates are merged without rescanning wsList per token.
        Map<String, WordAndScoring> byStem = new HashMap<String, WordAndScoring>();
        if (isCjkLanguage(string)) {
            this.collectCjkWords(text, byStem);
        } else {
            this.collectStemmedWords(text, string, bl, byStem);
        }

        this.mergeIntoDictionary();
        ++this.i;
        return this.fileDesc;
    }

    /** Tells whether the language code selects the CJK analyzer; {@code null} does not. */
    private static boolean isCjkLanguage(String language) {
        return "ja".equalsIgnoreCase(language)
                || "zh".equalsIgnoreCase(language)
                || "ko".equalsIgnoreCase(language);
    }

    /** Picks the stemmer for a language code, or {@code null} when none is available. */
    private static SnowballStemmer stemmerFor(String language) {
        if ("en".equalsIgnoreCase(language)) {
            return new EnglishStemmer();
        }
        if ("de".equalsIgnoreCase(language)) {
            return new GermanStemmer();
        }
        if ("fr".equalsIgnoreCase(language)) {
            return new FrenchStemmer();
        }
        return null;
    }

    /** Tokenizes text with the CJK analyzer; every term is recorded with a score of 1. */
    private void collectCjkWords(String text, Map<String, WordAndScoring> byStem) {
        TokenStream tokens = null;
        try {
            // Element markers are dropped entirely: CJK terms are not weighted by element.
            String unmarked = text.replaceAll("@@@([^\\s]*)@@@", "");
            CJKAnalyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
            tokens = analyzer.tokenStream("", (Reader)new StringReader(unmarked));
            TermAttribute termAttribute = (TermAttribute)tokens.addAttribute(TermAttribute.class);
            while (tokens.incrementToken()) {
                String term = termAttribute.term();
                this.accumulate(new WordAndScoring(term, term, 1), byStem);
            }
        }
        catch (IOException iOException) {
            System.out.println("Error tokenizing content using CJK Analyzer. IOException");
            iOException.printStackTrace();
        }
        finally {
            if (tokens != null) {
                try {
                    tokens.close();
                }
                catch (IOException ignored) {
                    // The stream wraps an in-memory reader; a close failure loses nothing.
                }
            }
        }
    }

    /** Splits text on single spaces and records each token, stemmed when requested. */
    private void collectStemmedWords(String text, String language, boolean stem, Map<String, WordAndScoring> byStem) {
        SnowballStemmer stemmer = stemmerFor(language);
        StringTokenizer tokenizer = new StringTokenizer(text, " ");
        while (tokenizer.hasMoreTokens()) {
            WordAndScoring scored = this.getWordAndScoring(tokenizer.nextToken(), stemmer, stem);
            if (scored != null) {
                this.accumulate(scored, byStem);
            }
        }
    }

    /** Adds a word to the list, or adds its score to the entry already holding the same stem. */
    private void accumulate(WordAndScoring candidate, Map<String, WordAndScoring> byStem) {
        String stem = candidate.getStem();
        WordAndScoring existing = byStem.get(stem);
        if (existing != null) {
            existing.setScoring(existing.getScoring() + candidate.getScoring());
            return;
        }
        byStem.put(stem, candidate);
        this.wsList.add(candidate);
    }

    /** Appends a {@code <documentIndex>*<score>} entry to the dictionary for every collected stem. */
    private void mergeIntoDictionary() {
        String documentIndex = Integer.toString(this.i);
        for (WordAndScoring scored : this.wsList) {
            String entry = documentIndex + "*" + Integer.toString(scored.getScoring());
            String previous = this.tempDico.get(scored.getStem());
            // A key mapped to null is treated like a missing key instead of failing on concat.
            String merged = previous != null ? previous + "," + entry : entry;
            this.tempDico.put(scored.getStem(), merged);
        }
    }

    /**
     * Builds the scored word for one whitespace-delimited token.
     *
     * <p>A token that contains two or more {@code @@@} markers is read as
     * {@code word@@@elem_<name>@@@}: the text before the first marker is the word and
     * {@code <name>} selects the score. Any other token is scored as normal text.
     *
     * @param string          raw token
     * @param snowballStemmer stemmer to use, or {@code null} for none
     * @param bl              whether to apply the stemmer
     * @return the scored word, or {@code null} when a marked token has no text before its marker
     */
    private WordAndScoring getWordAndScoring(String string, SnowballStemmer snowballStemmer, boolean bl) {
        int firstMarker = string.indexOf(MARKER);
        int lastMarker = string.lastIndexOf(MARKER);
        String word = string;
        int score = SCORING_FOR_NORMAL_TEXT;
        if (firstMarker != -1 && firstMarker != lastMarker) {
            word = string.substring(0, firstMarker);
            if (word.length() == 0) {
                return null;
            }
            score = scoringForElement(elementName(string, lastMarker));
        }
        String stem = word;
        if (snowballStemmer != null && bl) {
            stem = snowballStemmer.doStem(word);
        }
        return new WordAndScoring(word, stem, score);
    }

    /**
     * Extracts the element name between {@code @@@elem_} and the last marker.
     * Returns {@code null} for a malformed marker (prefix missing, or the name would start
     * past the closing marker) rather than slicing with invalid indexes.
     */
    private static String elementName(String token, int lastMarker) {
        int prefixAt = token.indexOf(ELEMENT_PREFIX);
        if (prefixAt == -1) {
            return null;
        }
        int nameStart = prefixAt + ELEMENT_PREFIX.length();
        if (nameStart > lastMarker) {
            return null;
        }
        return token.substring(nameStart, lastMarker);
    }

    /** Maps an element name (case-insensitive) to its score; unknown or {@code null} is normal text. */
    private static int scoringForElement(String element) {
        if ("h1".equalsIgnoreCase(element)) {
            return SCORING_FOR_H1;
        }
        if ("h2".equalsIgnoreCase(element)) {
            return SCORING_FOR_H2;
        }
        if ("h3".equalsIgnoreCase(element)) {
            return SCORING_FOR_H3;
        }
        if ("h4".equalsIgnoreCase(element)) {
            return SCORING_FOR_H4;
        }
        if ("h5".equalsIgnoreCase(element)) {
            return SCORING_FOR_H5;
        }
        if ("h6".equalsIgnoreCase(element)) {
            return SCORING_FOR_H6;
        }
        if ("em".equalsIgnoreCase(element)) {
            return SCORING_FOR_ITALIC;
        }
        if ("strong".equalsIgnoreCase(element)) {
            return SCORING_FOR_BOLD;
        }
        if ("meta_keywords".equalsIgnoreCase(element)) {
            return SCORING_FOR_KEYWORD;
        }
        if ("meta_indexterms".equalsIgnoreCase(element)) {
            return SCORING_FOR_INDEXTERM;
        }
        return SCORING_FOR_NORMAL_TEXT;
    }

    /**
     * Lower-cases the collected text, assembles the stop-word and punctuation regular
     * expressions and hands all three to the inherited {@code minimalClean}.
     *
     * <p>Caller-supplied stop words and punctuation entries are inserted into the
     * expressions unescaped, so regex metacharacters in them keep their regex meaning.
     *
     * @param stringBuffer raw text collected while parsing
     * @return the text as returned by {@code minimalClean}
     */
    private String cleanBuffer(StringBuffer stringBuffer) {
        String text = stringBuffer.toString().toLowerCase();
        StringBuffer stopWords = new StringBuffer("");
        StringBuffer punctuation = new StringBuffer("");
        if (this.cleanUpList == null || this.cleanUpList.isEmpty()) {
            stopWords.append("(?i)\\bthe\\b|\\ba\\b|\\ban\\b|\\bto\\b|\\band\\b|\\bor\\b");
            stopWords.append("|\\bis\\b|\\bare\\b|\\bin\\b|\\bwith\\b|\\bbe\\b|\\bcan\\b");
            stopWords.append("|\\beach\\b|\\bhas\\b|\\bhave\\b|\\bof\\b|\\b\\xA9\\b|\\bnot\\b");
            stopWords.append("|\\bfor\\b|\\bthis\\b|\\bas\\b|\\bit\\b|\\bhe\\b|\\bshe\\b");
            stopWords.append("|\\byou\\b|\\bby\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b");
            stopWords.append("|\\b-or-\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b");
            stopWords.append("|\\bI\\b|\\bme\\b|\\bmy\\b");
            // The text is already lower-cased, so the notice must be matched case-insensitively.
            // NOTE(review): U+FFFD looks like a mis-encoded copyright sign; both are accepted - confirm.
            text = text.replaceFirst("(?i)Copyright [\u00a9\ufffd] 1998-2007 NexWave Solutions.", " ");
        } else {
            stopWords.append("\\ba\\b");
            for (String stopWord : this.cleanUpList) {
                stopWords.append("|\\b").append(stopWord).append("\\b");
            }
        }
        if (this.cleanUpPunctuation != null && !this.cleanUpPunctuation.isEmpty()) {
            punctuation.append("\\u3002");
            for (String mark : this.cleanUpPunctuation) {
                punctuation.append("|").append(mark);
            }
        }
        return this.minimalClean(text, stopWords, punctuation);
    }
}

