package org.fbk.cit.hlt.thewikimachine.analysis;

import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.fbk.cit.hlt.thewikimachine.util.StringTable;

/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/analysis/NGramExtractor.class */
/**
 * Extracts word n-grams from text as space-separated strings of token forms.
 *
 * <p>For every token position the extractor emits one n-gram per window size,
 * starting with the single token and growing one token at a time. Windows are
 * clipped at the end of the token array.
 */
public class NGramExtractor {
    // Not referenced anywhere in this class; kept so the class layout is unchanged.
    private static NGramExtractor ourInstance;

    /**
     * Window size parameter. Note that the extraction loop lets a window cover
     * up to {@code nGramLength + 1} tokens (see {@link #extract(Token[], String)}).
     */
    int nGramLength;

    static Logger logger = Logger.getLogger(NGramExtractor.class.getName());

    // Not referenced anywhere in this class.
    private static final Pattern tabPattern = Pattern.compile(StringTable.HORIZONTAL_TABULATION);

    // Not referenced anywhere in this class.
    DecimalFormat nf = new DecimalFormat("000,000,000.#");

    /** Tokenizer used by {@link #extract(String)} to split raw text into tokens. */
    Tokenizer tokenizer = new HardTokenizer();

    /**
     * Creates an extractor.
     *
     * @param i the window size parameter stored in {@code nGramLength}
     */
    public NGramExtractor(int i) {
        this.nGramLength = i;
    }

    /**
     * Joins the forms of the tokens in the inclusive index range
     * {@code [i, i2]} with single spaces.
     *
     * @param tokenArr the token array to read from
     * @param i index of the first token (always included)
     * @param i2 index of the last token to include
     * @return the space-separated token forms
     */
    private String tokenizedForm(Token[] tokenArr, int i, int i2) {
        StringBuilder sb = new StringBuilder();
        sb.append(tokenArr[i].getForm());
        for (int next = i + 1; next <= i2; next++) {
            sb.append(' ');
            sb.append(tokenArr[next].getForm());
        }
        return sb.toString();
    }

    /**
     * Tokenizes {@code str} with this extractor's tokenizer and extracts its n-grams.
     *
     * @param str the text to process
     * @return the extracted n-grams, in order of start position and then length
     */
    public List<String> extract(String str) {
        return extract(this.tokenizer.tokenArray(str), str);
    }

    /**
     * Extracts n-grams from an already tokenized text.
     *
     * <p>For each start index, n-grams of 1, 2, ... tokens are produced until the
     * window holds {@code nGramLength + 1} tokens or the end of the array is reached.
     *
     * @param tokenArr the tokens to build n-grams from
     * @param str the original text; not used by this method
     * @return the extracted n-grams, in order of start position and then length
     */
    public List<String> extract(Token[] tokenArr, String str) {
        List<String> ngrams = new ArrayList<String>();
        for (int start = 0; start < tokenArr.length; start++) {
            // Compute the exclusive upper bound in long arithmetic: with a very
            // large nGramLength the int sum would wrap to a negative value and
            // silently suppress every n-gram.
            long limit = (long) start + this.nGramLength + 1;
            int end = (int) Math.min(limit, (long) tokenArr.length);
            for (int last = start; last < end; last++) {
                ngrams.add(tokenizedForm(tokenArr, start, last));
            }
        }
        return ngrams;
    }
}
