package org.tallison.gramreaper;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.OpenOption;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.attribute.FileAttribute;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

/* loaded from: input_file:org/tallison/gramreaper/DumpTerms.class */
/**
 * Command-line tool that dumps the terms with the highest document frequency from a single
 * Lucene index, or from every entry found directly under an index directory.
 *
 * <p>For each processed field the {@code topN} most frequent terms that survive the configured
 * document-frequency filters and the stop/start word lists are collected. They are then written
 * to the configured output file (or, when the output path is a directory, to a file named after
 * the field inside it) or printed to standard out when no output path was given.
 */
public class DumpTerms {
    static Options OPTIONS;
    private final DumpTermsConfig config;

    /**
     * Settings for one {@link DumpTerms} run, populated from command-line arguments by
     * {@link #build(String[])}. For the numeric settings {@code -1} means "not set".
     */
    public static class DumpTermsConfig {
        Path indexDirPath;
        Path indexPath;
        String field = null;
        Integer topN = -1;
        Long minDocFreq = -1L;
        Long maxDocFreq = -1L;
        // The two "percentage" settings are compared against docFreq / docCount,
        // i.e. they are fractions (0.1 means 10% of the documents), not values in 0..100.
        Double minDocPercentage = -1.0d;
        Double maxDocPercentage = -1.0d;
        boolean includeDocFreq = false;
        Set<String> stopWords = new HashSet<>();
        Set<String> startWords = new HashSet<>();
        Path outputFile;

        private DumpTermsConfig() {
        }

        /**
         * Parses the command line into a configuration.
         *
         * @param strArr raw command-line arguments
         * @return the populated configuration, or {@code null} when the arguments are invalid
         *     (the reason and the usage text have then already been printed)
         * @throws IOException if a stop-word or start-word file cannot be read
         */
        public static DumpTermsConfig build(String[] strArr) throws IOException {
            DumpTermsConfig cfg = new DumpTermsConfig();
            try {
                CommandLine commandLine = new DefaultParser().parse(DumpTerms.OPTIONS, strArr);
                if (commandLine.hasOption("o")) {
                    cfg.outputFile = Paths.get(commandLine.getOptionValue("o"));
                }
                if (commandLine.hasOption("i")) {
                    cfg.indexPath = Paths.get(commandLine.getOptionValue("i"));
                }
                if (commandLine.hasOption("n")) {
                    cfg.topN = Integer.parseInt(commandLine.getOptionValue("n"));
                }
                if (commandLine.hasOption("min")) {
                    cfg.minDocFreq = Long.parseLong(commandLine.getOptionValue("min"));
                }
                if (commandLine.hasOption("max")) {
                    cfg.maxDocFreq = Long.parseLong(commandLine.getOptionValue("max"));
                }
                if (commandLine.hasOption("minP")) {
                    cfg.minDocPercentage = Double.parseDouble(commandLine.getOptionValue("minP"));
                }
                if (commandLine.hasOption("maxP")) {
                    cfg.maxDocPercentage = Double.parseDouble(commandLine.getOptionValue("maxP"));
                }
                if (commandLine.hasOption("f")) {
                    cfg.field = commandLine.getOptionValue("f");
                }
                if (commandLine.hasOption("s")) {
                    loadSet(commandLine.getOptionValue("s"), cfg.stopWords);
                }
                if (commandLine.hasOption("startWords")) {
                    loadSet(commandLine.getOptionValue("startWords"), cfg.startWords);
                }
                if (commandLine.hasOption("includeDF")) {
                    cfg.includeDocFreq = true;
                }
                if (commandLine.hasOption("indexDir")) {
                    cfg.indexDirPath = Paths.get(commandLine.getOptionValue("indexDir"));
                }
                if (cfg.indexDirPath == null && cfg.indexPath == null) {
                    throw new ParseException("Must specify either an indexDir or an indexPath");
                }
                return cfg;
            } catch (ParseException e) {
                System.err.println(e.getMessage());
                DumpTerms.usage();
                return null;
            } catch (NumberFormatException e) {
                // A malformed number is a usage error just like an unknown option,
                // so report it the same way instead of letting it escape as a stack trace.
                System.err.println("Invalid numeric argument: " + e.getMessage());
                DumpTerms.usage();
                return null;
            }
        }

        /**
         * Adds every non-blank line of a UTF-8 text file that does not start with {@code #}
         * to {@code set}; each line is trimmed first.
         *
         * @param str path of the word-list file
         * @param set collection that receives the words
         * @throws IOException if the file cannot be opened or read
         */
        private static void loadSet(String str, Set<String> set) throws IOException {
            // try-with-resources so the reader is released even when reading fails part-way
            try (BufferedReader reader =
                    Files.newBufferedReader(Paths.get(str), StandardCharsets.UTF_8)) {
                String line;
                while ((line = reader.readLine()) != null) {
                    String word = line.trim();
                    if (word.isEmpty() || word.startsWith("#")) {
                        continue;
                    }
                    set.add(word);
                }
            }
        }
    }

    /** Prints the command-line help for this tool. */
    public static void usage() {
        new HelpFormatter().printHelp(80, "java -jar gramreaper-x.y.jar DumpTerms -i lucene_index -min 10 -o output.txt", "Tool: DumpTerms", OPTIONS, "");
    }

    /**
     * @param dumpTermsConfig settings that drive {@link #execute()}
     */
    public DumpTerms(DumpTermsConfig dumpTermsConfig) {
        this.config = dumpTermsConfig;
    }

    /**
     * Command-line entry point; returns without doing anything when the arguments are invalid.
     *
     * @param strArr command-line arguments, see {@link #usage()}
     * @throws Exception if processing the index fails
     */
    public static void main(String[] strArr) throws Exception {
        DumpTermsConfig parsed = DumpTermsConfig.build(strArr);
        if (parsed == null) {
            return;
        }
        new DumpTerms(parsed).execute();
    }

    /**
     * Processes the single configured index or, when none is configured, every entry directly
     * under the configured index directory. In directory mode an entry that fails with an
     * {@link IOException} is reported on standard error and skipped.
     *
     * @throws IOException if the single index fails, or the index directory cannot be listed
     */
    private void execute() throws IOException {
        if (this.config.indexPath != null) {
            processIndex(this.config.indexPath);
            return;
        }
        File[] candidates = this.config.indexDirPath.toFile().listFiles();
        if (candidates == null) {
            // listFiles() yields null (not an empty array) for a non-directory or on I/O error
            throw new IOException("indexDir is not a readable directory: " + this.config.indexDirPath);
        }
        for (File candidate : candidates) {
            try {
                processIndex(candidate.toPath());
            } catch (IOException e) {
                // Best effort: keep going with the remaining indices, but say why this one failed.
                System.err.println("couldn't open index: " + candidate.getName() + " (" + e + ")");
            }
        }
    }

    /**
     * Opens the index at {@code path} and dumps its top terms. Nothing is produced unless
     * {@code topN} has been set to a value greater than {@code -1}.
     *
     * @param path directory of the Lucene index
     * @throws IOException if the index cannot be opened or read, or the output cannot be written
     */
    private void processIndex(Path path) throws IOException {
        // Close both the reader and the directory once done so that looping over many
        // indices does not accumulate open file handles.
        try (FSDirectory directory = FSDirectory.open(path);
                DirectoryReader directoryReader = DirectoryReader.open(directory)) {
            LeafReader leafReader = SlowCompositeReaderWrapper.wrap(directoryReader);
            if (this.config.topN > -1) {
                dumpTopN(leafReader);
            }
        }
    }

    /**
     * Dumps the configured field, or every field known to the reader when none is configured.
     *
     * @param leafReader reader over the whole index
     * @throws IOException if reading terms or writing output fails
     */
    private void dumpTopN(LeafReader leafReader) throws IOException {
        if (this.config.field != null) {
            dumpTopNField(leafReader, this.config.field);
            return;
        }
        for (FieldInfo fieldInfo : leafReader.getFieldInfos()) {
            dumpTopNField(leafReader, fieldInfo.name);
        }
    }

    /**
     * Collects the {@code topN} terms of one field by document frequency and emits them.
     * Fields for which the reader exposes no terms are skipped without output.
     *
     * @param leafReader reader over the whole index
     * @param str name of the field to dump
     * @throws IOException if reading terms or writing output fails
     */
    private void dumpTopNField(LeafReader leafReader, String str) throws IOException {
        TermsEnum termsEnum = termsEnumOrNull(leafReader, str);
        if (termsEnum == null) {
            return;
        }
        int topN = this.config.topN;
        TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);
        int docCount = leafReader.getDocCount(str);
        for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
            int docFreq = termsEnum.docFreq();
            if (!passesDocFreqFilters(docFreq, docCount)) {
                continue;
            }
            // Once the queue is full, a term rarer than the current minimum cannot make the cut;
            // checking before decoding avoids building a String for it.
            TokenIntPair smallest = (TokenIntPair) queue.top();
            if (smallest != null && queue.size() >= topN && docFreq < smallest.getValue()) {
                continue;
            }
            String token = term.utf8ToString();
            if (this.config.stopWords.contains(token) || this.config.startWords.contains(token)) {
                continue;
            }
            queue.insertWithOverflow(new TokenIntPair(token, docFreq));
        }

        Path outputFile = this.config.outputFile;
        if (outputFile == null) {
            for (TokenIntPair pair : queue.getArray()) {
                System.out.println(formatEntry(pair));
            }
            return;
        }
        // An existing directory receives one file per field; anything else is the file itself.
        Path target = Files.isDirectory(outputFile) ? outputFile.resolve(str) : outputFile;
        writeTopN(target, queue);
    }

    /**
     * Returns an enumeration over the terms of {@code field}, or {@code null} when the reader
     * has no terms for it (Lucene documents that {@code terms()} may return {@code null}).
     */
    private static TermsEnum termsEnumOrNull(LeafReader leafReader, String field) throws IOException {
        if (leafReader.terms(field) == null) {
            return null;
        }
        return leafReader.terms(field).iterator();
    }

    /**
     * Applies the min/max document-frequency and min/max document-fraction filters.
     * A filter whose setting is {@code -1} (the default) is not applied.
     *
     * @param docFreq number of documents containing the term
     * @param docCount value of {@code LeafReader.getDocCount} for the field
     * @return {@code true} when the term passes every configured filter
     */
    private boolean passesDocFreqFilters(int docFreq, int docCount) {
        long minDocFreq = this.config.minDocFreq;
        if (minDocFreq > -1 && docFreq < minDocFreq) {
            return false;
        }
        long maxDocFreq = this.config.maxDocFreq;
        if (maxDocFreq > -1 && docFreq > maxDocFreq) {
            return false;
        }
        double minFraction = this.config.minDocPercentage;
        double maxFraction = this.config.maxDocPercentage;
        if (minFraction <= -1.0d && maxFraction <= -1.0d) {
            return true;
        }
        // Floating-point division: with int operands the quotient would truncate to 0 or 1.
        double fraction = (double) docFreq / docCount;
        if (minFraction > -1.0d && fraction < minFraction) {
            return false;
        }
        if (maxFraction > -1.0d && fraction > maxFraction) {
            return false;
        }
        return true;
    }

    /** Renders one output line (without line terminator) for a collected term. */
    private String formatEntry(TokenIntPair pair) {
        String token = clean(pair.token);
        return this.config.includeDocFreq ? token + "\t" + pair.value : token;
    }

    /**
     * Writes the start words followed by the collected terms to {@code path} as UTF-8, one entry
     * per line. An already existing regular file is left untouched and a notice is printed.
     *
     * @param path file to create
     * @param tokenCountPriorityQueue collected terms
     * @throws IOException if the parent directories or the file cannot be written
     */
    private void writeTopN(Path path, TokenCountPriorityQueue tokenCountPriorityQueue) throws IOException {
        if (Files.isRegularFile(path)) {
            System.err.println("File " + path.getFileName() + " already exists. Skipping.");
            return;
        }
        // A bare file name such as "output.txt" has no parent; there is nothing to create then.
        Path parent = path.getParent();
        if (parent != null) {
            Files.createDirectories(parent);
        }
        try (BufferedWriter writer = Files.newBufferedWriter(path, StandardCharsets.UTF_8)) {
            for (String startWord : this.config.startWords) {
                writer.write(startWord);
                writer.write('\n');
            }
            for (TokenIntPair pair : tokenCountPriorityQueue.getArray()) {
                writer.write(formatEntry(pair));
                writer.write('\n');
            }
        }
    }

    /** Collapses each run of whitespace to a single space; {@code null} becomes an empty string. */
    private static String clean(String str) {
        return str == null ? "" : str.replaceAll("\\s+", " ");
    }

    static {
        OPTIONS = new Options()
                .addOption(new Option("f", "field", true, "Lucene field to process"))
                .addOption(new Option("i", "index", true, "Lucene index to process"))
                .addOption(new Option("indexDir", true, "use this if you have a directory with multiple indices"))
                .addOption("n", "topN", true, "top n most frequent terms")
                .addOption("min", true, "minimum doc frequency")
                .addOption("max", true, "maximum doc frequency")
                .addOption("maxP", true, "maximum doc freq percentage")
                .addOption("minP", true, "minimum doc freq percentage")
                .addOption("includeDF", false, "include the document frequency in the output; default is false")
                .addOption("s", true, "stop words file -- UTF-8, one word per row")
                .addOption("startWords", true, "start words file -- UTF-8, one word per row; every word will be added to the list")
                .addOption("o", true, "output file");
    }
}
