package org.fbk.cit.hlt.thewikimachine.csv;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.xerces.impl.xs.SchemaSymbols;
import org.fbk.cit.hlt.thewikimachine.ExtractorParameters;
import org.fbk.cit.hlt.thewikimachine.analysis.HardTokenizer;
import org.fbk.cit.hlt.thewikimachine.analysis.Tokenizer;
import org.fbk.cit.hlt.thewikimachine.index.PageFormSearcher;
import org.fbk.cit.hlt.thewikimachine.index.TypeSearcher;
import org.fbk.cit.hlt.thewikimachine.index.util.FreqSetSearcher;
import org.fbk.cit.hlt.thewikimachine.util.StringTable;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.ParsedPageTitle;

/**
 * Command-line tool that reads a tab-separated file of redirect pairs and, for every pair
 * whose second column has a known page frequency, reports on standard output whether the
 * tokenized form of the first column is among the forms returned by the
 * {@link PageFormSearcher} for the second column.
 *
 * <p>Each reported line starts with {@code SI} (a matching form was found) or {@code NO}
 * (no match), followed by the type returned by the {@link TypeSearcher}, the raw input line,
 * the tokenized form, the matched form (or the first known form in brackets) and the page
 * frequency, all tab-separated.
 */
@Deprecated
/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/csv/RedirectFormExtractor.class */
public class RedirectFormExtractor {
    /** Default minimum frequency passed to {@code PageFormSearcher.loadCache}. */
    private static final int DEFAULT_MIN_FREQ = 100000;
    TypeSearcher typeSearcher;
    PageFormSearcher pageFormSearcher;
    /** Page frequency keyed by page, as read from the page/freq file. */
    private final Map<String, Integer> frePageMap;
    private final Tokenizer tokenizer = new HardTokenizer();
    static Logger logger = Logger.getLogger(RedirectFormExtractor.class.getName());
    private static final Pattern tabPattern = Pattern.compile(StringTable.HORIZONTAL_TABULATION);

    /**
     * Creates an extractor and eagerly loads the page/frequency table.
     *
     * @param typeSearcher     searcher used to look up the type of a page
     * @param pageFormSearcher searcher used to look up the known forms of a page
     * @param str              path of the tab-separated frequency/page file
     * @throws IOException if the frequency file cannot be read
     */
    public RedirectFormExtractor(TypeSearcher typeSearcher, PageFormSearcher pageFormSearcher, String str) throws IOException {
        this.typeSearcher = typeSearcher;
        this.pageFormSearcher = pageFormSearcher;
        this.frePageMap = createFreqPageMap(str);
    }

    /**
     * Loads the page/frequency table from a UTF-8 file whose lines have the layout
     * {@code <frequency> TAB <page>}. Lines that do not split into exactly two fields are
     * skipped.
     *
     * @param str path of the file to read
     * @return a map from page to frequency
     * @throws IOException           if the file cannot be opened or read
     * @throws NumberFormatException if the first field of a two-field line is not an integer
     */
    private Map<String, Integer> createFreqPageMap(String str) throws IOException {
        logger.info("reading page/freq pairs from " + str + "...");
        Map<String, Integer> freqByPage = new HashMap<String, Integer>();
        // try-with-resources: the reader (and the underlying file handle) is released even
        // when reading or number parsing fails.
        try (LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(str), "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] fields = tabPattern.split(line);
                if (fields.length == 2) {
                    freqByPage.put(fields[1], Integer.valueOf(fields[0]));
                }
            }
        }
        logger.info(freqByPage.size() + " forms read");
        return freqByPage;
    }

    /**
     * Looks for {@code str} among the values of the given form entries. The comparison ignores
     * case when the entry type equals {@code TypeSearcher.NOM_LABEL} and is exact otherwise.
     *
     * @param entry    type entry of the page; its type selects the comparison mode
     * @param entryArr candidate form entries, may be {@code null} or empty
     * @param str      form to look for
     * @return the value of the first matching entry, or {@code null} if none matches
     */
    private String find(TypeSearcher.Entry entry, FreqSetSearcher.Entry[] entryArr, String str) {
        if (entryArr == null || entryArr.length == 0) {
            return null;
        }
        // The comparison mode depends only on the page type, so decide it once.
        boolean ignoreCase = entry.getType().equals(TypeSearcher.NOM_LABEL);
        for (FreqSetSearcher.Entry candidate : entryArr) {
            String value = candidate.getValue();
            boolean matches = ignoreCase ? str.equalsIgnoreCase(value) : str.equals(value);
            if (matches) {
                return value;
            }
        }
        return null;
    }

    /**
     * Processes the redirect file and prints one {@code SI}/{@code NO} line per redirect pair
     * whose second column is present in the page/frequency table. Lines that do not split into
     * exactly two tab-separated fields, or whose second field has no known frequency, are
     * skipped.
     *
     * @param str path of the UTF-8, tab-separated redirect file
     * @throws IOException if the file cannot be opened or read
     */
    public void start(String str) throws IOException {
        logger.info("reading redirect pairs " + str + "...");
        int processed = 0;
        try (LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(str), "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] fields = tabPattern.split(line);
                if (fields.length != 2) {
                    continue;
                }
                String page = fields[1];
                Integer freq = this.frePageMap.get(page);
                if (freq == null) {
                    continue;
                }
                TypeSearcher.Entry type = this.typeSearcher.search(page);
                FreqSetSearcher.Entry[] forms = this.pageFormSearcher.search(page);
                String form = this.tokenizer.tokenizedString(new ParsedPageTitle(fields[0]).getForm());
                String match = find(type, forms, form);
                if (match != null) {
                    System.out.println("SI\t" + type.getType() + StringTable.HORIZONTAL_TABULATION + line + StringTable.HORIZONTAL_TABULATION + form + StringTable.HORIZONTAL_TABULATION + match + StringTable.HORIZONTAL_TABULATION + freq);
                } else {
                    // A page without any known form must not abort the whole run: print empty
                    // brackets instead of indexing into an empty (or missing) array.
                    String firstForm = (forms != null && forms.length > 0) ? String.valueOf(forms[0]) : "";
                    System.out.println("NO\t" + type.getType() + StringTable.HORIZONTAL_TABULATION + line + StringTable.HORIZONTAL_TABULATION + form + "\t[" + firstForm + "]\t" + freq);
                }
                processed++;
            }
        }
        // NOTE(review): the "0/" prefix is constant; it looks like a counter that was never
        // maintained. Kept as-is so the log output does not change.
        logger.info("0/" + processed);
    }

    /**
     * Command-line entry point: configures logging, parses the options, builds the searchers
     * and runs the extraction. Usage help is printed when parsing fails or when
     * {@code --help} / {@code --version} is given.
     *
     * @param strArr command-line arguments
     * @throws IOException if an index or input file cannot be read
     */
    public static void main(String[] strArr) throws IOException {
        String logConfig = System.getProperty("log-config");
        if (logConfig == null) {
            logConfig = "configuration/log-config.txt";
        }
        PropertyConfigurator.configure(logConfig);
        Options options = new Options();
        try {
            OptionBuilder.withArgName("file");
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("wikipedia xml dump file");
            OptionBuilder.isRequired();
            OptionBuilder.withLongOpt("wikipedia-dump");
            Option dumpOption = OptionBuilder.create("d");
            OptionBuilder.withArgName("dir");
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("output directory in which to store output files");
            OptionBuilder.isRequired();
            OptionBuilder.withLongOpt("output-dir");
            Option outputDirOption = OptionBuilder.create("o");
            OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("number of threads (default 1)");
            OptionBuilder.withLongOpt("num-threads");
            Option numThreadsOption = OptionBuilder.create("t");
            OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("number of pages to process (default all)");
            OptionBuilder.withLongOpt("num-pages");
            Option numPagesOption = OptionBuilder.create("p");
            OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("receive notification every n pages (default 10000)");
            OptionBuilder.withLongOpt("notification-point");
            Option notificationPointOption = OptionBuilder.create("n");
            OptionBuilder.withArgName("n-gram");
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("n-grams size (default is 10)");
            OptionBuilder.withLongOpt("n-gram");
            // NOTE(review): "n" is also the short name of notification-point above, so the two
            // options collide on -n. Left unchanged to keep the command line as it is.
            Option nGramOption = OptionBuilder.create("n");
            options.addOption("h", "help", false, "print this message");
            options.addOption("v", "version", false, "output version information and exit");
            options.addOption(dumpOption);
            options.addOption(outputDirOption);
            options.addOption(numThreadsOption);
            options.addOption(numPagesOption);
            options.addOption(nGramOption);
            options.addOption(notificationPointOption);
            CommandLine commandLine = new PosixParser().parse(options, strArr);
            logger.debug(commandLine);
            if (commandLine.hasOption("help") || commandLine.hasOption("version")) {
                // Reuse the parse-failure path to print the usage text.
                throw new ParseException("");
            }
            // NOTE(review): no option named "minimum-freq" or "key-freq" is registered above,
            // so the two lookups below presumably never succeed - confirm the intended CLI.
            int minimumFreq = DEFAULT_MIN_FREQ;
            if (commandLine.hasOption("minimum-freq")) {
                minimumFreq = Integer.parseInt(commandLine.getOptionValue("minimum-freq"));
            }
            if (commandLine.hasOption("notification-point")) {
                // The value is not used afterwards; parsing only rejects non-numeric input.
                Integer.parseInt(commandLine.getOptionValue("notification-point"));
            }
            ExtractorParameters extractorParameters = new ExtractorParameters(commandLine.getOptionValue("wikipedia-dump"), commandLine.getOptionValue("output-dir"));
            logger.debug(extractorParameters);
            TypeSearcher typeSearcher = new TypeSearcher(extractorParameters.getWikipediaTypeIndexName());
            typeSearcher.loadCache();
            PageFormSearcher pageFormSearcher = new PageFormSearcher(extractorParameters.getWikipediaPageFormIndexName());
            if (commandLine.hasOption("key-freq")) {
                pageFormSearcher.loadCache(commandLine.getOptionValue("key-freq"), minimumFreq);
            }
            RedirectFormExtractor extractor = new RedirectFormExtractor(typeSearcher, pageFormSearcher, extractorParameters.getWikipediaPageFreqFileName());
            extractor.start(extractorParameters.getWikipediaRedirFileName());
        } catch (ParseException e) {
            logger.error("Parsing failed: " + e.getMessage() + "\n");
            new HelpFormatter().printHelp(200, "java -cp dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.csv.RedirectFormExtractor", "\n", options, "\n", true);
        } finally {
            // Logged on success, on usage errors and on unexpected failures alike.
            logger.info("extraction ended " + new Date());
        }
    }
}
