package org.fbk.cit.hlt.thewikimachine.wikipedia;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.text.DecimalFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Pattern;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.xerces.impl.xs.SchemaSymbols;
import org.fbk.cit.hlt.thewikimachine.index.CrossLanguageSearcher;
import org.fbk.cit.hlt.thewikimachine.index.PageTextSearcher;
import org.fbk.cit.hlt.thewikimachine.util.GenericFileUtils;
import org.fbk.cit.hlt.thewikimachine.util.StringTable;

/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/wikipedia/CrossLanguageTextBuilder.class */
/**
 * Command-line tool that writes a "cross-language text" file from a directory of
 * per-language Wikipedia models.
 *
 * <p>Every entry of the model directory is treated as a language. For each language the
 * {@code text-index}, {@code page-freq} and {@code cross-lang-index} resources are located via
 * {@link GenericFileUtils#searchForFilesInTheSameFolder}. The page/frequency file of the source
 * language is then read line by line; for each page that the cross-language index knows about,
 * one output row is written:
 *
 * <pre>
 * page_lang1_lang2...&lt;sep&gt;sourceText&lt;sep&gt;lang1Text&lt;sep&gt;lang2Text...
 * </pre>
 *
 * <p>All the work is performed by the constructor; by the time it returns, the output file has
 * been written and closed.
 */
public class CrossLanguageTextBuilder {
    /** Default maximum text length passed to every {@link PageTextSearcher}. */
    public static final int DEFAULT_TEXT_LENGTH = 5;
    /** Default value of the page limit used by {@link #main(String[])}. */
    public static final int DEFAULT_NUM_PAGES = 10000;
    /** Default number of rows between two progress log messages. */
    public static final int DEFAULT_NOTIFICATION_POINT = 100;
    /** Resources found for each language: language -> (resource name -> location). */
    private Map<String, Map<String, String>> languageResourceMap;
    /** Names of the entries found in the model directory, one per language. */
    private String[] languages;
    private CrossLanguageSearcher crossLanguageSearcher;
    private int notificationPoint;
    /** Column separator used when none is given. */
    public static final char DEFAULT_COLUMN_SEPARATOR = ' ';
    static Logger logger = Logger.getLogger(CrossLanguageTextBuilder.class.getName());
    /** Splits a page/frequency line on {@link StringTable#HORIZONTAL_TABULATION}. */
    private static final Pattern tabPattern = Pattern.compile(StringTable.HORIZONTAL_TABULATION);
    private static final DecimalFormat df = new DecimalFormat("###,###,###,###");

    /**
     * Builds the cross-language text using {@link #DEFAULT_COLUMN_SEPARATOR} and
     * {@link #DEFAULT_NOTIFICATION_POINT}.
     *
     * @param modelDir       directory containing one sub-entry per language
     * @param sourceLanguage language whose page/frequency file drives the iteration
     * @param numPages       row limit (see the seven-argument constructor for the exact semantics)
     * @param maxTextLength  maximum text length configured on every {@link PageTextSearcher}
     * @param outputFile     file the rows are written to (UTF-8)
     * @throws Exception if the models cannot be read or the output cannot be written
     */
    public CrossLanguageTextBuilder(String modelDir, String sourceLanguage, int numPages, int maxTextLength, String outputFile) throws Exception {
        this(modelDir, sourceLanguage, numPages, maxTextLength, outputFile, DEFAULT_COLUMN_SEPARATOR);
    }

    /**
     * Builds the cross-language text using {@link #DEFAULT_NOTIFICATION_POINT}.
     *
     * @param modelDir        directory containing one sub-entry per language
     * @param sourceLanguage  language whose page/frequency file drives the iteration
     * @param numPages        row limit (see the seven-argument constructor for the exact semantics)
     * @param maxTextLength   maximum text length configured on every {@link PageTextSearcher}
     * @param outputFile      file the rows are written to (UTF-8)
     * @param columnSeparator character placed between the columns of an output row
     * @throws Exception if the models cannot be read or the output cannot be written
     */
    public CrossLanguageTextBuilder(String modelDir, String sourceLanguage, int numPages, int maxTextLength, String outputFile, char columnSeparator) throws Exception {
        this(modelDir, sourceLanguage, numPages, maxTextLength, outputFile, columnSeparator, DEFAULT_NOTIFICATION_POINT);
    }

    /**
     * Builds the cross-language text and writes it to {@code outputFile}.
     *
     * <p>Inputs are validated and all indexes are opened before the output file is created, so
     * an invalid model directory or source language no longer leaves an empty output file
     * behind. The reader and the writer are closed even when processing fails.
     *
     * @param modelDir          directory containing one sub-entry per language
     * @param sourceLanguage    language whose page/frequency file drives the iteration
     * @param numPages          row limit: reading stops as soon as the row counter, which starts
     *                          at 1, reaches this value
     * @param maxTextLength     maximum text length configured on every {@link PageTextSearcher}
     * @param outputFile        file the rows are written to (UTF-8)
     * @param columnSeparator   character placed between the columns of an output row
     * @param notificationPoint a progress message is logged whenever the row counter is a
     *                          multiple of this value; must be positive
     * @throws IllegalArgumentException if {@code notificationPoint} is not positive, if
     *                                  {@code modelDir} cannot be listed, or if the source
     *                                  language lacks one of the required resources
     * @throws IOException              if reading the input or writing the output fails
     * @throws Exception                any other failure raised by the underlying searchers
     */
    public CrossLanguageTextBuilder(String modelDir, String sourceLanguage, int numPages, int maxTextLength, String outputFile, char columnSeparator, int notificationPoint) throws Exception {
        if (notificationPoint <= 0) {
            // The value is used as a modulus below; zero would raise ArithmeticException.
            throw new IllegalArgumentException("notification point must be positive: " + notificationPoint);
        }
        this.notificationPoint = notificationPoint;
        init(modelDir);

        String pageFreqFile = getResource(sourceLanguage, "page-freq");
        if (pageFreqFile == null) {
            throw new IllegalArgumentException("no page-freq resource found for source language '" + sourceLanguage + "' in " + modelDir);
        }
        Map<String, PageTextSearcher> textSearchers = createPageTextSearcherMap(maxTextLength);
        PageTextSearcher sourceSearcher = textSearchers.get(sourceLanguage);
        if (sourceSearcher == null) {
            throw new IllegalArgumentException("no text-index resource found for source language '" + sourceLanguage + "' in " + modelDir);
        }
        String crossLangIndex = getResource(sourceLanguage, "cross-lang-index");
        if (crossLangIndex == null) {
            throw new IllegalArgumentException("no cross-lang-index resource found for source language '" + sourceLanguage + "' in " + modelDir);
        }
        logger.info("reading cross language index from " + crossLangIndex + "...");
        this.crossLanguageSearcher = new CrossLanguageSearcher(crossLangIndex);

        logger.info("reading page/frequency pairs from " + pageFreqFile + "...");
        LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(pageFreqFile), "UTF-8"));
        try {
            PrintWriter writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile), "UTF-8")));
            try {
                // NOTE(review): the counter starts at 1 and advances only after a row is written,
                // so at most numPages - 1 rows are emitted. This looks like an off-by-one with
                // respect to "maximum number of pages"; kept as is to preserve existing output.
                int count = 1;
                long lapStart = System.currentTimeMillis();
                String line;
                while ((line = reader.readLine()) != null) {
                    if (count >= numPages) {
                        logger.info("Exit after " + count + " lines (" + numPages + ")");
                        break;
                    }
                    if (count % this.notificationPoint == 0) {
                        logger.info(count + StringTable.HORIZONTAL_TABULATION + df.format(count) + StringTable.HORIZONTAL_TABULATION + df.format(System.currentTimeMillis() - lapStart) + " ms " + new Date());
                        lapStart = System.currentTimeMillis();
                    }
                    // Only lines with exactly two fields are used; the page is the second one.
                    String[] fields = tabPattern.split(line);
                    if (fields.length != 2) {
                        continue;
                    }
                    String page = fields[1];
                    Map<String, String> translations = this.crossLanguageSearcher.search(page);
                    if (translations == null) {
                        continue;
                    }
                    StringBuilder key = new StringBuilder();
                    StringBuilder texts = new StringBuilder();
                    key.append(page);
                    texts.append(sourceSearcher.search(page));
                    for (Map.Entry<String, String> translation : translations.entrySet()) {
                        PageTextSearcher targetSearcher = textSearchers.get(translation.getKey());
                        if (targetSearcher != null) {
                            // Languages without a text searcher are left out of both columns.
                            key.append('_');
                            key.append(translation.getKey());
                            texts.append(columnSeparator);
                            texts.append(targetSearcher.search(translation.getValue()));
                        }
                    }
                    writer.println(key.toString() + columnSeparator + texts.toString());
                    count++;
                }
                logger.info(count + StringTable.HORIZONTAL_TABULATION + df.format(count) + StringTable.HORIZONTAL_TABULATION + df.format(System.currentTimeMillis() - lapStart) + " ms " + new Date());
                // PrintWriter never throws on write errors; ask for them explicitly.
                if (writer.checkError()) {
                    throw new IOException("error while writing " + outputFile);
                }
            } finally {
                writer.close();
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Returns the configured notification point.
     *
     * @return number of rows between two progress log messages
     */
    public int getNotificationPoint() {
        return this.notificationPoint;
    }

    /**
     * Stores a new notification point. The value is only read while the constructor runs, so
     * calling this on a constructed instance does not affect the already-written output.
     *
     * @param i the new notification point
     */
    public void setNotificationPoint(int i) {
        this.notificationPoint = i;
    }

    /**
     * Looks up a resource of a language.
     *
     * @param str  the language
     * @param str2 the resource name, e.g. {@code "text-index"}
     * @return the resource location, or {@code null} when the language or the resource is unknown
     */
    String getResource(String str, String str2) {
        Map<String, String> resources = this.languageResourceMap.get(str);
        // A language is missing from the map when its resource lookup failed in init().
        return resources == null ? null : resources.get(str2);
    }

    /**
     * Opens a {@link PageTextSearcher} on the text index of every language that has one.
     * Languages without a {@code text-index} resource are skipped with a warning.
     *
     * @param i maximum text length to configure on each searcher
     * @return searchers keyed by language
     * @throws IOException if a text index cannot be opened
     */
    Map<String, PageTextSearcher> createPageTextSearcherMap(int i) throws IOException {
        Map<String, PageTextSearcher> searchers = new HashMap<String, PageTextSearcher>();
        for (String language : this.languages) {
            String textIndex = getResource(language, "text-index");
            if (textIndex == null) {
                logger.warn("no text index found for " + language + ", skipping");
                continue;
            }
            logger.info("reading text index from " + textIndex + "...");
            PageTextSearcher searcher = new PageTextSearcher(textIndex);
            searcher.setMaximunTextLength(i);
            searchers.put(language, searcher);
        }
        return searchers;
    }

    /**
     * Lists the model directory and records, for every entry, where its resources are.
     * An {@link IOException} raised while inspecting one entry is logged and that entry is left
     * without resources; the remaining entries are still processed.
     *
     * @param str the model directory, with or without a trailing file separator
     * @throws IllegalArgumentException if {@code str} cannot be listed as a directory
     */
    void init(String str) {
        if (!str.endsWith(File.separator)) {
            str = str + File.separator;
        }
        logger.info("wikipedia models " + str);
        String[] entries = new File(str).list();
        if (entries == null) {
            // File.list() returns null when the path is not a directory or cannot be read.
            throw new IllegalArgumentException("cannot list model directory " + str);
        }
        this.languages = entries;
        logger.info("supported languages");
        logger.info(Arrays.toString(this.languages));
        this.languageResourceMap = new TreeMap<String, Map<String, String>>();
        for (String language : this.languages) {
            try {
                this.languageResourceMap.put(language, GenericFileUtils.searchForFilesInTheSameFolder(str + language, "text-index", "page-freq", "cross-lang-index"));
            } catch (IOException e) {
                logger.error(e);
            }
        }
    }

    /**
     * Command-line entry point. Configures log4j from the file named by the {@code log-config}
     * system property (default {@code log-config.txt}), parses the options and runs the builder.
     * On a parse error, or when help/version is requested, the usage message is printed.
     *
     * @param strArr the command-line arguments
     * @throws Exception if building the cross-language text fails
     */
    public static void main(String[] strArr) throws Exception {
        String logConfig = System.getProperty("log-config");
        if (logConfig == null) {
            logConfig = "log-config.txt";
        }
        PropertyConfigurator.configure(logConfig);
        Options options = buildOptions();
        try {
            CommandLine cmd = new PosixParser().parse(options, strArr);
            if (cmd.hasOption("help") || cmd.hasOption("version")) {
                // An empty message makes the handler below print the usage text only.
                throw new ParseException("");
            }
            int notificationPoint = parseIntOption(cmd, "notification-point", DEFAULT_NOTIFICATION_POINT);
            if (notificationPoint <= 0) {
                throw new ParseException("--notification-point must be positive, got " + notificationPoint);
            }
            int maxTextLength = parseIntOption(cmd, "maximum-length", DEFAULT_TEXT_LENGTH);
            char columnSeparator = DEFAULT_COLUMN_SEPARATOR;
            if (cmd.hasOption("col-separator")) {
                String value = cmd.getOptionValue("col-separator");
                if (value == null || value.length() == 0) {
                    throw new ParseException("--col-separator expects a character");
                }
                columnSeparator = value.charAt(0);
            }
            int numPages = parseIntOption(cmd, "num-pages", DEFAULT_NUM_PAGES);
            logger.debug(cmd.getOptionValue("model-dir") + StringTable.HORIZONTAL_TABULATION + cmd.getOptionValue("source-language") + StringTable.HORIZONTAL_TABULATION + numPages + StringTable.HORIZONTAL_TABULATION + maxTextLength + StringTable.HORIZONTAL_TABULATION + cmd.getOptionValue("cross-text"));
            new CrossLanguageTextBuilder(cmd.getOptionValue("model-dir"), cmd.getOptionValue("source-language"), numPages, maxTextLength, cmd.getOptionValue("cross-text"), columnSeparator, notificationPoint);
        } catch (ParseException e) {
            String message = e.getMessage();
            if (message != null && message.length() > 0) {
                System.out.println("Parsing failed: " + message + "\n");
            }
            new HelpFormatter().printHelp(400, "java -cp dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.wikipedia.CrossLanguageTextBuilder", "\n", options, "\n", true);
        }
    }

    /** Declares the command-line options understood by {@link #main(String[])}. */
    private static Options buildOptions() {
        OptionBuilder.withArgName("dir");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("wikiepedia model dir");
        OptionBuilder.isRequired();
        OptionBuilder.withLongOpt("model-dir");
        Option modelDir = OptionBuilder.create("m");

        OptionBuilder.withArgName(SchemaSymbols.ATTVAL_STRING);
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("source language");
        OptionBuilder.isRequired();
        OptionBuilder.withLongOpt("source-language");
        Option sourceLanguage = OptionBuilder.create("s");

        OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("maximum length of the text (default is 5)");
        OptionBuilder.withLongOpt("maximum-length");
        Option maximumLength = OptionBuilder.create("l");

        OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("maximum number of pages to process (default is 10000)");
        OptionBuilder.withLongOpt("num-pages");
        Option numPages = OptionBuilder.create("p");

        OptionBuilder.withArgName("file");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("cross language text");
        OptionBuilder.isRequired();
        OptionBuilder.withLongOpt("cross-text");
        Option crossText = OptionBuilder.create("t");

        // Optional: DEFAULT_COLUMN_SEPARATOR is used when the option is absent.
        OptionBuilder.withArgName("char");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("column separator (default is space)");
        OptionBuilder.withLongOpt("col-separator");
        Option colSeparator = OptionBuilder.create();

        OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("receive notification every n pages (default is " + df.format(DEFAULT_NOTIFICATION_POINT) + ")");
        OptionBuilder.withLongOpt("notification-point");
        Option notificationPoint = OptionBuilder.create("b");

        Options options = new Options();
        options.addOption("h", "help", false, "print this message");
        options.addOption("v", "version", false, "output version information and exit");
        options.addOption(modelDir);
        options.addOption(sourceLanguage);
        options.addOption(maximumLength);
        options.addOption(numPages);
        options.addOption(crossText);
        options.addOption(notificationPoint);
        options.addOption(colSeparator);
        return options;
    }

    /**
     * Reads an integer option, turning a malformed number into a {@link ParseException} so that
     * it is reported together with the usage message instead of as a stack trace.
     */
    private static int parseIntOption(CommandLine cmd, String longOpt, int defaultValue) throws ParseException {
        if (!cmd.hasOption(longOpt)) {
            return defaultValue;
        }
        String value = cmd.getOptionValue(longOpt);
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            // The message carries the offending value; the cause itself adds nothing further.
            throw new ParseException("--" + longOpt + " expects an integer, got '" + value + "'");
        }
    }
}
