package org.fbk.cit.hlt.thewikimachine.csv;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.text.DecimalFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Pattern;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.xerces.impl.xs.SchemaSymbols;
import org.fbk.cit.hlt.core.math.Node;
import org.fbk.cit.hlt.thewikimachine.ExtractorParameters;
import org.fbk.cit.hlt.thewikimachine.util.StringTable;
import org.fbk.cit.hlt.thewikimachine.util.SynchronizedIndexer;
import org.fbk.cit.hlt.thewikimachine.util.WeightedSet;

/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/csv/PageAllCategoryExtractor.class */
/**
 * Extractor that writes, for every input key, the weighted set of categories reachable from it.
 *
 * <p>For each processed line the first tab-separated field is looked up in the page-to-category
 * map; the categories found are expanded through the category-to-super-category map up to
 * {@link #getMaxDepth()} levels, each category carrying the weight read by
 * {@link #readCategoryFrequency(String)}. The result is written as one line per key to the
 * "page all category" file, and the category index is written when {@link #end()} is called.
 *
 * <p>NOTE(review): {@link #processLine(String)} synchronizes on {@code this} around the shared
 * writer, so it is presumably invoked from several worker threads by the superclass — confirm
 * against {@code AbstractCategoryExtractor}.
 */
public class PageAllCategoryExtractor extends AbstractCategoryExtractor {
    static Logger logger = Logger.getLogger(PageAllCategoryExtractor.class.getName());

    /** Splits a line on {@code StringTable.HORIZONTAL_TABULATION}. */
    private static final Pattern tabPattern = Pattern.compile(StringTable.HORIZONTAL_TABULATION);

    // java.text.DecimalFormat instances are not synchronized; these shared formatters are kept
    // (they are part of the protected interface) but must not be used concurrently without
    // external synchronization.
    protected static DecimalFormat tf = new DecimalFormat("000,000,000.#");
    protected static DecimalFormat df = new DecimalFormat("###,###,###,###");
    protected static DecimalFormat wf = new DecimalFormat("###,###,###,###.000");

    /** Key (first column) to the set of values (second column) of the page/category file. */
    Map<String, Set<String>> pageCategoryMap;
    /** Key (first column) to the set of values (second column) of the category/super-category file. */
    Map<String, Set<String>> categorySuperCategoryMap;
    /** Assigns an integer index to every category name placed in an output vector. */
    SynchronizedIndexer categoryIndex;
    /** Receives one line per processed key; shared between callers of {@link #processLine(String)}. */
    PrintWriter pageTopCategoryWriter;
    /** Destination of the category index, written and closed by {@link #end()}. */
    Writer categoryIndexWriter;
    /** Category name to weight, as computed by {@link #readCategoryFrequency(String)}. */
    Map<String, Double> categoryWeight;

    /**
     * Creates an extractor with an empty category index.
     *
     * @param i forwarded unchanged to the superclass constructor ({@code main} passes the value
     *          of {@code --num-threads})
     */
    public PageAllCategoryExtractor(int i) {
        super(i);
        this.categoryIndex = new SynchronizedIndexer();
    }

    /**
     * Reads keys from standard input until end of input and logs their category vectors.
     *
     * <p>A line with one field logs the weighted categories of that key; a line with two
     * tab-separated fields logs the cosine and the dot product of the two keys' vectors. Lines
     * with any other number of fields are ignored. The lookup maps must have been loaded before
     * this method is called.
     *
     * @throws Exception if reading standard input or a lookup fails
     */
    public void interactive() throws Exception {
        // A single reader for the whole session: re-wrapping System.in on every iteration can
        // discard input that the previous reader had already buffered.
        BufferedReader console = new BufferedReader(new InputStreamReader(System.in));
        while (true) {
            System.out.println("\nPlease write a key and type <return> to continue (CTRL C to exit):");
            String line = console.readLine();
            if (line == null) {
                // End of input (closed pipe, CTRL-D): leave the loop instead of failing on null.
                return;
            }
            String[] keys = tabPattern.split(line);
            if (keys.length == 1) {
                long begin = System.nanoTime();
                WeightedSet categories = search(keys[0]);
                logger.debug(categories.toSortedMap() + StringTable.HORIZONTAL_TABULATION + tf.format(System.nanoTime() - begin));
                logger.debug("print\n" + print(keys[0], categories));
            } else if (keys.length == 2) {
                long begin = System.nanoTime();
                WeightedSet firstCategories = search(keys[0]);
                WeightedSet secondCategories = search(keys[1]);
                Node[] first = toNodeArray(firstCategories);
                Node[] second = toNodeArray(secondCategories);
                double product = dot(first, second);
                double norm = Math.sqrt(dot(first, first) * dot(second, second));
                // An empty vector has norm 0; report similarity 0 rather than 0/0 = NaN.
                double cosine = norm == 0.0d ? 0.0d : product / norm;
                logger.debug(keys[0] + StringTable.HORIZONTAL_TABULATION + keys[1] + StringTable.HORIZONTAL_TABULATION + wf.format(cosine) + StringTable.HORIZONTAL_TABULATION + wf.format(product) + StringTable.HORIZONTAL_TABULATION + tf.format(System.nanoTime() - begin));
            }
        }
    }

    /**
     * Logs the current category index size, {@code i}, and the difference {@code j2 - j}.
     *
     * @param i  logged as the second column
     * @param j  subtracted from {@code j2}
     * @param j2 minuend of the logged difference
     */
    @Override // org.fbk.cit.hlt.thewikimachine.csv.CSVExtractor
    public void notification(int i, long j, long j2) {
        logger.info(df.format(this.categoryIndex.size()) + StringTable.HORIZONTAL_TABULATION + df.format(i) + StringTable.HORIZONTAL_TABULATION + df.format(j2 - j) + StringTable.HORIZONTAL_TABULATION + new Date());
    }

    /**
     * Computes the dot product of two sparse vectors by merging them on {@code Node.index}.
     *
     * <p>Both arrays must be sorted by ascending index (as produced by the sorted set built in
     * this class). Every matching index is logged at debug level.
     *
     * @param nodeArr  first vector; {@code null} is treated as empty
     * @param nodeArr2 second vector; {@code null} is treated as empty
     * @return the sum of {@code value} products over indexes present in both vectors
     */
    public double dot(Node[] nodeArr, Node[] nodeArr2) {
        if (nodeArr == null || nodeArr2 == null) {
            return 0.0d;
        }
        // The reverse index is only needed for the debug trace; do not build it otherwise.
        boolean debug = logger.isDebugEnabled();
        SortedMap<?, ?> reverseIndex = debug ? this.categoryIndex.reverseIndex() : null;
        double sum = 0.0d;
        int left = 0;
        int right = 0;
        int matches = 0;
        while (left < nodeArr.length && right < nodeArr2.length) {
            Node a = nodeArr[left];
            Node b = nodeArr2[right];
            if (a.index == b.index) {
                if (debug) {
                    logger.debug(matches + "/" + a.index + StringTable.HORIZONTAL_TABULATION + reverseIndex.get(Integer.valueOf(a.index)) + StringTable.HORIZONTAL_TABULATION + b.index + StringTable.HORIZONTAL_TABULATION + a.value + StringTable.HORIZONTAL_TABULATION + b.value);
                }
                matches++;
                sum += a.value * b.value;
                left++;
                right++;
            } else if (a.index > b.index) {
                right++;
            } else {
                left++;
            }
        }
        return sum;
    }

    /**
     * Returns {@code str} with its first character converted to upper case.
     *
     * <p>The conversion uses {@code Locale.ROOT} so the result does not depend on the default
     * locale of the JVM (under a Turkish locale {@code "i"} would otherwise become a dotted
     * capital I and no longer match the keys of the lookup maps).
     *
     * @param str the name to normalize; may be {@code null} or empty
     * @return the normalized name, or {@code str} itself when it is {@code null}, empty, or
     *         already starts with an upper-case character
     */
    public static String normalizePageName(String str) {
        if (str == null || str.isEmpty() || Character.isUpperCase(str.charAt(0))) {
            return str;
        }
        return str.substring(0, 1).toUpperCase(java.util.Locale.ROOT) + str.substring(1);
    }

    /** Returns a string made of {@code i} tab characters (empty when {@code i <= 0}). */
    private String tabulator(int i) {
        StringBuilder sb = new StringBuilder();
        for (int n = 0; n < i; n++) {
            sb.append('\t');
        }
        return sb.toString();
    }

    /**
     * Adds the given categories, and recursively their super-categories, to {@code weightedSet}.
     *
     * <p>Names are normalized with {@link #normalizePageName(String)}. A category already in
     * {@code weightedSet} is not visited again, which also stops cycles. A category without an
     * entry in {@link #categoryWeight} is skipped together with its super-categories; previously
     * it raised a {@code NullPointerException} that aborted the page at depth 1 and silently
     * dropped the remaining siblings at deeper levels.
     *
     * @param set         category names to add; {@code null} is a no-op
     * @param weightedSet accumulator receiving each category with its weight
     * @param i           current depth; nothing is added once it exceeds {@link #getMaxDepth()}
     */
    void search(Set<String> set, WeightedSet weightedSet, int i) {
        if (set == null || i > getMaxDepth()) {
            return;
        }
        for (String rawName : set) {
            String category = normalizePageName(rawName);
            if (weightedSet.contains(category)) {
                continue;
            }
            Double weight = this.categoryWeight.get(category);
            if (weight == null) {
                if (logger.isDebugEnabled()) {
                    logger.debug("no weight for category " + category + ", skipped");
                }
                continue;
            }
            weightedSet.add(category, weight.doubleValue());
            try {
                Set<String> superCategories = this.categorySuperCategoryMap.get(category);
                if (superCategories != null) {
                    search(superCategories, weightedSet, i + 1);
                }
            } catch (Exception e) {
                // Best effort: a failure while expanding one category must not lose the others.
                logger.error(e, e);
            }
        }
    }

    /**
     * Collects the weighted categories reachable from the page {@code str}.
     *
     * @param str key looked up in {@link #pageCategoryMap}
     * @return the categories found, starting at depth 1; empty when the key is unknown
     */
    WeightedSet search(String str) {
        WeightedSet weightedSet = new WeightedSet();
        Set<String> categories = this.pageCategoryMap.get(str);
        if (categories != null) {
            search(categories, weightedSet, 1);
        }
        return weightedSet;
    }

    /**
     * Writes the category vector of the key found in the first field of {@code str}.
     *
     * <p>Nothing is written when the line has no fields or the key has no categories.
     *
     * @param str one tab-separated input line
     */
    @Override // org.fbk.cit.hlt.thewikimachine.csv.CSVExtractor
    public void processLine(String str) {
        String[] fields = tabPattern.split(str);
        if (fields.length == 0) {
            // A line made only of separators splits to an empty array.
            return;
        }
        Node[] vector = toNodeArray(search(fields[0]));
        if (vector.length > 0) {
            String record = fields[0] + StringTable.HORIZONTAL_TABULATION + Node.toString(vector);
            synchronized (this) {
                this.pageTopCategoryWriter.println(record);
            }
        }
    }

    /**
     * Reads a UTF-8 file of two tab-separated columns into a key-to-values map.
     *
     * <p>Lines that do not split into exactly two fields are ignored. Values of a key are
     * merged even when its lines are not contiguous, and no entry is created for an empty file
     * or a malformed first line. The reader is closed even when reading fails.
     *
     * @param str path of the file to read
     * @return the first column mapped to the set of second-column values seen with it
     * @throws IOException if the file cannot be opened or read
     */
    Map<String, Set<String>> readMap(String str) throws IOException {
        logger.info("reading " + str + "...");
        Map<String, Set<String>> map = new HashMap<String, Set<String>>();
        long begin = System.currentTimeMillis();
        int lines = 0;
        try (LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(str), "UTF-8"))) {
            String currentKey = null;
            Set<String> currentValues = null;
            String line;
            while ((line = reader.readLine()) != null) {
                lines++;
                String[] fields = tabPattern.split(line);
                if (fields.length != 2) {
                    continue;
                }
                // Fast path for grouped input: only touch the map when the key changes.
                if (!fields[0].equals(currentKey)) {
                    currentKey = fields[0];
                    currentValues = map.get(currentKey);
                    if (currentValues == null) {
                        currentValues = new HashSet<String>();
                        map.put(currentKey, currentValues);
                    }
                }
                currentValues.add(fields[1]);
            }
        }
        logger.info(df.format(lines) + StringTable.HORIZONTAL_TABULATION + df.format(map.size()) + StringTable.HORIZONTAL_TABULATION + df.format(System.currentTimeMillis() - begin) + " ms " + new Date());
        return map;
    }

    /**
     * Reads a UTF-8 file of {@code name<TAB>count} lines and converts counts to weights.
     *
     * <p>The count of the first usable line is the reference; each name is mapped to
     * {@code log10(reference / count)}, so the reference entry itself gets weight 0. Lines that
     * do not split into exactly two fields are ignored; lines whose count is not a finite
     * positive number are skipped with a warning because they would produce an infinite or
     * undefined weight. The reader is closed even when reading fails.
     *
     * @param str path of the file to read
     * @return name mapped to its weight
     * @throws IOException if the file cannot be opened or read
     */
    Map<String, Double> readCategoryFrequency(String str) throws IOException {
        logger.info("reading " + str + "...");
        Map<String, Double> weights = new HashMap<String, Double>();
        long begin = System.currentTimeMillis();
        double reference = 0.0d;
        boolean referenceSet = false;
        int lines = 0;
        try (LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(str), "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                lines++;
                String[] fields = tabPattern.split(line);
                if (fields.length != 2) {
                    continue;
                }
                double count;
                try {
                    count = Double.parseDouble(fields[1]);
                } catch (NumberFormatException e) {
                    logger.warn("skipping line " + lines + " of " + str + ": invalid count '" + fields[1] + "'");
                    continue;
                }
                if (Double.isNaN(count) || Double.isInfinite(count) || count <= 0.0d) {
                    logger.warn("skipping line " + lines + " of " + str + ": count must be finite and positive, got " + fields[1]);
                    continue;
                }
                if (!referenceSet) {
                    reference = count;
                    referenceSet = true;
                }
                weights.put(fields[0], Double.valueOf(Math.log10(reference / count)));
            }
        }
        logger.info(df.format(lines) + StringTable.HORIZONTAL_TABULATION + df.format(System.currentTimeMillis() - begin) + " ms " + new Date());
        return weights;
    }

    /** Converts {@code weightedSet} to an array of nodes in the order of the sorted set. */
    private Node[] toNodeArray(WeightedSet weightedSet) {
        SortedSet<Node> nodes = toSortedSet(weightedSet);
        return nodes.toArray(new Node[nodes.size()]);
    }

    /**
     * Builds one node per category of {@code weightedSet}, using the index returned by
     * {@link #categoryIndex} for the category name and the category weight as value.
     */
    private SortedSet<Node> toSortedSet(WeightedSet weightedSet) {
        SortedSet<Node> nodes = new TreeSet<Node>();
        Iterator<String> names = weightedSet.iterator();
        while (names.hasNext()) {
            String name = names.next();
            nodes.add(new Node(this.categoryIndex.get(name), weightedSet.get(name)));
        }
        return nodes;
    }

    /**
     * Formats one line per category as {@code key<TAB>category<TAB>weight<TAB>index}.
     *
     * @param str         key repeated at the start of every line
     * @param weightedSet categories to print
     */
    private String print(String str, WeightedSet weightedSet) {
        StringBuilder sb = new StringBuilder();
        Iterator<String> names = weightedSet.iterator();
        while (names.hasNext()) {
            String name = names.next();
            int index = this.categoryIndex.get(name);
            double weight = weightedSet.get(name);
            sb.append(str).append('\t')
                    .append(name).append('\t')
                    .append(weight).append('\t')
                    .append(index).append("\n");
        }
        return sb.toString();
    }

    /**
     * Loads the page/category map, the category/super-category map and the category weights
     * from the files named by {@code extractorParameters}.
     */
    private void loadLookupTables(ExtractorParameters extractorParameters) throws IOException {
        this.pageCategoryMap = readMap(extractorParameters.getWikipediaPageCategoryFileName());
        this.categorySuperCategoryMap = readMap(extractorParameters.getWikipediaCategorySuperCategoryFileName());
        this.categoryWeight = readCategoryFrequency(extractorParameters.getWikipediaSortedPagePerCategoryCountFileName());
    }

    /**
     * Loads the lookup tables, opens the two output files and starts reading the title/id file.
     *
     * <p>An {@code IOException} raised by any of these steps is logged and not rethrown.
     *
     * @param extractorParameters supplies the names of the input and output files
     */
    @Override // org.fbk.cit.hlt.thewikimachine.csv.CSVExtractor
    public void start(ExtractorParameters extractorParameters) {
        try {
            loadLookupTables(extractorParameters);
            this.pageTopCategoryWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(extractorParameters.getWikipediaPageAllCategoryFileName()), "UTF-8")));
            this.categoryIndexWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(extractorParameters.getWikipediCategoryIndexFileName()), "UTF-8"));
            read(extractorParameters.getWikipediaTitleIdFileName());
        } catch (IOException e) {
            logger.error(e, e);
        }
    }

    /**
     * Closes the page output, then writes the category index and closes its writer.
     *
     * <p>Safe to call when {@link #start(ExtractorParameters)} failed before the writers were
     * created. The index writer is closed here so that buffered index data reaches the file.
     */
    @Override // org.fbk.cit.hlt.thewikimachine.csv.CSVExtractor
    public void end() {
        if (this.pageTopCategoryWriter != null) {
            // PrintWriter never throws on write; this is the only way to notice a failed write.
            if (this.pageTopCategoryWriter.checkError()) {
                logger.error("an error occurred while writing the page category file");
            }
            this.pageTopCategoryWriter.close();
        }
        if (this.categoryIndexWriter == null) {
            return;
        }
        try {
            this.categoryIndex.write(this.categoryIndexWriter);
        } catch (IOException e) {
            logger.error(e, e);
        } finally {
            try {
                this.categoryIndexWriter.close();
            } catch (IOException e) {
                logger.error(e, e);
            }
        }
    }

    /**
     * Returns the integer value of option {@code name}, or {@code defaultValue} when absent.
     *
     * @throws ParseException if the option is present but its value is not an integer, so that
     *                        the caller prints the usage message instead of a stack trace
     */
    private static int intOption(CommandLine commandLine, String name, int defaultValue) throws ParseException {
        if (!commandLine.hasOption(name)) {
            return defaultValue;
        }
        String value = commandLine.getOptionValue(name);
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            throw new ParseException("option --" + name + " expects an integer, got '" + value + "'");
        }
    }

    /**
     * Command-line entry point.
     *
     * <p>Parses the options, configures log4j from the file named by the {@code log-config}
     * system property (default {@code configuration/log-config.txt}, with a console fallback
     * when it cannot be read), then either runs the extraction or, with
     * {@code --interactive-mode}, loads the lookup tables and enters {@link #interactive()}.
     * Invalid options print the usage message. "extraction ended" is logged on every exit path.
     *
     * @param strArr command-line arguments
     * @throws IOException declared for compatibility with existing callers
     */
    public static void main(String[] strArr) throws IOException {
        String logConfig = System.getProperty("log-config");
        if (logConfig == null) {
            logConfig = "configuration/log-config.txt";
        }
        Options options = new Options();
        try {
            // OptionBuilder accumulates state in static fields and resets it on create(); the
            // order of the calls below is therefore significant.
            OptionBuilder.withArgName("file");
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("wikipedia xml dump file");
            OptionBuilder.isRequired();
            OptionBuilder.withLongOpt("wikipedia-dump");
            Option dumpOption = OptionBuilder.create("d");
            OptionBuilder.withArgName("dir");
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("output directory in which to store output files");
            OptionBuilder.isRequired();
            OptionBuilder.withLongOpt("output-dir");
            Option outputDirOption = OptionBuilder.create("o");
            // NOTE(review): SchemaSymbols.ATTVAL_INT is presumably just the string "int" picked
            // up by the decompiler — confirm before replacing it with a literal.
            OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("number of threads (default 1)");
            OptionBuilder.withLongOpt("num-threads");
            Option numThreadsOption = OptionBuilder.create("t");
            OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("number of pages to process (default all)");
            OptionBuilder.withLongOpt("num-pages");
            Option numPagesOption = OptionBuilder.create("p");
            OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("receive notification every n pages (default 10000)");
            OptionBuilder.withLongOpt("notification-point");
            Option notificationPointOption = OptionBuilder.create("n");
            OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("recursion maximum category depth (default is 7)");
            OptionBuilder.withLongOpt("max-depth");
            Option maxDepthOption = OptionBuilder.create();
            OptionBuilder.withDescription("enter in the interactive mode");
            OptionBuilder.withLongOpt("interactive-mode");
            Option interactiveOption = OptionBuilder.create();
            options.addOption("h", "help", false, "print this message");
            options.addOption("v", "version", false, "output version information and exit");
            OptionBuilder.withDescription("trace mode");
            OptionBuilder.withLongOpt("trace");
            options.addOption(OptionBuilder.create());
            OptionBuilder.withDescription("debug mode");
            OptionBuilder.withLongOpt("debug");
            options.addOption(OptionBuilder.create());
            options.addOption(interactiveOption);
            options.addOption(dumpOption);
            options.addOption(outputDirOption);
            options.addOption(numThreadsOption);
            options.addOption(numPagesOption);
            options.addOption(maxDepthOption);
            options.addOption(notificationPointOption);

            CommandLine parse = new PosixParser().parse(options, strArr);
            logger.debug(parse);

            Properties properties = new Properties();
            try (InputStreamReader config = new InputStreamReader(new FileInputStream(logConfig), "UTF-8")) {
                properties.load(config);
            } catch (Exception e) {
                // No usable log configuration: fall back to a console appender.
                properties.setProperty("log4j.appender.stdout", "org.apache.log4j.ConsoleAppender");
                properties.setProperty("log4j.appender.stdout.layout.ConversionPattern", "[%t] %-5p (%F:%L) - %m %n");
                properties.setProperty("log4j.appender.stdout.layout", "org.apache.log4j.PatternLayout");
                properties.setProperty("log4j.appender.stdout.Encoding", "UTF-8");
            }
            if (parse.hasOption("trace")) {
                properties.setProperty("log4j.rootLogger", "trace,stdout");
            } else if (parse.hasOption("debug")) {
                properties.setProperty("log4j.rootLogger", "debug,stdout");
            } else if (properties.getProperty("log4j.rootLogger") == null) {
                properties.setProperty("log4j.rootLogger", "info,stdout");
            }
            PropertyConfigurator.configure(properties);

            if (parse.hasOption("help") || parse.hasOption("version")) {
                // Reuses the usage-printing path of the catch block below.
                throw new ParseException("");
            }

            int numThreads = intOption(parse, "num-threads", 1);
            // The value of --num-pages is validated but not applied anywhere in this class.
            intOption(parse, "num-pages", 0);
            int notificationPoint = intOption(parse, "notification-point", 10000);

            ExtractorParameters extractorParameters = new ExtractorParameters(parse.getOptionValue("wikipedia-dump"), parse.getOptionValue("output-dir"));
            logger.debug(extractorParameters);

            PageAllCategoryExtractor extractor = new PageAllCategoryExtractor(numThreads);
            if (parse.hasOption("max-depth")) {
                extractor.setMaxDepth(intOption(parse, "max-depth", 0));
            }
            extractor.setNotificationPoint(notificationPoint);

            if (parse.hasOption("interactive-mode")) {
                try {
                    // The lookup tables are otherwise only loaded by start(); without them the
                    // first query fails with a NullPointerException.
                    extractor.loadLookupTables(extractorParameters);
                    extractor.interactive();
                } catch (Exception e) {
                    logger.error(e, e);
                }
            } else {
                extractor.start(extractorParameters);
            }
        } catch (ParseException e) {
            logger.error("Parsing failed: " + e.getMessage() + "\n");
            new HelpFormatter().printHelp(200, "java -cp dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.csv.PageAllCategoryExtractor", "\n", options, "\n", true);
        } finally {
            logger.info("extraction ended " + new Date());
        }
    }
}
