package org.fbk.cit.hlt.thewikimachine.wikipedia;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URLDecoder;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.xerces.impl.xs.SchemaSymbols;
import org.fbk.cit.hlt.core.io.FolderScanner;
import org.fbk.cit.hlt.core.io.GZFilter;
import org.fbk.cit.hlt.thewikimachine.ExtractorParameters;
import org.fbk.cit.hlt.thewikimachine.util.StringTable;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.PageMap;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.ParsedPageLink;

@Deprecated
/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/wikipedia/WikipediaTrafficExtractor.class */
public class WikipediaTrafficExtractor {
    /** Default number of traffic files to process ({@code main} uses 1000 when --num-files is absent). */
    private static final int DEFAULT_NUM_FILES = 1000;
    // NOTE(review): PAGE_COLUMN and TRAFFIC_COLUMN are not referenced anywhere in this class;
    // presumably they name the columns of an input format -- confirm before relying on them.
    private static final int PAGE_COLUMN = 0;
    private static final int TRAFFIC_COLUMN = 1;
    /** Page title mapped to its accumulated traffic counter. */
    private final Map<String, Counter> trafficMap = new HashMap<String, Counter>();
    /** Redirect table; assigned in {@code start} and consulted in {@code process} to replace a title when a mapping exists. */
    private PageMap redirectPageMap;
    /** Path of the traffic source (a single file or a directory), as passed to the constructor. */
    String trafficSourceName;
    /** Upper bound on the number of files processed when the traffic source is a directory. */
    int numFiles;
    static Logger logger = Logger.getLogger(WikipediaTrafficExtractor.class.getName());
    /** Splits the lines read by {@code process}. */
    private static final Pattern spacePattern = Pattern.compile(" ");
    /** Splits the lines read by the two {@code initFrom...} methods. */
    private static final Pattern tabPattern = Pattern.compile(StringTable.HORIZONTAL_TABULATION);

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/wikipedia/WikipediaTrafficExtractor$Counter.class */
    /**
     * Mutable integer tally, used as the value type of the traffic map so that a count can be
     * bumped in place without re-inserting the map entry.
     */
    public class Counter {
        int count;

        /** Creates a tally that starts at {@code initial}. */
        public Counter(int initial) {
            count = initial;
        }

        /** Adds one to the tally. */
        public void inc() {
            count += 1;
        }

        /** Adds {@code amount} to the tally. */
        public void inc(int amount) {
            count = count + amount;
        }

        /** Returns the current tally. */
        public int get() {
            return count;
        }

        /** Returns the current tally in decimal form. */
        @Override
        public String toString() {
            return String.valueOf(count);
        }
    }

    /**
     * Creates an extractor; nothing is read until {@code start} is called.
     *
     * @param sourceName path of the traffic source (file or directory), stored as-is
     * @param maxFiles   maximum number of files to process when the source is a directory
     * @throws IOException declared by the signature; this constructor body performs no I/O
     */
    public WikipediaTrafficExtractor(String sourceName, int maxFiles) throws IOException {
        numFiles = maxFiles;
        trafficSourceName = sourceName;
    }

    /**
     * Runs the extraction: loads the redirect table, seeds the traffic map, folds the traffic
     * source into it and writes the result.
     *
     * <p>The map is seeded from the page-traffic file when that file exists and is non-empty,
     * otherwise from the page-frequency file (all counters start at zero). The traffic source is
     * either a single file or a directory; for a directory at most {@code numFiles} files
     * accepted by {@link GZFilter} are processed.
     *
     * <p>An {@link IOException} is logged (with its stack trace) and ends the run; it is not
     * rethrown. If the traffic source does not exist, an error is logged and nothing is written.
     *
     * @param extractorParameters supplies the redirect, page-traffic and page-frequency file
     *                            names and the language passed to {@code process}
     */
    public void start(ExtractorParameters extractorParameters) {
        try {
            this.redirectPageMap = new PageMap(new File(extractorParameters.getWikipediaRedirFileName()));
            logger.info(this.redirectPageMap.size() + " redirect pages");

            File trafficFile = new File(extractorParameters.getWikipediaPageTrafficFileName());
            if (trafficFile.exists() && trafficFile.length() > 0) {
                initFromTraffic(trafficFile);
            } else {
                initFromPageFreq(new File(extractorParameters.getWikipediaPageFreqFileName()));
            }

            File source = new File(this.trafficSourceName);
            if (!source.exists()) {
                logger.error(source + " does not exist");
                return;
            }
            logger.info("updating traffic statistics from " + source + "...");
            if (source.isFile()) {
                long begin = System.currentTimeMillis();
                process(source, extractorParameters.getLang());
                logger.info(source + " processed in " + (System.currentTimeMillis() - begin) + " ms, " + this.trafficMap.size() + " pages\t" + new Date());
            } else {
                FolderScanner folderScanner = new FolderScanner(source);
                folderScanner.setFilter(new GZFilter());
                int previousSize = 0;
                int fileIndex = 1;
                // Stop asking the scanner for more batches once the file limit is reached;
                // the remaining batches would be skipped anyway.
                while (fileIndex <= this.numFiles && folderScanner.hasNext()) {
                    Object[] batch = folderScanner.next();
                    for (int k = 0; k < batch.length && fileIndex <= this.numFiles; k++) {
                        long begin = System.currentTimeMillis();
                        File current = (File) batch[k];
                        logger.debug(current);
                        process(current, extractorParameters.getLang());
                        logger.info(fileIndex + " - " + current + " processed in " + (System.currentTimeMillis() - begin) + " ms, " + this.trafficMap.size() + " pages (" + (this.trafficMap.size() - previousSize) + ")\t" + new Date());
                        previousSize = this.trafficMap.size();
                        fileIndex++;
                    }
                }
            }
            // NOTE(review): the output name is hard-coded and relative to the working directory;
            // it looks like a leftover test target -- confirm the intended destination.
            write("prova2.csv");
        } catch (IOException e) {
            // Pass the exception as the Throwable argument too, so log4j prints the stack trace
            // (the one-argument form logs only e.toString()).
            logger.error(e, e);
        }
    }

    /**
     * Writes the traffic map to {@code fileName} as UTF-8 text, one line per page:
     * the count, the separator {@code StringTable.HORIZONTAL_TABULATION}, the page title and a
     * {@code \n} terminator.
     *
     * <p>A plain {@link BufferedWriter} is used instead of a {@code PrintWriter} because
     * {@code PrintWriter} swallows I/O errors, which would let a failed write (e.g. disk full)
     * be reported as success. The writer is closed even when writing fails.
     *
     * @param fileName path of the file to create or overwrite
     * @throws IOException if the file cannot be opened or any write, flush or close fails
     */
    private void write(String fileName) throws IOException {
        logger.info("writing " + this.trafficMap.size() + " pages...");
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName), "UTF-8"));
        try {
            for (Map.Entry<String, Counter> entry : this.trafficMap.entrySet()) {
                // Convert explicitly: Writer.write(int) would emit a single character, not the number.
                out.write(Integer.toString(entry.getValue().get()));
                out.write(StringTable.HORIZONTAL_TABULATION);
                out.write(entry.getKey());
                out.write("\n");
            }
        } finally {
            out.close();
        }
        logger.info(this.trafficMap.size() + " pages wrote (" + new Date() + ")");
    }

    /**
     * Seeds the traffic map from a page-frequency file: every line that splits into exactly two
     * fields on {@code tabPattern} contributes its second field as a key with a counter of zero.
     * Lines with any other number of fields are ignored. The first field is not used here.
     *
     * @param file UTF-8 text file to read
     * @throws IOException if the file cannot be opened or read; the reader is closed in either case
     */
    private void initFromPageFreq(File file) throws IOException {
        logger.info("initializing from page freq " + file + "(" + file.length() + ")...");
        LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] fields = tabPattern.split(line);
                if (fields.length == 2) {
                    this.trafficMap.put(fields[1], new Counter(0));
                }
            }
            logger.info(this.trafficMap.size() + " pages read\t" + new Date());
        } finally {
            reader.close();
        }
    }

    /**
     * Seeds the traffic map from an existing traffic file: every line that splits into exactly
     * two fields on {@code tabPattern} contributes its second field as a key and its first field,
     * parsed as an integer, as the starting count. Lines with any other number of fields are
     * ignored.
     *
     * @param file UTF-8 text file to read
     * @throws IOException if the file cannot be opened or read, or if a two-field line carries a
     *                     count that is not a valid integer (reported with file and line number,
     *                     with the {@link NumberFormatException} as cause); the reader is closed
     *                     in either case
     */
    private void initFromTraffic(File file) throws IOException {
        logger.info("initializing from traffic " + file + "(" + file.length() + ")...");
        LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] fields = tabPattern.split(line);
                if (fields.length != 2) {
                    continue;
                }
                int count;
                try {
                    count = Integer.parseInt(fields[0]);
                } catch (NumberFormatException e) {
                    // Surface corrupt input through the declared checked exception, with enough
                    // context to locate the offending line, instead of an unchecked crash.
                    throw new IOException("invalid traffic count '" + fields[0] + "' at line " + reader.getLineNumber() + " of " + file, e);
                }
                this.trafficMap.put(fields[1], new Counter(count));
            }
            logger.info(this.trafficMap.size() + " pages read\t" + new Date());
        } finally {
            reader.close();
        }
    }

    /**
     * Folds one gzip-compressed traffic file into the traffic map.
     *
     * <p>Each line is split on single spaces. Only lines whose first field equals {@code str}
     * are considered. For those, the second field is URL-decoded (UTF-8) into a page title whose
     * first character is upper-cased if it is lower case, and the third field is parsed as the
     * hit count. When the count is positive, the title is replaced by its entry in the redirect
     * map if one exists, and the count is added to that page's counter. Titles that are not
     * already in the traffic map are ignored.
     *
     * <p>Processing is best-effort per line: any exception raised while handling a matching
     * line (too few fields, bad escape sequence, non-numeric count, empty title) is logged with
     * the zero-based line index and its cause, and the line is skipped.
     *
     * @param file gzip-compressed, UTF-8 encoded traffic file
     * @param str  value the first field of a line must equal ({@code start} passes the language)
     * @throws IOException if the file cannot be opened, is not valid gzip, or cannot be read;
     *                     the underlying streams are closed in every case
     */
    private void process(File file, String str) throws IOException {
        logger.info("processing " + file + ParsedPageLink.START_SUFFIX_PATTERN + str + ")...");
        FileInputStream rawStream = new FileInputStream(file);
        LineNumberReader reader;
        try {
            reader = new LineNumberReader(new InputStreamReader(new GZIPInputStream(rawStream), "UTF-8"));
        } catch (IOException e) {
            // GZIPInputStream reads the header in its constructor; do not leak the file handle
            // when that fails.
            rawStream.close();
            throw e;
        }
        try {
            int lineIndex = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                String[] fields = spacePattern.split(line);
                if (fields[0].equals(str)) {
                    try {
                        String title = URLDecoder.decode(fields[1], "UTF-8");
                        char first = title.charAt(0);
                        if (Character.isLowerCase(first)) {
                            title = Character.toUpperCase(first) + title.substring(1);
                        }
                        int hits = Integer.parseInt(fields[2]);
                        if (hits > 0) {
                            String target = this.redirectPageMap.get(title);
                            if (target != null) {
                                title = target;
                            }
                            Counter counter = this.trafficMap.get(title);
                            if (counter != null) {
                                counter.inc(hits);
                            }
                        }
                    } catch (Exception e) {
                        // Deliberately broad: one malformed line must not abort the whole file.
                        logger.error("Error at line " + lineIndex + " of " + file + ": " + e);
                    }
                }
                lineIndex++;
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Command-line entry point.
     *
     * <p>Configures log4j from the file named by the {@code log-config} system property
     * (falling back to {@code log-config.txt}), parses the options, builds the
     * {@code ExtractorParameters} and runs {@code start}. Required options are
     * {@code -d/--wikipedia-dump}, {@code --stats-dir} and {@code -o/--output-dir}.
     * {@code --num-files} defaults to {@code DEFAULT_NUM_FILES}. {@code -n/--notification-point}
     * is accepted and checked to be an integer, but its value is not used. {@code -h} and
     * {@code -v} are declared but not acted upon here.
     *
     * <p>If option parsing fails, the error and the usage text are printed to standard output.
     *
     * @param strArr command-line arguments
     * @throws Exception any failure other than a {@link ParseException}, e.g. a
     *                   {@link NumberFormatException} for a non-integer option value
     */
    public static void main(String[] strArr) throws Exception {
        String logConfig = System.getProperty("log-config");
        if (logConfig == null) {
            logConfig = "log-config.txt";
        }
        PropertyConfigurator.configure(logConfig);

        Options options = new Options();
        try {
            // OptionBuilder's methods are static and return the shared builder, so they chain.
            Option dumpOption = OptionBuilder.withArgName("file").hasArg()
                    .withDescription("wikipedia xml dump file")
                    .isRequired().withLongOpt("wikipedia-dump").create("d");
            Option statsDirOption = OptionBuilder.withArgName("stats-dir").hasArg()
                    .withDescription("wikipedia statistics directory")
                    .isRequired().withLongOpt("stats-dir").create();
            Option outputDirOption = OptionBuilder.withArgName("dir").hasArg()
                    .withDescription("output directory in which to store output files")
                    .isRequired().withLongOpt("output-dir").create("o");
            // The help text now states the default that is actually applied below.
            Option numFilesOption = OptionBuilder.withArgName("num-files").hasArg()
                    .withDescription("number of files to process (default " + DEFAULT_NUM_FILES + ")")
                    .withLongOpt("num-files").create();
            Option notificationOption = OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT).hasArg()
                    .withDescription("receive notification every n pages (default 10000)")
                    .withLongOpt("notification-point").create("n");
            options.addOption("h", "help", false, "print this message");
            options.addOption("v", "version", false, "output version information and exit");
            Option baseDirOption = OptionBuilder
                    .withDescription("if set, use the output folder as base dir")
                    .withLongOpt("base-dir").create();
            options.addOption(dumpOption);
            options.addOption(statsDirOption);
            options.addOption(outputDirOption);
            options.addOption(numFilesOption);
            options.addOption(notificationOption);
            options.addOption(baseDirOption);

            CommandLine parse = new PosixParser().parse(options, strArr);
            int numFiles = DEFAULT_NUM_FILES;
            if (parse.hasOption("num-files")) {
                numFiles = Integer.parseInt(parse.getOptionValue("num-files"));
            }
            if (parse.hasOption("notification-point")) {
                // The value is unused; parsing is kept so a non-integer is still rejected.
                Integer.parseInt(parse.getOptionValue("notification-point"));
            }

            String dump = parse.getOptionValue("wikipedia-dump");
            String outputDir = parse.getOptionValue("output-dir");
            ExtractorParameters extractorParameters;
            if (parse.hasOption("base-dir")) {
                extractorParameters = new ExtractorParameters(dump, outputDir, true);
            } else {
                extractorParameters = new ExtractorParameters(dump, outputDir);
            }
            logger.debug("extracting statistics (" + extractorParameters.getWikipediaPageTrafficFileName() + ")...");
            new WikipediaTrafficExtractor(parse.getOptionValue("stats-dir"), numFiles).start(extractorParameters);
            logger.info("extraction ended " + new Date());
        } catch (ParseException e) {
            System.out.println("Parsing failed: " + e.getMessage() + "\n");
            new HelpFormatter().printHelp(400, "java -cp dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.wikipedia.WikipediaTrafficExtractor", "\n", options, "\n", true);
        }
    }
}
