package org.fbk.cit.hlt.thewikimachine.wikipedia;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLDecoder;
import java.text.DecimalFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import net.didion.jwnl.dictionary.database.DatabaseManagerImpl;
import net.didion.jwnl.princeton.file.PrincetonRandomAccessDictionaryFile;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.xerces.impl.xs.SchemaSymbols;
import org.fbk.cit.hlt.thewikimachine.index.PageIdIndexer;
import org.fbk.cit.hlt.thewikimachine.util.GenericFileUtils;
import org.fbk.cit.hlt.thewikimachine.util.StringTable;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.PageMap;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.ParsedPageLink;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/wikipedia/WikipediaTrafficDownloader.class */
/**
 * Downloads the {@code pagecounts} files listed on the index page of one month and accumulates,
 * per language, the number of requests received by each known page.
 *
 * <p>The set of known pages and the redirect tables are loaded from {@code page-freq.csv} and
 * {@code redirect.csv} files found below a root (model) folder, one sub-folder per language.
 * After every downloaded file the running totals are written to
 * {@code <output>/<year>-<month>/<lang>-<year>-<month>.csv} and a line is appended to the
 * matching {@code .log} file.
 *
 * <p>All the work is done by the constructor; {@link #main(String[])} is the command line
 * entry point.
 */
public class WikipediaTrafficDownloader {
    static Logger logger = Logger.getLogger(WikipediaTrafficDownloader.class.getName());

    /**
     * Language codes for which traffic is collected.
     * NOTE(review): {@code PageIdIndexer.ID_FIELD_NAME} is presumably the string "id" substituted
     * by the decompiler for a literal language code - confirm.
     */
    public static final String[] languages = {"lt", "sq", "be", "fi", "lv", "sr", "bg", "fr", "nl", "sv", "ca", "hr", "no", "tr", "cs", "hu", "pl", "uk", "da", PageIdIndexer.ID_FIELD_NAME, "pt", "de", "is", "ro", "it", "ru", "es", "sk", "et", "sl", "en"};

    private static final Pattern spacePattern = Pattern.compile(" ");
    private static final Pattern tabPattern = Pattern.compile(StringTable.HORIZONTAL_TABULATION);
    private static final DecimalFormat tf = new DecimalFormat("###,###,###");

    /** Default pause, in milliseconds, between two downloads. */
    static final int SLEEP_TIME = 10000;

    /** Number of input lines between two progress dots printed by {@link #process(URL)}. */
    static final int DEFAULT_NOTIFICATION_POINT = 10000;

    public static final String BASE_URL = "http://dumps.wikimedia.org/other/pagecounts-raw/";

    /** Character encoding used for every file read or written and for the downloaded dumps. */
    private static final String ENCODING = "UTF-8";

    /** URL path separator; URLs always use '/', whatever the platform file separator is. */
    private static final String URL_SEPARATOR = "/";

    private final String year;
    private final String month;

    /** Redirect table of each language, keyed by language code. */
    private Map<String, PageMap> redirectMap;

    /** Request counters of each language, keyed by language code and then by page. */
    private Map<String, Map<String, Counter>> trafficMap;

    /**
     * Loads the page and redirect tables, then downloads and processes every {@code pagecounts}
     * link found on the index page of the given month.
     *
     * @param baseUrl     base URL of the dump site; a trailing '/' is added when missing
     * @param outputDir   folder below which the {@code <year>-<month>} output folder is created
     * @param sleepMillis pause in milliseconds after each file; values below zero are treated as zero
     * @param year        year of the statistics
     * @param month       month of the statistics; a single character value is left-padded
     * @param rootDir     root folder containing one sub-folder per language with the model files
     */
    public WikipediaTrafficDownloader(String baseUrl, String outputDir, int sleepMillis, String year, String month, String rootDir) {
        long begin = System.currentTimeMillis();
        logger.info("process started " + new Date());
        // NOTE(review): SchemaSymbols.ATTVAL_FALSE_0 is presumably the string "0" (zero padding of
        // the month) substituted by the decompiler for the literal - confirm.
        String paddedMonth = month.length() == 1 ? SchemaSymbols.ATTVAL_FALSE_0 + month : month;
        this.year = year;
        this.month = paddedMonth;

        String monthUrl = withTrailingSeparator(baseUrl, URL_SEPARATOR) + year + URL_SEPARATOR + year + '-' + paddedMonth + URL_SEPARATOR;
        logger.debug(monthUrl);
        String monthOutputDir = withTrailingSeparator(outputDir, File.separator) + year + '-' + paddedMonth + File.separator;
        String modelDir = withTrailingSeparator(rootDir, File.separator);

        File outputFolder = new File(monthOutputDir);
        if (!outputFolder.exists() && !outputFolder.mkdirs()) {
            // Not fatal here: every later write reports its own failure.
            logger.error("cannot create output folder " + monthOutputDir);
        }
        logger.debug(monthOutputDir);
        logger.debug(modelDir);
        init(languages, modelDir);

        Document index;
        try {
            index = Jsoup.connect(monthUrl).get();
        } catch (IOException e) {
            // Without the index page there is nothing to download.
            logger.error("cannot read the file index at " + monthUrl, e);
            return;
        }

        Elements anchors = index.select("a");
        int processed = 0;
        for (Element anchor : anchors) {
            if (!anchor.html().startsWith("pagecounts")) {
                continue;
            }
            String fileName = anchor.attr("href");
            try {
                URL url = new URL(monthUrl + fileName);
                logger.info(tf.format(processed) + "/" + tf.format(anchors.size()) + StringTable.HORIZONTAL_TABULATION + url);
                process(url);
                write(monthOutputDir, fileName);
            } catch (Exception e) {
                // Deliberately broad: one broken file must not abort the whole month.
                logger.error("error processing " + fileName, e);
            } finally {
                processed++;
            }
            if (!pause(sleepMillis)) {
                // Interrupted: stop instead of hitting the server without any delay.
                logger.error("interrupted, stopping after " + fileName);
                break;
            }
        }
        logger.info(tf.format(processed) + " files processed in " + tf.format(System.currentTimeMillis() - begin) + " ms " + new Date());
    }

    /** Returns {@code path} unchanged when it ends with {@code separator}, otherwise with it appended. */
    private static String withTrailingSeparator(String path, String separator) {
        return path.endsWith(separator) ? path : path + separator;
    }

    /**
     * Sleeps for the given time to throttle the requests.
     *
     * @return {@code false} when the thread was interrupted (the interrupt flag is restored)
     */
    private static boolean pause(int millis) {
        try {
            logger.info("waiting " + tf.format(millis) + "ms...");
            // Thread.sleep rejects negative values.
            Thread.sleep(Math.max(0, millis));
            return true;
        } catch (InterruptedException e) {
            logger.error("interrupted while waiting", e);
            Thread.currentThread().interrupt();
            return false;
        }
    }

    /**
     * Loads the page counters and the redirect table of every language. A language whose files
     * are missing or unreadable is logged and skipped, so both maps always hold the same keys.
     *
     * @param strArr language codes; each one is also the name of a sub-folder of {@code str}
     * @param str    root folder, ending with the file separator
     */
    private void init(String[] strArr, String str) {
        this.trafficMap = new HashMap<String, Map<String, Counter>>();
        this.redirectMap = new HashMap<String, PageMap>();
        for (String language : strArr) {
            try {
                Map<String, String> files = GenericFileUtils.searchForFilesInTheSameFolder(str + language, "page-freq.csv", "redirect.csv");
                String pageFreqPath = files == null ? null : files.get("page-freq.csv");
                String redirectPath = files == null ? null : files.get("redirect.csv");
                if (pageFreqPath == null || redirectPath == null) {
                    logger.error("page-freq.csv or redirect.csv not found in " + str + language + ", skipping " + language);
                    continue;
                }
                logger.debug("initializing " + language + "...");
                // Load both tables before registering either one, so a failure on the second
                // cannot leave a language with counters but no redirect table.
                Map<String, Counter> counters = initFromPageFreq(new File(pageFreqPath));
                PageMap redirects = new PageMap(new File(redirectPath));
                this.trafficMap.put(language, counters);
                this.redirectMap.put(language, redirects);
            } catch (IOException e) {
                logger.error("cannot initialize " + language, e);
            }
        }
    }

    /**
     * Builds a zeroed counter for every page listed in a page frequency file. Only lines made of
     * exactly two tab-separated fields are used; the second field is the map key.
     *
     * @param file page frequency file, read as UTF-8
     * @return the counters keyed by the second field of each accepted line
     * @throws IOException if the file cannot be read
     */
    private Map<String, Counter> initFromPageFreq(File file) throws IOException {
        logger.info("reading " + file + ParsedPageLink.START_SUFFIX_PATTERN + tf.format(file.length()) + ")...");
        Map<String, Counter> counters = new HashMap<String, Counter>();
        LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), ENCODING));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] fields = tabPattern.split(line);
                if (fields.length == 2) {
                    counters.put(fields[1], new Counter(0));
                }
            }
        } finally {
            reader.close();
        }
        logger.info(tf.format(counters.size()) + " pages read " + new Date());
        return counters;
    }

    /**
     * Writes the current totals of every loaded language and appends a line to its log file.
     * A failure on one language is logged and does not stop the others.
     *
     * @param str  output folder, ending with the file separator
     * @param str2 name of the file processed last, recorded in the log line
     */
    private void write(String str, String str2) {
        logger.info("writing partial result after " + str2 + "...");
        logger.info("writing " + languages.length + " languages in " + str + "...");
        for (Map.Entry<String, Map<String, Counter>> entry : this.trafficMap.entrySet()) {
            String language = entry.getKey();
            Map<String, Counter> counters = entry.getValue();
            String prefix = str + language + '-' + this.year + '-' + this.month;
            try {
                // The log file is opened in append mode: one line per processed file.
                PrintWriter log = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(prefix + ".log"), true), ENCODING)));
                try {
                    log.println(language + StringTable.HORIZONTAL_TABULATION + str2 + StringTable.HORIZONTAL_TABULATION + counters.size() + StringTable.HORIZONTAL_TABULATION + new Date());
                } finally {
                    log.close();
                }
                write(counters, language, new File(prefix + ".csv"));
            } catch (IOException e) {
                logger.error("cannot write the " + language + " statistics", e);
            }
        }
    }

    /**
     * Writes one {@code <count><TAB><page>} line per entry of {@code map}, replacing the file.
     *
     * @param map  counters to write
     * @param str  language code, used in the log messages only
     * @param file destination file, written as UTF-8
     * @throws IOException if the file cannot be opened or an error occurs while writing
     */
    void write(Map<String, Counter> map, String str, File file) throws IOException {
        long begin = System.currentTimeMillis();
        logger.info("writing " + tf.format(map.size()) + " " + str + " pages in " + file + "...");
        PrintWriter out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), ENCODING)));
        try {
            for (Map.Entry<String, Counter> entry : map.entrySet()) {
                out.print(entry.getValue().get());
                out.print(StringTable.HORIZONTAL_TABULATION);
                out.print(entry.getKey());
                out.print("\n");
            }
            // PrintWriter never throws on write errors; surface them explicitly.
            if (out.checkError()) {
                throw new IOException("error writing " + file);
            }
        } finally {
            out.close();
        }
        logger.info(tf.format(map.size()) + " pages wrote in " + tf.format(System.currentTimeMillis() - begin) + " " + new Date());
    }

    /**
     * Downloads one gzip-compressed file and adds its counts to the running totals.
     *
     * <p>Each line is split on single spaces and used only when it has more than three fields and
     * its first field is a loaded language. The second field is URL-decoded and resolved through
     * at most two redirect hops; the third field is added to the counter of the resulting page,
     * when that page is known. Lines that cannot be decoded or parsed are skipped.
     *
     * @param url location of the gzip-compressed file
     * @throws IOException if the file cannot be downloaded or decompressed
     */
    void process(URL url) throws IOException {
        long begin = System.currentTimeMillis();
        logger.info("processing " + url + "...");
        LineNumberReader reader = openGzipReader(url);
        try {
            int lineCount = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                String[] fields = spacePattern.split(line);
                if (fields.length > 3) {
                    count(fields[0], fields[1], fields[2]);
                }
                if (lineCount % DEFAULT_NOTIFICATION_POINT == 0) {
                    System.out.print(".");
                }
                lineCount++;
            }
            System.out.print("\n");
        } finally {
            reader.close();
        }
        logger.info(url + " processed in " + tf.format(System.currentTimeMillis() - begin) + " ms " + new Date());
    }

    /** Opens a UTF-8 reader over the gzip stream at {@code url}, releasing the connection stream on failure. */
    private static LineNumberReader openGzipReader(URL url) throws IOException {
        InputStream raw = url.openConnection().getInputStream();
        try {
            return new LineNumberReader(new InputStreamReader(new GZIPInputStream(raw), ENCODING));
        } catch (IOException e) {
            // E.g. the content is not gzip: do not leak the connection stream.
            try {
                raw.close();
            } catch (IOException closeFailure) {
                logger.debug("cannot close the stream of " + url, closeFailure);
            }
            throw e;
        }
    }

    /** Adds the count of one input line to the matching page counter, if the language and page are known. */
    private void count(String language, String encodedPage, String rawCount) {
        Map<String, Counter> counters = this.trafficMap.get(language);
        if (counters == null) {
            return;
        }
        try {
            // NOTE(review): URLDecoder also turns '+' into a space, which may not be intended
            // for page titles containing a literal '+' - confirm against the model file format.
            String page = URLDecoder.decode(encodedPage, ENCODING);
            PageMap redirects = this.redirectMap.get(language);
            String target = redirects == null ? null : redirects.get(page);
            if (target != null) {
                // Follow at most one further hop (redirect to a redirect).
                String secondTarget = redirects.get(target);
                page = secondTarget != null ? secondTarget : target;
            }
            int requests = Integer.parseInt(rawCount);
            Counter counter = counters.get(page);
            if (counter != null) {
                counter.inc(requests);
            }
        } catch (IOException ignored) {
            // UnsupportedEncodingException: cannot happen, UTF-8 is always available.
        } catch (IllegalArgumentException ignored) {
            // Malformed percent-encoding or non numeric count: skip the line (best effort).
        }
    }

    /**
     * Command line entry point. Required options: {@code --year}, {@code --month},
     * {@code --output-dir}, {@code --root-dir}; optional: {@code --base-url}, {@code --sleep}.
     * The log4j configuration is read from the file named by the {@code log-config} system
     * property, or from {@code configuration/log-config.txt} when the property is not set.
     *
     * @param strArr command line arguments
     */
    public static void main(String[] strArr) {
        String property = System.getProperty("log-config");
        if (property == null) {
            property = "configuration/log-config.txt";
        }
        PropertyConfigurator.configure(property);
        Options options = new Options();
        // NOTE(review): DatabaseManagerImpl.URL, SchemaSymbols.ATTVAL_INT and
        // PrincetonRandomAccessDictionaryFile.READ_ONLY below are presumably the literals
        // "url", "int" and "r" substituted by the decompiler - confirm.
        OptionBuilder.withArgName(DatabaseManagerImpl.URL);
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("base url from which to process the traffic statistics (default http://dumps.wikimedia.org/other/pagecounts-raw/)");
        OptionBuilder.withLongOpt("base-url");
        options.addOption(OptionBuilder.create("u"));
        OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("year for which traffic statistics are processed");
        OptionBuilder.isRequired();
        OptionBuilder.withLongOpt("year");
        options.addOption(OptionBuilder.create("y"));
        OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("month for which traffic statistics are processed");
        OptionBuilder.isRequired();
        OptionBuilder.withLongOpt("month");
        options.addOption(OptionBuilder.create("m"));
        OptionBuilder.withArgName("dir");
        OptionBuilder.withDescription("output folder in which to store the traffic statistics");
        OptionBuilder.isRequired();
        OptionBuilder.hasArg();
        OptionBuilder.withLongOpt("output-dir");
        options.addOption(OptionBuilder.create("o"));
        OptionBuilder.withArgName("dir");
        OptionBuilder.withDescription("root folder (model folder) from which to read the page frequency and redirect files");
        OptionBuilder.isRequired();
        OptionBuilder.hasArg();
        OptionBuilder.withLongOpt("root-dir");
        options.addOption(OptionBuilder.create(PrincetonRandomAccessDictionaryFile.READ_ONLY));
        OptionBuilder.withArgName("milliseconds");
        OptionBuilder.withDescription("sleep time between queries (default 10000)");
        OptionBuilder.hasArg();
        OptionBuilder.withLongOpt("sleep");
        options.addOption(OptionBuilder.create("s"));
        options.addOption("h", "help", false, "Print this message");
        try {
            CommandLine parse = new PosixParser().parse(options, strArr);
            String outputDir = parse.getOptionValue("output-dir");
            String baseUrl = BASE_URL;
            if (parse.hasOption("base-url")) {
                baseUrl = parse.getOptionValue("base-url");
            }
            int sleepMillis = SLEEP_TIME;
            if (parse.hasOption("sleep")) {
                String rawSleep = parse.getOptionValue("sleep");
                try {
                    int parsed = Integer.parseInt(rawSleep);
                    if (parsed >= 0) {
                        sleepMillis = parsed;
                    } else {
                        logger.warn("negative sleep time '" + rawSleep + "', using default " + SLEEP_TIME + " ms");
                    }
                } catch (NumberFormatException e) {
                    // Keep running with the default, but tell the user the value was ignored.
                    logger.warn("invalid sleep time '" + rawSleep + "', using default " + SLEEP_TIME + " ms");
                }
            }
            new WikipediaTrafficDownloader(baseUrl, outputDir, sleepMillis, parse.getOptionValue("year"), parse.getOptionValue("month"), parse.getOptionValue("root-dir"));
        } catch (ParseException e2) {
            System.out.println("Parsing failed: " + e2.getMessage() + "\n");
            new HelpFormatter().printHelp(400, "java -cp dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.wikipedia.WikipediaTrafficDownloader", "\n", options, "\n", true);
        }
    }
}
