package org.fbk.cit.hlt.thewikimachine;

import java.io.BufferedReader;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.didion.jwnl.princeton.file.PrincetonRandomAccessDictionaryFile;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.codehaus.jackson.map.ObjectMapper;
import org.fbk.cit.hlt.thewikimachine.util.BaseFolder;
import org.fbk.cit.hlt.thewikimachine.util.CommandLineWithLogger;
import org.fbk.cit.hlt.thewikimachine.util.Downloader;
import org.fbk.cit.hlt.thewikimachine.util.UnixBunzip2Wrapper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;

/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/ExtractAllDumps.class */
public class ExtractAllDumps extends BaseFolder {
    /** Class-wide log4j logger. */
    static Logger logger = Logger.getLogger(ExtractAllDumps.class.getName());

    /**
     * Threshold compared against the "articles" statistic of a wiki in {@link #start()}:
     * wikis above it are skipped unless a language filter was given.
     */
    static final Integer ARTICLE_LIMIT = 1500000;

    /**
     * Name of a bzip2-compressed articles dump. Group 1 is the two-character wiki prefix
     * (compared against the language filter), group 2 the numeric dump version.
     * The dots are escaped so that only a literal ".xml.bz2" suffix matches.
     */
    static final Pattern wikiDumpBZippedPattern = Pattern.compile("^(\\w{2})wiki-(\\d+)-pages-articles\\.xml\\.bz2$");

    /** Name of an uncompressed articles dump; same groups as {@link #wikiDumpBZippedPattern}. */
    static final Pattern wikiDumpPattern = Pattern.compile("^(\\w{2})wiki-(\\d+)-pages-articles\\.xml$");

    /** Name of an uncompressed Wikidata dump; group 1 is the numeric version. Not referenced by the code visible in this class. */
    static final Pattern wikiDataDumpPattern = Pattern.compile("^wikidatawiki-(\\d+)-pages-articles\\.xml$");

    /** Set by the "decompress" (or "all") option: bunzip2 the compressed dumps. */
    boolean decompress;

    /** Set by the "download" (or "all") option: look for and fetch newer dumps. */
    boolean download;

    /** Set by the "extract" (or "all") option: run the model extraction on the dumps. */
    boolean extract;

    /** Set by the "clean-dumps" (or "all") option. */
    boolean cleanDumps;

    /** Set by the "clean-models" (or "all") option. */
    boolean cleanModels;

    /** Values of the "languages" option, or null when the option was not given. */
    String[] givenLanguages;

    /** Set by the "fake" option: log what would be done without downloading, unzipping or extracting. */
    boolean fake;

    /** Fixed leading arguments handed to ModelExtractor; {@link #start()} appends the per-dump options. */
    String pattern = "--category-similarity --base-dir -t 8 -r --templates --file --person-info --abstract --incoming-outgoing";

    /** Host the dumps are downloaded from; replaced by the "server" option when given. */
    String server = "dumps.wikimedia.org";

    /**
     * Downloads the resource at the given URL and returns it as text.
     *
     * <p>The content is read line by line and every line is terminated with the platform
     * line separator. Bytes are decoded as UTF-8 rather than with the platform default
     * charset, so the result does not depend on the machine the tool runs on.
     *
     * @param str the URL to fetch
     * @return the downloaded text, or {@code null} if the URL is malformed or an I/O error occurs
     *         (the error message is logged as a warning)
     */
    private static String downloadPage(String str) {
        InputStream inputStream = null;
        try {
            inputStream = new URL(str).openStream();
            BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
            String lineSeparator = System.getProperty("line.separator");
            StringBuilder page = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line).append(lineSeparator);
            }
            return page.toString();
        } catch (IOException e) {
            // MalformedURLException is an IOException, so bad URLs end up here as well.
            logger.warn(e.getMessage());
            return null;
        } finally {
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException e) {
                    // A failed close must not discard a page that was read successfully.
                    logger.warn(e.getMessage());
                }
            }
        }
    }

    /**
     * Looks up the dump versions published for one wiki and downloads the newest
     * articles dump when it is newer than what is already present.
     *
     * <p>The version list is scraped from the links of the wiki's directory page; the dump
     * is only fetched once its file name is listed in that version's md5sums file.
     *
     * @param str     host name of the dump server
     * @param str2    wiki prefix (language code) to download
     * @param treeSet versions already present for this wiki
     * @param str3    folder the dump is saved into
     * @param z       when true, everything is logged but nothing is downloaded
     */
    private static void downloadLastWikipediaVersion(String str, String str2, TreeSet<Integer> treeSet, String str3, boolean z) {
        // Built by concatenation: the server name must not be interpreted as a format pattern.
        String wikiUrl = "http://" + str + "/" + str2 + "wiki/";
        logger.debug("Path: " + wikiUrl);
        logger.info(String.format("Downloading %s version of Wikipedia", str2));
        // Newest version first.
        TreeSet<Integer> remoteVersions = new TreeSet<Integer>(Collections.reverseOrder());
        logger.info("Getting information on dumps for language " + str2);
        try {
            for (Element link : Jsoup.connect(wikiUrl).get().select("a")) {
                Matcher matcher = wikiVersionPattern.matcher(link.html());
                if (matcher.matches()) {
                    remoteVersions.add(Integer.valueOf(matcher.group(1)));
                }
            }
            if (remoteVersions.isEmpty()) {
                logger.warn("No versions for " + str2);
                return;
            }
            Integer latestVersion = remoteVersions.first();
            // NOTE(review): assumes treeSet.first() is the newest local version — confirm its ordering at the caller.
            if (treeSet.size() > 0 && treeSet.first().intValue() >= latestVersion.intValue()) {
                logger.info("No new versions for " + str2);
                return;
            }
            logger.info("There is a new version for " + str2 + ": " + latestVersion);

            String dumpName = str2 + "wiki-" + latestVersion + "-pages-articles.xml.bz2";
            String versionUrl = wikiUrl + latestVersion + "/";
            String dumpUrl = versionUrl + dumpName;
            String md5Url = versionUrl + str2 + "wiki-" + latestVersion + "-md5sums.txt";
            String destinationFolder = str3.endsWith(File.separator) ? str3 : str3 + File.separator;
            String destination = destinationFolder + dumpName;

            String md5Listing = downloadPage(md5Url);
            if (md5Listing == null) {
                logger.warn("Unable to open MD5 file");
                return;
            }
            // The dump counts as ready once its name shows up in the md5sums listing.
            boolean listed = false;
            for (String line : md5Listing.split(System.getProperty("line.separator"))) {
                String[] columns = line.trim().split("\\s+");
                if (columns.length >= 2 && columns[1].equals(dumpName)) {
                    logger.debug(String.format("Found %s", columns[1]));
                    listed = true;
                    break;
                }
            }
            if (!listed) {
                logger.info("Version of file is not ready yet.");
                return;
            }
            logger.info("Downloading file: " + dumpUrl);
            logger.info("Destination file: " + destination);
            if (z) {
                logger.info("Not downloading, it's fake!");
                return;
            }
            try {
                Downloader.Download(dumpUrl, destination);
            } catch (Exception e) {
                logger.error(e.getMessage(), e);
            }
        } catch (Exception e) {
            logger.warn(e.getMessage(), e);
        }
    }

    /**
     * Fetches the site statistics of one Wikipedia edition through its web API.
     *
     * @param str language code used as the host prefix of the Wikipedia edition
     * @return the "statistics" object found under "query" in the JSON response, or {@code null}
     *         when the page cannot be downloaded, cannot be parsed, or lacks those entries
     */
    private static Map<String, Integer> getWikipediaInfo(String str) {
        String json = downloadPage(String.format("http://%s.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=statistics&maxlag=5&format=json", str));
        if (json == null) {
            logger.warn("Unable to download Wikipedia statistics for " + str);
            return null;
        }
        try {
            Map<?, ?> response = new ObjectMapper().readValue(json, Map.class);
            Object query = response.get("query");
            if (!(query instanceof Map)) {
                logger.warn("No query section in Wikipedia statistics for " + str);
                return null;
            }
            Object statistics = ((Map<?, ?>) query).get("statistics");
            if (!(statistics instanceof Map)) {
                logger.warn("No statistics section in Wikipedia statistics for " + str);
                return null;
            }
            // The value types come from the JSON payload and cannot be verified at compile time.
            @SuppressWarnings("unchecked")
            Map<String, Integer> result = (Map<String, Integer>) statistics;
            return result;
        } catch (Exception e) {
            logger.warn(e.getMessage(), e);
            return null;
        }
    }

    /**
     * Lists the uncompressed articles dumps found in a folder.
     *
     * @param file   folder to scan
     * @param strArr language codes to keep, or {@code null} to keep every language
     * @return the matching files; an empty array (never {@code null}) when there are none
     *         or the folder cannot be listed
     */
    private static File[] getListOfDumps(File file, final String[] strArr) {
        // Wrapped once here instead of on every file name.
        final List<String> languages = strArr == null ? null : Arrays.asList(strArr);
        File[] dumps = file.listFiles(new FilenameFilter() {
            @Override
            public boolean accept(File file2, String str) {
                Matcher matcher = ExtractAllDumps.wikiDumpPattern.matcher(str);
                return matcher.find() && (languages == null || languages.contains(matcher.group(1)));
            }
        });
        // listFiles returns null for a missing or unreadable folder; callers iterate the result.
        return dumps == null ? new File[0] : dumps;
    }

    /**
     * Lists the bzip2-compressed articles dumps found in a folder.
     *
     * @param file   folder to scan
     * @param strArr language codes to keep, or {@code null} to keep every language
     * @return the matching files; an empty array (never {@code null}) when there are none
     *         or the folder cannot be listed
     */
    private static File[] getListOfZippedDumps(File file, final String[] strArr) {
        // Wrapped once here instead of on every file name.
        final List<String> languages = strArr == null ? null : Arrays.asList(strArr);
        File[] zippedDumps = file.listFiles(new FilenameFilter() {
            @Override
            public boolean accept(File file2, String str) {
                Matcher matcher = ExtractAllDumps.wikiDumpBZippedPattern.matcher(str);
                return matcher.find() && (languages == null || languages.contains(matcher.group(1)));
            }
        });
        // listFiles returns null for a missing or unreadable folder; callers iterate the result.
        return zippedDumps == null ? new File[0] : zippedDumps;
    }

    /**
     * Copies the switches of the parsed command line into this instance.
     *
     * <p>"all" turns on download, decompress, extract, clean-dumps and clean-models together.
     * (The decompiler reports that this method was originally declared protected.)
     *
     * @param commandLine the parsed command line
     */
    @Override // org.fbk.cit.hlt.thewikimachine.util.BaseFolder
    public void init(CommandLine commandLine) {
        super.init(commandLine, null);

        final boolean everything = commandLine.hasOption("all");
        this.download = everything || commandLine.hasOption("download");
        this.decompress = everything || commandLine.hasOption("decompress");
        this.extract = everything || commandLine.hasOption("extract");
        this.cleanDumps = everything || commandLine.hasOption("clean-dumps");
        this.cleanModels = everything || commandLine.hasOption("clean-models");
        this.fake = commandLine.hasOption("fake");

        // Keep the default server unless one was supplied.
        if (commandLine.hasOption("server")) {
            this.server = commandLine.getOptionValue("server");
        }

        final String[] languages = commandLine.getOptionValues("languages");
        this.givenLanguages = languages;
        if (languages != null) {
            logger.info("Language filter: " + Arrays.toString(languages));
        }
    }

    /**
     * Runs the steps enabled on the command line, in this order: record the dump versions
     * already on disk, download newer dumps, decompress the compressed dumps, extract the
     * models, and finally the clean-up steps.
     */
    public void start() {
        // NOTE(review): ArchiveStreamFactory.DUMP is used only as the folder key; it looks like the
        // decompiler substituted a library constant for a string literal — confirm.
        addMandatoryFolder(ArchiveStreamFactory.DUMP);
        addMandatoryFolder("base");
        exitOnMissingStuff(true);

        logger.info("Retrieving already present information");
        Map<String, TreeSet<Integer>> presentVersions = getPresentVersions(this.givenLanguages);
        // Shallow copy: it shares its version sets with presentVersions, so the versions added
        // below are visible through it too. NOTE(review): confirm that this is intended.
        Map<String, TreeSet<Integer>> initialVersions = new HashMap<String, TreeSet<Integer>>(presentVersions);

        String dumpFolder = this.folders.get(ArchiveStreamFactory.DUMP);
        File dumpDir = new File(dumpFolder);
        collectDumpVersions(dumpDir, presentVersions);

        if (this.download) {
            for (Map.Entry<String, TreeSet<Integer>> entry : presentVersions.entrySet()) {
                downloadLastWikipediaVersion(this.server, entry.getKey(), entry.getValue(), dumpFolder, this.fake);
            }
        }
        if (this.decompress) {
            decompressZippedDumps(dumpDir);
        }
        if (this.extract) {
            extractModelsFromDumps(dumpDir);
        }
        if (this.cleanDumps) {
            System.out.println(initialVersions);
        }
        // The clean-models switch is accepted but has no implementation here.
    }

    /** Adds the version of every dump file found in the folder to the set of its language, for known languages only. */
    private void collectDumpVersions(File dumpDir, Map<String, TreeSet<Integer>> versionsByLanguage) {
        File[] files = dumpDir.listFiles();
        if (files == null) {
            return;
        }
        Pattern[] dumpPatterns = {wikiDumpBZippedPattern, wikiDumpPattern};
        for (File candidate : files) {
            for (Pattern dumpPattern : dumpPatterns) {
                Matcher matcher = dumpPattern.matcher(candidate.getName());
                if (!matcher.find()) {
                    continue;
                }
                TreeSet<Integer> versions = versionsByLanguage.get(matcher.group(1));
                if (versions != null) {
                    versions.add(Integer.valueOf(matcher.group(2)));
                }
            }
        }
    }

    /** Runs bunzip2 on every compressed dump whose uncompressed counterpart does not exist yet. */
    private void decompressZippedDumps(File dumpDir) {
        File[] zippedDumps = getListOfZippedDumps(dumpDir, this.givenLanguages);
        if (zippedDumps == null) {
            logger.warn("Unable to list folder " + dumpDir.getAbsolutePath());
            return;
        }
        for (File zippedDump : zippedDumps) {
            String zippedPath = zippedDump.getAbsolutePath();
            // Target name is the compressed name without its ".bz2" suffix.
            String unzippedPath = zippedPath.substring(0, zippedPath.length() - ".bz2".length());
            if (new File(unzippedPath).exists()) {
                logger.warn("File " + unzippedPath + " exists, skipping");
            } else if (this.fake) {
                logger.info("Not unzipping, it's fake!");
            } else {
                try {
                    UnixBunzip2Wrapper.bunzip2(zippedPath);
                } catch (Exception e) {
                    logger.error(e.getMessage(), e);
                }
            }
        }
    }

    /** Runs the model extraction on every uncompressed dump that has no output folder yet. */
    private void extractModelsFromDumps(File dumpDir) {
        addMandatoryFolder("lsa");
        addMandatoryFolder("res");
        addMandatoryFolder("cl");
        addMandatoryFolder("airpedia");
        addMandatoryFolder("namnom");
        addMandatoryFolder("topic");
        addMandatoryFile("ontology");
        exitOnMissingStuff(true);

        File[] dumps = getListOfDumps(dumpDir, this.givenLanguages);
        if (dumps == null) {
            logger.warn("Unable to list folder " + dumpDir.getAbsolutePath());
            return;
        }
        for (File dump : dumps) {
            ExtractorParameters parameters = new ExtractorParameters(dump.getAbsolutePath(), this.folders.get("base"), true);
            String language = parameters.getLang();
            if (this.givenLanguages != null && !Arrays.asList(this.givenLanguages).contains(language)) {
                continue;
            }
            String outputDir = parameters.getExtractionOutputDirName();
            if (new File(outputDir).exists()) {
                logger.info("Folder " + outputDir + " exists, skipping");
                continue;
            }
            logger.info("Getting Wikipedia information");
            Map<String, Integer> wikipediaInfo = getWikipediaInfo(language);
            Integer articles = wikipediaInfo == null ? null : wikipediaInfo.get("articles");
            // The size limit only applies when no explicit language filter was given.
            if (this.givenLanguages == null) {
                if (articles == null) {
                    logger.warn(String.format("Unable to get the number of articles for [%s], skipping", language));
                    continue;
                }
                if (articles.intValue() > ARTICLE_LIMIT.intValue()) {
                    logger.info(String.format("Wikipedia in [%s] is too big [%d articles], skipping", language, articles));
                    continue;
                }
            }
            String[] arguments = buildModelExtractorArguments(dump);
            logger.info("Starting model extraction: " + Arrays.toString(arguments));
            if (this.fake) {
                logger.info("Not extracting, it's fake!");
                continue;
            }
            try {
                ModelExtractor.main(arguments);
            } catch (Exception e) {
                logger.error(e.getMessage(), e);
            }
        }
    }

    /**
     * Builds the ModelExtractor arguments for one dump, exiting through the exitOnMissing*
     * checks when a required resource is absent. Each value is kept as a single argument,
     * so paths containing whitespace are passed on intact.
     */
    private String[] buildModelExtractorArguments(File dump) {
        List<String> arguments = new ArrayList<String>();
        String fixedArguments = this.pattern.trim();
        if (fixedArguments.length() > 0) {
            arguments.addAll(Arrays.asList(fixedArguments.split("\\s+")));
        }
        arguments.add("-o");
        arguments.add(this.folders.get("base"));
        arguments.add("-d");
        arguments.add(dump.getAbsolutePath());
        arguments.add("--lsm-dir");
        arguments.add(this.folders.get("lsa"));

        String dbpediaMappings = this.folders.get("res") + "topic-type-mapping/datasets/dbpedia-mappings.tsv";
        exitOnMissingFile(dbpediaMappings);
        arguments.add("--dbpedia-pars");
        arguments.add(dbpediaMappings);
        arguments.add(String.valueOf(this.files.get("ontology")));

        String crossLanguageDir = this.folders.get("cl") + "current" + File.separator + "langs" + File.separator;
        exitOnMissingFolder(crossLanguageDir);
        arguments.add("--cross-language-dir");
        arguments.add(crossLanguageDir);

        String airpediaDir = this.folders.get("airpedia") + "current" + File.separator;
        exitOnMissingFolder(airpediaDir);
        arguments.add("--airpedia2-dir");
        arguments.add(airpediaDir);

        String namnomDir = this.folders.get("namnom") + "current" + File.separator;
        exitOnMissingFolder(namnomDir);
        arguments.add("--namnom-dir");
        arguments.add(namnomDir);

        String topicDir = this.folders.get("topic") + "current" + File.separator;
        exitOnMissingFolder(topicDir);
        arguments.add("--topic-dir");
        arguments.add(topicDir);

        return arguments.toArray(new String[arguments.size()]);
    }

    /**
     * Command-line entry point: declares the options, parses the arguments, configures
     * log4j and runs the tool. Exits with status 1, after reporting the cause on standard
     * error, when the command line cannot be parsed or logging cannot be configured.
     *
     * @param strArr the command-line arguments
     */
    public static void main(String[] strArr) {
        ExtractAllDumps extractAllDumps = new ExtractAllDumps();
        CommandLineWithLogger commandLineWithLogger = new CommandLineWithLogger();
        extractAllDumps.extendCommandLine(commandLineWithLogger);

        // Required options.
        OptionBuilder.isRequired();
        OptionBuilder.withDescription("Languages filter");
        OptionBuilder.hasArg();
        OptionBuilder.withArgName("iso-codes");
        OptionBuilder.withLongOpt("languages");
        commandLineWithLogger.addOption(OptionBuilder.create("l"));

        OptionBuilder.isRequired();
        OptionBuilder.withDescription("Data folder");
        OptionBuilder.hasArg();
        OptionBuilder.withArgName("folder");
        OptionBuilder.withLongOpt("data-folder");
        commandLineWithLogger.addOption(OptionBuilder.create("d"));

        // Optional settings and switches.
        OptionBuilder.withDescription("Server");
        OptionBuilder.hasArg();
        OptionBuilder.withArgName("address");
        OptionBuilder.withLongOpt("server");
        commandLineWithLogger.addOption(OptionBuilder.create());

        OptionBuilder.withDescription("Decompress bzip files");
        OptionBuilder.withLongOpt("decompress");
        commandLineWithLogger.addOption(OptionBuilder.create("z"));

        OptionBuilder.withDescription("Download Wikipedia dumps");
        OptionBuilder.withLongOpt("download");
        commandLineWithLogger.addOption(OptionBuilder.create("w"));

        OptionBuilder.withDescription("Extract models from Wikipedia dumps");
        OptionBuilder.withLongOpt("extract");
        commandLineWithLogger.addOption(OptionBuilder.create("e"));

        OptionBuilder.withDescription("Clean dumps folder");
        OptionBuilder.withLongOpt("clean-dumps");
        // NOTE(review): the short name is taken from an unrelated library constant, presumably a
        // decompiler substitution for a string literal — confirm and replace with the literal.
        commandLineWithLogger.addOption(OptionBuilder.create(PrincetonRandomAccessDictionaryFile.READ_ONLY));

        OptionBuilder.withDescription("Clean models folder");
        OptionBuilder.withLongOpt("clean-models");
        commandLineWithLogger.addOption(OptionBuilder.create("m"));

        OptionBuilder.withDescription("Do everything");
        OptionBuilder.withLongOpt("all");
        commandLineWithLogger.addOption(OptionBuilder.create());

        OptionBuilder.withDescription("Fake (do not do anything)");
        OptionBuilder.withLongOpt("fake");
        commandLineWithLogger.addOption(OptionBuilder.create());

        CommandLine commandLine;
        try {
            commandLine = commandLineWithLogger.getCommandLine(strArr);
            PropertyConfigurator.configure(commandLineWithLogger.getLoggerProps());
        } catch (Exception e) {
            // Logging may not be configured yet, so report on standard error.
            System.err.println("Unable to parse the command line or to configure logging: " + e);
            System.exit(1);
            return;
        }
        extractAllDumps.init(commandLine);
        extractAllDumps.start();
    }
}
