package org.fbk.cit.hlt.thewikimachine.wikipedia;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.URLDecoder;
import java.util.HashMap;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import net.didion.jwnl.princeton.file.PrincetonRandomAccessDictionaryFile;
import opennlp.tools.parser.AbstractBottomUpParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.fbk.cit.hlt.thewikimachine.index.util.SerialUtils;
import org.fbk.cit.hlt.thewikimachine.util.FrequencyHashSet;
import org.fbk.cit.hlt.thewikimachine.util.StringTable;

@Deprecated
/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/wikipedia/StatisticsIndexer.class */
public class StatisticsIndexer {
    public static final String PAGE_FIELD_NAME = "page";
    public static final String TRAFFIC_FIELD_NAME = "num";

    public static void main(String[] strArr) {
        Logger logger = Logger.getLogger(Thread.currentThread().getStackTrace()[1].getClassName());
        String property = System.getProperty("log-config");
        if (property == null) {
            property = "configuration/log-config.txt";
        }
        PropertyConfigurator.configure(property);
        PosixParser posixParser = new PosixParser();
        Options options = new Options();
        OptionBuilder.withDescription("Language");
        OptionBuilder.isRequired();
        OptionBuilder.hasArg();
        OptionBuilder.withArgName("iso-code");
        options.addOption(OptionBuilder.create("l"));
        OptionBuilder.withDescription("Output folder where to save the Lucene index");
        OptionBuilder.isRequired();
        OptionBuilder.hasArg();
        OptionBuilder.withArgName("folder");
        options.addOption(OptionBuilder.create("o"));
        OptionBuilder.withDescription("Redirect file");
        OptionBuilder.isRequired();
        OptionBuilder.hasArg();
        OptionBuilder.withArgName("file");
        options.addOption(OptionBuilder.create(PrincetonRandomAccessDictionaryFile.READ_ONLY));
        OptionBuilder.withDescription("Input folder with files");
        OptionBuilder.isRequired();
        OptionBuilder.hasArg();
        OptionBuilder.withArgName("folder");
        options.addOption(OptionBuilder.create(AbstractBottomUpParser.INCOMPLETE));
        OptionBuilder.withLongOpt("pattern");
        OptionBuilder.withDescription("Starting pattern for file names (default pagecounts)");
        OptionBuilder.hasArg();
        OptionBuilder.withArgName("pattern");
        options.addOption(OptionBuilder.create("p"));
        OptionBuilder.withLongOpt("stop");
        OptionBuilder.withDescription("Stop after <num> files");
        OptionBuilder.hasArg();
        OptionBuilder.withArgName(TRAFFIC_FIELD_NAME);
        options.addOption(OptionBuilder.create());
        options.addOption(AbstractBottomUpParser.COMPLETE, "clean", false, "Clean the output folder before writing on it");
        options.addOption("h", "help", false, "Print this message");
        CommandLine commandLine = null;
        try {
            commandLine = posixParser.parse(options, strArr);
        } catch (ParseException e) {
            System.out.println();
            if (e.getMessage().length() > 0) {
                System.out.println("ERR: " + e.getMessage());
                System.out.println();
            }
            new HelpFormatter().printHelp(400, "java -mx4g " + Thread.currentThread().getStackTrace()[1].getClassName(), "\n", options, "\n", true);
            System.out.println();
            System.exit(0);
        }
        if (commandLine.hasOption("help")) {
            throw new ParseException("");
        }
        String optionValue = commandLine.getOptionValue('o');
        String optionValue2 = commandLine.getOptionValue('l');
        String optionValue3 = commandLine.getOptionValue('r');
        String optionValue4 = commandLine.getOptionValue('i');
        if (!optionValue4.endsWith(File.separator)) {
            optionValue4 = optionValue4 + File.separator;
        }
        if (!optionValue.endsWith(File.separator)) {
            optionValue = optionValue + File.separator;
        }
        boolean z = commandLine.hasOption('c');
        String optionValue5 = commandLine.hasOption('p') ? commandLine.getOptionValue('p') : "pagecounts";
        int parseInt = commandLine.hasOption("stop") ? Integer.parseInt(commandLine.getOptionValue("stop")) : 0;
        String str = optionValue;
        if (z) {
            System.out.println("Cleaning the folder");
            File file = new File(str);
            if (file.exists()) {
                for (String str2 : file.list()) {
                    new File(str + str2).delete();
                }
            } else if (!file.mkdirs()) {
                System.out.println("Unable to create directory " + str);
                System.exit(1);
            }
        }
        logger.info("Loading redirect file " + optionValue3);
        HashMap hashMap = new HashMap();
        Pattern compile = Pattern.compile("\\s+");
        try {
            BufferedReader bufferedReader = new BufferedReader(new FileReader(optionValue3));
            while (true) {
                String readLine = bufferedReader.readLine();
                if (readLine == null) {
                    break;
                }
                String[] split = compile.split(readLine);
                if (split.length >= 2) {
                    String str3 = split[1];
                    for (int i = 2; i < split.length; i++) {
                        str3 = str3 + StringTable.LOW_LINE + split[i];
                    }
                    hashMap.put(split[0], str3);
                }
            }
        } catch (Exception e2) {
            e2.printStackTrace();
        }
        logger.info("Redirect file loaded");
        FrequencyHashSet frequencyHashSet = new FrequencyHashSet();
        File file2 = new File(optionValue4);
        if (!file2.exists() || !file2.isDirectory()) {
            logger.error("Invalid input folder");
            System.exit(0);
        }
        int i2 = 0;
        for (File file3 : file2.listFiles()) {
            String name = file3.getName();
            String absolutePath = file3.getAbsolutePath();
            if (name.startsWith(optionValue5) && name.endsWith(CompressorStreamFactory.GZIP)) {
                logger.info(i2 + " downloading " + absolutePath + "...");
                try {
                    ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
                    GZIPInputStream gZIPInputStream = new GZIPInputStream(new FileInputStream(absolutePath));
                    byte[] bArr = new byte[1024];
                    while (true) {
                        int read = gZIPInputStream.read(bArr);
                        if (read <= 0) {
                            break;
                        } else {
                            byteArrayOutputStream.write(bArr, 0, read);
                        }
                    }
                    gZIPInputStream.close();
                    BufferedReader bufferedReader2 = new BufferedReader(new StringReader(byteArrayOutputStream.toString()));
                    while (true) {
                        String readLine2 = bufferedReader2.readLine();
                        if (readLine2 == null) {
                            break;
                        }
                        String[] split2 = compile.split(readLine2);
                        if (split2.length >= 3 && split2[0].equals(optionValue2)) {
                            try {
                                String decode = URLDecoder.decode(split2[1], "UTF-8");
                                if (hashMap.get(decode) != null) {
                                    decode = (String) hashMap.get(decode);
                                }
                                frequencyHashSet.add(decode, Integer.parseInt(split2[2]));
                            } catch (Exception e3) {
                            }
                        }
                    }
                    byteArrayOutputStream.close();
                } catch (IOException e4) {
                    e4.printStackTrace();
                    System.exit(0);
                }
                if (parseInt > 0) {
                    i2++;
                    if (i2 >= parseInt) {
                        break;
                    }
                } else {
                    continue;
                }
            }
        }
        logger.info("Writing Lucene index");
        try {
            IndexWriter indexWriter = new IndexWriter(optionValue, new WhitespaceAnalyzer());
            for (String str4 : frequencyHashSet.keySet()) {
                Document document = new Document();
                document.add(new Field("page", optionValue2 + ":" + str4, Field.Store.YES, Field.Index.NOT_ANALYZED));
                document.add(new Field(TRAFFIC_FIELD_NAME, SerialUtils.toByteArray(frequencyHashSet.get(str4).intValue()), Field.Store.YES));
                indexWriter.addDocument(document);
            }
            logger.info("Optimizing and closing");
            indexWriter.optimize();
            indexWriter.close();
        } catch (IOException e5) {
            e5.printStackTrace();
            System.exit(0);
        }
    }
}
