package org.fbk.cit.hlt.thewikimachine.csv;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import opennlp.tools.parser.AbstractBottomUpParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.xerces.impl.xs.SchemaSymbols;
import org.fbk.cit.hlt.core.lsa.BOW;
import org.fbk.cit.hlt.core.lsa.LSM;
import org.fbk.cit.hlt.core.math.Vector;
import org.fbk.cit.hlt.thewikimachine.util.StringTable;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.ParsedPageLink;
import org.tukaani.xz.common.Util;

/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/csv/OneExamplePerSenseExtractor.class */
public class OneExamplePerSenseExtractor {
    /** Latent semantic model used to map each example's bag of words to vectors. */
    protected LSM lsm;
    /** Limit that extract(File) compares against the number of forms dispatched so far. */
    private int numForms;
    /** Thread count passed to the constructor; used there to size the executor. */
    private int numThreads;
    /** Executor that runs the per-form SenseExtractor tasks. */
    private ExecutorService myExecutor;
    /** Default number of worker threads. */
    public static final int DEFAULT_THREADS_NUMBER = 1;
    /** Default number of input lines between two progress log entries. */
    public static final int DEFAULT_NOTIFICATION_POINT = 100000;
    /** Default LSM dimensionality. */
    public static final int DEFAULT_LSM_DIM = 100;
    // The four *_COLUMN_INDEX constants are not referenced inside this class.
    // NOTE(review): they appear to describe the columns of the output written by
    // SenseExtractor (form, page, freq, ls vector, bow vector) - confirm with readers of that file.
    public static final int PAGE_COLUMN_INDEX = 1;
    public static final int FREQ_COLUMN_INDEX = 2;
    public static final int LS_COLUMN_INDEX = 3;
    public static final int BOW_COLUMN_INDEX = 4;
    /** Default minimum form frequency. */
    public static final int DEFAULT_MINIMUM_FORM_FREQ = 1;
    /** Default minimum page frequency. */
    public static final int DEFAULT_MINIMUM_PAGE_FREQ = 1;
    /** Default for the vector normalization flag. */
    public static final boolean DEFAULT_NORMALIZE = false;
    /** Number of input lines between two progress log entries. */
    private int notificationPoint;
    /** Writer for the output file; every SenseExtractor task prints to it. */
    PrintWriter senseWriter;
    /** Default form limit: effectively unlimited. */
    public static final int DEFAULT_NUM_FORMS = Integer.MAX_VALUE;
    /** Capacity of the executor's bounded task queue. */
    public static final int DEFAULT_QUEUE_SIZE = 10000;
    // The three settings below are stored and exposed through accessors only;
    // nothing else in this class reads them.
    private int minimumFormFreq;
    private int minimumPageFreq;
    /** When true, the vectors built for each example are normalized. */
    private boolean normalized;
    /** Term-frequency function selector; stored and exposed through accessors only. */
    private int tfType;
    static Logger logger = Logger.getLogger(OneExamplePerSenseExtractor.class.getName());
    /** Splits an input line into its tab-separated columns. */
    private static Pattern tabPattern = Pattern.compile(StringTable.HORIZONTAL_TABULATION);
    /** Splits a text column into space-separated tokens. */
    private static Pattern spacePattern = Pattern.compile(" ");
    /** Formats the counters printed in the progress log with grouping separators. */
    static DecimalFormat df = new DecimalFormat("###,###,###,###");

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/csv/OneExamplePerSenseExtractor$SenseExtractor.class */
    public class SenseExtractor extends Thread implements Runnable {
        List<String[]> senseList;
        String form;
        int totalFreq = 0;

        /* JADX INFO: Access modifiers changed from: package-private */
        /* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/csv/OneExamplePerSenseExtractor$SenseExtractor$Example.class */
        public class Example implements Comparable<Example> {
            private BOW bow;
            private Vector bowVector;
            private Vector lsVector;
            private int freq;
            private String page;

            Example(String str, List<String[]> list) {
                this.page = str;
                this.freq = list.size();
                SenseExtractor.this.totalFreq += this.freq;
                new StringBuilder();
                this.bow = new BOW();
                for (int i = 0; i < list.size(); i++) {
                    String[] strArr = list.get(i);
                    try {
                        String[] split = OneExamplePerSenseExtractor.spacePattern.split(strArr[7].toLowerCase());
                        String[] split2 = OneExamplePerSenseExtractor.spacePattern.split(strArr[8].toLowerCase());
                        this.bow.addAll(split);
                        this.bow.addAll(split2);
                    } catch (Exception e) {
                        OneExamplePerSenseExtractor.logger.error(e);
                    }
                }
                this.bowVector = OneExamplePerSenseExtractor.this.lsm.mapDocument(this.bow);
                if (OneExamplePerSenseExtractor.this.normalized) {
                    this.bowVector.normalize();
                }
                this.lsVector = OneExamplePerSenseExtractor.this.lsm.mapPseudoDocument(this.bowVector);
                if (OneExamplePerSenseExtractor.this.normalized) {
                    this.lsVector.normalize();
                }
            }

            public String getPage() {
                return this.page;
            }

            public BOW getBow() {
                return this.bow;
            }

            public Vector getBowVector() {
                return this.bowVector;
            }

            public Vector getLsVector() {
                return this.lsVector;
            }

            public int getFreq() {
                return this.freq;
            }

            public String toString() {
                return this.page + '\t' + (this.freq / SenseExtractor.this.totalFreq) + '\t' + this.lsVector + '\t' + this.bowVector;
            }

            @Override // java.lang.Comparable
            public int compareTo(Example example) {
                return example.getFreq() - this.freq;
            }
        }

        SenseExtractor(List<String[]> list, String str) {
            this.form = str;
            this.senseList = list;
        }

        @Override // java.lang.Thread, java.lang.Runnable
        public void run() {
            Example[] createExampleArray = createExampleArray(createExampleListMap());
            Arrays.sort(createExampleArray, new Comparator<Example>() { // from class: org.fbk.cit.hlt.thewikimachine.csv.OneExamplePerSenseExtractor.SenseExtractor.1
                @Override // java.util.Comparator
                public int compare(Example example, Example example2) {
                    return example2.getFreq() - example.getFreq();
                }
            });
            writeExampleArray(createExampleArray);
        }

        private void writeExampleArray(Example[] exampleArr) {
            StringBuilder sb = new StringBuilder();
            for (Example example : exampleArr) {
                sb.append(this.form);
                sb.append('\t');
                sb.append(example);
                sb.append('\n');
            }
            synchronized (this) {
                OneExamplePerSenseExtractor.this.senseWriter.print(sb.toString());
            }
        }

        Map<String, List<String[]>> createExampleListMap() {
            HashMap hashMap = new HashMap();
            for (int i = 0; i < this.senseList.size(); i++) {
                String[] strArr = this.senseList.get(i);
                String str = strArr[3];
                List list = (List) hashMap.get(str);
                if (list == null) {
                    list = new ArrayList();
                    hashMap.put(str, list);
                }
                list.add(strArr);
            }
            return hashMap;
        }

        Example[] createExampleArray(Map<String, List<String[]>> map) {
            Example[] exampleArr = new Example[map.size()];
            int i = 0;
            for (String str : map.keySet()) {
                exampleArr[i] = new Example(str, map.get(str));
                i++;
            }
            return exampleArr;
        }
    }

    /**
     * Creates an extractor that writes to {@code file} using the default number of threads.
     *
     * @param lsm  the latent semantic model
     * @param file the output file
     * @throws IOException if the output file cannot be opened
     */
    public OneExamplePerSenseExtractor(LSM lsm, File file) throws IOException {
        this(lsm, file, DEFAULT_THREADS_NUMBER);
    }

    /**
     * Creates an extractor that writes to the file named {@code str} using the
     * default number of threads.
     *
     * @param lsm the latent semantic model
     * @param str path of the output file
     * @throws IOException if the output file cannot be opened
     */
    public OneExamplePerSenseExtractor(LSM lsm, String str) throws IOException {
        this(lsm, new File(str), DEFAULT_THREADS_NUMBER);
    }

    /**
     * Creates an extractor that writes to the file named {@code str} using {@code i} threads.
     *
     * @param lsm the latent semantic model
     * @param str path of the output file
     * @param i   number of worker threads
     * @throws IOException if the output file cannot be opened
     */
    public OneExamplePerSenseExtractor(LSM lsm, String str, int i) throws IOException {
        this(lsm, new File(str), i);
    }

    /**
     * Creates an extractor with default settings, a fixed-size thread pool and an
     * open UTF-8 writer on the output file.
     *
     * @param lsm  the latent semantic model used to build the example vectors
     * @param file the output file; it is created or truncated immediately
     * @param i    number of worker threads (core and maximum pool size)
     * @throws IOException if the output file cannot be opened
     */
    public OneExamplePerSenseExtractor(LSM lsm, File file, int i) throws IOException {
        this.lsm = lsm;
        this.numThreads = i;
        this.tfType = 2;
        this.normalized = DEFAULT_NORMALIZE;
        this.minimumFormFreq = DEFAULT_MINIMUM_FORM_FREQ;
        this.minimumPageFreq = DEFAULT_MINIMUM_PAGE_FREQ;
        this.notificationPoint = DEFAULT_NOTIFICATION_POINT;
        // Without this the field kept the int default 0, so extract() hit the
        // form limit right after the first form unless setNumForms() was called.
        this.numForms = DEFAULT_NUM_FORMS;
        logger.info("creating the thread executor (" + i + ")");
        // Bounded queue plus CallerRunsPolicy: when the queue is full the submitting
        // thread runs the task itself, which throttles the reader.
        this.myExecutor = new ThreadPoolExecutor(i, i, 1L, TimeUnit.MINUTES, new ArrayBlockingQueue<Runnable>(DEFAULT_QUEUE_SIZE), new ThreadPoolExecutor.CallerRunsPolicy());
        this.senseWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "UTF-8")));
    }

    /** @return the term-frequency function selector */
    public int getTfType() {
        return tfType;
    }

    /** @param i the term-frequency function selector to store */
    public void setTfType(int i) {
        tfType = i;
    }

    /** @param z whether the vectors built for each example are normalized */
    public void setNormalized(boolean z) {
        normalized = z;
    }

    /** @return whether the vectors built for each example are normalized */
    public boolean isNormalized() {
        return normalized;
    }

    /** @return the stored minimum form frequency */
    public int getMinimumFormFreq() {
        return minimumFormFreq;
    }

    /** @param i the minimum form frequency to store */
    public void setMinimumFormFreq(int i) {
        minimumFormFreq = i;
    }

    /** @return the stored minimum page frequency */
    public int getMinimumPageFreq() {
        return minimumPageFreq;
    }

    /** @param i the minimum page frequency to store */
    public void setMinimumPageFreq(int i) {
        minimumPageFreq = i;
    }

    /** @return the form limit checked while reading the input */
    public int getNumForms() {
        return numForms;
    }

    /** @param i the form limit checked while reading the input */
    public void setNumForms(int i) {
        numForms = i;
    }

    /** @return the stored number of threads */
    public int getNumThreads() {
        return numThreads;
    }

    /**
     * Stores a new thread count. This only updates the field; the executor is
     * sized once, in the constructor.
     *
     * @param i the number of threads to store
     */
    public void setNumThreads(int i) {
        numThreads = i;
    }

    /** @return the number of input lines between two progress log entries */
    public int getNotificationPoint() {
        return notificationPoint;
    }

    /** @param i the number of input lines between two progress log entries */
    public void setNotificationPoint(int i) {
        notificationPoint = i;
    }

    /**
     * Convenience overload of {@link #extract(File)} that takes a path.
     *
     * @param str path of the input file
     * @throws IOException if the input file cannot be read
     */
    public void extract(String str) throws IOException {
        File input = new File(str);
        extract(input);
    }

    /**
     * Reads a tab-separated input file, groups consecutive rows that share the same
     * form (column 2) and hands each group to the executor as a {@link SenseExtractor}
     * task. Rows that do not have exactly 9 columns are ignored.
     *
     * <p>At most {@code numForms} groups are dispatched. The method then shuts the
     * executor down, waits for the tasks and closes the output writer - also when
     * reading fails - so it can be called only once per instance.
     *
     * @param file the input file, expected to be sorted so that equal forms are adjacent
     * @throws IOException if the input file cannot be opened or read
     */
    public void extract(File file) throws IOException {
        logger.info("reading " + file + "...");
        try {
            readAndDispatch(file);
        } finally {
            // Always release the pool and the writer; otherwise a read failure would
            // leave the pool's threads and the open output stream behind.
            shutdownAndClose();
        }
        logger.info("done it");
    }

    /** Reads the input line by line and submits one task per group of equal forms. */
    private void readAndDispatch(File file) throws IOException {
        long intervalStart = System.currentTimeMillis();
        int lineCount = 0;
        int formCount = 0;
        String currentForm = "";
        List<String[]> rows = new ArrayList<String[]>();
        logger.info("totalFreq\tcount\ttime\tdate");
        LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                lineCount++;
                try {
                    String[] fields = tabPattern.split(line);
                    if (fields.length == 9) {
                        // A change of form closes the current group (if there is one).
                        if (!rows.isEmpty() && !fields[2].equals(currentForm)) {
                            this.myExecutor.execute(new SenseExtractor(rows, currentForm));
                            rows = new ArrayList<String[]>();
                            formCount++;
                        }
                        // Stop before starting a group that could only be emitted truncated.
                        if (formCount >= this.numForms) {
                            logger.info("Exit after " + formCount + " forms (" + this.numForms + ")");
                            break;
                        }
                        rows.add(fields);
                        currentForm = fields[2];
                    }
                } catch (Exception e) {
                    // Best effort: a row that cannot be processed is logged and skipped.
                    logger.error("Error at line " + lineCount, e);
                }
                // Guard against a zero interval, which would divide by zero.
                if (this.notificationPoint > 0 && lineCount % this.notificationPoint == 0) {
                    logProgress(lineCount, formCount, intervalStart);
                    intervalStart = System.currentTimeMillis();
                }
            }
        } finally {
            reader.close();
        }
        // Flush the last group. Every parsed row is already in the list, so nothing
        // is re-added here (doing so duplicated the last row or inserted null).
        if (!rows.isEmpty()) {
            logger.debug("executing " + currentForm + ParsedPageLink.START_SUFFIX_PATTERN + rows.size() + ")...");
            this.myExecutor.execute(new SenseExtractor(rows, currentForm));
            formCount++;
        }
        logProgress(lineCount, formCount, intervalStart);
    }

    /** Logs lines read, forms dispatched, milliseconds since {@code since} and the current date. */
    private void logProgress(int lineCount, int formCount, long since) {
        logger.info(df.format(lineCount) + StringTable.HORIZONTAL_TABULATION + df.format(formCount) + StringTable.HORIZONTAL_TABULATION + df.format(System.currentTimeMillis() - since) + StringTable.HORIZONTAL_TABULATION + new Date());
    }

    /** Shuts the executor down, waits for the submitted tasks and closes the output writer. */
    private void shutdownAndClose() {
        this.myExecutor.shutdown();
        logger.info("waiting for execution...");
        try {
            // NOTE(review): Util.VLI_MAX comes from the XZ library and looks like a
            // decompiler substitution for a plain long constant - confirm and replace.
            this.myExecutor.awaitTermination(Util.VLI_MAX, TimeUnit.NANOSECONDS);
        } catch (InterruptedException e) {
            logger.error(e);
            // Restore the flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
        }
        logger.info("closing the streams...");
        this.senseWriter.close();
        // PrintWriter never throws on write failures; it only records them.
        if (this.senseWriter.checkError()) {
            logger.error("errors occurred while writing the output file");
        }
    }

    /**
     * Command-line entry point: parses the options, loads the LSM from the given
     * directory and runs the extraction from the input file to the output file.
     *
     * <p>Required options are {@code input}, {@code output} and {@code lsm}. A parse
     * failure prints the usage message. The {@code help} and {@code version} options
     * are registered but not handled separately.
     *
     * @param strArr the command-line arguments
     */
    public static void main(String[] strArr) {
        String logConfig = System.getProperty("log-config");
        if (logConfig == null) {
            logConfig = "configuration/log-config.txt";
        }
        PropertyConfigurator.configure(logConfig);
        Options options = new Options();
        try {
            // OptionBuilder keeps static state: each run of calls below configures the
            // option that the following create(...) call returns.
            OptionBuilder.withArgName("file");
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("sorted form/page file");
            OptionBuilder.isRequired();
            OptionBuilder.withLongOpt("input");
            Option inputOpt = OptionBuilder.create(AbstractBottomUpParser.INCOMPLETE);
            OptionBuilder.withArgName("file");
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("one sense per example file");
            OptionBuilder.isRequired();
            OptionBuilder.withLongOpt("output");
            Option outputOpt = OptionBuilder.create("o");
            OptionBuilder.withArgName("FUNC");
            OptionBuilder.hasArg();
            // Quotes and separators are now balanced for all four labels.
            OptionBuilder.withDescription("term frequency function; FUNC is 0=`" + BOW.labels[0] + "',1=`" + BOW.labels[1] + "',2=`" + BOW.labels[2] + "',3=`" + BOW.labels[3] + "' (default is 2)");
            OptionBuilder.withLongOpt("tf");
            Option tfOpt = OptionBuilder.create();
            OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("maximum number of forms to process (default is all)");
            OptionBuilder.withLongOpt("num-forms");
            Option numFormsOpt = OptionBuilder.create("f");
            OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("number of threads (default 1)");
            OptionBuilder.withLongOpt("num-threads");
            Option numThreadsOpt = OptionBuilder.create("t");
            OptionBuilder.withArgName("dir");
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("lsm dir");
            OptionBuilder.isRequired();
            OptionBuilder.withLongOpt("lsm");
            Option lsmOpt = OptionBuilder.create("l");
            OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("lsm dim");
            OptionBuilder.withLongOpt("dim");
            Option dimOpt = OptionBuilder.create("d");
            OptionBuilder.withDescription("normalize vectors (default is false)");
            OptionBuilder.withLongOpt("normalized");
            Option normalizedOpt = OptionBuilder.create("n");
            OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("receive notification every n pages (default is 100000)");
            OptionBuilder.withLongOpt("notification-point");
            Option notificationOpt = OptionBuilder.create("b");
            options.addOption("h", "help", false, "print this message");
            options.addOption("v", "version", false, "output version information and exit");
            options.addOption(inputOpt);
            options.addOption(tfOpt);
            options.addOption(outputOpt);
            options.addOption(numThreadsOpt);
            options.addOption(notificationOpt);
            options.addOption(numFormsOpt);
            options.addOption(lsmOpt);
            options.addOption(dimOpt);
            options.addOption(normalizedOpt);
            CommandLine parse = new PosixParser().parse(options, strArr);
            logger.debug(options);
            logger.debug(parse.getOptionValue("output") + StringTable.HORIZONTAL_TABULATION + parse.getOptionValue("input") + StringTable.HORIZONTAL_TABULATION + parse.getOptionValue("lsm"));
            String lsmDir = parse.getOptionValue("lsm");
            if (!lsmDir.endsWith(File.separator)) {
                lsmDir = lsmDir + File.separator;
            }
            File utFile = new File(lsmDir + "X-Ut");
            File sFile = new File(lsmDir + "X-S");
            File rowFile = new File(lsmDir + "X-row");
            File colFile = new File(lsmDir + "X-col");
            File dfFile = new File(lsmDir + "X-df");
            int dim = DEFAULT_LSM_DIM;
            if (parse.hasOption("dim")) {
                dim = Integer.parseInt(parse.getOptionValue("dim"));
            }
            logger.debug(parse.getOptionValue("lsm") + StringTable.HORIZONTAL_TABULATION + parse.getOptionValue("dim"));
            boolean normalize = parse.hasOption("normalized");
            LSM lsm = new LSM(utFile, sFile, rowFile, colFile, dfFile, dim, true, normalize);
            int threads = DEFAULT_THREADS_NUMBER;
            if (parse.hasOption("num-threads")) {
                threads = Integer.parseInt(parse.getOptionValue("num-threads"));
            }
            // NOTE(review): "min-freq" and "min-page" are not registered in `options`
            // above, so these branches are not expected to trigger - confirm whether
            // the options should be added. Parsed values are now kept instead of
            // being thrown away.
            int minFormFreq = DEFAULT_MINIMUM_FORM_FREQ;
            if (parse.hasOption("min-freq")) {
                minFormFreq = Integer.parseInt(parse.getOptionValue("min-freq"));
            }
            int minPageFreq = DEFAULT_MINIMUM_PAGE_FREQ;
            if (parse.hasOption("min-page")) {
                minPageFreq = Integer.parseInt(parse.getOptionValue("min-page"));
            }
            int maxForms = DEFAULT_NUM_FORMS;
            if (parse.hasOption("num-forms")) {
                maxForms = Integer.parseInt(parse.getOptionValue("num-forms"));
            }
            int notifyEvery = DEFAULT_NOTIFICATION_POINT;
            if (parse.hasOption("notification-point")) {
                notifyEvery = Integer.parseInt(parse.getOptionValue("notification-point"));
            }
            int tf = 2;
            if (parse.hasOption("tf")) {
                tf = Integer.parseInt(parse.getOptionValue("tf"));
            }
            logger.info("extracting one example per sense using " + threads + " threads");
            OneExamplePerSenseExtractor extractor = new OneExamplePerSenseExtractor(lsm, parse.getOptionValue("output"), threads);
            extractor.setTfType(tf);
            extractor.setNormalized(normalize);
            extractor.setNotificationPoint(notifyEvery);
            extractor.setNumForms(maxForms);
            extractor.setMinimumFormFreq(minFormFreq);
            extractor.setMinimumPageFreq(minPageFreq);
            extractor.extract(parse.getOptionValue("input"));
        } catch (IOException e) {
            // Pass the throwable separately so the stack trace is logged as well.
            logger.error("extraction failed", e);
        } catch (ParseException e2) {
            System.err.println("Parsing failed: " + e2.getMessage() + "\n");
            new HelpFormatter().printHelp(200, "java -cp dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.csv.OneExamplePerSenseExtractor", "\n", options, "\n", true);
        }
    }
}
