package org.fbk.cit.hlt.thewikimachine.csv;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import opennlp.tools.parser.AbstractBottomUpParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.fbk.cit.hlt.core.lsa.BOW;
import org.fbk.cit.hlt.core.lsa.LSM;
import org.fbk.cit.hlt.core.math.Vector;
import org.fbk.cit.hlt.thewikimachine.util.StringTable;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.ParsedPageLink;
import org.tukaani.xz.common.Util;

/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/csv/SimpleTrainingExtractor.class */
public class SimpleTrainingExtractor {
    // Model whose mapDocument/mapPseudoDocument methods turn a BOW into a Vector (used by Training).
    protected LSM lsm;
    // Thread count passed to the constructor; the executor is sized once at construction time.
    private int numThreads;
    // Fixed-size pool that runs one Training task per submitted batch of rows.
    private ExecutorService myExecutor;
    public static final int DEFAULT_NUMBER_OF_THREAD = 1;
    public static final int DEFAULT_NOTIFICATION_POINT = 10000;
    // NOTE(review): main() uses the literal 100 as its default LSM dimension; presumably this constant — confirm.
    public static final int DEFAULT_LSM_DIM = 100;
    // Modulus applied to the line counter in train(File) to decide when a progress line is logged.
    private int notificationPoint;
    // Writes to the output file given to the constructor.
    PrintWriter trainingWriter;
    // Writes to the output file path with ".vec" appended.
    PrintWriter vectorWriter;
    // Writes to the output file path with ".bow" appended.
    PrintWriter bowWriter;
    static Logger logger = Logger.getLogger(SimpleTrainingExtractor.class.getName());
    // Splits an input line into its columns.
    private static Pattern tabPattern = Pattern.compile(StringTable.HORIZONTAL_TABULATION);
    // Splits a lower-cased column value into tokens on single spaces.
    private static Pattern spacePattern = Pattern.compile(" ");
    // Formats the counters and elapsed milliseconds printed in progress log lines.
    static DecimalFormat df = new DecimalFormat("###,###,###,###");

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/csv/SimpleTrainingExtractor$Training.class */
    public class Training extends Thread implements Runnable {
        // Rows handed to this task; the methods below read indices 2, 3, 7 and 8 of each row.
        List<String[]> list;
        // Value written as the first column of every output line produced by this task.
        String form;

        /* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/csv/SimpleTrainingExtractor$Training$Example.class */
        /**
         * Accumulator that pairs a bag of words with an occurrence counter.
         */
        class Example {
            BOW bow;
            int freq;

            /** Starts with an empty bag of words and a zero counter. */
            Example() {
                this.bow = new BOW();
                this.freq = 0;
            }

            /** Increases the occurrence counter by one. */
            void inc() {
                this.freq += 1;
            }

            /** Adds all the given tokens to the bag of words. */
            void add(String[] strArr) {
                getBow().addAll(strArr);
            }

            /** @return the bag of words accumulated so far */
            public BOW getBow() {
                return this.bow;
            }

            /** @return the number of times {@link #inc()} has been called */
            public int getFreq() {
                return this.freq;
            }
        }

        /**
         * Creates a task for one group of rows.
         *
         * @param list rows to process
         * @param str  value stored as {@code form} and written as the first output column
         */
        Training(List<String[]> list, String str) {
            this.list = list;
            this.form = str;
        }

        /**
         * Groups the rows by column 3, builds a bag of words and a vector for each group,
         * and then writes the three results to the training, bow and vector writers.
         */
        @Override // java.lang.Thread, java.lang.Runnable
        public void run() {
            String workerName = Thread.currentThread().getName();
            int rowCount = this.list.size();
            SimpleTrainingExtractor.logger.debug(workerName + " is training " + this.form + ParsedPageLink.START_SUFFIX_PATTERN + rowCount + ")...");

            // Build everything first, then write, so each writer receives one contiguous chunk.
            Map<String, List<String[]>> groups = createMap();
            Map<String, BOW> bows = createBowMap(groups);
            Map<String, Vector> vectors = createVectorMap(groups);

            writeMap(groups);
            writeBowMap(bows);
            writeVectorMap(vectors);
        }

        /**
         * Writes one line per group to the training writer:
         * {@code form TAB key TAB rowCount TAB examples}, where each example is rendered as
         * {@code (index) col7 <form>col2</form> col8} and examples are separated by a space.
         *
         * @param map rows grouped by key, as produced by {@link #createMap()}
         */
        private void writeMap(Map<String, List<String[]>> map) {
            StringBuilder sb = new StringBuilder();
            for (Map.Entry<String, List<String[]>> entry : map.entrySet()) {
                List<String[]> rows = entry.getValue();
                sb.append(this.form).append('\t');
                sb.append(entry.getKey()).append('\t');
                sb.append(rows.size()).append('\t');
                for (int k = 0; k < rows.size(); k++) {
                    String[] row = rows.get(k);
                    if (k > 0) {
                        sb.append(' ');
                    }
                    sb.append("(").append(k).append(") ");
                    sb.append(row[7]).append(' ');
                    sb.append("<form>").append(row[2]).append("</form>");
                    if (row.length > 8) {
                        sb.append(' ').append(row[8]);
                    }
                }
                sb.append('\n');
            }
            // Lock on the enclosing extractor (shared by every task) rather than on this task:
            // a per-task monitor excludes nothing, and the extractor's own write methods
            // already guard trainingWriter with the extractor monitor.
            synchronized (SimpleTrainingExtractor.this) {
                SimpleTrainingExtractor.this.trainingWriter.print(sb.toString());
            }
        }

        /**
         * Writes one line per entry to the bow writer: {@code form TAB key TAB bow.toSingleLine()}.
         *
         * @param map bag of words per key, as produced by {@link #createBowMap(Map)}
         */
        private void writeBowMap(Map<String, BOW> map) {
            StringBuilder sb = new StringBuilder();
            for (Map.Entry<String, BOW> entry : map.entrySet()) {
                sb.append(this.form).append('\t');
                sb.append(entry.getKey()).append('\t');
                sb.append(entry.getValue().toSingleLine());
                sb.append('\n');
            }
            // Lock on the enclosing extractor (shared by every task) rather than on this task:
            // a per-task monitor provides no mutual exclusion between concurrently running tasks.
            synchronized (SimpleTrainingExtractor.this) {
                SimpleTrainingExtractor.this.bowWriter.print(sb.toString());
            }
        }

        /**
         * Writes one line per entry to the vector writer: {@code form TAB key TAB vector.toString()}.
         *
         * @param map vector per key, as produced by {@link #createVectorMap(Map)}
         */
        private void writeVectorMap(Map<String, Vector> map) {
            StringBuilder sb = new StringBuilder();
            for (Map.Entry<String, Vector> entry : map.entrySet()) {
                sb.append(this.form).append('\t');
                sb.append(entry.getKey()).append('\t');
                sb.append(entry.getValue().toString());
                sb.append('\n');
            }
            // Lock on the enclosing extractor (shared by every task) rather than on this task:
            // a per-task monitor provides no mutual exclusion between concurrently running tasks.
            synchronized (SimpleTrainingExtractor.this) {
                SimpleTrainingExtractor.this.vectorWriter.print(sb.toString());
            }
        }

        /**
         * Groups the rows of this task by the value of column 3.
         *
         * @return a map from column-3 value to the rows carrying that value, in input order
         */
        Map<String, List<String[]>> createMap() {
            Map<String, List<String[]>> groups = new HashMap<String, List<String[]>>();
            for (String[] row : this.list) {
                String key = row[3];
                List<String[]> bucket = groups.get(key);
                if (bucket == null) {
                    bucket = new ArrayList<String[]>();
                    groups.put(key, bucket);
                }
                bucket.add(row);
            }
            return groups;
        }

        /**
         * Builds a single vector for a group of rows from the lower-cased, space-split
         * tokens of columns 7 and 8.
         *
         * @param list rows of one group
         * @return the normalized pseudo-document vector returned by the LSM
         */
        Vector createVector(List<String[]> list) {
            BOW bow = new BOW();
            for (String[] row : list) {
                String[] col7Tokens = SimpleTrainingExtractor.spacePattern.split(row[7].toLowerCase());
                String[] col8Tokens = SimpleTrainingExtractor.spacePattern.split(row[8].toLowerCase());
                bow.addAll(col7Tokens);
                bow.addAll(col8Tokens);
            }
            LSM model = SimpleTrainingExtractor.this.lsm;
            Vector documentVector = model.mapDocument(bow);
            Vector pseudoDocumentVector = model.mapPseudoDocument(documentVector);
            // NOTE(review): the document vector is normalized only after the pseudo-document has
            // been derived from it, whereas createVectorSet normalizes it first — confirm which
            // order is intended.
            documentVector.normalize();
            pseudoDocumentVector.normalize();
            return pseudoDocumentVector;
        }

        /**
         * Builds a bag of words for a group of rows from the lower-cased, space-split
         * tokens of columns 7, 2 and 8 (added in that order), and logs it at debug level.
         *
         * @param list rows of one group
         * @return the resulting bag of words
         */
        BOW createBow(List<String[]> list) {
            BOW bow = new BOW();
            for (String[] row : list) {
                String[] col7Tokens = SimpleTrainingExtractor.spacePattern.split(row[7].toLowerCase());
                String[] col2Tokens = SimpleTrainingExtractor.spacePattern.split(row[2].toLowerCase());
                String[] col8Tokens = SimpleTrainingExtractor.spacePattern.split(row[8].toLowerCase());
                bow.addAll(col7Tokens);
                bow.addAll(col2Tokens);
                bow.addAll(col8Tokens);
            }
            SimpleTrainingExtractor.logger.debug(bow);
            return bow;
        }

        /**
         * Computes one vector per group by calling {@link #createVector(List)} on its rows.
         *
         * @param map rows grouped by key
         * @return a map from the same keys to the vector of each group
         */
        Map<String, Vector> createVectorMap(Map<String, List<String[]>> map) {
            Map<String, Vector> vectors = new HashMap<String, Vector>();
            for (Map.Entry<String, List<String[]>> group : map.entrySet()) {
                Vector vector = createVector(group.getValue());
                vectors.put(group.getKey(), vector);
            }
            return vectors;
        }

        /**
         * Computes one bag of words per group by calling {@link #createBow(List)} on its rows.
         *
         * @param map rows grouped by key
         * @return a map from the same keys to the bag of words of each group
         */
        Map<String, BOW> createBowMap(Map<String, List<String[]>> map) {
            Map<String, BOW> bows = new HashMap<String, BOW>();
            for (Map.Entry<String, List<String[]>> group : map.entrySet()) {
                BOW bow = createBow(group.getValue());
                bows.put(group.getKey(), bow);
            }
            return bows;
        }

        /**
         * Accumulates, per column-3 value, an {@link Example} holding the number of rows
         * seen and the lower-cased, space-split tokens of columns 7 and 8.
         *
         * @return a map from column-3 value to its accumulated example
         */
        Map<String, Example> createBowMap() {
            Map<String, Example> examples = new HashMap<String, Example>();
            for (String[] row : this.list) {
                Example example = examples.get(row[3]);
                if (example == null) {
                    example = new Example();
                    examples.put(row[3], example);
                }
                example.inc();
                example.add(SimpleTrainingExtractor.spacePattern.split(row[7].toLowerCase()));
                example.add(SimpleTrainingExtractor.spacePattern.split(row[8].toLowerCase()));
            }
            return examples;
        }

        /**
         * Renders every example as one tab-separated line:
         * {@code form, freq, key, bow.toSortedLine(), pseudo-document vector, document vector}.
         * Both vectors are normalized; the document vector is normalized before the
         * pseudo-document is derived from it.
         *
         * @param map examples per key, as produced by {@link #createBowMap()}
         * @return the set of rendered lines
         */
        Set<String> createVectorSet(Map<String, Example> map) {
            Set<String> lines = new HashSet<String>();
            for (Map.Entry<String, Example> entry : map.entrySet()) {
                String key = entry.getKey();
                Example example = entry.getValue();
                BOW bow = example.getBow();
                Vector documentVector = SimpleTrainingExtractor.this.lsm.mapDocument(bow);
                documentVector.normalize();
                Vector pseudoDocumentVector = SimpleTrainingExtractor.this.lsm.mapPseudoDocument(documentVector);
                pseudoDocumentVector.normalize();
                String line = this.form + '\t' + example.getFreq() + '\t' + key + '\t' + bow.toSortedLine()
                        + '\t' + pseudoDocumentVector.toString()
                        + '\t' + documentVector.toString(SimpleTrainingExtractor.this.lsm.getDimension());
                lines.add(line);
            }
            return lines;
        }
    }

    /**
     * Creates an extractor that writes to the given output path using a single worker thread.
     *
     * @param lsm model used to map bags of words to vectors
     * @param str path of the main output file
     * @throws IOException if an output file cannot be opened
     */
    public SimpleTrainingExtractor(LSM lsm, String str) throws IOException {
        this(lsm, new File(str), DEFAULT_NUMBER_OF_THREAD);
    }

    /**
     * Creates an extractor that writes to the given output path using {@code i} worker threads.
     *
     * @param lsm model used to map bags of words to vectors
     * @param str path of the main output file
     * @param i   number of worker threads for the executor
     * @throws IOException if an output file cannot be opened
     */
    public SimpleTrainingExtractor(LSM lsm, String str, int i) throws IOException {
        // Forward the requested thread count; it was previously replaced by a hard-coded 1.
        this(lsm, new File(str), i);
    }

    public SimpleTrainingExtractor(LSM lsm, File file, int i) throws IOException {
        this.lsm = lsm;
        this.numThreads = i;
        this.notificationPoint = 10000;
        logger.info("creating the thread executor (" + i + ")");
        this.myExecutor = Executors.newFixedThreadPool(i);
        this.trainingWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "UTF-8")));
        this.bowWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file + ".bow"), "UTF-8")));
        this.vectorWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file + ".vec"), "UTF-8")));
    }

    /**
     * @return the stored thread count (the constructor argument, or the last value given to
     *         {@link #setNumThreads(int)})
     */
    public int getNumThreads() {
        return numThreads;
    }

    /**
     * Updates the stored thread count. Only the field is assigned here; the executor
     * created by the constructor is not touched by this method.
     *
     * @param i new thread count value
     */
    public void setNumThreads(int i) {
        numThreads = i;
    }

    /**
     * @return the modulus used by {@link #train(File)} to decide when to log progress
     */
    public int getNotificationPoint() {
        return notificationPoint;
    }

    /**
     * Convenience overload of {@link #train(File)} taking the input path as a string.
     *
     * @param str path of the input file
     * @throws IOException if the input file cannot be read
     */
    public void train(String str) throws IOException {
        File input = new File(str);
        train(input);
    }

    /**
     * Reads a tab-separated UTF-8 file and submits one {@link Training} task per run of
     * consecutive rows that share the same value in column 2. Lines that do not split into
     * exactly 9 columns are skipped. After the last line the executor is shut down, the
     * method waits for all tasks to finish, and the three output writers are closed, so an
     * instance can be trained only once.
     *
     * <p>Progress is logged every {@code notificationPoint} input lines.
     *
     * @param file input file
     * @throws IOException if the input file cannot be opened or read; in the latter case the
     *                     executor is still shut down and the writers are still closed
     */
    public void train(File file) throws IOException {
        logger.info("reading " + file + "...");
        long lastNotification = System.currentTimeMillis();
        int submittedForms = 0;
        int lineNumber = 0;
        String currentForm = "";
        List<String[]> batch = new ArrayList<String[]>();
        LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
        logger.info("totalFreq\tcount\ttime\tdate");
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                // The reader counts the lines it has returned, giving a real 1-based line number.
                lineNumber = reader.getLineNumber();
                try {
                    String[] fields = tabPattern.split(line);
                    if (fields.length == 9) {
                        // A change of form closes the current batch; never submit an empty one.
                        if (!batch.isEmpty() && !fields[2].equals(currentForm)) {
                            submitBatch(batch, currentForm);
                            batch = new ArrayList<String[]>();
                            submittedForms++;
                        }
                        batch.add(fields);
                        currentForm = fields[2];
                    }
                } catch (RuntimeException e) {
                    // Best effort: report the offending line and keep reading.
                    logger.error("Error at line " + lineNumber);
                    logger.error(e);
                }
                if (lineNumber % this.notificationPoint == 0) {
                    logProgress(lineNumber, submittedForms, lastNotification);
                    lastNotification = System.currentTimeMillis();
                }
            }
            // Flush the trailing batch; every valid row is already in it, so nothing is re-added.
            if (!batch.isEmpty()) {
                submitBatch(batch, currentForm);
            }
            logProgress(lineNumber, submittedForms, lastNotification);
        } finally {
            try {
                reader.close();
            } finally {
                // Always release the pool and the writers, even when reading failed, so that
                // the pool's threads do not outlive a failed run.
                shutdownAndClose();
            }
        }
        logger.info("done it");
    }

    /** Hands one batch of rows sharing the same form to the executor. */
    private void submitBatch(List<String[]> batch, String form) {
        logger.debug("executing " + form + ParsedPageLink.START_SUFFIX_PATTERN + batch.size() + ")...");
        this.myExecutor.execute(new Training(batch, form));
    }

    /** Logs lines read, batches submitted, milliseconds since the previous report, and the date. */
    private void logProgress(int lineNumber, int submittedForms, long since) {
        logger.info(df.format(lineNumber) + StringTable.HORIZONTAL_TABULATION + df.format(submittedForms) + StringTable.HORIZONTAL_TABULATION + df.format(System.currentTimeMillis() - since) + StringTable.HORIZONTAL_TABULATION + new Date());
    }

    /** Shuts the executor down, waits for the submitted tasks, then closes the three writers. */
    private void shutdownAndClose() {
        this.myExecutor.shutdown();
        logger.info("waiting for execution...");
        try {
            this.myExecutor.awaitTermination(Util.VLI_MAX, TimeUnit.NANOSECONDS);
        } catch (InterruptedException e) {
            logger.error(e);
            // Preserve the interrupt for callers further up the stack.
            Thread.currentThread().interrupt();
        }
        logger.info("closing the streams...");
        this.trainingWriter.close();
        this.vectorWriter.close();
        this.bowWriter.close();
    }

    /**
     * Writes one {@code str TAB key TAB bow.toSingleLine()} line per entry to the training
     * writer, followed by the line separator added by {@code println}.
     *
     * @param map bag of words per key
     * @param str value written as the first column of every line
     */
    private void write(Map<String, BOW> map, String str) {
        logger.debug(Thread.currentThread().getName() + " is writing " + str + "...");
        StringBuilder out = new StringBuilder();
        for (Map.Entry<String, BOW> entry : map.entrySet()) {
            out.append(str).append('\t');
            out.append(entry.getKey()).append('\t');
            out.append(entry.getValue().toSingleLine());
            out.append('\n');
        }
        String text = out.toString();
        synchronized (this) {
            this.trainingWriter.println(text);
        }
    }

    /**
     * Writes every element of the set to the training writer, each followed by a newline,
     * while holding this extractor's monitor.
     *
     * @param set lines to write
     */
    synchronized void write(Set<String> set) {
        StringBuilder out = new StringBuilder();
        for (String line : set) {
            out.append(line).append('\n');
        }
        this.trainingWriter.print(out.toString());
    }

    /**
     * Command-line entry point. Configures log4j from the {@code log-config} system property
     * (falling back to {@code configuration/log-config.txt}), parses the options, loads the
     * LSM from the files {@code X-Ut}, {@code X-S}, {@code X-row}, {@code X-col} and
     * {@code X-df} in the {@code --lsm} directory, and runs the extractor on {@code --input},
     * writing to {@code --output}.
     *
     * <p>Invalid options, including non-numeric or non-positive {@code --nt} and non-numeric
     * {@code --dim} values, print the usage help instead of terminating with a stack trace.
     *
     * @param strArr command-line arguments
     */
    public static void main(String[] strArr) {
        String property = System.getProperty("log-config");
        if (property == null) {
            property = "configuration/log-config.txt";
        }
        PropertyConfigurator.configure(property);
        Options options = new Options();
        try {
            OptionBuilder.withArgName("input");
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("input file");
            OptionBuilder.isRequired();
            OptionBuilder.withLongOpt("input");
            // NOTE(review): this constant looks like a decompiler substitution for the short
            // option name literal — confirm its value before replacing it.
            Option create = OptionBuilder.create(AbstractBottomUpParser.INCOMPLETE);
            OptionBuilder.withArgName("output");
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("output file");
            OptionBuilder.isRequired();
            OptionBuilder.withLongOpt("output");
            Option create2 = OptionBuilder.create("o");
            OptionBuilder.withArgName("nt");
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("number of threads");
            OptionBuilder.withLongOpt("nt");
            Option create3 = OptionBuilder.create("n");
            OptionBuilder.withArgName("lsm");
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("lsm dir");
            OptionBuilder.isRequired();
            OptionBuilder.withLongOpt("lsm");
            Option create4 = OptionBuilder.create("l");
            OptionBuilder.withArgName("dim");
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("lsm dim");
            OptionBuilder.withLongOpt("dim");
            Option create5 = OptionBuilder.create("d");
            options.addOption("h", "help", false, "print this message");
            options.addOption("v", "version", false, "output version information and exit");
            options.addOption(create);
            options.addOption(create2);
            options.addOption(create3);
            options.addOption(create4);
            options.addOption(create5);
            CommandLine parse = new PosixParser().parse(options, strArr);
            logger.debug(options);
            int i = parseIntOption(parse, "nt", DEFAULT_NUMBER_OF_THREAD);
            if (i < 1) {
                throw new ParseException("number of threads must be at least 1: " + i);
            }
            // "qs" is not among the options declared above, so its value is logged as null.
            logger.debug(parse.getOptionValue("nt") + StringTable.HORIZONTAL_TABULATION + parse.getOptionValue("qs"));
            logger.debug(parse.getOptionValue("output") + StringTable.HORIZONTAL_TABULATION + parse.getOptionValue("input") + StringTable.HORIZONTAL_TABULATION + parse.getOptionValue("lsm"));
            String optionValue = parse.getOptionValue("lsm");
            if (!optionValue.endsWith(File.separator)) {
                optionValue = optionValue + File.separator;
            }
            File file = new File(optionValue + "X-Ut");
            File file2 = new File(optionValue + "X-S");
            File file3 = new File(optionValue + "X-row");
            File file4 = new File(optionValue + "X-col");
            File file5 = new File(optionValue + "X-df");
            int i2 = parseIntOption(parse, "dim", DEFAULT_LSM_DIM);
            logger.debug(parse.getOptionValue("lsm") + StringTable.HORIZONTAL_TABULATION + parse.getOptionValue("dim"));
            LSM lsm = new LSM(file, file2, file3, file4, file5, i2, true);
            new SimpleTrainingExtractor(lsm, parse.getOptionValue("output"), i).train(parse.getOptionValue("input"));
        } catch (IOException e) {
            logger.error(e);
        } catch (ParseException e2) {
            System.err.println("Parsing failed: " + e2.getMessage() + "\n");
            new HelpFormatter().printHelp(200, "java -cp dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.csv.SimpleTrainingExtractor", "\n", options, "\n", true);
        }
    }

    /**
     * Returns the integer value of an option, or {@code defaultValue} when it is absent.
     * A value that is not a valid integer is reported as a {@link ParseException} so that it
     * is handled like any other command-line error.
     */
    private static int parseIntOption(CommandLine commandLine, String name, int defaultValue) throws ParseException {
        if (!commandLine.hasOption(name)) {
            return defaultValue;
        }
        String raw = commandLine.getOptionValue(name);
        try {
            return Integer.parseInt(raw);
        } catch (NumberFormatException e) {
            ParseException failure = new ParseException("invalid integer for option '" + name + "': " + raw);
            failure.initCause(e);
            throw failure;
        }
    }
}
