package org.fbk.cit.hlt.core.lsa.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.fbk.cit.hlt.core.lsa.BOW;
import org.fbk.cit.hlt.core.lsa.LSM;
import org.fbk.cit.hlt.core.lsa.LSSimilarity;
import org.fbk.cit.hlt.thewikimachine.util.StringTable;
import org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExampleExtractor;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;

/* loaded from: input_file:org/fbk/cit/hlt/core/lsa/util/NgramComparator.class */
/**
 * Scores a text file against the Wikipedia pages listed in a companion
 * tab-separated file, using an {@link LSSimilarity} instance.
 *
 * <p>For an input path {@code f} the constructor reads the text of {@code f},
 * reads rows from {@code f + ".np"} and writes one line per row to
 * {@code f + ".allscore"}. In each row the first column is copied to the output
 * as-is; every further column is appended to {@code http://en.wikipedia.org/wiki/}
 * to form the URL of the page the text is compared against.
 */
public class NgramComparator {
    static Logger logger = Logger.getLogger(NgramComparator.class.getName());

    /**
     * Runs the whole comparison for one input file and writes the result file.
     *
     * @param str path of the text file; {@code str + ".np"} is read and
     *     {@code str + ".allscore"} is written
     * @param lSSimilarity similarity used to compare the text with each fetched page
     * @throws IOException if a file cannot be read or the output cannot be written
     * @throws MalformedURLException if a page URL cannot be built from a row column
     */
    public NgramComparator(String str, LSSimilarity lSSimilarity) throws IOException, MalformedURLException {
        logger.info("parsing " + str + "...");
        BOW bow = new BOW(getText(new File(str)));
        logger.info("size bow " + bow.size());
        List<String[]> rows = getRows(new File(str + ".np"));
        logger.info("size list " + rows.size());
        // try-with-resources: the output file is closed even if a comparison fails midway.
        try (PrintWriter out = new PrintWriter(new FileWriter(str + ".allscore"))) {
            for (int i = 0; i < rows.size(); i++) {
                String[] row = rows.get(i);
                logger.info("comparing line " + i + " " + row[0]);
                float[] scores = compareAll(bow, row, lSSimilarity);
                out.print(row[0]);
                // scores[j - 1] belongs to column j because column 0 is not a page title.
                for (int j = 1; j < row.length; j++) {
                    out.print(StringTable.HORIZONTAL_TABULATION + row[j]);
                    out.print("=" + scores[j - 1]);
                }
                out.print("\n");
            }
            // PrintWriter never throws on write errors; checkError() flushes and reports them.
            if (out.checkError()) {
                throw new IOException("error while writing " + str + ".allscore");
            }
        }
    }

    /**
     * Builds one Wikipedia URL per column of the row, skipping the first column.
     *
     * @param strArr a row; element 0 is ignored, the others are appended to the wiki base URL
     * @return the URLs, in column order; empty if the row has fewer than two columns
     * @throws MalformedURLException if a column does not yield a valid URL
     */
    private URL[] getConceptURL(String[] strArr) throws MalformedURLException {
        // Math.max guards against a zero-length row, which would give a negative array size.
        URL[] urls = new URL[Math.max(0, strArr.length - 1)];
        for (int i = 0; i < urls.length; i++) {
            urls[i] = new URL("http://en.wikipedia.org/wiki/" + strArr[i + 1]);
        }
        return urls;
    }

    /**
     * Compares the given bag of words with the page behind every column of the row.
     *
     * @param bow bag of words of the input text
     * @param strArr a row; columns 1..n name the pages to fetch
     * @param lSSimilarity similarity used for each comparison
     * @return one score per page, index {@code i} corresponding to column {@code i + 1}
     * @throws IOException declared for callers; page download failures are logged by
     *     {@link #toText(URL)} and result in an empty page text
     * @throws MalformedURLException if a page URL cannot be built
     */
    private float[] compareAll(BOW bow, String[] strArr, LSSimilarity lSSimilarity) throws IOException, MalformedURLException {
        URL[] conceptURL = getConceptURL(strArr);
        float[] scores = new float[conceptURL.length];
        for (int i = 0; i < conceptURL.length; i++) {
            BOW pageBow = new BOW(toText(conceptURL[i]));
            scores[i] = lSSimilarity.compare(bow, pageBow);
            logger.info(i + ", " + conceptURL[i] + ", " + scores[i]);
        }
        return scores;
    }

    /**
     * Returns the best score of the row, i.e. the largest strictly positive value
     * produced by {@link #compareAll}.
     *
     * @param bow bag of words of the input text
     * @param strArr a row; columns 1..n name the pages to fetch
     * @param lSSimilarity similarity used for each comparison
     * @return the highest score, or {@code 0.0f} when no score is greater than zero
     * @throws IOException see {@link #compareAll}
     * @throws MalformedURLException if a page URL cannot be built
     */
    private float compare(BOW bow, String[] strArr, LSSimilarity lSSimilarity) throws IOException, MalformedURLException {
        float[] scores = compareAll(bow, strArr, lSSimilarity);
        int best = maxIndex(scores);
        if (best == -1) {
            return 0.0f;
        }
        // Same URL that compareAll built for this index (column best + 1 of the row).
        logger.info("max = " + getConceptURL(strArr)[best] + ", " + scores[best]);
        return scores[best];
    }

    /**
     * Finds the index of the largest value that is strictly greater than zero.
     *
     * @param fArr values to scan
     * @return index of the first occurrence of the maximum, or {@code -1} if the
     *     array is empty or contains no value above {@code 0.0f}
     */
    private int maxIndex(float[] fArr) {
        float best = 0.0f;
        int bestIndex = -1;
        for (int i = 0; i < fArr.length; i++) {
            if (fArr[i] > best) {
                best = fArr[i];
                bestIndex = i;
            }
        }
        return bestIndex;
    }

    /**
     * Reads a file line by line and splits every trimmed line on
     * {@link StringTable#HORIZONTAL_TABULATION}.
     *
     * @param file file to read (decoded with the platform default charset, as FileReader does)
     * @return one column array per line, in file order
     * @throws IOException if the file cannot be opened or read
     */
    private List<String[]> getRows(File file) throws IOException {
        List<String[]> rows = new ArrayList<String[]>();
        try (LineNumberReader reader = new LineNumberReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                rows.add(line.trim().split(StringTable.HORIZONTAL_TABULATION));
            }
        }
        return rows;
    }

    /**
     * Reads the complete content of a file into a string.
     *
     * @param file file to read (decoded with the platform default charset, as FileReader does)
     * @return the file content, unmodified
     * @throws IOException if the file cannot be opened or read
     */
    private String getText(File file) throws IOException {
        StringBuilder text = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            char[] buffer = new char[8192];
            int count;
            while ((count = reader.read(buffer)) != -1) {
                text.append(buffer, 0, count);
            }
        }
        return text.toString();
    }

    /**
     * Downloads a page and returns the plain text of every node whose tag name is
     * {@code WikipediaExampleExtractor.Example.CONTENT_FROM_PAGE}, one node per line.
     *
     * <p>Failures are logged, not thrown: whatever text was collected before the
     * failure (possibly nothing) is returned.
     *
     * @param url page to fetch
     * @return the extracted text, each node followed by a newline; empty on failure
     */
    public String toText(URL url) {
        Parser parser = null;
        StringBuilder sb = new StringBuilder();
        try {
            parser = new Parser(url.openConnection());
            SimpleNodeIterator elements = parser.extractAllNodesThatMatch(new TagNameFilter(WikipediaExampleExtractor.Example.CONTENT_FROM_PAGE)).elements();
            while (elements.hasMoreNodes()) {
                sb.append(elements.nextNode().toPlainTextString());
                sb.append("\n");
            }
        } catch (IOException e) {
            logger.error("cannot open " + url, e);
        } catch (EncodingChangeException e2) {
            logger.error("encoding changed while parsing " + url, e2);
            // parser is still null when the exception comes from the Parser constructor.
            if (parser != null) {
                parser.reset();
            }
        } catch (ParserException e3) {
            logger.error("cannot parse " + url, e3);
        }
        logger.info(url + StringTable.HORIZONTAL_TABULATION + sb.length());
        return sb.toString();
    }

    /**
     * Command-line entry point.
     *
     * <p>Expects six arguments: {@code input threshold size dim idf file}. The files
     * {@code input-Ut}, {@code input-S}, {@code input-row}, {@code input-col} and
     * {@code input-df} are handed to {@link LSM} together with {@code dim} and
     * {@code idf}; {@code size} is handed to {@link LSSimilarity}; {@code file} is the
     * text file to process. {@code threshold} is only checked to be a number.
     * The log4j configuration is read from the {@code log-config} system property,
     * defaulting to {@code log-config.txt}.
     *
     * @param strArr command-line arguments
     * @throws Exception on any failure while loading the model or processing the file
     */
    public static void main(String[] strArr) throws Exception {
        String logConfig = System.getProperty("log-config");
        if (logConfig == null) {
            logConfig = "log-config.txt";
        }
        PropertyConfigurator.configure(logConfig);
        if (strArr.length != 6) {
            System.out.println("Usage: java -mx512M org.fbk.cit.hlt.core.lsa.util.NgramComparator input threshold size dim idf file");
            System.exit(1);
        }
        String prefix = strArr[0];
        File utFile = new File(prefix + "-Ut");
        File sFile = new File(prefix + "-S");
        File rowFile = new File(prefix + "-row");
        File colFile = new File(prefix + "-col");
        File dfFile = new File(prefix + "-df");
        // The threshold is not used further; parsing it rejects a non-numeric argument early.
        Double.parseDouble(strArr[1]);
        int size = Integer.parseInt(strArr[2]);
        int dim = Integer.parseInt(strArr[3]);
        boolean idf = Boolean.parseBoolean(strArr[4]);
        LSM lsm = new LSM(utFile, sFile, rowFile, colFile, dfFile, dim, idf);
        new NgramComparator(strArr[5], new LSSimilarity(lsm, size));
    }
}
