package org.fbk.cit.hlt.thewikimachine.util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.fbk.cit.hlt.thewikimachine.analysis.HardTokenizer;

/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/util/SectionTitleTokenizer.class */
/**
 * Converts a tab-separated file of {@code key<TAB>text} lines into a file of
 * {@code key<TAB>token} lines (presumably page / section-title pairs, judging
 * by the class name; confirm against the producer of the input file).
 *
 * <p>All the work happens in the constructor: the text column of each line is
 * tokenized with {@link HardTokenizer}, stopwords are removed, the remaining
 * tokens are trimmed and lowercased, and each distinct non-empty token is
 * written once per run of consecutive lines sharing the same key.
 */
public class SectionTitleTokenizer {
    static Logger logger = Logger.getLogger(SectionTitleTokenizer.class.getName());

    /** Splits an input line into its tab-separated columns. */
    private static final Pattern TAB_PATTERN = Pattern.compile("\\t");

    /**
     * Tokenizes the input file without any stopword filtering.
     *
     * @param str  path of the UTF-8 input file
     * @param str2 path of the UTF-8 output file (created or overwritten)
     * @throws IOException if a file cannot be opened, read or written
     */
    public SectionTitleTokenizer(String str, String str2) throws IOException {
        this(str, str2, null);
    }

    /**
     * Tokenizes the input file, dropping tokens listed in the stopword file.
     *
     * <p>Lines with fewer than two tab-separated columns are skipped. Stopwords
     * are matched against the raw tokens, i.e. case-sensitively and before the
     * tokens are trimmed and lowercased. Every stream is closed when the
     * constructor exits, whether it completes normally or throws.
     *
     * @param str  path of the UTF-8 input file; column 0 is the key, column 1 the text
     * @param str2 path of the UTF-8 output file (created or overwritten)
     * @param str3 path of a UTF-8 stopword file with one entry per line, or
     *             {@code null} to disable stopword filtering
     * @throws IOException if a file cannot be opened, read or written
     */
    public SectionTitleTokenizer(String str, String str2, String str3) throws IOException {
        HashSet<String> stopwords = loadStopwords(str3);

        // The writer is opened before the reader, as before, so the output file
        // is created/truncated even when the input file turns out to be missing.
        try (BufferedWriter writer = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(str2), "UTF-8"));
             BufferedReader reader = new BufferedReader(
                     new InputStreamReader(new FileInputStream(str), "UTF-8"))) {
            String currentKey = "";
            // Tokens already written for the current run of lines with the same key.
            HashSet<String> emitted = new HashSet<>();

            String line;
            while ((line = reader.readLine()) != null) {
                String[] columns = TAB_PATTERN.split(line);
                if (columns.length < 2) {
                    continue;
                }
                String key = columns[0];
                String text = columns[1];

                // A change of key starts a new group, so de-duplication restarts.
                if (!currentKey.equals(key)) {
                    currentKey = key;
                    emitted.clear();
                }

                HashSet<String> tokens =
                        new HashSet<>(Arrays.asList(HardTokenizer.getInstance().stringArray(text)));
                tokens.removeAll(stopwords);

                for (String rawToken : tokens) {
                    String token = rawToken.trim().toLowerCase();
                    if (token.length() > 0 && !emitted.contains(token)) {
                        writer.write(key + StringTable.HORIZONTAL_TABULATION + token + "\n");
                        emitted.add(token);
                    }
                }
            }
        }
    }

    /**
     * Reads the stopword file into a set, trimming each line and ignoring blank ones.
     *
     * @param path path of the UTF-8 stopword file, or {@code null}
     * @return the stopwords read; empty when {@code path} is {@code null}
     * @throws IOException if the file cannot be opened or read
     */
    private static HashSet<String> loadStopwords(String path) throws IOException {
        HashSet<String> stopwords = new HashSet<>();
        if (path == null) {
            return stopwords;
        }
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(path), "UTF-8"))) {
            logger.info("Loading stopwords file...");
            String line;
            while ((line = reader.readLine()) != null) {
                String stopword = line.trim();
                if (stopword.length() > 0) {
                    stopwords.add(stopword);
                }
            }
        }
        return stopwords;
    }
}
