package org.codelibs.elasticsearch.vi.nlp.tokenizer.tools;

import java.io.File;
import java.util.HashSet;
import java.util.Set;
import java.util.TreeSet;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.codelibs.elasticsearch.vi.nlp.utils.CaseConverter;
import org.codelibs.elasticsearch.vi.nlp.utils.FileIterator;
import org.codelibs.elasticsearch.vi.nlp.utils.TextFileFilter;
import org.codelibs.elasticsearch.vi.nlp.utils.UTF8FileUtility;

/* loaded from: input_file:org/codelibs/elasticsearch/vi/nlp/tokenizer/tools/WordExtractor.class */
/**
 * Command-line helper that collects the distinct word forms found in a directory of
 * tagged text files and writes them out as a sorted word list.
 *
 * <p>Each input line is split on whitespace; for every token containing a {@code '/'}
 * after its first character, the text before the first {@code '/'} is taken as the word
 * and underscores in it are turned into spaces.
 */
public class WordExtractor {
    private static final Logger logger = LogManager.getLogger(WordExtractor.class);

    /** Extension of the files picked up by {@link #extract(String)}. Left mutable for existing callers. */
    static String POS_FILE_EXTENSION = ".pos";

    /**
     * When {@code true}, words containing '.', ',' or '-' and words judged to be names by
     * {@link #isName(String)} are dropped, and the remaining words are lower-cased.
     * When {@code false}, every word is kept unchanged.
     */
    static boolean PRUNE_NAME = true;

    /**
     * Extracts the set of words from one line of tagged text.
     *
     * @param str a line made of whitespace-separated tokens; only tokens that contain a
     *            {@code '/'} at index 1 or later contribute a word
     * @return the distinct words found on the line (possibly empty, never {@code null})
     */
    public static Set<String> getWords(String str) {
        Set<String> words = new HashSet<>();
        for (String token : str.split("\\s+")) {
            int slash = token.indexOf('/');
            if (slash <= 0) {
                // No separator, or nothing in front of it: the token carries no word.
                continue;
            }
            String word = token.substring(0, slash).replace('_', ' ').trim();
            if (!PRUNE_NAME) {
                words.add(word);
            } else if (!containsStopwords(word) && !isName(word)) {
                words.add(CaseConverter.toLower(word));
            }
        }
        return words;
    }

    /**
     * Reports whether the word contains one of the characters '.', ',' or '-'.
     * Despite the method name, only these three punctuation characters are checked.
     */
    private static boolean containsStopwords(String str) {
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (c == '.' || c == ',' || c == '-') {
                return true;
            }
        }
        return false;
    }

    /**
     * Heuristic name detector based on the first character of each whitespace-separated part.
     *
     * <p>A single-part word starting with an upper-case character is a name. Otherwise the
     * word is a name only if none of its non-empty parts starts with a lower-case character;
     * note that this also classifies the empty string as a name.
     */
    private static boolean isName(String str) {
        String[] parts = str.split("\\s+");
        if (parts.length == 1 && !parts[0].isEmpty()) {
            char first = parts[0].charAt(0);
            if ((first >= 'A' && first <= 'Z') || CaseConverter.isValidUpper(first)) {
                return true;
            }
        }
        for (String part : parts) {
            if (part.isEmpty()) {
                continue;
            }
            char first = part.charAt(0);
            if ((first >= 'a' && first <= 'z') || CaseConverter.isValidLower(first)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Collects the words of every file with the {@link #POS_FILE_EXTENSION} extension that
     * {@code FileIterator.listFiles} returns for the given directory.
     *
     * @param str path of the directory to scan
     * @return the distinct words of all matching files, in natural {@code String} order
     */
    public static Set<String> extract(String str) {
        Set<String> words = new TreeSet<>();
        File[] files = FileIterator.listFiles(new File(str), new TextFileFilter(POS_FILE_EXTENSION));
        // Routine progress information, so it is logged at INFO rather than ERROR.
        logger.info("# of files = {}", files.length);
        for (File file : files) {
            for (String line : UTF8FileUtility.getLines(file.getAbsolutePath())) {
                words.addAll(getWords(line));
            }
        }
        return words;
    }

    /**
     * Extracts the words from {@code data/VTB-20090712} and writes them to
     * {@code data/dictionaries/extractedWords.aut.txt}.
     *
     * @param strArr ignored
     */
    public static void main(String[] strArr) {
        Set<String> words = extract("data/VTB-20090712");
        UTF8FileUtility.createWriter("data/dictionaries/extractedWords.aut.txt");
        try {
            UTF8FileUtility.write(words.toArray(new String[0]));
        } finally {
            // Release the writer even when writing fails.
            UTF8FileUtility.closeWriter();
        }
        logger.info("Done.");
    }
}
