package org.codelibs.elasticsearch.vi.nlp.tokenizer;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.regex.Pattern;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.codelibs.elasticsearch.vi.nlp.sd.SentenceDetector;
import org.codelibs.elasticsearch.vi.nlp.sd.SentenceDetectorFactory;
import org.codelibs.elasticsearch.vi.nlp.tokenizer.nio.XMLCorpusExporter;
import org.codelibs.elasticsearch.vi.nlp.tokenizer.tokens.TaggedWord;
import org.codelibs.elasticsearch.vi.nlp.utils.FileIterator;
import org.codelibs.elasticsearch.vi.nlp.utils.TextFileFilter;
import org.codelibs.elasticsearch.vi.nlp.utils.UTF8FileUtility;

/* loaded from: input_file:org/codelibs/elasticsearch/vi/nlp/tokenizer/VietTokenizer.class */
/**
 * Convenience front end that combines a {@link Tokenizer} with an optional
 * {@link SentenceDetector} to segment text into tokens.
 *
 * <p>The tokenizer, the sentence detector and the token counter are held in
 * {@code static} fields, so they are shared by every instance of this class.
 * Each constructor call replaces the shared tokenizer; the sentence detector
 * is created only once (the first time it is requested) and then reused.
 * No synchronization is performed by this class.
 */
public final class VietTokenizer {
    private static final Logger logger = LogManager.getLogger(VietTokenizer.class);

    /** Compiled once so that {@link #segment(String)} does not recompile the regex per token. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Shared tokenizer; reassigned by every constructor call. */
    private static Tokenizer tokenizer = null;

    /** Shared sentence detector; created lazily and never replaced once set. */
    private static SentenceDetector sentenceDetector = null;

    /** Enables the extra path logging in {@link #tokenizeDirectory(String, String)}. */
    private static final boolean DEBUG = false;

    /** Running total of tokens produced by all calls on all instances. */
    private static int nTokens = 0;

    /**
     * Creates a tokenizer using the default {@link TokenizerProvider} instance and the
     * sentence detector configured for {@code IConstants.LANG_VIETNAMESE}.
     */
    public VietTokenizer() {
        tokenizer = TokenizerProvider.getInstance().getTokenizer();
        createSentenceDetector();
    }

    /**
     * Creates a tokenizer configured from a properties file.
     *
     * @param str path of the properties file; it is passed to
     *            {@link TokenizerProvider#getInstance(String)} and also loaded to
     *            configure the sentence detector
     */
    public VietTokenizer(String str) {
        tokenizer = TokenizerProvider.getInstance(str).getTokenizer();
        createSentenceDetector(str);
    }

    /**
     * Creates a tokenizer configured from the given properties.
     *
     * @param properties configuration passed to both the tokenizer provider and the
     *                   sentence detector factory
     */
    public VietTokenizer(Properties properties) {
        tokenizer = TokenizerProvider.getInstance(properties).getTokenizer();
        createSentenceDetector(properties);
    }

    /** Creates the shared sentence detector for the default language if none exists yet. */
    private static void createSentenceDetector() {
        if (sentenceDetector == null) {
            sentenceDetector = SentenceDetectorFactory.create(org.codelibs.elasticsearch.vi.nlp.sd.IConstants.LANG_VIETNAMESE);
        }
    }

    /**
     * Loads the properties file at {@code str} and creates the shared sentence detector
     * from it. If the file cannot be read a warning is logged and the detector is left
     * unchanged (possibly {@code null}).
     */
    private static void createSentenceDetector(String str) {
        Properties properties = new Properties();
        // try-with-resources: the stream was previously never closed.
        try (FileInputStream in = new FileInputStream(str)) {
            properties.load(in);
            createSentenceDetector(properties);
        } catch (FileNotFoundException e) {
            logger.warn("Sentence detector properties file not found: " + str, e);
        } catch (IOException e) {
            logger.warn("Could not read sentence detector properties file: " + str, e);
        }
    }

    /** Creates the shared sentence detector from {@code properties} if none exists yet. */
    private static void createSentenceDetector(Properties properties) {
        if (sentenceDetector == null) {
            sentenceDetector = SentenceDetectorFactory.create(properties);
        }
    }

    /**
     * Tokenizes {@code str} and renders the tokens as a single space-separated string.
     *
     * <p>When {@code TokenizerOptions.USE_UNDERSCORE} is set, each run of whitespace
     * inside a token is replaced by {@code _}; otherwise every token is wrapped in
     * square brackets. The shared token counter is increased by the number of tokens.
     *
     * @param str text to segment
     * @return the rendered tokens, trimmed; if the tokenizer throws an
     *         {@link IOException} a warning is logged and whatever was rendered so far
     *         is returned
     */
    public String segment(String str) {
        StringBuilder rendered = new StringBuilder(1000);
        try {
            tokenizer.tokenize(new StringReader(str));
            List<TaggedWord> result = tokenizer.getResult();
            for (TaggedWord word : result) {
                String text = word.toString();
                if (TokenizerOptions.USE_UNDERSCORE) {
                    rendered.append(WHITESPACE.matcher(text).replaceAll("_"));
                } else {
                    rendered.append('[').append(text).append(']');
                }
                rendered.append(' ');
            }
            nTokens += result.size();
        } catch (IOException e) {
            logger.warn("Failed to tokenize text", e);
        }
        return rendered.toString().trim();
    }

    /**
     * Segments {@code str}, optionally sentence by sentence.
     *
     * <p>When {@code TokenizerOptions.USE_SENTENCE_DETECTOR} is set and a sentence
     * detector is available, the text is split into sentences and each one is passed to
     * {@link #segment(String)}. Otherwise the whole text is segmented in one go.
     *
     * @param str text to tokenize
     * @return one segmented string per sentence (or a single element when sentence
     *         detection is not used); if sentence detection fails with an
     *         {@link IOException} a warning is logged and the sentences processed so
     *         far are returned
     */
    public String[] tokenize(String str) {
        List<String> segmented = new ArrayList<>();
        StringReader reader = new StringReader(str);
        // The detector can be null when its properties file failed to load; fall back
        // to whole-text segmentation instead of failing with a NullPointerException.
        if (TokenizerOptions.USE_SENTENCE_DETECTOR && sentenceDetector != null) {
            try {
                for (String sentence : sentenceDetector.detectSentences(reader)) {
                    segmented.add(segment(sentence));
                }
            } catch (IOException e) {
                logger.warn("Failed to detect sentences", e);
            }
        } else {
            segmented.add(segment(str));
        }
        return segmented.toArray(new String[0]);
    }

    /** Enables sentence detection globally via {@code TokenizerOptions.USE_SENTENCE_DETECTOR}. */
    public void turnOnSentenceDetection() {
        TokenizerOptions.USE_SENTENCE_DETECTOR = true;
    }

    /** Disables sentence detection globally via {@code TokenizerOptions.USE_SENTENCE_DETECTOR}. */
    public void turnOffSentenceDetection() {
        TokenizerOptions.USE_SENTENCE_DETECTOR = false;
    }

    /**
     * Delegates directly to the underlying tokenizer: tokenizes {@code str} and exports
     * the result to {@code str2}.
     *
     * @param str  input passed to {@code Tokenizer.tokenize(String)} (presumably an
     *             input file path; confirm against the Tokenizer implementation)
     * @param str2 target passed to {@code Tokenizer.exportResult(String)}
     */
    public void tokenize2(String str, String str2) {
        tokenizer.tokenize(str);
        tokenizer.exportResult(str2);
    }

    /**
     * Tokenizes the lines of the file {@code str} and writes the result to {@code str2}.
     *
     * <p>With {@code TokenizerOptions.XML_OUTPUT} set, every line is tokenized and the
     * collected results are exported through {@link XMLCorpusExporter}. Otherwise each
     * line is processed with {@link #tokenize(String)} and every resulting string is
     * written on its own line.
     *
     * @param str  path of the input file
     * @param str2 path of the output file
     */
    public void tokenize(String str, String str2) {
        UTF8FileUtility.createWriter(str2);
        try {
            String[] lines = UTF8FileUtility.getLines(str);
            if (TokenizerOptions.XML_OUTPUT) {
                List<List<TaggedWord>> corpus = new ArrayList<>();
                for (String line : lines) {
                    try {
                        tokenizer.tokenize(new StringReader(line));
                    } catch (IOException e) {
                        logger.warn("Failed to tokenize line", e);
                    }
                    // NOTE(review): the result is recorded even after a failure, i.e.
                    // whatever the tokenizer holds at that point; confirm this is intended.
                    // Copy the result: the tokenizer instance is reused for the next line.
                    List<TaggedWord> lineTokens = new ArrayList<>(tokenizer.getResult());
                    corpus.add(lineTokens);
                    nTokens += lineTokens.size();
                }
                UTF8FileUtility.write(new XMLCorpusExporter().export(corpus));
            } else {
                for (String line : lines) {
                    for (String sentence : tokenize(line)) {
                        UTF8FileUtility.write(sentence.trim());
                        UTF8FileUtility.write("\n");
                    }
                }
            }
        } finally {
            // Always release the shared writer, even if tokenization or writing fails.
            UTF8FileUtility.closeWriter();
        }
    }

    /**
     * Tokenizes every text file in a directory, writing one output file per input file
     * (same file name) into the output directory.
     *
     * <p>Files are selected with a {@link TextFileFilter} built from
     * {@code TokenizerOptions.TEXT_FILE_EXTENSION}. The token total in the final log
     * message comes from the class-wide counter, so it also includes tokens produced by
     * earlier calls.
     *
     * @param str  input directory
     * @param str2 output directory; relative paths are resolved against the current
     *             working directory, absolute paths are used as given
     */
    public void tokenizeDirectory(String str, String str2) {
        TextFileFilter textFileFilter = new TextFileFilter(TokenizerOptions.TEXT_FILE_EXTENSION);
        File inputDir = new File(str);
        // Resolve through File so that absolute paths are not glued onto the working
        // directory, matching how the input directory is resolved.
        String outputDirPath = new File(str2).getAbsolutePath();
        if (DEBUG) {
            logger.info("currentDir = " + new File(".").getAbsolutePath());
            logger.info("inputDirPath = " + inputDir.getAbsolutePath());
            logger.info("outputDirPath = " + outputDirPath);
        }
        File[] inputFiles = FileIterator.listFiles(inputDir, textFileFilter);
        if (inputFiles == null) {
            logger.warn("Could not list files in directory: " + str);
            return;
        }
        logger.info("Tokenizing all files in the directory, please wait...");
        long startMillis = System.currentTimeMillis();
        for (File inputFile : inputFiles) {
            tokenize(inputFile.getAbsolutePath(), outputDirPath + File.separator + inputFile.getName());
        }
        float elapsedSeconds = ((float) (System.currentTimeMillis() - startMillis)) / 1000.0f;
        logger.info("Tokenized " + nTokens + " words of " + inputFiles.length + " files in " + elapsedSeconds + " (s).\n");
    }

    /**
     * Returns the shared tokenizer.
     *
     * @return the tokenizer set by the most recent constructor call, or {@code null} if
     *         no instance has been created yet
     */
    public static Tokenizer getTokenizer() {
        return tokenizer;
    }

    /**
     * Returns the shared sentence detector.
     *
     * @return the sentence detector, or {@code null} if none has been created
     */
    public static SentenceDetector getSentenceDetector() {
        return sentenceDetector;
    }
}
