package net.sf.okapi.steps.termextraction;

import com.ibm.icu.text.BreakIterator;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import net.sf.okapi.common.BOMNewlineEncodingDetector;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.Util;
import net.sf.okapi.common.annotation.TermsAnnotation;
import net.sf.okapi.common.exceptions.OkapiException;
import net.sf.okapi.common.resource.AnnotatedSpan;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.Segment;
import net.sf.okapi.common.resource.TextContainer;
import net.sf.okapi.common.resource.TextUnitUtil;
import net.sf.okapi.steps.tokenization.Token;
import net.sf.okapi.steps.tokenization.Tokens;
import net.sf.okapi.steps.tokenization.TokensAnnotation;

/* loaded from: input_file:net/sf/okapi/steps/termextraction/SimpleTermExtractor.class */
/**
 * Collects candidate terms from the source content of text units and writes them, together with
 * their occurrence counts, to a tab-delimited report file.
 *
 * <p>Typical usage is {@link #initialize}, then {@link #processTextUnit} for every text unit, then
 * {@link #completeExtraction}. Terms can come from three sources, each enabled by a flag of the
 * {@code Parameters} object: word n-gram statistics, {@code TermsAnnotation} entries, and annotated
 * spans of type {@code "generic"}.
 */
public class SimpleTermExtractor {
    /** Annotation type whose annotated spans are harvested as terms. */
    private static final String TEXT_ANALYSIS_ANNOTATION_TYPE = "generic";

    /** Characters up to and including this code unit (U+0700) are always followed by a space. */
    private static final int LAST_ALWAYS_SPACED_CODE_UNIT = 0x0700;

    /** Single-character words whose code point is below this value are never kept. */
    private static final int MIN_SINGLE_CHAR_CODE_POINT = 126;

    /** Byte-order mark that some editors put at the start of UTF-8 files. */
    private static final char BYTE_ORDER_MARK = '\uFEFF';

    private Parameters params;
    private Set<String> stopWords;
    private Set<String> notStartWords;
    private Set<String> notEndWords;
    private Map<String, Integer> terms;
    private Map<String, Integer> termsFromAnnotations;
    private Locale srcLocale;
    private BreakIterator breaker;
    private String rootDir;
    private String inputRootDir;

    /**
     * Prepares the extractor for a new run: stores the settings, loads the three word lists and
     * resets all collected terms.
     *
     * @param parameters extraction options; also supplies the word-list paths and the output path
     * @param localeId source locale, used for lower-casing words and for word breaking
     * @param rootDir value substituted for the root-directory variable in the output path
     * @param inputRootDir value substituted for the input-root-directory variable in the output path
     * @throws OkapiException if one of the word lists cannot be read
     */
    public void initialize(Parameters parameters, LocaleId localeId, String rootDir, String inputRootDir) {
        this.srcLocale = localeId.toJavaLocale();
        this.params = parameters;
        this.rootDir = rootDir;
        this.inputRootDir = inputRootDir;
        // An empty path makes loadList fall back to the named class-path resource.
        this.stopWords = loadList(parameters.getStopWordsPath(), "stopWords_en.txt");
        this.notStartWords = loadList(parameters.getNotStartWordsPath(), "notStartWords_en.txt");
        this.notEndWords = loadList(parameters.getNotEndWordsPath(), "notEndWords_en.txt");
        this.terms = new LinkedHashMap<>();
        this.termsFromAnnotations = new LinkedHashMap<>();
        // Created lazily, the first time the default word breaker is needed.
        this.breaker = null;
    }

    /**
     * Gathers terms from one text unit using every source enabled in the parameters.
     * Non-translatable text units are ignored.
     *
     * @param textUnit the text unit to analyze
     */
    public void processTextUnit(ITextUnit textUnit) {
        if (!textUnit.isTranslatable()) {
            return;
        }
        if (this.params.getUseStatistics()) {
            gatherTermsFromStatistics(textUnit);
        }
        if (this.params.getUseTerminologyAnnotations()) {
            TermsAnnotation annotation =
                    (TermsAnnotation) textUnit.getSource().getAnnotation(TermsAnnotation.class);
            if (annotation != null) {
                for (int i = 0; i < annotation.size(); i++) {
                    countOccurrence(this.termsFromAnnotations, annotation.getTerm(i));
                }
            }
        }
        if (this.params.getUseTextAnalysisAnnotations()) {
            harvestTextAnalysisAnnotations(textUnit);
        }
    }

    /** Adds one occurrence of {@code term} to {@code counts}. */
    private static void countOccurrence(Map<String, Integer> counts, String term) {
        counts.merge(term, 1, Integer::sum);
    }

    /**
     * Counts every word sequence of the text unit that qualifies as a candidate term.
     *
     * <p>A candidate starts at a word that is neither a stop word nor a "not start" word, is
     * extended one word at a time up to the configured maximum length or the next stop word, and
     * is counted at each length that reaches the configured minimum and does not end on a
     * "not end" word.
     */
    private void gatherTermsFromStatistics(ITextUnit textUnit) {
        List<String> words = collectWords(textUnit);
        int maxWords = this.params.getMaxWordsPerTerm();
        int minWords = this.params.getMinWordsPerTerm();

        for (int start = 0; start < words.size(); start++) {
            StringBuilder candidate = new StringBuilder();
            int limit = Math.min(maxWords, words.size() - start);
            for (int offset = 0; offset < limit; offset++) {
                String word = words.get(start + offset);
                if (this.stopWords.contains(word)) {
                    break;
                }
                if (offset == 0 && this.notStartWords.contains(word)) {
                    break;
                }
                if (offset > 0) {
                    // The separator depends on the last character gathered so far.
                    candidate.append(getWordSeparator(candidate.charAt(candidate.length() - 1)));
                }
                candidate.append(word);
                if (offset + 1 >= minWords && !this.notEndWords.contains(word)) {
                    countOccurrence(this.terms, candidate.toString());
                }
            }
        }
    }

    /**
     * Returns the words of the text unit: the WORD, KANA and IDEOGRAM tokens of its
     * {@code TokensAnnotation} when it has one, otherwise the output of the default word breaker.
     */
    private List<String> collectWords(ITextUnit textUnit) {
        TokensAnnotation tokensAnnotation = (TokensAnnotation) textUnit.getAnnotation(TokensAnnotation.class);
        if (tokensAnnotation == null) {
            return getWordsFromDefaultBreaker(textUnit.getSource());
        }
        Tokens tokens = tokensAnnotation.getFilteredList("WORD", "KANA", "IDEOGRAM");
        List<String> words = new ArrayList<>();
        for (Token token : tokens) {
            addWord(words, token.getValue());
        }
        return words;
    }

    /** Counts the text of every "generic" annotated span found in the source segments. */
    private void harvestTextAnalysisAnnotations(ITextUnit textUnit) {
        for (Segment segment : textUnit.getSource().getSegments()) {
            if (!segment.getContent().hasAnnotation(TEXT_ANALYSIS_ANNOTATION_TYPE)) {
                continue;
            }
            for (AnnotatedSpan annotated : segment.getContent().getAnnotatedSpans(TEXT_ANALYSIS_ANNOTATION_TYPE)) {
                countOccurrence(this.termsFromAnnotations, annotated.span.toText());
            }
        }
    }

    /**
     * Returns the string to insert after a word ending with {@code c} when joining words.
     *
     * <p>The result is empty only when {@code c} is above U+0700 and has the Unicode general
     * category "other letter"; in every other case it is a single space.
     * NOTE(review): presumably this targets scripts written without inter-word spaces
     * (e.g. CJK) - confirm.
     */
    private String getWordSeparator(char c) {
        if (c <= LAST_ALWAYS_SPACED_CODE_UNIT) {
            return " ";
        }
        return Character.getType(c) == Character.OTHER_LETTER ? "" : " ";
    }

    /**
     * Finalizes the term list and writes the report.
     *
     * <p>Statistical terms below the minimum occurrence count are dropped, sub-terms are
     * optionally folded into the longer terms containing them, the annotation-based terms are
     * merged in, and the result is sorted by term or, when requested, by descending count.
     *
     * @throws OkapiException if the report cannot be written
     */
    public void completeExtraction() {
        cleanupLowCounts(this.terms);
        if (this.params.getRemoveSubTerms()) {
            this.terms = cleanupSubStrings(this.terms);
            // Subtracting the longer terms may have pushed some counts below the minimum.
            cleanupLowCounts(this.terms);
        }
        // Annotation terms are merged after the clean-up, so they are never filtered by count.
        // For a term present in both maps, putAll keeps the annotation count.
        this.terms.putAll(this.termsFromAnnotations);
        this.terms = new TreeMap<>(this.terms);
        if (this.params.getSortByOccurrence()) {
            this.terms = sortByValues(this.terms);
        }
        generateReport();
    }

    /**
     * Writes one {@code count<TAB>term} line per term, in UTF-8, to the configured output path.
     *
     * @throws OkapiException if the file cannot be created or a write fails
     */
    private void generateReport() {
        try {
            String outputPath = Util.fillInputRootDirectoryVariable(
                    Util.fillRootDirectoryVariable(this.params.getOutputPath(), this.rootDir),
                    this.inputRootDir);
            Util.createDirectories(outputPath);
            try (PrintWriter writer = new PrintWriter(outputPath, BOMNewlineEncodingDetector.UTF_8)) {
                for (Map.Entry<String, Integer> entry : this.terms.entrySet()) {
                    writer.println(String.format("%d\t%s", entry.getValue(), entry.getKey()));
                }
                // PrintWriter never throws on write failures; it only records them.
                if (writer.checkError()) {
                    throw new IOException("Write failure on " + outputPath);
                }
            }
        } catch (IOException e) {
            throw new OkapiException("Error when writing output file.", e);
        }
    }

    /**
     * Returns the map of terms to occurrence counts held by this extractor (not a copy).
     * Its content and order are final only after {@link #completeExtraction()} has run.
     *
     * @return the current term map
     */
    public Map<String, Integer> getTerms() {
        return this.terms;
    }

    /** Removes, in place, every entry whose count is below the configured minimum. */
    private void cleanupLowCounts(Map<String, Integer> map) {
        int minOccurrences = this.params.getMinOccurrences();
        map.values().removeIf(count -> count < minOccurrences);
    }

    /**
     * Discounts each term by the occurrences already accounted for by longer terms that start
     * with it, and drops the term when nothing is left.
     *
     * @param map the terms to reduce; not modified
     * @return a new map, ordered by descending key, holding the reduced counts
     */
    private Map<String, Integer> cleanupSubStrings(Map<String, Integer> map) {
        // Descending key order visits a longer term before the shorter term it starts with.
        TreeMap<String, Integer> remaining = new TreeMap<String, Integer>(Collections.<String>reverseOrder());
        remaining.putAll(map);

        Iterator<Map.Entry<String, Integer>> it = remaining.entrySet().iterator();
        while (it.hasNext()) {
            Map.Entry<String, Integer> entry = it.next();
            String term = entry.getKey();
            String prefix = term + getWordSeparator(term.charAt(term.length() - 1));

            int coveredCount = 0;
            for (Map.Entry<String, Integer> other : remaining.entrySet()) {
                // The equality test matters when the separator is empty: the term then matches
                // its own prefix.
                if (other.getKey().startsWith(prefix) && !other.equals(entry)) {
                    coveredCount += other.getValue();
                }
            }

            if (entry.getValue() == coveredCount) {
                it.remove();
            } else {
                entry.setValue(entry.getValue() - coveredCount);
            }
        }
        return remaining;
    }

    /**
     * Appends {@code word} to {@code list} if it qualifies as a word: it must be non-empty, start
     * with a letter or digit, and not be a single character with a code point below 126.
     * The word is lower-cased with the source locale unless the keep-case option is set.
     */
    private void addWord(List<String> list, String word) {
        if (word.isEmpty()) {
            return;
        }
        int firstCodePoint = word.codePointAt(0);
        boolean lowSingleChar = word.length() == 1 && firstCodePoint < MIN_SINGLE_CHAR_CODE_POINT;
        if (lowSingleChar || !Character.isLetterOrDigit(firstCodePoint)) {
            return;
        }
        list.add(this.params.getKeepCase() ? word : word.toLowerCase(this.srcLocale));
    }

    /**
     * Splits the plain text of a container into words with a locale-specific ICU word breaker.
     *
     * @return the qualifying words in text order; an empty immutable list when there is no text
     */
    private List<String> getWordsFromDefaultBreaker(TextContainer textContainer) {
        String text = textContainer.contentIsOneSegment()
                ? TextUnitUtil.getText(textContainer.getFirstContent())
                : TextUnitUtil.getText(textContainer.getUnSegmentedContentCopy());
        if (text.isEmpty()) {
            return Collections.emptyList();
        }
        if (this.breaker == null) {
            this.breaker = BreakIterator.getWordInstance(this.srcLocale);
        }
        this.breaker.setText(text);

        List<String> words = new ArrayList<>();
        int start = this.breaker.first();
        for (int end = this.breaker.next(); end != BreakIterator.DONE; end = this.breaker.next()) {
            // Spans that are not words (spaces, punctuation) are rejected by addWord.
            addWord(words, text.substring(start, end));
            start = end;
        }
        return words;
    }

    /**
     * Loads a word list, one entry per line, from a UTF-8 file or from a default resource.
     * Lines are trimmed; empty lines and lines starting with {@code #} are skipped.
     *
     * @param path file to read; when empty, {@code defaultResource} is used instead
     * @param defaultResource name of the fallback resource, resolved relative to this class
     * @return the distinct entries of the list
     * @throws OkapiException if the list cannot be found or read
     */
    private Set<String> loadList(String path, String defaultResource) {
        Set<String> words = new HashSet<>();
        try (InputStream input = openWordList(path, defaultResource);
                BufferedReader reader = new BufferedReader(new InputStreamReader(input, StandardCharsets.UTF_8))) {
            boolean firstLine = true;
            String line;
            while ((line = reader.readLine()) != null) {
                if (firstLine) {
                    firstLine = false;
                    // A leading BOM is not removed by trim() and would corrupt the first entry.
                    if (!line.isEmpty() && line.charAt(0) == BYTE_ORDER_MARK) {
                        line = line.substring(1);
                    }
                }
                String word = line.trim();
                if (!word.isEmpty() && word.charAt(0) != '#') {
                    words.add(word);
                }
            }
        } catch (IOException e) {
            throw new OkapiException("Error reading word list.", e);
        }
        return words;
    }

    /** Opens the word-list file, or the default resource when {@code path} is empty. */
    private InputStream openWordList(String path, String defaultResource) throws IOException {
        if (!Util.isEmpty(path)) {
            return new FileInputStream(path);
        }
        InputStream resource = SimpleTermExtractor.class.getResourceAsStream(defaultResource);
        if (resource == null) {
            throw new IOException("Default word list resource not found: " + defaultResource);
        }
        return resource;
    }

    /**
     * Returns a copy of {@code map} that iterates by descending value. Entries with equal values
     * keep the relative order they have in {@code map}.
     *
     * <p>A list is sorted and copied into a {@code LinkedHashMap} rather than using a
     * {@code TreeMap} ordered by value: such a comparator cannot return 0 for distinct keys with
     * equal values without losing entries, and a comparator that never returns 0 makes
     * {@code get} and {@code containsKey} fail for every key.
     */
    private <K, V extends Comparable<V>> Map<K, V> sortByValues(Map<K, V> map) {
        List<Map.Entry<K, V>> entries = new ArrayList<>(map.entrySet());
        // List.sort is stable, which preserves the incoming order among equal values.
        entries.sort((left, right) -> right.getValue().compareTo(left.getValue()));
        Map<K, V> sorted = new LinkedHashMap<>();
        for (Map.Entry<K, V> entry : entries) {
            sorted.put(entry.getKey(), entry.getValue());
        }
        return sorted;
    }
}
