package org.wikibrain.core.nlp;

import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.TIntIntMap;
import gnu.trove.map.TLongIntMap;
import gnu.trove.map.TLongObjectMap;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TLongIntHashMap;
import gnu.trove.map.hash.TLongObjectHashMap;
import gnu.trove.procedure.TLongIntProcedure;
import gnu.trove.set.TLongSet;
import gnu.trove.set.hash.TLongHashSet;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.LocalPageDao;
import org.wikibrain.core.dao.sql.AbstractSqlDao;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.model.LocalPage;
import org.wikibrain.utils.MurmurHash;
import org.wikibrain.utils.ParallelForEach;
import org.wikibrain.utils.Procedure;
import org.wikibrain.utils.WpIOUtils;
import org.wikibrain.utils.WpThreadUtils;

/* loaded from: input_file:org/wikibrain/core/nlp/Dictionary.class */
public class Dictionary implements Closeable {
    /** Default upper bound on the combined number of unigram and bigram entries before pruning kicks in. */
    public static final int MAX_DICTIONARY_SIZE = 20000000;
    /** A prune check runs each time the running word (or bigram) total reaches a multiple of this value. */
    public static int PRUNE_INTERVAL = 10000;
    public static Logger LOG = LoggerFactory.getLogger(Dictionary.class);
    /**
     * Matches an inline mention of the form {@code text:/w/<lang>/<articleId>[/<title>]}.
     * Group 1 is the text before the mention marker and group 3 is the numeric article id.
     */
    public static final Pattern PATTERN_MENTION = Pattern.compile("(.*?):/w/([^/]+)/(\\d+)(/[^ ]*($| ))?");
    private final Language language;
    // When true, counted text is scanned for PATTERN_MENTION and mention counts are recorded.
    private boolean containsMentions;
    // When true, countWords also counts bigrams produced by nGramCreator.
    private boolean countBigrams;
    private final WordStorage wordStorage;
    // Running totals; totalWords is also what the "t" line of the dictionary file stores.
    private AtomicLong totalWords;
    private AtomicLong totalBigrams;
    private AtomicLong totalNgrams;
    // Count tables keyed by the 64-bit hash of the (space-joined) token string.
    // Each map is guarded by synchronizing on the map itself in the counting methods.
    private final TLongIntMap unigramCounts;
    private final TLongIntMap bigramCounts;
    private final TLongIntMap ngramCounts;
    private StringTokenizer tokenizer;
    private NGramCreator nGramCreator;
    // Only set for WordStorage.ON_DISK: temp file (and its writer) holding the first-seen words.
    private BufferedWriter wordWriter;
    private File wordFile;
    private int maxDictionarySize;
    // Current pruning floor; raised by one on every pruning round.
    private int minPruneCount;
    // Article id -> number of mentions seen; guarded by synchronizing on the map.
    private final TIntIntMap mentionCounts;
    // Hashes of the full interesting n-grams, and of every prefix of them (null until configured).
    private TLongSet interestingNGrams;
    private TLongSet interestingSubGrams;
    // Only populated for WordStorage.IN_MEMORY: hash -> original word string.
    private final TLongObjectMap<String> words;

    /* loaded from: input_file:org/wikibrain/core/nlp/Dictionary$WordStorage.class */
    /**
     * Controls whether (and where) the dictionary keeps the original word strings
     * in addition to their hashed counts.
     */
    public enum WordStorage {
        /** First-seen words are appended to a temporary file created by the constructor. */
        ON_DISK,
        /** Words are kept in an in-memory hash-to-string map. */
        IN_MEMORY,
        /** Only hashes are kept; {@code write} is unsupported in this mode. */
        NONE
    }

    /**
     * Creates a dictionary for the given language that stores only hashed counts
     * ({@link WordStorage#NONE}).
     *
     * @param language language passed to the tokenizer when counting raw text
     */
    public Dictionary(Language language) {
        this(language, WordStorage.NONE);
    }

    /**
     * Creates an empty dictionary for the given language.
     *
     * <p>Mentions are enabled and bigram counting is disabled by default. For
     * {@link WordStorage#ON_DISK} a temporary word file is set up and a writer opened on it.
     *
     * @param language    language passed to the tokenizer when counting raw text
     * @param wordStorage where the original word strings are kept
     * @throws RuntimeException wrapping the {@link IOException} if the temporary word file
     *                          cannot be created or opened
     */
    public Dictionary(Language language, WordStorage wordStorage) {
        // Identity and behaviour switches.
        this.language = language;
        this.wordStorage = wordStorage;
        this.containsMentions = true;
        this.countBigrams = false;
        this.maxDictionarySize = MAX_DICTIONARY_SIZE;
        this.minPruneCount = 1;

        // Running totals.
        this.totalWords = new AtomicLong();
        this.totalBigrams = new AtomicLong();
        this.totalNgrams = new AtomicLong();

        // Hash-keyed count tables.
        this.unigramCounts = new TLongIntHashMap();
        this.bigramCounts = new TLongIntHashMap();
        this.ngramCounts = new TLongIntHashMap();
        this.mentionCounts = new TIntIntHashMap();
        this.words = new TLongObjectHashMap<String>();

        // Text processing helpers; interesting n-grams stay unset until configured.
        this.tokenizer = new StringTokenizer();
        this.nGramCreator = new NGramCreator();
        this.interestingNGrams = null;
        this.interestingSubGrams = null;

        if (wordStorage != WordStorage.ON_DISK) {
            return;
        }
        try {
            File tmp = File.createTempFile("words", ".txt");
            this.wordFile = tmp;
            tmp.deleteOnExit();
            // The placeholder is removed again; the writer below re-creates the file.
            tmp.delete();
            this.wordWriter = WpIOUtils.openWriter(tmp);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Registers the phrases whose n-gram occurrences should be tracked.
     *
     * <p>Each phrase is tokenized; the hash of every space-joined prefix is recorded in
     * {@code interestingSubGrams}, and the hash of the complete phrase additionally in
     * {@code interestingNGrams}. Phrases that tokenize to nothing are ignored. Any previously
     * registered phrases are discarded.
     *
     * @param it iterator over the raw phrases
     */
    public void setInterestingNgrams(Iterator<String> it) {
        this.interestingSubGrams = new TLongHashSet();
        this.interestingNGrams = new TLongHashSet();
        while (it.hasNext()) {
            List<String> tokens = this.tokenizer.getWords(this.language, it.next());
            if (tokens.isEmpty()) {
                continue;
            }
            StringBuilder prefix = new StringBuilder();
            String separator = "";
            long lastHash = -1;
            for (String token : tokens) {
                prefix.append(separator).append(token);
                separator = " ";
                lastHash = hashWord(prefix.toString());
                this.interestingSubGrams.add(lastHash);
            }
            // After the loop lastHash is the hash of the full phrase.
            this.interestingNGrams.add(lastHash);
        }
    }

    /**
     * Counts every line of a UTF-8 text file as raw (untokenized) text, using up to three
     * worker threads.
     *
     * @param file file whose lines are passed to {@link #countRawText(String)}
     * @throws IOException if the file cannot be opened
     */
    public void countRawFile(File file) throws IOException {
        LineIterator lineIterator = FileUtils.lineIterator(file, "UTF-8");
        try {
            ParallelForEach.iterate(lineIterator, Math.min(3, WpThreadUtils.getMaxThreads()), AbstractSqlDao.DEFAULT_FETCH_SIZE, new Procedure<String>() {
                public void call(String str) throws Exception {
                    Dictionary.this.countRawText(str);
                }
            }, Integer.MAX_VALUE);
        } finally {
            // Release the underlying file handle even when the parallel loop fails.
            lineIterator.close();
        }
    }

    /**
     * Counts every line of a UTF-8 text file as already-normalized, space-separated tokens,
     * using up to three worker threads.
     *
     * @param file file whose lines are passed to {@link #countNormalizedText(String)}
     * @throws IOException if the file cannot be opened
     */
    public void countNormalizedFile(File file) throws IOException {
        LineIterator lineIterator = FileUtils.lineIterator(file, "UTF-8");
        try {
            ParallelForEach.iterate(lineIterator, Math.min(3, WpThreadUtils.getMaxThreads()), AbstractSqlDao.DEFAULT_FETCH_SIZE, new Procedure<String>() {
                public void call(String str) throws Exception {
                    Dictionary.this.countNormalizedText(str);
                }
            }, Integer.MAX_VALUE);
        } finally {
            // Release the underlying file handle even when the parallel loop fails.
            lineIterator.close();
        }
    }

    /**
     * Tokenizes raw text and counts its words.
     *
     * <p>When mentions are enabled, every {@link #PATTERN_MENTION} match first bumps the
     * mention count of its article id, and the mention markup is then replaced by its
     * leading text (group 1) followed by a space before tokenizing.
     *
     * @param str raw text, possibly containing mention markup
     */
    public void countRawText(String str) {
        String text = str;
        if (this.containsMentions) {
            Matcher mentions = PATTERN_MENTION.matcher(text);
            while (mentions.find()) {
                int articleId = Integer.parseInt(mentions.group(3));
                synchronized (this.mentionCounts) {
                    this.mentionCounts.adjustOrPutValue(articleId, 1, 1);
                }
            }
            // replaceAll resets the matcher before scanning, so it can be reused here.
            text = mentions.replaceAll("$1 ");
        }
        List<String> tokens = this.tokenizer.getWords(this.language, text);
        countWords(tokens);
    }

    /**
     * Counts text that is already normalized: tokens are obtained by splitting on runs of
     * spaces, without invoking the tokenizer.
     *
     * @param str space-separated tokens
     */
    public void countNormalizedText(String str) {
        String[] pieces = str.split(" +");
        List<String> tokens = Arrays.asList(pieces);
        countWords(tokens);
    }

    /**
     * Counts a token sequence: each token as a unigram, then (if enabled) every bigram
     * produced by the n-gram creator, then (if configured) the interesting n-grams.
     *
     * @param list tokens in document order
     */
    private void countWords(List<String> list) {
        for (String token : list) {
            countUnigram(token);
        }
        if (this.countBigrams) {
            for (String bigram : this.nGramCreator.getNGrams(list, 2, 2)) {
                countBigram(bigram);
            }
        }
        if (this.interestingNGrams == null) {
            return;
        }
        countNgrams(list);
    }

    /**
     * Records one occurrence of a single token.
     *
     * <p>The token is trimmed; empty tokens are ignored. If mentions are enabled and the
     * whole token is a mention, the article's mention count is bumped and only the text
     * before the mention marker is counted. Depending on the storage mode, the word string
     * is remembered in memory or appended to the word file the first time its count
     * becomes 1. A prune check runs whenever the running word total reaches a multiple of
     * {@link #PRUNE_INTERVAL}.
     *
     * @param str the token to count
     * @throws RuntimeException wrapping the {@link IOException} if the word file cannot be written
     */
    public void countUnigram(String str) {
        String word = str.trim();
        if (word.isEmpty()) {
            return;
        }
        if (this.containsMentions) {
            Matcher mention = PATTERN_MENTION.matcher(word);
            if (mention.matches()) {
                word = mention.group(1);
                int articleId = Integer.parseInt(mention.group(3));
                synchronized (this.mentionCounts) {
                    this.mentionCounts.adjustOrPutValue(articleId, 1, 1);
                }
            }
        }

        final long key = getHash(word);
        final boolean inMemory = this.wordStorage == WordStorage.IN_MEMORY;
        final boolean onDisk = this.wordStorage == WordStorage.ON_DISK;

        if (inMemory) {
            synchronized (this.words) {
                if (!this.words.containsKey(key)) {
                    this.words.put(key, word);
                }
            }
        }

        int updatedCount;
        synchronized (this.unigramCounts) {
            updatedCount = this.unigramCounts.adjustOrPutValue(key, 1, 1);
        }

        // A count of exactly 1 means the entry was just created.
        if (onDisk && updatedCount == 1) {
            try {
                this.wordWriter.write(word + "\n");
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }

        long seen = this.totalWords.incrementAndGet();
        if (seen % PRUNE_INTERVAL == 0) {
            pruneIfNecessary();
        }
    }

    /**
     * Records one occurrence of a bigram given as a single string.
     *
     * <p>The string is trimmed and ignored when empty. If mentions are enabled and the
     * whole string is a mention, only the text before the mention marker is counted. A prune
     * check runs whenever the running bigram total reaches a multiple of
     * {@link #PRUNE_INTERVAL}.
     *
     * @param str the bigram to count
     */
    public void countBigram(String str) {
        String bigram = str.trim();
        if (bigram.isEmpty()) {
            return;
        }
        if (this.containsMentions) {
            Matcher mention = PATTERN_MENTION.matcher(bigram);
            bigram = mention.matches() ? mention.group(1) : bigram;
        }
        final long key = getHash(bigram);
        synchronized (this.bigramCounts) {
            this.bigramCounts.adjustOrPutValue(key, 1, 1);
        }
        long seen = this.totalBigrams.incrementAndGet();
        if (seen % PRUNE_INTERVAL == 0) {
            pruneIfNecessary();
        }
    }

    /**
     * Counts occurrences of the registered interesting n-grams in a token sequence.
     *
     * <p>For every start position the tokens are joined with single spaces into a growing
     * candidate. A candidate whose hash is a registered n-gram is added to
     * {@code ngramCounts} and {@code totalNgrams}. Extension from a start position stops as
     * soon as the candidate is not a registered prefix: {@link #setInterestingNgrams}
     * registers every prefix of every phrase, so no longer candidate can match.
     *
     * <p>Does nothing if no interesting n-grams have been registered.
     *
     * @param list tokens in document order
     */
    public void countNgrams(List<String> list) {
        TLongSet targets = this.interestingNGrams;
        TLongSet prefixes = this.interestingSubGrams;
        if (targets == null || prefixes == null) {
            return;
        }
        for (int start = 0; start < list.size(); start++) {
            StringBuilder candidate = new StringBuilder();
            for (int end = start; end < list.size(); end++) {
                if (end > start) {
                    candidate.append(' ');
                }
                // Append the token at the END of the window (the original appended the
                // start token repeatedly, so multi-word phrases could never match).
                candidate.append(list.get(end));
                long hash = hashWord(candidate.toString());
                if (targets.contains(hash)) {
                    synchronized (this.ngramCounts) {
                        this.ngramCounts.adjustOrPutValue(hash, 1, 1);
                    }
                    this.totalNgrams.incrementAndGet();
                }
                if (!prefixes.contains(hash)) {
                    break;
                }
            }
        }
    }

    /**
     * Shrinks the unigram and bigram tables until their combined size is at most
     * {@code maxDictionarySize}.
     *
     * <p>Each round raises {@code minPruneCount} by one and drops every entry whose count is
     * below it. Returns immediately when the tables already fit.
     */
    public synchronized void pruneIfNecessary() {
        for (;;) {
            int unigramsBefore;
            int bigramsBefore;
            synchronized (this.unigramCounts) {
                unigramsBefore = this.unigramCounts.size();
            }
            synchronized (this.bigramCounts) {
                bigramsBefore = this.bigramCounts.size();
            }
            if (unigramsBefore + bigramsBefore <= this.maxDictionarySize) {
                return;
            }

            this.minPruneCount++;
            LOG.info("pruning dictionary entries with frequency less than " + this.minPruneCount);

            // minPruneCount only changes inside this synchronized method, so a snapshot
            // is equivalent to reading the field from the procedure.
            final int floor = this.minPruneCount;
            TLongIntProcedure keepFrequent = new TLongIntProcedure() {
                public boolean execute(long hash, int count) {
                    return count >= floor;
                }
            };

            int unigramsAfter;
            int bigramsAfter;
            synchronized (this.unigramCounts) {
                this.unigramCounts.retainEntries(keepFrequent);
                unigramsAfter = this.unigramCounts.size();
            }
            synchronized (this.bigramCounts) {
                this.bigramCounts.retainEntries(keepFrequent);
                bigramsAfter = this.bigramCounts.size();
            }
            LOG.info("after pruning dictionary size is " + (unigramsAfter + bigramsAfter));
        }
    }

    /**
     * Writes the dictionary to a file, including every stored word with a count of at least 1.
     *
     * @param file destination file
     * @throws IOException if writing fails
     */
    public void write(File file) throws IOException {
        write(file, 1);
    }

    /**
     * Writes the dictionary to a file.
     *
     * <p>The output has one record per line: a {@code "t <total> _"} line with the running
     * word total, a {@code "w <count> <word>"} line for every stored word whose count is at
     * least {@code i}, and an {@code "m <articleId> <count>"} line for every mention.
     *
     * <p>The dictionary's own word writer is closed first so the on-disk word list is
     * complete before it is read back; a failure to flush it is reported rather than
     * silently producing a truncated dictionary.
     *
     * @param file destination file
     * @param i    minimum unigram count for a word to be written
     * @throws UnsupportedOperationException if the dictionary uses {@link WordStorage#NONE}
     * @throws IOException                   if flushing the word file, reading it or writing the output fails
     */
    public void write(File file, int i) throws IOException {
        if (this.wordStorage == WordStorage.NONE) {
            throw new UnsupportedOperationException();
        }
        close();
        BufferedWriter out = WpIOUtils.openWriter(file);
        try {
            out.write("t " + this.totalWords.get() + " _\n");
            if (this.wordStorage == WordStorage.ON_DISK) {
                // A word that was pruned and later seen again is appended to the word file
                // a second time; remember emitted hashes so each word is written once.
                TLongSet emitted = new TLongHashSet();
                BufferedReader in = WpIOUtils.openBufferedReader(this.wordFile);
                try {
                    String line;
                    while ((line = in.readLine()) != null) {
                        String word = line.trim();
                        long hash = getHash(word);
                        int count = this.unigramCounts.get(hash);
                        if (count >= i && emitted.add(hash)) {
                            out.write("w " + count + " " + word + "\n");
                        }
                    }
                } finally {
                    IOUtils.closeQuietly(in);
                }
            } else if (this.wordStorage == WordStorage.IN_MEMORY) {
                for (String word : this.words.valueCollection()) {
                    int count = this.unigramCounts.get(getHash(word));
                    if (count >= i) {
                        out.write("w " + count + " " + word + "\n");
                    }
                }
            } else {
                throw new IllegalStateException();
            }
            for (int articleId : this.mentionCounts.keys()) {
                out.write("m " + articleId + " " + this.mentionCounts.get(articleId) + "\n");
            }
            // Close explicitly on the success path so flush errors are reported.
            out.close();
        } finally {
            // No-op after a successful close; releases the handle after a failure.
            IOUtils.closeQuietly(out);
        }
    }

    /**
     * Loads a dictionary file with no limit on the number of words and a minimum word
     * count of 1.
     *
     * @param file dictionary file to load
     * @throws IOException if reading fails or the file contains an unexpected line
     */
    public void read(File file) throws IOException {
        read(file, Integer.MAX_VALUE, 1);
    }

    /**
     * Loads a dictionary file, keeping at most {@code i} words whose count is at least
     * {@code i2}.
     *
     * <p>The file is read twice. The first pass collects the counts of all eligible words
     * to find the count threshold of the {@code i}-th most frequent word and how many words
     * at exactly that threshold still fit. The second pass loads the words, the mention
     * counts ({@code "m"} lines) and the word total ({@code "t"} line). Among words tied
     * at the threshold, the ones appearing first in the file are kept.
     *
     * @param file dictionary file to load
     * @param i    maximum number of words to load; values {@code <= 0} load no words
     * @param i2   minimum count for a word to be loaded
     * @throws UnsupportedOperationException if the dictionary uses {@link WordStorage#ON_DISK}
     * @throws IOException                   if reading fails or a line is malformed
     */
    public void read(File file, int i, int i2) throws IOException {
        if (this.wordStorage == WordStorage.ON_DISK) {
            throw new UnsupportedOperationException("Cannot read into dictionaries using disk storage");
        }

        // Pass 1: gather the counts of all eligible words.
        TIntArrayList counts = new TIntArrayList();
        BufferedReader reader = WpIOUtils.openBufferedReader(file);
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] tokens = line.trim().split(" ", 3);
                // Malformed lines are skipped here and reported by the second pass.
                if (tokens.length > 1 && tokens[0].equals("w")) {
                    int count = Integer.parseInt(tokens[1]);
                    if (count >= i2) {
                        counts.add(count);
                    }
                }
            }
        } finally {
            IOUtils.closeQuietly(reader);
        }
        counts.sort();
        counts.reverse();

        int threshold = 0;
        int remainingAtThreshold = Integer.MAX_VALUE;
        if (i <= 0) {
            // Nothing may be loaded: no count exceeds the threshold and no tie is allowed.
            threshold = Integer.MAX_VALUE;
            remainingAtThreshold = 0;
        } else if (counts.size() > i) {
            threshold = counts.get(i - 1);
            // Count how many of the top-i slots are occupied by words at the threshold.
            // Start from zero: the original started from Integer.MAX_VALUE and overflowed,
            // which let every tied word through and exceeded the limit.
            remainingAtThreshold = 0;
            for (int k = i - 1; k >= 0 && counts.get(k) == threshold; k--) {
                remainingAtThreshold++;
            }
        }

        // Pass 2: load words, mentions and the total.
        reader = WpIOUtils.openBufferedReader(file);
        try {
            this.totalWords.set(0L);
            String line;
            while ((line = reader.readLine()) != null) {
                String[] tokens = line.trim().split(" ", 3);
                String tag = tokens[0];
                boolean isWord = tag.equals("w");
                boolean isMention = tag.equals("m");
                boolean isTotal = tag.equals("t");
                int requiredFields = isTotal ? 2 : 3;
                if (!(isWord || isMention || isTotal) || tokens.length < requiredFields) {
                    throw new IOException("unexpected line: " + line);
                }
                if (isWord) {
                    int count = Integer.parseInt(tokens[1]);
                    if (count < threshold || count < i2) {
                        continue;
                    }
                    if (count == threshold) {
                        if (remainingAtThreshold == 0) {
                            continue;
                        }
                        remainingAtThreshold--;
                    }
                    String word = tokens[2].trim();
                    long hash = getHash(word);
                    this.unigramCounts.put(hash, count);
                    if (this.wordStorage == WordStorage.IN_MEMORY) {
                        this.words.put(hash, word);
                    }
                } else if (isMention) {
                    this.mentionCounts.put(Integer.parseInt(tokens[1]), Integer.parseInt(tokens[2]));
                } else {
                    this.totalWords.set(Long.parseLong(tokens[1]));
                }
            }
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }

    /**
     * Looks up the count stored for a word.
     *
     * @param str the word
     * @return the value the unigram table holds for the word's hash
     */
    public int getUnigramCount(String str) {
        final long key = getHash(str);
        return this.unigramCounts.get(key);
    }

    /**
     * Looks up the count stored for the bigram formed by two words joined with a single space.
     *
     * @param str  first word
     * @param str2 second word
     * @return the value the bigram table holds for the joined string's hash
     */
    public int getBigramCount(String str, String str2) {
        String joined = str + " " + str2;
        final long key = getHash(joined);
        return this.bigramCounts.get(key);
    }

    /**
     * Looks up the count stored for a bigram given as a single string.
     *
     * @param str the bigram string
     * @return the value the bigram table holds for the string's hash
     */
    public int getBigramCount(String str) {
        final long key = getHash(str);
        return this.bigramCounts.get(key);
    }

    /**
     * Looks up the mention count stored for an article id.
     *
     * @param i the article id
     * @return the value the mention table holds for that id
     */
    public int getMentionCount(int i) {
        final int mentions = this.mentionCounts.get(i);
        return mentions;
    }

    /**
     * Looks up the mention count for a mention URL of the form
     * {@code /w/langCode/articleId/ArticleTitle}.
     *
     * @param str the mention URL
     * @return the value the mention table holds for the URL's article id
     * @throws IllegalArgumentException if the URL does not start with {@code /w/}, does not
     *                                  have the expected number of segments, or its
     *                                  article id is not a number
     */
    public int getMentionCount(String str) {
        final String formatError = "format for mentionUrl must be /w/langCode/articleId/ArticleTitle";
        // Splitting "/w/lang/id/title" on "/" yields: "", "w", lang, id, title.
        String[] segments = str.startsWith("/w/") ? str.split("/", 5) : null;
        if (segments == null || segments.length != 5) {
            throw new IllegalArgumentException(formatError);
        }
        int articleId = Integer.parseInt(segments[3]);
        return this.mentionCounts.get(articleId);
    }

    /**
     * Computes the key under which a string is stored in the count tables.
     *
     * @param str the string to hash
     * @return the result of {@link #hashWord(String)} for the string
     */
    public final long getHash(String str) {
        final long key = Dictionary.hashWord(str);
        return key;
    }

    /**
     * @return the current value of the running word total
     */
    public long getTotalCount() {
        final long total = this.totalWords.get();
        return total;
    }

    /**
     * Enables or disables mention handling in the counting methods.
     *
     * @param z {@code true} to scan counted text for mention markup
     */
    public void setContainsMentions(boolean z) {
        containsMentions = z;
    }

    /**
     * Enables or disables bigram counting when word sequences are counted.
     *
     * @param z {@code true} to count bigrams
     */
    public void setCountBigrams(boolean z) {
        countBigrams = z;
    }

    /**
     * Replaces the tokenizer used to split raw text and interesting phrases into words.
     *
     * @param stringTokenizer the tokenizer to use from now on
     */
    public void setTokenizer(StringTokenizer stringTokenizer) {
        tokenizer = stringTokenizer;
    }

    /**
     * Replaces the n-gram creator used to build bigrams from token sequences.
     *
     * @param nGramCreator the creator to use from now on
     */
    public void setCreator(NGramCreator nGramCreator) {
        NGramCreator replacement = nGramCreator;
        this.nGramCreator = replacement;
    }

    /**
     * @return the number of entries currently in the unigram table
     */
    public int getNumUnigrams() {
        final int entries = this.unigramCounts.size();
        return entries;
    }

    /**
     * @return the number of entries currently in the bigram table
     */
    public int getNumBigrams() {
        final int entries = this.bigramCounts.size();
        return entries;
    }

    /**
     * @return the number of distinct article ids in the mention table
     */
    public int getNumMentionedArticles() {
        final int articles = this.mentionCounts.size();
        return articles;
    }

    /**
     * Returns up to {@code i} of the most frequent words, ordered by descending count and
     * then alphabetically.
     *
     * @param i maximum number of words to return; values {@code <= 0} yield an empty list
     * @return a new list holding the most frequent words
     * @throws UnsupportedOperationException if words are not stored in memory
     */
    public List<String> getFrequentUnigrams(int i) {
        if (this.wordStorage != WordStorage.IN_MEMORY) {
            throw new UnsupportedOperationException("WordStorage must be in memory to return strings");
        }
        final List<String> candidates = new ArrayList<String>();
        if (i <= 0) {
            // The cutoff lookup below would index past the end of the sorted counts.
            return candidates;
        }

        // Count of the i-th most frequent entry; everything below it can be skipped.
        int cutoff = 0;
        if (i < this.unigramCounts.size()) {
            int[] values = this.unigramCounts.values();
            Arrays.sort(values);
            cutoff = values[values.length - i];
        }
        final int threshold = cutoff;
        this.unigramCounts.forEachEntry(new TLongIntProcedure() {
            public boolean execute(long hash, int count) {
                if (count >= threshold) {
                    String word = Dictionary.this.words.get(hash);
                    // Defensive: a hash without a stored string would break the sort below.
                    if (word != null) {
                        candidates.add(word);
                    }
                }
                return true;
            }
        });
        Collections.sort(candidates, new Comparator<String>() {
            @Override
            public int compare(String str, String str2) {
                int left = Dictionary.this.getUnigramCount(str);
                int right = Dictionary.this.getUnigramCount(str2);
                if (left != right) {
                    // Higher counts first; compared explicitly to avoid subtraction overflow.
                    return left > right ? -1 : 1;
                }
                return str.compareTo(str2);
            }
        });
        if (candidates.size() > i) {
            // Copy so the result does not keep the full candidate list alive.
            return new ArrayList<String>(candidates.subList(0, i));
        }
        return candidates;
    }

    /**
     * Returns the most frequent words together with the mention URLs of frequently
     * mentioned articles, ordered by descending count and then alphabetically.
     *
     * <p>Parameter names are decompiler-generated; the roles below are inferred from how
     * each value is used in this method.
     *
     * @param localPageDao used to resolve mentioned article ids to pages; ids that resolve
     *                     to {@code null} are skipped
     * @param i            maximum number of words to include; values {@code <= 0} include none
     * @param i2           minimum count for a word to be included
     * @param i3           minimum mention count for an article to be included
     * @return a new list of words and mention URLs
     * @throws UnsupportedOperationException if words are not stored in memory
     * @throws DaoException                  if a page lookup fails
     */
    public List<String> getFrequentUnigramsAndMentions(LocalPageDao localPageDao, int i, int i2, int i3) throws DaoException {
        if (this.wordStorage != WordStorage.IN_MEMORY) {
            throw new UnsupportedOperationException("WordStorage must be in memory to return strings");
        }

        final List<String> topWords = new ArrayList<String>();
        if (i > 0) {
            // Apply the minimum word count even when the dictionary holds no more than i
            // unigrams (the original only applied it when a cutoff had to be computed).
            int cutoff = i2;
            if (i < this.unigramCounts.size()) {
                int[] values = this.unigramCounts.values();
                Arrays.sort(values);
                cutoff = Math.max(i2, values[values.length - i]);
            }
            final int threshold = cutoff;
            this.unigramCounts.forEachEntry(new TLongIntProcedure() {
                public boolean execute(long hash, int count) {
                    if (count >= threshold) {
                        String word = Dictionary.this.words.get(hash);
                        // Defensive: a hash without a stored string would break the sort below.
                        if (word != null) {
                            topWords.add(word);
                        }
                    }
                    return true;
                }
            });
            Collections.sort(topWords, new Comparator<String>() {
                @Override
                public int compare(String str, String str2) {
                    int left = Dictionary.this.getUnigramCount(str);
                    int right = Dictionary.this.getUnigramCount(str2);
                    if (left != right) {
                        return left > right ? -1 : 1;
                    }
                    return str.compareTo(str2);
                }
            });
        }

        // Copy the truncated prefix so appended mentions do not write through a subList view.
        List<String> result = topWords;
        if (i > 0 && topWords.size() > i) {
            result = new ArrayList<String>(topWords.subList(0, i));
        }

        for (int articleId : this.mentionCounts.keys()) {
            if (this.mentionCounts.get(articleId) < i3) {
                continue;
            }
            LocalPage page = localPageDao.getById(this.language, articleId);
            if (page != null) {
                result.add(makeMentionUrl(page));
            }
        }

        Collections.sort(result, new Comparator<String>() {
            @Override
            public int compare(String str, String str2) {
                int left = str.startsWith("/w/") ? Dictionary.this.getMentionCount(str) : Dictionary.this.getUnigramCount(str);
                int right = str2.startsWith("/w/") ? Dictionary.this.getMentionCount(str2) : Dictionary.this.getUnigramCount(str2);
                if (left != right) {
                    return left > right ? -1 : 1;
                }
                return str.compareTo(str2);
            }
        });
        return result;
    }

    /**
     * Builds the mention URL {@code /w/<langCode>/<localId>/<title>} for a page, with
     * spaces in the canonical title replaced by underscores.
     *
     * @param localPage the page to describe
     * @return the mention URL
     */
    private String makeMentionUrl(LocalPage localPage) {
        String title = localPage.getTitle().getCanonicalTitle().replace(' ', '_');
        StringBuilder url = new StringBuilder("/w/");
        url.append(this.language.getLangCode());
        url.append("/").append(localPage.getLocalId());
        url.append("/").append(title);
        return url.toString();
    }

    /**
     * Closes the on-disk word writer, if one was opened.
     *
     * @throws IOException if closing the writer fails
     */
    @Override
    public void close() throws IOException {
        BufferedWriter writer = this.wordWriter;
        if (writer == null) {
            return;
        }
        writer.close();
    }

    /**
     * Hashes a string to the 64-bit key used by the count tables.
     *
     * @param str the string to hash
     * @return the 64-bit Murmur hash of the string, with 0 remapped to 1 so the result is
     *         never zero
     */
    public static long hashWord(String str) {
        final long raw = MurmurHash.hash64(str);
        return raw == 0 ? 1 : raw;
    }

    /**
     * Sets the combined unigram-plus-bigram entry limit above which pruning removes
     * low-frequency entries.
     *
     * @param i the new limit
     */
    public void setMaxDictionarySize(int i) {
        maxDictionarySize = i;
    }
}
