package org.languagetool.dev.dumpcheck;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.NoSuchElementException;
import java.util.regex.Pattern;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;
import org.languagetool.Language;
import org.languagetool.dev.wikipedia.SwebleWikipediaTextFilter;
import org.languagetool.tokenizers.Tokenizer;

/* loaded from: input_file:org/languagetool/dev/dumpcheck/WikipediaSentenceSource.class */
/**
 * A {@link SentenceSource} that streams sentences out of a Wikipedia XML dump.
 *
 * <p>The dump is read lazily with a StAX {@link XMLEventReader}: each time the
 * internal buffer runs empty, events are consumed until the next
 * {@code <text>} element yields at least one accepted sentence or the stream
 * ends. The wiki markup of each page is converted to plain text with
 * {@link SwebleWikipediaTextFilter} and split with the language's sentence
 * tokenizer.
 *
 * <p>Instances keep mutable parsing state and perform no synchronization.
 */
public class WikipediaSentenceSource extends SentenceSource {
    /** When {@code true}, pages whose {@code <ns>} differs from {@link #ARTICLE_NAMESPACE} are skipped. */
    private static final boolean ONLY_ARTICLES = false;
    /** Namespace value compared against the {@code <ns>} element when {@link #ONLY_ARTICLES} is set. */
    private static final String ARTICLE_NAMESPACE = "0";
    /** Not referenced by the code in this class. NOTE(review): presumably a per-article sentence limit, -1 meaning unlimited - confirm. */
    private static final int MAX_ARTICLE_SIZE = -1;
    /** Lower-case prefix that marks a page body as a redirect. */
    private static final String REDIRECT_PREFIX = "#redirect";
    /** A progress line is printed each time the article counter reaches a multiple of this value. */
    private static final int PROGRESS_INTERVAL = 100;

    private final SwebleWikipediaTextFilter textFilter;
    private final XMLEventReader reader;
    private final Tokenizer sentenceTokenizer;
    /** Buffer of sentences extracted from the most recently parsed page, consumed from the front. */
    private final List<WikipediaSentence> sentences;
    private final Language language;
    /** Number of {@code <title>} elements seen so far. */
    private int articleCount;
    /** Total number of pages skipped for any reason (namespace, redirect, extraction failure). */
    private int skipCount;
    private int namespaceSkipCount;
    private int redirectSkipCount;

    /** A buffered sentence together with the title and running number of the page it came from. */
    public static class WikipediaSentence {
        final String sentence;
        final String title;
        final int articleCount;

        WikipediaSentence(String sentence, String title, int articleCount) {
            this.sentence = sentence;
            this.title = title;
            this.articleCount = articleCount;
        }
    }

    /**
     * Creates a source without a sentence filter pattern.
     *
     * @param inputStream the XML dump to read
     * @param language the language of the dump
     */
    public WikipediaSentenceSource(InputStream inputStream, Language language) {
        this(inputStream, language, null);
    }

    /**
     * Creates a source reading from the given XML dump.
     *
     * <p>Side effect: sets the JVM-wide system property
     * {@code jdk.xml.totalEntitySizeLimit} to {@code Integer.MAX_VALUE}.
     *
     * @param inputStream the XML dump to read
     * @param language the language of the dump; supplies the sentence tokenizer and the URL host prefix
     * @param pattern passed through to the {@link SentenceSource} constructor, may be {@code null}
     * @throws RuntimeException if the XML reader cannot be created
     */
    public WikipediaSentenceSource(InputStream inputStream, Language language, Pattern pattern) {
        super(language, pattern);
        this.textFilter = new SwebleWikipediaTextFilter();
        this.articleCount = 0;
        this.skipCount = 0;
        this.namespaceSkipCount = 0;
        this.redirectSkipCount = 0;
        this.textFilter.enableMapping(false);
        try {
            System.setProperty("jdk.xml.totalEntitySizeLimit", String.valueOf(Integer.MAX_VALUE));
            this.reader = XMLInputFactory.newInstance().createXMLEventReader(inputStream);
            this.sentenceTokenizer = language.getSentenceTokenizer();
            this.sentences = new ArrayList<>();
            this.language = language;
        } catch (XMLStreamException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Reports whether another sentence is available, parsing further pages if the buffer is empty.
     *
     * @throws RuntimeException wrapping any {@link XMLStreamException} raised while parsing
     */
    @Override
    public boolean hasNext() {
        try {
            fillSentences();
            return !sentences.isEmpty();
        } catch (XMLStreamException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Returns the next sentence, parsing further pages if the buffer is empty.
     *
     * @throws NoSuchElementException if the dump contains no further sentences
     * @throws RuntimeException wrapping any {@link XMLStreamException} raised while parsing
     */
    @Override
    public Sentence next() {
        try {
            fillSentences();
            if (sentences.isEmpty()) {
                throw new NoSuchElementException();
            }
            // Index 0, not a boolean: remove(int) pops the oldest buffered sentence.
            WikipediaSentence wikiSentence = sentences.remove(0);
            String url = "http://" + language.getShortCode() + ".wikipedia.org/wiki/" + wikiSentence.title;
            return new Sentence(wikiSentence.sentence, getSource(), wikiSentence.title, url, wikiSentence.articleCount);
        } catch (XMLStreamException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public String getSource() {
        return "wikipedia";
    }

    /**
     * Consumes XML events until the sentence buffer is non-empty or the stream is exhausted.
     * Tracks the most recent {@code <title>} and {@code <ns>} values so they can be
     * attached to the sentences of the following {@code <text>} element.
     */
    private void fillSentences() throws XMLStreamException {
        String title = null;
        String namespace = null;
        while (sentences.isEmpty() && reader.hasNext()) {
            XMLEvent event = reader.nextEvent();
            if (event.getEventType() != XMLStreamConstants.START_ELEMENT) {
                continue;
            }
            String elementName = event.asStartElement().getName().getLocalPart();
            switch (elementName) {
                case "title":
                    title = readCharacterData();
                    articleCount++;
                    if (articleCount % PROGRESS_INTERVAL == 0) {
                        System.out.println("Article: " + articleCount + " (skipped so far: " + skipCount + ")");
                    }
                    break;
                case "ns":
                    namespace = readCharacterData();
                    break;
                case "text":
                    handleTextElement(namespace, title, articleCount);
                    break;
                default:
                    break;
            }
        }
    }

    /**
     * Reads the run of character events that directly follows the current position and
     * returns their concatenated data. Returns an empty string when the next event is not
     * character data (for example an empty element); the first non-character event is left
     * in the stream.
     */
    private String readCharacterData() throws XMLStreamException {
        StringBuilder data = new StringBuilder();
        XMLEvent upcoming = reader.peek();
        // A reader may deliver one text node as several Characters events, so collect them all.
        while (upcoming != null && upcoming.isCharacters()) {
            data.append(reader.nextEvent().asCharacters().getData());
            upcoming = reader.peek();
        }
        return data.toString();
    }

    /**
     * Reads the body of a {@code <text>} element, converts it to plain text and appends
     * every accepted sentence to the buffer. Redirect pages are skipped. Any exception
     * raised while filtering or tokenizing is logged to stderr and the page is skipped.
     *
     * @param namespace content of the page's {@code <ns>} element, or {@code null} if none was seen
     * @param title content of the page's {@code <title>} element, or {@code null} if none was seen
     * @param articleCount running number of the page
     */
    private void handleTextElement(String namespace, String title, int articleCount) throws XMLStreamException {
        // Inactive while ONLY_ARTICLES is false; kept so the constants and counter have an effect once enabled.
        if (ONLY_ARTICLES && !ARTICLE_NAMESPACE.equals(namespace)) {
            namespaceSkipCount++;
            skipCount++;
            return;
        }
        String wikiText = readCharacterData();
        try {
            // Locale.ROOT keeps the check working under locales with special case mappings (e.g. Turkish).
            if (wikiText.trim().toLowerCase(Locale.ROOT).startsWith(REDIRECT_PREFIX)) {
                redirectSkipCount++;
                skipCount++;
                return;
            }
            String plainText = textFilter.filter(wikiText).getPlainText();
            for (String sentence : sentenceTokenizer.tokenize(plainText)) {
                if (acceptSentence(sentence)) {
                    sentences.add(new WikipediaSentence(sentence, title, articleCount));
                }
            }
        } catch (Exception e) {
            // Deliberate best effort: one unparsable page must not abort processing of the whole dump.
            skipCount++;
            System.err.println("Could not extract text, skipping document: " + e + ", full stacktrace follows:");
            e.printStackTrace();
        }
    }
}
