package org.languagetool.dev.wikipedia;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.PrintStream;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.languagetool.Language;
import org.languagetool.dev.index.Indexer;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/* loaded from: input_file:org/languagetool/dev/wikipedia/WikipediaIndexHandler.class */
/**
 * SAX handler that collects the {@code <title>} and {@code <text>} elements of a
 * Wikipedia XML dump and hands the filtered plain text of each article to an
 * {@link Indexer}.
 *
 * <p>Articles are numbered from 1 in the order their {@code </text>} end tag is
 * seen. Articles numbered below {@code start} are skipped; once the article number
 * reaches {@code end} (and {@code end} is not 0) parsing is aborted by throwing an
 * unchecked exception out of {@link #endElement}.
 */
public class WikipediaIndexHandler extends DefaultHandler implements AutoCloseable {
    /** Name of the meta-document field that stores the article counter as a string. */
    public static final String MAX_DOC_COUNT_VALUE = "maxDocCountValue";
    /** Name of the field that is set on the meta document (always with value {@link #MAX_DOC_COUNT_FIELD_VAL}). */
    public static final String MAX_DOC_COUNT_FIELD = "maxDocCount";
    /** Constant value stored in {@link #MAX_DOC_COUNT_FIELD}. */
    public static final String MAX_DOC_COUNT_FIELD_VAL = "1";

    private final Indexer indexer;
    private final int start;
    private final int end;
    private final TextMapFilter textFilter;

    /** Number of {@code </text>} end tags seen so far, i.e. the 1-based number of the current article. */
    private int articleCount = 0;
    private boolean inText = false;
    private boolean inTitle = false;
    private StringBuilder text = new StringBuilder();
    private StringBuilder title = new StringBuilder();

    /**
     * Thrown from {@link #endElement} to abort SAX parsing once the configured
     * article limit has been reached. Declared {@code static} because it does not
     * need access to the enclosing handler instance.
     */
    private static class DocumentLimitReachedException extends RuntimeException {
        final int limit;

        DocumentLimitReachedException(int limit) {
            super("Document limit (" + limit + ") reached");
            this.limit = limit;
        }
    }

    /**
     * Creates a handler that indexes into the given directory.
     *
     * @param directory directory passed on to the {@link Indexer}
     * @param language language used to create the {@link Indexer} and to select the text filter
     * @param start number of the first article to index; lower-numbered articles are skipped
     * @param end article number at which parsing stops (that article is not indexed),
     *            or 0 for no limit
     * @throws IllegalArgumentException if {@code end} is not 0 and {@code start > end}
     */
    public WikipediaIndexHandler(Directory directory, Language language, int start, int end) {
        // Validate and resolve the filter before creating the Indexer, so that a
        // failure here does not leave an opened Indexer behind.
        if (start > end && end != 0) {
            throw new IllegalArgumentException("\"start\" should be smaller than \"end\": " + start + ", " + end);
        }
        this.start = start;
        this.end = end;
        this.textFilter = TextFilterTools.getTextFilter(language);
        this.indexer = new Indexer(directory, language);
    }

    /**
     * Starts collecting character data when a {@code title} or {@code text} element opens.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
        if (qName.equals("title")) {
            this.inTitle = true;
        } else if (qName.equals("text")) {
            this.inText = true;
        }
    }

    /**
     * Stops collecting on {@code </title>}; on {@code </text>} processes the article
     * collected so far. The text buffer is reset after every end tag.
     *
     * @throws RuntimeException (unchecked, private subtype) when the article limit is reached
     */
    @Override
    public void endElement(String uri, String localName, String qName) {
        if (qName.equals("title")) {
            this.inTitle = false;
        } else if (qName.equals("text")) {
            try {
                processArticle();
            } finally {
                // Reset on every path, including the "skipped because before start"
                // one; otherwise the skipped article's text would leak into the next.
                this.title = new StringBuilder();
                this.text = new StringBuilder();
                this.inText = false;
            }
            return;
        }
        this.text = new StringBuilder();
        this.inText = false;
    }

    /**
     * Counts the current article, prints its number and title, and indexes its
     * filtered text unless it lies outside the configured range, contains
     * {@code #REDIRECT}, or is empty after trimming.
     */
    private void processArticle() {
        this.articleCount++;
        System.out.println(this.articleCount + ": " + this.title);
        if (this.articleCount < this.start) {
            return;
        }
        if (this.end != 0 && this.articleCount >= this.end) {
            throw new DocumentLimitReachedException(this.end);
        }
        try {
            String plainText = this.textFilter.filter(this.text.toString()).getPlainText();
            if (!plainText.contains("#REDIRECT") && !plainText.trim().isEmpty()) {
                this.indexer.index(plainText, false, this.articleCount);
            }
        } catch (Exception e) {
            // Deliberately best-effort: one broken article must not abort the whole run.
            System.err.println("Exception when filtering/indexing '" + this.title + "' (" + this.articleCount + ") - skipping file. Stacktrace follows:");
            e.printStackTrace();
        }
    }

    /**
     * Appends character data to the text buffer while inside {@code <text>},
     * otherwise to the title buffer while inside {@code <title>}.
     */
    @Override
    public void characters(char[] ch, int offset, int length) {
        if (this.inText) {
            this.text.append(ch, offset, length);
        } else if (this.inTitle) {
            this.title.append(ch, offset, length);
        }
    }

    /** Closes the underlying {@link Indexer}. */
    @Override
    public void close() throws Exception {
        this.indexer.close();
    }

    /**
     * Adds a meta document to the index that records the current article counter
     * in {@link #MAX_DOC_COUNT_VALUE}.
     */
    private void writeMetaDocuments() throws IOException {
        Document metaDocument = new Document();
        metaDocument.add(new StringField(MAX_DOC_COUNT_FIELD, MAX_DOC_COUNT_FIELD_VAL, Field.Store.YES));
        metaDocument.add(new StringField(MAX_DOC_COUNT_VALUE, String.valueOf(this.articleCount), Field.Store.YES));
        this.indexer.add(metaDocument);
    }

    /**
     * Command-line entry point: parses a Wikipedia XML dump and writes an index.
     *
     * @param args {@code <wikipediaDump> <indexDir> <languageCode> <maxDocs>}
     */
    public static void main(String... args) throws Exception {
        if (args.length != 4) {
            System.out.println("Usage: " + WikipediaIndexHandler.class.getSimpleName() + " <wikipediaDump> <indexDir> <languageCode> <maxDocs>");
            System.out.println("\t<wikipediaDump> a Wikipedia XML dump");
            System.out.println("\t<indexDir> directory where Lucene index will be written to, existing index content will be removed");
            System.out.println("\t<languageCode> short code like en for English, de for German etc");
            System.out.println("\t<maxDocs> maximum number of documents to be indexed, use 0 for no limit");
            System.exit(1);
        }
        File dumpFile = new File(args[0]);
        File indexDir = new File(args[1]);
        String languageCode = args[2];
        int maxDocs = Integer.parseInt(args[3]);
        Language language = Language.getLanguageForShortName(languageCode);
        if (maxDocs == 0) {
            System.out.println("Going to index all documents from " + dumpFile);
        } else {
            // NOTE(review): the limit is exclusive in endElement, so at most
            // maxDocs - 1 articles are actually indexed - confirm this is intended.
            System.out.println("Going to index up to " + maxDocs + " documents from " + dumpFile);
        }
        System.out.println("Output index dir: " + indexDir);
        long startTime = System.currentTimeMillis();
        SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
        try (FSDirectory indexDirectory = FSDirectory.open(indexDir)) {
            WikipediaIndexHandler handler = new WikipediaIndexHandler(indexDirectory, language, 1, maxDocs);
            try (FileInputStream dumpStream = new FileInputStream(dumpFile)) {
                saxParser.parse(dumpStream, handler);
            } catch (DocumentLimitReachedException e) {
                System.out.println("Document limit (" + e.limit + ") reached, stopping indexing");
            } finally {
                // Always record the meta document and release the indexer exactly once,
                // and close the handler even if writing the meta document fails.
                try {
                    handler.writeMetaDocuments();
                } finally {
                    handler.close();
                }
            }
            float minutes = (float) (System.currentTimeMillis() - startTime) / 60000.0f;
            System.out.printf("Indexing took %.2f minutes\n", minutes);
        }
    }
}
