package org.languagetool.dev.wikipedia;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.SAXParserFactory;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.Languages;
import org.languagetool.MultiThreadedJLanguageTool;
import org.languagetool.rules.Rule;
import org.languagetool.rules.RuleMatch;
import org.languagetool.rules.patterns.AbstractPatternRule;
import org.languagetool.tools.StringTools;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/* loaded from: input_file:org/languagetool/dev/wikipedia/WikipediaQuickCheck.class */
public class WikipediaQuickCheck {
    // Matches article URLs of the form "http(s)://xx.wikipedia.org/wiki/Title":
    // group 1 is the two-character subdomain, group 2 is everything after "/wiki/".
    private static final Pattern WIKIPEDIA_URL_REGEX = Pattern.compile("https?://(..)\\.wikipedia\\.org/wiki/(.*)");
    // Matches the "https://secure.wikimedia.org/wikipedia/xx/wiki/Title" form; same two groups as above.
    private static final Pattern SECURE_WIKIPEDIA_URL_REGEX = Pattern.compile("https://secure\\.wikimedia\\.org/wikipedia/(..)/wiki/(.*)");
    // Directory handed to activateLanguageModelRules(); when null, that call is skipped.
    private final File ngramDir;
    // Limit that the length of the downloaded API response is compared against in checkPage(URL, ErrorMarker).
    private final int maxSizeBytes;
    // Ids of the rules that getLanguageTool() disables on every tool instance it creates.
    private List<String> disabledRuleIds;

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:org/languagetool/dev/wikipedia/WikipediaQuickCheck$RevisionContentHandler.class */
    /**
     * SAX handler that gathers the character data found inside {@code rev}
     * elements and remembers the {@code timestamp} attribute of the most
     * recently opened {@code rev} element.
     */
    public class RevisionContentHandler extends DefaultHandler {
        /** Value of the "timestamp" attribute of the last "rev" start tag seen, or null if none was seen. */
        private String timestamp;
        /** Accumulated character data of all "rev" elements encountered so far. */
        private final StringBuilder revisionText = new StringBuilder();
        /** True between the start and the end tag of a "rev" element. */
        private boolean inRevision;

        RevisionContentHandler() {
        }

        /**
         * Starts collecting text when a {@code rev} element opens and records its timestamp.
         * Elements are recognised by their qualified name only.
         */
        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
        public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
            if (!"rev".equals(qName)) {
                return;
            }
            this.timestamp = attributes.getValue("timestamp");
            this.inRevision = true;
        }

        /** Stops collecting text when a {@code rev} element closes. */
        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
        public void endElement(String uri, String localName, String qName) throws SAXException {
            if (!"rev".equals(qName)) {
                return;
            }
            this.inRevision = false;
        }

        /** Appends the reported characters to the revision text while inside a {@code rev} element. */
        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
        public void characters(char[] buffer, int offset, int count) {
            final String chunk = new String(buffer, offset, count);
            if (!this.inRevision) {
                return;
            }
            this.revisionText.append(chunk);
        }

        /**
         * @return all text collected from {@code rev} elements so far (empty if there was none)
         */
        public String getRevisionContent() {
            return this.revisionText.toString();
        }

        /**
         * @return the timestamp attribute of the last {@code rev} element seen, or {@code null}
         */
        public String getTimestamp() {
            return this.timestamp;
        }
    }

    /**
     * Creates a checker without an ngram directory and with the size limit
     * set to {@link Integer#MAX_VALUE}.
     */
    public WikipediaQuickCheck() {
        this(null, Integer.MAX_VALUE);
    }

    /**
     * Creates a checker with the size limit set to {@link Integer#MAX_VALUE}.
     *
     * @param file directory passed to the language model rule activation, or {@code null} to skip it
     */
    public WikipediaQuickCheck(File file) {
        this(file, Integer.MAX_VALUE);
    }

    /**
     * Creates a checker with an initially empty list of disabled rule ids.
     *
     * @param file directory passed to the language model rule activation, or {@code null} to skip it
     * @param i    maximum accepted length of the downloaded API response
     */
    public WikipediaQuickCheck(File file, int i) {
        // Diamond instead of the raw ArrayList so the assignment is type-checked.
        this.disabledRuleIds = new ArrayList<>();
        this.ngramDir = file;
        this.maxSizeBytes = i;
    }

    /**
     * Fetches the MediaWiki API response (XML, latest revision content and timestamp)
     * for the article referenced by a Wikipedia page URL.
     *
     * @param url a Wikipedia article URL accepted by the URL patterns of this class
     * @return the raw response body of the API request
     * @throws IOException if building the API URL or downloading the content fails
     */
    public String getMediaWikiContent(URL url) throws IOException {
        // Language is resolved before the title, matching the left-to-right order of the request URL.
        final String languageCode = getLanguage(url).getShortCode();
        final String encodedTitle = URLEncoder.encode(getPageTitle(url), "utf-8");
        final String apiAddress = "https://" + languageCode
                + ".wikipedia.org/w/api.php?titles=" + encodedTitle
                + "&action=query&prop=revisions&rvprop=content|timestamp&format=xml";
        return getContent(new URL(apiAddress));
    }

    /**
     * Resolves the language of a Wikipedia URL from the language code captured
     * as group 1 of the URL pattern.
     *
     * @param url a Wikipedia article URL
     * @return the language registered for the captured short code
     * @throws RuntimeException if the URL matches neither supported Wikipedia URL pattern
     */
    public Language getLanguage(URL url) {
        final Matcher urlMatcher = getUrlMatcher(url.toString());
        final String shortCode = urlMatcher.group(1);
        return Languages.getLanguageForShortCode(shortCode);
    }

    /**
     * Extracts the page title, i.e. the part of the URL captured as group 2
     * of the URL pattern (everything following {@code /wiki/}).
     *
     * @param url a Wikipedia article URL
     * @return the title portion exactly as it appears in the URL
     * @throws RuntimeException if the URL matches neither supported Wikipedia URL pattern
     */
    public String getPageTitle(URL url) {
        final Matcher urlMatcher = getUrlMatcher(url.toString());
        return urlMatcher.group(2);
    }

    /**
     * Matches the given address against the regular and the "secure" Wikipedia
     * URL patterns, in that order.
     *
     * @param str the URL as a string
     * @return a matcher that has already matched; group 1 is the language code, group 2 the title
     * @throws RuntimeException if neither pattern matches the whole string
     */
    private Matcher getUrlMatcher(String str) {
        final Matcher regular = WIKIPEDIA_URL_REGEX.matcher(str);
        if (regular.matches()) {
            return regular;
        }
        final Matcher secure = SECURE_WIKIPEDIA_URL_REGEX.matcher(str);
        if (!secure.matches()) {
            throw new RuntimeException("URL does not seem to be a valid Wikipedia URL: " + str);
        }
        return secure;
    }

    /**
     * Replaces the ids of the rules that are disabled on every language tool
     * created by this checker. The given list is stored as-is, not copied.
     *
     * @param list rule ids to disable; {@code null} is treated as "disable nothing"
     */
    public void setDisabledRuleIds(List<String> list) {
        // A null list would otherwise surface later as a NullPointerException
        // when the ids are iterated while configuring the language tool.
        this.disabledRuleIds = list != null ? list : new ArrayList<String>();
    }

    /**
     * @return the list of rule ids currently configured to be disabled
     *         (the stored list itself, not a copy)
     */
    public List<String> getDisabledRuleIds() {
        final List<String> currentIds = this.disabledRuleIds;
        return currentIds;
    }

    /**
     * Checks the Wikipedia page at the given URL without an error marker.
     *
     * @param url a Wikipedia article URL
     * @return the check result for the page's markup
     * @throws IOException if downloading or checking fails
     * @throws PageNotFoundException if the page has no content or is a redirect
     */
    public MarkupAwareWikipediaResult checkPage(URL url) throws IOException, PageNotFoundException {
        final ErrorMarker noMarker = null;
        return checkPage(url, noMarker);
    }

    /**
     * Downloads the Wikipedia page behind the given URL and checks its markup.
     *
     * @param url         a Wikipedia article URL
     * @param errorMarker marker forwarded to the markup check; may be {@code null}
     * @return the check result for the page's markup
     * @throws IOException           if downloading or checking fails
     * @throws PageNotFoundException if the revision content is blank or contains a redirect directive
     * @throws RuntimeException      if the URL is not a supported Wikipedia URL or the
     *                               downloaded response exceeds the configured size limit
     */
    public MarkupAwareWikipediaResult checkPage(URL url, ErrorMarker errorMarker) throws IOException, PageNotFoundException {
        validateWikipediaUrl(url);
        String apiResponse = getMediaWikiContent(url);
        if (apiResponse.length() > this.maxSizeBytes) {
            throw new RuntimeException("Sorry, the content at " + url + " is too big - this process has been limited to " + this.maxSizeBytes + " bytes, but the content is " + apiResponse.length() + " bytes");
        }
        MediaWikiContent revisionContent = getRevisionContent(apiResponse);
        String content = revisionContent.getContent();
        if (content.trim().isEmpty()) {
            throw new PageNotFoundException("No content found at '" + url + "'");
        }
        // Locale.ROOT keeps the comparison independent of the default locale: with a
        // Turkish default locale "#REDIRECT" would lower-case to a dotless i and be missed.
        if (content.toLowerCase(java.util.Locale.ROOT).contains("#redirect")) {
            throw new PageNotFoundException("No content but redirect found at '" + url + "'");
        }
        return checkWikipediaMarkup(url, revisionContent, getLanguage(url), errorMarker);
    }

    /**
     * Converts the wiki markup to plain text, checks it, and maps every rule
     * match back onto the original markup.
     *
     * @param url              used only in the diagnostic printed when a suggestion cannot be applied
     * @param mediaWikiContent the revision whose content is checked
     * @param language         language used to configure the checker
     * @param errorMarker      optional marker handed to the suggestion replacer; may be {@code null}
     * @return the applied matches plus the number of matches whose suggestions could not be applied
     * @throws IOException if creating the language tool or checking the text fails
     */
    MarkupAwareWikipediaResult checkWikipediaMarkup(URL url, MediaWikiContent mediaWikiContent, Language language, ErrorMarker errorMarker) throws IOException {
        PlainTextMapping mapping = new SwebleWikipediaTextFilter().filter(mediaWikiContent.getContent());
        MultiThreadedJLanguageTool languageTool = getLanguageTool(language);
        List<RuleMatch> ruleMatches;
        try {
            ruleMatches = languageTool.check(mapping.getPlainText());
        } finally {
            // The tool is only needed for the check itself; shut it down exactly once,
            // whether or not the check succeeded.
            languageTool.shutdown();
        }
        List<AppliedRuleMatch> appliedMatches = new ArrayList<>();
        int internalErrors = 0;
        for (RuleMatch ruleMatch : ruleMatches) {
            try {
                SuggestionReplacer replacer = errorMarker != null
                        ? new SuggestionReplacer(mapping, mediaWikiContent.getContent(), errorMarker)
                        : new SuggestionReplacer(mapping, mediaWikiContent.getContent());
                appliedMatches.add(new AppliedRuleMatch(ruleMatch, replacer.applySuggestionsToOriginalText(ruleMatch)));
            } catch (Exception e) {
                // Best effort: a match that cannot be mapped back is reported and counted, not fatal.
                System.err.println("Failed to apply suggestion for rule match '" + ruleMatch + "' for URL " + url + ": " + e);
                internalErrors++;
            }
        }
        return new MarkupAwareWikipediaResult(mediaWikiContent, appliedMatches, internalErrors);
    }

    /**
     * Checks the given text with a freshly configured language tool.
     *
     * @param str      the text to check
     * @param language language used to configure the checker
     * @return the text, its rule matches and the language's short code
     * @throws IOException if creating the language tool or checking the text fails
     */
    public WikipediaQuickCheckResult checkPage(String str, Language language) throws IOException {
        MultiThreadedJLanguageTool languageTool = getLanguageTool(language);
        try {
            return new WikipediaQuickCheckResult(str, languageTool.check(str), language.getShortCode());
        } finally {
            // Shut the tool down exactly once on both the normal and the exceptional path.
            languageTool.shutdown();
        }
    }

    /**
     * Verifies that the URL has one of the supported Wikipedia URL forms.
     *
     * @param url the URL to validate
     * @throws RuntimeException if the URL matches neither supported pattern
     */
    public void validateWikipediaUrl(URL url) {
        final String address = url.toString();
        // The matcher itself is not needed; the lookup throws for unsupported URLs.
        getUrlMatcher(address);
    }

    /**
     * Extracts the revision content from an API response, strips Wikipedia
     * links from it and converts the remaining markup to plain text.
     *
     * @param str the XML returned by the MediaWiki API
     * @return the plain text of the revision
     * @throws RuntimeException if the XML cannot be parsed
     */
    public String getPlainText(String str) {
        final SwebleWikipediaTextFilter textFilter = new SwebleWikipediaTextFilter();
        final String wikiMarkup = getRevisionContent(str).getContent();
        final String markupWithoutLinks = removeWikipediaLinks(wikiMarkup);
        return textFilter.filter(markupWithoutLinks).getPlainText();
    }

    /**
     * Extracts the revision content from an API response and filters it into
     * a plain text mapping. Unlike {@code getPlainText}, links are not removed first.
     *
     * @param str the XML returned by the MediaWiki API
     * @return the mapping produced by the text filter for the revision content
     * @throws RuntimeException if the XML cannot be parsed
     */
    public PlainTextMapping getPlainTextMapping(String str) {
        final SwebleWikipediaTextFilter textFilter = new SwebleWikipediaTextFilter();
        final String wikiMarkup = getRevisionContent(str).getContent();
        return textFilter.filter(wikiMarkup);
    }

    /**
     * Removes three kinds of markup from wiki text, in this order:
     * double-bracket links whose target starts with 2 to 6 lowercase letters and a colon,
     * category links in several languages, and the leading part of file/image
     * references up to and including optional thumbnail and alignment options.
     *
     * @param str the wiki markup
     * @return the markup with the matched spans removed
     */
    String removeWikipediaLinks(String str) {
        final String withoutPrefixedLinks = str.replaceAll("\\[\\[[a-z]{2,6}:.*?\\]\\]", "");
        final String withoutCategories = withoutPrefixedLinks.replaceAll("\\[\\[:?(Category|Categoria|Categoría|Catégorie|Kategorie):.*?\\]\\]", "");
        final String withoutFilePrefixes = withoutCategories.replaceAll("(File|Fitxer|Fichero|Ficheiro|Fichier|Datei):.*?\\.(png|jpg|svg|jpeg|tiff|gif|PNG|JPG|SVG|JPEG|TIFF|GIF)\\|((thumb|miniatur)\\|)?((right|left)\\|)?", "");
        return withoutFilePrefixes;
    }

    /**
     * Parses a MediaWiki API XML response and extracts the revision text and timestamp.
     *
     * @param str the XML returned by the MediaWiki API
     * @return the collected revision content together with its timestamp
     * @throws RuntimeException if the parser cannot be configured or the XML cannot be parsed
     */
    private MediaWikiContent getRevisionContent(String str) {
        SAXParserFactory factory = SAXParserFactory.newInstance();
        RevisionContentHandler handler = new RevisionContentHandler();
        try {
            // The XML comes from a remote server: reject DOCTYPE declarations so that
            // external entities (XXE) and entity expansion attacks cannot be triggered.
            factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
            factory.newSAXParser().parse(new InputSource(new StringReader(str)), handler);
            return new MediaWikiContent(handler.getRevisionContent(), handler.getTimestamp());
        } catch (Exception e) {
            throw new RuntimeException("Could not parse XML: " + str, e);
        }
    }

    /**
     * Creates a language tool for the given language and configures it:
     * Wikipedia rules are enabled, the configured rule ids are disabled,
     * language model rules are activated when an ngram directory is set, and
     * finally dictionary-based spelling rules are disabled.
     *
     * @param language the language to check
     * @return the configured tool; the caller is responsible for shutting it down
     * @throws IOException if creating the tool or activating the language model rules fails
     */
    private MultiThreadedJLanguageTool getLanguageTool(Language language) throws IOException {
        final MultiThreadedJLanguageTool tool = new MultiThreadedJLanguageTool(language);
        enableWikipediaRules(tool);
        for (String ruleId : this.disabledRuleIds) {
            tool.disableRule(ruleId);
        }
        final File languageModelDir = this.ngramDir;
        if (languageModelDir != null) {
            tool.activateLanguageModelRules(languageModelDir);
        }
        disableSpellingRules(tool);
        return tool;
    }

    /**
     * Enables every rule of the given tool whose category is named "Wikipedia".
     *
     * @param jLanguageTool the tool to configure
     */
    private void enableWikipediaRules(JLanguageTool jLanguageTool) {
        for (Rule candidate : jLanguageTool.getAllRules()) {
            final String categoryName = candidate.getCategory().getName();
            if (!categoryName.equals("Wikipedia")) {
                continue;
            }
            jLanguageTool.enableRule(candidate.getId());
        }
    }

    /**
     * Disables every currently active rule that reports itself as a
     * dictionary-based spelling rule.
     *
     * @param jLanguageTool the tool to configure
     */
    private void disableSpellingRules(JLanguageTool jLanguageTool) {
        for (Rule activeRule : jLanguageTool.getAllActiveRules()) {
            if (!activeRule.isDictionaryBasedSpellingRule()) {
                continue;
            }
            jLanguageTool.disableRule(activeRule.getId());
        }
    }

    /**
     * Downloads the content behind the given URL with an HTTP GET request and
     * decodes it as UTF-8. Connect and read timeouts are 30 seconds each.
     *
     * @param url the address to fetch
     * @return the response body as a string
     * @throws IOException      if the connection or the download fails
     * @throws RuntimeException if connecting or reading times out
     */
    private String getContent(URL url) throws IOException {
        final int timeoutMillis = 30000;
        try {
            HttpURLConnection connection = (HttpURLConnection) url.openConnection();
            connection.setRequestMethod("GET");
            connection.setConnectTimeout(timeoutMillis);
            connection.setReadTimeout(timeoutMillis);
            connection.connect();
            // try-with-resources closes the stream on every path, including a failed read.
            try (InputStream contentStream = (InputStream) connection.getContent()) {
                return StringTools.streamToString(contentStream, "UTF-8");
            }
        } catch (SocketTimeoutException e) {
            throw new RuntimeException("Timeout accessing " + url, e);
        }
    }

    /**
     * Command line entry point: checks the Wikipedia page given as the single
     * argument and prints a numbered list of rule matches, each with its rule id
     * and the original error context marked with "***".
     *
     * @param strArr exactly one element, the Wikipedia page URL; otherwise a usage
     *               message is printed and the JVM exits with status 1
     * @throws IOException           if downloading or checking the page fails
     * @throws PageNotFoundException if the page has no content or is a redirect
     */
    public static void main(String[] strArr) throws IOException, PageNotFoundException {
        if (strArr.length != 1) {
            System.out.println("Usage: " + WikipediaQuickCheck.class.getName() + " <url>");
            System.exit(1);
        }
        final WikipediaQuickCheck checker = new WikipediaQuickCheck();
        final MarkupAwareWikipediaResult result = checker.checkPage(new URL(strArr[0]), new ErrorMarker("***", "***"));
        int errorCount = 0;
        for (AppliedRuleMatch applied : result.getAppliedRuleMatches()) {
            // Only the first application of each match is shown.
            final RuleMatchApplication firstApplication = applied.getRuleMatchApplications().get(0);
            final RuleMatch match = applied.getRuleMatch();
            final Rule rule = match.getRule();
            System.out.println();
            errorCount++;
            final String readableMessage = match.getMessage().replace("<suggestion>", "'").replace("</suggestion>", "'");
            System.out.print(errorCount + ". " + readableMessage);
            // Pattern rules are reported with their full id, all other rules with the plain id.
            final String ruleId = rule instanceof AbstractPatternRule ? rule.getFullId() : rule.getId();
            System.out.println(" (" + ruleId + ")");
            final String context = firstApplication.getOriginalErrorContext(50).replace("\n", "\\n");
            System.out.println("    ..." + context + "...");
        }
    }
}
