package org.apache.lucene.benchmark.byTask.feeds;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;
import org.cyberneko.html.parsers.SAXParser;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/* loaded from: input_file:org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.class */
/**
 * {@link HTMLParser} implementation backed by the CyberNeko HTML SAX parser.
 *
 * <p>It extracts the document title, the {@code <meta>} tags found inside {@code <head>} and the
 * textual content of {@code <body>}, and copies them into a {@link DocData}.
 */
public class DemoHTMLParser implements HTMLParser {

    /**
     * Parses a single HTML document. All work happens in the constructor; afterwards the
     * results are available through the public final fields.
     */
    public static final class Parser {
        /**
         * Content of the {@code <meta>} tags seen inside {@code <head>}, keyed by the lower-cased
         * {@code name} attribute (or {@code http-equiv} when {@code name} is absent).
         */
        public final Properties metaTags = new Properties();

        /** Trimmed text collected inside {@code <title>}. */
        public final String title;

        /** Text collected inside {@code <body>}, untrimmed. */
        public final String body;

        /** Elements whose closing tag appends a line break to the body text. */
        static final Set<String> ENDLINE_ELEMENTS = createElementNameSet("p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl", "pre", "hr", "blockquote", "address", "fieldset", "table", "form", "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", "option");

        /** Elements whose character content is excluded from the body text. */
        static final Set<String> SUPPRESS_ELEMENTS = createElementNameSet("style", "script");

        /**
         * Parses the HTML document supplied by {@code reader}.
         *
         * @param reader character source of the HTML document
         * @throws IOException if reading the input fails
         * @throws SAXException if the SAX parser reports an error, or the document uses framesets
         */
        public Parser(Reader reader) throws IOException, SAXException {
            this(new InputSource(reader));
        }

        /**
         * Parses the HTML document supplied by {@code inputSource}.
         *
         * @param inputSource SAX input source of the HTML document
         * @throws IOException if reading the input fails
         * @throws SAXException if the SAX parser reports an error, or the document uses framesets
         */
        public Parser(InputSource inputSource) throws IOException, SAXException {
            SAXParser saxParser = new SAXParser();
            saxParser.setFeature("http://xml.org/sax/features/namespaces", true);
            saxParser.setFeature("http://cyberneko.org/html/features/balance-tags", true);
            saxParser.setFeature("http://cyberneko.org/html/features/report-errors", false);
            // Element and attribute names are requested in lower case, which is why the handler
            // below compares against lower-case literals only.
            saxParser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
            saxParser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");

            final StringBuilder titleText = new StringBuilder();
            final StringBuilder bodyText = new StringBuilder();

            DefaultHandler handler = new DefaultHandler() {
                // Counters rather than flags: each start tag increments, each end tag decrements.
                private int inBODY = 0;
                private int inHEAD = 0;
                private int inTITLE = 0;
                private int suppressed = 0;

                @Override
                public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
                    if (inHEAD > 0) {
                        if ("title".equals(localName)) {
                            inTITLE++;
                        } else if ("meta".equals(localName)) {
                            String key = attributes.getValue("name");
                            if (key == null) {
                                key = attributes.getValue("http-equiv");
                            }
                            String content = attributes.getValue("content");
                            // A meta tag is only recorded when it has both a key and a content.
                            if (key != null && content != null) {
                                metaTags.setProperty(key.toLowerCase(Locale.ROOT), content);
                            }
                        }
                    } else if (inBODY > 0) {
                        if (SUPPRESS_ELEMENTS.contains(localName)) {
                            suppressed++;
                        } else if ("img".equals(localName)) {
                            // The alt text of an image is kept as body text, wrapped in brackets.
                            String alt = attributes.getValue("alt");
                            if (alt != null) {
                                bodyText.append('[').append(alt).append(']');
                            }
                        }
                    } else if ("body".equals(localName)) {
                        // Literal HTML tag name, like "head" and "title"; deliberately not tied to
                        // any index field-name constant.
                        inBODY++;
                    } else if ("head".equals(localName)) {
                        inHEAD++;
                    } else if ("frameset".equals(localName)) {
                        throw new SAXException("This parser does not support HTML framesets.");
                    }
                }

                @Override
                public void endElement(String uri, String localName, String qName) throws SAXException {
                    if (inBODY > 0) {
                        if ("body".equals(localName)) {
                            inBODY--;
                        } else if (ENDLINE_ELEMENTS.contains(localName)) {
                            bodyText.append('\n');
                        } else if (SUPPRESS_ELEMENTS.contains(localName)) {
                            suppressed--;
                        }
                    } else if (inHEAD > 0) {
                        if ("head".equals(localName)) {
                            inHEAD--;
                        } else if (inTITLE > 0 && "title".equals(localName)) {
                            inTITLE--;
                        }
                    }
                }

                @Override
                public void characters(char[] chars, int start, int length) throws SAXException {
                    if (inBODY > 0 && suppressed == 0) {
                        bodyText.append(chars, start, length);
                    } else if (inTITLE > 0) {
                        titleText.append(chars, start, length);
                    }
                }

                // Resolves every external entity to an empty document.
                // NOTE(review): this handler is registered below only as content and error
                // handler, not via setEntityResolver - confirm whether that is intended.
                @Override
                public InputSource resolveEntity(String publicId, String systemId) {
                    return new InputSource(new StringReader(""));
                }
            };

            saxParser.setContentHandler(handler);
            saxParser.setErrorHandler(handler);
            saxParser.parse(inputSource);

            this.title = titleText.toString().trim();
            this.body = bodyText.toString();
        }

        /** Builds an unmodifiable set containing the given element names. */
        private static Set<String> createElementNameSet(String... strArr) {
            return Collections.unmodifiableSet(new HashSet<String>(Arrays.asList(strArr)));
        }
    }

    /**
     * Parses the HTML read from {@code reader} into {@code docData}.
     *
     * @param docData target object; it is cleared and then populated
     * @param str name to store on {@code docData}
     * @param date date to store unless the document carries a parseable "date" meta tag
     * @param reader character source of the HTML document
     * @param trecContentSource used to parse the value of the "date" meta tag
     * @return the populated {@code docData}
     * @throws IOException if reading fails, or wrapping any {@link SAXException} raised while parsing
     */
    @Override
    public DocData parse(DocData docData, String str, Date date, Reader reader, TrecContentSource trecContentSource) throws IOException {
        try {
            return parse(docData, str, date, new InputSource(reader), trecContentSource);
        } catch (SAXException e) {
            throw new IOException("SAX exception occurred while parsing HTML document.", e);
        }
    }

    /**
     * Parses the HTML supplied by {@code inputSource} into {@code docData}.
     *
     * @param docData target object; it is cleared and then populated
     * @param str name to store on {@code docData}
     * @param date date to store unless the document carries a parseable "date" meta tag
     * @param inputSource SAX input source of the HTML document
     * @param trecContentSource used to parse the value of the "date" meta tag
     * @return the populated {@code docData}
     * @throws IOException if reading the input fails
     * @throws SAXException if the SAX parser reports an error, or the document uses framesets
     */
    public DocData parse(DocData docData, String str, Date date, InputSource inputSource, TrecContentSource trecContentSource) throws IOException, SAXException {
        Parser parsed = new Parser(inputSource);
        Properties props = parsed.metaTags;

        // A "date" meta tag overrides the caller-supplied date, but only when it parses to non-null.
        Date effectiveDate = date;
        String dateText = props.getProperty("date");
        if (dateText != null) {
            Date metaDate = trecContentSource.parseDate(dateText);
            if (metaDate != null) {
                effectiveDate = metaDate;
            }
        }

        docData.clear();
        docData.setName(str);
        docData.setBody(parsed.body);
        docData.setTitle(parsed.title);
        docData.setProps(props);
        docData.setDate(effectiveDate);
        return docData;
    }
}
