package org.apache.tika.parser.html;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.regex.Pattern;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import org.apache.tika.Tika;
import org.apache.tika.TikaTest;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.AutoDetectReader;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Geographic;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/* loaded from: input_file:org/apache/tika/parser/html/HtmlParserTest.class */
public class HtmlParserTest extends TikaTest {

    /**
     * Task that takes file paths from a shared queue and asserts that the supplied
     * {@link EncodingDetector} reports the encoding recorded for each path.
     */
    public class EncodingDetectorRunner implements Callable<String> {
        /** Value returned by {@link #call()} when the task finishes normally. */
        static final String DONE = "done";
        private final ArrayBlockingQueue<Path> paths;
        private final Map<Path, String> encodings;
        private final EncodingDetector detector;

        /**
         * @param paths     queue of files still to be checked
         * @param encodings expected encoding for each file
         * @param detector  detector under test
         */
        private EncodingDetectorRunner(ArrayBlockingQueue<Path> paths, Map<Path, String> encodings, EncodingDetector detector) {
            this.paths = paths;
            this.encodings = encodings;
            this.detector = detector;
        }

        /**
         * Polls at most {@code encodings.size()} paths, stopping early once the queue is empty,
         * and asserts the detected encoding of each polled file.
         *
         * @return {@link #DONE}
         * @throws IOException declared by this task; raised by the encoding lookup if it fails
         */
        @Override
        public String call() throws IOException {
            for (int processed = 0; processed < encodings.size(); processed++) {
                Path path = paths.poll();
                if (path == null) {
                    // Queue drained: nothing left for this worker.
                    break;
                }
                String expected = encodings.get(path);
                String detected = HtmlParserTest.this.getEncoding(detector, path);
                Assertions.assertEquals(expected, detected, "detector class=" + detector.getClass() + " : file=" + path.toString());
            }
            return DONE;
        }
    }

    /**
     * Parses {@code testHTML.html} and checks the extracted metadata, the anchor attributes
     * observed through a SAX handler, and the body text.
     */
    @Test
    public void testParseAscii() throws Exception {
        final StringWriter href = new StringWriter();
        final StringWriter name = new StringWriter();
        ContentHandler body = new BodyContentHandler();
        // Records the href attribute of each <a> element, or its name attribute when href is absent.
        ContentHandler link = new DefaultHandler() {
            @Override
            public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
                if (!"a".equals(localName)) {
                    return;
                }
                String hrefValue = attributes.getValue("href");
                if (hrefValue != null) {
                    href.append(hrefValue);
                    return;
                }
                String nameValue = attributes.getValue("name");
                if (nameValue != null) {
                    name.append(nameValue);
                }
            }
        };
        Metadata metadata = new Metadata();

        // try-with-resources closes the stream exactly once, whether or not parsing succeeds.
        try (InputStream stream = getResourceAsStream("/test-documents/testHTML.html")) {
            new JSoupParser().parse(stream, new TeeContentHandler(new ContentHandler[] {body, link}), metadata, new ParseContext());
        }

        Assertions.assertEquals("Title : Test Indexation Html", metadata.get(TikaCoreProperties.TITLE));
        Assertions.assertEquals("Tika Developers", metadata.get("Author"));
        Assertions.assertEquals("5", metadata.get("refresh"));
        Assertions.assertEquals("51.2312", metadata.get(Geographic.LATITUDE));
        Assertions.assertEquals("-5.1987", metadata.get(Geographic.LONGITUDE));
        Assertions.assertEquals("http://www.apache.org/", href.toString());
        Assertions.assertEquals("test-anchor", name.toString());

        String content = body.toString();
        Assertions.assertTrue(content.contains("Test Indexation Html"), "Did not contain expected text:Test Indexation Html");
        Assertions.assertTrue(content.contains("Indexation du fichier"), "Did not contain expected text:Indexation du fichier");
    }

    /**
     * Disabled test that parses {@code testXHTML_utf8.html} to text and checks that three
     * expected fragments are present.
     */
    @Disabled("The file 'testXHTML_utf8.html' is not available for testing")
    @Test
    public void XtestParseUTF8() throws IOException, SAXException, TikaException {
        // NOTE(review): the non-ASCII literals below look like UTF-8 text decoded with the wrong
        // charset; confirm against the fixture before re-enabling this test.
        String expectedTitle = "Title : Tilte with UTF-8 chars √∂√§√•";
        String expectedSentence = "Content with UTF-8 chars";
        String expectedChars = "√•√§√∂";

        Tika tika = new Tika();
        InputStream stream = getResourceAsStream("/test-documents/testXHTML_utf8.html");
        String content = tika.parseToString(stream, new Metadata());

        Assertions.assertTrue(content.contains(expectedTitle), "Did not contain expected text:" + expectedTitle);
        Assertions.assertTrue(content.contains(expectedSentence), "Did not contain expected text:" + expectedSentence);
        Assertions.assertTrue(content.contains(expectedChars), "Did not contain expected text:" + expectedChars);
    }

    /**
     * Parses {@code testXHTML.html} through the {@link Tika} facade and checks the reported
     * content type prefix, selected metadata values and fragments of the extracted text.
     */
    @Test
    public void testXhtmlParsing() throws Exception {
        Metadata metadata = new Metadata();
        Tika tika = new Tika();
        InputStream stream = getResourceAsStream("/test-documents/testXHTML.html");
        String content = tika.parseToString(stream, metadata);

        String contentType = metadata.get("Content-Type");
        Assertions.assertTrue(contentType.startsWith("application/xhtml+xml; charset="));
        Assertions.assertEquals("XHTML test document", metadata.get(TikaCoreProperties.TITLE));
        Assertions.assertEquals("Tika Developers", metadata.get("Author"));
        Assertions.assertEquals("5", metadata.get("refresh"));

        String[] expectedFragments = {"ability of Apache Tika", "extract content", "an XHTML document"};
        for (String fragment : expectedFragments) {
            assertContains(fragment, content);
        }
    }

    /** Parsing a zero-length stream must produce empty body text. */
    @Test
    public void testParseEmpty() throws Exception {
        BodyContentHandler handler = new BodyContentHandler();
        ByteArrayInputStream emptyInput = new ByteArrayInputStream(new byte[0]);
        new JSoupParser().parse(emptyInput, handler, new Metadata(), new ParseContext());
        String extracted = handler.toString();
        Assertions.assertEquals("", extracted);
    }

    /** Text placed directly inside {@code <body>} must be returned as the extracted content. */
    @Test
    public void testCharactersDirectlyUnderBodyElement() throws Exception {
        String html = "<html><body>test</body></html>";
        ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
        String content = new Tika().parseToString(input);
        Assertions.assertEquals("test", content);
    }

    /**
     * Checks resolution of anchor hrefs against a {@code <base href>} for a series of
     * base/relative combinations.
     */
    @Test
    public void testBaseHref() throws Exception {
        // Each row: expected resolved URL, base href, href as written in the document.
        String[][] cases = {
            {"http://lucene.apache.org/tika/", "http://lucene.apache.org/", "tika/"},
            {"http://domain.com/?pid=1", "http://domain.com", "?pid=1"},
            {"http://domain.com/?pid=2", "http://domain.com?pid=1", "?pid=2"},
            {"http://domain.com/file.html", "http://domain.com/path/", "/file.html"},
            {"http://domain.com/path/file.html", "http://domain.com/path/", "./file.html"},
            {"http://domain.com/path/file.html", "http://domain.com/path/", "file.html"},
            {"http://domain2.com/newpath", "http://domain.com/path/to/file", "http://domain2.com/newpath"},
            {"http://domain.com/path/?pid=1", "http://domain.com/path/", "?pid=1"},
            {"http://domain.com/file?pid=1", "http://domain.com/file", "?pid=1"},
            {"http://domain.com/path/d;p?pid=1", "http://domain.com/path/d;p?q#f", "?pid=1"},
        };
        for (String[] testCase : cases) {
            assertRelativeLink(testCase[0], testCase[1], testCase[2]);
        }
    }

    /**
     * Builds a minimal HTML document with the given base href and a single anchor, parses it,
     * and asserts that exactly one anchor href is reported and that it equals the expected URL.
     *
     * @param str  expected href value reported by the parser
     * @param str2 value placed in {@code <base href>}
     * @param str3 value placed in the anchor's {@code href} attribute
     * @throws Exception if parsing fails
     */
    private void assertRelativeLink(String str, String str2, String str3) throws Exception {
        String html = "<html><head><base href=\"" + str2 + "\"></head><body><a href=\"" + str3 + "\">test</a></body></html>";
        // Typed list instead of a raw ArrayList so the collected hrefs are checked at compile time.
        final List<String> links = new ArrayList<>();
        ContentHandler collector = new DefaultHandler() {
            @Override
            public void startElement(String uri, String localName, String qName, Attributes attributes) {
                if (!qName.equals("a")) {
                    return;
                }
                String href = attributes.getValue("", "href");
                if (href != null) {
                    links.add(href);
                }
            }
        };
        new JSoupParser().parse(new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8)), collector, new Metadata(), new ParseContext());
        Assertions.assertEquals(1, links.size());
        Assertions.assertEquals(str, links.get(0));
    }

    /** Adjacent table cells must not be run together in the extracted text. */
    @Test
    public void testWhitespaceBetweenTableCells() throws Exception {
        String html = "<html><body><table><tr><td>a</td><td>b</td></table></body></html>";
        ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
        String content = new Tika().parseToString(input);
        assertContains("a", content);
        assertContains("b", content);
        // The two cell values must be separated by something.
        boolean cellsRunTogether = content.contains("ab");
        Assertions.assertFalse(cellsRunTogether);
    }

    /**
     * A charset declared through {@code <meta http-equiv="content-type">} must be reported as
     * the Content-Encoding.
     */
    @Test
    public void testHttpEquivCharset() throws Exception {
        String html = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-8859-1\" /><title>the name is ándre</title></head><body></body></html>";
        byte[] encoded = html.getBytes(StandardCharsets.ISO_8859_1);
        Metadata metadata = new Metadata();
        new JSoupParser().parse(new ByteArrayInputStream(encoded), new BodyContentHandler(), metadata, new ParseContext());
        Assertions.assertEquals("ISO-8859-1", metadata.get("Content-Encoding"));
    }

    /** A charset declared through {@code <meta charset>} must be reported as the Content-Encoding. */
    @Test
    public void testHtml5Charset() throws Exception {
        String html = "<html><head><meta charset=\"ISO-8859-15\" /><title>the name is ándre</title></head><body></body></html>";
        byte[] encoded = html.getBytes(StandardCharsets.ISO_8859_1);
        Metadata metadata = new Metadata();
        new JSoupParser().parse(new ByteArrayInputStream(encoded), new BodyContentHandler(), metadata, new ParseContext());
        Assertions.assertEquals("ISO-8859-15", metadata.get("Content-Encoding"));
    }

    /**
     * With no charset declared in the document, a UTF-8 encoded non-ASCII title must still be
     * decoded correctly.
     */
    @Test
    public void testDetectOfCharset() throws Exception {
        String html = "<html><head><title>Ž</title></head><body></body></html>";
        byte[] encoded = html.getBytes(StandardCharsets.UTF_8);
        Metadata metadata = new Metadata();
        new JSoupParser().parse(new ByteArrayInputStream(encoded), new BodyContentHandler(), metadata, new ParseContext());
        Assertions.assertEquals("Ž", metadata.get(TikaCoreProperties.TITLE));
    }

    /**
     * Parses the same markup twice: once as UTF-8 bytes with empty metadata, and once as
     * ISO-8859-1 bytes with a Content-Type value naming that charset. The reported
     * Content-Encoding must match in each case.
     */
    @Test
    public void testUsingCharsetInContentTypeHeader() throws Exception {
        final String html = "<html><head><title>the name is ándre</title></head><body></body></html>";

        // No hint supplied.
        Metadata withoutHint = new Metadata();
        new JSoupParser().parse(new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8)), new BodyContentHandler(), withoutHint, new ParseContext());
        Assertions.assertEquals("UTF-8", withoutHint.get("Content-Encoding"));

        // Charset supplied through the Content-Type metadata entry.
        Metadata withHint = new Metadata();
        withHint.set("Content-Type", "text/html; charset=ISO-8859-1");
        new JSoupParser().parse(new ByteArrayInputStream(html.getBytes(StandardCharsets.ISO_8859_1)), new BodyContentHandler(), withHint, new ParseContext());
        Assertions.assertEquals("ISO-8859-1", withHint.get("Content-Encoding"));
    }

    /**
     * {@code <br>} and the end of a {@code <div>} must each separate words in the extracted
     * text.
     */
    @Test
    public void testLineBreak() throws Exception {
        String html = "<html><body><div>foo<br>bar</div>baz</body></html>";
        ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(StandardCharsets.US_ASCII));
        String content = new Tika().parseToString(input);

        String[] expected = {"foo", "bar", "baz"};
        String[] actual = content.trim().split("\\s+");
        Assertions.assertEquals(3, actual.length);
        for (int i = 0; i < expected.length; i++) {
            Assertions.assertEquals(expected[i], actual[i]);
        }
    }

    /** A Content-Language value set before parsing must still be {@code "en"} afterwards. */
    @Test
    public void testIgnoreCharsetDetectorLanguage() throws Exception {
        String html = "<html><title>Simple Content</title><body></body></html>";
        Metadata metadata = new Metadata();
        metadata.add("Content-Language", "en");
        ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
        new JSoupParser().parse(input, new BodyContentHandler(), metadata, new ParseContext());
        Assertions.assertEquals("en", metadata.get("Content-Language"));
    }

    /**
     * The charset must be recovered from malformed http-equiv content values: one with a
     * repeated {@code charset} parameter and one with a doubled semicolon.
     */
    @Test
    public void testHttpEquivCharsetFunkyAttributes() throws Exception {
        // Duplicate charset parameter.
        String duplicatedCharset = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-8859-15; charset=iso-8859-15\" /><title>the name is ándre</title></head><body></body></html>";
        Metadata first = new Metadata();
        new JSoupParser().parse(new ByteArrayInputStream(duplicatedCharset.getBytes(StandardCharsets.ISO_8859_1)), new BodyContentHandler(), first, new ParseContext());
        Assertions.assertEquals("ISO-8859-15", first.get("Content-Encoding"));

        // Doubled semicolon before the charset parameter.
        String doubledSemicolon = "<html><head><meta http-equiv=\"content-type\" content=\"text/html;;charset=ISO-8859-15\" /><title>the name is ándre</title></head><body></body></html>";
        Metadata second = new Metadata();
        new JSoupParser().parse(new ByteArrayInputStream(doubledSemicolon.getBytes(StandardCharsets.ISO_8859_1)), new BodyContentHandler(), second, new ParseContext());
        Assertions.assertEquals("ISO-8859-15", second.get("Content-Encoding"));
    }

    /**
     * Same as the plain Content-Type header test, but the supplied Content-Type value puts the
     * charset parameter before the media type.
     */
    @Test
    public void testUsingFunkyCharsetInContentTypeHeader() throws Exception {
        final String html = "<html><head><title>the name is ándre</title></head><body></body></html>";

        // Baseline without any Content-Type value.
        Metadata withoutHint = new Metadata();
        new JSoupParser().parse(new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8)), new BodyContentHandler(), withoutHint, new ParseContext());
        Assertions.assertEquals("UTF-8", withoutHint.get("Content-Encoding"));

        // Content-Type value with the parameters in reversed order.
        Metadata withOddHint = new Metadata();
        withOddHint.set("Content-Type", "charset=ISO-8859-1;text/html");
        new JSoupParser().parse(new ByteArrayInputStream(html.getBytes(StandardCharsets.ISO_8859_1)), new BodyContentHandler(), withOddHint, new ParseContext());
        Assertions.assertEquals("ISO-8859-1", withOddHint.get("Content-Encoding"));
    }

    /**
     * Parses {@code big-preamble.html} and checks that the reported Content-Encoding is
     * {@code windows-1251}.
     */
    @Test
    public void testMetaHttpEquivWithLotsOfPreambleText() throws Exception {
        Metadata metadata = new Metadata();
        // Close the resource stream once parsing is done instead of leaking it.
        try (InputStream stream = getResourceAsStream("/test-documents/big-preamble.html")) {
            new JSoupParser().parse(stream, new BodyContentHandler(), metadata, new ParseContext());
        }
        Assertions.assertEquals("windows-1251", metadata.get("Content-Encoding"));
    }

    /**
     * Serializes the parser output as HTML and checks that {@code <title>}, {@code <meta>} and
     * {@code <link>} appear inside {@code <head>}, that neither {@code <meta>} nor
     * {@code <link>} appears inside {@code <body>}, and that the document ends with
     * {@code </html>}.
     */
    @Test
    public void testElementOrdering() throws Exception {
        String html = "<html><head><title>Title</title><meta http-equiv=\"content-type\" content=\"text/html\"><link rel=\"next\" href=\"next.html\" /></head><body><p>Simple Content</p></body></html>";
        StringWriter writer = new StringWriter();
        new JSoupParser().parse(new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8)), makeHtmlTransformer(writer), new Metadata(), new ParseContext());
        String result = writer.toString();

        // Title element in the head section.
        Assertions.assertTrue(Pattern.matches("(?s)<html.*<head>.*<title>Title</title>.*</head>.*$", result));

        // No meta elements in the body. The pattern previously read "<meta. *</body>", which
        // could essentially never match and so made this assertion vacuous.
        Assertions.assertFalse(Pattern.matches("(?s).*<body>.*<meta .*</body>.*$", result));
        // Meta elements in the head section.
        Assertions.assertTrue(Pattern.matches("(?s)<html.*<head>.*<meta .*</head>.*$", result));

        // No link elements in the body.
        Assertions.assertFalse(Pattern.matches("(?s).*<body>.*<link .*</body>.*$", result));
        // Link element in the head section.
        Assertions.assertTrue(Pattern.matches("(?s)<html.*<head>.*<link .*</head>.*$", result));

        // Body closes before the document does.
        Assertions.assertTrue(Pattern.matches("(?s).*</body>.*</html>$", result));
    }

    /** An {@code <img src>} must be serialized as an absolute URL built from the base href. */
    @Test
    public void testImgUrlExtraction() throws Exception {
        String html = "<html><head><title>Title</title><base href=\"http://domain.com\" /></head><body><img src=\"image.jpg\" /></body></html>";
        StringWriter writer = new StringWriter();
        ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
        new JSoupParser().parse(input, makeHtmlTransformer(writer), new Metadata(), new ParseContext());
        String result = writer.toString();
        Assertions.assertTrue(Pattern.matches("(?s).*src=\"http://domain.com/image.jpg\".*$", result));
    }

    /** A {@code <frame src>} must be serialized as an absolute URL built from the base href. */
    @Test
    public void testFrameSrcExtraction() throws Exception {
        String html = "<html><head><title>Title</title><base href=\"http://domain.com\" /></head><frameset><frame src=\"frame.html\" /></frameset></html>";
        StringWriter writer = new StringWriter();
        ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
        new JSoupParser().parse(input, makeHtmlTransformer(writer), new Metadata(), new ParseContext());
        String result = writer.toString();
        Assertions.assertTrue(Pattern.matches("(?s).*<frame .*src=\"http://domain.com/frame.html\"/>.*$", result));
    }

    /** An {@code <iframe src>} must be serialized as an absolute URL built from the base href. */
    @Test
    public void testIFrameSrcExtraction() throws Exception {
        String html = "<html><head><title>Title</title><base href=\"http://domain.com\" /></head><body><iframe src =\"framed.html\" width=\"100%\" height=\"300\"><p>Your browser doesn't support iframes!</p></body></html>";
        StringWriter writer = new StringWriter();
        ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
        new JSoupParser().parse(input, makeHtmlTransformer(writer), new Metadata(), new ParseContext());
        String result = writer.toString();
        Assertions.assertTrue(Pattern.matches("(?s).*<iframe .*src=\"http://domain.com/framed.html\".*$", result));
    }

    /**
     * An {@code <area href>} inside a {@code <map>} must be serialized as an absolute URL built
     * from the base href.
     */
    @Test
    public void testAreaExtraction() throws Exception {
        String html = "<html><head><title>Title</title><base href=\"http://domain.com\" /></head><body><p><map name=\"map\" id=\"map\"><area shape=\"rect\" href=\"map.html\" alt=\"\" /></map></p></body></html>";
        StringWriter writer = new StringWriter();
        ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
        new JSoupParser().parse(input, makeHtmlTransformer(writer), new Metadata(), new ParseContext());
        String result = writer.toString();
        Assertions.assertTrue(Pattern.matches("(?s).*<map .*<area .* href=\"http://domain.com/map.html\".*</map>.*$", result));
    }

    /**
     * An {@code <object data>} must be serialized as an absolute URL built from the base href,
     * and its nested {@code <param>} must be kept.
     */
    @Test
    public void testObjectExtraction() throws Exception {
        String html = "<html><head><title>Title</title><base href=\"http://domain.com\" /></head><body><p><object data=\"object.data\" type=\"text/html\"><param name=\"name\" value=\"value\" /></object></p></body></html>";
        StringWriter writer = new StringWriter();
        ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
        new JSoupParser().parse(input, makeHtmlTransformer(writer), new Metadata(), new ParseContext());
        String result = writer.toString();
        boolean objectFound = Pattern.matches("(?s).*<object data=\"http://domain.com/object.data\".*<param .*name=\"name\" value=\"value\"/>.*</object>.*$", result);
        Assertions.assertTrue(objectFound, "<object> tag not correctly found in:\n" + result);
    }

    /**
     * A Content-Type metadata value must be emitted as a {@code <meta>} element in the output,
     * while a metadata entry added with a {@code null} value must not produce one.
     */
    @Test
    public void testMetaTagHandling() throws Exception {
        String html = "<html><body><h1>header</h1><p>some text</p></body></html>";
        Metadata metadata = new Metadata();
        metadata.add("Content-Type", "text/html; charset=utf-8");
        metadata.add("Language", (String) null);

        StringWriter writer = new StringWriter();
        ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
        new JSoupParser().parse(input, makeHtmlTransformer(writer), metadata, new ParseContext());
        String result = writer.toString();

        // Content-Type meta element is present.
        Assertions.assertTrue(Pattern.matches("(?s).*<meta name=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>.*$", result));
        // No meta element for the null-valued Language entry.
        Assertions.assertFalse(Pattern.matches("(?s).*<meta name=\"Language\".*$", result));
    }

    /**
     * Disabled test for documents that wrongly nest {@code <frameset>} inside {@code <body>}.
     * It expects the frames to be emitted and no {@code <body>} element to appear in the output.
     */
    @Disabled("JSoup's dom has an empty body for these structures :(")
    @Test
    public void testBrokenFrameset() throws Exception {
        // Case 1: a single frame, resolved against the base href.
        String singleFrame = "<html><head><title>Title</title><base href=\"http://domain.com\" /></head><body><frameset><frame src=\"frame.html\" /></frameset></body></html>";
        StringWriter firstWriter = new StringWriter();
        new JSoupParser().parse(new ByteArrayInputStream(singleFrame.getBytes(StandardCharsets.UTF_8)), makeHtmlTransformer(firstWriter), new Metadata(), new ParseContext());
        String firstResult = firstWriter.toString();
        Assertions.assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", firstResult));
        Assertions.assertFalse(Pattern.matches("(?s).*<body>.*$", firstResult));

        // Case 2: nested framesets containing a stray closing tag.
        String nestedFrames = "<html><head><title> my title </title></head><body><frameset rows=\"20,*\"><frame src=\"top.html\"></frame><frameset cols=\"20,*\"><frame src=\"left.html\"></frame><frame src=\"invalid.html\"/></frame><frame src=\"right.html\"></frame></frameset></frameset></body></html>";
        StringWriter secondWriter = new StringWriter();
        new JSoupParser().parse(new ByteArrayInputStream(nestedFrames.getBytes(StandardCharsets.UTF_8)), makeHtmlTransformer(secondWriter), new Metadata(), new ParseContext());
        String secondResult = secondWriter.toString();
        Assertions.assertTrue(Pattern.matches("(?s).*<frame .* src=\"top.html\"/>.*$", secondResult));
        Assertions.assertTrue(Pattern.matches("(?s).*<frame .* src=\"left.html\"/>.*$", secondResult));
        Assertions.assertTrue(Pattern.matches("(?s).*<frame .* src=\"invalid.html\"/>.*$", secondResult));
        Assertions.assertTrue(Pattern.matches("(?s).*<frame .* src=\"right.html\"/>.*$", secondResult));
        Assertions.assertFalse(Pattern.matches("(?s).*<body>.*$", secondResult));
    }

    /**
     * Parses {@code boilerplate.html} and checks that the serialized output contains the
     * expected {@code <html>}, {@code <head>}, {@code <title>} and {@code <body>} structure.
     */
    @Test
    public void testBoilerplateDelegation() throws Exception {
        Metadata metadata = new Metadata();
        StringWriter writer = new StringWriter();
        // Close the resource stream once parsing is done instead of leaking it.
        try (InputStream stream = getResourceAsStream("/test-documents/boilerplate.html")) {
            new JSoupParser().parse(stream, makeHtmlTransformer(writer), metadata, new ParseContext());
        }
        String result = writer.toString();

        Assertions.assertTrue(Pattern.matches("(?s).*<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en\">.*</html>.*$", result));
        Assertions.assertTrue(Pattern.matches("(?s).*<head>.*</head>.*$", result));
        Assertions.assertTrue(Pattern.matches("(?s).*<title>Title</title>.*$", result));
        Assertions.assertTrue(Pattern.matches("(?s).*<body>.*</body>.*$", result));
    }

    /** A {@code <link href>} in the head must be serialized as an absolute URL built from the base href. */
    @Test
    public void testLinkHrefResolution() throws Exception {
        String html = "<html><head><title>Title</title><base href=\"http://domain.com\" /><link rel=\"next\" href=\"next.html\" /></head><body></body></html>";
        StringWriter writer = new StringWriter();
        ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
        new JSoupParser().parse(input, makeHtmlTransformer(writer), new Metadata(), new ParseContext());
        String result = writer.toString();
        Assertions.assertTrue(Pattern.matches("(?s).*<head>.*<link rel=\"next\" href=\"http://domain.com/next.html\"/>.*</head>.*$", result));
    }

    /**
     * Creates a SAX content handler that serializes the events it receives to the given writer,
     * using the {@code html} output method, no indentation and {@code utf-8} as the encoding
     * property.
     *
     * @param writer destination for the serialized markup
     * @return a transformer handler writing to {@code writer}
     * @throws Exception if the transformer handler cannot be created
     */
    private ContentHandler makeHtmlTransformer(Writer writer) throws Exception {
        SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        // Output property name/value pairs, applied in order.
        String[][] outputProperties = {
            {"method", "html"},
            {"indent", "no"},
            {"encoding", "utf-8"},
        };
        for (String[] property : outputProperties) {
            handler.getTransformer().setOutputProperty(property[0], property[1]);
        }
        handler.setResult(new StreamResult(writer));
        return handler;
    }

    /** Parsing {@code tika434.html} through the {@link Tika} facade must return non-null text. */
    @Test
    public void testPushback() throws IOException, TikaException {
        Tika tika = new Tika();
        InputStream stream = getResourceAsStream("/test-documents/tika434.html");
        String content = tika.parseToString(stream, new Metadata());
        Assertions.assertNotNull(content);
    }

    /**
     * With {@link IdentityHtmlMapper} registered in the parse context, an empty body must be
     * serialized as {@code <body/>}.
     */
    @Test
    public void testIdentityMapper() throws Exception {
        String html = "<html><head><title>Title</title></head><body></body></html>";
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();
        context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);

        StringWriter writer = new StringWriter();
        ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
        new JSoupParser().parse(input, makeHtmlTransformer(writer), metadata, context);
        String result = writer.toString();
        Assertions.assertTrue(Pattern.matches("(?s).*<body/>.*$", result));
    }

    /** A single list item must be rendered as a tab, the item text and two newlines. */
    @Test
    public void testNewlineAndIndent() throws Exception {
        String html = "<html><head><title>Title</title></head><body><ul><li>one</li></ul></body></html>";
        BodyContentHandler handler = new BodyContentHandler();
        ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
        new JSoupParser().parse(input, handler, new Metadata(), new ParseContext());
        String content = handler.toString();
        Assertions.assertTrue(Pattern.matches("\tone\n\n", content));
    }

    /**
     * The {@code lang} attribute of {@code <html>} must be copied to the Content-Language
     * metadata entry and kept on the serialized {@code <html>} element.
     */
    @Test
    public void testHtmlLanguage() throws Exception {
        String html = "<html lang=\"fr\"></html>";
        StringWriter writer = new StringWriter();
        Metadata metadata = new Metadata();
        ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
        new JSoupParser().parse(input, makeHtmlTransformer(writer), metadata, new ParseContext());

        Assertions.assertEquals("fr", metadata.get("Content-Language"));
        boolean langPreserved = Pattern.matches("(?s)<html[^>]* lang=\"fr\".*", writer.toString());
        Assertions.assertTrue(langPreserved, "Missing HTML lang attribute");
    }

    /**
     * OpenGraph {@code <meta property>} values must be copied to the metadata, and a repeated
     * property must become a multi-valued entry.
     */
    @Test
    public void testOpenGraphMetadata() throws Exception {
        String html = "<html><head><meta property=\"og:description\" content=\"some description\" /><meta property=\"og:image\" content=\"http://example.com/image1.jpg\" /><meta property=\"og:image\" content=\"http://example.com/image2.jpg\" /><title>hello</title></head><body></body></html>";
        Metadata metadata = new Metadata();
        ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(StandardCharsets.ISO_8859_1));
        new JSoupParser().parse(input, new BodyContentHandler(), metadata, new ParseContext());

        Assertions.assertEquals("some description", metadata.get("og:description"));
        Assertions.assertTrue(metadata.isMultiValued("og:image"));
    }

    /** Parsing {@code testUserDefinedCharset.mhtml} must return non-null text. */
    @Test
    public void testUserDefinedCharset() throws Exception {
        Tika tika = new Tika();
        InputStream stream = getResourceAsStream("/test-documents/testUserDefinedCharset.mhtml");
        String content = tika.parseToString(stream, new Metadata());
        Assertions.assertNotNull(content);
    }

    /**
     * Each of the four {@code testHTMLNoisyMetaEncoding_N.html} fixtures must decode to text
     * that contains the expected Arabic word.
     */
    @Test
    public void testNoisyMetaCharsetHeaders() throws Exception {
        final String expectedWord = "أعرب";
        Tika tika = new Tika();
        int fileNumber = 1;
        while (fileNumber <= 4) {
            String path = "/test-documents/testHTMLNoisyMetaEncoding_" + fileNumber + ".html";
            String content = tika.parseToString(getResourceAsStream(path));
            Assertions.assertTrue(content.contains(expectedWord), "testing: " + path);
            fileNumber++;
        }
    }

    /**
     * Disabled test that records the locator position reported while the text
     * "Test Indexation Html" is delivered, and expects line 24 and a column within 10 of 47.
     */
    @Disabled("jsoup doesn't seem to deal with locators?")
    @Test
    public void testLocator() throws Exception {
        // [0] = line number, [1] = column number of the matched text.
        final int[] textPosition = new int[2];

        ContentHandler handler = new ContentHandler() {
            Locator locator;

            @Override
            public void setDocumentLocator(Locator locator) {
                this.locator = locator;
            }

            @Override
            public void startDocument() throws SAXException {
            }

            @Override
            public void endDocument() throws SAXException {
            }

            @Override
            public void startPrefixMapping(String prefix, String uri) throws SAXException {
            }

            @Override
            public void endPrefixMapping(String prefix) throws SAXException {
            }

            @Override
            public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
            }

            @Override
            public void endElement(String uri, String localName, String qName) throws SAXException {
            }

            @Override
            public void characters(char[] ch, int start, int length) throws SAXException {
                String text = new String(ch, start, length);
                // Only record a position when the marker text is seen and a locator was supplied.
                if (text.contains("Test Indexation Html") && locator != null) {
                    textPosition[0] = locator.getLineNumber();
                    textPosition[1] = locator.getColumnNumber();
                }
            }

            @Override
            public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
            }

            @Override
            public void processingInstruction(String target, String data) throws SAXException {
            }

            @Override
            public void skippedEntity(String name) throws SAXException {
            }
        };

        // Close the resource stream once parsing is done instead of leaking it.
        try (InputStream stream = getResourceAsStream("/test-documents/testHTML.html")) {
            new JSoupParser().parse(stream, handler, new Metadata(), new ParseContext());
        }

        Assertions.assertEquals(24, textPosition[0]);
        Assertions.assertTrue(Math.abs(textPosition[1] - 47) < 10);
    }

    /** When a document contains two {@code <title>} elements, the metadata title must be the first one. */
    @Test
    public void testFirstTitleValueisSetToMetadata() throws Exception {
        String html = "<html><title>Simple Content</title><body><h1></h1><title>TitleToIgnore</title></body></html>";
        Metadata metadata = new Metadata();
        ByteArrayInputStream input = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
        new JSoupParser().parse(input, new BodyContentHandler(), metadata, new ParseContext());
        Assertions.assertEquals("Simple Content", metadata.get(TikaCoreProperties.TITLE));
    }

    /**
     * A misleading http-equiv content-type must be recorded as the content type hint, while the
     * reported Content-Type stays {@code text/html; charset=ISO-8859-1}. Covers an unknown
     * charset, a non-HTML media type, and a tag with two {@code content} attributes.
     */
    @Test
    public void testMisleadingMetaContentTypeTags() throws Exception {
        final String expectedContentType = "text/html; charset=ISO-8859-1";

        // Unknown charset name.
        String bogusCharset = "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-ELEVEN\"></head><title>title</title><body>body</body></html>";
        Metadata first = new Metadata();
        new JSoupParser().parse(new ByteArrayInputStream(bogusCharset.getBytes(StandardCharsets.UTF_8)), new BodyContentHandler(), first, new ParseContext());
        Assertions.assertEquals("text/html; charset=UTF-ELEVEN", first.get(TikaCoreProperties.CONTENT_TYPE_HINT));
        Assertions.assertEquals(expectedContentType, first.get("Content-Type"));

        // Media type that is not HTML.
        String wrongMediaType = "<html><head><meta http-equiv=\"content-type\" content=\"application/pdf\"></head><title>title</title><body>body</body></html>";
        Metadata second = new Metadata();
        new JSoupParser().parse(new ByteArrayInputStream(wrongMediaType.getBytes(StandardCharsets.UTF_8)), new BodyContentHandler(), second, new ParseContext());
        Assertions.assertEquals("application/pdf", second.get(TikaCoreProperties.CONTENT_TYPE_HINT));
        Assertions.assertEquals(expectedContentType, second.get("Content-Type"));

        // Two content attributes on the same tag: the first value is the expected hint.
        String duplicateContent = "<html><head><meta http-equiv=\"content-type\" content=\"application/pdf\" content=\"application/ms-word\"></head><title>title</title><body>body</body></html>";
        Metadata third = new Metadata();
        new JSoupParser().parse(new ByteArrayInputStream(duplicateContent.getBytes(StandardCharsets.UTF_8)), new BodyContentHandler(), third, new ParseContext());
        Assertions.assertEquals("application/pdf", third.get(TikaCoreProperties.CONTENT_TYPE_HINT));
        Assertions.assertEquals(expectedContentType, third.get("Content-Type"));
    }

    /**
     * Runs two XHTML documents through {@code AUTO_DETECT_PARSER}: one declaring
     * {@code charset=iso-8859-1} and one declaring the made-up {@code charset=iso-NUMBER_SEVEN}.
     *
     * <p>In both cases the declared meta value is expected under
     * {@link TikaCoreProperties#CONTENT_TYPE_HINT} and {@code Content-Type} is expected to be
     * {@code application/xhtml+xml; charset=ISO-8859-1}.
     */
    @Test
    public void testXHTMLWithMisleading() throws Exception {
        // Document whose declared charset is a real one.
        String knownCharsetDoc = "<?xml version=\"1.0\" ?><!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\" />\n<title>title</title></head><body>body</body></html>";
        Metadata knownCharsetMeta = new Metadata();
        InputStream knownCharsetStream = new ByteArrayInputStream(knownCharsetDoc.getBytes(StandardCharsets.UTF_8));
        AUTO_DETECT_PARSER.parse(knownCharsetStream, new BodyContentHandler(), knownCharsetMeta, new ParseContext());
        Assertions.assertEquals("text/html; charset=iso-8859-1", knownCharsetMeta.get(TikaCoreProperties.CONTENT_TYPE_HINT));
        Assertions.assertEquals("application/xhtml+xml; charset=ISO-8859-1", knownCharsetMeta.get("Content-Type"));

        // Document whose declared charset does not exist.
        String bogusCharsetDoc = "<?xml version=\"1.0\" ?><!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-NUMBER_SEVEN\" />\n<title>title</title></head><body>body</body></html>";
        Metadata bogusCharsetMeta = new Metadata();
        InputStream bogusCharsetStream = new ByteArrayInputStream(bogusCharsetDoc.getBytes(StandardCharsets.UTF_8));
        AUTO_DETECT_PARSER.parse(bogusCharsetStream, new BodyContentHandler(), bogusCharsetMeta, new ParseContext());
        Assertions.assertEquals("text/html; charset=iso-NUMBER_SEVEN", bogusCharsetMeta.get(TikaCoreProperties.CONTENT_TYPE_HINT));
        Assertions.assertEquals("application/xhtml+xml; charset=ISO-8859-1", bogusCharsetMeta.get("Content-Type"));
    }

    /**
     * Checks that a {@code <script src=...>} reference is reported both when the script
     * element sits in the document body and when it sits in the head.
     */
    @Test
    public void testScriptSrc() throws Exception {
        final String scriptUrl = "http://domain.com/logic.js";

        String scriptInBody = "<html><body><script src=\"" + scriptUrl + "\"></script></body></html>";
        assertScriptLink(scriptInBody, scriptUrl);

        String scriptInHead = "<html><head><script src=\"" + scriptUrl + "\"></script></head></html>";
        assertScriptLink(scriptInHead, scriptUrl);
    }

    /**
     * Parses {@code str} with {@link IdentityHtmlMapper} registered as the {@link HtmlMapper}
     * and asserts that exactly one {@code script} start element carrying a {@code src}
     * attribute is reported, and that the attribute's value equals {@code str2}.
     *
     * @param str  HTML document to parse (encoded as UTF-8 before parsing)
     * @param str2 expected value of the single {@code src} attribute
     * @throws Exception if parsing fails
     */
    private void assertScriptLink(String str, String str2) throws Exception {
        ParseContext context = new ParseContext();
        context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
        Metadata metadata = new Metadata();
        metadata.set("Content-Type", "text/html");

        final List<String> scriptSources = new ArrayList<>();
        new JSoupParser().parse(new ByteArrayInputStream(str.getBytes(StandardCharsets.UTF_8)), new DefaultHandler() {
            @Override
            public void startElement(String uri, String localName, String qName, Attributes attributes) {
                if (!"script".equals(qName)) {
                    return;
                }
                // Look the attribute up once; script elements without a src are ignored.
                String src = attributes.getValue("", "src");
                if (src != null) {
                    scriptSources.add(src);
                }
            }
        }, metadata, context);

        Assertions.assertEquals(1, scriptSources.size());
        Assertions.assertEquals(str2, scriptSources.get(0));
    }

    /**
     * Parses {@code /test-documents/testHTML_head.html} with {@link IdentityHtmlMapper}
     * registered as the {@link HtmlMapper}, counts every start element by qualified name,
     * and checks the expected number of {@code title}, {@code meta}, {@code link} and
     * {@code script} elements.
     */
    @Test
    public void testAllHeadElements() throws Exception {
        ParseContext context = new ParseContext();
        context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
        Metadata metadata = new Metadata();
        metadata.set("Content-Type", "text/html");

        final Map<String, Integer> elementCounts = new HashMap<>();
        // try-with-resources closes the stream exactly once, whether or not parsing throws.
        try (InputStream stream = getResourceAsStream("/test-documents/testHTML_head.html")) {
            new JSoupParser().parse(stream, new DefaultHandler() {
                @Override
                public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
                    elementCounts.merge(qName, 1, Integer::sum);
                }
            }, metadata, context);
        }

        // getOrDefault turns a missing element into an assertion failure rather than an unboxing NPE.
        Assertions.assertEquals(1, elementCounts.getOrDefault("title", 0).intValue());
        Assertions.assertEquals(11, elementCounts.getOrDefault("meta", 0).intValue());
        Assertions.assertEquals(12, elementCounts.getOrDefault("link", 0).intValue());
        Assertions.assertEquals(6, elementCounts.getOrDefault("script", 0).intValue());
    }

    /**
     * Builds a UTF-8 document whose head contains a commented-out meta tag declaring
     * ISO-8859-1 followed by a live meta tag declaring utf-8, and expects the Chinese body
     * text to appear intact in the XML produced by {@code AUTO_DETECT_PARSER}.
     *
     * <p>10,000 spaces are inserted between the head and the body.
     * NOTE(review): presumably this pushes the body past the encoding detector's
     * look-ahead window; confirm against the detector implementation.
     */
    @Test
    public void testSkippingCommentsInEncodingDetection() throws Exception {
        StringBuilder padding = new StringBuilder();
        int remaining = 10000;
        while (remaining-- > 0) {
            padding.append(" ");
        }

        String html = "<html><head><!--<meta http-equiv=\"Content-Type\" content=\"text/html; charset=ISO-8859-1\"> -->\n   <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" /></head>" + padding.toString() + "<body>有什么需要我帮你的</body></html>";
        byte[] utf8Bytes = html.getBytes(StandardCharsets.UTF_8);

        String xml = getXML(new ByteArrayInputStream(utf8Bytes), AUTO_DETECT_PARSER, new Metadata()).xml;
        assertContains("有什么需要我帮你的", xml);
    }

    /**
     * Expects the XML generated for {@code testHTMLBadScript.html} to contain
     * {@code "This is a test"} and not to contain {@code "cool"}.
     * Disabled until TIKA-1896 is fixed.
     */
    @Disabled("until we fix TIKA-1896")
    @Test
    public void testBadScript() throws Exception {
        final String xhtml = getXML("testHTMLBadScript.html").xml;

        assertContains("This is a test", xhtml);
        assertNotContained("cool", xhtml);
    }

    /**
     * Expects the XML generated for {@code testHTMLGoodScript.html} to contain
     * {@code "This is a test"} and not to contain {@code "cool"}.
     */
    @Test
    public void testGoodScript() throws Exception {
        final String xhtml = getXML("testHTMLGoodScript.html").xml;

        assertContains("This is a test", xhtml);
        assertNotContained("cool", xhtml);
    }

    /**
     * Expects the XML generated for {@code testHTML_script_in_body.html} to contain
     * {@code "This is a test"} and not to contain {@code "cool"}.
     */
    @Test
    public void testScriptInBody() throws Exception {
        final String xhtml = getXML("testHTML_script_in_body.html").xml;

        assertContains("This is a test", xhtml);
        assertNotContained("cool", xhtml);
    }

    /**
     * Parses {@code testHTMLGoodScript.html} with a {@link JSoupParser} that has script
     * extraction switched on and expects two metadata entries: the first without
     * {@code "cool"} in its content, and the second typed {@code MACRO} with
     * {@code "cool"} in its content.
     */
    @Test
    public void testExtractScript() throws Exception {
        JSoupParser parser = new JSoupParser();
        parser.setExtractScripts(true);

        List<Metadata> metadataList = getRecursiveMetadata("testHTMLGoodScript.html", parser, BasicContentHandlerFactory.HANDLER_TYPE.TEXT);
        Assertions.assertEquals(2, metadataList.size());

        Metadata container = metadataList.get(0);
        Metadata script = metadataList.get(1);
        Assertions.assertEquals("MACRO", script.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
        assertContains("cool", script.get(TikaCoreProperties.TIKA_CONTENT));
        assertNotContained("cool", container.get(TikaCoreProperties.TIKA_CONTENT));
    }

    /**
     * Parses {@code testHTMLGoodScript.html} with an {@link AutoDetectParser} built from
     * {@code /org/apache/tika/parser/html/tika-config.xml} and expects two metadata
     * entries: the first without {@code "cool"} in its content, and the second typed
     * {@code MACRO} with {@code "cool"} in its content.
     */
    @Test
    public void testConfigExtractScript() throws Exception {
        try (InputStream configStream = getResourceAsStream("/org/apache/tika/parser/html/tika-config.xml")) {
            Assertions.assertNotNull(configStream);

            List<Metadata> metadataList = getRecursiveMetadata("testHTMLGoodScript.html", new AutoDetectParser(new TikaConfig(configStream)));
            Assertions.assertEquals(2, metadataList.size());

            Metadata container = metadataList.get(0);
            Metadata script = metadataList.get(1);
            Assertions.assertEquals("MACRO", script.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
            assertContains("cool", script.get(TikaCoreProperties.TIKA_CONTENT));
            assertNotContained("cool", container.get(TikaCoreProperties.TIKA_CONTENT));
        }
    }

    /**
     * Loads every {@link EncodingDetector} service provider visible to the class loader of
     * {@link AutoDetectReader} and runs {@code testDetector} on each of them in turn.
     */
    @Test
    public void testMultiThreadingEncodingDetection() throws Exception {
        ServiceLoader loader = new ServiceLoader(AutoDetectReader.class.getClassLoader());
        // Iterate over a private copy of the provider list, as the original code did.
        List<EncodingDetector> detectors = new ArrayList<>(loader.loadServiceProviders(EncodingDetector.class));
        for (EncodingDetector detector : detectors) {
            testDetector(detector);
        }
    }

    /**
     * Exercises {@code encodingDetector} from several threads at once.
     *
     * <p>Every {@code .txt} and {@code .html} file directly under {@code /test-documents}
     * is first run through {@link #getEncoding(EncodingDetector, Path)} on the calling
     * thread and the result is recorded per file. The files are then placed on a shared
     * queue and {@code fileCount + 1} {@code EncodingDetectorRunner} tasks are started on a
     * pool of the same size, each receiving the queue, the recorded results and the
     * detector. Every task must finish with the result {@code "done"}.
     * NOTE(review): the runner presumably compares concurrent detections against the
     * recorded results; its body is declared elsewhere in this file.
     *
     * @param encodingDetector detector under test
     * @throws Exception if detection fails or a runner task terminates abnormally
     */
    private void testDetector(EncodingDetector encodingDetector) throws Exception {
        Path testDocs = Paths.get(getResourceAsUri("/test-documents"));
        List<Path> candidates = new ArrayList<>();
        // Declared with concrete types because both are passed to EncodingDetectorRunner,
        // whose constructor signature is not visible from here.
        ConcurrentHashMap<Path, String> referenceEncodings = new ConcurrentHashMap<>();

        File[] files = testDocs.toFile().listFiles();
        Assertions.assertNotNull(files, "no test docs??");
        for (File file : files) {
            String fileName = file.getName();
            if (fileName.endsWith(".txt") || fileName.endsWith(".html")) {
                Path candidate = file.toPath();
                String encoding = getEncoding(encodingDetector, candidate);
                candidates.add(candidate);
                referenceEncodings.put(candidate, encoding);
            }
        }

        // ArrayBlockingQueue rejects a capacity of zero with an IllegalArgumentException;
        // fail with a readable message instead.
        Assertions.assertFalse(candidates.isEmpty(), "no .txt or .html test docs??");
        ArrayBlockingQueue<Path> queue = new ArrayBlockingQueue<>(candidates.size());
        queue.addAll(candidates);

        int numThreads = queue.size() + 1;
        java.util.concurrent.ExecutorService pool = Executors.newFixedThreadPool(numThreads);
        try {
            ExecutorCompletionService<String> completionService = new ExecutorCompletionService<>(pool);
            for (int i = 0; i < numThreads; i++) {
                completionService.submit(new EncodingDetectorRunner(queue, referenceEncodings, encodingDetector));
            }
            // Exactly numThreads tasks were submitted, so take() can return at most that many
            // times. Checking each result here fails fast instead of blocking forever on a
            // further take() when a task finishes with an unexpected value.
            for (int finished = 0; finished < numThreads; finished++) {
                Future<String> result = completionService.take();
                Assertions.assertEquals("done", result.get());
            }
        } finally {
            // Release the worker threads; without this every call leaks numThreads threads.
            pool.shutdownNow();
        }
    }

    /**
     * Runs {@code encodingDetector} on the file at {@code path}.
     *
     * @param encodingDetector detector to run
     * @param path             file whose encoding should be detected
     * @return the detected charset's {@code toString()} value, or the literal string
     *         {@code "NULL"} when the detector returns {@code null}
     * @throws IOException if opening the file, detecting, or closing the stream fails
     */
    public String getEncoding(EncodingDetector encodingDetector, Path path) throws IOException {
        try (TikaInputStream stream = TikaInputStream.get(path)) {
            Charset detected = encodingDetector.detect(stream, new Metadata());
            return detected == null ? "NULL" : detected.toString();
        }
    }

    /**
     * Expects {@code "This is a sample text"} to appear in the XML generated for both the
     * {@code utf8} and the {@code utf16le} charset test documents.
     */
    @Test
    public void testCharsetsNotSupportedByIANA() throws Exception {
        String[] documents = {"testHTML_charset_utf8.html", "testHTML_charset_utf16le.html"};
        for (String document : documents) {
            String xml = getXML(document).xml;
            assertContains("This is a sample text", xml);
        }
    }

    /**
     * Parses {@code testHTML_embedded_data_uri_js.html} twice.
     *
     * <p>With the default parser a single metadata entry is expected and its content must
     * not contain the script text. With an {@link AutoDetectParser} built from
     * {@code /org/apache/tika/parser/html/tika-config.xml} two entries are expected and the
     * second one must contain the script text.
     */
    @Test
    public void testSkippingDataURIInScriptNode() throws Exception {
        List<Metadata> defaultResult = getRecursiveMetadata("testHTML_embedded_data_uri_js.html");
        Assertions.assertEquals(1, defaultResult.size());
        assertNotContained("alert( 'Hello, world!' );", defaultResult.get(0).get(TikaCoreProperties.TIKA_CONTENT));

        try (InputStream configStream = getResourceAsStream("/org/apache/tika/parser/html/tika-config.xml")) {
            Assertions.assertNotNull(configStream);

            List<Metadata> configuredResult = getRecursiveMetadata("testHTML_embedded_data_uri_js.html", new AutoDetectParser(new TikaConfig(configStream)));
            Assertions.assertEquals(2, configuredResult.size());
            assertContains("alert( 'Hello, world!' );", configuredResult.get(1).get(TikaCoreProperties.TIKA_CONTENT));
        }
    }

    /**
     * Checks the metadata of {@code testHTML_metadata.html}: the description, keywords,
     * title and author values must each be readable both through the corresponding Tika
     * property and through the plain key ({@code description}, {@code keywords},
     * {@code title}, {@code author}).
     */
    @Test
    public void testMetadataMapping() throws Exception {
        List<Metadata> metadataList = getRecursiveMetadata("testHTML_metadata.html");
        Metadata metadata = metadataList.get(0);

        Assertions.assertEquals("Free Web tutorials", metadata.get(TikaCoreProperties.DESCRIPTION));
        Assertions.assertEquals("Free Web tutorials", metadata.get("description"));

        Assertions.assertEquals("HTML,CSS,XML,JavaScript", metadata.get(TikaCoreProperties.SUBJECT));
        Assertions.assertEquals("HTML,CSS,XML,JavaScript", metadata.get("keywords"));
        Assertions.assertEquals("HTML,CSS,XML,JavaScript", metadata.get(Office.KEYWORDS));

        Assertions.assertEquals("OldMetaTitle", metadata.get(TikaCoreProperties.TITLE));
        Assertions.assertEquals("OldMetaTitle", metadata.get("title"));

        Assertions.assertEquals("John Doe", metadata.get(TikaCoreProperties.CREATOR));
        Assertions.assertEquals("John Doe", metadata.get("author"));
    }

    /**
     * For {@code testHTML_metadata_two_titles.html}, expects
     * {@link TikaCoreProperties#TITLE} to be {@code "ActualTitle"} while the plain
     * {@code title} key still holds {@code "OldMetaTitle"}.
     */
    @Test
    public void testPreferenceForTitleElement() throws Exception {
        List<Metadata> metadataList = getRecursiveMetadata("testHTML_metadata_two_titles.html");
        Metadata first = metadataList.get(0);

        Assertions.assertEquals("ActualTitle", first.get(TikaCoreProperties.TITLE));
        Assertions.assertEquals("OldMetaTitle", first.get("title"));
    }

    /**
     * Obtains the file path backing a {@link TikaInputStream} wrapped around
     * {@code /test-documents/testHTML.html}, parses the stream, and asserts that the path
     * still refers to a regular file afterwards.
     * NOTE(review): presumably the path is a temporary spool file that disappears when the
     * stream is closed, so this guards against the parser closing its input; confirm
     * against {@code TikaInputStream}.
     */
    @Test
    public void testStreamNotClosed() throws Exception {
        Metadata metadata = new Metadata();
        try (TikaInputStream stream = TikaInputStream.get(getResourceAsStream("/test-documents/testHTML.html"))) {
            // Must be fetched before parsing so there is a file to check afterwards.
            Path backingFile = stream.getPath();

            new JSoupParser().parse(stream, new WriteOutContentHandler(), metadata, new ParseContext());

            Assertions.assertTrue(Files.isRegularFile(backingFile));
        }
    }
}
