package org.apache.tika.parser.html;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.SequenceInputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector;
import org.apache.tika.parser.html.charsetdetector.charsets.ReplacementCharset;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

/* loaded from: input_file:org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.class */
/**
 * Tests for {@link StandardHtmlEncodingDetector}.
 *
 * <p>Each test feeds a small HTML snippet (or a raw stream) to the detector and
 * checks which charset it reports. Unless noted otherwise the expected result
 * is windows-1252; a {@code null} expectation means the detector is expected
 * to report no charset at all.
 */
public class StandardHtmlEncodingDetectorTest {

    /** The charset most snippets in this class are expected to be detected as. */
    private static final Charset WINDOWS_1252 = Charset.forName("WINDOWS-1252");

    /** Metadata handed to the detector; replaced with a fresh instance before every test. */
    private Metadata metadata;

    @BeforeEach
    public void setUp() {
        this.metadata = new Metadata();
    }

    /** Unquoted {@code charset} attribute value. */
    @Test
    public void basic() throws IOException {
        assertWindows1252("<meta charset=WINDOWS-1252>");
    }

    /** Single-quoted {@code charset} attribute value. */
    @Test
    public void quoted() throws IOException {
        assertWindows1252("<meta charset='WINDOWS-1252'>");
    }

    /** With two meta declarations, the first one is the expected result. */
    @Test
    public void duplicateMeta() throws IOException {
        assertWindows1252("<meta charset='WINDOWS-1252'><meta charset='UTF-8'>");
    }

    /** With the attribute repeated inside one tag, the first value is the expected result. */
    @Test
    public void duplicateAttribute() throws IOException {
        assertWindows1252("<meta charset='WINDOWS-1252' charset='UTF-8'>");
    }

    /** An unknown charset name in the first meta is expected to yield no result, even if a valid one follows. */
    @Test
    public void invalidThenValid() throws IOException {
        assertCharset("<meta charset=blah><meta charset=WINDOWS-1252>", null);
    }

    /** Form feed, spaces and tabs around the {@code =} sign. */
    @Test
    public void spacesInAttributes() throws IOException {
        assertWindows1252("<meta charset\f=  \t  WINDOWS-1252>");
    }

    /** Charset declared through {@code http-equiv}/{@code content}, in either attribute order. */
    @Test
    public void httpEquiv() throws IOException {
        assertWindows1252("<meta http-equiv='content-type' content='text/html; charset=\"WINDOWS-1252\"'>");
        assertWindows1252("<meta content=' charset  =  WINDOWS-1252' http-equiv='content-type' >");
    }

    /** A trailing value-less attribute after the charset. */
    @Test
    public void emptyAttributeEnd() throws IOException {
        assertWindows1252("<meta charset=WINDOWS-1252 a>");
    }

    /** Two charsets inside one {@code content} value: the first is the expected result. */
    @Test
    public void httpEquivDuplicateCharset() throws IOException {
        assertWindows1252("<meta http-equiv='content-type' content='charset=WINDOWS-1252;charset=UTF-8'>");
    }

    /** Declaration preceded by a doctype and unclosed html/head tags. */
    @Test
    public void htmlFragment() throws IOException {
        assertWindows1252("<!doctype html><html class=nojs><head><meta charset='WINDOWS-1252'>");
    }

    /** Declaration preceded by heavily malformed markup. */
    @Test
    public void veryBadHtml() throws IOException {
        assertWindows1252("<< l \" == / '=x\n ><!--> < <x'/ <=> <meta/><meta><a x/><meta charset='WINDOWS-1252'>");
    }

    /** Declaration preceded by a {@code <?} construct containing an unbalanced quote. */
    @Test
    public void specialTag() throws IOException {
        assertWindows1252("<? x='><meta charset='WINDOWS-1252'>");
    }

    /** Declaration placed after as much filler as fits while keeping the document under 1024 characters. */
    @Test
    public void longHtml() throws IOException {
        final String filler = "<meta x='y' />\n";
        final String declaration = "<meta charset='windows-1252'>";
        StringBuilder html = new StringBuilder("<!doctype html>\n<html>\n<head>\n<title>Hello world</title>\n");
        // Stop padding as soon as one more filler tag plus the declaration would reach 1024 characters.
        while (html.length() + filler.length() + declaration.length() < 1024) {
            html.append(filler);
        }
        html.append(declaration);
        assertWindows1252(html.toString());
    }

    /** A declaration that only appears after one million NUL bytes is expected to go undetected. */
    @Test
    public void tooLong() throws IOException {
        String padding = new String(new byte[1000000], StandardCharsets.ISO_8859_1);
        assertCharset(padding + "<meta charset='windows-1252'>", null);
    }

    /** A meta tag that is never closed is expected to yield no result. */
    @Test
    public void incompleteMeta() throws IOException {
        assertCharset("<meta charset='WINDOWS-1252'", null);
    }

    /** Whitespace around the charset name inside the quotes. */
    @Test
    public void charsetWithWhiteSpaces() throws IOException {
        assertWindows1252("<meta charset='   \t\n  WINDOWS-1252 \t\n'>");
    }

    /** Mixed-case tag name, attribute name and charset label. */
    @Test
    public void mixedCase() throws IOException {
        assertWindows1252("<mEtA chArsEt='WInDOWs-1252'>");
    }

    /** A meta declaration of UTF-16BE is expected to be reported as UTF-8. */
    @Test
    public void utf16() throws IOException {
        assertCharset("<meta charset='UTF-16BE'>", StandardCharsets.UTF_8);
    }

    /** The label {@code x-user-defined} is expected to be reported as windows-1252. */
    @Test
    public void xUserDefined() throws IOException {
        assertWindows1252("<meta charset='x-user-defined'>");
    }

    /** The label {@code iso-2022-cn} is expected to map to {@link ReplacementCharset}. */
    @Test
    public void replacement() throws IOException {
        // The stream overload is used so the snippet is not encoded with the expected charset itself.
        assertCharset(
                new ByteArrayInputStream("<meta charset='iso-2022-cn'>".getBytes(StandardCharsets.ISO_8859_1)),
                new ReplacementCharset());
    }

    /** The label {@code iso-8859-1} is expected to be reported as windows-1252. */
    @Test
    public void iso88591() throws IOException {
        assertWindows1252("<meta charset='iso-8859-1'>");
    }

    /** The label {@code macintosh} is expected to be reported as the JDK's {@code x-MacRoman}. */
    @Test
    public void macintoshEncoding() throws IOException {
        assertCharset("<meta charset='macintosh'>", Charset.forName("x-MacRoman"));
    }

    /** A leading byte order mark is expected to win over a contradicting meta declaration. */
    @Test
    public void bom() throws IOException {
        assertBomTakesPrecedence();
    }

    /** A slash between the tag name and the attribute. */
    @Test
    public void withSlash() throws IOException {
        assertWindows1252("<meta/charset='WINDOWS-1252'>");
    }

    /** A charset mentioned inside a description's {@code content} (no http-equiv) is expected to be ignored. */
    @Test
    public void insideDescription() throws IOException {
        assertWindows1252("<meta name='description'content='If I write charset=UTF-8 here, it doesnt mean the page is in UTF-8'/><meta charset='WINDOWS-1252'>");
    }

    /** Meta text appearing inside another tag is expected to be ignored. */
    @Test
    public void insideTag() throws IOException {
        assertWindows1252("<tag attribute=\"<meta charset='UTF-8'>\" <meta charset='UTF-8' /><meta charset='WINDOWS-1252'>");
    }

    /** A {@code content} charset without an {@code http-equiv} attribute is expected to be ignored. */
    @Test
    public void missingAttribute() throws IOException {
        assertWindows1252("<meta content='charset=UTF-8'><meta charset='WINDOWS-1252'>");
    }

    /** A meta that follows {@code <?}, {@code <!} or {@code </} is expected to be ignored. */
    @Test
    public void insideSpecialTag() throws IOException {
        for (char opener : "?!/".toCharArray()) {
            assertWindows1252("<" + opener + "<meta charset='UTF-8'><meta charset='WINDOWS-1252'>");
        }
    }

    /** A space between {@code <} and {@code meta} is expected to make the tag be ignored. */
    @Test
    public void spaceBeforeTag() throws IOException {
        assertWindows1252("< meta charset='UTF-8'><meta charset='WINDOWS-1252'>");
    }

    /** An attribute whose name merely ends in "charset" is expected to be ignored. */
    @Test
    public void invalidAttribute() throws IOException {
        assertWindows1252("<meta badcharset='UTF-8' charset='WINDOWS-1252'>");
    }

    /** A charset value with an unmatched inner quote is expected to be skipped in favour of the next declaration. */
    @Test
    public void unmatchedQuote() throws IOException {
        assertWindows1252("<meta http-equiv='content-type' content='charset=\"UTF-8'><meta charset='WINDOWS-1252'>");
    }

    /** A truncated real-world page whose declaration comes after a script, a title and other meta tags. */
    @Test
    public void realWorld() throws IOException {
        assertWindows1252("<!DOCTYPE html>\n"
                + "<html lang=\"fr\">\n"
                + "<head>\n"
                + "<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':\n"
                + "\t\t\tnew Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],\n"
                + "\t\t\tj=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=\n"
                + "\t\t\t'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);\n"
                + "\t\t\t})(window,document,'script','dataLayer','GTM-PNX8H8X');</script>\n"
                + "<title>Horaires Transilien 2018 - Lignes A B C D E H J K L N P R U</title>\n"
                + "<meta name=\"description\" content=\"Consultez les horaires du Transilien en temps réel. Lignes A et B du RER. Lignes C D E H J K L N P R U du Transilien.\">\n"
                + "<meta name=\"keywords\" content=\"horaires transilien\">\n"
                + "<meta charset=\"windows-1252\">\n"
                + "<meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n"
                + "<meta name=\"robots\" content=\"follow, index\">\n"
                + "<base hr");
    }

    /** A meta inside a comment is expected to be ignored, and {@code <!-->} to count as a complete comment. */
    @Test
    public void withCompactComment() throws IOException {
        assertWindows1252("<!--<meta charset='UTF-8'>--><!--><meta charset='WINDOWS-1252'>");
    }

    /**
     * With a charset in the Content-Type metadata, that charset is expected to win
     * over any meta declaration, while a byte order mark still wins over both.
     */
    @Test
    public void withCharsetInContentType() throws IOException {
        this.metadata.set("Content-Type", "text/html; Charset=ISO-8859-1");
        assertWindows1252("");
        assertWindows1252("<meta charset='UTF-8'>");
        assertWindows1252("<meta http-equiv='content-type' content='charset=utf-8'>");
        assertBomTakesPrecedence();
    }

    /**
     * Streams that throw once their content is exhausted: a complete declaration
     * read before the failure is still expected to be detected, and every
     * truncated input is expected to yield no result rather than an exception.
     */
    @Test
    public void throwResistance() throws IOException {
        assertWindows1252(throwAfter("<meta charset='WINDOWS-1252'>"));
        assertWindows1252(throwAfter("<meta charset='WINDOWS-1252'><some other tag"));
        assertCharset(throwAfter("<meta charset='WINDOWS-1252'"), null);
        assertCharset(throwAfter("<"), null);
        assertCharset(throwAfter("<!"), null);
        assertCharset(throwAfter("<!doctype"), null);
        assertCharset(throwAfter("<!doctype html><html"), null);
        assertCharset(throwAfter("<!doctype html><html attr"), null);
        assertCharset(throwAfter("<!doctype html><html attr="), null);
        assertCharset(throwAfter("<!doctype html><html attr=x"), null);
        assertCharset(throwAfter("<!doctype html><html attr='x"), null);
    }

    /** After detection, the stream must still deliver all of its bytes from the beginning. */
    @Test
    public void streamReset() throws IOException {
        byte[] expected = {0, 1, 2, 3, 4};
        byte[] actual = new byte[expected.length];
        InputStream stream = new ByteArrayInputStream(expected);
        detectCharset(stream);
        // Check the count explicitly so a short read is reported as such, not only as an array mismatch.
        int bytesRead = stream.read(actual);
        Assertions.assertEquals(expected.length, bytesRead, "all bytes should still be readable after detection");
        Assertions.assertArrayEquals(expected, actual);
    }

    /**
     * Asserts that a snippet starting with U+FEFF is detected as the charset it was
     * encoded with (UTF-8, UTF-16LE, UTF-16BE), despite its meta tag naming windows-1252.
     */
    private void assertBomTakesPrecedence() throws IOException {
        final String html = "\ufeff<meta charset='WINDOWS-1252'>";
        assertCharset(html, StandardCharsets.UTF_8);
        assertCharset(html, StandardCharsets.UTF_16LE);
        assertCharset(html, StandardCharsets.UTF_16BE);
    }

    /** Asserts that the given HTML is detected as windows-1252. */
    private void assertWindows1252(String str) throws IOException {
        assertCharset(str, WINDOWS_1252);
    }

    /** Asserts that the given stream is detected as windows-1252. */
    private void assertWindows1252(InputStream inputStream) throws IOException {
        assertCharset(inputStream, WINDOWS_1252);
    }

    /**
     * Asserts that the given HTML is detected as {@code charset}.
     *
     * @param str     the HTML snippet; it is encoded with {@code charset}, or with UTF-8 when {@code charset} is null
     * @param charset the expected detection result, or {@code null} if none is expected
     */
    private void assertCharset(String str, Charset charset) throws IOException {
        Charset encoding = charset == null ? StandardCharsets.UTF_8 : charset;
        Charset detected = detectCharset(new ByteArrayInputStream(str.getBytes(encoding)));
        Assertions.assertEquals(charset, detected, str + " should be detected as " + charset);
    }

    /**
     * Asserts that the given stream is detected as {@code charset}.
     *
     * @param inputStream the raw bytes to run detection on
     * @param charset     the expected detection result, or {@code null} if none is expected
     */
    private void assertCharset(InputStream inputStream, Charset charset) throws IOException {
        Assertions.assertEquals(charset, detectCharset(inputStream), "stream should be detected as " + charset);
    }

    /** Runs a new {@link StandardHtmlEncodingDetector} on the stream, using this test's metadata. */
    private Charset detectCharset(InputStream inputStream) throws IOException {
        return new StandardHtmlEncodingDetector().detect(inputStream, this.metadata);
    }

    /**
     * Builds a buffered stream that yields the UTF-8 bytes of {@code str} and then
     * throws an {@link IOException} on the next single-byte read instead of reporting end of stream.
     */
    private InputStream throwAfter(String str) {
        InputStream content = new ByteArrayInputStream(str.getBytes(StandardCharsets.UTF_8));
        InputStream failing = new InputStream() {
            @Override
            public int read() throws IOException {
                throw new IOException("test exception");
            }
        };
        return new BufferedInputStream(new SequenceInputStream(content, failing));
    }
}
