package org.apache.tika.parser.ocr;

import java.io.File;
import java.io.InputStream;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.config.TikaTaskTimeout;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.image.BPGParser;
import org.apache.tika.parser.image.HeifParser;
import org.apache.tika.parser.image.ICNSParser;
import org.apache.tika.parser.image.ImageParser;
import org.apache.tika.parser.image.JpegParser;
import org.apache.tika.parser.image.PSDParser;
import org.apache.tika.parser.image.TiffParser;
import org.apache.tika.parser.image.WebPParser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Assumptions;
import org.junit.jupiter.api.Test;

/**
 * Tests for {@link TesseractOCRParser}.
 *
 * <p>Tests that actually run OCR are guarded by JUnit assumptions on {@link #canRun()} (and, for
 * the rotation tests, on {@code hasImageMagick()}), so they are skipped rather than failed when
 * the required external tools are not available.
 */
public class TesseractOCRParserTest extends TikaTest {

    /** Matches "The" and "quick" separated by 5 to 20 whitespace characters. */
    private static final Pattern WIDE_INTERWORD_SPACING = Pattern.compile("The\\s{5,20}quick");

    /**
     * Reports whether a freshly initialized {@link TesseractOCRParser} can find tesseract.
     *
     * @return the value of {@code hasTesseract()} after initializing with an empty parameter map
     * @throws TikaConfigException if parser initialization fails
     */
    public static boolean canRun() throws TikaConfigException {
        TesseractOCRParser parser = new TesseractOCRParser();
        parser.initialize(Collections.emptyMap());
        return parser.hasTesseract();
    }

    /**
     * Checks that the default output contains "The quick", and that with
     * {@code preserveInterwordSpacing} enabled the two words are separated by a wide run of
     * whitespace.
     */
    @Test
    public void testInterwordSpacing() throws Exception {
        Assumptions.assumeTrue(canRun(), "can run OCR");

        // default settings
        String defaultXml = getXML("testOCR_spacing.png", getMetadata(MediaType.image("png"))).xml;
        assertContains("The quick", defaultXml);

        // now with interword spacing preserved
        TesseractOCRConfig config = new TesseractOCRConfig();
        config.setPreserveInterwordSpacing(true);
        String spacedXml = getXML("testOCR_spacing.png", getMetadata(MediaType.image("png")),
                contextWith(config)).xml;
        Assertions.assertTrue(WIDE_INTERWORD_SPACING.matcher(spacedXml).find());
    }

    /**
     * Builds metadata whose content-type parser override is the given media type with
     * {@code "OCR-"} prepended to its subtype.
     *
     * @param mediaType the underlying image media type
     * @return a new {@link Metadata} carrying only the parser override
     */
    private Metadata getMetadata(MediaType mediaType) {
        MediaType ocrType = new MediaType(mediaType.getType(), "OCR-" + mediaType.getSubtype());
        Metadata metadata = new Metadata();
        metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, ocrType.toString());
        return metadata;
    }

    /**
     * Strips a leading {@code "ocr-"} from the subtype of the given media type, if present.
     *
     * @param mediaType the media type to normalize
     * @return a new media type with the same type and the de-prefixed subtype
     */
    private MediaType deOCR(MediaType mediaType) {
        String subtype = mediaType.getSubtype();
        if (subtype.startsWith("ocr-")) {
            subtype = subtype.substring("ocr-".length());
        }
        return new MediaType(mediaType.getType(), subtype);
    }

    /** Wraps the given OCR config in a fresh {@link ParseContext}. */
    private static ParseContext contextWith(TesseractOCRConfig config) {
        ParseContext context = new ParseContext();
        context.set(TesseractOCRConfig.class, config);
        return context;
    }

    /**
     * Loads a {@link TikaConfig} from the given classpath resource and looks up the
     * {@link TesseractOCRParser} inside its parser. The resource stream is always closed.
     *
     * @param configResource classpath location of the tika config XML
     * @return whatever {@code findParser} returns for {@link TesseractOCRParser}; callers
     *         assert that it is non-null
     */
    private TesseractOCRParser loadTesseractParser(String configResource) throws Exception {
        try (InputStream is = getResourceAsStream(configResource)) {
            TikaConfig tikaConfig = new TikaConfig(is);
            // NOTE(review): findParser's declared return type is not visible from this file;
            // the explicit cast is harmless if it is already typed.
            return (TesseractOCRParser) findParser(tikaConfig.getParser(), TesseractOCRParser.class);
        }
    }

    /** Checks that text from the second page of a multi-page TIFF appears in the output. */
    @Test
    public void confirmMultiPageTiffHandling() throws Exception {
        Assumptions.assumeTrue(canRun(), "can run OCR");
        String xml = getXML("testTIFF_multipage.tif", getMetadata(MediaType.image("tiff"))).xml;
        assertContains("Page 2", xml);
    }

    /** Checks that setting {@code skipOcr} in the parse context suppresses the OCR text. */
    @Test
    public void confirmRuntimeSkipOCR() throws Exception {
        Assumptions.assumeTrue(canRun(), "can run OCR");
        TesseractOCRConfig config = new TesseractOCRConfig();
        config.setSkipOcr(true);
        String xml = getXML("testTIFF_multipage.tif", getMetadata(MediaType.image("tiff")),
                contextWith(config)).xml;
        assertNotContained("Page 2", xml);
    }

    /** Checks rotation detection and text extraction for an image rotated by +10 degrees. */
    @Test
    public void testPositiveRotateOCR() throws Exception {
        Assumptions.assumeTrue(canRun());
        Assumptions.assumeTrue(new TesseractOCRParser().hasImageMagick());
        assertRotatedOcr("testRotated+10.png", 10.0d);
    }

    /** Checks rotation detection and text extraction for an image rotated by -10 degrees. */
    @Test
    public void testNegativeRotateOCR() throws Exception {
        // Assumptions first, in the same order as testPositiveRotateOCR.
        Assumptions.assumeTrue(canRun());
        Assumptions.assumeTrue(new TesseractOCRParser().hasImageMagick());
        assertRotatedOcr("testRotated-10.png", -10.0d);
    }

    /**
     * Runs OCR with rotation enabled on the given test file and asserts the reported
     * ImageMagick flag, the reported rotation (within 0.01) and the extracted text.
     */
    private void assertRotatedOcr(String testFile, double expectedRotation) throws Exception {
        TesseractOCRConfig config = new TesseractOCRConfig();
        config.setApplyRotation(true);
        config.setResize(100);
        Metadata metadata = getMetadata(MediaType.image("png"));
        String text = getText(testFile, metadata, contextWith(config));
        Assertions.assertEquals("true", metadata.get(TesseractOCRParser.IMAGE_MAGICK));
        Assertions.assertEquals(expectedRotation,
                Double.parseDouble(metadata.get(TesseractOCRParser.IMAGE_ROTATION)), 0.01d);
        assertContains("Its had resolving otherwise she contented therefore", text);
    }

    /** Checks that values from the TIKA-2705 config file end up in the parser's default config. */
    @Test
    public void testConfig() throws Exception {
        TesseractOCRParser parser = loadTesseractParser("/test-configs/TIKA-2705-tesseract.xml");
        Assertions.assertNotNull(parser);
        TesseractOCRConfig defaultConfig = parser.getDefaultConfig();
        Assertions.assertEquals(241, defaultConfig.getTimeoutSeconds());
        Assertions.assertEquals(TesseractOCRConfig.OUTPUT_TYPE.HOCR, defaultConfig.getOutputType());
        Assertions.assertEquals("ceb", defaultConfig.getLanguage());
        Assertions.assertFalse(defaultConfig.isApplyRotation());
    }

    /**
     * Checks that a {@link TikaTaskTimeout} of 50 in the parse context makes the parse fail
     * with a {@link TikaException} whose message mentions "timeout".
     */
    @Test
    public void testTimeoutOverride() throws Exception {
        Assumptions.assumeTrue(canRun(), "can run OCR");
        // try-with-resources closes the stream on the expected-exception path as well.
        try (InputStream is = getResourceAsStream("/test-configs/TIKA-3582-tesseract.xml")) {
            AutoDetectParser parser = new AutoDetectParser(new TikaConfig(is));
            Metadata metadata = new Metadata();
            ParseContext context = new ParseContext();
            context.set(TikaTaskTimeout.class, new TikaTaskTimeout(50L));
            getXML("testRotated+10.png", parser, metadata, context);
            Assertions.fail("should have thrown a timeout");
        } catch (TikaException e) {
            assertContains("timeout", e.getMessage());
        }
    }

    /** Checks the PSM0 metadata values reported for the rotated test image. */
    @Test
    public void testPSM0() throws Exception {
        Assumptions.assumeTrue(canRun(), "can run OCR");
        try (InputStream is = getResourceAsStream("/test-configs/tika-config-psm0.xml")) {
            AutoDetectParser parser = new AutoDetectParser(new TikaConfig(is));
            Metadata metadata = new Metadata();
            getXML("testRotated+10.png", parser, metadata);
            Assertions.assertEquals(0, metadata.getInt(TesseractOCRParser.PSM0_PAGE_NUMBER));
            Assertions.assertEquals(180, metadata.getInt(TesseractOCRParser.PSM0_ORIENTATION));
            Assertions.assertEquals(180, metadata.getInt(TesseractOCRParser.PSM0_ROTATE));
            Assertions.assertEquals(5.71d,
                    Double.parseDouble(metadata.get(TesseractOCRParser.PSM0_ORIENTATION_CONFIDENCE)), 0.1d);
            Assertions.assertEquals(0.83d,
                    Double.parseDouble(metadata.get(TesseractOCRParser.PSM0_SCRIPT_CONFIDENCE)), 0.1d);
            Assertions.assertEquals("Latin", metadata.get(TesseractOCRParser.PSM0_SCRIPT));
        }
    }

    /**
     * Checks that the configured parser reports at least one language, and that requesting the
     * language "zzz" makes parsing fail with a {@link TikaException}.
     */
    @Test
    public void testPreloadLangs() throws Exception {
        Assumptions.assumeTrue(canRun());
        TikaConfig tikaConfig;
        try (InputStream is = getResourceAsStream("/test-configs/tika-config-tesseract-load-langs.xml")) {
            tikaConfig = new TikaConfig(is);
        }
        TesseractOCRParser tesseract =
                (TesseractOCRParser) findParser(tikaConfig.getParser(), TesseractOCRParser.class);
        Assertions.assertNotNull(tesseract);
        Assertions.assertFalse(tesseract.getLangs().isEmpty());

        TesseractOCRConfig config = new TesseractOCRConfig();
        config.setLanguage("zzz");
        try {
            getRecursiveMetadata("testOCR_spacing.png", new AutoDetectParser(tikaConfig),
                    getMetadata(MediaType.image("png")), contextWith(config), false);
            Assertions.fail("should have thrown exception");
        } catch (TikaException expected) {
            // This is the outcome the test wants: the "zzz" language request must be rejected.
        }
    }

    /** Checks that extra tesseract parameters from the config file reach the default config. */
    @Test
    public void testArbitraryParams() throws Exception {
        TesseractOCRParser parser = loadTesseractParser("/test-configs/tika-config-tesseract-arbitrary.xml");
        Assertions.assertNotNull(parser);
        TesseractOCRConfig defaultConfig = parser.getDefaultConfig();
        Assertions.assertEquals("0.75", defaultConfig.getOtherTesseractConfig().get("textord_initialx_ile"));
        Assertions.assertEquals("0.15625", defaultConfig.getOtherTesseractConfig().get("textord_noise_hfract"));
    }

    /**
     * Prints (a) media types supported by the dedicated image parsers that have no matching
     * {@code ocr-} type in {@link TesseractOCRParser}, and (b) non-{@code ocr-} types that
     * {@link TesseractOCRParser} supports directly.
     *
     * <p>Not annotated with {@code @Test}; it has no assertions and only writes to standard out.
     */
    public void showCoverage() throws Exception {
        Parser[] imageParsers = {
            new BPGParser(), new HeifParser(), new ICNSParser(), new ImageParser(),
            new JpegParser(), new PSDParser(), new TiffParser(), new WebPParser()
        };
        Set<MediaType> imageParserTypes = new HashSet<>();
        for (Parser parser : imageParsers) {
            imageParserTypes.addAll(parser.getSupportedTypes(new ParseContext()));
        }

        Set<MediaType> tesseractDirectTypes = new HashSet<>();
        Set<MediaType> tesseractOcrTypes = new HashSet<>();
        for (MediaType mediaType : new TesseractOCRParser().getSupportedTypes(new ParseContext())) {
            if (mediaType.getSubtype().startsWith("ocr-")) {
                tesseractOcrTypes.add(deOCR(mediaType));
            } else {
                tesseractDirectTypes.add(mediaType);
            }
        }

        for (MediaType mediaType : imageParserTypes) {
            if (!tesseractOcrTypes.contains(mediaType)) {
                System.out.println("tesseract isn't currently configured to handle: " + mediaType);
            }
        }
        for (MediaType mediaType : tesseractDirectTypes) {
            System.out.println("We don't have dedicated image parsers for these formats, which are handled by tesseract: " + mediaType);
        }
    }

    /**
     * Checks that the tesseract, tessdata and ImageMagick path setters append
     * {@link File#separator} when it is missing, keep it when present, and leave the empty
     * string untouched.
     */
    @Test
    public void testTrailingSlashInPathBehavior() {
        TesseractOCRParser parser = new TesseractOCRParser();

        parser.setTesseractPath("blah");
        Assertions.assertEquals("blah" + File.separator, parser.getTesseractPath());
        parser.setTesseractPath("blah" + File.separator);
        Assertions.assertEquals("blah" + File.separator, parser.getTesseractPath());
        parser.setTesseractPath("");
        Assertions.assertEquals("", parser.getTesseractPath());

        parser.setTessdataPath("blahdata");
        Assertions.assertEquals("blahdata" + File.separator, parser.getTessdataPath());
        parser.setTessdataPath("blahdata" + File.separator);
        Assertions.assertEquals("blahdata" + File.separator, parser.getTessdataPath());
        parser.setTessdataPath("");
        Assertions.assertEquals("", parser.getTessdataPath());

        parser.setImageMagickPath("imagemagickpath");
        Assertions.assertEquals("imagemagickpath" + File.separator, parser.getImageMagickPath());
        parser.setImageMagickPath("imagemagickpath" + File.separator);
        Assertions.assertEquals("imagemagickpath" + File.separator, parser.getImageMagickPath());
        parser.setImageMagickPath("");
        Assertions.assertEquals("", parser.getImageMagickPath());
    }

    /** Checks that setting a non-existent tesseract path is accepted and normalized. */
    @Test
    public void testBogusPathCheck() {
        TesseractOCRParser parser = new TesseractOCRParser();
        parser.setTesseractPath("blahdeblahblah");
        Assertions.assertEquals("blahdeblahblah" + File.separator, parser.getTesseractPath());
    }

    /**
     * Checks that repeatedly initializing a parser with language preloading enabled yields the
     * same set of languages on each of 20 attempts.
     */
    @Test
    public void testThreadJoinInLoadingLangs() throws Exception {
        Assumptions.assumeTrue(canRun());
        Set<String> expectedLangs = getLangs();
        Assumptions.assumeTrue(!expectedLangs.isEmpty());
        for (int attempt = 0; attempt < 20; attempt++) {
            Assertions.assertEquals(expectedLangs, getLangs());
        }
    }

    /**
     * Creates a new parser with {@code preloadLangs} enabled, initializes it with an empty
     * parameter map and returns the languages it reports.
     */
    private Set<String> getLangs() throws Exception {
        TesseractOCRParser parser = new TesseractOCRParser();
        parser.setPreloadLangs(true);
        parser.initialize(Collections.emptyMap());
        return parser.getLangs();
    }
}
