package org.visallo.tikaTextExtractor;

import com.fasterxml.jackson.core.util.MinimalPrettyPrinter;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.inject.Inject;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.extractors.ArticleExtractor;
import de.l3s.boilerpipe.extractors.NumWordsRulesExtractor;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.text.Normalizer;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.Properties;
import org.apache.commons.io.IOUtils;
import org.apache.maven.scm.provider.svn.SvnTagBranchUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.external.ExternalParsersConfigReaderMetKeys;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.parser.pdf.VisalloParserConfig;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.SecureContentHandler;
import org.apache.xml.serialize.LineSeparator;
import org.json.JSONException;
import org.json.JSONObject;
import org.vertexium.Element;
import org.vertexium.Property;
import org.vertexium.Vertex;
import org.vertexium.Visibility;
import org.vertexium.mutation.ExistingElementMutation;
import org.vertexium.property.StreamingPropertyValue;
import org.visallo.core.ingest.graphProperty.GraphPropertyWorkData;
import org.visallo.core.ingest.graphProperty.GraphPropertyWorker;
import org.visallo.core.ingest.graphProperty.GraphPropertyWorkerPrepareData;
import org.visallo.core.model.Description;
import org.visallo.core.model.Name;
import org.visallo.core.model.properties.VisalloProperties;
import org.visallo.core.model.properties.types.LongVisalloProperty;
import org.visallo.core.util.VisalloLogger;
import org.visallo.core.util.VisalloLoggerFactory;
import org.visallo.tikaTextExtractor.TikaTextExtractorGraphPropertyWorkerConfiguration;
import org.xml.sax.SAXException;

@Name("Tika Text Extractor")
@Description("Uses Apache Tika to extract text")
/* loaded from: input_file:org/visallo/tikaTextExtractor/TikaTextExtractorGraphPropertyWorker.class */
public class TikaTextExtractorGraphPropertyWorker extends GraphPropertyWorker {
    /** Logger shared by every instance of this worker. */
    private static final VisalloLogger LOGGER = VisalloLoggerFactory.getLogger(TikaTextExtractorGraphPropertyWorker.class);

    /**
     * Property key equal to this class's fully qualified name. {@code getPropertyKey} still returns it when the
     * element already carries a text property stored under this key.
     */
    @Deprecated
    public static final String MULTI_VALUE_KEY = TikaTextExtractorGraphPropertyWorker.class.getName();
    /** Classpath resource that {@code prepare} loads optional overrides from. */
    private static final String PROPS_FILE = "tika-extractor.properties";
    // Each *_PROPERTY constant below names an entry in PROPS_FILE whose value is a comma-separated list of
    // Tika metadata keys; prepare() splits the value on "," and stores the result in the matching *Keys field.
    private static final String DATE_KEYS_PROPERTY = "tika.extraction.datekeys";
    // Note: the Java name says "subject" but the configuration entry is "titlekeys".
    private static final String SUBJECT_KEYS_PROPERTY = "tika.extraction.titlekeys";
    // NOTE(review): this is the only entry spelled "tika.extractions." (plural) and without a "keys" suffix —
    // confirm whether that is intentional before changing it, since existing configuration files may rely on it.
    private static final String AUTHOR_PROPERTY = "tika.extractions.author";
    private static final String URL_KEYS_PROPERTY = "tika.extraction.urlkeys";
    private static final String TYPE_KEYS_PROPERTY = "tika.extraction.typekeys";
    private static final String EXT_URL_KEYS_PROPERTY = "tika.extraction.exturlkeys";
    private static final String SRC_TYPE_KEYS_PROPERTY = "tika.extraction.srctypekeys";
    private static final String RETRIEVAL_TIMESTAMP_KEYS_PROPERTY = "tika.extraction.retrievaltimestampkeys";
    private static final String CUSTOM_FLICKR_METADATA_KEYS_PROPERTY = "tika.extraction.customflickrmetadatakeys";
    private static final String NUMBER_OF_PAGES_PROPERTY = "tika.extraction.numberofpageskeys";
    /** Confidence value written as metadata on the title and page-count properties that {@code execute} sets. */
    private static final double SYSTEM_ASSIGNED_CONFIDENCE = 0.4d;
    /** Injected configuration; supplies the text-extract mapping and the final {@code isHandled} decision. */
    private final TikaTextExtractorGraphPropertyWorkerConfiguration configuration;
    // Metadata key lists populated by prepare(); they are null until prepare() has run.
    private List<String> dateKeys;
    private List<String> subjectKeys;
    private List<String> urlKeys;
    // NOTE(review): typeKeys, extUrlKeys and srcTypeKeys are assigned in prepare() but not read anywhere in
    // the visible code.
    private List<String> typeKeys;
    private List<String> extUrlKeys;
    private List<String> srcTypeKeys;
    private List<String> retrievalTimestampKeys;
    private List<String> customFlickrMetadataKeys;
    private List<String> authorKeys;
    private List<String> numberOfPagesKeys;
    /** Page-count property built from the "pageCount" ontology intent; stays null when that intent has no IRI. */
    private LongVisalloProperty pageCountProperty;
    /** IRI resolved from the "documentAuthor" intent; may be null. */
    private String authorPropertyIri;
    /** IRI resolved from the "documentTitle" intent, falling back to "artifactTitle"; may be null. */
    private String titlePropertyIri;

    /**
     * Creates the worker with its injected configuration.
     *
     * @param tikaTextExtractorGraphPropertyWorkerConfiguration configuration stored for later use by
     *        {@code execute} (text-extract mapping lookup) and {@code isHandled}
     */
    @Inject
    public TikaTextExtractorGraphPropertyWorker(TikaTextExtractorGraphPropertyWorkerConfiguration tikaTextExtractorGraphPropertyWorkerConfiguration) {
        this.configuration = tikaTextExtractorGraphPropertyWorkerConfiguration;
    }

    /**
     * Initializes the worker: loads the optional {@code tika-extractor.properties} overrides from the context
     * class loader, resolves the ontology property IRIs used by {@code execute}, and builds the lists of Tika
     * metadata keys to look for.
     *
     * <p>A missing properties file is not an error; every key list then falls back to its built-in default.
     * A read failure is logged and the defaults are used as well.
     *
     * @param graphPropertyWorkerPrepareData prepare data forwarded unchanged to the superclass
     * @throws Exception if the superclass preparation fails
     */
    public void prepare(GraphPropertyWorkerPrepareData graphPropertyWorkerPrepareData) throws Exception {
        super.prepare(graphPropertyWorkerPrepareData);
        Properties overrides = new Properties();
        // try-with-resources closes the classpath stream, which was previously leaked; a null resource
        // (file not on the classpath) is permitted and simply skipped.
        try (InputStream propsStream = Thread.currentThread().getContextClassLoader().getResourceAsStream(PROPS_FILE)) {
            if (propsStream != null) {
                overrides.load(propsStream);
            }
        } catch (IOException e) {
            // The exception is passed as a trailing argument so it is no longer silently dropped; the format
            // string itself is unchanged.
            LOGGER.error("Could not load config: %s", PROPS_FILE, e);
        }

        String pageCountIri = getOntologyRepository().getPropertyIRIByIntent("pageCount");
        if (pageCountIri != null) {
            this.pageCountProperty = new LongVisalloProperty(pageCountIri);
        }

        this.dateKeys = readKeyList(overrides, DATE_KEYS_PROPERTY, "date,published,pubdate,publish_date,last-modified,atc:last-modified");
        this.subjectKeys = readKeyList(overrides, SUBJECT_KEYS_PROPERTY, "title,subject");
        this.urlKeys = readKeyList(overrides, URL_KEYS_PROPERTY, "url,og:url");
        this.typeKeys = readKeyList(overrides, TYPE_KEYS_PROPERTY, "Content-Type");
        this.extUrlKeys = readKeyList(overrides, EXT_URL_KEYS_PROPERTY, "atc:result-url");
        this.srcTypeKeys = readKeyList(overrides, SRC_TYPE_KEYS_PROPERTY, "og:type");
        this.retrievalTimestampKeys = readKeyList(overrides, RETRIEVAL_TIMESTAMP_KEYS_PROPERTY, "atc:retrieval-timestamp");
        this.customFlickrMetadataKeys = readKeyList(overrides, CUSTOM_FLICKR_METADATA_KEYS_PROPERTY, "Unknown tag (0x9286)");
        this.authorKeys = readKeyList(overrides, AUTHOR_PROPERTY, "author");
        this.numberOfPagesKeys = readKeyList(overrides, NUMBER_OF_PAGES_PROPERTY, "xmpTPg:NPages");

        this.authorPropertyIri = getOntologyRepository().getPropertyIRIByIntent("documentAuthor");
        this.titlePropertyIri = getOntologyRepository().getPropertyIRIByIntent("documentTitle");
        if (this.titlePropertyIri == null) {
            // Fall back to the artifact-title intent when no document-title intent is defined.
            this.titlePropertyIri = getOntologyRepository().getPropertyIRIByIntent("artifactTitle");
        }
    }

    /** Reads a comma-separated property (or its default) and returns the entries as a fixed-size list. */
    private static List<String> readKeyList(Properties properties, String propertyName, String defaultValue) {
        return Arrays.asList(properties.getProperty(propertyName, defaultValue).split(","));
    }

    /**
     * Extracts text and selected metadata from the property's raw content and stores them on the element.
     *
     * <p>Steps, in order:
     * <ol>
     *   <li>read the required MIME type from the property metadata and run text extraction;</li>
     *   <li>add the author property when an author IRI is configured and an author value was found;</li>
     *   <li>if no custom Flickr metadata is present: add the extracted text, the modified date, the title
     *       (when available) and the page count (when available and numeric);</li>
     *   <li>otherwise parse the custom metadata as JSON and derive text, modified date and title from it;</li>
     *   <li>save the mutation, flush the graph and queue the extracted-text property for further work.</li>
     * </ol>
     *
     * @param inputStream           raw content of the property being processed
     * @param graphPropertyWorkData element, property and context for this unit of work
     * @throws NullPointerException if the property metadata has no MIME type
     * @throws Exception            if text extraction or the graph update fails
     */
    public void execute(InputStream inputStream, GraphPropertyWorkData graphPropertyWorkData) throws Exception {
        String mimeType = (String) graphPropertyWorkData.getProperty().getMetadata().getValue(VisalloProperties.MIME_TYPE.getPropertyName());
        Preconditions.checkNotNull(mimeType, VisalloProperties.MIME_TYPE.getPropertyName() + " is a required metadata field");
        Charset utf8 = Charset.forName("UTF-8");
        Metadata metadata = new Metadata();
        metadata.set("Content-Type", mimeType);
        String extractedText = extractText(inputStream, mimeType, metadata);
        String propertyKey = getPropertyKey(graphPropertyWorkData);
        TikaTextExtractorGraphPropertyWorkerConfiguration.TextExtractMapping textExtractMapping = this.configuration.getTextExtractMapping(graphPropertyWorkData.getElement(), graphPropertyWorkData.getProperty());
        ExistingElementMutation<Vertex> mutation = graphPropertyWorkData.getElement().prepareMutation();

        String author = extractTextField(metadata, this.authorKeys);
        if (this.authorPropertyIri != null && author != null && author.length() > 0) {
            mutation.addPropertyValue(propertyKey, this.authorPropertyIri, author, graphPropertyWorkData.createPropertyMetadata(getUser()), graphPropertyWorkData.getVisibility());
        }

        String customImageMetadata = extractTextField(metadata, this.customFlickrMetadataKeys);
        org.vertexium.Metadata textMetadata = graphPropertyWorkData.createPropertyMetadata(getUser());
        Visibility defaultVisibility = getVisibilityTranslator().getDefaultVisibility();
        VisalloProperties.MIME_TYPE_METADATA.setMetadata(textMetadata, "text/plain", defaultVisibility);
        if (!Strings.isNullOrEmpty(textExtractMapping.getTextDescription())) {
            VisalloProperties.TEXT_DESCRIPTION_METADATA.setMetadata(textMetadata, textExtractMapping.getTextDescription(), defaultVisibility);
        }

        if (customImageMetadata == null || customImageMetadata.equals("")) {
            addTextProperty(textExtractMapping, mutation, propertyKey, new StreamingPropertyValue(new ByteArrayInputStream(extractedText.getBytes(utf8)), String.class), textMetadata, graphPropertyWorkData.getVisibility());
            VisalloProperties.MODIFIED_DATE.setProperty(mutation, extractDate(metadata), defaultVisibility);

            String title = extractTextField(metadata, this.subjectKeys);
            if (title != null && title.length() > 0 && this.titlePropertyIri != null) {
                mutation.addPropertyValue(propertyKey, this.titlePropertyIri, title, createSystemConfidenceMetadata(graphPropertyWorkData, defaultVisibility), graphPropertyWorkData.getVisibility());
            }

            if (this.pageCountProperty != null) {
                String pageCountText = extractTextField(metadata, this.numberOfPagesKeys);
                // extractTextField returns "" (not null) when no page-count key is present, so a null check
                // alone let Long.valueOf("") throw and abort the whole extraction. Skip empty values and
                // tolerate non-numeric ones instead.
                if (!Strings.isNullOrEmpty(pageCountText)) {
                    try {
                        Long pageCount = Long.valueOf(pageCountText);
                        this.pageCountProperty.addPropertyValue(mutation, propertyKey, pageCount, createSystemConfidenceMetadata(graphPropertyWorkData, defaultVisibility), graphPropertyWorkData.getVisibility());
                    } catch (NumberFormatException e) {
                        LOGGER.warn("Ignoring non-numeric page count: %s", pageCountText);
                    }
                }
            }
        } else {
            try {
                JSONObject customJson = new JSONObject(customImageMetadata);
                Object descriptionContent = new JSONObject(customJson.get("description").toString()).get("_content");
                // NOTE(review): SvnTagBranchUtils.SVN_TAGS looks like a decompiler substitution for a plain
                // string constant (presumably "tags") — confirm and replace with the literal.
                String tags = customJson.get(SvnTagBranchUtils.SVN_TAGS).toString();
                String imageText = descriptionContent + "\n" + tags;
                addTextProperty(textExtractMapping, mutation, propertyKey, new StreamingPropertyValue(new ByteArrayInputStream(imageText.getBytes(utf8)), String.class), textMetadata, graphPropertyWorkData.getVisibility());
                VisalloProperties.MODIFIED_DATE.setProperty(mutation, GenericDateExtractor.extractSingleDate(customJson.get("lastupdate").toString()), defaultVisibility);
                if (this.titlePropertyIri != null) {
                    org.vertexium.Metadata titleMetadata = createSystemConfidenceMetadata(graphPropertyWorkData, defaultVisibility);
                    mutation.addPropertyValue(propertyKey, this.titlePropertyIri, customJson.get("title").toString(), titleMetadata, graphPropertyWorkData.getVisibility());
                }
            } catch (JSONException e) {
                // Best effort: whatever was added before the failure is still saved below. The exception is
                // now handed to the logger instead of being discarded.
                LOGGER.warn("Image returned invalid custom metadata", e);
            }
        }

        mutation.save(getAuthorizations());
        getGraph().flush();
        getWorkQueueRepository().pushGraphPropertyQueue(graphPropertyWorkData.getElement(), propertyKey, textExtractMapping.getExtractedTextPropertyName(), graphPropertyWorkData.getWorkspaceId(), graphPropertyWorkData.getVisibilitySource(), graphPropertyWorkData.getPriority());
    }

    /** Creates fresh property metadata for the work item and stamps it with the system-assigned confidence. */
    private org.vertexium.Metadata createSystemConfidenceMetadata(GraphPropertyWorkData graphPropertyWorkData, Visibility visibility) {
        org.vertexium.Metadata propertyMetadata = graphPropertyWorkData.createPropertyMetadata(getUser());
        VisalloProperties.CONFIDENCE_METADATA.setMetadata(propertyMetadata, Double.valueOf(SYSTEM_ASSIGNED_CONFIDENCE), visibility);
        return propertyMetadata;
    }

    /**
     * Adds the extracted text to the mutation under the property name taken from the text-extract mapping.
     *
     * @param mapping     mapping that supplies the extracted-text property name
     * @param mutation    mutation the value is added to
     * @param propertyKey key under which the value is stored
     * @param textValue   streaming value holding the extracted text
     * @param metadata    metadata attached to the new property value
     * @param visibility  visibility of the new property value
     */
    private void addTextProperty(
            TikaTextExtractorGraphPropertyWorkerConfiguration.TextExtractMapping mapping,
            ExistingElementMutation<Vertex> mutation,
            String propertyKey,
            StreamingPropertyValue textValue,
            org.vertexium.Metadata metadata,
            Visibility visibility) {
        String textPropertyName = mapping.getExtractedTextPropertyName();
        mutation.addPropertyValue(propertyKey, textPropertyName, textValue, metadata, visibility);
    }

    /**
     * Chooses the key under which extracted values are stored.
     *
     * @param workData the unit of work being processed
     * @return {@code MULTI_VALUE_KEY} when the element already has a text property under that key,
     *         otherwise the key of the property being processed
     */
    private String getPropertyKey(GraphPropertyWorkData workData) {
        boolean hasLegacyTextProperty = VisalloProperties.TEXT.getProperty(workData.getElement(), MULTI_VALUE_KEY) != null;
        if (hasLegacyTextProperty) {
            return MULTI_VALUE_KEY;
        }
        return workData.getProperty().getKey();
    }

    /**
     * Extracts text from the stream and returns it NFC-normalized.
     *
     * <p>For HTML content the stream is buffered in memory; Tika is run first (which also fills
     * {@code metadata}), then the HTML-specific extractor. The Tika text, after clean-up, is used only when
     * the HTML extractor yields nothing. All other content goes through Tika followed by clean-up.
     *
     * @param content  raw content to extract from
     * @param mimeType MIME type of the content; also written to {@code metadata} as "Content-Type"
     * @param metadata Tika metadata that receives the values found during parsing
     * @return the extracted text in NFC form
     */
    private String extractText(InputStream content, String mimeType, Metadata metadata) throws IOException, SAXException, TikaException, BoilerpipeProcessingException {
        metadata.set("Content-Type", mimeType);

        if (!isHtml(mimeType)) {
            String tikaText = extractTextWithTika(content, metadata);
            return Normalizer.normalize(cleanExtractedText(tikaText), Normalizer.Form.NFC);
        }

        // Buffer the HTML so it can be read twice: once by Tika, once by the HTML extractor.
        ByteArrayOutputStream buffered = new ByteArrayOutputStream();
        IOUtils.copy(content, buffered);
        byte[] htmlBytes = buffered.toByteArray();

        String tikaText = extractTextWithTika(new ByteArrayInputStream(htmlBytes), metadata);
        String result = extractTextFromHtml(IOUtils.toString(htmlBytes, "UTF-8"));
        boolean htmlExtractionEmpty = result == null || result.length() == 0;
        if (htmlExtractionEmpty) {
            result = cleanExtractedText(tikaText);
        }
        return Normalizer.normalize(result, Normalizer.Form.NFC);
    }

    /**
     * Parses the stream with Tika's default parser set and returns the body text decoded as UTF-8.
     *
     * <p>Values discovered during parsing are written into {@code metadata}. Temporary resources created for
     * the Tika input stream are disposed whether or not parsing succeeds.
     *
     * @param content  raw content to parse
     * @param metadata Tika metadata to fill in
     * @return the text collected by the body content handler
     * @throws SAXException if parsing fails; the secure handler is first given the chance to rethrow it
     */
    private static String extractTextWithTika(InputStream content, Metadata metadata) throws TikaException, SAXException, IOException {
        TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
        CompositeParser parser = new CompositeParser(tikaConfig.getMediaTypeRegistry(), tikaConfig.getParser());

        ByteArrayOutputStream textBytes = new ByteArrayOutputStream();
        BodyContentHandler bodyHandler = new BodyContentHandler(new OutputStreamWriter(textBytes, "UTF-8"));

        ParseContext context = new ParseContext();
        context.set(PDFParserConfig.class, new VisalloParserConfig());

        TemporaryResources tempResources = new TemporaryResources();
        try {
            TikaInputStream tikaStream = TikaInputStream.get(content, tempResources);
            SecureContentHandler secureHandler = new SecureContentHandler(bodyHandler, tikaStream);
            try {
                parser.parse(tikaStream, secureHandler, metadata, context);
            } catch (SAXException e) {
                secureHandler.throwIfCauseOf(e);
                throw e;
            }
            if (LOGGER.isDebugEnabled()) {
                logExtraction(textBytes.size(), metadata);
            }
            return IOUtils.toString(textBytes.toByteArray(), "UTF-8");
        } finally {
            tempResources.dispose();
        }
    }

    /** Writes the extracted byte count and every metadata name/value pair to the debug log. */
    private static void logExtraction(int extractedByteCount, Metadata metadata) {
        LOGGER.debug("extracted %d bytes", extractedByteCount);
        LOGGER.debug(ExternalParsersConfigReaderMetKeys.METADATA_TAG);
        String[] names = metadata.names();
        for (int i = 0; i < names.length; i++) {
            LOGGER.debug("  %s: %s", names[i], metadata.get(names[i]));
        }
    }

    /**
     * Extracts readable text from HTML, trying the word-count rules extractor first and the article
     * extractor second.
     *
     * @param html HTML markup
     * @return the first non-empty extraction result, or {@code null} when both extractors yield nothing
     */
    private String extractTextFromHtml(String html) throws BoilerpipeProcessingException {
        String cleaned = cleanHtml(html);

        String byWordCount = NumWordsRulesExtractor.getInstance().getText(cleaned);
        if (byWordCount != null && !byWordCount.isEmpty()) {
            return byWordCount;
        }

        String asArticle = ArticleExtractor.getInstance().getText(cleaned);
        return (asArticle != null && !asArticle.isEmpty()) ? asArticle : null;
    }

    /**
     * Replaces a handful of typographic HTML entities with plain ASCII equivalents.
     *
     * @param html HTML markup
     * @return the markup with em-dash and curly-quote entities substituted
     */
    private String cleanHtml(String html) {
        // None of the search or replacement strings contains a regex metacharacter, so literal replacement
        // gives the same result as the regex form.
        String result = html.replace("&mdash;", "--");
        result = result.replace("&ldquo;", "\"");
        result = result.replace("&rdquo;", "\"");
        result = result.replace("&lsquo;", "'");
        result = result.replace("&rsquo;", "'");
        return result;
    }

    /**
     * Reads a date from the first configured date key present in the metadata.
     *
     * @param metadata Tika metadata to search
     * @return the extracted date, or the current time when no key matches or no date could be extracted
     */
    private Date extractDate(Metadata metadata) {
        String dateKey = TikaMetadataUtils.findKey(this.dateKeys, metadata);
        Date extracted = (dateKey == null) ? null : GenericDateExtractor.extractSingleDate(metadata.get(dateKey));
        return (extracted != null) ? extracted : new Date();
    }

    /**
     * Reads the retrieval timestamp from the first configured retrieval-timestamp key present in the metadata.
     *
     * @param metadata Tika metadata to search
     * @return the parsed value, or {@code 0} when no key matches
     * @throws NumberFormatException if the stored value is not a valid long
     */
    private Long extractRetrievalTime(Metadata metadata) {
        String timestampKey = TikaMetadataUtils.findKey(this.retrievalTimestampKeys, metadata);
        if (timestampKey == null) {
            return 0L;
        }
        return Long.parseLong(metadata.get(timestampKey));
    }

    /**
     * Returns the trimmed metadata value stored under the first matching key.
     *
     * @param metadata Tika metadata to search
     * @param keys     candidate metadata keys
     * @return {@code ""} when none of the keys is found, {@code null} when the matching key has no value,
     *         otherwise the trimmed value
     */
    private String extractTextField(Metadata metadata, List<String> keys) {
        String matchedKey = TikaMetadataUtils.findKey(keys, metadata);
        if (matchedKey == null) {
            return "";
        }
        String value = metadata.get(matchedKey);
        return (value == null) ? null : value.trim();
    }

    /**
     * Returns the host of the URL stored under the first configured URL key, without a leading "www." label.
     *
     * @param metadata Tika metadata to search
     * @return the host name, or {@code ""} when no URL key is present
     * @throws RuntimeException if the stored value is not a valid URL; the parse failure is kept as the cause
     */
    private String extractUrl(Metadata metadata) {
        final String wwwPrefix = "www.";
        String urlKey = TikaMetadataUtils.findKey(this.urlKeys, metadata);
        if (urlKey == null) {
            return "";
        }
        String rawUrl = metadata.get(urlKey);
        try {
            String host = new URL(rawUrl).getHost();
            // Match the full "www." label. Testing for "www" and then skipping four characters threw for a
            // host of exactly "www" and mangled hosts such as "www2.example.com".
            if (host.startsWith(wwwPrefix)) {
                host = host.substring(wwwPrefix.length());
            }
            return host;
        } catch (MalformedURLException e) {
            // Keep the original exception as the cause so the parse failure stays diagnosable.
            throw new RuntimeException("Bad url: " + rawUrl, e);
        }
    }

    /**
     * Tells whether a MIME type denotes HTML content.
     *
     * @param mimeType MIME type to inspect
     * @return {@code true} when the MIME type contains the substring "html"
     */
    private boolean isHtml(String mimeType) {
        return mimeType.indexOf("html") >= 0;
    }

    /**
     * Normalizes whitespace in extracted text: unifies line breaks, turns tabs and non-breaking spaces into
     * the separator, unwraps single line breaks, reduces every remaining run of line breaks to one blank
     * line, and collapses repeated spaces.
     *
     * @param str raw extracted text
     * @return the text with normalized whitespace
     */
    private String cleanExtractedText(String str) {
        // NOTE(review): LineSeparator.Macintosh and MinimalPrettyPrinter.DEFAULT_ROOT_VALUE_SEPARATOR look
        // like decompiler substitutions for plain string constants (presumably "\r" and a single space, which
        // is what the later patterns assume) — confirm and replace with literals.
        return str
                // Normalize line breaks.
                .replaceAll(LineSeparator.Macintosh, "\n")
                // Replace tabs with the separator.
                .replaceAll("\t", MinimalPrettyPrinter.DEFAULT_ROOT_VALUE_SEPARATOR)
                // Replace non-breaking spaces. Written as an escape so the character is visible in source
                // and does not depend on the source file's encoding.
                .replaceAll("\u00A0", MinimalPrettyPrinter.DEFAULT_ROOT_VALUE_SEPARATOR)
                // A newline with no newline directly before or after it is only line wrapping: unwrap it.
                .replaceAll("(?<![\\n])[\\n](?![\\n])", MinimalPrettyPrinter.DEFAULT_ROOT_VALUE_SEPARATOR)
                // Reduce each remaining run of newlines (with surrounding spaces) to exactly two.
                .replaceAll("([ ]*\\n[ ]*)+", "\n\n")
                // Collapse runs of spaces.
                .replaceAll("[ ]+", MinimalPrettyPrinter.DEFAULT_ROOT_VALUE_SEPARATOR);
    }

    /**
     * Decides whether this worker should process the given property.
     *
     * <p>Properties without MIME-type metadata, and image, video and audio content, are never handled.
     * Everything else is delegated to the injected configuration.
     *
     * @param element  element owning the property
     * @param property property to inspect; may be {@code null}
     * @return {@code true} when the configuration accepts the element/property pair
     */
    public boolean isHandled(Element element, Property property) {
        if (property == null) {
            return false;
        }

        String mimeType = (String) VisalloProperties.MIME_TYPE_METADATA.getMetadataValue(property.getMetadata());
        if (mimeType == null) {
            return false;
        }

        boolean isMedia = mimeType.startsWith("image") || mimeType.startsWith("video") || mimeType.startsWith("audio");
        if (isMedia) {
            return false;
        }

        return this.configuration.isHandled(element, property);
    }
}
