package io.bigconnect.dw.text.extractor;

import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.inject.Inject;
import com.mware.core.ingest.dataworker.DataWorker;
import com.mware.core.ingest.dataworker.DataWorkerData;
import com.mware.core.ingest.dataworker.DataWorkerPrepareData;
import com.mware.core.ingest.dataworker.ElementOrPropertyStatus;
import com.mware.core.model.Description;
import com.mware.core.model.Name;
import com.mware.core.model.file.FileSystemRepository;
import com.mware.core.model.properties.BcSchema;
import com.mware.core.model.properties.RawObjectSchema;
import com.mware.core.model.workQueue.Priority;
import com.mware.core.util.BcLogger;
import com.mware.core.util.BcLoggerFactory;
import com.mware.ge.Element;
import com.mware.ge.Property;
import com.mware.ge.Vertex;
import com.mware.ge.Visibility;
import com.mware.ge.mutation.ExistingElementMutation;
import com.mware.ge.values.storable.DefaultStreamingPropertyValue;
import com.mware.ge.values.storable.StreamingPropertyValue;
import com.mware.ge.values.storable.StringValue;
import com.mware.ge.values.storable.TextValue;
import com.mware.ge.values.storable.Values;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.extractors.ArticleExtractor;
import de.l3s.boilerpipe.extractors.NumWordsRulesExtractor;
import io.bigconnect.dw.text.extractor.TikaTextExtractorWorkerConfiguration;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.text.Normalizer;
import java.time.ZonedDateTime;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.pdf.BcParserConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.SecureContentHandler;
import org.json.JSONException;
import org.json.JSONObject;
import org.xml.sax.SAXException;

@Description("Uses Apache Tika to extract text")
@Name("Tika Text Extractor")
/* loaded from: input_file:io/bigconnect/dw/text/extractor/TikaTextExtractorWorker.class */
/**
 * Data worker that reads an element's raw property, extracts its text with Apache Tika
 * (trying Boilerpipe first for HTML content) and stores the result as a streaming text
 * property. Author, title, modified date and page count are written as well when the
 * corresponding Tika metadata keys are present.
 *
 * <p>The metadata key names that are searched can be overridden through an optional
 * {@code tika-extractor.properties} classpath resource; see {@link #prepare}.
 */
public class TikaTextExtractorWorker extends DataWorker {
    private static final BcLogger LOGGER = BcLoggerFactory.getLogger(TikaTextExtractorWorker.class);

    @Deprecated
    public static final String MULTI_VALUE_KEY = TikaTextExtractorWorker.class.getName();
    private static final String PROPS_FILE = "tika-extractor.properties";
    private static final String DATE_KEYS_PROPERTY = "tika.extraction.datekeys";
    private static final String SUBJECT_KEYS_PROPERTY = "tika.extraction.titlekeys";
    private static final String AUTHOR_PROPERTY = "tika.extractions.author";
    private static final String URL_KEYS_PROPERTY = "tika.extraction.urlkeys";
    private static final String TYPE_KEYS_PROPERTY = "tika.extraction.typekeys";
    private static final String EXT_URL_KEYS_PROPERTY = "tika.extraction.exturlkeys";
    private static final String SRC_TYPE_KEYS_PROPERTY = "tika.extraction.srctypekeys";
    private static final String RETRIEVAL_TIMESTAMP_KEYS_PROPERTY = "tika.extraction.retrievaltimestampkeys";
    private static final String CUSTOM_FLICKR_METADATA_KEYS_PROPERTY = "tika.extraction.customflickrmetadatakeys";
    private static final String NUMBER_OF_PAGES_PROPERTY = "tika.extraction.numberofpageskeys";
    private static final double SYSTEM_ASSIGNED_CONFIDENCE = 0.4d;
    /** Charset used for every byte/string conversion performed by this worker. */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    private final TikaTextExtractorWorkerConfiguration configuration;
    private final FileSystemRepository fileSystemRepository;
    private List<String> dateKeys;
    private List<String> subjectKeys;
    private List<String> urlKeys;
    private List<String> typeKeys;
    private List<String> extUrlKeys;
    private List<String> srcTypeKeys;
    private List<String> retrievalTimestampKeys;
    private List<String> customFlickrMetadataKeys;
    private List<String> authorKeys;
    private List<String> numberOfPagesKeys;

    @Inject
    public TikaTextExtractorWorker(TikaTextExtractorWorkerConfiguration tikaTextExtractorWorkerConfiguration, FileSystemRepository fileSystemRepository) {
        this.configuration = tikaTextExtractorWorkerConfiguration;
        this.fileSystemRepository = fileSystemRepository;
    }

    /**
     * Loads the optional {@code tika-extractor.properties} resource from the context class
     * loader and initialises the metadata key lists, falling back to built-in defaults for
     * every key list that is not configured.
     *
     * @param dataWorkerPrepareData preparation data, forwarded to the parent implementation
     * @throws Exception if the parent {@code prepare} fails
     */
    @Override
    public void prepare(DataWorkerPrepareData dataWorkerPrepareData) throws Exception {
        super.prepare(dataWorkerPrepareData);
        Properties properties = new Properties();
        // try-with-resources closes the stream; a null resource (file absent) is skipped silently.
        try (InputStream propsStream = Thread.currentThread().getContextClassLoader().getResourceAsStream(PROPS_FILE)) {
            if (propsStream != null) {
                properties.load(propsStream);
            }
        } catch (IOException e) {
            // Best effort: continue with defaults. The trailing throwable is presumably
            // reported as the cause by BcLogger - TODO confirm; the message itself is unchanged.
            LOGGER.error("Could not load config: %s", PROPS_FILE, e);
        }
        this.dateKeys = readKeys(properties, DATE_KEYS_PROPERTY, "date,published,pubdate,publish_date,last-modified,atc:last-modified");
        this.subjectKeys = readKeys(properties, SUBJECT_KEYS_PROPERTY, "title,subject");
        this.urlKeys = readKeys(properties, URL_KEYS_PROPERTY, "url,og:url");
        this.typeKeys = readKeys(properties, TYPE_KEYS_PROPERTY, "Content-Type");
        this.extUrlKeys = readKeys(properties, EXT_URL_KEYS_PROPERTY, "atc:result-url");
        this.srcTypeKeys = readKeys(properties, SRC_TYPE_KEYS_PROPERTY, "og:type");
        this.retrievalTimestampKeys = readKeys(properties, RETRIEVAL_TIMESTAMP_KEYS_PROPERTY, "atc:retrieval-timestamp");
        this.customFlickrMetadataKeys = readKeys(properties, CUSTOM_FLICKR_METADATA_KEYS_PROPERTY, "Unknown tag (0x9286)");
        this.authorKeys = readKeys(properties, AUTHOR_PROPERTY, "author");
        this.numberOfPagesKeys = readKeys(properties, NUMBER_OF_PAGES_PROPERTY, "xmpTPg:NPages");
    }

    /** Returns the comma-separated value of {@code propertyName} (or {@code defaultKeys}) as a list. */
    private static List<String> readKeys(Properties properties, String propertyName, String defaultKeys) {
        return Arrays.asList(properties.getProperty(propertyName, defaultKeys).split(","));
    }

    /**
     * Extracts text and metadata from the element's raw property and saves them on the element,
     * then broadcasts the change and queues the extracted-text property for further processing.
     *
     * <p>When the custom Flickr metadata key is present, its JSON value supplies the text
     * (description and tags), modified date and title instead of the Tika output.
     *
     * @param inputStream    not read by this worker; the content is taken from the raw property
     * @param dataWorkerData the work item describing the element and property being processed
     * @throws Exception if reading, parsing or saving fails
     */
    public void execute(InputStream inputStream, DataWorkerData dataWorkerData) throws Exception {
        LOGGER.info("Started Tika text extractor");
        Element element = dataWorkerData.getElement();
        Property rawProperty = BcSchema.RAW.getProperty(element);
        TextValue mimeTypeValue = rawProperty.getMetadata().getValue(BcSchema.MIME_TYPE.getPropertyName());
        Preconditions.checkNotNull(mimeTypeValue, BcSchema.MIME_TYPE.getPropertyName() + " is a required metadata field");
        String mimeType = mimeTypeValue.stringValue();

        Metadata metadata = new Metadata();
        metadata.set("Content-Type", mimeType);
        String extractText;
        // The raw stream is fully buffered by extractText, so it can be closed right afterwards.
        try (InputStream rawStream = rawProperty.getValue().getInputStream()) {
            extractText = extractText(rawStream, mimeType, metadata);
        }

        String key = rawProperty.getKey();
        TikaTextExtractorWorkerConfiguration.TextExtractMapping textExtractMapping = this.configuration.getTextExtractMapping(dataWorkerData.getProperty());
        ExistingElementMutation<Vertex> mutation = refresh(element).prepareMutation();
        Visibility propertyVisibility = dataWorkerData.getProperty().getVisibility();

        String author = extractTextField(metadata, this.authorKeys);
        if (!StringUtils.isEmpty(author)) {
            mutation.addPropertyValue(key, RawObjectSchema.AUTHOR.getPropertyName(), Values.stringValue(author), dataWorkerData.createPropertyMetadata(getUser()), propertyVisibility);
        }

        String flickrJson = extractTextField(metadata, this.customFlickrMetadataKeys);
        com.mware.ge.Metadata textMetadata = dataWorkerData.createPropertyMetadata(getUser());
        Visibility defaultVisibility = getVisibilityTranslator().getDefaultVisibility();
        BcSchema.MIME_TYPE_METADATA.setMetadata(textMetadata, "text/plain", defaultVisibility);
        if (!Strings.isNullOrEmpty(textExtractMapping.getTextDescription())) {
            BcSchema.TEXT_DESCRIPTION_METADATA.setMetadata(textMetadata, textExtractMapping.getTextDescription(), defaultVisibility);
        }

        if (StringUtils.isEmpty(flickrJson)) {
            // Regular document: text, date, title and page count all come from Tika.
            addTextProperty(textExtractMapping, mutation, key, toTextValue(extractText), textMetadata, propertyVisibility);
            BcSchema.MODIFIED_DATE.setProperty(mutation, extractDate(metadata), defaultVisibility);
            String title = extractTextField(metadata, this.subjectKeys);
            if (!StringUtils.isEmpty(title)) {
                mutation.addPropertyValue(key, BcSchema.TITLE.getPropertyName(), Values.stringValue(title), dataWorkerData.createPropertyMetadata(getUser()), propertyVisibility);
            }
            String pageCount = extractTextField(metadata, this.numberOfPagesKeys);
            if (!StringUtils.isEmpty(pageCount)) {
                try {
                    RawObjectSchema.PAGE_COUNT.setProperty(mutation, Integer.valueOf(Integer.parseInt(pageCount)), dataWorkerData.createPropertyMetadata(getUser()), propertyVisibility);
                } catch (NumberFormatException e) {
                    // The page count is optional; skip it but leave a trace instead of failing silently.
                    LOGGER.warn("Ignoring non-numeric page count: %s", pageCount);
                }
            }
        } else {
            // Custom Flickr metadata: a JSON document carrying description, tags, lastupdate and title.
            try {
                JSONObject flickr = new JSONObject(flickrJson);
                String description = new JSONObject(flickr.get("description").toString()).get("_content") + "\n" + flickr.get("tags").toString();
                addTextProperty(textExtractMapping, mutation, key, toTextValue(description), textMetadata, propertyVisibility);
                BcSchema.MODIFIED_DATE.setProperty(mutation, GenericDateExtractor.extractSingleDate(flickr.get("lastupdate").toString()), defaultVisibility);
                mutation.addPropertyValue(key, BcSchema.TITLE.getPropertyName(), Values.stringValue(flickr.get("title").toString()), dataWorkerData.createPropertyMetadata(getUser()), propertyVisibility);
            } catch (JSONException e) {
                // Whatever was added to the mutation before the failure is still saved below.
                LOGGER.warn("Image returned invalid custom metadata", e);
            }
        }

        Element saved = mutation.save(getAuthorizations());
        getGraph().flush();
        getWebQueueRepository().broadcastPropertyChange(saved, key, textExtractMapping.getExtractedTextPropertyName(), dataWorkerData.getWorkspaceId());
        getWorkQueueRepository().pushGraphPropertyQueue(saved, key, textExtractMapping.getExtractedTextPropertyName(), dataWorkerData.getWorkspaceId(), dataWorkerData.getVisibilitySource(), Priority.HIGH, ElementOrPropertyStatus.UPDATE, (Long) null);
        LOGGER.info("Ended Tika text extractor");
    }

    /** Wraps {@code text} (encoded as UTF-8) in a streaming property value of string type. */
    private static StreamingPropertyValue toTextValue(String text) {
        return new DefaultStreamingPropertyValue(new ByteArrayInputStream(text.getBytes(UTF_8)), StringValue.class);
    }

    /** Adds the extracted text to the mutation under the property name configured in the mapping. */
    private void addTextProperty(TikaTextExtractorWorkerConfiguration.TextExtractMapping textExtractMapping, ExistingElementMutation<Vertex> existingElementMutation, String str, StreamingPropertyValue streamingPropertyValue, com.mware.ge.Metadata metadata, Visibility visibility) {
        existingElementMutation.addPropertyValue(str, textExtractMapping.getExtractedTextPropertyName(), streamingPropertyValue, metadata, visibility);
    }

    /**
     * Reads the whole stream into memory, runs Tika on it (which also fills {@code metadata})
     * and returns the NFC-normalised text. For HTML content the Boilerpipe result is preferred
     * and the cleaned Tika text is used only when Boilerpipe yields nothing.
     *
     * @param inputStream content to read; not closed by this method
     * @param str         MIME type of the content
     * @param metadata    Tika metadata, populated as a side effect of parsing
     */
    private String extractText(InputStream inputStream, String str, Metadata metadata) throws IOException, SAXException, TikaException, BoilerpipeProcessingException {
        byte[] content = IOUtils.toByteArray(inputStream);
        metadata.set("Content-Type", str);
        // Tika always runs, even for HTML, because the caller relies on the metadata it collects.
        String tikaText = extractTextWithTika(content, metadata);
        String text = null;
        if (isHtml(str)) {
            text = extractTextFromHtml(new String(content, UTF_8));
        }
        if (StringUtils.isEmpty(text)) {
            text = cleanExtractedText(tikaText);
        }
        return Normalizer.normalize(text, Normalizer.Form.NFC);
    }

    /**
     * Parses {@code bArr} with Tika's default parser and returns the body text as a string.
     *
     * @param bArr     raw document bytes
     * @param metadata Tika metadata; must carry the content type and receives the parsed metadata
     * @throws TikaException if parsing fails, including failures detected by the secure content handler
     */
    private static String extractTextWithTika(byte[] bArr, Metadata metadata) throws TikaException, SAXException, IOException {
        TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
        CompositeParser parser = new CompositeParser(tikaConfig.getMediaTypeRegistry(), new Parser[]{tikaConfig.getParser()});
        ByteArrayOutputStream textBytes = new ByteArrayOutputStream();
        BodyContentHandler bodyHandler = new BodyContentHandler(new OutputStreamWriter(textBytes, "UTF-8"));
        ParseContext parseContext = new ParseContext();
        parseContext.set(PDFParserConfig.class, new BcParserConfig());
        TemporaryResources temporaryResources = new TemporaryResources();
        try {
            TikaInputStream tikaStream = TikaInputStream.get(new ByteArrayInputStream(bArr), temporaryResources);
            SecureContentHandler secureHandler = new SecureContentHandler(bodyHandler, tikaStream);
            try {
                parser.parse(tikaStream, secureHandler, metadata, parseContext);
            } catch (SAXException e) {
                // Gives the secure handler the chance to raise its own TikaException when it caused e.
                secureHandler.throwIfCauseOf(e);
                throw e;
            }
            if (LOGGER.isDebugEnabled()) {
                LOGGER.debug("extracted %d bytes", textBytes.size());
                LOGGER.debug("metadata");
                for (String name : metadata.names()) {
                    LOGGER.debug("  %s: %s", name, metadata.get(name));
                }
            }
            return textBytes.toString("UTF-8");
        } finally {
            temporaryResources.dispose();
        }
    }

    /**
     * Extracts the main text of an HTML document with Boilerpipe, trying the
     * {@code NumWordsRulesExtractor} first and the {@code ArticleExtractor} second.
     *
     * @return the extracted text, or {@code null} when both extractors return nothing
     */
    private String extractTextFromHtml(String str) throws BoilerpipeProcessingException {
        String html = cleanHtml(str);
        String text = NumWordsRulesExtractor.getInstance().getText(html);
        if (StringUtils.isEmpty(text)) {
            text = ArticleExtractor.getInstance().getText(html);
        }
        return StringUtils.isEmpty(text) ? null : text;
    }

    /** Replaces a handful of typographic HTML entities with plain ASCII equivalents. */
    private String cleanHtml(String str) {
        // Plain literal replacement; no regular expression is needed for fixed entity names.
        return str.replace("&mdash;", "--")
                .replace("&ldquo;", "\"")
                .replace("&rdquo;", "\"")
                .replace("&lsquo;", "'")
                .replace("&rsquo;", "'");
    }

    /**
     * Returns the date found under the first matching date key, or the current time when no
     * key matches or the value cannot be turned into a date.
     */
    private ZonedDateTime extractDate(Metadata metadata) {
        String dateKey = TikaMetadataUtils.findKey(this.dateKeys, metadata);
        ZonedDateTime date = dateKey == null ? null : GenericDateExtractor.extractSingleDate(metadata.get(dateKey));
        return date != null ? date : ZonedDateTime.now();
    }

    /**
     * Returns the trimmed metadata value for the first of {@code list} found in {@code metadata}.
     *
     * @return the trimmed value, an empty string when no key matches, or {@code null} when the
     *         matching key has no value
     */
    private String extractTextField(Metadata metadata, List<String> list) {
        String matchedKey = TikaMetadataUtils.findKey(list, metadata);
        if (matchedKey == null) {
            return "";
        }
        String fieldValue = metadata.get(matchedKey);
        return fieldValue == null ? null : fieldValue.trim();
    }

    /** Tells whether the MIME type denotes HTML; compared case-insensitively, as MIME type names are. */
    private boolean isHtml(String str) {
        return StringUtils.containsIgnoreCase(str, "html");
    }

    /**
     * Normalises whitespace in Tika output: unifies line breaks, turns tabs and non-breaking
     * spaces into spaces, joins single line breaks (paragraph wrapping) and collapses the rest
     * into exactly one blank line between paragraphs.
     */
    private String cleanExtractedText(String str) {
        return str
                // Normalise line breaks.
                .replace('\r', '\n')
                // Tabs become spaces.
                .replace('\t', ' ')
                // Non-breaking spaces become spaces; written as an escape so the character is visible.
                .replace('\u00A0', ' ')
                // A lone newline is just line wrapping inside a paragraph.
                .replaceAll("(?<![\\n])[\\n](?![\\n])", " ")
                // Any remaining run of newlines becomes exactly two.
                .replaceAll("([ ]*\\n[ ]*)+", "\n\n")
                // Collapse repeated spaces.
                .replaceAll("[ ]+", " ");
    }

    /**
     * Decides whether this worker processes the given property: it must be non-null, the element
     * must have a MIME type that is not image, video or audio, the element must have a raw
     * property, and the worker configuration must accept the element/property pair.
     */
    public boolean isHandled(Element element, Property property) {
        if (property == null) {
            return false;
        }
        String mimeType = (String) BcSchema.MIME_TYPE.getFirstPropertyValue(element);
        if (mimeType == null) {
            return false;
        }
        if (mimeType.startsWith("image") || mimeType.startsWith("video") || mimeType.startsWith("audio")) {
            return false;
        }
        if (BcSchema.RAW.getProperty(element) == null) {
            return false;
        }
        return this.configuration.isHandled(element, property);
    }
}
