package org.jesterj.ingest.processors;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.jesterj.ingest.model.DocumentProcessor;
import org.jesterj.ingest.model.impl.NamedBuilder;
import org.w3c.dom.Document;

/* loaded from: input_file:org/jesterj/ingest/processors/TikaProcessor.class */
/**
 * Document processor that runs a document's raw bytes through Apache Tika.
 *
 * <p>The extracted text can replace the document's raw data and/or be stored in a named field.
 * Every metadata entry reported by Tika is added to the document under a sanitized field name
 * with an optional suffix. Instances are created via {@link Builder}.
 */
public class TikaProcessor implements DocumentProcessor {
    private static final Logger log = LogManager.getLogger();

    /** Name returned by {@link #getName()}. */
    private String name;

    /** Text appended to every sanitized metadata field name; {@code null} means no suffix. */
    private String suffix;

    /**
     * Length limit handed to {@code Tika.parseToString}.
     * NOTE(review): presumably -1 means "no limit" per Tika's convention - confirm.
     */
    private int maxLength = -1;

    /** Tika configuration used to build the parser facade for each document. */
    private TikaConfig tikaConfig = TikaConfig.getDefaultConfig();

    /** When true, the document's raw data is overwritten with the UTF-8 bytes of the extracted text. */
    private boolean replaceRaw = true;

    /** Field that receives the extracted text; {@code null} means the text is not stored in a field. */
    private String destField = null;

    /** Fluent builder for {@link TikaProcessor}. */
    public static class Builder extends NamedBuilder<TikaProcessor> {
        TikaProcessor obj = new TikaProcessor();

        /**
         * Returns the instance currently being configured.
         *
         * @return the processor under construction
         */
        @Override
        public TikaProcessor getObj() {
            return this.obj;
        }

        /**
         * Sets the name that the built processor reports from {@link TikaProcessor#getName()}.
         *
         * @param str the processor name
         * @return this builder
         */
        @Override
        public Builder named(String str) {
            getObj().name = str;
            return this;
        }

        /**
         * Alias of {@link #named(String)}.
         *
         * @param str the processor name
         * @return this builder
         * @deprecated artifact of a decompiler rename of {@code named}; kept only so that any
         *     existing callers continue to compile. Use {@link #named(String)}.
         */
        @Deprecated
        public NamedBuilder<TikaProcessor> named2(String str) {
            return named(str);
        }

        /**
         * Sets the suffix appended to each sanitized metadata field name.
         *
         * @param str the suffix, or {@code null} for none
         * @return this builder
         */
        public Builder appendingSuffix(String str) {
            getObj().suffix = str;
            return this;
        }

        /**
         * Sets the length limit passed to Tika when extracting text.
         *
         * @param i the limit handed to {@code Tika.parseToString}
         * @return this builder
         */
        public Builder truncatingTextTo(int i) {
            getObj().maxLength = i;
            return this;
        }

        /**
         * Chooses whether the extracted text replaces the document's raw data.
         *
         * @param z {@code true} to overwrite the raw data with the extracted text
         * @return this builder
         */
        public Builder replacingRawData(boolean z) {
            getObj().replaceRaw = z;
            return this;
        }

        /**
         * Sets the field into which the extracted text is written.
         *
         * @param str the destination field name, or {@code null} to skip writing a field
         * @return this builder
         */
        public Builder intoField(String str) {
            getObj().destField = str;
            return this;
        }

        /**
         * Replaces the default Tika configuration with one built from the given XML document.
         *
         * @param document DOM document handed to the {@link TikaConfig} constructor
         * @return this builder
         * @throws TikaException if {@link TikaConfig} rejects the document
         * @throws IOException if {@link TikaConfig} fails while reading its configuration
         */
        public Builder configuredWith(Document document) throws TikaException, IOException {
            getObj().tikaConfig = new TikaConfig(document);
            return this;
        }

        private void setObj(TikaProcessor tikaProcessor) {
            this.obj = tikaProcessor;
        }

        /**
         * Returns the configured processor and resets this builder to a fresh instance, so later
         * calls on the builder do not modify the processor that was handed out.
         *
         * @return the configured processor
         */
        @Override
        public TikaProcessor build() {
            TikaProcessor built = getObj();
            setObj(new TikaProcessor());
            return built;
        }
    }

    /**
     * Extracts text and metadata from the document's raw data using Tika.
     *
     * <p>A document without raw data is returned untouched. Otherwise the extracted text
     * optionally replaces the raw data, is optionally stored in the destination field, and each
     * Tika metadata name is added as a field (sanitized name plus suffix) holding the value
     * returned by {@code Metadata.get} for that name.
     *
     * @param document the document to process; modified in place
     * @return a single-element array containing {@code document}
     * @throws RuntimeException wrapping the {@link IOException} or {@link TikaException} raised
     *     while parsing
     */
    @Override
    public org.jesterj.ingest.model.Document[] processDocument(org.jesterj.ingest.model.Document document) {
        byte[] rawData = document.getRawData();
        if (rawData == null) {
            log.debug("Skipping document without data in {}", getName());
            return new org.jesterj.ingest.model.Document[]{document};
        }

        Tika tika = new Tika(this.tikaConfig);
        tika.setMaxStringLength(rawData.length);
        Metadata metadata = new Metadata();

        try (ByteArrayInputStream input = new ByteArrayInputStream(rawData)) {
            String text = tika.parseToString(input, metadata, this.maxLength);
            if (this.replaceRaw) {
                document.setRawData(text.getBytes(StandardCharsets.UTF_8));
            }
            if (this.destField != null) {
                document.put(this.destField, text);
            }
            String fieldSuffix = plusSuffix();
            for (String metaName : metadata.names()) {
                document.put(sanitize(metaName) + fieldSuffix, metadata.get(metaName));
            }
        } catch (IOException | TikaException e) {
            log.debug("Tika processing failure!", e);
            // Propagate rather than return: the document's raw data was not replaced with text.
            throw new RuntimeException(e);
        }
        return new org.jesterj.ingest.model.Document[]{document};
    }

    /** Returns the configured suffix, or the empty string when none was set. */
    private String plusSuffix() {
        return this.suffix == null ? "" : this.suffix;
    }

    /**
     * Produces a field-name-safe version of a metadata name by replacing every character that
     * is not an ASCII letter (including digits and punctuation) with an underscore.
     */
    private String sanitize(String str) {
        StringBuilder sb = new StringBuilder(str.length());
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            boolean asciiLetter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
            sb.append(asciiLetter ? c : '_');
        }
        return sb.toString();
    }

    /**
     * Returns the name assigned to this processor through the builder.
     *
     * @return the processor name, or {@code null} if none was set
     */
    @Override
    public String getName() {
        return this.name;
    }
}
