package org.commoncrawl.hadoop.mapred;

import java.io.ByteArrayInputStream;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import org.apache.commons.io.IOUtils;
import org.apache.http.Header;
import org.apache.http.HttpException;
import org.apache.http.HttpResponse;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.InputStreamEntity;
import org.apache.http.impl.DefaultHttpResponseFactory;
import org.apache.http.impl.io.AbstractSessionInputBuffer;
import org.apache.http.impl.io.DefaultHttpResponseParser;
import org.apache.http.message.BasicLineParser;
import org.apache.http.params.BasicHttpParams;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * A single ARC v1 record: the fields of the one-line record header (URL, IP
 * address, archive date, content type, content length) plus the raw payload
 * bytes read after that header.
 *
 * <p>For records whose URL starts with {@code http://} or {@code https://} the
 * payload can be interpreted as an HTTP response via {@link #getHttpResponse()}
 * and, when the ARC content type mentions "html", parsed into a jsoup
 * {@link Document} via {@link #getParsedHTML()}.
 *
 * <p>Instances are mutable and perform no synchronization of their own.
 */
public class ArcRecord {
    private static final Logger LOG = Logger.getLogger(ArcRecord.class);

    /** Number of space-separated fields in an ARC v1 record header line. */
    private static final int HEADER_FIELD_COUNT = 5;

    /** Charset used for HTML parsing when the HTTP headers do not name a usable one. */
    private static final String DEFAULT_CHARSET = "ISO-8859-1";

    private String _url;
    private String _ipAddress;
    private Date _archiveDate;
    private String _contentType;
    private int _contentLength;
    private byte[] _payload;

    /** Lazily parsed view of the payload; must be reset whenever the payload changes. */
    private HttpResponse _httpResponse;

    /**
     * Session input buffer backed by an in-memory byte array, used to hand the
     * HTTP header bytes of a payload to the HttpComponents response parser.
     */
    public static class ByteArraySessionInputBuffer extends AbstractSessionInputBuffer {
        private static final int BUFFER_SIZE = 4096;

        /**
         * Creates a buffer over the whole array.
         *
         * @param bArr bytes to read from
         */
        public ByteArraySessionInputBuffer(byte[] bArr) {
            init(new ByteArrayInputStream(bArr), BUFFER_SIZE, new BasicHttpParams());
        }

        /**
         * Creates a buffer over a slice of the array.
         *
         * @param bArr bytes to read from
         * @param i    offset of the first byte to read
         * @param i2   maximum number of bytes to read
         */
        public ByteArraySessionInputBuffer(byte[] bArr, int i, int i2) {
            init(new ByteArrayInputStream(bArr, i, i2), BUFFER_SIZE, new BasicHttpParams());
        }

        /**
         * Always reports data as available, regardless of the timeout given.
         *
         * @param i timeout; ignored
         * @return {@code true}
         */
        @Override // org.apache.http.io.SessionInputBuffer
        public boolean isDataAvailable(int i) {
            return true;
        }
    }

    /** Resets every field to its unset state (null / 0). */
    private void _clear() {
        this._url = null;
        this._ipAddress = null;
        this._archiveDate = null;
        this._contentType = null;
        this._contentLength = 0;
        this._payload = null;
        this._httpResponse = null;
    }

    /**
     * Reads bytes up to (and consuming) the next {@code '\n'}, or up to end of
     * stream, and returns them as a string with one char per byte.
     *
     * @param inputStream stream to read from
     * @return the line without its terminating {@code '\n'}
     * @throws EOFException if the stream is already at end of input
     * @throws IOException  if reading fails
     */
    private String _readLine(InputStream inputStream) throws IOException {
        int current = inputStream.read();
        if (current == -1) {
            throw new EOFException();
        }
        StringBuilder line = new StringBuilder(128);
        while (current != -1 && current != '\n') {
            line.append((char) current);
            current = inputStream.read();
        }
        return line.toString();
    }

    /**
     * Clears this record, then populates it from the stream: one header line
     * followed by the payload bytes.
     *
     * @param inputStream stream positioned at the start of an ARC record header
     * @return {@code true} on success; {@code false} if the stream is
     *         {@code null} or the record could not be parsed (the cause is logged)
     * @throws IOException if reading from the stream fails, including
     *                     {@link EOFException} when the stream is already exhausted
     */
    public boolean readFrom(InputStream inputStream) throws IOException {
        if (inputStream == null) {
            LOG.error("ArcRecord cannot be created from NULL/missing input stream.");
            return false;
        }
        _clear();
        try {
            setArcRecordHeader(_readLine(inputStream));
            setPayload(inputStream);
            return true;
        } catch (IOException e) {
            // I/O failures are the caller's problem; only parse failures are turned into 'false'.
            throw e;
        } catch (Exception e) {
            LOG.error("Exception thrown while parsing ARC record", e);
            return false;
        }
    }

    /**
     * Parses an ARC v1 record header line of the form
     * {@code <url> <ip> <yyyyMMddHHmmss> <content-type> <length>} and stores its
     * fields. The fields of this record are only modified if the whole line
     * parses successfully.
     *
     * <p>The timestamp is parsed with a formatter that has no explicit time zone,
     * so it is interpreted in the JVM default zone.
     * NOTE(review): ARC timestamps are presumably UTC - confirm before relying on
     * the absolute instant.
     *
     * @param str the header line, without line terminator
     * @throws IllegalArgumentException if the line is null/empty, does not have
     *                                  exactly five fields, or its length field is
     *                                  not a non-negative integer
     * @throws ParseException           if the timestamp field cannot be parsed
     */
    public void setArcRecordHeader(String str) throws IllegalArgumentException, ParseException {
        if (str == null || str.isEmpty()) {
            throw new IllegalArgumentException("ARC v1 record header string is empty.");
        }
        String[] fields = str.split(" ");
        if (fields.length != HEADER_FIELD_COUNT) {
            LOG.info(" [ " + str + " ] ");
            throw new IllegalArgumentException("ARC v1 record header must be 5 fields, but had " + fields.length + ".");
        }

        // Parse everything into locals first so a failure leaves this record untouched.
        // SimpleDateFormat is not thread-safe, hence a fresh instance per call.
        Date archiveDate = new SimpleDateFormat("yyyyMMddHHmmss").parse(fields[2]);
        int contentLength = Integer.parseInt(fields[4]);
        if (contentLength < 0) {
            // Reject here rather than failing later when the payload buffer is allocated.
            throw new IllegalArgumentException("ARC v1 record header has negative content length: " + contentLength + ".");
        }

        this._url = fields[0];
        this._ipAddress = fields[1];
        this._archiveDate = archiveDate;
        this._contentType = fields[3];
        this._contentLength = contentLength;
    }

    /**
     * Reads up to {@link #getContentLength()} bytes from the stream and stores
     * them as the payload. If the stream ends early, a warning is logged and the
     * payload is truncated to the bytes actually read. Any previously parsed HTTP
     * response is discarded.
     *
     * @param inputStream stream positioned at the first payload byte
     * @throws IllegalArgumentException if {@code inputStream} is {@code null}
     * @throws ParseException           never thrown by this implementation; kept
     *                                  for signature compatibility
     * @throws IOException              if reading fails
     */
    public void setPayload(InputStream inputStream) throws IllegalArgumentException, ParseException, IOException {
        if (inputStream == null) {
            throw new IllegalArgumentException("ArcRecord cannot be created from NULL/missing input stream.");
        }
        int expected = this._contentLength;
        byte[] buffer = new byte[expected];
        int actual = IOUtils.read(inputStream, buffer, 0, buffer.length);
        if (actual < buffer.length) {
            LOG.warn("Expecting " + expected + " bytes in ARC record payload, found " + actual + " bytes.  Performing array copy.");
            buffer = Arrays.copyOf(buffer, actual);
        }
        this._payload = buffer;
        // The cached response was built from the old payload.
        this._httpResponse = null;
    }

    /**
     * Appends all of {@code bArr} to the payload.
     *
     * @param bArr bytes to append
     * @see #addToPayload(byte[], int)
     */
    public void addToPayload(byte[] bArr) {
        addToPayload(bArr, bArr.length);
    }

    /**
     * Appends the first {@code i} bytes of {@code bArr} to the payload, creating
     * the payload if none is set. Logs a warning on every call, and discards any
     * previously parsed HTTP response since it no longer matches the payload.
     *
     * @param bArr source of the bytes to append
     * @param i    number of bytes to take from the start of {@code bArr}
     */
    public void addToPayload(byte[] bArr, int i) {
        LOG.warn("Content Length must have been incorrect - someone needed to add more data to the payload.");
        if (this._payload == null) {
            this._payload = Arrays.copyOf(bArr, i);
        } else {
            int oldLength = this._payload.length;
            byte[] grown = Arrays.copyOf(this._payload, oldLength + i);
            System.arraycopy(bArr, 0, grown, oldLength, i);
            this._payload = grown;
        }
        this._httpResponse = null;
    }

    /**
     * Returns {@code "<url> - <archive date> - <content type>"}. Unset fields are
     * rendered as {@code "null"} instead of causing a NullPointerException.
     */
    @Override
    public String toString() {
        // String concatenation is null-safe, unlike calling _archiveDate.toString() directly.
        return this._url + " - " + this._archiveDate + " - " + this._contentType;
    }

    /**
     * Serializes this record: URL, IP address and content type as UTF strings,
     * archive date as epoch milliseconds, declared content length, actual payload
     * length, then the payload bytes. The header fields and payload must all be
     * set; this method dereferences them without null checks.
     *
     * @param dataOutput sink to write to
     * @throws IOException if writing fails
     */
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(this._url);
        dataOutput.writeUTF(this._ipAddress);
        dataOutput.writeUTF(this._contentType);
        dataOutput.writeLong(this._archiveDate.getTime());
        dataOutput.writeInt(this._contentLength);
        dataOutput.writeInt(this._payload.length);
        dataOutput.write(this._payload, 0, this._payload.length);
    }

    /**
     * Deserializes a record in the format produced by {@link #write(DataOutput)}.
     * The existing payload array is reused when it already has the right length.
     * Any previously parsed HTTP response is discarded.
     *
     * @param dataInput source to read from
     * @throws IOException if reading fails, the serialized payload length is
     *                     negative, or the input ends before the payload is
     *                     complete
     */
    public void readFields(DataInput dataInput) throws IOException {
        this._url = dataInput.readUTF();
        this._ipAddress = dataInput.readUTF();
        this._contentType = dataInput.readUTF();
        this._archiveDate = new Date(dataInput.readLong());
        this._contentLength = dataInput.readInt();

        int payloadLength = dataInput.readInt();
        if (payloadLength < 0) {
            throw new IOException("Invalid serialized payload length: " + payloadLength);
        }
        // Drop the cache before touching the payload so it can never describe stale bytes.
        this._httpResponse = null;
        if (this._payload == null || this._payload.length != payloadLength) {
            this._payload = new byte[payloadLength];
        }
        try {
            dataInput.readFully(this._payload, 0, payloadLength);
        } catch (EOFException e) {
            throw new IOException("End of input reached before payload was fully deserialized.", e);
        }
    }

    /**
     * @return the payload bytes, or {@code null} if unset. This is the internal
     *         array, not a copy.
     */
    public byte[] getPayload() {
        return this._payload;
    }

    /** @return the URL from the ARC header, or {@code null} if unset */
    public String getURL() {
        return this._url;
    }

    /** @return the IP address from the ARC header, or {@code null} if unset */
    public String getIpAddress() {
        return this._ipAddress;
    }

    /** @return the archive date from the ARC header, or {@code null} if unset */
    public Date getArchiveDate() {
        return this._archiveDate;
    }

    /** @return the content type from the ARC header, or {@code null} if unset */
    public String getContentType() {
        return this._contentType;
    }

    /**
     * @return the content length declared in the ARC header; this may differ from
     *         the actual payload length
     */
    public int getContentLength() {
        return this._contentLength;
    }

    /**
     * @return the HTTP status code of the payload, or {@code -1} if the payload
     *         cannot be interpreted as an HTTP response
     * @throws IOException   if reading the header bytes fails
     * @throws HttpException if the HTTP headers are malformed
     */
    public int getHttpStatusCode() throws IOException, HttpException {
        HttpResponse response = getHttpResponse();
        return response == null ? -1 : response.getStatusLine().getStatusCode();
    }

    /**
     * @return all HTTP headers of the payload, or {@code null} if the payload
     *         cannot be interpreted as an HTTP response
     * @throws IOException   if reading the header bytes fails
     * @throws HttpException if the HTTP headers are malformed
     */
    public Header[] getHttpHeaders() throws IOException, HttpException {
        HttpResponse response = getHttpResponse();
        return response == null ? null : response.getAllHeaders();
    }

    /**
     * Finds the first {@code CR LF CR LF} sequence in the array.
     *
     * @param bArr bytes to scan
     * @return the index just past the sequence, or {@code -1} if it does not occur
     */
    private int _searchForCRLFCRLF(byte[] bArr) {
        // Number of bytes of "\r\n\r\n" matched so far (0..4).
        int matched = 0;
        for (int i = 0; i < bArr.length; i++) {
            byte b = bArr[i];
            if (b == '\r') {
                // A CR continues the match only after "\r\n"; anywhere else it is
                // itself a possible first byte, so restart at 1 rather than 0
                // (otherwise "\r\r\n\r\n" would be missed).
                matched = (matched == 2) ? 3 : 1;
            } else if (b == '\n') {
                // An LF is only valid immediately after a CR.
                matched = (matched == 1 || matched == 3) ? matched + 1 : 0;
            } else {
                matched = 0;
            }
            if (matched == 4) {
                return i + 1;
            }
        }
        return -1;
    }

    /**
     * Interprets the payload as an HTTP response: the bytes up to and including
     * the first blank line are parsed as status line and headers, and the rest is
     * attached as the response entity. The result is cached until the payload
     * changes.
     *
     * <p>NOTE(review): the entity wraps a stream over the payload, so its content
     * can presumably be consumed only once per cached response - confirm before
     * reading the entity from several places.
     *
     * @return the parsed response, or {@code null} (with an error logged) if no
     *         payload is set, the URL is set but is not http/https, the end of
     *         the headers cannot be found, or the parser yields no response
     * @throws IOException   if reading the header bytes fails
     * @throws HttpException if the HTTP headers are malformed
     */
    public HttpResponse getHttpResponse() throws IOException, HttpException {
        if (this._httpResponse != null) {
            return this._httpResponse;
        }
        if (this._payload == null) {
            LOG.error("Unable to parse HTTP response: Payload has not been set");
            return null;
        }
        if (this._url != null && !this._url.startsWith("http://") && !this._url.startsWith("https://")) {
            LOG.error("Unable to parse HTTP response: URL protocol is not HTTP");
            return null;
        }
        int headerEnd = _searchForCRLFCRLF(this._payload);
        if (headerEnd == -1) {
            LOG.error("Unable to parse HTTP response: End of HTTP headers not found");
            return null;
        }

        HttpResponse response = (HttpResponse) new DefaultHttpResponseParser(
                new ByteArraySessionInputBuffer(this._payload, 0, headerEnd),
                new BasicLineParser(),
                new DefaultHttpResponseFactory(),
                new BasicHttpParams()).parse();
        if (response == null) {
            LOG.error("Unable to parse HTTP response");
            return null;
        }

        int bodyLength = this._payload.length - headerEnd;
        // Drop one trailing LF from the body (presumably the ARC record separator).
        // Only applies when there is a body: with a headers-only payload the last
        // byte is the LF of the header terminator and must not drive the length to -1.
        if (bodyLength > 0 && this._payload[this._payload.length - 1] == '\n') {
            bodyLength--;
        }
        InputStreamEntity entity = new InputStreamEntity(
                new ByteArrayInputStream(this._payload, headerEnd, bodyLength), bodyLength);
        entity.setContentType(response.getFirstHeader("Content-Type"));
        entity.setContentEncoding(response.getFirstHeader("Content-Encoding"));
        response.setEntity(entity);

        // Cache only once the response is fully assembled.
        this._httpResponse = response;
        return response;
    }

    /**
     * Parses the HTTP response body as HTML, using the record URL as base URI.
     * The charset is taken from the HTTP Content-Type header when it names a
     * usable one and falls back to ISO-8859-1 otherwise.
     *
     * @return the parsed document, or {@code null} (with a message logged) if the
     *         URL is unset, the payload is not a parseable HTTP response, the
     *         response has no entity, or the ARC content type is unset or does
     *         not contain "html"
     * @throws IOException if reading the payload or the entity content fails
     */
    public Document getParsedHTML() throws IOException {
        if (this._url == null) {
            LOG.error("Unable to parse HTML: URL from ARC header has not been set");
            return null;
        }

        HttpResponse response;
        try {
            response = getHttpResponse();
        } catch (HttpException e) {
            LOG.error("Unable to parse HTML: Exception during HTTP response parsing", e);
            return null;
        }
        if (response == null) {
            LOG.error("Unable to parse HTML: Exception during HTTP response parsing");
            return null;
        }
        if (response.getEntity() == null) {
            LOG.error("Unable to parse HTML: No HTTP response entity found");
            return null;
        }
        if (this._contentType == null || !this._contentType.toLowerCase().contains("html")) {
            LOG.warn("Unable to parse HTML: Content is not HTML");
            return null;
        }

        String charsetName = null;
        try {
            ContentType contentType = ContentType.getOrDefault(response.getEntity());
            if (contentType.getCharset() != null) {
                charsetName = contentType.getCharset().name();
            }
        } catch (RuntimeException e) {
            // Malformed Content-Type value or unsupported charset name: this is a
            // best-effort lookup, so fall through to the default charset.
            LOG.debug("Unable to determine charset from Content-Type header; using " + DEFAULT_CHARSET, e);
        }
        if (charsetName == null) {
            charsetName = DEFAULT_CHARSET;
        }
        return Jsoup.parse(response.getEntity().getContent(), charsetName, this._url);
    }
}
