package edu.jhu.hlt.concrete.ingesters.webposts;

import edu.jhu.hlt.concrete.Communication;
import edu.jhu.hlt.concrete.ingesters.base.IngestException;
import edu.jhu.hlt.concrete.serialization.CompactCommunicationSerializer;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.OpenOption;
import java.nio.file.Paths;
import java.util.zip.GZIPInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:edu/jhu/hlt/concrete/ingesters/webposts/WebPostGzIngester.class */
/**
 * Command-line tool that reads a gzip-compressed, UTF-8 text file made of
 * {@code <DOC>} ... {@code </DOC>} records, writes each record to a temporary file named
 * after its {@code <DOCID>}, ingests that file with {@link WebPostIngester}, and stores the
 * resulting {@link Communication} as {@code <output-dir>/<communication-id>.comm}.
 */
public class WebPostGzIngester {
    private static final Logger LOGGER = LoggerFactory.getLogger(WebPostGzIngester.class);

    private static final String DOC_OPEN = "<DOC>";
    private static final String DOC_CLOSE = "</DOC>";
    private static final String DOCID_OPEN = "<DOCID>";
    private static final String DOCID_CLOSE = "</DOCID>";

    /** Entity references that are already valid and must be passed through untouched. */
    private static final String[] PRESERVED_ENTITIES = {"&amp;", "&quot;", "&lt;", "&gt;"};

    /**
     * Escapes every bare ampersand in {@code str} as {@code &amp;}.
     *
     * <p>An ampersand is left alone only when it starts one of the entity references
     * {@code &amp;}, {@code &quot;}, {@code &lt;} or {@code &gt;}; any other ampersand,
     * including one that starts an unrecognized entity such as {@code &nbsp;}, is escaped.
     *
     * @param str the text to escape; must not be {@code null}
     * @return a copy of {@code str} with bare ampersands escaped
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public static String escapeAmpersands(String str) {
        StringBuilder escaped = new StringBuilder(str.length());
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (c == '&' && !startsWithPreservedEntity(str, i)) {
                escaped.append("&amp;");
            } else {
                escaped.append(c);
            }
        }
        return escaped.toString();
    }

    /** Returns whether one of {@link #PRESERVED_ENTITIES} begins at {@code offset} in {@code text}. */
    private static boolean startsWithPreservedEntity(String text, int offset) {
        for (String entity : PRESERVED_ENTITIES) {
            if (text.startsWith(entity, offset)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Splits the gzipped input into documents and converts each one.
     *
     * @param strArr {@code strArr[0]} is the path of the gzipped input file and
     *               {@code strArr[1]} is the directory that receives the {@code .comm} files
     * @throws IllegalArgumentException if fewer than two arguments are supplied
     * @throws Exception if reading the input, writing a temporary file, or serializing a
     *                   communication fails; an {@link IngestException} for a single
     *                   document is logged and does not stop the run
     */
    public static void main(String[] strArr) throws Exception {
        if (strArr == null || strArr.length < 2) {
            throw new IllegalArgumentException(
                    "Usage: WebPostGzIngester <input.gz> <output-directory>");
        }
        String inputPath = strArr[0];
        String outputDir = strArr[1];
        WebPostIngester ingester = new WebPostIngester();
        CompactCommunicationSerializer serializer = new CompactCommunicationSerializer();

        // Writer for the document currently being copied; null while between documents.
        PrintWriter docWriter = null;
        String docId = "";
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new GZIPInputStream(Files.newInputStream(Paths.get(inputPath))),
                StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.startsWith(DOC_OPEN)) {
                    // The opening tag is re-emitted once the document id is known.
                    continue;
                }
                if (line.startsWith(DOCID_OPEN)) {
                    if (docWriter != null) {
                        // Previous document never saw </DOC>; release its file handle.
                        LOGGER.warn("Document {} was not terminated by {}", docId, DOC_CLOSE);
                        docWriter.close();
                        docWriter = null;
                    }
                    docId = extractDocId(line);
                    // NOTE(review): the temporary file is created in the working directory
                    // and named directly after the document id taken from the input.
                    docWriter = new PrintWriter(docId, StandardCharsets.UTF_8.name());
                    docWriter.println(DOC_OPEN);
                    docWriter.println(line);
                } else if (docWriter == null) {
                    // Content outside any open document has nowhere to go; skip it.
                    continue;
                } else if (line.equals("\">")) {
                    // Rewrite a line holding only "> so that the element is self-closed.
                    docWriter.println("\"/>");
                } else {
                    docWriter.println(escapeAmpersands(line));
                }
                if (line.equals(DOC_CLOSE)) {
                    docWriter.close();
                    docWriter = null;
                    convertDocument(ingester, serializer, docId, outputDir);
                }
            }
        } finally {
            if (docWriter != null) {
                docWriter.close();
            }
        }
    }

    /**
     * Extracts the trimmed document id from a line that starts with {@code <DOCID>}.
     * Everything up to the last {@code </DOCID>} is used, or the rest of the line when the
     * closing tag is absent.
     *
     * @throws IllegalStateException if the resulting id is empty
     */
    private static String extractDocId(String line) {
        int start = DOCID_OPEN.length();
        int close = line.lastIndexOf(DOCID_CLOSE);
        String raw = close >= start ? line.substring(start, close) : line.substring(start);
        String id = raw.trim();
        if (id.isEmpty()) {
            throw new IllegalStateException("Missing document id in line: " + line);
        }
        return id;
    }

    /**
     * Ingests the temporary file named {@code docId}, writes the serialized communication
     * into {@code outputDir}, and always removes the temporary file afterwards.
     */
    private static void convertDocument(WebPostIngester ingester,
            CompactCommunicationSerializer serializer, String docId, String outputDir)
            throws Exception {
        try {
            Communication comm = ingester.fromCharacterBasedFile(Paths.get(docId));
            try (OutputStream out =
                    Files.newOutputStream(Paths.get(outputDir, comm.getId() + ".comm"))) {
                out.write(serializer.toBytes(comm));
            }
            LOGGER.info("Processed file {}", comm.getId());
        } catch (IngestException e) {
            // A single bad document must not abort the whole run.
            LOGGER.error("Error processing communication {}", docId, e);
        } finally {
            Files.deleteIfExists(Paths.get(docId));
        }
    }
}
