package de.digitalcollections.solrocr.formats.alto;

import de.digitalcollections.solrocr.formats.OcrPassageFormatter;
import de.digitalcollections.solrocr.iter.IterableCharSequence;
import de.digitalcollections.solrocr.iter.TagBreakIterator;
import de.digitalcollections.solrocr.model.OcrBox;
import de.digitalcollections.solrocr.model.OcrPage;
import java.awt.Dimension;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.text.StringEscapeUtils;
import org.apache.lucene.search.uhighlight.Passage;

/* loaded from: input_file:de/digitalcollections/solrocr/formats/alto/AltoPassageFormatter.class */
/**
 * Passage formatter for OCR text stored as ALTO markup.
 *
 * <p>The class works on raw markup with regular expressions: {@code <Page>} tags yield
 * {@link OcrPage} instances, {@code <String>} tags yield the plain text and the {@link OcrBox}
 * word boxes of a fragment.
 *
 * <p>While parsing, the configured highlight tags are swapped for the {@link #START_HL} and
 * {@link #END_HL} placeholders and swapped back afterwards, presumably so that markup inside the
 * tags themselves cannot confuse the tag and attribute patterns. All of these swaps are literal
 * (non-regex) replacements.
 */
public class AltoPassageFormatter extends OcrPassageFormatter {
    /** Placeholder standing in for the highlight start tag while markup is parsed. */
    private static final String START_HL = "@@STARTHLTAG@@";
    /** Placeholder standing in for the highlight end tag while markup is parsed. */
    private static final String END_HL = "@@ENDHLTAG@@";
    /** Number of characters following a page break that are searched for the Page tag. */
    private static final int PAGE_TAG_SEARCH_WINDOW = 512;
    /** Matches an opening {@code <Page>} tag and captures its attribute string. */
    private static final Pattern pagePat = Pattern.compile("<Page ?(?<attribs>.+?)/?>");
    /** Matches a {@code <String>} tag and captures its attribute string. */
    private static final Pattern wordPat = Pattern.compile("<String ?(?<attribs>.+?)/?>");
    /**
     * Matches one {@code KEY="value"} pair. The value may be empty; requiring at least one
     * character would make an empty value swallow the following attribute's name.
     */
    private static final Pattern attribPat = Pattern.compile("(?<key>[A-Z_]+?)=\"(?<val>.*?)\"");
    /** Matches a closing quote followed by the next attribute name or the end of the tag. */
    private static final Pattern postContentPat = Pattern.compile("[\"']\\s*(\\w|/?>)");
    /** Break iterator constructed for the {@code Page} tag, used to locate the start page. */
    private final TagBreakIterator pageIter;

    /**
     * Creates a formatter. All arguments are forwarded unchanged to the superclass constructor.
     */
    /* JADX INFO: Access modifiers changed from: protected */
    public AltoPassageFormatter(String str, String str2, boolean z, boolean z2) {
        super(str, str2, z, z2);
        this.pageIter = new TagBreakIterator("Page");
    }

    /**
     * Parses the attribute string of a tag into a key/value map.
     *
     * @param attribString the text between the tag name and the closing bracket
     * @return every {@code KEY="value"} pair found; values are returned as written (not unescaped)
     */
    private Map<String, String> parseAttribs(String attribString) {
        Map<String, String> attribs = new HashMap<>();
        Matcher matcher = attribPat.matcher(attribString);
        while (matcher.find()) {
            attribs.put(matcher.group("key"), matcher.group("val"));
        }
        return attribs;
    }

    /**
     * Builds a page from the attributes of a {@code <Page>} tag.
     *
     * @param map attributes of the tag; {@code ID}, {@code WIDTH} and {@code HEIGHT} are read
     * @return a page whose dimension is {@code null} when width or height is missing or unparsable
     */
    private OcrPage parsePage(Map<String, String> map) {
        Dimension dimension = null;
        if (map.containsKey("WIDTH") && map.containsKey("HEIGHT")) {
            try {
                int width = (int) Double.parseDouble(map.get("WIDTH"));
                int height = (int) Double.parseDouble(map.get("HEIGHT"));
                dimension = new Dimension(width, height);
            } catch (NumberFormatException ignored) {
                // Best effort: a page with unparsable dimensions is reported without them.
            }
        }
        return new OcrPage(map.get("ID"), dimension);
    }

    /**
     * Determines the page on which the given offset lies.
     *
     * @param str not used by this implementation
     * @param i offset handed to the page break iterator
     * @param iterableCharSequence the OCR document
     * @return the page parsed from the first Page tag found within
     *     {@link #PAGE_TAG_SEARCH_WINDOW} characters of the preceding page break, or {@code null}
     *     when no such tag is found
     */
    @Override // de.digitalcollections.solrocr.formats.OcrPassageFormatter
    public OcrPage determineStartPage(String str, int i, IterableCharSequence iterableCharSequence) {
        this.pageIter.setText(iterableCharSequence);
        // NOTE(review): a negative result from preceding() (no earlier break) is not handled here;
        // confirm against TagBreakIterator whether that can happen.
        int pageStart = this.pageIter.preceding(i);
        int windowEnd = Math.min(pageStart + PAGE_TAG_SEARCH_WINDOW, iterableCharSequence.length());
        Matcher matcher = pagePat.matcher(iterableCharSequence.subSequence(pageStart, windowEnd).toString());
        if (matcher.find()) {
            return parsePage(parseAttribs(matcher.group("attribs")));
        }
        return null;
    }

    /**
     * Finds all Page tags in a fragment.
     *
     * @param str the markup fragment
     * @return the pages keyed by the offset of their tag within {@code str}
     */
    @Override // de.digitalcollections.solrocr.formats.OcrPassageFormatter
    protected TreeMap<Integer, OcrPage> parsePages(String str) {
        TreeMap<Integer, OcrPage> pages = new TreeMap<>();
        Matcher matcher = pagePat.matcher(str);
        while (matcher.find()) {
            pages.put(matcher.start(), parsePage(parseAttribs(matcher.group("attribs"))));
        }
        return pages;
    }

    /**
     * Reduces a markup fragment to its plain text, keeping the highlight tags.
     *
     * <p>Hyphenation is resolved as follows: a {@code HypPart1} word is rendered with its
     * {@code SUBS_CONTENT} when another String tag follows in the fragment and with its
     * {@code CONTENT} otherwise; a {@code HypPart2} word is rendered only when it is the first
     * word of the fragment.
     *
     * @param str the markup fragment, possibly containing highlight tags
     * @return the whitespace-normalised, XML-unescaped text
     */
    @Override // de.digitalcollections.solrocr.formats.OcrPassageFormatter
    protected String getTextFromXml(String str) {
        // Highlight tags are arbitrary configured strings, so they are replaced literally rather
        // than being interpreted as regular expressions.
        StringBuilder sb = new StringBuilder(
            str.replace(this.startHlTag, START_HL)
                .replace(this.endHlTag, END_HL)
                .replaceAll("<SP.*?>", " ")
                .replaceAll("(</?)?TextLine.*?>", " ")
                .replaceAll("(?s)<Description>.+?</Description>", ""));
        boolean isFirstWord = true;
        while (true) {
            // The buffer changes on every pass, so matching restarts from the beginning.
            Matcher matcher = wordPat.matcher(sb);
            if (!matcher.find()) {
                break;
            }
            int start = matcher.start();
            int end = matcher.end();
            Map<String, String> attribs = parseAttribs(matcher.group("attribs"));
            String subsType = attribs.get("SUBS_TYPE");
            String content;
            if ("HypPart1".equals(subsType)) {
                // The second find() only checks whether another word follows in the fragment.
                content = matcher.find() ? attribs.get("SUBS_CONTENT") : attribs.get("CONTENT");
            } else if ("HypPart2".equals(subsType)) {
                content = isFirstWord ? attribs.get("CONTENT") : "";
            } else {
                content = attribs.get("CONTENT");
            }
            if (content == null) {
                // A missing attribute must not break the replacement below: fall back to the
                // plain content and, failing that, to nothing.
                content = attribs.getOrDefault("CONTENT", "");
            }
            sb.replace(start, end, content);
            isFirstWord = false;
        }
        String withoutTags = sb.toString().replaceAll("</?[A-Z]?.*?>", "");
        return StringEscapeUtils.unescapeXml(withoutTags)
            .replaceAll("\n", "")
            .replaceAll("\\s+", " ")
            .trim()
            // Literal replacement: '$' or '\' inside a highlight tag must stay untouched.
            .replace(START_HL, this.startHlTag)
            .replace(END_HL, this.endHlTag);
    }

    /**
     * Returns the markup covered by a passage with highlight tags inserted around every merged
     * match.
     *
     * <p>When {@code alignSpans} is set, the start tag is placed right after the opening quote of
     * the last {@code CONTENT="} preceding the match, and the end tag is moved forward to the next
     * closing quote that is followed by another attribute or the end of the tag.
     *
     * @param passage the passage to render
     * @param iterableCharSequence the OCR document the passage offsets refer to
     * @return the passage markup including highlight tags
     */
    @Override // de.digitalcollections.solrocr.formats.OcrPassageFormatter
    protected String getHighlightedFragment(Passage passage, IterableCharSequence iterableCharSequence) {
        int passageStart = passage.getStartOffset();
        StringBuilder sb = new StringBuilder(iterableCharSequence.subSequence(passageStart, passage.getEndOffset()));
        // Number of characters added by the highlight tags inserted so far.
        int inserted = 0;
        if (passage.getNumMatches() > 0) {
            for (OcrPassageFormatter.PassageMatch match : mergeMatches(passage.getNumMatches(), passage.getMatchStarts(), passage.getMatchEnds())) {
                String preMatch = iterableCharSequence.subSequence(passageStart, match.start).toString();
                int hlStart = preMatch.length();
                if (this.alignSpans) {
                    // NOTE(review): this also matches the tail of "SUBS_CONTENT="; confirm that
                    // CONTENT is always the later of the two attributes.
                    int contentIdx = preMatch.lastIndexOf("CONTENT=");
                    if (contentIdx >= 0) {
                        // Skip past CONTENT=" (nine characters) to the first character of the value.
                        hlStart = contentIdx + 9;
                    }
                    // Without a preceding CONTENT attribute the unaligned offset is kept.
                }
                sb.insert(hlStart + inserted, this.startHlTag);
                inserted += this.startHlTag.length();

                // NOTE(review): the length is taken from the extracted string rather than from
                // offset arithmetic; presumably offsets need not equal character counts - confirm.
                int matchLength = iterableCharSequence.subSequence(passageStart, match.end).toString().length();
                int hlEnd = Math.min(matchLength + inserted, sb.length());
                if (this.alignSpans && hlEnd != sb.length()) {
                    Matcher matcher = postContentPat.matcher(sb.substring(hlEnd));
                    if (matcher.find()) {
                        hlEnd += matcher.start();
                    }
                }
                sb.insert(hlEnd, this.endHlTag);
                inserted += this.endHlTag.length();
            }
        }
        return sb.toString();
    }

    /**
     * Parses the String tags of a fragment into word boxes.
     *
     * <p>Boxes belonging to the same highlighted span share a random {@link UUID}; boxes outside
     * of a highlight carry {@code null}. A {@code HypPart1} word gets a trailing hyphen appended
     * to its text, and every word with a {@code SUBS_TYPE} has its hyphenation info set.
     *
     * @param str the markup fragment, possibly containing highlight tags
     * @param treeMap pages keyed by their offset in {@code str}; the closest page at or before a
     *     word supplies the word's page id
     * @param str2 page id used for words that are not preceded by any page in {@code treeMap}
     * @return the word boxes in document order
     */
    @Override // de.digitalcollections.solrocr.formats.OcrPassageFormatter
    protected List<OcrBox> parseWords(String str, TreeMap<Integer, OcrPage> treeMap, String str2) {
        // Literal replacement, see getTextFromXml.
        String fragment = str.replace(this.startHlTag, START_HL).replace(this.endHlTag, END_HL);
        List<OcrBox> boxes = new ArrayList<>();
        Matcher matcher = wordPat.matcher(fragment);
        UUID currentHighlight = null;
        // Set once a highlight was seen to end inside the dehyphenated form of a HypPart1 word.
        boolean highlightEndsInHyphenation = false;
        while (matcher.find()) {
            String pageId = str2;
            Map.Entry<Integer, OcrPage> pageEntry = treeMap.floorEntry(matcher.start());
            if (pageEntry != null) {
                pageId = pageEntry.getValue().id;
            }

            Map<String, String> attribs = parseAttribs(matcher.group("attribs"));
            int x = (int) Double.parseDouble(attribs.get("HPOS"));
            int y = (int) Double.parseDouble(attribs.get("VPOS"));
            int width = (int) Double.parseDouble(attribs.get("WIDTH"));
            int height = (int) Double.parseDouble(attribs.get("HEIGHT"));
            String subsType = attribs.get("SUBS_TYPE");
            // A hyphenated word without SUBS_CONTENT is treated as having an empty one everywhere
            // below, instead of only for the highlight-start check.
            String subsContent = attribs.getOrDefault("SUBS_CONTENT", "");
            String text = StringEscapeUtils.unescapeXml(attribs.get("CONTENT"));
            boolean isHyphenStart = "HypPart1".equals(subsType);
            if (isHyphenStart) {
                text = text + "-";
            }

            if (text.contains(START_HL) || subsContent.contains(START_HL)) {
                currentHighlight = UUID.randomUUID();
            }
            OcrBox box = new OcrBox(
                text.replace(START_HL, this.startHlTag).replace(END_HL, this.endHlTag),
                pageId, x, y, x + width, y + height, currentHighlight);
            if (subsType != null) {
                box.setHyphenInfo(
                    isHyphenStart,
                    subsContent.replace(START_HL, this.startHlTag).replace(END_HL, this.endHlTag));
            }
            boxes.add(box);

            if (currentHighlight == null || subsType == null) {
                // The highlight ends inside this word or directly after its tag.
                if (text.contains(END_HL) || fragment.startsWith(END_HL, matcher.end())) {
                    currentHighlight = null;
                }
            } else if (isHyphenStart && subsContent.contains(END_HL)) {
                // Keep the highlight open so that the next hyphenated word still belongs to it.
                highlightEndsInHyphenation = true;
            } else if (highlightEndsInHyphenation) {
                highlightEndsInHyphenation = false;
                currentHighlight = null;
            }
        }
        return boxes;
    }
}
