package org.nasdanika.rag.core;

import java.lang.invoke.MethodHandles;
import java.lang.invoke.MethodType;
import java.lang.runtime.ObjectMethods;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.Set;
import java.util.function.Function;
import org.eclipse.emf.ecore.EObject;
import org.nasdanika.models.pdf.Document;
import org.nasdanika.models.pdf.Paragraph;

/* loaded from: input_file:org/nasdanika/rag/core/PdfTextSplitter.class */
public class PdfTextSplitter {
    // Upper bound on the number of tokens in a chunk; ChunkImpl.add(...) throws once it is exceeded.
    private int size;
    // Token budget used by the ChunkImpl constructor when collecting overlap from preceding text.
    private int overlap;
    // Slack, in tokens: a chunk is treated as full once its size is greater than (size - tolerance),
    // and overlap collection stops once the remaining budget is at or below this value.
    private int tolerance;
    // Splits a string into tokens; the length of the returned list is used as the token count.
    private Function<String, List<String>> tokenizer;

    /* loaded from: input_file:org/nasdanika/rag/core/PdfTextSplitter$Chunk.class */
    /**
     * A piece of document text produced by {@link PdfTextSplitter#split(Document)}.
     */
    public interface Chunk {

        /**
         * @return the number of tokens accumulated in this chunk
         */
        int size();

        /**
         * @return the number of tokens at the start of this chunk that were carried over
         *         from text preceding the chunk
         */
        int overlap();

        /**
         * @return the text of this chunk
         */
        String getText();

        /**
         * @return the model elements the text of this chunk was taken from
         */
        List<EObject> getSources();
    }

    /* loaded from: input_file:org/nasdanika/rag/core/PdfTextSplitter$ChunkImpl.class */
    /**
     * Mutable {@link Chunk} implementation. A new instance first fills itself with overlap
     * text collected backwards from a given position; the splitter then appends new text to it.
     */
    private class ChunkImpl implements Chunk {
        /** Number of tokens accumulated so far. */
        private int size;
        /** Number of tokens contributed by the overlap collected in the constructor. */
        private int chunkOverlap;
        private final StringBuilder textBuilder = new StringBuilder();
        /** Raw list of sources; may hold nulls and duplicates, which {@link #getSources()} filters out. */
        private final List<EObject> sources = new ArrayList<>();
        /** Ids of the paragraph/sentence/word records already added, used to detect duplicates. */
        private final Set<Integer> sourceRecords = new HashSet<>();

        /**
         * Creates a chunk pre-filled with overlap text. Collection walks backwards, starting at the
         * given position, taking whole paragraphs where they fit into the remaining overlap budget,
         * otherwise whole sentences, otherwise whole words, otherwise single tokens. It stops when
         * the remaining budget drops to the tolerance or below, or when the start of the list is reached.
         *
         * @param paragraphs   paragraph records to collect from; may be {@code null} when
         *                     {@code paragraphIdx} is negative
         * @param paragraphIdx index of the last paragraph to consider; a negative value yields an empty chunk
         * @param sentenceIdx  index of the last sentence to consider in that paragraph, or {@code -1}
         *                     to consider the paragraph as a whole
         * @param wordIdx      index of the last word to consider in that sentence, or {@code -1}
         *                     to consider the sentence as a whole
         * @param tokenIdx     index of the last token to consider in that word, or {@code -1}
         *                     to consider the word as a whole
         */
        ChunkImpl(List<ParagraphRecord> paragraphs, int paragraphIdx, int sentenceIdx, int wordIdx, int tokenIdx) {
            int remaining = PdfTextSplitter.this.overlap;
            int tolerance = PdfTextSplitter.this.tolerance;
            // Pieces are collected last-to-first and reversed before being appended.
            List<ChunkImpl> pieces = new ArrayList<>();

            collect:
            for (int p = paragraphIdx; p >= 0; p--) {
                ParagraphRecord paragraph = paragraphs.get(p);
                if (sentenceIdx == -1) {
                    if (paragraph.size() < remaining) {
                        ChunkImpl piece = new ChunkImpl(null, -1, -1, -1, -1);
                        piece.add(paragraph);
                        // NOTE(review): add(ParagraphRecord) already appends a trailing separator,
                        // so this yields a second one - confirm this is intended.
                        piece.add(PdfTextSplitter.this.getParagraphSeparator(), null);
                        pieces.add(piece);
                        remaining -= piece.size();
                        if (remaining <= tolerance) {
                            break collect;
                        }
                        // The loop update moves to the previous paragraph; decrementing here as
                        // well would skip every other paragraph.
                        continue;
                    }
                    // Paragraph does not fit as a whole: descend into its sentences, last first.
                    sentenceIdx = paragraph.sentences().size() - 1;
                }
                for (; sentenceIdx >= 0; sentenceIdx--) {
                    SentenceRecord sentence = paragraph.sentences().get(sentenceIdx);
                    if (wordIdx == -1) {
                        if (sentence.size() < remaining) {
                            ChunkImpl piece = new ChunkImpl(null, -1, -1, -1, -1);
                            piece.add(sentence);
                            pieces.add(piece);
                            remaining -= piece.size();
                            if (remaining <= tolerance) {
                                break collect;
                            }
                            continue;
                        }
                        // Sentence does not fit as a whole: descend into its words, last first.
                        wordIdx = sentence.words().size() - 1;
                    }
                    for (; wordIdx >= 0; wordIdx--) {
                        WordRecord word = sentence.words().get(wordIdx);
                        if (tokenIdx == -1) {
                            if (word.tokens().size() < remaining) {
                                ChunkImpl piece = new ChunkImpl(null, -1, -1, -1, -1);
                                piece.add(word);
                                piece.add(PdfTextSplitter.this.getWordSeparator(), word.paragraph());
                                pieces.add(piece);
                                remaining -= piece.size();
                                if (remaining <= tolerance) {
                                    break collect;
                                }
                                continue;
                            }
                            // Word does not fit as a whole: descend into its tokens, last first.
                            tokenIdx = word.tokens().size() - 1;
                        }
                        for (; tokenIdx >= 0; tokenIdx--) {
                            ChunkImpl piece = new ChunkImpl(null, -1, -1, -1, -1);
                            piece.add(word.tokens().get(tokenIdx), word.paragraph());
                            pieces.add(piece);
                            remaining -= piece.size();
                            if (remaining <= tolerance) {
                                break collect;
                            }
                        }
                        // tokenIdx is -1 here, so the previous word is considered as a whole.
                    }
                    // wordIdx is -1 here, so the previous sentence is considered as a whole.
                }
                // sentenceIdx is -1 here, so the previous paragraph is considered as a whole.
            }

            Collections.reverse(pieces);
            for (ChunkImpl piece : pieces) {
                add(piece);
            }
            // Everything accumulated so far is overlap.
            this.chunkOverlap = this.size;
        }

        @Override
        public int size() {
            return this.size;
        }

        @Override
        public String getText() {
            return this.textBuilder.toString();
        }

        /**
         * Appends text with a known token count.
         *
         * @param str     text to append
         * @param i       number of tokens in {@code str}
         * @param eObject source of the text, may be {@code null}
         * @throws IllegalStateException if the chunk would exceed the splitter's size limit;
         *                               the chunk is left unchanged in that case
         */
        void add(String str, int i, EObject eObject) {
            int newSize = this.size + i;
            // Check before mutating so that a rejected addition does not corrupt the chunk.
            if (newSize > PdfTextSplitter.this.size) {
                throw new IllegalStateException("Chunk size exceeded: " + newSize);
            }
            this.textBuilder.append(str);
            this.size = newSize;
            this.sources.add(eObject);
        }

        /**
         * Appends text, using the splitter's tokenizer to count its tokens.
         *
         * @param str     text to append
         * @param eObject source of the text, may be {@code null}
         */
        void add(String str, EObject eObject) {
            add(str, PdfTextSplitter.this.tokenizer.apply(str).size(), eObject);
        }

        /**
         * Appends a whole paragraph followed by a paragraph separator; a separator is also
         * inserted in front of it when the chunk is not empty.
         *
         * @throws IllegalStateException if the paragraph was already added or the size limit is exceeded
         */
        void add(ParagraphRecord paragraphRecord) {
            if (!this.sourceRecords.add(paragraphRecord.id())) {
                throw new IllegalStateException("Duplicate source paragraph: " + paragraphRecord);
            }
            if (this.size > 0) {
                add(PdfTextSplitter.this.getParagraphSeparator(), paragraphRecord.paragraph());
            }
            add(paragraphRecord.text(), paragraphRecord.size(), paragraphRecord.paragraph());
            add(PdfTextSplitter.this.getParagraphSeparator(), paragraphRecord.paragraph());
        }

        /**
         * Appends a whole sentence without any separator.
         *
         * @throws IllegalStateException if the sentence was already added or the size limit is exceeded
         */
        void add(SentenceRecord sentenceRecord) {
            if (!this.sourceRecords.add(sentenceRecord.id())) {
                throw new IllegalStateException("Duplicate source sentence: " + sentenceRecord);
            }
            add(sentenceRecord.text(), sentenceRecord.size(), sentenceRecord.paragraph());
        }

        /**
         * Appends a whole word, preceded by a word separator when the chunk is not empty.
         *
         * @throws IllegalStateException if the word was already added or the size limit is exceeded
         */
        void add(WordRecord wordRecord) {
            if (!this.sourceRecords.add(wordRecord.id())) {
                throw new IllegalStateException("Duplicate source word: " + wordRecord);
            }
            if (this.size > 0) {
                add(PdfTextSplitter.this.getWordSeparator(), null);
            }
            add(wordRecord.text(), wordRecord.tokens().size(), wordRecord.paragraph());
        }

        /**
         * @return {@code true} once the chunk holds more than {@code size - tolerance} tokens
         */
        boolean isFull() {
            return this.size > PdfTextSplitter.this.size - PdfTextSplitter.this.tolerance;
        }

        /**
         * Appends the text, sources and source record ids of another chunk.
         *
         * @throws IllegalStateException if the other chunk shares a source record with this one
         *                               or the size limit is exceeded
         */
        void add(ChunkImpl chunkImpl) {
            add(chunkImpl.getText(), chunkImpl.size(), null);
            this.sources.addAll(chunkImpl.getSources());
            for (Integer recordId : chunkImpl.sourceRecords) {
                if (!this.sourceRecords.add(recordId)) {
                    throw new IllegalStateException("Duplicate source record in chunk: " + chunkImpl);
                }
            }
        }

        /**
         * @return the distinct, non-null sources in the order they were first added
         */
        @Override
        public List<EObject> getSources() {
            return this.sources.stream().filter(Objects::nonNull).distinct().toList();
        }

        @Override
        public int overlap() {
            return this.chunkOverlap;
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/nasdanika/rag/core/PdfTextSplitter$ParagraphRecord.class */
    /**
     * A paragraph of the document together with its pre-computed token count and sentences.
     * Declared as a real record: source code cannot extend {@code java.lang.Record} directly,
     * and the compiler supplies {@code equals}, {@code hashCode}, {@code toString} and the accessors.
     *
     * @param id        identifier unique among all paragraph, sentence and word records of one split
     * @param text      paragraph text
     * @param size      number of tokens in {@code text}
     * @param paragraph the model element the text was taken from
     * @param sentences the sentences of {@code text}, in order
     */
    public record ParagraphRecord(int id, String text, int size, Paragraph paragraph, List<SentenceRecord> sentences) {
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/nasdanika/rag/core/PdfTextSplitter$SentenceRecord.class */
    /**
     * A sentence of a paragraph together with its pre-computed token count and words.
     * Declared as a real record: source code cannot extend {@code java.lang.Record} directly,
     * and the compiler supplies {@code equals}, {@code hashCode}, {@code toString} and the accessors.
     *
     * @param id        identifier unique among all paragraph, sentence and word records of one split
     * @param text      sentence text
     * @param size      number of tokens in {@code text}
     * @param paragraph the model element of the paragraph containing the sentence
     * @param words     the words of {@code text}, in order
     */
    public record SentenceRecord(int id, String text, int size, Paragraph paragraph, List<WordRecord> words) {
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/nasdanika/rag/core/PdfTextSplitter$WordRecord.class */
    public static final class WordRecord extends Record {
        private final int id;
        private final String text;
        private final List<String> tokens;
        private final Paragraph paragraph;

        private WordRecord(int i, String str, List<String> list, Paragraph paragraph) {
            this.id = i;
            this.text = str;
            this.tokens = list;
            this.paragraph = paragraph;
        }

        @Override // java.lang.Record
        public final String toString() {
            return (String) ObjectMethods.bootstrap(MethodHandles.lookup(), "toString", MethodType.methodType(String.class, WordRecord.class), WordRecord.class, "id;text;tokens;paragraph", "FIELD:Lorg/nasdanika/rag/core/PdfTextSplitter$WordRecord;->id:I", "FIELD:Lorg/nasdanika/rag/core/PdfTextSplitter$WordRecord;->text:Ljava/lang/String;", "FIELD:Lorg/nasdanika/rag/core/PdfTextSplitter$WordRecord;->tokens:Ljava/util/List;", "FIELD:Lorg/nasdanika/rag/core/PdfTextSplitter$WordRecord;->paragraph:Lorg/nasdanika/models/pdf/Paragraph;").dynamicInvoker().invoke(this) /* invoke-custom */;
        }

        @Override // java.lang.Record
        public final int hashCode() {
            return (int) ObjectMethods.bootstrap(MethodHandles.lookup(), "hashCode", MethodType.methodType(Integer.TYPE, WordRecord.class), WordRecord.class, "id;text;tokens;paragraph", "FIELD:Lorg/nasdanika/rag/core/PdfTextSplitter$WordRecord;->id:I", "FIELD:Lorg/nasdanika/rag/core/PdfTextSplitter$WordRecord;->text:Ljava/lang/String;", "FIELD:Lorg/nasdanika/rag/core/PdfTextSplitter$WordRecord;->tokens:Ljava/util/List;", "FIELD:Lorg/nasdanika/rag/core/PdfTextSplitter$WordRecord;->paragraph:Lorg/nasdanika/models/pdf/Paragraph;").dynamicInvoker().invoke(this) /* invoke-custom */;
        }

        @Override // java.lang.Record
        public final boolean equals(Object obj) {
            return (boolean) ObjectMethods.bootstrap(MethodHandles.lookup(), "equals", MethodType.methodType(Boolean.TYPE, WordRecord.class, Object.class), WordRecord.class, "id;text;tokens;paragraph", "FIELD:Lorg/nasdanika/rag/core/PdfTextSplitter$WordRecord;->id:I", "FIELD:Lorg/nasdanika/rag/core/PdfTextSplitter$WordRecord;->text:Ljava/lang/String;", "FIELD:Lorg/nasdanika/rag/core/PdfTextSplitter$WordRecord;->tokens:Ljava/util/List;", "FIELD:Lorg/nasdanika/rag/core/PdfTextSplitter$WordRecord;->paragraph:Lorg/nasdanika/models/pdf/Paragraph;").dynamicInvoker().invoke(this, obj) /* invoke-custom */;
        }

        public int id() {
            return this.id;
        }

        public String text() {
            return this.text;
        }

        public List<String> tokens() {
            return this.tokens;
        }

        public Paragraph paragraph() {
            return this.paragraph;
        }
    }

    /**
     * Creates a splitter.
     *
     * @param size      maximum number of tokens in a chunk
     * @param overlap   token budget for text repeated from before the start of a new chunk
     * @param tolerance slack in tokens; a chunk counts as full once it holds more than
     *                  {@code size - tolerance} tokens
     * @param tokenizer splits text into tokens; the size of the returned list is the token count
     * @throws NullPointerException if {@code tokenizer} is {@code null}
     */
    public PdfTextSplitter(int size, int overlap, int tolerance, Function<String, List<String>> tokenizer) {
        this.size = size;
        this.overlap = overlap;
        this.tolerance = tolerance;
        // Fail here rather than with an anonymous NullPointerException in the middle of split().
        this.tokenizer = Objects.requireNonNull(tokenizer, "tokenizer");
    }

    /**
     * Splits text into sentences using the US-locale sentence {@link BreakIterator}.
     * Each returned element is the substring between two consecutive boundaries, so
     * concatenating the elements reproduces the input.
     *
     * @param str text to split
     * @return the sentences in order; empty if the iterator reports no boundary after the first
     */
    protected List<String> splitIntoSentences(String str) {
        BreakIterator boundaries = BreakIterator.getSentenceInstance(Locale.US);
        boundaries.setText(str);

        List<String> sentences = new ArrayList<>();
        int start = boundaries.first();
        for (int end = boundaries.next(); end != BreakIterator.DONE; end = boundaries.next()) {
            sentences.add(str.substring(start, end));
            start = end;
        }
        return sentences;
    }

    /**
     * Splits text into words on runs of whitespace.
     * Empty strings are dropped: {@code String.split} returns one for input that starts with
     * whitespace and for the empty string, and an empty "word" carries no text.
     *
     * @param str text to split
     * @return the non-empty words in order, as an unmodifiable list; empty for blank input
     */
    protected List<String> splitIntoWords(String str) {
        return Arrays.stream(str.split("\\s+"))
                .filter(word -> !word.isEmpty())
                .toList();
    }

    /**
     * @return the string placed between words: a single space
     */
    protected String getWordSeparator() {
        final String singleSpace = " ";
        return singleSpace;
    }

    /**
     * @return the string placed between lines: the platform line separator
     */
    protected String getLineSeparator() {
        final String platformSeparator = System.lineSeparator();
        return platformSeparator;
    }

    /**
     * @return the string placed between paragraphs: two consecutive line separators
     */
    protected String getParagraphSeparator() {
        StringBuilder separator = new StringBuilder();
        separator.append(getLineSeparator());
        separator.append(getLineSeparator());
        return separator.toString();
    }

    /**
     * Splits the text of a document into chunks of at most {@code size} tokens.
     * Paragraphs are taken from all articles of all pages, in order. A paragraph is added to the
     * current chunk as a whole if it fits; otherwise it is added sentence by sentence, a sentence
     * that does not fit is added word by word, and a word that does not fit is added token by token.
     * Whenever a unit does not fit and the current chunk is full, a new chunk is started, pre-filled
     * with overlap text taken from immediately before that unit.
     *
     * @param document the document to split
     * @return the chunks in document order; the list always contains at least one chunk
     * @throws IllegalStateException if a chunk would exceed the size limit
     */
    public List<Chunk> split(Document document) {
        // Shared counter so that paragraph, sentence and word records all get distinct ids.
        int[] nextId = {0};
        List<ParagraphRecord> paragraphs = document.getPages().stream()
                .flatMap(page -> page.getArticles().stream())
                .flatMap(article -> article.getParagraphs().stream())
                .map(paragraph -> toParagraphRecord(paragraph, nextId))
                .toList();

        // Separator sizes do not change during a split, so tokenize the separators once.
        int paragraphSeparatorSize = this.tokenizer.apply(getParagraphSeparator()).size();
        int wordSeparatorSize = this.tokenizer.apply(getWordSeparator()).size();

        List<Chunk> chunks = new ArrayList<>();
        ChunkImpl current = new ChunkImpl(paragraphs, -1, -1, -1, -1);
        chunks.add(current);

        for (int p = 0; p < paragraphs.size(); p++) {
            ParagraphRecord paragraph = paragraphs.get(p);
            if (paragraph.size() + current.size() + paragraphSeparatorSize < this.size) {
                current.add(paragraph);
                continue;
            }
            if (current.isFull()) {
                current = startChunk(chunks, paragraphs, p, 0, 0);
                // Same condition as above: add(ParagraphRecord) appends separators too, so they
                // have to be part of the check or the addition may exceed the size limit.
                if (paragraph.size() + current.size() + paragraphSeparatorSize < this.size) {
                    current.add(paragraph);
                    // The whole paragraph is in; do not add its sentences a second time.
                    continue;
                }
            }

            List<SentenceRecord> sentences = paragraph.sentences();
            for (int s = 0; s < sentences.size(); s++) {
                SentenceRecord sentence = sentences.get(s);
                if (sentence.size() + current.size() < this.size) {
                    current.add(sentence);
                    continue;
                }
                if (current.isFull()) {
                    current = startChunk(chunks, paragraphs, p, s, 0);
                    if (sentence.size() + current.size() < this.size) {
                        current.add(sentence);
                        // The whole sentence is in; do not add its words a second time.
                        continue;
                    }
                }

                List<WordRecord> words = sentence.words();
                for (int w = 0; w < words.size(); w++) {
                    WordRecord word = words.get(w);
                    int wordSize = word.tokens().size();
                    if (wordSize + current.size() + wordSeparatorSize < this.size) {
                        current.add(word);
                        current.add(getWordSeparator(), word.paragraph());
                        continue;
                    }
                    if (current.isFull()) {
                        current = startChunk(chunks, paragraphs, p, s, w);
                        if (wordSize + current.size() + wordSeparatorSize < this.size) {
                            current.add(word);
                            current.add(getWordSeparator(), word.paragraph());
                            // The whole word is in; do not add its tokens a second time.
                            continue;
                        }
                    }

                    // Last resort: add the word token by token, rolling over as chunks fill up.
                    int t = 0;
                    for (String token : word.tokens()) {
                        current.add(token, 1, word.paragraph());
                        if (current.isFull()) {
                            // Overlap for the next chunk ends with the token just added.
                            current = new ChunkImpl(paragraphs, p, s, w, t);
                            chunks.add(current);
                        }
                        // Advance unconditionally so the index stays in step with the iteration.
                        t++;
                    }
                }
            }
        }
        return chunks;
    }

    /** Builds the record for one paragraph, assigning ids to it and to its sentences and words. */
    private ParagraphRecord toParagraphRecord(Paragraph paragraph, int[] nextId) {
        String text = paragraph.getText(getLineSeparator(), getWordSeparator());
        int id = nextId[0]++;
        int tokenCount = this.tokenizer.apply(text).size();
        List<SentenceRecord> sentences = splitIntoSentences(text).stream()
                .map(sentence -> toSentenceRecord(sentence, paragraph, nextId))
                .toList();
        return new ParagraphRecord(id, text, tokenCount, paragraph, sentences);
    }

    /** Builds the record for one sentence, assigning ids to it and to its words. */
    private SentenceRecord toSentenceRecord(String sentence, Paragraph paragraph, int[] nextId) {
        int id = nextId[0]++;
        int tokenCount = this.tokenizer.apply(sentence).size();
        List<WordRecord> words = splitIntoWords(sentence).stream()
                .map(word -> new WordRecord(nextId[0]++, word, this.tokenizer.apply(word), paragraph))
                .toList();
        return new SentenceRecord(id, sentence, tokenCount, paragraph, words);
    }

    /**
     * Starts a new chunk whose overlap ends immediately before the given position and appends it
     * to {@code chunks}. The ChunkImpl constructor reads an index of -1 as "consider the enclosing
     * unit as a whole", so a position at index 0 must step back to the preceding unit instead of
     * passing {@code index - 1}.
     */
    private ChunkImpl startChunk(List<Chunk> chunks, List<ParagraphRecord> paragraphs, int paragraphIdx, int sentenceIdx, int wordIdx) {
        ChunkImpl chunk;
        if (wordIdx > 0) {
            chunk = new ChunkImpl(paragraphs, paragraphIdx, sentenceIdx, wordIdx - 1, -1);
        } else if (sentenceIdx > 0) {
            chunk = new ChunkImpl(paragraphs, paragraphIdx, sentenceIdx - 1, -1, -1);
        } else {
            chunk = new ChunkImpl(paragraphs, paragraphIdx - 1, -1, -1, -1);
        }
        chunks.add(chunk);
        return chunk;
    }
}
