package com.gengoai.hermes.zh;

import com.gengoai.apollo.data.observation.Observation;
import com.gengoai.apollo.data.observation.Variable;
import com.gengoai.apollo.data.observation.VariableCollectionSequence;
import com.gengoai.apollo.data.observation.VariableList;
import com.gengoai.apollo.feature.ObservationExtractor;
import com.gengoai.hermes.HString;
import com.gengoai.hermes.lexicon.WordList;
import com.gengoai.string.CharMatcher;
import com.gengoai.string.Strings;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import lombok.NonNull;

/* loaded from: input_file:com/gengoai/hermes/zh/ZHSegmentationExtractor.class */
/**
 * Builds per-character observations from an {@link HString}, apparently for Chinese word
 * segmentation (judging by the class name and package).
 *
 * <p>For every character position the extractor looks at a window of {@code windowSize}
 * characters on each side, pads positions outside the text with {@code "BOS"} / {@code "EOS"},
 * assigns each character a coarse type code (see {@link #charType(String)}) and emits
 * {@code char[...]} / {@code type[...]} variables into one {@link VariableList} per position.
 */
public class ZHSegmentationExtractor implements ObservationExtractor<HString> {

    /**
     * Characters that count as digits: for each of 1-9 the formal (financial) numeral, the
     * common numeral and the ASCII digit; then the zero forms and the magnitude characters.
     * The "nine" row uses U+4E5D; the look-alike-sounding U+4E45 ("long time") is not a numeral
     * and is deliberately not part of this set.
     */
    private static final Set<Character> zhDigits = Set.of(
            '\u58F9', '\u4E00', '1',            // 壹 一 1
            '\u8D30', '\u4E8C', '2',            // 贰 二 2
            '\u53C1', '\u4E09', '3',            // 叁 三 3
            '\u8086', '\u56DB', '4',            // 肆 四 4
            '\u4F0D', '\u4E94', '5',            // 伍 五 5
            '\u9646', '\u516D', '6',            // 陆 六 6
            '\u67D2', '\u4E03', '7',            // 柒 七 7
            '\u634C', '\u516B', '8',            // 捌 八 8
            '\u7396', '\u4E5D', '9',            // 玖 九 9
            '\u96F6', '\u3007', '0',            // 零 〇 0
            '\u62FE', '\u5341',                 // 拾 十
            '\u4F70', '\u767E',                 // 佰 百
            '\u4EDF', '\u5343',                 // 仟 千
            '\u4E07', '\u4EBF', '\u5146');      // 万 亿 兆

    /** Strings whose character type is reported as {@code "D"}. */
    private static final Set<String> dates = Set.of("月", "日电", "日", "日晚", "年");

    /** Strings whose character type is reported as {@code "C"}. */
    private static final Set<String> zhCounters = Set.of("个");

    /** Word list supplied at construction; no method of this class reads it. */
    @NonNull
    private final WordList dictionary;

    /** Number of context characters taken on each side of the current character. */
    private final int windowSize;

    /**
     * Creates an extractor.
     *
     * @param wordList the word list to keep as this extractor's dictionary
     * @param i        the window size, i.e. how many characters of context are collected on
     *                 each side of the current character
     */
    public ZHSegmentationExtractor(WordList wordList, int i) {
        this.dictionary = wordList;
        this.windowSize = i;
    }

    /**
     * Tells whether {@code str} is a number, either according to {@code Strings.isDigit} or
     * because every one of its characters is contained in {@link #zhDigits}.
     *
     * @param str the text to test
     * @return {@code true} if the text is made up of digits only
     */
    private boolean isDigit(String str) {
        if (Strings.isDigit(str)) {
            return true;
        }
        // allMatch over an empty stream is vacuously true and chars() on null throws,
        // so neither input may reach the stream below.
        if (str == null || str.isEmpty()) {
            return false;
        }
        return str.chars().allMatch(c -> zhDigits.contains((char) c));
    }

    /**
     * Maps a string to a one-letter type code. The checks are applied in this order and the
     * first match wins:
     * {@code "#"} digit, {@code "D"} member of {@link #dates}, {@code "C"} member of
     * {@link #zhCounters}, {@code "I"} all ideographic, {@code "L"} alphanumeric,
     * {@code "P"} punctuation, {@code "W"} null or blank, {@code "X"} anything else.
     *
     * @param str the string to classify (callers in this class pass a single character)
     * @return the type code
     */
    private String charType(String str) {
        if (isDigit(str)) {
            return "#";
        }
        if (dates.contains(str)) {
            return "D";
        }
        if (zhCounters.contains(str)) {
            return "C";
        }
        if (CharMatcher.Ideographic.matchesAllOf(str)) {
            return "I";
        }
        if (Strings.isAlphaNumeric(str)) {
            return "L";
        }
        if (Strings.isPunctuation(str)) {
            return "P";
        }
        return Strings.isNullOrBlank(str) ? "W" : "X";
    }

    /**
     * Returns {@code 1.0} when {@code Strings.isPunctuation} accepts {@code str}, else
     * {@code 0.0}. Not called from anywhere inside this class.
     */
    private double isPunct(String str) {
        return Strings.isPunctuation(str) ? 1.0d : 0.0d;
    }

    /**
     * Creates the two n-gram variables (characters and character types) for a set of window
     * slots. Names and values are the comma-joined per-slot labels and per-slot contents.
     *
     * @param iArr    the window slots (indices into the two arrays) that form the n-gram
     * @param strArr  the characters of the window
     * @param strArr2 the type codes of the window, parallel to {@code strArr}
     * @return a two-element list: the character variable followed by the type variable
     */
    private List<Variable> generate(int[] iArr, String[] strArr, String[] strArr2) {
        final String charName = IntStream.of(iArr)
                .mapToObj(slot -> slot <= this.windowSize
                        ? "char[" + (slot - this.windowSize) + "]"
                        : "char[" + (this.windowSize - slot) + "]")
                .collect(Collectors.joining(","));
        final String charValue = IntStream.of(iArr)
                .mapToObj(slot -> strArr[slot])
                .collect(Collectors.joining(","));

        // NOTE(review): unlike the "char[" label, this prefix already carries a '-', so slots
        // left of the centre are labelled e.g. "type[--1]". Looks unintended, but the strings
        // are feature names; they are kept byte-identical so previously trained models keep
        // matching. Confirm before normalising.
        final String typeName = IntStream.of(iArr)
                .mapToObj(slot -> slot <= this.windowSize
                        ? "type[-" + (slot - this.windowSize) + "]"
                        : "type[" + (this.windowSize - slot) + "]")
                .collect(Collectors.joining(","));
        final String typeValue = IntStream.of(iArr)
                .mapToObj(slot -> strArr2[slot])
                .collect(Collectors.joining(","));

        return List.of(Variable.binary(charName, charValue), Variable.binary(typeName, typeValue));
    }

    /**
     * Produces one {@link VariableList} per character of the input and returns them, in text
     * order, as a {@link VariableCollectionSequence}.
     *
     * <p>Each list holds {@code char[0]} / {@code type[0]} for the current character followed
     * by the n-gram variables created by {@link #generate(int[], String[], String[])}.
     *
     * @param hString the text to extract from
     * @return the sequence of per-character variable lists (empty for zero-length input)
     * @throws NullPointerException if {@code hString} is {@code null}
     */
    public Observation extractObservation(@NonNull HString hString) {
        if (hString == null) {
            throw new NullPointerException("input is marked non-null but is null");
        }
        final VariableCollectionSequence sequence = new VariableCollectionSequence();
        final int center = this.windowSize;          // slot of the current character
        final int width = (center * 2) + 1;          // total number of window slots
        final int length = hString.length();

        for (int position = 0; position < length; position++) {
            final String[] chars = new String[width];
            final String[] types = new String[width];

            // Fill the window; slots that fall outside the text get boundary markers.
            for (int slot = 0; slot < width; slot++) {
                final int source = position - center + slot;
                if (source < 0) {
                    chars[slot] = "BOS";
                    types[slot] = "BOS";
                } else if (source >= length) {
                    chars[slot] = "EOS";
                    types[slot] = "EOS";
                } else {
                    chars[slot] = Character.toString(hString.charAt(source));
                    types[slot] = charType(chars[slot]);
                }
            }

            final VariableList variables = new VariableList();
            variables.add(Variable.binary("char[0]", chars[center]));
            variables.add(Variable.binary("type[0]", types[center]));

            // NOTE(review): the inner bound is windowSize, not the window width, so only
            // n-grams made of slots left of the centre are ever generated; for start slots at
            // or beyond the centre the inner loop does not execute. Left as-is because changing
            // it would change the emitted feature set - confirm whether right context was
            // meant to be included.
            for (int start = 0; start < width; start++) {
                if (start == center) {
                    continue;
                }
                for (int end = start + 1; end <= this.windowSize; end++) {
                    variables.addAll(generate(IntStream.range(start, end).toArray(), chars, types));
                }
            }
            sequence.add(variables);
        }
        return sequence;
    }
}
