package org.maochen.nlp.app.ner;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.maochen.nlp.app.featextractor.BrownFeatExtractor;
import org.maochen.nlp.app.featextractor.IFeatureExtractor;
import org.maochen.nlp.ml.SequenceTuple;
import org.maochen.nlp.ml.Tuple;
import org.maochen.nlp.ml.vector.FeatNamedVector;

/**
 * Feature extractor for the NER application.
 *
 * <p>For every token of a sequence it emits string features built from a window of up to
 * {@value #WINDOW} tokens on either side of that token (surface form, length, coarse
 * upper/lower-case shape, digit/punctuation indicators, and the two bigrams that include the
 * current token), followed by whatever {@link BrownFeatExtractor#extractBrownFeat} returns for
 * the same window.
 */
public class NERFeatureExtractor implements IFeatureExtractor {
    /** Number of neighbouring tokens inspected on each side of the current token. */
    private static final int WINDOW = 2;

    // Compiled once: Pattern instances are immutable and safe to share, and compiling them
    // inside the per-token window loop repeated the same work for every neighbour.
    private static final Pattern DIGIT = Pattern.compile("\\d+");
    private static final Pattern TWO_DIGITS = Pattern.compile("\\d{2}");
    private static final Pattern FOUR_DIGITS = Pattern.compile("\\d{4}");
    // NOTE(review): inside a character class '|' is a literal, so this also matches a pipe
    // character. Left untouched so the generated features stay identical to what they were.
    private static final Pattern SPECIAL_CHAR = Pattern.compile("[%|,|.|/|-]");

    /**
     * Maps a token to a coarse case shape of the same length.
     *
     * @param str token to describe
     * @return a string with {@code "X"} for each upper-case character and {@code "x"} for every
     *         other character (lower-case letters, digits, punctuation alike)
     */
    private static String getWordShape(String str) {
        StringBuilder shape = new StringBuilder(str.length());
        for (int idx = 0; idx < str.length(); idx++) {
            shape.append(Character.isUpperCase(str.charAt(idx)) ? "X" : "x");
        }
        // The result only ever contains 'X' and 'x', so no trimming is needed.
        return shape.toString();
    }

    /**
     * Adds the features that depend only on a single token: its length, its shape and the
     * digit/punctuation indicators. The feature keys carry no window offset, so the same key is
     * added once per window position.
     */
    private static void addTokenFeats(List<String> feats, String token) {
        IFeatureExtractor.addFeat(feats, "word_length", String.valueOf(token.length()));
        IFeatureExtractor.addFeat(feats, "word_shape", getWordShape(token));

        boolean hasDigit = DIGIT.matcher(token).find();
        boolean hasSpecialChar = SPECIAL_CHAR.matcher(token).find();

        if (hasSpecialChar) {
            IFeatureExtractor.addFeat(feats, "contains_char", new String[0]);
        }
        if (hasDigit) {
            IFeatureExtractor.addFeat(feats, "contains_digit", new String[0]);
        }
        if (TWO_DIGITS.matcher(token).find()) {
            IFeatureExtractor.addFeat(feats, "contains_two_digit", new String[0]);
        }
        if (FOUR_DIGITS.matcher(token).find()) {
            IFeatureExtractor.addFeat(feats, "contains_four_digit", new String[0]);
        }
        if (hasSpecialChar && hasDigit) {
            IFeatureExtractor.addFeat(feats, "contains_digit_char", new String[0]);
        }
    }

    /**
     * Extracts the features for the token at index {@code i}.
     *
     * @param i      index of the current token within {@code strArr}
     * @param strArr all tokens of the sequence
     * @return the features for the window around {@code i} (clipped to the array bounds),
     *         followed by the Brown features for the same window
     */
    public List<String> extractFeatSingle(int i, String[] strArr) {
        List<String> feats = new ArrayList<>();
        int windowEnd = Math.min(i + WINDOW + 1, strArr.length);

        for (int pos = Math.max(0, i - WINDOW); pos < windowEnd; pos++) {
            // Key is "w" plus the signed offset from the current token, e.g. w-1, w0, w1.
            IFeatureExtractor.addFeat(feats, "w" + (pos - i), strArr[pos]);
            addTokenFeats(feats, strArr[pos]);

            // Bigrams that include the current token; the loop bounds guarantee that
            // i - 1 and i + 1 are valid indices whenever these branches are reached.
            if (pos == i - 1) {
                IFeatureExtractor.addFeat(feats, "w-10", strArr[i - 1], strArr[i]);
            } else if (pos == i + 1) {
                IFeatureExtractor.addFeat(feats, "w0+1", strArr[i], strArr[i + 1]);
            }
        }

        feats.addAll(BrownFeatExtractor.extractBrownFeat(i, -WINDOW, WINDOW, strArr));
        return feats;
    }

    /**
     * Builds one feature tuple per entry of the given sequence.
     *
     * <p>The token text of each entry is read from the first element of its vector's
     * {@code featsName}; the label of each entry is copied onto the resulting tuple.
     *
     * @param sequenceTuple sequence whose entries are to be featurized
     * @return a new list with one {@link Tuple} per entry, in the same order
     */
    @Override
    public List<Tuple> extractFeat(SequenceTuple sequenceTuple) {
        String[] tokens = sequenceTuple.entries.stream()
                .map(entry -> entry.vector.featsName[0])
                .toArray(String[]::new);

        List<Tuple> result = new ArrayList<>(tokens.length);
        for (int idx = 0; idx < tokens.length; idx++) {
            List<String> feats = extractFeatSingle(idx, tokens);
            Tuple featTuple = new Tuple(new FeatNamedVector(feats.toArray(new String[0])));
            featTuple.label = sequenceTuple.entries.get(idx).label;
            result.add(featTuple);
        }
        return result;
    }
}
