package com.hankcs.hanlp.mining.cluster;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.classification.utilities.io.ConsoleLogger;
import com.hankcs.hanlp.collection.trie.datrie.MutableDoubleArrayTrieInteger;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.utility.MathUtility;
import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;

/* loaded from: input_file:com/hankcs/hanlp/mining/cluster/ClusterAnalyzer.class */
public class ClusterAnalyzer<K> {
    /** Documents registered through {@code addDocument}, keyed by their caller-supplied id. */
    protected HashMap<K, Document<K>> documents_ = new HashMap<>();
    /** Segmenter used by {@code preprocess} to split raw text into terms. */
    protected Segment segment = HanLP.newSegment();
    /** Maps each distinct word to the integer id handed out by {@code id(String)}. */
    protected MutableDoubleArrayTrieInteger vocabulary = new MutableDoubleArrayTrieInteger();
    /** Upper bound on the number of passes performed by {@code refine_clusters}. */
    static final int NUM_REFINE_LOOP = 30;

    /**
     * Looks up the integer id of a word, registering the word first if necessary.
     *
     * <p>When the vocabulary lookup yields {@code -1}, the word is stored under the
     * current vocabulary size and that value is returned.
     *
     * @param str the word to look up
     * @return the id associated with the word
     */
    protected int id(String str) {
        final int known = this.vocabulary.get(str);
        if (known != -1) {
            return known;
        }
        final int assigned = this.vocabulary.size();
        this.vocabulary.put(str, assigned);
        return assigned;
    }

    /**
     * Segments a text and returns the surface forms of the terms worth keeping.
     *
     * <p>A term is dropped when {@code CoreStopWordDictionary} contains its word or
     * when its nature starts with {@code "w"}. The list returned by the segmenter
     * is only read, never modified, so segmenters that return an unmodifiable or
     * shared list are safe to use.
     *
     * @param str the raw text to segment
     * @return the remaining words, in segmentation order
     */
    protected List<String> preprocess(String str) {
        List<Term> terms = this.segment.seg(str);
        List<String> words = new ArrayList<>(terms.size());
        for (Term term : terms) {
            if (CoreStopWordDictionary.contains(term.word) || term.nature.startsWith("w")) {
                continue;
            }
            words.add(term.word);
        }
        return words;
    }

    /**
     * Builds a term-count vector from a list of words.
     *
     * <p>Each word is mapped to its id via {@code id(String)}; the vector entry for
     * that id is set to 1 on first sight and incremented by 1 afterwards.
     *
     * @param list the words of one document
     * @return a vector from word id to occurrence count
     */
    protected SparseVector toVector(List<String> list) {
        SparseVector counts = new SparseVector();
        for (String word : list) {
            Integer wordId = Integer.valueOf(id(word));
            // The explicit Object cast keeps the call bound to the Object-keyed lookup.
            Double seen = counts.get((Object) wordId);
            double updated = (seen == null) ? 1.0d : seen.doubleValue() + 1.0d;
            counts.put(wordId, Double.valueOf(updated));
        }
        return counts;
    }

    /**
     * Registers a document given as raw text.
     *
     * <p>The text is run through {@code preprocess} and the resulting word list is
     * handed to {@code addDocument(K, List)}. The id is passed through unchanged;
     * casting it to another type would fail for every ordinary key.
     *
     * @param k   the document id
     * @param str the raw document text
     * @return whatever {@code addDocument(K, List)} returns for this id
     */
    public Document<K> addDocument(K k, String str) {
        return addDocument(k, preprocess(str));
    }

    /**
     * Registers a document given as an already tokenised word list.
     *
     * @param k    the document id
     * @param list the words of the document
     * @return the result of {@code HashMap.put}: the document previously stored
     *         under {@code k}, or {@code null} if there was none
     */
    public Document<K> addDocument(K k, List<String> list) {
        SparseVector features = toVector(list);
        Document<K> document = new Document<>(k, features);
        return this.documents_.put(k, document);
    }

    /**
     * Clusters all registered documents into {@code i} groups.
     *
     * <p>Every document is placed into one root cluster, which is sectioned into
     * {@code i} parts; the parts are then improved by {@code refine_clusters},
     * refreshed, and converted to sets of document ids.
     *
     * @param i the requested number of clusters
     * @return one set of document ids per resulting cluster
     */
    public List<Set<K>> kmeans(int i) {
        Cluster<K> root = new Cluster<K>();
        for (Document<K> document : this.documents_.values()) {
            root.add_document(document);
        }
        root.section(i);
        refine_clusters(root.sectioned_clusters());

        List<Cluster<K>> clusters = new ArrayList<>(i);
        for (Cluster<K> part : root.sectioned_clusters()) {
            part.refresh();
            clusters.add(part);
        }
        return toResult(clusters);
    }

    /**
     * Converts clusters into sets of document ids.
     *
     * @param list the clusters to convert
     * @return for each cluster, in the same order, the set of ids of its documents
     */
    private List<Set<K>> toResult(List<Cluster<K>> list) {
        List<Set<K>> result = new ArrayList<>(list.size());
        for (Cluster<K> cluster : list) {
            Set<K> ids = new HashSet<>();
            for (Document<K> document : cluster.documents_) {
                ids.add(document.id_);
            }
            result.add(ids);
        }
        return result;
    }

    /**
     * Runs repeated bisection until {@code i} clusters exist, with the gain
     * threshold set to zero.
     *
     * @param i the requested number of clusters
     * @return one set of document ids per resulting cluster
     */
    public List<Set<K>> repeatedBisection(int i) {
        final double gainLimit = 0.0d;
        return repeatedBisection(i, gainLimit);
    }

    /**
     * Runs repeated bisection with the cluster count set to zero, so that only
     * the gain threshold {@code d} limits the splitting.
     *
     * @param d the gain threshold
     * @return one set of document ids per resulting cluster
     */
    public List<Set<K>> repeatedBisection(double d) {
        final int clusterLimit = 0;
        return repeatedBisection(clusterLimit, d);
    }

    /**
     * Clusters all registered documents by repeatedly bisecting clusters.
     *
     * <p>All documents start in one root cluster that is split in two and refined.
     * Clusters wait in a priority queue; the head of the queue is replaced by its
     * two halves until one of the stop conditions holds:
     * <ul>
     *   <li>{@code i > 0} and the queue already holds {@code i} clusters;</li>
     *   <li>the head cluster has no sectioned sub-clusters;</li>
     *   <li>{@code d > 0} and the head cluster's sectioned gain is below {@code d}.</li>
     * </ul>
     *
     * @param i the requested number of clusters; values {@code <= 0} disable the limit
     * @param d the gain threshold; values {@code <= 0} disable the stop condition
     * @return one set of document ids per cluster, in reverse queue-poll order
     */
    public List<Set<K>> repeatedBisection(int i, double d) {
        Cluster<K> root = new Cluster<K>();
        for (Document<K> document : this.documents_.values()) {
            root.add_document(document);
        }

        PriorityQueue<Cluster<K>> queue = new PriorityQueue<>();
        root.section(2);
        refine_clusters(root.sectioned_clusters());
        root.set_sectioned_gain();
        root.composite_vector().clear();
        queue.add(root);

        while (!queue.isEmpty() && (i <= 0 || queue.size() < i)) {
            Cluster<K> head = queue.peek();
            if (head.sectioned_clusters().size() < 1 || (d > 0.0d && head.sectioned_gain() < d)) {
                break;
            }
            queue.poll();
            for (Cluster<K> child : head.sectioned_clusters()) {
                if (child.size() >= 2) {
                    // Pre-compute the child's own bisection so its gain can order the queue.
                    child.section(2);
                    refine_clusters(child.sectioned_clusters());
                    child.set_sectioned_gain();
                    if (child.sectioned_gain() < d) {
                        for (Cluster<K> grandChild : child.sectioned_clusters()) {
                            grandChild.clear();
                        }
                    }
                    child.composite_vector().clear();
                }
                queue.add(child);
            }
        }

        // Append in poll order and reverse once; same order as inserting each
        // polled cluster at index 0, without the quadratic element shifting.
        List<Cluster<K>> ordered = new ArrayList<>(queue.size());
        while (!queue.isEmpty()) {
            ordered.add(queue.poll());
        }
        Collections.reverse(ordered);
        return toResult(ordered);
    }

    /**
     * Iteratively moves documents between the given clusters while doing so
     * increases the sum of the clusters' composite-vector norms.
     *
     * <p>At most {@code NUM_REFINE_LOOP} passes are made. Each pass visits every
     * (cluster, document index) pair in random order and, for each document, looks
     * for the other cluster that gives the largest positive gain; if one exists the
     * document is added there and removed from its current cluster. A pass without
     * any move ends the refinement; otherwise all clusters are refreshed before
     * the next pass.
     *
     * @param list the clusters to refine; their contents are modified
     * @return the accumulated gain of all moves performed
     */
    double refine_clusters(List<Cluster<K>> list) {
        // Cached norm of each cluster's composite vector, kept in sync with moves.
        double[] norms = new double[list.size()];
        int slot = 0;
        for (Cluster<K> each : list) {
            norms[slot++] = each.composite_vector().norm();
        }

        double totalGain = 0.0d;
        for (int pass = 0; pass < NUM_REFINE_LOOP; pass++) {
            // Enumerate every (cluster index, document index) pair, then randomise the order.
            List<int[]> candidates = new ArrayList<>(this.documents_.size());
            for (int c = 0; c < list.size(); c++) {
                for (int docPos = 0; docPos < list.get(c).documents().size(); docPos++) {
                    candidates.add(new int[]{c, docPos});
                }
            }
            Collections.shuffle(candidates);

            boolean moved = false;
            for (int[] candidate : candidates) {
                int from = candidate[0];
                int docIndex = candidate[1];
                Cluster<K> source = list.get(from);
                Document<K> doc = source.documents().get(docIndex);

                // Norm of the source cluster if this document were taken out.
                double sourceSquared = Math.pow(norms[from], 2.0d) + refined_vector_value(source.composite_vector(), doc.feature(), -1);
                double sourceNormAfter = sourceSquared > 0.0d ? Math.sqrt(sourceSquared) : 0.0d;

                double bestGain = -1.0d;
                double bestTargetNorm = 0.0d;
                int bestTarget = 0;
                for (int to = 0; to < list.size(); to++) {
                    if (to == from) {
                        continue;
                    }
                    // Norm of the candidate target if this document were put in.
                    double targetSquared = Math.pow(norms[to], 2.0d) + refined_vector_value(list.get(to).composite_vector(), doc.feature(), 1);
                    double targetNormAfter = targetSquared > 0.0d ? Math.sqrt(targetSquared) : 0.0d;
                    double gain = ((sourceNormAfter + targetNormAfter) - norms[from]) - norms[to];
                    if (bestGain < gain) {
                        bestGain = gain;
                        bestTargetNorm = targetNormAfter;
                        bestTarget = to;
                    }
                }

                if (bestGain > 0.0d) {
                    totalGain += bestGain;
                    list.get(bestTarget).add_document(doc);
                    list.get(from).remove_document(docIndex);
                    norms[from] = sourceNormAfter;
                    norms[bestTarget] = bestTargetNorm;
                    moved = true;
                }
            }

            if (!moved) {
                break;
            }
            for (Cluster<K> each : list) {
                each.refresh();
            }
        }
        return totalGain;
    }

    /**
     * Computes by how much the squared norm of a composite vector changes when a
     * document vector is added to it ({@code i == 1}) or removed from it
     * ({@code i == -1}).
     *
     * <p>For every entry {@code v} of the document vector and the matching
     * composite entry {@code c}, the term {@code v^2 + i * 2 * c * v} is summed.
     * A composite entry that is missing is treated as 0.
     *
     * @param sparseVector  the composite vector of a cluster
     * @param sparseVector2 the feature vector of the document
     * @param i             the sign of the change: 1 for adding, -1 for removing
     * @return the resulting change of the squared norm
     */
    double refined_vector_value(SparseVector sparseVector, SparseVector sparseVector2, int i) {
        double sum = 0.0d;
        for (Map.Entry<Integer, Double> entry : sparseVector2.entrySet()) {
            double weight = entry.getValue().doubleValue();
            // The explicit Object cast keeps the call bound to the Object-keyed lookup.
            // NOTE(review): guards against a null result for an absent key, as toVector
            // does; harmless if SparseVector already reports 0 for absent keys - confirm.
            Double existing = sparseVector.get((Object) entry.getKey());
            double composite = (existing == null) ? 0.0d : existing.doubleValue();
            sum += Math.pow(weight, 2.0d) + (i * 2 * composite * weight);
        }
        return sum;
    }

    /**
     * Clusters a labelled corpus and scores the result with a weighted F-measure.
     *
     * <p>Every sub-directory of {@code str} is one category and every entry inside
     * it is one document, registered under the id {@code "<category> <file name>"}.
     * Plain files directly under {@code str} are ignored and do not count as
     * categories. The documents are clustered into as many clusters as there are
     * categories. For each category the best F1 over all clusters is taken, and
     * the returned score is the sum of these values weighted by the category's
     * share of all documents.
     *
     * @param str  path of the corpus root directory
     * @param str2 algorithm name; after removing hyphens and whitespace and
     *             lower-casing, {@code "kmeans"} selects {@code kmeans}, anything
     *             else selects {@code repeatedBisection}
     * @return the weighted F-measure, or {@code 1.0} if the root directory cannot be listed
     * @throws IllegalArgumentException if {@code str} is {@code null}, does not
     *                                  exist, or is not a directory
     */
    public static double evaluate(String str, String str2) {
        if (str == null) {
            throw new IllegalArgumentException("参数 folderPath == null");
        }
        File root = new File(str);
        if (!root.exists()) {
            throw new IllegalArgumentException(String.format("目录 %s 不存在", root.getAbsolutePath()));
        }
        if (!root.isDirectory()) {
            throw new IllegalArgumentException(String.format("目录 %s 不是一个目录", root.getAbsolutePath()));
        }
        ClusterAnalyzer<String> analyzer = new ClusterAnalyzer<>();
        File[] folders = root.listFiles();
        if (folders == null) {
            return 1.0d;
        }
        ConsoleLogger.logger.start("根目录:%s\n加载中...\n", str);
        int docCount = 0;
        // Sized for the worst case; only the first categoryCount slots are used.
        int[] docsPerCategory = new int[folders.length];
        String[] categories = new String[folders.length];
        int categoryCount = 0;
        for (File folder : folders) {
            if (folder.isFile()) {
                continue;
            }
            File[] files = folder.listFiles();
            if (files == null) {
                continue;
            }
            String category = folder.getName();
            categories[categoryCount] = category;
            ConsoleLogger.logger.out("[%s]...", category);
            int total = files.length;
            // Progress is printed roughly every 1/10000 of the files.
            int logEvery = (int) Math.ceil(total / 10000.0f);
            for (int n = 0; n < total; n++) {
                analyzer.addDocument(category + " " + files[n].getName(), IOUtil.readTxt(files[n].getAbsolutePath()));
                if (n % logEvery == 0) {
                    ConsoleLogger.logger.out("%c[%s]...%.2f%%", 13, category, Double.valueOf(MathUtility.percentage(n + 1, total)));
                }
                docCount++;
                docsPerCategory[categoryCount]++;
            }
            ConsoleLogger.logger.out(" %d 篇文档\n", Integer.valueOf(total));
            categoryCount++;
        }
        ConsoleLogger.logger.finish(" 加载了 %d 个类目,共 %d 篇文档\n", Integer.valueOf(categoryCount), Integer.valueOf(docCount));
        ConsoleLogger.logger.start(str2 + "聚类中...", new Object[0]);
        // Use the number of real categories; skipped entries must not add clusters.
        List<Set<String>> clusters = str2.replaceAll("[-\\s]", "").toLowerCase().equals("kmeans")
                ? analyzer.kmeans(categoryCount)
                : analyzer.repeatedBisection(categoryCount);
        ConsoleLogger.logger.finish(" 完毕。\n", new Object[0]);

        double[] bestF = new double[categoryCount];
        for (int c = 0; c < categoryCount; c++) {
            // Ids are "<category> <file>"; including the separator keeps a category
            // from matching documents of another category that merely shares a prefix.
            String prefix = categories[c] + " ";
            for (Set<String> cluster : clusters) {
                int hits = 0;
                for (String docId : cluster) {
                    if (docId.startsWith(prefix)) {
                        hits++;
                    }
                }
                if (hits == 0) {
                    continue;
                }
                // Floating-point division: integer division would truncate to 0 or 1.
                double precision = hits / (double) cluster.size();
                double recall = hits / (double) docsPerCategory[c];
                bestF[c] = Math.max(bestF[c], ((2.0d * precision) * recall) / (precision + recall));
            }
        }
        double score = 0.0d;
        for (int c = 0; c < categoryCount; c++) {
            score += (bestF[c] * docsPerCategory[c]) / docCount;
        }
        return score;
    }
}
