package org.carrot2.clustering.synthetic;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import org.apache.commons.lang.ArrayUtils;
import org.carrot2.core.Cluster;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.ProcessingComponentBase;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.CommonAttributes;
import org.carrot2.core.attribute.Internal;
import org.carrot2.core.attribute.Processing;
import org.carrot2.shaded.guava.common.collect.LinkedHashMultimap;
import org.carrot2.shaded.guava.common.collect.Lists;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Label;
import org.carrot2.util.attribute.Output;

@Bindable(inherit = {CommonAttributes.class})
@Label("By URL Clustering")
/* loaded from: input_file:org/carrot2/clustering/synthetic/ByUrlClusteringAlgorithm.class */
public class ByUrlClusteringAlgorithm extends ProcessingComponentBase implements IClusteringAlgorithm {
    private static final Set<String> STOP_URL_PARTS = new HashSet();

    @Processing
    @Input
    @Internal
    @Attribute(key = "documents", inherit = true)
    public List<Document> documents;

    @Processing
    @Output
    @Internal
    @Attribute(key = "clusters", inherit = true)
    public List<Cluster> clusters = null;

    @Override // org.carrot2.core.ProcessingComponentBase, org.carrot2.core.IProcessingComponent
    public void process() throws ProcessingException {
        Document[] documentArr = (Document[]) this.documents.toArray(new Document[this.documents.size()]);
        String[][] buildUrlParts = buildUrlParts(documentArr);
        ArrayList arrayList = new ArrayList(documentArr.length);
        for (int i = 0; i < documentArr.length; i++) {
            arrayList.add(Integer.valueOf(i));
        }
        this.clusters = createClusters(documentArr, arrayList, buildUrlParts, 0, "");
        if (this.clusters.size() == 0) {
            Cluster.appendOtherTopics(this.documents, this.clusters, "Other Sites");
        }
    }

    private List<Cluster> createClusters(Document[] documentArr, Collection<Integer> collection, String[][] strArr, int i, String str) {
        LinkedHashMultimap create = LinkedHashMultimap.create();
        for (Integer num : collection) {
            String[] strArr2 = strArr[num.intValue()];
            if (strArr2 != null && strArr2.length > i && !STOP_URL_PARTS.contains(strArr2[i])) {
                create.put(strArr2[i], num);
            }
        }
        LinkedHashSet linkedHashSet = new LinkedHashSet();
        ArrayList arrayList = new ArrayList();
        for (String str2 : create.keySet()) {
            Collection<Integer> collection2 = create.get(str2);
            if (collection2.size() > 1) {
                Cluster cluster = new Cluster();
                String str3 = str2 + (str.length() > 0 ? "." + str : "");
                List<Cluster> createClusters = createClusters(documentArr, collection2, strArr, i + 1, str3);
                if (createClusters.size() > 1) {
                    cluster.addSubclusters(createClusters);
                } else if (createClusters.size() == 1) {
                    Cluster cluster2 = createClusters.get(0);
                    str3 = cluster2.getPhrases().get(0);
                    cluster.addDocuments(cluster2.getDocuments());
                    cluster.addSubclusters(cluster2.getSubclusters());
                } else {
                    Iterator<Integer> it = collection2.iterator();
                    while (it.hasNext()) {
                        cluster.addDocuments(documentArr[it.next().intValue()]);
                    }
                }
                cluster.addPhrases(str3);
                arrayList.add(cluster);
                linkedHashSet.addAll(collection2);
            }
        }
        if (linkedHashSet.isEmpty()) {
            return Lists.newArrayList();
        }
        Collections.sort(arrayList, Cluster.BY_REVERSED_SIZE_AND_LABEL_COMPARATOR);
        ArrayList newArrayListWithExpectedSize = Lists.newArrayListWithExpectedSize(collection.size());
        Iterator<Integer> it2 = collection.iterator();
        while (it2.hasNext()) {
            newArrayListWithExpectedSize.add(documentArr[it2.next().intValue()]);
        }
        Cluster.appendOtherTopics(newArrayListWithExpectedSize, arrayList, "Other Sites");
        return arrayList;
    }

    /* JADX WARN: Type inference failed for: r0v2, types: [java.lang.String[], java.lang.String[][]] */
    final String[][] buildUrlParts(Document[] documentArr) {
        int i;
        ?? r0 = new String[documentArr.length];
        for (int i2 = 0; i2 < documentArr.length; i2++) {
            String str = (String) documentArr[i2].getField(Document.CONTENT_URL);
            if (str != null) {
                int indexOf = str.indexOf("://");
                if (indexOf < 0) {
                    i = 0;
                } else if (indexOf + 3 < str.length()) {
                    i = indexOf + 3;
                }
                int indexOf2 = str.indexOf(47, i + 3);
                if (indexOf2 < 0) {
                    indexOf2 = str.length();
                }
                String[] split = str.substring(i, indexOf2).toLowerCase().split("\\.");
                ArrayUtils.reverse(split);
                r0[i2] = split;
            }
        }
        return r0;
    }

    static {
        STOP_URL_PARTS.add("www");
    }
}
