package org.archive.crawler.url.canonicalize;

import java.util.regex.Pattern;

/* loaded from: input_file:site-search/heritrix/heritrix-1.12.1.jar:org/archive/crawler/url/canonicalize/StripWWWRule.class */
/**
 * Canonicalization rule that targets a leading {@code www.} host label on
 * {@code http} and {@code https} URIs.
 *
 * <p>The pattern only matches when the URI has at least one character after
 * the first slash that follows the host, so bare "slash" pages such as
 * {@code http://www.archive.org/} do not match it.
 */
public class StripWWWRule extends BaseRule {
    private static final long serialVersionUID = -5416391108485746976L;

    /** Human-readable explanation of the rule, handed to the superclass constructor. */
    private static final String DESCRIPTION = "Strip any 'www' found. Use this rule to equate 'http://www.archive.org/index.html' and 'http://archive.org/index.html'. The resulting canonicalization returns 'http://archive.org/index.html'.  It removes any www's found, except on URIs that have no path/query component ('slash' pages).  Operates on http and https schemes only. Use the more general StripWWWNRule if you want to strip both 'www' and 'www01', 'www02', etc.";

    /**
     * Case-insensitive pattern. Group 1 captures the scheme prefix
     * ({@code http://} or {@code https://}); the literal {@code www.} is
     * matched but not captured; group 2 captures the remainder of the host,
     * a slash, and at least one further character.
     */
    private static final Pattern STRIP_WWW_PATTERN =
            Pattern.compile("(?i)^(https?://)(?:www\\.)([^/]*/.+)$");

    /**
     * Creates the rule.
     *
     * @param str value forwarded as the first argument of the superclass
     *            constructor (presumably the rule's name -- confirm against
     *            {@code BaseRule})
     */
    public StripWWWRule(String str) {
        super(str, DESCRIPTION);
    }

    /**
     * Applies this rule to a URI string.
     *
     * @param str the URI text to run the pattern against
     * @param obj not used by this rule
     * @return whatever the inherited {@code doStripRegexMatch} produces for
     *         the input and its matcher (that helper is defined outside this
     *         class; per the rule description it is expected to yield the
     *         URI without the {@code www.} label when the pattern matches)
     */
    @Override // org.archive.crawler.url.CanonicalizationRule
    public String canonicalize(String str, Object obj) {
        final java.util.regex.Matcher wwwMatcher = STRIP_WWW_PATTERN.matcher(str);
        return doStripRegexMatch(str, wwwMatcher);
    }
}
