/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.indexer.links;

import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlink;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class LinksIndexingFilter
implements IndexingFilter {
    public static final String LINKS_OUTLINKS_HOST = "index.links.outlinks.host.ignore";
    public static final String LINKS_INLINKS_HOST = "index.links.inlinks.host.ignore";
    public static final String LINKS_ONLY_HOSTS = "index.links.hosts.only";
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
    private Configuration conf;
    private boolean filterOutlinks;
    private boolean filterInlinks;
    private boolean indexHost;

    public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
        Outlink[] outlinks = parse.getData().getOutlinks();
        if (outlinks != null) {
            HashSet<String> hosts = new HashSet<String>();
            for (Outlink outlink : outlinks) {
                try {
                    String linkUrl = outlink.getToUrl();
                    String outHost = new URL(linkUrl).getHost().toLowerCase();
                    if (this.indexHost) {
                        linkUrl = outHost;
                        if (hosts.contains(linkUrl)) continue;
                        hosts.add(linkUrl);
                    }
                    this.addFilteredLink("outlinks", url.toString(), linkUrl, outHost, this.filterOutlinks, doc);
                }
                catch (MalformedURLException e) {
                    LOG.error("Malformed URL in {}: {}", (Object)url, (Object)e.getMessage());
                }
            }
        }
        if (null != inlinks) {
            Iterator iterator = inlinks.iterator();
            HashSet<String> inlinkHosts = new HashSet<String>();
            while (iterator.hasNext()) {
                try {
                    Inlink link = (Inlink)iterator.next();
                    String linkUrl = link.getFromUrl();
                    String inHost = new URL(linkUrl).getHost().toLowerCase();
                    if (this.indexHost) {
                        linkUrl = inHost;
                        if (inlinkHosts.contains(linkUrl)) continue;
                        inlinkHosts.add(linkUrl);
                    }
                    this.addFilteredLink("inlinks", url.toString(), linkUrl, inHost, this.filterInlinks, doc);
                }
                catch (MalformedURLException e) {
                    LOG.error("Malformed URL in {}: {}", (Object)url, (Object)e.getMessage());
                }
            }
        }
        return doc;
    }

    private void addFilteredLink(String fieldName, String url, String linkUrl, String urlHost, boolean filter, NutchDocument doc) throws MalformedURLException {
        if (filter) {
            String host = new URL(url.toString()).getHost().toLowerCase();
            if (!host.equalsIgnoreCase(urlHost)) {
                doc.add(fieldName, (Object)linkUrl);
            }
        } else {
            doc.add(fieldName, (Object)linkUrl);
        }
    }

    public void setConf(Configuration conf) {
        this.conf = conf;
        this.filterOutlinks = conf.getBoolean(LINKS_OUTLINKS_HOST, false);
        this.filterInlinks = conf.getBoolean(LINKS_INLINKS_HOST, false);
        this.indexHost = conf.getBoolean(LINKS_ONLY_HOSTS, false);
    }

    public Configuration getConf() {
        return this.conf;
    }
}

