/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.crawl;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.lang.invoke.MethodHandles;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.time.StopWatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.CounterGroup;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class DeduplicationJob
extends NutchTool
implements Tool {
    /** Logger shared by the tool and its nested mapper/reducer classes. */
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
    /**
     * Temporary metadata key: {@link DBFilter} stores a record's URL under it and
     * {@link DedupReducer} reads it for URL-based comparisons and removes it again
     * when a record is written out as duplicate.
     */
    protected static final Text urlKey = new Text("_URLTEMPKEY_");
    /** Configuration property carrying the grouping mode read by {@link DBFilter}. */
    protected static final String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode";
    /** Configuration property carrying the comma-separated comparison order read by {@link DedupReducer}. */
    protected static final String DEDUPLICATION_COMPARE_ORDER = "deduplication.compare.order";
    /** Name of the UTF-8 charset, used when URL-decoding in {@link DedupReducer}. */
    protected static final String UTF_8 = StandardCharsets.UTF_8.toString();

    /**
     * Marks duplicate documents of a CrawlDb and merges the new status back into it.
     *
     * <p>Two jobs are run in sequence: the first ({@link DBFilter} +
     * {@link DedupReducer}) reads {@code <crawldb>/current} and writes the records
     * found to be duplicates into a temporary directory below the CrawlDb; the
     * second is created by {@code CrawlDb.createJob}, gets that temporary
     * directory as additional input, uses {@link StatusUpdateReducer} and is
     * installed with {@code CrawlDb.install}.
     *
     * @param args {@code <crawldb> [-group <none|host|domain>] [-compareOrder <list>]};
     *        the compare order must mention {@code score}, {@code fetchTime} and
     *        {@code urlLength}
     * @return 0 on success, 1 on invalid arguments, -1 if running one of the jobs
     *         threw an exception
     * @throws IOException if file system access (cleanup, locking, install) fails
     * @throws RuntimeException if one of the jobs completes unsuccessfully
     */
    @Override
    public int run(String[] args) throws IOException {
        final String usage = "Usage: DeduplicationJob <crawldb> [-group <none|host|domain>] [-compareOrder <score>,<fetchTime>,<httpsOverHttp>,<urlLength>]";
        if (args.length < 1) {
            System.err.println(usage);
            return 1;
        }
        String group = "none";
        Path crawlDb = new Path(args[0]);
        String compareOrder = "score,fetchTime,urlLength";
        for (int i = 1; i < args.length; ++i) {
            String option = args[i];
            boolean isGroup = option.equals("-group");
            boolean isCompareOrder = option.equals("-compareOrder");
            if (!isGroup && !isCompareOrder) {
                continue;
            }
            // Both options need a value; report a usage error instead of running
            // off the end of the argument array.
            if (i + 1 >= args.length) {
                System.err.println("DeduplicationJob: missing value for " + option);
                System.err.println(usage);
                return 1;
            }
            String value = args[++i];
            if (isGroup) {
                // An unknown mode would only surface later as a failing map task.
                if (!value.equals("none") && !value.equals("host") && !value.equals("domain")) {
                    System.err.println("DeduplicationJob: group must be one of none, host or domain.");
                    return 1;
                }
                group = value;
            } else {
                if (!value.contains("score") || !value.contains("fetchTime") || !value.contains("urlLength")) {
                    System.err.println("DeduplicationJob: compareOrder must contain score, fetchTime and urlLength.");
                    return 1;
                }
                compareOrder = value;
            }
        }

        StopWatch stopWatch = new StopWatch();
        stopWatch.start();
        LOG.info("DeduplicationJob: starting");

        // Step 1: find the duplicates and write them to a temporary directory.
        Path tempDir = new Path(crawlDb, "dedup-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
        Job job = Job.getInstance(this.getConf(), "Nutch DeduplicationJob: " + crawlDb);
        Configuration conf = job.getConfiguration();
        conf.set(DEDUPLICATION_GROUP_MODE, group);
        conf.set(DEDUPLICATION_COMPARE_ORDER, compareOrder);
        job.setJarByClass(DeduplicationJob.class);
        FileInputFormat.addInputPath(job, new Path(crawlDb, "current"));
        job.setInputFormatClass(SequenceFileInputFormat.class);
        FileOutputFormat.setOutputPath(job, tempDir);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setMapOutputKeyClass(BytesWritable.class);
        job.setMapOutputValueClass(CrawlDatum.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        job.setMapperClass(DBFilter.class);
        job.setReducerClass(DedupReducer.class);

        FileSystem fs = tempDir.getFileSystem(this.getConf());
        try {
            boolean success = job.waitForCompletion(true);
            if (!success) {
                String message = NutchJob.getJobFailureLogMessage("Crawl", job);
                LOG.error(message);
                fs.delete(tempDir, true);
                throw new RuntimeException(message);
            }
            CounterGroup g = job.getCounters().getGroup("DeduplicationJobStatus");
            if (g != null) {
                Counter counter = g.findCounter("Documents marked as duplicate");
                long dups = counter.getValue();
                LOG.info("Deduplication: {} documents marked as duplicates", dups);
            }
        } catch (IOException | ClassNotFoundException | InterruptedException e) {
            LOG.error("DeduplicationJob:", e);
            fs.delete(tempDir, true);
            // Restore the interrupt flag only after cleanup so that the file
            // system call above is not itself affected by it.
            if (e instanceof InterruptedException) {
                Thread.currentThread().interrupt();
            }
            return -1;
        }

        // Step 2: merge the duplicate markers with the existing CrawlDb.
        LOG.info("Deduplication: Updating status of duplicate urls into crawl db.");
        Job mergeJob = CrawlDb.createJob(this.getConf(), crawlDb);
        FileInputFormat.addInputPath(mergeJob, tempDir);
        mergeJob.setReducerClass(StatusUpdateReducer.class);
        mergeJob.setJarByClass(DeduplicationJob.class);
        fs = crawlDb.getFileSystem(this.getConf());
        // Output of the *merge* job: this is what has to be removed when the
        // merge fails (the first job's output is tempDir, deleted separately).
        Path outPath = FileOutputFormat.getOutputPath(mergeJob);
        Path lock = CrawlDb.lock(this.getConf(), crawlDb, false);
        try {
            boolean success = mergeJob.waitForCompletion(true);
            if (!success) {
                String message = NutchJob.getJobFailureLogMessage("Crawl", mergeJob);
                LOG.error(message);
                fs.delete(tempDir, true);
                NutchJob.cleanupAfterFailure(outPath, lock, fs);
                throw new RuntimeException(message);
            }
        } catch (IOException | ClassNotFoundException | InterruptedException e) {
            LOG.error("DeduplicationMergeJob:", e);
            fs.delete(tempDir, true);
            NutchJob.cleanupAfterFailure(outPath, lock, fs);
            if (e instanceof InterruptedException) {
                Thread.currentThread().interrupt();
            }
            return -1;
        }

        CrawlDb.install(mergeJob, crawlDb);
        fs.delete(tempDir, true);
        stopWatch.stop();
        LOG.info("Deduplication finished, elapsed: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS));
        return 0;
    }

    /**
     * Command-line entry point: runs the tool through {@link ToolRunner} with a
     * configuration from {@link NutchConfiguration#create()} and exits the JVM
     * with the tool's return code.
     *
     * @param args command-line arguments, passed on to the tool
     * @throws Exception whatever {@code ToolRunner.run} throws
     */
    public static void main(String[] args) throws Exception {
        Configuration configuration = NutchConfiguration.create();
        DeduplicationJob tool = new DeduplicationJob();
        int exitCode = ToolRunner.run(configuration, tool, args);
        System.exit(exitCode);
    }

    /**
     * Map-based entry point: resolves the CrawlDb path and delegates to
     * {@link #run(String[])} with that path as the only argument.
     *
     * @param args tool arguments; the value of key {@code "crawldb"}, if present,
     *        is used as CrawlDb path (it is cast to {@code String})
     * @param crawlId crawl identifier; {@code crawlId + "/crawldb"} is used as
     *        CrawlDb path when {@code args} has no {@code "crawldb"} entry
     * @return a map with the single entry {@code "result"} holding the return code
     *         of {@link #run(String[])} as a string
     * @throws Exception if the delegated run throws
     */
    @Override
    public Map<String, Object> run(Map<String, Object> args, String crawlId) throws Exception {
        // Must be a String (not Object) to be usable as element of the String[]
        // argument vector.
        String crawldb = args.containsKey("crawldb") ? (String) args.get("crawldb") : crawlId + "/crawldb";
        int res = this.run(new String[] { crawldb });
        Map<String, Object> results = new HashMap<>();
        results.put("result", Integer.toString(res));
        return results;
    }

    public static class StatusUpdateReducer
    extends Reducer<Text, CrawlDatum, Text, CrawlDatum> {
        private CrawlDatum old = new CrawlDatum();
        private CrawlDatum duplicate = new CrawlDatum();

        public void setup(Reducer.Context context) {
        }

        public void reduce(Text key, Iterable<CrawlDatum> values, Reducer.Context context) throws IOException, InterruptedException {
            boolean duplicateSet = false;
            for (CrawlDatum val : values) {
                if (val.getStatus() == 7) {
                    this.duplicate.set(val);
                    duplicateSet = true;
                    continue;
                }
                this.old.set(val);
            }
            if (duplicateSet) {
                context.write((Object)key, (Object)this.duplicate);
                return;
            }
            context.write((Object)key, (Object)this.old);
        }
    }

    public static class DedupReducer<K extends Writable>
    extends Reducer<K, CrawlDatum, Text, CrawlDatum> {
        protected String[] compareOrder;

        public void setup(Reducer.Context context) {
            Configuration conf = context.getConfiguration();
            this.compareOrder = conf.get(DeduplicationJob.DEDUPLICATION_COMPARE_ORDER).split(",");
        }

        protected void writeOutAsDuplicate(CrawlDatum datum, Reducer.Context context) throws IOException, InterruptedException {
            datum.setStatus(7);
            Text key = (Text)datum.getMetaData().remove((Object)urlKey);
            context.getCounter("DeduplicationJobStatus", "Documents marked as duplicate").increment(1L);
            context.write((Object)key, (Object)datum);
        }

        public void reduce(K key, Iterable<CrawlDatum> values, Reducer.Context context) throws IOException, InterruptedException {
            CrawlDatum existingDoc = null;
            for (CrawlDatum newDoc : values) {
                if (existingDoc == null) {
                    existingDoc = new CrawlDatum();
                    existingDoc.set(newDoc);
                    continue;
                }
                CrawlDatum duplicate = this.getDuplicate(existingDoc, newDoc);
                if (duplicate == null) continue;
                this.writeOutAsDuplicate(duplicate, context);
                if (duplicate != existingDoc) continue;
                existingDoc.set(newDoc);
            }
        }

        protected CrawlDatum getDuplicate(CrawlDatum existingDoc, CrawlDatum newDoc) {
            block16: for (int i = 0; i < this.compareOrder.length; ++i) {
                switch (this.compareOrder[i]) {
                    case "score": {
                        if (existingDoc.getScore() < newDoc.getScore()) {
                            return existingDoc;
                        }
                        if (!(existingDoc.getScore() > newDoc.getScore())) continue block16;
                        return newDoc;
                    }
                    case "fetchTime": {
                        if (existingDoc.getFetchTime() > newDoc.getFetchTime()) {
                            return newDoc;
                        }
                        if (existingDoc.getFetchTime() >= newDoc.getFetchTime()) continue block16;
                        return existingDoc;
                    }
                    case "httpsOverHttp": {
                        String url1 = existingDoc.getMetaData().get((Object)urlKey).toString();
                        String url2 = newDoc.getMetaData().get((Object)urlKey).toString();
                        if (url1.startsWith("https://") && url2.startsWith("http://") && url1.substring(8).equals(url2.substring(7))) {
                            return newDoc;
                        }
                        if (!url2.startsWith("https://") || !url1.startsWith("http://") || !url2.substring(8).equals(url1.substring(7))) continue block16;
                        return existingDoc;
                    }
                    case "urlLength": {
                        String urlExisting = existingDoc.getMetaData().get((Object)urlKey).toString();
                        String urlnewDoc = newDoc.getMetaData().get((Object)urlKey).toString();
                        try {
                            urlExisting = URLDecoder.decode(urlExisting, UTF_8);
                        }
                        catch (UnsupportedEncodingException | IllegalArgumentException e) {
                            LOG.error("Error decoding: {}", (Object)urlExisting, (Object)e);
                        }
                        try {
                            urlnewDoc = URLDecoder.decode(urlnewDoc, UTF_8);
                        }
                        catch (UnsupportedEncodingException | IllegalArgumentException e) {
                            LOG.error("Error decoding: {}", (Object)urlnewDoc, (Object)e);
                        }
                        if (urlExisting.length() < urlnewDoc.length()) {
                            return newDoc;
                        }
                        if (urlExisting.length() <= urlnewDoc.length()) continue block16;
                        return existingDoc;
                    }
                }
            }
            return null;
        }
    }

    public static class DBFilter
    extends Mapper<Text, CrawlDatum, BytesWritable, CrawlDatum> {
        private String groupMode;

        public void setup(Mapper.Context context) {
            Configuration conf = context.getConfiguration();
            this.groupMode = conf.get(DeduplicationJob.DEDUPLICATION_GROUP_MODE);
        }

        public void map(Text key, CrawlDatum value, Mapper.Context context) throws IOException, InterruptedException {
            if (value.getStatus() == 2 || value.getStatus() == 6) {
                byte[] signature = value.getSignature();
                if (signature == null) {
                    return;
                }
                String url = key.toString();
                BytesWritable sig = null;
                switch (this.groupMode) {
                    case "none": {
                        sig = new BytesWritable(signature);
                        break;
                    }
                    case "host": {
                        byte[] host = URLUtil.getHost(url).getBytes();
                        byte[] data = new byte[signature.length + host.length];
                        System.arraycopy(signature, 0, data, 0, signature.length);
                        System.arraycopy(host, 0, data, signature.length, host.length);
                        sig = new BytesWritable(data);
                        break;
                    }
                    case "domain": {
                        byte[] domain = URLUtil.getDomainName(url).getBytes();
                        byte[] data = new byte[signature.length + domain.length];
                        System.arraycopy(signature, 0, data, 0, signature.length);
                        System.arraycopy(domain, 0, data, signature.length, domain.length);
                        sig = new BytesWritable(data);
                    }
                }
                value.getMetaData().put((Writable)urlKey, (Writable)key);
                context.write((Object)sig, (Object)value);
            }
        }
    }
}

