package it.unimi.dsi.big.mg4j.tool;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import it.unimi.dsi.big.mg4j.document.Document;
import it.unimi.dsi.big.mg4j.document.TRECDocumentCollection;
import it.unimi.dsi.big.mg4j.util.MG4JClassParser;
import it.unimi.dsi.big.util.ShiftAddXorSignedStringMap;
import it.unimi.dsi.big.util.StringMap;
import it.unimi.dsi.bits.TransformationStrategies;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.io.LineIterator;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.sux4j.mph.MWHCFunction;
import it.unimi.dsi.sux4j.mph.TwoStepsLcpMonotoneMinimalPerfectHashFunction;
import it.unimi.dsi.util.BloomFilter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collection;
import org.apache.commons.lang.RandomStringUtils;
import org.apache.log4j.Logger;

/* loaded from: input_file:it/unimi/dsi/big/mg4j/tool/URLMPHVirtualDocumentResolver.class */
/**
 * A {@link VirtualDocumentResolver} that looks up URLs in a {@link StringMap} and returns
 * the number the map associates with them as the document pointer.
 *
 * <p>Relative URLs are resolved against the URI of the document most recently passed to
 * {@link #context(Document)}. That context URI is {@code transient}: it is not part of the
 * serialized form, so a freshly deserialized resolver has no context until
 * {@link #context(Document)} is called again.
 *
 * <p>The {@link #main(String[])} method builds a resolver from a list of URIs (one per line)
 * and stores it to a file.
 */
public class URLMPHVirtualDocumentResolver implements VirtualDocumentResolver {
    private static final long serialVersionUID = 1;
    private static final Logger LOGGER = Logger.getLogger(URLMPHVirtualDocumentResolver.class);

    /** Charset name used for every URI stream read or written by {@link #main(String[])}. */
    private static final String URI_ENCODING = "UTF-8";

    /** Number of random alphanumeric characters appended to a URI to make it unique. */
    private static final int RANDOM_SUFFIX_LENGTH = 32;

    /** The map from (normalized, absolute) URL strings to document pointers. */
    private final StringMap<? extends CharSequence> url2DocumentPointer;

    /** The normalized URI of the current context document, or {@code null} if none is usable. */
    private transient URI documentURI;

    /**
     * Creates a resolver backed by the given map.
     *
     * @param stringMap the map queried by {@link #resolve(CharSequence)}; its size is
     *     reported by {@link #numberOfDocuments()}.
     */
    public URLMPHVirtualDocumentResolver(StringMap<? extends CharSequence> stringMap) {
        this.url2DocumentPointer = stringMap;
    }

    /**
     * Sets the document whose URI is used as the base for relative URLs.
     *
     * <p>If the document URI is not syntactically valid the context is cleared, and
     * relative URLs will not resolve until a valid context is set.
     *
     * @param document the new context document.
     */
    @Override // it.unimi.dsi.big.mg4j.tool.VirtualDocumentResolver
    public void context(Document document) {
        try {
            this.documentURI = new URI(document.uri().toString()).normalize();
        } catch (URISyntaxException ignored) {
            // Deliberate best effort: an unparsable context simply disables relative resolution.
            this.documentURI = null;
        }
    }

    /**
     * Resolves a URL specification to a document pointer.
     *
     * @param charSequence a URL, either absolute or relative to the current context document.
     * @return the value returned by the underlying map for the normalized absolute URL, or
     *     -1 if the URL is relative and there is no context, or if any exception is thrown
     *     while parsing, resolving or looking up the URL.
     */
    @Override // it.unimi.dsi.big.mg4j.tool.VirtualDocumentResolver
    public long resolve(CharSequence charSequence) {
        try {
            URI uri = URI.create(charSequence.toString()).normalize();
            if (!uri.isAbsolute()) {
                if (this.documentURI == null) {
                    return -1L;
                }
                uri = this.documentURI.resolve(uri);
            }
            return this.url2DocumentPointer.getLong(uri.toString());
        } catch (Exception ignored) {
            // Deliberate catch-all: malformed input must yield "unresolved", not a failure.
            return -1L;
        }
    }

    /**
     * Returns the number of documents known to this resolver.
     *
     * @return the 64-bit size of the underlying map.
     */
    @Override // it.unimi.dsi.big.mg4j.tool.VirtualDocumentResolver
    public long numberOfDocuments() {
        return this.url2DocumentPointer.size64();
    }

    /**
     * Modifies a URI in place until the Bloom filter accepts it.
     *
     * <p>As long as {@code bloomFilter.add()} returns {@code false} (presumably meaning that
     * the filter considers the string already present), a slash followed by
     * {@value #RANDOM_SUFFIX_LENGTH} random alphanumeric characters is appended.
     *
     * @param bloomFilter the filter recording the URIs seen so far.
     * @param mutableString the URI; it may be extended by this method.
     */
    private static void makeUnique(BloomFilter bloomFilter, MutableString mutableString) {
        while (!bloomFilter.add(mutableString)) {
            LOGGER.debug("Duplicate URI " + mutableString);
            mutableString.append('/').append(RandomStringUtils.randomAlphanumeric(RANDOM_SUFFIX_LENGTH));
        }
    }

    /**
     * Reads all URIs from standard input (decoded as UTF-8) into memory.
     *
     * @param bufferSize the size of the read buffer.
     * @param bloomFilter if not {@code null}, every URI is passed through
     *     {@link #makeUnique(BloomFilter, MutableString)} before being stored.
     * @return the URIs, in input order.
     */
    private static Collection<MutableString> readUrisFromStandardInput(int bufferSize, BloomFilter bloomFilter) throws IOException {
        final ArrayList<MutableString> uris = new ArrayList<MutableString>();
        final ProgressLogger progressLogger = new ProgressLogger();
        progressLogger.itemsName = "URIs";
        // System.in is intentionally not closed.
        final LineIterator lines = new LineIterator(
                new FastBufferedReader(new InputStreamReader(System.in, URI_ENCODING), bufferSize), progressLogger);
        progressLogger.start("Reading URIs...");
        while (lines.hasNext()) {
            final MutableString uri = lines.next();
            if (bloomFilter != null) {
                makeUnique(bloomFilter, uri);
            }
            // Store a copy: presumably the iterator reuses the returned instance -- TODO confirm.
            uris.add(uri.copy());
        }
        progressLogger.done();
        return uris;
    }

    /**
     * Copies a file of URIs to a temporary file, making each URI unique along the way.
     *
     * <p>The input is decoded as UTF-8, the same encoding used to write the copy and to read
     * it back later, so that the result does not depend on the platform default charset.
     * The temporary file is registered for deletion on JVM exit.
     *
     * @param termFile the name of the file containing one URI per line.
     * @param bufferSize the size of the read and write buffers.
     * @param bloomFilter the filter used by {@link #makeUnique(BloomFilter, MutableString)}.
     * @return the name of the temporary file containing the unique URIs.
     */
    private static String copyMakingUnique(String termFile, int bufferSize, BloomFilter bloomFilter) throws IOException {
        final ProgressLogger progressLogger = new ProgressLogger();
        progressLogger.itemsName = "URIs";
        progressLogger.start("Copying URIs...");
        final FastBufferedReader reader = new FastBufferedReader(
                new InputStreamReader(new FileInputStream(termFile), URI_ENCODING), bufferSize);
        try {
            final LineIterator lines = new LineIterator(reader, progressLogger);
            final File uniqueFile = File.createTempFile(URLMPHVirtualDocumentResolver.class.getName(), ".uniqueuris");
            uniqueFile.deleteOnExit();
            final String uniqueFilename = uniqueFile.toString();
            final FastBufferedOutputStream output =
                    new FastBufferedOutputStream(new FileOutputStream(uniqueFilename), bufferSize);
            try {
                while (lines.hasNext()) {
                    final MutableString uri = lines.next();
                    makeUnique(bloomFilter, uri);
                    uri.writeUTF8(output);
                    output.write('\n');
                }
                progressLogger.done();
            } finally {
                output.close();
            }
            return uniqueFilename;
        } finally {
            reader.close();
        }
    }

    /**
     * Builds the signed map from URIs to their ranks.
     *
     * <p>Raw types are used for the hash functions because their generic signatures are not
     * visible from this file; the constructor calls mirror the ones this tool always made.
     *
     * @param uris the URIs; they are iterated more than once.
     * @param sorted whether to use {@link TwoStepsLcpMonotoneMinimalPerfectHashFunction}
     *     (with a prefix-free strategy) instead of {@link MWHCFunction}.
     * @param iso whether to use the ISO transformation strategy instead of the UTF-16 one.
     * @param width the signature width, in bits, passed to {@link ShiftAddXorSignedStringMap}.
     * @return the signed map.
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    private static StringMap<? extends CharSequence> buildSignedMap(Collection<MutableString> uris, boolean sorted, boolean iso, int width) throws IOException {
        if (sorted) {
            return new ShiftAddXorSignedStringMap(
                    uris.iterator(),
                    new TwoStepsLcpMonotoneMinimalPerfectHashFunction(
                            uris, iso ? TransformationStrategies.prefixFreeIso() : TransformationStrategies.prefixFreeUtf16()),
                    width);
        }
        return new ShiftAddXorSignedStringMap(
                uris.iterator(),
                new MWHCFunction(uris, iso ? TransformationStrategies.iso() : TransformationStrategies.utf16()),
                width);
    }

    /**
     * Command-line entry point: builds a resolver from a list of URIs and stores it.
     *
     * <p>URIs are read either from standard input (and kept in memory) or, with
     * {@code --offline}, from a file. With {@code --unique-uris}, duplicates are made unique
     * by appending random characters; in the offline case this goes through a temporary file.
     *
     * @param strArr the command-line arguments.
     * @throws JSAPException if the command-line parser cannot be set up.
     * @throws IOException if reading the URIs or storing the resolver fails.
     */
    public static void main(String[] strArr) throws JSAPException, IOException {
        final SimpleJSAP jsap = new SimpleJSAP(
                URLMPHVirtualDocumentResolver.class.getName(),
                "Builds a URL document resolver from a sequence of URIs, extracted typically using ScanMetadata, using a suitable function. You can specify that the list is sorted, in which case it is possible to generate a resolver that occupies less space.",
                new Parameter[] {
                    new Switch("sorted", 's', "sorted", "URIs are sorted: use a monotone minimal perfect hash function."),
                    new Switch("iso", 'i', "iso", "Use ISO-8859-1 coding internally (i.e., just use the lower eight bits of each character)."),
                    new FlaggedOption("bufferSize", JSAP.INTSIZE_PARSER, TRECDocumentCollection.DEFAULT_BUFFER_SIZE, false, 'b', "buffer-size", "The size of the I/O buffer used to read terms."),
                    new FlaggedOption("class", MG4JClassParser.getParser(), JSAP.NO_DEFAULT, false, 'c', "class", "A class used to create the function from URIs to their ranks; defaults to it.unimi.dsi.sux4j.mph.MWHCFunction for non-sorted inputs, and to it.unimi.dsi.sux4j.mph.TwoStepsLcpMonotoneMinimalPerfectHashFunction for sorted inputs."),
                    new FlaggedOption("width", JSAP.INTEGER_PARSER, Integer.toString(64), false, 'w', "width", "The width, in bits, of the signatures used to sign the function from URIs to their rank."),
                    new FlaggedOption("termFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, 'o', "offline", "Read terms from this file (without loading them into core memory) instead of standard input."),
                    new FlaggedOption("uniqueUris", JSAP.INTSIZE_PARSER, JSAP.NO_DEFAULT, false, 'U', "unique-uris", "Force URIs to be unique by adding random garbage at the end of duplicates; the argument is an upper bound for the number of URIs that will be read, and will be used to create a Bloom filter."),
                    new UnflaggedOption("resolver", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, true, false, "The filename for the resolver.")
                });
        final JSAPResult parse = jsap.parse(strArr);
        if (jsap.messagePrinted()) {
            return;
        }

        final int bufferSize = parse.getInt("bufferSize");
        final String resolverFilename = parse.getString("resolver");
        final boolean iso = parse.getBoolean("iso");
        final String termFile = parse.getString("termFile");
        final boolean uniqueUris = parse.userSpecified("uniqueUris");
        final BloomFilter bloomFilter = uniqueUris ? new BloomFilter(parse.getInt("uniqueUris")) : null;

        if (parse.userSpecified("class")) {
            // The option is accepted for compatibility but nothing below reads it.
            LOGGER.warn("The class option is currently ignored: the function type is chosen by the sorted switch only.");
        }

        final Collection<MutableString> uris;
        if (termFile == null) {
            uris = readUrisFromStandardInput(bufferSize, bloomFilter);
        } else {
            final String uriFile = uniqueUris ? copyMakingUnique(termFile, bufferSize, bloomFilter) : termFile;
            // The file is re-read lazily on each iteration instead of being loaded in memory.
            uris = new FileLinesCollection(uriFile, URI_ENCODING);
        }

        LOGGER.debug("Building function...");
        final int width = parse.getInt("width");
        final StringMap<? extends CharSequence> signedMap = buildSignedMap(uris, parse.getBoolean("sorted"), iso, width);
        BinIO.storeObject(new URLMPHVirtualDocumentResolver(signedMap), resolverFilename);
        LOGGER.debug(" done.");
    }
}
