package it.unimi.dsi.big.mg4j.document;

import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import it.unimi.dsi.Util;
import it.unimi.dsi.big.mg4j.document.PropertyBasedDocumentFactory;
import it.unimi.dsi.fastutil.bytes.ByteArrays;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.io.FastByteArrayInputStream;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.Reference2ObjectArrayMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.MultipleInputStream;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.sux4j.util.EliasFanoMonotoneLongBigList;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.net.URLEncoder;
import java.util.Arrays;
import java.util.zip.GZIPInputStream;
import org.apache.log4j.Logger;

/* loaded from: input_file:it/unimi/dsi/big/mg4j/document/WikipediaDocumentCollection.class */
public class WikipediaDocumentCollection extends AbstractDocumentCollection implements Serializable {
    /** Serialization version identifier. */
    private static final long serialVersionUID = 1;
    /** Number of fields per line; equals the number of entries in {@link #FIELD_NAME} (presumably the constant behind the literal 10 used in the method bodies). */
    private static final int NUM_FIELDS = 10;
    /** Names of the source files; each one is opened with a {@link FileInputStream}. */
    private final String[] file;
    /** Whether the scanning constructor wraps each file in a {@link GZIPInputStream}. */
    private boolean gzipped;
    /** The factory returned by {@link #factory()} and used by {@link #document(long)}. */
    private final DocumentFactory factory;
    /** For each file, the stream positions at which its documents start, followed by the position reached at end of file. */
    private final ObjectArrayList<EliasFanoMonotoneLongBigList> pointers;
    /** Total number of documents found in all files. */
    private final int size;
    /** If true, lines starting with {@link #SENTENCE_MARKER} also start a new document during scanning. */
    private final boolean phrase;
    /** {@code firstDocument[k]} is the number of documents in the files preceding file {@code k}; the last entry is the total. */
    private final long[] firstDocument;
    /** One byte buffer per field, holding the content of the most recently read document. */
    private transient byte[][] buffer;
    /** Buffer receiving the bytes of the line read by {@code readLine}; it may contain stale bytes past the current line. */
    private transient byte[] lineBuffer;
    /** Number of valid bytes in each entry of {@link #buffer}. */
    private transient int[] bufferSize;
    /** Metadata of the most recently read document. */
    private transient Reference2ObjectMap<Enum<?>, Object> metadata;
    /** Index compared with the requested one at the top of {@code readDocument}; {@code initBuffers} sets it to -1. */
    private transient int lastDocument;
    private static final Logger LOGGER = Util.getLogger(WikipediaDocumentCollection.class);
    /** Prefix shared by all marker lines. */
    private static final byte[] META_MARKER = "%%#".getBytes();
    /** Prefix of the lines that start a document. */
    private static final byte[] DOC_MARKER = "%%#DOC".getBytes();
    /** Prefix of the lines carrying the page title. */
    private static final byte[] PAGE_MARKER = "%%#PAGE".getBytes();
    /** Prefix of the lines that start a sentence. */
    private static final byte[] SENTENCE_MARKER = "%%#SEN".getBytes();
    /** Field names handed to the replicated factory by {@code main}, one per column. */
    private static final String[] FIELD_NAME = {"token", "POS", "lemma", "CONL", "WNSS", "WSJ", "ana", "head", "deplabel", "link"};

    /* loaded from: input_file:it/unimi/dsi/big/mg4j/document/WikipediaDocumentCollection$WhitespaceWordReader.class */
    /**
     * A {@link FastBufferedReader} variant that classifies characters purely by
     * whitespace. {@code main} passes this class name as the word reader of
     * the document factory.
     */
    public static class WhitespaceWordReader extends FastBufferedReader {
        private static final long serialVersionUID = 1;

        /**
         * Tells whether a character belongs to a word.
         *
         * @param c the character to classify.
         * @return true unless {@code c} is whitespace according to {@link Character#isWhitespace(char)}.
         */
        protected boolean isWordConstituent(char c) {
            return !Character.isWhitespace(c);
        }
    }

    /**
     * (Re)creates all transient state: the per-field buffers and their sizes,
     * the line buffer, the metadata map and the last-document marker.
     *
     * <p>Called by the constructors and by {@code readObject}, since none of
     * this state is serialised.
     */
    private final void initBuffers() {
        this.bufferSize = new int[NUM_FIELDS];
        // One buffer per field: the array needs two dimensions (the decompiled
        // "new byte[10]" dropped one and could not be assigned to a byte[][]).
        this.buffer = new byte[NUM_FIELDS][];
        Arrays.fill(this.buffer, ByteArrays.EMPTY_ARRAY);
        this.lineBuffer = ByteArrays.EMPTY_ARRAY;
        // -1 never matches a valid document index, so the first read is never skipped.
        this.lastDocument = -1;
        this.metadata = new Reference2ObjectArrayMap<>();
    }

    /**
     * Builds a collection from uncompressed files; equivalent to calling the
     * four-argument constructor with {@code false} as last argument.
     *
     * @param strArr the names of the files to scan.
     * @param documentFactory the factory used to build documents.
     * @param z whether sentences, rather than whole documents, are indexed.
     * @throws IOException if a file cannot be scanned.
     */
    public WikipediaDocumentCollection(String[] strArr, DocumentFactory documentFactory, boolean z) throws IOException {
        this(strArr, documentFactory, z, false);
    }

    public WikipediaDocumentCollection(String[] strArr, DocumentFactory documentFactory, boolean z, boolean z2) throws IOException {
        this.file = strArr;
        this.factory = documentFactory;
        this.gzipped = z2;
        this.phrase = z;
        initBuffers();
        LongArrayList longArrayList = new LongArrayList();
        this.pointers = new ObjectArrayList<>(strArr.length);
        this.firstDocument = new long[strArr.length + 1];
        int i = 0;
        ProgressLogger progressLogger = new ProgressLogger(LOGGER);
        progressLogger.expectedUpdates = strArr.length;
        progressLogger.itemsName = "files";
        progressLogger.start("Scanning files...");
        for (String str : strArr) {
            longArrayList.clear();
            FastBufferedInputStream fastBufferedInputStream = z2 ? new FastBufferedInputStream(new GZIPInputStream(new FileInputStream(str))) : new FastBufferedInputStream(new FileInputStream(str));
            while (true) {
                long position = fastBufferedInputStream.position();
                if (readLine(fastBufferedInputStream) == -1) {
                    break;
                }
                if (startsWith(this.lineBuffer, DOC_MARKER)) {
                    longArrayList.add(position);
                }
                if (z && startsWith(this.lineBuffer, SENTENCE_MARKER)) {
                    longArrayList.add(position);
                }
            }
            i += longArrayList.size();
            longArrayList.add(fastBufferedInputStream.position());
            fastBufferedInputStream.close();
            this.pointers.add(new EliasFanoMonotoneLongBigList(longArrayList));
            this.firstDocument[this.pointers.size()] = i;
            progressLogger.update();
        }
        progressLogger.done();
        this.size = i;
    }

    /**
     * Reads one line from the given stream into {@code lineBuffer}, enlarging
     * the buffer as needed.
     *
     * <p>Only the first bytes of {@code lineBuffer}, up to the returned
     * length, belong to the line; anything beyond is left over from earlier
     * lines.
     *
     * @param fastBufferedInputStream the stream to read from.
     * @return the number of bytes of the line, or -1 if the stream was already at end of file.
     * @throws IOException if the underlying stream fails.
     */
    private final int readLine(FastBufferedInputStream fastBufferedInputStream) throws IOException {
        int filled = 0;
        for (;;) {
            final int room = this.lineBuffer.length - filled;
            final int read = fastBufferedInputStream.readLine(this.lineBuffer, filled, room, FastBufferedInputStream.ALL_TERMINATORS);
            if (read == -1) {
                // End of file. If earlier rounds already stored bytes, they form a last,
                // unterminated line that happened to fill the buffer exactly: return it
                // instead of dropping it.
                return filled > 0 ? filled : -1;
            }
            filled += read;
            if (read != room) {
                // The line ended before the buffer was full.
                return filled;
            }
            // The buffer is full and the line may continue: enlarge and read on.
            this.lineBuffer = ByteArrays.grow(this.lineBuffer, this.lineBuffer.length + 1);
        }
    }

    /**
     * Builds a collection from already computed data, without scanning any
     * file. Used by the copy method.
     *
     * @param strArr the names of the files.
     * @param documentFactory the factory used to build documents.
     * @param objectArrayList for each file, the document pointers.
     * @param i the overall number of documents.
     * @param jArr for each file, the index of its first document, plus a final entry.
     * @param z whether sentences, rather than whole documents, are indexed.
     * @param z2 whether the files are gzipped.
     */
    protected WikipediaDocumentCollection(String[] strArr, DocumentFactory documentFactory, ObjectArrayList<EliasFanoMonotoneLongBigList> objectArrayList, int i, long[] jArr, boolean z, boolean z2) {
        // Transient working state first; it does not depend on any argument.
        initBuffers();

        // Where the data lives and how it is stored.
        this.file = strArr;
        this.gzipped = z2;

        // How documents are delimited and built.
        this.phrase = z;
        this.factory = documentFactory;

        // The precomputed index.
        this.size = i;
        this.firstDocument = jArr;
        this.pointers = objectArrayList;
    }

    /**
     * Checks whether the first bytes of an array coincide with a given prefix.
     *
     * <p>The whole of {@code bArr} is considered, not just the bytes of the
     * line most recently read into it.
     *
     * @param bArr the array to inspect.
     * @param bArr2 the expected prefix.
     * @return true if {@code bArr} is at least as long as {@code bArr2} and starts with its bytes.
     */
    private static boolean startsWith(byte[] bArr, byte[] bArr2) {
        final int prefixLength = bArr2.length;
        if (prefixLength > bArr.length) {
            return false;
        }
        for (int k = 0; k < prefixLength; k++) {
            if (bArr[k] != bArr2[k]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns the factory this collection was built with.
     *
     * @return the document factory.
     */
    @Override // it.unimi.dsi.big.mg4j.document.DocumentSequence
    public DocumentFactory factory() {
        return this.factory;
    }

    /**
     * Returns the number of documents in this collection.
     *
     * @return the document count, widened from the {@code int} field.
     */
    @Override // it.unimi.dsi.big.mg4j.document.DocumentCollection
    public long size() {
        return this.size;
    }

    /**
     * Returns the metadata of a document, reading the document if necessary.
     *
     * <p>If the document carries no title, a synthetic one of the form
     * {@code Sentence #n} is added, with {@code n} counted from one. The map
     * returned is the internal one, which is reused by later reads.
     *
     * @param j the index of the document.
     * @return the metadata map of the document.
     * @throws IOException if the document cannot be read.
     */
    @Override // it.unimi.dsi.big.mg4j.document.DocumentCollection
    public Reference2ObjectMap<Enum<?>, Object> metadata(long j) throws IOException {
        readDocument(j, -1, null);
        final Reference2ObjectMap<Enum<?>, Object> current = this.metadata;
        if (current.containsKey(PropertyBasedDocumentFactory.MetadataKeys.TITLE)) {
            return current;
        }
        // One-based numbering, written as a plain literal rather than borrowing
        // the value of the serialisation version constant.
        current.put(PropertyBasedDocumentFactory.MetadataKeys.TITLE, "Sentence #" + (j + 1));
        return current;
    }

    /**
     * Builds a document by handing the factory the content stream and the
     * metadata of the requested index.
     *
     * @param j the index of the document.
     * @return the document produced by the factory.
     * @throws IOException if the document cannot be read.
     */
    @Override // it.unimi.dsi.big.mg4j.document.DocumentCollection
    public Document document(long j) throws IOException {
        // stream(j) is evaluated before metadata(j); both read document j.
        return this.factory.getDocument(stream(j), metadata(j));
    }

    /**
     * Returns the content of a document as a multiple input stream with one
     * segment per field.
     *
     * <p>Each segment wraps the internal buffer of its field without copying,
     * so it is only meaningful until another document is read.
     *
     * @param j the index of the document.
     * @return a stream over the field buffers of the document.
     * @throws IOException if the document cannot be read.
     */
    @Override // it.unimi.dsi.big.mg4j.document.DocumentCollection
    public InputStream stream(long j) throws IOException {
        readDocument(j, -1, null);
        // initBuffers() allocates exactly one buffer per field.
        final FastByteArrayInputStream[] segments = new FastByteArrayInputStream[this.buffer.length];
        int field = 0;
        while (field < segments.length) {
            final byte[] content = this.buffer[field];
            final int length = this.bufferSize[field];
            segments[field] = new FastByteArrayInputStream(content, 0, length);
            field++;
        }
        return MultipleInputStream.getStream(segments);
    }

    /**
     * Returns an iterator that scans the files sequentially, keeping a single
     * stream open on the file containing the next document.
     *
     * @return an iterator over all documents, in index order.
     * @throws IOException if the first file cannot be opened.
     */
    @Override // it.unimi.dsi.big.mg4j.document.AbstractDocumentCollection, it.unimi.dsi.big.mg4j.document.DocumentSequence
    public DocumentIterator iterator() throws IOException {
        return new AbstractDocumentIterator() {
            /** Index of the next document to return. */
            private int index = 0;
            /** Index of the file the stream is currently open on. */
            private int f = 0;
            /** Stream on file {@code f}; null if there are no files or after close(). */
            private FastBufferedInputStream fbis;

            {
                // With no files there is nothing to open; nextDocument() returns null at once.
                if (WikipediaDocumentCollection.this.file.length > 0) {
                    this.fbis = openSequentially(WikipediaDocumentCollection.this.file[0]);
                }
            }

            /** Opens a file the same way the scanning constructor does, decompressing it if the collection is gzipped. */
            private FastBufferedInputStream openSequentially(String name) throws IOException {
                final FileInputStream raw = new FileInputStream(name);
                try {
                    return new FastBufferedInputStream(WikipediaDocumentCollection.this.gzipped ? new GZIPInputStream(raw) : raw);
                } catch (IOException | RuntimeException e) {
                    raw.close();
                    throw e;
                }
            }

            @Override // it.unimi.dsi.big.mg4j.document.AbstractDocumentIterator, it.unimi.dsi.big.mg4j.document.DocumentIterator, java.io.Closeable, java.lang.AutoCloseable
            public void close() throws IOException {
                super.close();
                if (this.fbis != null) {
                    this.fbis.close();
                    this.fbis = null;
                }
            }

            @Override // it.unimi.dsi.big.mg4j.document.DocumentIterator
            public Document nextDocument() throws IOException {
                if (this.index == WikipediaDocumentCollection.this.size) {
                    return null;
                }
                // A loop, not a single test: files without documents have the same first
                // index as their successor and must all be skipped. The loop stops at the
                // last file at the latest, because the final entry of firstDocument equals
                // the collection size, which index has not reached.
                while (this.index == WikipediaDocumentCollection.this.firstDocument[this.f + 1]) {
                    this.fbis.close();
                    // Forget the closed stream first, so close() stays safe if opening fails.
                    this.fbis = null;
                    this.f++;
                    this.fbis = openSequentially(WikipediaDocumentCollection.this.file[this.f]);
                }
                WikipediaDocumentCollection.this.readDocument(this.index, this.f, this.fbis);
                return WikipediaDocumentCollection.this.document(this.index++);
            }
        };
    }

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Loads a document into the field buffers and the metadata map.
     *
     * <p>Nothing is done if the requested document is the one most recently
     * loaded. Otherwise every line between the start pointer of the document
     * and the next pointer is processed: page lines provide title and URI,
     * page and sentence lines add a paragraph mark to every field (unless
     * sentences are indexed), and any other line is split at tabs with each
     * column appended, followed by a space, to the buffer of its field.
     *
     * @param j the index of the document.
     * @param i the index of the file containing the document; ignored and recomputed if no stream is given.
     * @param fastBufferedInputStream a stream on file {@code i}, or null to open (and close) one here.
     * @throws IOException if the file cannot be read or ends before the document does.
     */
    public void readDocument(long j, int i, FastBufferedInputStream fastBufferedInputStream) throws IOException {
        ensureDocumentIndex(j);
        if (j == this.lastDocument) {
            return;
        }
        final boolean ownsStream = fastBufferedInputStream == null;
        if (ownsStream) {
            i = Arrays.binarySearch(this.firstDocument, j);
            if (i < 0) {
                // Not a first document: take the last file starting before j.
                i = (-i) - 2;
            } else {
                // Files without documents share their first index with the next file and
                // binary search may land on any of them: move to the last one, which is
                // the file actually containing j.
                while (i + 1 < this.file.length && this.firstDocument[i + 1] == j) {
                    i++;
                }
            }
            fastBufferedInputStream = openSourceFile(this.file[i]);
        }
        try {
            final EliasFanoMonotoneLongBigList offsets = this.pointers.get(i);
            final long local = j - this.firstDocument[i];
            final long start = offsets.getLong(local);
            final long end = offsets.getLong(local + 1);
            // The buffers are about to be overwritten: whatever was cached is gone,
            // even if this read fails half-way.
            this.lastDocument = -1;
            seekTo(fastBufferedInputStream, start);
            IntArrays.fill(this.bufferSize, 0);
            this.metadata.clear();
            while (fastBufferedInputStream.position() < end) {
                final int lineLength = readLine(fastBufferedInputStream);
                if (lineLength == -1) {
                    if (fastBufferedInputStream.position() < end) {
                        // The file is shorter than when it was scanned; looping would never end.
                        throw new IOException("Unexpected end of file " + this.file[i] + " while reading document " + j);
                    }
                    break;
                }
                if (!lineHasMarker(lineLength, META_MARKER)) {
                    appendTokenFields(lineLength);
                    continue;
                }
                if (this.phrase && lineHasMarker(lineLength, DOC_MARKER)) {
                    // When sentences are indexed, a document line yields an empty document.
                    break;
                }
                final boolean page = lineHasMarker(lineLength, PAGE_MARKER);
                final boolean sentence = !page && lineHasMarker(lineLength, SENTENCE_MARKER);
                if (page) {
                    // The title is whatever follows the marker and one separator byte.
                    final int from = Math.min(PAGE_MARKER.length + 1, lineLength);
                    final int length = Math.max((lineLength - PAGE_MARKER.length) - 1, 0);
                    final String title = new String(this.lineBuffer, from, length, "UTF-8").trim();
                    this.metadata.put(PropertyBasedDocumentFactory.MetadataKeys.TITLE, title);
                    this.metadata.put(PropertyBasedDocumentFactory.MetadataKeys.URI, "http://en.wikipedia.org/wiki/" + URLEncoder.encode(title, "UTF-8"));
                }
                if ((page || sentence) && !this.phrase) {
                    appendParagraphMark();
                }
            }
            // Remember what the buffers hold, so that asking again for the same
            // document (as document() does through stream() and metadata()) costs nothing.
            this.lastDocument = (int) j;
        } finally {
            if (ownsStream) {
                fastBufferedInputStream.close();
            }
        }
    }

    /** Opens a source file the same way the scanning constructor does, decompressing it if the collection is gzipped. */
    private FastBufferedInputStream openSourceFile(String name) throws IOException {
        final FileInputStream raw = new FileInputStream(name);
        try {
            return new FastBufferedInputStream(this.gzipped ? new GZIPInputStream(raw) : raw);
        } catch (IOException | RuntimeException e) {
            raw.close();
            throw e;
        }
    }

    /** Moves a stream to the given position; a decompressing stream cannot be repositioned, so it is advanced by skipping. */
    private void seekTo(FastBufferedInputStream in, long target) throws IOException {
        long remaining = target - in.position();
        if (!this.gzipped || remaining <= 0) {
            in.position(target);
            return;
        }
        while (remaining > 0) {
            final long skipped = in.skip(remaining);
            if (skipped <= 0) {
                throw new IOException("Unexpected end of file while moving to position " + target);
            }
            remaining -= skipped;
        }
    }

    /** Tells whether the current line, of the given length, starts with a marker; bytes past the line are stale and must not match. */
    private boolean lineHasMarker(int lineLength, byte[] marker) {
        return lineLength >= marker.length && startsWith(this.lineBuffer, marker);
    }

    /** Appends a pilcrow sign (U+00B6 encoded in UTF-8) and a newline to the buffer of every field. */
    private void appendParagraphMark() {
        for (int field = 0; field < NUM_FIELDS; field++) {
            this.buffer[field] = ByteArrays.grow(this.buffer[field], this.bufferSize[field] + 3);
            final byte[] target = this.buffer[field];
            target[this.bufferSize[field]++] = (byte) 0xC2;
            target[this.bufferSize[field]++] = (byte) 0xB6;
            target[this.bufferSize[field]++] = '\n';
        }
    }

    /** Splits the current line at tabs and appends each non-empty column, followed by a space, to the buffer of its field. */
    private void appendTokenFields(int lineLength) {
        // NOTE(review): a line with more than NUM_FIELDS columns still fails with an
        // ArrayIndexOutOfBoundsException, as it always did.
        int field = 0;
        for (int k = 0; k < lineLength; k++) {
            final byte b = this.lineBuffer[k];
            if (b == '\t') {
                field++;
                continue;
            }
            // Room for this byte and for the separator that may follow it.
            this.buffer[field] = ByteArrays.grow(this.buffer[field], this.bufferSize[field] + 2);
            final byte[] target = this.buffer[field];
            target[this.bufferSize[field]++] = b;
            if (k == lineLength - 1 || this.lineBuffer[k + 1] == '\t') {
                target[this.bufferSize[field]++] = ' ';
            }
        }
    }

    /**
     * Returns a new collection over the same files.
     *
     * <p>The file names, the pointers and the first-document array are shared
     * with this collection; the factory is copied and the working buffers are
     * created anew by the constructor.
     *
     * @return a copy of this collection.
     */
    @Override // it.unimi.dsi.big.mg4j.document.DocumentCollection
    /* renamed from: copy, reason: merged with bridge method [inline-methods] */
    public WikipediaDocumentCollection m30copy() {
        return new WikipediaDocumentCollection(this.file, this.factory.m25copy(), this.pointers, this.size, this.firstDocument, this.phrase, this.gzipped);
    }

    /**
     * Deserialisation hook: restores the serialised fields, then recreates the
     * transient buffers, which are not part of the serialised form.
     *
     * @param objectInputStream the stream the object is being read from.
     * @throws IOException if the stream cannot be read.
     * @throws ClassNotFoundException if a serialised class cannot be found.
     */
    private void readObject(ObjectInputStream objectInputStream) throws IOException, ClassNotFoundException {
        objectInputStream.defaultReadObject();
        initBuffers();
    }

    /**
     * Command-line entry point: scans a set of files and stores the resulting
     * collection in serialised form.
     *
     * <p>The files are taken from the command line or, if none is given
     * there, read one per line from standard input. An empty set only
     * produces a warning on standard error.
     *
     * @param strArr the command-line arguments.
     */
    public static void main(String[] strArr) throws IOException, JSAPException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {
        final Parameter[] parameters = {
            new Switch("sentence", 's', "sentence", "Index sentences rather than documents."),
            new Switch("gzipped", 'z', "gzipped", "The files are gzipped."),
            new UnflaggedOption("collection", JSAP.STRING_PARSER, true, "The filename for the serialised collection."),
            new UnflaggedOption("file", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, true, "A list of files that will be indexed. If missing, a list of files will be read from standard input.")
        };
        final SimpleJSAP simpleJSAP = new SimpleJSAP(WikipediaDocumentCollection.class.getName(), "Saves a serialised document collection based on a set of files.", parameters);
        final JSAPResult parse = simpleJSAP.parse(strArr);
        if (simpleJSAP.messagePrinted()) {
            return;
        }

        // Every field is parsed as UTF-8 text and split at whitespace only.
        final IdentityDocumentFactory identityDocumentFactory = new IdentityDocumentFactory((Reference2ObjectMap<Enum<?>, Object>) new Reference2ObjectOpenHashMap(new PropertyBasedDocumentFactory.MetadataKeys[]{PropertyBasedDocumentFactory.MetadataKeys.ENCODING, PropertyBasedDocumentFactory.MetadataKeys.WORDREADER}, new Object[]{"UTF-8", WhitespaceWordReader.class.getName()}));

        String[] files = (String[]) parse.getObjectArray("file", new String[0]);
        if (files.length == 0) {
            // No file on the command line: one name per line from standard input.
            final ObjectArrayList<String> names = new ObjectArrayList<>();
            final BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));
            for (String line = stdin.readLine(); line != null; line = stdin.readLine()) {
                names.add(line);
            }
            files = names.toArray(new String[0]);
        }
        if (files.length == 0) {
            System.err.println("WARNING: empty file set.");
        }

        BinIO.storeObject(
            new WikipediaDocumentCollection(
                files,
                ReplicatedDocumentFactory.getFactory(identityDocumentFactory, 10, FIELD_NAME),
                parse.getBoolean("sentence"),
                parse.getBoolean("gzipped")),
            parse.getString("collection"));
    }
}
