/*
 * Decompiled with CFR 0.152.
 */
package org.deeplearning4j.bagofwords.vectorizer;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.deeplearning4j.bagofwords.vectorizer.BaseTextVectorizer;
import org.deeplearning4j.bagofwords.vectorizer.TextVectorizer;
import org.deeplearning4j.models.word2vec.wordstore.VocabCache;
import org.deeplearning4j.text.documentiterator.DocumentIterator;
import org.deeplearning4j.text.invertedindex.InvertedIndex;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.deeplearning4j.util.MathUtils;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.dataset.DataSet;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.util.FeatureUtil;

/**
 * Text vectorizer that turns text into a single-row array of tf-idf weights,
 * one column per word known to the vocabulary cache.
 *
 * <p>All statistics (word frequency, document counts, word indices) are read
 * from the {@code cache} field; tokenization is delegated to the
 * {@code tokenizerFactory} field. Labels are encoded through
 * {@link FeatureUtil#toOutcomeVector} using their position in {@code labels}.
 *
 * <p>Text read from streams and files is decoded as UTF-8 so that results do
 * not depend on the platform default charset.
 */
public class TfidfVectorizer
extends BaseTextVectorizer
implements Serializable {
    /** Creates an unconfigured vectorizer. */
    public TfidfVectorizer() {
    }

    /**
     * Creates a vectorizer by forwarding every argument, unchanged, to the
     * {@link BaseTextVectorizer} constructor. Intended to be reached through
     * {@link Builder#build()}.
     */
    protected TfidfVectorizer(VocabCache cache, TokenizerFactory tokenizerFactory, List<String> stopWords, int layerSize, int minWordFrequency, DocumentIterator docIter, SentenceIterator sentenceIterator, List<String> labels, InvertedIndex index, int batchSize, double sample) {
        super(cache, tokenizerFactory, stopWords, layerSize, minWordFrequency, docIter, sentenceIterator, labels, index, batchSize, sample);
    }

    /**
     * Returns the tf-idf weight of {@code word}, combining
     * {@link #tfForWord(String)} and {@link #idfForWord(String)} through
     * {@code MathUtils.tfidf}.
     */
    private double tfidfWord(String word) {
        return MathUtils.tfidf(this.tfForWord(word), this.idfForWord(word));
    }

    /**
     * Returns the term-frequency component for {@code word}.
     *
     * <p>NOTE(review): the frequency is the one stored in the vocabulary cache,
     * not a count taken from the text being vectorized -- confirm this is
     * intended.
     */
    private double tfForWord(String word) {
        return MathUtils.tf((int)this.cache.wordFrequency(word));
    }

    /**
     * Returns the inverse-document-frequency component for {@code word},
     * computed from the cache's total document count and the number of
     * documents the word appeared in.
     */
    private double idfForWord(String word) {
        return MathUtils.idf((double)this.cache.totalNumberOfDocs(), (double)this.cache.docAppearedIn(word));
    }

    /**
     * Builds the tf-idf row for {@code text}.
     *
     * @param text raw text to tokenize
     * @return an array created with shape {@code 1 x cache.numWords()}; the
     *         column at each token's cache index holds that token's tf-idf
     *         weight, and tokens with a negative cache index are skipped
     */
    private INDArray tfidfForInput(String text) {
        INDArray weights = Nd4j.create(1, (int)this.cache.numWords());
        Tokenizer tokenizer = this.tokenizerFactory.create(text);
        for (String token : tokenizer.getTokens()) {
            int column = this.cache.indexOf(token);
            if (column < 0) {
                // The cache reports a negative index for this token; there is no column to fill.
                continue;
            }
            weights.putScalar(column, this.tfidfWord(token));
        }
        return weights;
    }

    /**
     * Reads the whole stream, decodes it as UTF-8 and builds its tf-idf row.
     * The stream is not closed here.
     *
     * @throws RuntimeException wrapping the {@link IOException} if the stream
     *         cannot be read
     */
    private INDArray tfidfForInput(InputStream is) {
        final String text;
        try {
            // Explicit charset: new String(byte[]) would use the platform default.
            text = new String(IOUtils.toByteArray(is), StandardCharsets.UTF_8);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return this.tfidfForInput(text);
    }

    /**
     * Encodes {@code label} as an outcome vector sized to the label list.
     *
     * @throws IllegalArgumentException if {@code label} is not contained in
     *         {@code labels}; {@code List.indexOf} would yield -1, which is
     *         not a usable outcome index
     */
    private INDArray labelVector(String label) {
        int labelIndex = this.labels.indexOf(label);
        if (labelIndex < 0) {
            throw new IllegalArgumentException("Unknown label: " + label);
        }
        return FeatureUtil.toOutcomeVector(labelIndex, this.labels.size());
    }

    /**
     * Vectorizes the UTF-8 text read from {@code is}.
     *
     * @param is stream holding the text; read fully, not closed
     * @param label label to attach; must be present in {@code labels}
     * @return a data set pairing the tf-idf row with the label's outcome vector
     * @throws RuntimeException if the stream cannot be read
     * @throws IllegalArgumentException if {@code label} is unknown
     */
    @Override
    public DataSet vectorize(InputStream is, String label) {
        return new DataSet(this.tfidfForInput(is), this.labelVector(label));
    }

    /**
     * Vectorizes {@code text}.
     *
     * @param text raw text to tokenize
     * @param label label to attach; must be present in {@code labels}
     * @return a data set pairing the tf-idf row with the label's outcome vector
     * @throws IllegalArgumentException if {@code label} is unknown
     */
    @Override
    public DataSet vectorize(String text, String label) {
        INDArray features = this.tfidfForInput(text);
        INDArray outcome = this.labelVector(label);
        return new DataSet(features, outcome);
    }

    /**
     * Vectorizes the contents of {@code input}, read as UTF-8.
     *
     * @param input file holding the text
     * @param label label to attach; must be present in {@code labels}
     * @return a data set pairing the tf-idf row with the label's outcome vector
     * @throws RuntimeException wrapping the {@link IOException} if the file
     *         cannot be read
     * @throws IllegalArgumentException if {@code label} is unknown
     */
    @Override
    public DataSet vectorize(File input, String label) {
        try {
            // Explicit encoding: the single-argument overload uses the platform default.
            return this.vectorize(FileUtils.readFileToString(input, StandardCharsets.UTF_8.name()), label);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Returns the tf-idf row for {@code text} without attaching a label.
     */
    @Override
    public INDArray transform(String text) {
        return this.tfidfForInput(text);
    }

    /**
     * Not implemented: always returns {@code null}. Callers must null-check
     * the result.
     */
    public DataSet vectorize() {
        return null;
    }

    /**
     * Builder that produces {@link TfidfVectorizer} instances from the fields
     * configured on the base vectorizer builder.
     */
    public static class Builder
    extends org.deeplearning4j.bagofwords.vectorizer.Builder {
        /** Creates a {@link TfidfVectorizer} from the builder's current field values. */
        @Override
        public TextVectorizer build() {
            return new TfidfVectorizer(this.cache, this.tokenizerFactory, this.stopWords, this.layerSize, this.minWordFrequency, this.docIter, this.sentenceIterator, this.labels, this.index, this.batchSize, this.sample);
        }
    }
}

