package edu.mit.simile.vicino;

import com.wcohen.ss.api.Token;
import com.wcohen.ss.api.Tokenizer;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/mit/simile/vicino/NGramTokenizer.class */
public class NGramTokenizer implements Tokenizer {
    private int ngram_size;
    static final Pattern extra = Pattern.compile("\\p{Cntrl}|\\p{Punct}");
    static final Pattern whitespace = Pattern.compile("\\p{Space}+");
    private int nextId = 0;
    private Map<String, Token> tokMap = new TreeMap();

    /* loaded from: input_file:edu/mit/simile/vicino/NGramTokenizer$BasicToken.class */
    public class BasicToken implements Token, Comparable<Token> {
        private final int index;
        private final String value;

        BasicToken(int i, String str) {
            this.index = i;
            this.value = str;
        }

        public String getValue() {
            return this.value;
        }

        public int getIndex() {
            return this.index;
        }

        @Override // java.lang.Comparable
        public int compareTo(Token token) {
            return this.index - token.getIndex();
        }

        public int hashCode() {
            return this.value.hashCode();
        }

        public String toString() {
            return "[token#" + getIndex() + ":" + getValue() + "]";
        }
    }

    public NGramTokenizer(int i) {
        this.ngram_size = i;
    }

    public Token[] tokenize(String str) {
        String normalize = normalize(str);
        ArrayList arrayList = new ArrayList();
        for (int i = 0; i < normalize.length(); i++) {
            int i2 = i + this.ngram_size;
            if (i2 <= normalize.length()) {
                arrayList.add(intern(normalize.substring(i, i2)));
            }
        }
        return (Token[]) arrayList.toArray(new BasicToken[arrayList.size()]);
    }

    private String normalize(String str) {
        return whitespace.matcher(extra.matcher(str.trim()).replaceAll("")).replaceAll(" ").toLowerCase().intern();
    }

    public Token intern(String str) {
        String intern = str.toLowerCase().intern();
        Token token = this.tokMap.get(intern);
        if (token == null) {
            int i = this.nextId + 1;
            this.nextId = i;
            token = new BasicToken(i, intern);
            this.tokMap.put(intern, token);
        }
        return token;
    }

    public Iterator<Token> tokenIterator() {
        return this.tokMap.values().iterator();
    }

    public int maxTokenIndex() {
        return this.nextId;
    }
}
