/*
 * Decompiled with CFR 0.152.
 */
package com.yahoo.language.wordpiece;

import com.yahoo.collections.Tuple2;
import com.yahoo.language.Language;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.Tokenizer;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.NavigableMap;
import java.util.TreeMap;

/**
 * A WordPiece vocabulary read from a file, together with the greedy
 * longest-prefix lookup used to split tokens into vocabulary pieces.
 *
 * <p>The vocabulary file is read as UTF-8 with one token per line; the id of a
 * token is the zero-based index of its line.
 */
class Model {

    /** The string prepended to the unconsumed remainder of a token before the next lookup. */
    private final String subwordPrefix;

    /** The file this model was read from; kept for {@link #toString()}. */
    private final Path source;

    private final Language language;

    /**
     * Token to id, sorted in <em>reverse</em> natural order so that
     * {@code tailMap(s, true)} yields the keys that are less than or equal to {@code s},
     * largest first. See {@link #findLongestSubstring(String)}.
     */
    private final NavigableMap<String, Integer> vocabulary;

    /** Id to token, the inverse of {@link #vocabulary} as read from the file. */
    private final Map<Integer, String> tokenId2Token;

    /**
     * Reads the vocabulary at {@code path}.
     *
     * @param subwordPrefix the prefix marking continuation pieces in the vocabulary
     * @param language the language passed to the tokenizer when embedding
     * @param path the vocabulary file, one token per line
     * @throws IllegalArgumentException if the file cannot be read
     */
    Model(String subwordPrefix, Language language, Path path) {
        this.subwordPrefix = subwordPrefix;
        this.source = path;
        this.language = language;
        this.vocabulary = new TreeMap<>(Collections.reverseOrder());
        this.tokenId2Token = new HashMap<>();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(path.toFile()), StandardCharsets.UTF_8))) {
            String token;
            int id = 0;
            while ((token = reader.readLine()) != null) {
                vocabulary.put(token, id);
                tokenId2Token.put(id, token);
                id++;
            }
        }
        catch (IOException e) {
            throw new IllegalArgumentException("Could not read a WordPiece model from " + path, e);
        }
    }

    /** Returns the language this model was created for. */
    Language language() {
        return language;
    }

    /**
     * Lower-cases and tokenizes {@code text}, then splits each token greedily into
     * the longest vocabulary entries that prefix it.
     *
     * <p>Processing of a token stops as soon as no vocabulary entry prefixes the
     * remainder; the pieces found up to that point are kept.
     *
     * @param text the text to embed
     * @param tokenizer the tokenizer used to split the lower-cased text into tokens
     * @return the vocabulary ids of the pieces found, in order of occurrence
     */
    List<Integer> embed(String text, Tokenizer tokenizer) {
        List<Integer> ids = new ArrayList<>();
        // Locale.ROOT keeps the casing independent of the JVM default locale
        // (e.g. a Turkish default would otherwise map "I" to dotless "ı").
        text = text.toLowerCase(Locale.ROOT);
        for (Token t : tokenizer.tokenize(text, language, StemMode.NONE, true)) {
            String originalToken = t.getTokenString();
            String candidate = originalToken;
            int count = 0;
            while (!candidate.isEmpty() && !candidate.equals(subwordPrefix)) {
                Tuple2<String, Integer> entry = findLongestSubstring(candidate);
                if (entry == null) {
                    break; // nothing in the vocabulary prefixes the remainder
                }
                ids.add(entry.second);
                // Drop the matched piece and mark what is left as a continuation.
                candidate = subwordPrefix + candidate.substring(entry.first.length());
                // The prefix is re-added on every round, so the candidate is not
                // guaranteed to shrink; bound the number of rounds per token.
                if (count++ > originalToken.length()) {
                    break;
                }
            }
        }
        return ids;
    }

    /**
     * Same as {@link #embed(String, Tokenizer)}, but returns the vocabulary
     * tokens instead of their ids.
     *
     * @param text the text to segment
     * @param tokenizer the tokenizer used to split the text into tokens
     * @return the vocabulary tokens of the pieces found, in order of occurrence
     */
    List<String> segment(String text, Tokenizer tokenizer) {
        return embed(text, tokenizer).stream().map(tokenId2Token::get).toList();
    }

    /**
     * Finds the longest vocabulary entry that is a prefix of {@code candidate}.
     *
     * <p>Because the vocabulary is in reverse order, the first key of
     * {@code tailMap(s, true)} is the largest entry not greater than {@code s}.
     * Starting from the whole candidate, the search key is shortened one
     * character at a time until that first key is a prefix of the candidate or
     * no entries remain.
     *
     * @param candidate the string to find a prefix of
     * @return the matching entry and its id, or {@code null} if no entry is a prefix
     */
    private Tuple2<String, Integer> findLongestSubstring(String candidate) {
        NavigableMap<String, Integer> tailMap = vocabulary.tailMap(candidate, true);
        if (tailMap.isEmpty()) {
            return null;
        }
        Map.Entry<String, Integer> best = tailMap.firstEntry();
        int subStringLength = Math.min(candidate.length(), best.getKey().length());
        while (!candidate.startsWith(best.getKey())) {
            subStringLength--;
            // Narrow the view to entries not greater than the shortened prefix.
            tailMap = tailMap.tailMap(candidate.substring(0, subStringLength), true);
            if (tailMap.isEmpty()) {
                return null;
            }
            best = tailMap.firstEntry();
        }
        return new Tuple2<>(best.getKey(), best.getValue());
    }

    @Override
    public String toString() {
        return "WordPiece model for " + language + ": '" + source + "'";
    }
}

