/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.tokenizers;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.languagetool.tokenizers.Tokenizer;
import org.languagetool.tools.StringTools;

/**
 * Tokenizer that splits text at a fixed set of delimiter characters, keeps
 * every delimiter as a token of its own, and afterwards merges the pieces of
 * any URL starting with one of the known protocols back into a single token.
 */
public class WordTokenizer implements Tokenizer {
    /** Protocol names that may introduce a URL; exposed read-only via {@link #getProtocols()}. */
    private static final List<String> PROTOCOLS = Collections.unmodifiableList(Arrays.asList("http", "https", "ftp"));

    /*
     * Characters accepted in the final token of a URL (only consulted when the
     * token is the last one in the list, see urlEndsAt).
     * NOTE(review): inside the character class the sequence '$', '-', '_' forms
     * a range from '$' to '_', which also admits characters such as ':', '=',
     * '&' and '@'. Confirm whether that is intentional before tightening it.
     */
    private static final Pattern URL_CHARS = Pattern.compile("[a-zA-Z0-9/%$-_.+!*'(),\\?]+");

    /** Every character at which text is split; each one is also emitted as its own token. */
    private static final String TOKENIZING_CHARACTERS = " \u00a0\u115f\u1160\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u200c\u200d\u200e\u200f\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f\u205f\u2060\u2061\u2062\u2063\u206a\u206b\u206c\u206d\u206e\u206f\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb,.;()[]{}=*#\u2217\u00d7\u00b7+\u00f7<>!?:/|\\\"'\u00ab\u00bb\u201e\u201d\u201c`\u00b4\u2018\u2019\u201b\u2032\u2026\u00bf\u00a1\u2192\t\n\r";

    /**
     * Returns the protocol names recognised as the start of a URL.
     *
     * @return an unmodifiable list containing "http", "https" and "ftp"
     */
    public static List<String> getProtocols() {
        return PROTOCOLS;
    }

    /**
     * Splits {@code text} at the tokenizing characters, returning delimiters
     * as tokens too, and then passes the result through {@link #joinUrls(List)}.
     *
     * @param text the text to split; must not be {@code null}
     * @return the tokens in order of appearance, with URLs merged
     */
    @Override
    public List<String> tokenize(String text) {
        List<String> tokens = new ArrayList<String>();
        // returnDelims = true: delimiters are kept so the text can be rebuilt from the tokens.
        StringTokenizer st = new StringTokenizer(text, TOKENIZING_CHARACTERS, true);
        while (st.hasMoreTokens()) {
            tokens.add(st.nextToken());
        }
        return joinUrls(tokens);
    }

    /**
     * Returns the characters at which {@link #tokenize(String)} splits text.
     *
     * @return the delimiter characters as a single string
     */
    public String getTokenizingCharacters() {
        return TOKENIZING_CHARACTERS;
    }

    /**
     * Merges the tokens that make up a URL into one token. A URL starts at a
     * protocol token followed by ":", "/", "/" and runs until a token that
     * ends it is found; that ending token is kept as a separate token.
     *
     * @param l the tokens to process; the list itself is not modified
     * @return a new list with each URL represented by a single token
     */
    protected List<String> joinUrls(List<String> l) {
        List<String> joined = new ArrayList<String>();
        StringBuilder url = new StringBuilder();
        boolean inUrl = false;
        for (int i = 0; i < l.size(); i++) {
            String token = l.get(i);
            if (urlStartsAt(i, l)) {
                // A protocol seen while already inside a URL is simply appended to it.
                inUrl = true;
                url.append(token);
            } else if (inUrl && urlEndsAt(i, l)) {
                inUrl = false;
                joined.add(url.toString());
                url.setLength(0);
                joined.add(token);
            } else if (inUrl) {
                url.append(token);
            } else {
                joined.add(token);
            }
        }
        // Flush a URL that runs up to the end of the input.
        if (url.length() > 0) {
            joined.add(url.toString());
        }
        return joined;
    }

    /**
     * Tells whether the token at index {@code i} is a known protocol that is
     * immediately followed by the three tokens ":", "/" and "/".
     */
    private boolean urlStartsAt(int i, List<String> l) {
        if (!isProtocol(l.get(i)) || l.size() <= i + 3) {
            return false;
        }
        String first = l.get(i + 1);
        String second = l.get(i + 2);
        String third = l.get(i + 3);
        return first.equals(":") && second.equals("/") && third.equals("/");
    }

    /** Tells whether {@code token} is exactly one of the known protocol names. */
    private boolean isProtocol(String token) {
        return PROTOCOLS.contains(token);
    }

    /**
     * Tells whether the token at index {@code i} terminates the URL currently
     * being collected. That is the case when the token is whitespace (as
     * decided by {@code StringTools.isWhitespace}) or ")", when it is one of
     * the punctuation marks . , ; : ! ? directly followed by whitespace, or
     * when it is the last token and does not consist of URL characters only.
     */
    private boolean urlEndsAt(int i, List<String> l) {
        String token = l.get(i);
        if (StringTools.isWhitespace(token)) {
            return true;
        }
        if (token.equals(")")) {
            return true;
        }
        if (l.size() > i + 1) {
            // Punctuation only ends the URL when whitespace follows; otherwise it is part of it.
            return StringTools.isWhitespace(l.get(i + 1)) && isTrailingPunctuation(token);
        }
        // Last token of the input: it ends the URL only if it contains a non-URL character.
        return !URL_CHARS.matcher(token).matches();
    }

    /** Tells whether {@code token} is one of the punctuation marks . , ; : ! ? */
    private static boolean isTrailingPunctuation(String token) {
        return token.equals(".") || token.equals(",") || token.equals(";")
                || token.equals(":") || token.equals("!") || token.equals("?");
    }
}

