/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.tokenizers.fr;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.languagetool.tagging.fr.FrenchTagger;
import org.languagetool.tokenizers.WordTokenizer;

/**
 * Word tokenizer for French text.
 *
 * <p>Tokenization happens in three stages:
 * <ol>
 *   <li>Apostrophes, hyphens, decimal separators and digit-grouping spaces that sit
 *       between letters/digits are replaced by placeholders made only of letters and
 *       underscores, which the tokenizer pattern treats as word characters.</li>
 *   <li>The masked text is cut into runs of word characters and single non-word
 *       characters; the placeholders are then turned back into the original
 *       characters inside each token.</li>
 *   <li>Each token is matched against {@link #patterns} to separate elided prefixes
 *       and hyphenated enclitic pronouns; what remains is split on hyphens unless the
 *       tagger or the exception list says it must stay in one piece.</li>
 * </ol>
 */
public class FrenchWordTokenizer extends WordTokenizer {
    /** Flags used by every case-insensitive pattern in this class (numeric value 66). */
    private static final int CASE_INSENSITIVE_UNICODE = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

    /** First and last code point of the range that gets glued to the preceding token (U+FE00..U+FE0F, the Unicode variation selectors). */
    private static final int VARIATION_SELECTOR_FIRST = 0xFE00;
    private static final int VARIATION_SELECTOR_LAST = 0xFE0F;

    /** Body of the regex character class describing what may appear inside a word token. */
    private static final String wordCharacters = "\u00a7\u00a9@\u20ac\u00a3\\$_\\p{L}\\d\\-\u0300-\u036f\u00a8\u2070-\u209f\u00b0%\u2030\u2031&\ufffd\u00ad\u00ac";
    /** Matches either a maximal run of word characters or one single non-word character. */
    private static final Pattern tokenizerPattern = Pattern.compile("[" + wordCharacters + "]+|[^" + wordCharacters + "]");

    private static final Pattern SOFT_HYPHEN = Pattern.compile("\u00ad");
    private static final Pattern CURLY_QUOTE = Pattern.compile("\u2019");

    // Placeholders written by maskProtectedSequences and turned back by unmask.
    private static final Pattern PATTERN_1 = Pattern.compile("xxFR_APOS_TYPEWxx");
    private static final Pattern PATTERN_2 = Pattern.compile("xxFR_APOS_TYPOGxx");
    private static final Pattern PATTERN_3 = Pattern.compile("xxFR_HYPHENxx");
    private static final Pattern PATTERN_4 = Pattern.compile("xxFR_DECIMALPOINTxx");
    private static final Pattern PATTERN_5 = Pattern.compile("xxFR_DECIMALCOMMAxx");
    private static final Pattern PATTERN_6 = Pattern.compile("xxFR_SPACExx");

    // Sequences that must survive the word/non-word cut; each has two capture groups
    // (three for the "nearby"/"2" variants) that are copied around the placeholder.
    private static final Pattern TYPEWRITER_APOSTROPHE = Pattern.compile("([\\p{L}])'([\\p{L}1\"\u2018\u201c\u00ab])", CASE_INSENSITIVE_UNICODE);
    private static final Pattern TYPOGRAPHIC_APOSTROPHE = Pattern.compile("([\\p{L}])\u2019([\\p{L}1\"\u2018\u201c\u00ab])", CASE_INSENSITIVE_UNICODE);
    private static final Pattern NEARBY_HYPHENS = Pattern.compile("([\\p{L}])-([\\p{L}])-([\\p{L}])", CASE_INSENSITIVE_UNICODE);
    private static final Pattern HYPHENS = Pattern.compile("([\\p{L}])-([\\p{L}\\d])", CASE_INSENSITIVE_UNICODE);
    private static final Pattern DECIMAL_POINT = Pattern.compile("([\\d])\\.([\\d])", CASE_INSENSITIVE_UNICODE);
    private static final Pattern DECIMAL_COMMA = Pattern.compile("([\\d]),([\\d])", CASE_INSENSITIVE_UNICODE);
    private static final Pattern SPACE_DIGITS0 = Pattern.compile("([\\d]{4}) ", CASE_INSENSITIVE_UNICODE);
    private static final Pattern SPACE_DIGITS = Pattern.compile("([\\d]) ([\\d][\\d][\\d])\\b", CASE_INSENSITIVE_UNICODE);
    private static final Pattern SPACE_DIGITS2 = Pattern.compile("([\\d]) ([\\d][\\d][\\d]) ([\\d][\\d][\\d])\\b", CASE_INSENSITIVE_UNICODE);
    private static final Pattern SPACE0 = Pattern.compile("xxFR_SPACE0xx");

    /** Lower-case hyphenated forms that are never split on their hyphens. */
    private static final List<String> doNotSplit = Arrays.asList("mers-cov", "mcgraw-hill", "sars-cov-2", "sars-cov", "ph-metre", "ph-metres", "anti-ivg", "anti-uv", "anti-vih", "al-qa\u00efda", "c'est-\u00e0-dire", "add-on", "add-ons", "rendez-vous", "garde-\u00e0-vous", "chez-eux", "chez-moi", "chez-nous", "chez-soi", "chez-toi", "chez-vous", "m'as-tu-vu");

    // NOTE(review): this field is not read anywhere in this class; it is kept so that
    // construction still calls super.getTokenizingCharacters() exactly as before.
    private final String frTokenizingChars = super.getTokenizingCharacters() + "-";

    /** Alternation of elided prefixes (c', j', n', ... quoiqu') accepting both apostrophe forms. */
    private static final String ELIDED_PREFIX = "[c\u00e7]['\u2019]|j['\u2019]|n['\u2019]|m['\u2019]|t['\u2019]|s['\u2019]|l['\u2019]|d['\u2019]|qu['\u2019]|jusqu['\u2019]|lorsqu['\u2019]|puisqu['\u2019]|quoiqu['\u2019]";
    /** Alternation of hyphen-attached pronouns (-ce, -elle, -t-il, ... -y). */
    private static final String ENCLITIC = "-ce|-elle|-t-elle|-elles|-t-elles|-en|-il|-t-il|-ils|-t-ils|-je|-la|-le|-les|-leur|-lui|-moi|-nous|-on|-t-on|-toi|-tu|-vous|-vs|-y";

    static final int maxPatterns = 7;
    /**
     * Split patterns, tried in index order; the first one that matches a token wins and
     * each of its capture groups becomes a separate piece of that token.
     */
    static final Pattern[] patterns = new Pattern[maxPatterns];

    static {
        // 0: fixed expressions captured whole (single group).
        patterns[0] = Pattern.compile("^(c['\u2019]te?|m['\u2019]as-tu-vu|c['\u2019]est-\u00e0-dire|add-on|add-ons|rendez-vous|garde-\u00e0-vous|chez-eux|chez-moi|chez-nous|chez-soi|chez-toi|chez-vous)$", CASE_INSENSITIVE_UNICODE);
        // 1: elided prefix + hyphen-free body + one enclitic.
        patterns[1] = Pattern.compile("^(" + ELIDED_PREFIX + ")([^\\-]*)(" + ENCLITIC + ")$", CASE_INSENSITIVE_UNICODE);
        // 2: elided prefix + remainder that does not start with an apostrophe or hyphen.
        patterns[2] = Pattern.compile("^(" + ELIDED_PREFIX + ")([^'\u2019\\-].*)$", CASE_INSENSITIVE_UNICODE);
        // 3: body without hyphens or digits + two enclitics. The first alternation lists
        //    the same pronouns as ENCLITIC but in a different order, so it stays literal.
        patterns[3] = Pattern.compile("^([^\\-\\d]+)(-ce|-t-elle|-t-elles|-elle|-elles|-en|-il|-t-il|-ils|-t-ils|-je|-la|-le|-les|-leur|-lui|-moi|-nous|-on|-t-on|-toi|-tu|-vous|-vs|-y)(" + ENCLITIC + ")$", CASE_INSENSITIVE_UNICODE);
        // 4: hyphen-free body + "-t"/"-m" + apostrophe followed by "en"/"y".
        patterns[4] = Pattern.compile("^([^\\-]*)(-t|-m)(['\u2019]en|['\u2019]y)$", CASE_INSENSITIVE_UNICODE);
        // 5: anything + a "-t-" pronoun.
        patterns[5] = Pattern.compile("^(.*)(-t-elle|-t-elles|-t-il|-t-ils|-t-on)$", CASE_INSENSITIVE_UNICODE);
        // 6: anything + one enclitic.
        patterns[6] = Pattern.compile("^(.*)(" + ENCLITIC + ")$", CASE_INSENSITIVE_UNICODE);
    }

    /**
     * Splits {@code text} into tokens.
     *
     * @param text the text to tokenize; must not be {@code null}
     * @return the token list after it has been passed through the inherited
     *         {@code joinEMailsAndUrls} (defined outside this class)
     */
    public List<String> tokenize(String text) {
        List<String> tokens = new ArrayList<String>();
        Matcher tokenizerMatcher = tokenizerPattern.matcher(maskProtectedSequences(text));
        while (tokenizerMatcher.find()) {
            String s = tokenizerMatcher.group();
            if (!tokens.isEmpty() && isVariationSelector(s)) {
                // A lone variation selector is appended to the token before it.
                int last = tokens.size() - 1;
                tokens.set(last, tokens.get(last) + s);
                continue;
            }
            s = unmask(s);

            // Leading hyphens become tokens of their own right away ...
            while (s.length() > 1 && s.startsWith("-")) {
                tokens.add("-");
                s = s.substring(1);
            }
            // ... trailing ones are counted now and emitted after the word itself.
            int hyphensAtEnd = 0;
            while (s.length() > 1 && s.endsWith("-")) {
                s = s.substring(0, s.length() - 1);
                ++hyphensAtEnd;
            }

            Matcher split = findSplitPattern(s);
            if (split != null) {
                for (int i = 1; i <= split.groupCount(); ++i) {
                    tokens.addAll(this.wordsToAdd(split.group(i)));
                }
            } else {
                tokens.addAll(this.wordsToAdd(s));
            }

            for (; hyphensAtEnd > 0; --hyphensAtEnd) {
                tokens.add("-");
            }
        }
        return this.joinEMailsAndUrls(tokens);
    }

    /**
     * Replaces separators that must stay inside a token by placeholders. The order of the
     * substitutions is significant and must not be changed.
     */
    private static String maskProtectedSequences(String text) {
        // U+2010 (hyphen) and U+2011 (non-breaking hyphen) are treated as '-'.
        String masked = text.replace('\u2010', '-').replace('\u2011', '-');
        masked = TYPEWRITER_APOSTROPHE.matcher(masked).replaceAll("$1xxFR_APOS_TYPEWxx$2");
        masked = TYPOGRAPHIC_APOSTROPHE.matcher(masked).replaceAll("$1xxFR_APOS_TYPOGxx$2");
        // "a-b-c": HYPHENS alone would consume "b" with the first hyphen and miss the
        // second one, so the letter-hyphen-letter-hyphen-letter case is handled first.
        masked = NEARBY_HYPHENS.matcher(masked).replaceAll("$1xxFR_HYPHENxx$2xxFR_HYPHENxx$3");
        masked = HYPHENS.matcher(masked).replaceAll("$1xxFR_HYPHENxx$2");
        masked = DECIMAL_POINT.matcher(masked).replaceAll("$1xxFR_DECIMALPOINTxx$2");
        masked = DECIMAL_COMMA.matcher(masked).replaceAll("$1xxFR_DECIMALCOMMAxx$2");
        masked = SPACE_DIGITS2.matcher(masked).replaceAll("$1xxFR_SPACExx$2xxFR_SPACExx$3");
        // A space right after four digits is hidden temporarily so that SPACE_DIGITS
        // cannot match across it; it is turned back into a plain space just below.
        masked = SPACE_DIGITS0.matcher(masked).replaceAll("$1xxFR_SPACE0xx");
        masked = SPACE_DIGITS.matcher(masked).replaceAll("$1xxFR_SPACExx$2");
        return SPACE0.matcher(masked).replaceAll(" ");
    }

    /** Turns every placeholder inside {@code token} back into the character it stands for. */
    private static String unmask(String token) {
        String restored = PATTERN_1.matcher(token).replaceAll("'");
        restored = PATTERN_2.matcher(restored).replaceAll("\u2019");
        restored = PATTERN_3.matcher(restored).replaceAll("-");
        restored = PATTERN_4.matcher(restored).replaceAll(".");
        restored = PATTERN_5.matcher(restored).replaceAll(",");
        return PATTERN_6.matcher(restored).replaceAll(" ");
    }

    /** Tells whether {@code s} is exactly one char in the range U+FE00..U+FE0F. */
    private static boolean isVariationSelector(String s) {
        if (s.length() != 1) {
            return false;
        }
        int codePoint = s.codePointAt(0);
        return codePoint >= VARIATION_SELECTOR_FIRST && codePoint <= VARIATION_SELECTOR_LAST;
    }

    /**
     * Returns the matcher of the first entry of {@link #patterns} found in {@code token},
     * positioned on its match, or {@code null} when no entry matches.
     */
    private static Matcher findSplitPattern(String token) {
        for (int j = 0; j < maxPatterns; ++j) {
            Matcher candidate = patterns[j].matcher(token);
            if (candidate.find()) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Decides whether a (possibly hyphenated) piece is kept whole or split on its hyphens.
     *
     * @param s the piece to examine
     * @return an empty list for an empty string; {@code s} alone when it has no hyphen,
     *         when the tagger reports its normalized form as tagged, or when it is listed
     *         in {@link #doNotSplit}; otherwise its parts with every hyphen as its own element
     */
    private List<String> wordsToAdd(String s) {
        List<String> result = new ArrayList<String>();
        if (s.isEmpty()) {
            return result;
        }
        if (!s.contains("-")) {
            result.add(s);
            return result;
        }
        // The tagger is queried without soft hyphens and with a typewriter apostrophe.
        String normalized = SOFT_HYPHEN.matcher(s).replaceAll("");
        normalized = CURLY_QUOTE.matcher(normalized).replaceAll("'");
        boolean keepWhole = FrenchTagger.INSTANCE.tag(Arrays.asList(normalized)).get(0).isTagged()
                // Locale.ROOT: the lookup must not depend on the JVM default locale
                // (e.g. a Turkish locale lower-cases 'I' to a dotless i).
                || doNotSplit.contains(s.toLowerCase(Locale.ROOT));
        if (keepWhole) {
            result.add(s);
        } else {
            StringTokenizer parts = new StringTokenizer(s, "-", true);
            while (parts.hasMoreElements()) {
                result.add(parts.nextToken());
            }
        }
        return result;
    }
}

