/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.chunking;

import com.google.common.base.Function;
import edu.washington.cs.knowitall.regex.Match;
import edu.washington.cs.knowitall.regex.RegularExpression;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.chunking.ChunkTag;
import org.languagetool.chunking.ChunkTaggedToken;
import org.languagetool.chunking.Chunker;
import org.languagetool.chunking.TokenExpressionFactory;
import org.languagetool.rules.patterns.StringMatcher;

public class GermanChunker
implements Chunker {
    /** Chunk tags that a rule built with {@code overwrite == true} strips from a token before adding its own tag. */
    private static final Set<String> FILTER_TAGS = new HashSet<String>(Arrays.asList("PP", "NPP", "NPS"));
    /** Factory passed to {@code RegularExpression.compile} whenever a rule is built. */
    private static final TokenExpressionFactory FACTORY = new TokenExpressionFactory(false);
    /**
     * Finds {@code <...>} elements whose content consists only of letters and the characters
     * {@code | ( ) [ ] ? ,}, optionally followed by {@code +}, and delimited by a space or the
     * start/end of the expression. Group 2 is the text between the angle brackets.
     * Matching is case-insensitive, including for non-ASCII letters.
     */
    private static final Pattern simpleFormRegexp = Pattern.compile("(^| )<([a-z\u00e4\u00f6\u00fc\u00df|()\\[\\]?,]+)>\\+?( |$)", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
    /** Shorthands that are textually replaced inside rule expressions before compilation; filled in the static initializer. */
    private static final Map<String, String> SYNTAX_EXPANSION = new HashMap<String, String>();
    /** Class-wide debug switch; when {@code true}, chunking steps are printed to {@code System.out}. */
    private static boolean debug;
    /** Hand-written form hints for the rules that are created through {@code buildExpanded} directly. */
    private static final String[][] undOderBzw;
    /** Rules applied first, by {@code getBasicChunks}. */
    private static final List<RegularExpressionWithPhraseType> REGEXES1;
    /** Rules applied afterwards, by {@code addChunkTags}, on top of the basic chunks. */
    private static final List<RegularExpressionWithPhraseType> REGEXES2;

    /**
     * Turns the class-wide debug output on or off.
     *
     * @param debugMode {@code true} to have chunking steps printed to {@code System.out}
     */
    public static void setDebug(boolean debugMode) {
        GermanChunker.debug = debugMode;
    }

    /**
     * Reports whether debug output is currently enabled.
     *
     * @return the current value of the class-wide debug switch
     */
    public static boolean isDebug() {
        return GermanChunker.debug;
    }

    /**
     * Builds a non-overwriting rule from a rule expression.
     *
     * @param expr rule expression, possibly containing shorthands from {@code SYNTAX_EXPANSION}
     * @param phraseType phrase type the rule assigns to matching tokens
     * @return the compiled rule with {@code overwrite} set to {@code false}
     */
    private static RegularExpressionWithPhraseType build(String expr, PhraseType phraseType) {
        final boolean overwrite = false;
        return build(expr, phraseType, overwrite);
    }

    /**
     * Builds a rule from a rule expression: replaces every shorthand registered in
     * {@code SYNTAX_EXPANSION}, derives form hints from the expanded text and compiles it.
     *
     * @param expr rule expression, possibly containing shorthands
     * @param phraseType phrase type the rule assigns to matching tokens
     * @param overwrite whether the rule strips the tags listed in {@code FILTER_TAGS} first
     * @return the compiled rule
     */
    private static RegularExpressionWithPhraseType build(String expr, PhraseType phraseType, boolean overwrite) {
        String expanded = expr;
        for (String shorthand : SYNTAX_EXPANSION.keySet()) {
            String replacement = SYNTAX_EXPANSION.get(shorthand);
            expanded = expanded.replace(shorthand, replacement);
        }
        String[][] hints = calcFormHints(expanded);
        return buildExpanded(expanded, phraseType, overwrite, hints);
    }

    /**
     * Compiles an already expanded rule expression into a rule.
     * No shorthand replacement and no form-hint calculation happens here; the caller supplies the hints.
     *
     * @param expandedExpr rule expression, handed to {@code RegularExpression.compile} as is
     * @param phraseType phrase type the rule assigns to matching tokens
     * @param overwrite whether the rule strips the tags listed in {@code FILTER_TAGS} first
     * @param formHints groups of word forms; the rule is only tried on a sentence that contains
     *                  at least one form from every group (see {@code hasAllFormHints})
     * @return the compiled rule
     */
    private static RegularExpressionWithPhraseType buildExpanded(String expandedExpr, PhraseType phraseType, boolean overwrite, String[][] formHints) {
        // Typed compile call instead of raw RegularExpression / raw Function casts.
        RegularExpression<ChunkTaggedToken> expression = RegularExpression.compile(expandedExpr, FACTORY);
        return new RegularExpressionWithPhraseType(expression, phraseType, overwrite, formHints);
    }

    /**
     * Derives form hints from an expanded rule expression.
     * Every element found by {@code simpleFormRegexp} contributes one group of possible word forms,
     * provided {@code StringMatcher} can enumerate them.
     *
     * @param expandedExpr rule expression after shorthand replacement
     * @return one array of possible forms per recognised element; empty if none was recognised
     */
    private static String[][] calcFormHints(String expandedExpr) {
        List<String[]> formHints = new ArrayList<String[]>();
        Matcher matcher = simpleFormRegexp.matcher(expandedExpr);
        while (matcher.find()) {
            // NOTE(review): the two flags are presumably "is regex" = true and "case sensitive" = false; confirm against StringMatcher.create.
            Set<String> possibleValues = StringMatcher.create(matcher.group(2), true, false).getPossibleValues();
            // null means the matcher cannot list its values, so no hint can be derived for this element.
            if (possibleValues != null) {
                formHints.add(possibleValues.toArray(new String[0]));
            }
        }
        return formHints.toArray(new String[0][]);
    }

    /**
     * Chunks a sentence: computes the basic chunks, applies every rule in {@code REGEXES2}
     * in list order and finally writes the resulting chunk tags back to the token readings.
     *
     * @param tokenReadings the tokens of the sentence; whitespace tokens are skipped
     */
    public void addChunkTags(List<AnalyzedTokenReadings> tokenReadings) {
        Set<String> forms = allForms(tokenReadings);
        List<ChunkTaggedToken> tagged = getBasicChunks(tokenReadings, forms);
        int ruleCount = REGEXES2.size();
        for (int r = 0; r < ruleCount; r++) {
            apply(REGEXES2.get(r), tagged, forms);
        }
        assignChunksToReadings(tagged);
    }

    /**
     * Computes the basic chunks of a sentence, collecting the sentence's word forms itself.
     *
     * @param tokenReadings the tokens of the sentence
     * @return the non-whitespace tokens with the tags assigned by the {@code REGEXES1} rules
     */
    List<ChunkTaggedToken> getBasicChunks(List<AnalyzedTokenReadings> tokenReadings) {
        Set<String> forms = allForms(tokenReadings);
        return getBasicChunks(tokenReadings, forms);
    }

    /**
     * Wraps every non-whitespace token in a {@code ChunkTaggedToken} tagged "O" and then
     * applies the {@code REGEXES1} rules in list order.
     *
     * @param tokenReadings the tokens of the sentence
     * @param allForms the word forms of the sentence, used to skip rules that cannot match
     * @return the tagged tokens, one per non-whitespace input token
     */
    private List<ChunkTaggedToken> getBasicChunks(List<AnalyzedTokenReadings> tokenReadings, Set<String> allForms) {
        List<ChunkTaggedToken> tagged = new ArrayList<ChunkTaggedToken>();
        for (AnalyzedTokenReadings reading : tokenReadings) {
            if (!reading.isWhitespace()) {
                // Every token starts out with the single tag "O".
                ChunkTag outside = new ChunkTag("O");
                tagged.add(new ChunkTaggedToken(reading.getToken(), Collections.singletonList(outside), reading));
            }
        }
        if (debug) {
            System.out.println("=============== CHUNKER INPUT ===============");
            System.out.println(getDebugString(tagged));
        }
        int ruleCount = REGEXES1.size();
        for (int r = 0; r < ruleCount; r++) {
            apply(REGEXES1.get(r), tagged, allForms);
        }
        return tagged;
    }

    /**
     * Applies one rule to the token list, unless the rule's form hints show that it cannot match.
     * In debug mode the change is printed when the rule altered the tokens.
     *
     * @param regex the rule to apply
     * @param tokens the tagged tokens; modified in place
     * @param allForms the word forms of the sentence
     * @throws RuntimeException wrapping any exception raised while applying the rule
     */
    private void apply(RegularExpressionWithPhraseType regex, List<ChunkTaggedToken> tokens, Set<String> allForms) {
        boolean mayMatch = hasAllFormHints(regex, allForms);
        if (mayMatch) {
            // Snapshot taken outside the try block so a failure here is not wrapped.
            String before = getDebugString(tokens);
            try {
                AffectedSpans spans = doApplyRegex(regex, tokens);
                String after = getDebugString(tokens);
                boolean changed = !after.equals(before);
                if (changed) {
                    printDebugInfo(regex, spans, after);
                }
            }
            catch (Exception e) {
                throw new RuntimeException("Could not apply chunk regexp '" + regex + "' to tokens: " + tokens, e);
            }
        }
    }

    /**
     * Checks whether the sentence contains at least one form from every hint group of the rule.
     *
     * @param regex the rule whose {@code formHints} are checked
     * @param allForms the word forms of the sentence
     * @return {@code true} if every hint group is satisfied (also when there are no groups)
     */
    private static boolean hasAllFormHints(RegularExpressionWithPhraseType regex, Set<String> allForms) {
        String[][] groups = regex.formHints;
        for (int g = 0; g < groups.length; g++) {
            if (!hasForm(allForms, groups[g])) {
                return false;
            }
        }
        return true;
    }

    /**
     * Checks whether at least one of the given hints occurs in the set of forms.
     *
     * @param allForms the word forms to look in; lookup semantics are those of the set itself
     * @param hints candidate forms
     * @return {@code true} as soon as one hint is contained, {@code false} if none is (or {@code hints} is empty)
     */
    private static boolean hasForm(Set<String> allForms, String[] hints) {
        int idx = 0;
        while (idx < hints.length) {
            if (allForms.contains(hints[idx])) {
                return true;
            }
            idx++;
        }
        return false;
    }

    /**
     * Collects the surface forms of all tokens into a set that ignores case
     * (ordered by {@code String.CASE_INSENSITIVE_ORDER}).
     *
     * @param tokens the tokens of the sentence
     * @return the case-insensitive set of token texts
     */
    private static Set<String> allForms(List<AnalyzedTokenReadings> tokens) {
        Set<String> forms = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
        for (AnalyzedTokenReadings reading : tokens) {
            String surface = reading.getToken();
            forms.add(surface);
        }
        return forms;
    }

    /**
     * Copies the chunk tags of every tagged token onto its underlying readings.
     * Tokens without readings are skipped.
     *
     * @param chunkTaggedTokens the tagged tokens whose tags are written back
     */
    private void assignChunksToReadings(List<ChunkTaggedToken> chunkTaggedTokens) {
        for (ChunkTaggedToken tagged : chunkTaggedTokens) {
            AnalyzedTokenReadings target = tagged.getReadings();
            if (target != null) {
                target.setChunkTags(tagged.getChunkTags());
            }
        }
    }

    /**
     * Finds all matches of the rule in the token list and re-tags every token inside a match.
     * All matches are computed on the list as it is before any token is replaced.
     *
     * @param regex the rule to apply
     * @param tokens the tagged tokens; matched positions are replaced by re-tagged copies
     * @return the matched index ranges (start inclusive, end exclusive)
     */
    private AffectedSpans doApplyRegex(RegularExpressionWithPhraseType regex, List<ChunkTaggedToken> tokens) {
        List<Match<ChunkTaggedToken>> matches = regex.expression.findAll(tokens);
        List<Span> affectedSpans = new ArrayList<Span>(matches.size());
        ChunkTag outsideTag = new ChunkTag("O");
        for (Match<ChunkTaggedToken> match : matches) {
            affectedSpans.add(new Span(match.startIndex(), match.endIndex()));
            for (int i = match.startIndex(); i < match.endIndex(); ++i) {
                ChunkTaggedToken token = tokens.get(i);
                // Copy the existing tags; an overwriting rule drops the tags listed in FILTER_TAGS.
                List<ChunkTag> newChunkTags = new ArrayList<ChunkTag>();
                for (ChunkTag existingTag : token.getChunkTags()) {
                    if (regex.overwrite && FILTER_TAGS.contains(existingTag.getChunkTag())) {
                        continue;
                    }
                    newChunkTags.add(existingTag);
                }
                ChunkTag newTag = getChunkTag(regex, match, i);
                if (!newChunkTags.contains(newTag)) {
                    newChunkTags.add(newTag);
                    // "O" is only removed when a tag was actually added.
                    newChunkTags.remove(outsideTag);
                }
                tokens.set(i, new ChunkTaggedToken(token.getToken(), newChunkTags, token.getReadings()));
            }
        }
        return new AffectedSpans(affectedSpans);
    }

    /**
     * Determines the tag a rule assigns to the token at position {@code i} of a match.
     *
     * @param regex the rule that matched
     * @param match the match containing position {@code i}
     * @param i index of the token in the token list
     * @return for {@code NP} rules "B-NP" at the first position of the match and "I-NP" elsewhere;
     *         for all other rules a tag named after the phrase type
     */
    private ChunkTag getChunkTag(RegularExpressionWithPhraseType regex, Match<ChunkTaggedToken> match, int i) {
        if (regex.phraseType != PhraseType.NP) {
            return new ChunkTag(regex.phraseType.name());
        }
        if (i == match.startIndex()) {
            return new ChunkTag("B-NP");
        }
        return new ChunkTag("I-NP");
    }

    /**
     * Prints the token dump after a rule was applied, marking the lines of affected tokens.
     * Line {@code n} of the dump is treated as token {@code n}; a marked line has its leading
     * two spaces replaced by {@code " *"}.
     *
     * @param regex the rule that was applied
     * @param affectedSpans the index ranges the rule matched
     * @param debug the token dump, one line per token
     */
    private void printDebugInfo(RegularExpressionWithPhraseType regex, AffectedSpans affectedSpans, String debug) {
        System.out.println("=== Applied " + regex + " ===");
        if (regex.overwrite) {
            System.out.println("Note: overwrite mode, replacing old " + FILTER_TAGS + " tags");
        }
        String[] lines = debug.split("\n");
        for (int lineNo = 0; lineNo < lines.length; lineNo++) {
            String line = lines[lineNo];
            String shown = affectedSpans.isAffected(lineNo) ? line.replaceFirst("^  ", " *") : line;
            System.out.println(shown);
        }
        System.out.println();
    }

    /**
     * Renders the tokens as a multi-line dump, one line per token, for debug output.
     *
     * @param tokens the tagged tokens
     * @return the dump, or the empty string when debug mode is off
     */
    private String getDebugString(List<ChunkTaggedToken> tokens) {
        if (!debug) {
            return "";
        }
        StringBuilder sb = new StringBuilder();
        for (ChunkTaggedToken token : tokens) {
            // Readings are treated as optional elsewhere in this class, so guard against null here as well.
            AnalyzedTokenReadings readings = token.getReadings();
            String tokenReadingStr;
            if (readings == null) {
                tokenReadingStr = "null";
            } else {
                // Drop the token text in front of the first '[' so it is not shown twice on the line.
                tokenReadingStr = readings.toString().replaceFirst(Pattern.quote(token.getToken()) + "\\[", "[");
            }
            // The two leading spaces are what printDebugInfo replaces with " *" on affected lines.
            sb.append("  ").append(token).append(" -- ").append(tokenReadingStr).append('\n');
        }
        return sb.toString();
    }

    static {
        // Shorthands that build(...) replaces textually before an expression is compiled.
        SYNTAX_EXPANSION.put("<NP>", "<chunk=B-NP> <chunk=I-NP>*");
        SYNTAX_EXPANSION.put("&prozent;", "Prozent|Kilo|Kilogramm|Gramm|Euro|Pfund");
        debug = false;
        // Explicit hints for the rules below that go through buildExpanded(...) directly.
        undOderBzw = new String[][]{{"und", "oder", "bzw"}};

        // First pass: rules assigning B-NP/I-NP tags, applied in this order by getBasicChunks.
        REGEXES1 = Arrays.asList(
            build("(<posre=^ART.*>|<pos=PRO>)? <pos=ADV>* <pos=PA2>* <pos=ADJ>* <pos=SUB>+", PhraseType.NP),
            buildExpanded("<pos=SUB> (<und|oder>|(<bzw> <.>)) <pos=SUB>", PhraseType.NP, false, undOderBzw),
            buildExpanded("<pos=ADJ> (<und|oder>|(<bzw> <.>)) <pos=PA2> <pos=SUB>", PhraseType.NP, false, undOderBzw),
            buildExpanded("<pos=ADJ> (<und|oder>|(<bzw> <.>)) <pos=ADJ> <pos=SUB>", PhraseType.NP, false, undOderBzw),
            build("<posre=^ART.*> <pos=ADV>* <pos=ADJ>* <regexCS=[A-Z\u00d6\u00c4\u00dc][a-z\u00f6\u00e4\u00fc]+>", PhraseType.NP),
            build("<pos=PRO>? <pos=ZAL> <pos=SUB>", PhraseType.NP),
            build("<Herr|Herrn|Frau> <pos=EIG>+", PhraseType.NP),
            build("<Herr|Herrn|Frau> <regexCS=[A-Z\u00d6\u00c4\u00dc][a-z\u00f6\u00e4\u00fc-]+>+", PhraseType.NP),
            build("<der>", PhraseType.NP)
        );

        // Second pass: rules assigning NPS/NPP/PP tags, applied in this order by addChunkTags.
        // A trailing 'true' marks an overwriting rule (it strips the FILTER_TAGS tags first).
        REGEXES2 = Arrays.asList(
            build("<pos=ADJ> <,> <chunk=B-NP> <chunk=I-NP>* <und|sowie> <NP>", PhraseType.NPP),
            build("<chunk=B-NP & !regex=jede[rs]?> <chunk=I-NP>* <und|sowie> <pos=ADV>? <NP>", PhraseType.NPP),
            build("<pos=ADJ> <und|sowie> <chunk=B-NP & !pos=PLU> <chunk=I-NP>*", PhraseType.NPS, true),
            build("<deren> <chunk=B-NP & !pos=PLU> <und|sowie> <chunk=B-NP>*", PhraseType.NPS, true),
            build("<pos=EIG> <und> <pos=EIG>", PhraseType.NPP),
            build("<pos=ART> <pos=ADJ> <und|sowie> (<pos=ADJ>|<pos=PA2>) <chunk=I-NP & !pos=PLU>+", PhraseType.NPS, true),
            build("<chunk=B-NP & !pos=PLU> <chunk=I-NP>* <und|sowie> <keine> <chunk=I-NP>+", PhraseType.NPS, true),
            build("<NP> <und|sowie> <pos=ART> <pos=PA1> <pos=SUB>", PhraseType.NPP, true),
            build("<eins|eines> <chunk=B-NP> <chunk=I-NP>+", PhraseType.NPS),
            build("<ich|du|er|sie|es|wir|ihr|sie> <und|oder|sowie> <NP>", PhraseType.NPP),
            build("<sowohl> <NP> <als> <auch> <NP>", PhraseType.NPP),
            build("<sowohl> <pos=EIG> <als> <auch> <pos=EIG>", PhraseType.NPP),
            build("<sowohl> <ich|du|er|sie|es|wir|ihr|sie> <als> <auch> <NP>", PhraseType.NPP),
            build("<pos=SUB> <und|oder|sowie> <chunk=B-NP & !ihre> <chunk=I-NP>*", PhraseType.NPP),
            build("<weder> <pos=SUB> <noch> <pos=SUB>", PhraseType.NPP),
            build("<zwei|drei|vier|f\u00fcnf|sechs|sieben|acht|neun|zehn|elf|zw\u00f6lf> <chunk=I-NP>", PhraseType.NPP),
            build("<chunk=B-NP> <pos=PRP> <NP> <chunk=B-NP & pos=SIN> <chunk=I-NP>*", PhraseType.NPS),
            build("<chunk=B-NP> <pos=PRP> <NP> <chunk=B-NP & pos=PLU> <chunk=I-NP>*", PhraseType.NPP),
            build("<chunk=B-NP> <pos=PRP> <NP> <pos=PA2> <chunk=B-NP & !pos=PLU> <chunk=I-NP>*", PhraseType.NPS),
            build("<chunk=B-NP> <pos=PRP> <NP> <pos=PA2> <chunk=B-NP & !pos=SIN> <chunk=I-NP>*", PhraseType.NPP),
            build("<Herr|Frau> <und> <Herr|Frau> <pos=EIG>*", PhraseType.NPP),
            build("<chunk=B-NP & !pos=ZAL & !pos=PLU & !chunk=NPP & !einige & !(regex=&prozent;)> <chunk=I-NP & !pos=PLU & !und>*", PhraseType.NPS),
            build("<chunk=B-NP & !pos=SIN & !chunk=NPS & !Ellen> <chunk=I-NP & !pos=SIN>*", PhraseType.NPP),
            build("<chunk=NPS> <pos=PRO> <pos=ADJ> <pos=ADJ> <NP>", PhraseType.NPS),
            build("<regex=eine[rs]?> <der> <am> <pos=ADJ> <pos=PA2> <NP>", PhraseType.NPS),
            build("<regex=eine[rs]?> <der> <beiden> <pos=ADJ>* <pos=SUB>", PhraseType.NPS),
            build("<regex=eine[rs]?> <seiner|ihrer> <pos=PA1> <pos=SUB>", PhraseType.NPS),
            build("<regex=[\\d,.]+> <&prozent;>", PhraseType.NPS),
            build("<regex=[\\d,.]+> <&prozent;>", PhraseType.NPP),
            build("<dass> <sie> <wie> <NP>", PhraseType.NPP),
            build("<pos=PLU> <die> <Regel>", PhraseType.NPP),
            build("<chunk=B-NP & pos=SIN> <chunk=I-NP & pos=SIN>* <,> <die> <pos=ADV>+ <chunk=NPS>+", PhraseType.NPS),
            build("<chunk=B-NP & pos=PLU> <chunk=I-NP & pos=PLU>* <,> <die> <pos=ADV>+ <chunk=NPS>+", PhraseType.NPP),
            build("<der|die|das> <pos=ADJ> <der> <pos=PA1> <pos=SUB>", PhraseType.NPS),
            build("<pos=SUB & pos=PLU> <der> <pos=PA1> <pos=SUB>", PhraseType.NPP),
            build("<der|die|das> <pos=ADJ> <der> <pos=PRO>? <pos=SUB>", PhraseType.NPS),
            build("<chunk=NPS & !einige> <chunk=NPP & (pos=GEN |pos=ZAL)>+", PhraseType.NPS, true),
            build("<chunk=NPP> <chunk=NPS & pos=GEN>+", PhraseType.NPP, true),
            build("<chunk=NPS>+ <und> <chunk=NP[SP] & (pos=GEN | pos=ADV)>+", PhraseType.NPS, true),
            build("<chunk=NPS>+ <der> <pos=ADV> <pos=PA2> <chunk=I-NP>", PhraseType.NPS, true),
            build("<chunk=NPS>+ <der> (<pos=ADJ>|<pos=ZAL>) <NP>", PhraseType.NPS, true),
            build("<chunk=NPS>+ <der> <NP>", PhraseType.NPS, true),
            build("<chunk=NPS>+ <der> <pos=ADJ> <pos=ADV> <pos=PA2> <NP>", PhraseType.NPS, true),
            build("<chunk=NPS>+ <pos=PRO:POS> <pos=ADJ> <NP>", PhraseType.NPS, true),
            build("<der|das> <pos=ADJ> <der> <pos=ZAL> <NP>", PhraseType.NPS, true),
            build("<eine> <menge> <NP>+", PhraseType.NPP, true),
            build("<er|sie|es> <und> <NP> <NP>", PhraseType.NPP),
            build("<laut> <regex=.*>{0,3} <Quellen>", PhraseType.PP, true),
            build("<pos=PRP> <pos=ART:> <pos=ADV>* <pos=ADJ> <NP>", PhraseType.PP, true),
            build("<pos=PRP> <chunk=NPP>+ <,> <NP>", PhraseType.PP, true),
            build("<pos=PRP> <chunk=NPP>+", PhraseType.PP, true),
            build("<pos=PRP> <der> <chunk=NPP>+", PhraseType.PP),
            build("<pos=PRP> <NP>", PhraseType.PP),
            build("<pos=PRP> <NP> <pos=ADJ> <und|oder|bzw.> <NP>", PhraseType.PP),
            build("<pos=PRP> (<NP>)+", PhraseType.PP),
            build("<pos=PRP> <chunk=B-NP> <pos=ADV> <NP>", PhraseType.PP),
            build("<pos=PRP> <pos=ADV> <pos=ZAL> <chunk=B-NP>", PhraseType.PP),
            build("<pos=PRP> <pos=PRO> <NP>", PhraseType.PP),
            build("<pos=PRP> <pos=ADJ> <und|oder|sowie> <NP>", PhraseType.PP),
            build("<pos=PRP> <pos=ADV> <regex=\\d+> <NP>", PhraseType.PP),
            build("<pos=PRP> <pos=PA1> <NP>", PhraseType.PP),
            build("<pos=PRP> <pos=ADJ> <pos=PA1> <NP>", PhraseType.PP),
            build("<pos=PRP> <NP> <NP> <und|oder> <NP>", PhraseType.PP),
            build("<pos=PRP> <pos=ADV> <pos=ADJ> <NP>", PhraseType.PP),
            build("<pos=PRP> <pos=ADJ:PRD:GRU> <pos=ZAL> <NP>", PhraseType.PP),
            build("<die> <pos=ADJ> <Sekunden|Minuten|Stunden|Tage|Wochen|Monate|Jahre|Jahrzehnte|Jahrhunderte> (<NP>)?", PhraseType.PP),
            build("<die> <pos=ADJ> <pos=ZAL> <Sekunden|Minuten|Stunden|Tage|Wochen|Monate|Jahre|Jahrzehnte|Jahrhunderte> (<NP>)?", PhraseType.PP),
            build("<regex=(vor)?letzte[sn]?> <Woche|Monat|Jahr|Jahrzehnt|Jahrhundert>", PhraseType.PP),
            build("<f\u00fcr> <in> <pos=EIG> <pos=PA1> <pos=SUB> <und> <pos=SUB>", PhraseType.PP, true),
            build("<chunk=NPP> <zwischen> <pos=EIG> <und|sowie> <NP>", PhraseType.NPP),
            build("<,> <die|welche> <NP> <chunk=NPS & pos=GEN>+", PhraseType.NPP),
            build("<NP> <,> <NP> <,> <NP>", PhraseType.NPP),
            build("<NP> <,> <NP> <,> <wie> <auch> <chunk=NPS>+", PhraseType.NPP)
        );
    }

    static enum PhraseType {
        NP,
        NPS,
        NPP,
        PP;

    }

    /**
     * A compiled chunk rule together with the phrase type it assigns, its overwrite flag
     * and the form hints used to decide whether it needs to be tried at all.
     */
    private static class RegularExpressionWithPhraseType {
        /** The compiled token-level expression. */
        final RegularExpression<ChunkTaggedToken> expression;
        /** Phrase type assigned to the tokens of a match. */
        final PhraseType phraseType;
        /** Whether the tags listed in {@code FILTER_TAGS} are removed from matched tokens first. */
        final boolean overwrite;
        /** Groups of word forms; one form of every group must occur in the sentence. */
        final String[][] formHints;

        RegularExpressionWithPhraseType(RegularExpression<ChunkTaggedToken> expression, PhraseType phraseType, boolean overwrite, String[][] formHints) {
            this.formHints = formHints;
            this.overwrite = overwrite;
            this.phraseType = phraseType;
            this.expression = expression;
        }

        /** Returns the rule as {@code "<phraseType> <= <expression> (overwrite: <flag>)"}. */
        @Override
        public String toString() {
            StringBuilder text = new StringBuilder();
            text.append(phraseType).append(" <= ").append(expression);
            text.append(" (overwrite: ").append(overwrite).append(")");
            return text.toString();
        }
    }

    /** The index ranges touched by one rule application. */
    private static class AffectedSpans {
        /** The ranges, in the order they were recorded. */
        final List<Span> spans;

        AffectedSpans(List<Span> spans) {
            this.spans = spans;
        }

        /**
         * Tells whether a position lies inside any recorded range.
         *
         * @param pos the position to test
         * @return {@code true} if {@code startIndex <= pos < endIndex} holds for at least one span
         */
        boolean isAffected(int pos) {
            boolean inside = false;
            for (Span candidate : spans) {
                if (candidate.startIndex <= pos && pos < candidate.endIndex) {
                    inside = true;
                    break;
                }
            }
            return inside;
        }
    }

    /** An index range with an inclusive start and an exclusive end. */
    private static class Span {
        /** First index of the range. */
        final int startIndex;
        /** Index just past the last element of the range. */
        final int endIndex;

        Span(int start, int end) {
            startIndex = start;
            endIndex = end;
        }
    }
}

