/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.dev;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.tagging.pt.PortugueseTagger;

/**
 * Command-line tool that prints the words of an input file for which the
 * {@link PortugueseTagger} returns an untagged reading (both as written and
 * lowercased) and that also occur in a Gaia keyboard word list.
 *
 * <p>Each output line has the form {@code <count>\t<word>}. The count is the
 * one given on the input line (format {@code <count> <word>}) if present,
 * otherwise the frequency value from the Gaia word list.
 */
public class MissingPortuguesePosFinder {

    /** An input line that starts with a numeric count followed by a single space. */
    private static final Pattern COUNTED_LINE = Pattern.compile("(\\d+) (.*)");

    /** One {@code <w f="..." flags="...">word</w>} entry of the Gaia XML word list. */
    private static final Pattern GAIA_ENTRY = Pattern.compile("<w f=\"(\\d+)\" flags=\".*?\">(.*?)</w>");

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the file with one word (optionally prefixed
     *             by a count) per line, {@code args[1]} is the Gaia word list
     * @throws IOException if one of the files cannot be read
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 2) {
            System.out.println("Usage: " + MissingPortuguesePosFinder.class.getSimpleName() + " <file> <gaia_file>");
            System.out.println("   <gaia_file> is e.g. pt_br_wordlist.xml from https://github.com/mozilla-b2g/gaia/tree/master/apps/keyboard/js/imes/latin/dictionaries");
            System.exit(1);
        }
        Map<String, Integer> occ = getOccurrences(new File(args[1]));
        List<String> lines = Files.readAllLines(Paths.get(args[0]));
        PortugueseTagger tagger = new PortugueseTagger();
        for (String line : lines) {
            long origCount = -1;
            String word = line;
            Matcher counted = COUNTED_LINE.matcher(line);
            if (counted.matches()) {
                origCount = Long.parseLong(counted.group(1));
                word = firstToken(counted.group(2));
                if (word.isEmpty()) {
                    // A count without any word after it: nothing to look up.
                    continue;
                }
            }
            word = word.trim();
            if (word.endsWith(".")) {
                word = word.substring(0, word.length() - 1);
            }
            // Cheap map lookup first, so the tagger only runs for candidate words.
            Integer gaiaCount = occ.get(word);
            if (gaiaCount == null) {
                continue;
            }
            // Locale.ROOT keeps lowercasing independent of the JVM's default locale.
            if (!isUntagged(tagger, word) || !isUntagged(tagger, word.toLowerCase(Locale.ROOT))) {
                continue;
            }
            long count = origCount == -1 ? gaiaCount.longValue() : origCount;
            System.out.println(count + "\t" + word);
        }
    }

    /**
     * Returns the first non-empty, space-delimited token of {@code text},
     * or the empty string if there is none.
     */
    private static String firstToken(String text) {
        for (String token : text.split(" ")) {
            if (!token.isEmpty()) {
                return token;
            }
        }
        return "";
    }

    /**
     * Tags {@code word} as a one-token sentence and reports whether the tagger
     * returned exactly one reading and that reading is not tagged.
     */
    private static boolean isUntagged(PortugueseTagger tagger, String word) throws IOException {
        List<AnalyzedTokenReadings> readings = tagger.tag(Collections.singletonList(word));
        return readings.size() == 1 && noTag(readings.get(0));
    }

    /**
     * Reads the Gaia XML word list line by line and collects the {@code f}
     * attribute value of every {@code <w>} entry, keyed by the entry's word.
     * Lines that are not {@code <w>} entries, or that do not match the expected
     * entry format, are reported on standard output and skipped. If a word
     * occurs more than once, the last value wins.
     *
     * @param gaiaXmlFile the Gaia word list file
     * @return map from word to its {@code f} value
     * @throws IOException if the file cannot be read
     */
    private static Map<String, Integer> getOccurrences(File gaiaXmlFile) throws IOException {
        Map<String, Integer> map = new HashMap<>();
        for (String rawLine : Files.readAllLines(gaiaXmlFile.toPath())) {
            String line = rawLine.trim();
            if (!line.startsWith("<w ")) {
                System.out.println("Skipping line: " + line);
                continue;
            }
            Matcher matcher = GAIA_ENTRY.matcher(line);
            if (!matcher.matches()) {
                System.out.println("Skipping line, doesn't match regex: " + line);
                continue;
            }
            map.put(matcher.group(2), Integer.parseInt(matcher.group(1)));
        }
        return map;
    }

    /** Returns {@code true} if the given readings are not tagged. */
    private static boolean noTag(AnalyzedTokenReadings atr) {
        return !atr.isTagged();
    }
}

