package net.kafujo.samples.wikidata;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

/**
 * Command-line helper that de-duplicates the lines of a raw CSV file by their
 * first column and strips the wikidata entity URL prefix from the kept lines.
 */
public class WikidataCityCleaner {

    /** URL prefix removed from every retained line so that only the bare id remains. */
    private static final String ENTITY_PREFIX = "http://www.wikidata.org/entity/";

    /** Input file used by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_SOURCE = "/opt/tmp/query.csv";

    /** Output file used by {@link #main(String[])} when no second argument is given. */
    private static final String DEFAULT_DESTINATION = "/opt/tmp/cleaned.csv";

    /**
     * Reads the given CSV file and keeps only the first line seen for each distinct
     * first-column value (the text before the first comma, which holds the wikidata id).
     * Every occurrence of the entity URL prefix is removed from the retained lines.
     *
     * <p>Blank lines are skipped and counted as ignored. A line without any comma is
     * treated as consisting of a single column, i.e. the whole line is its key.
     * A short summary (total / ignored / unique) is printed to standard output.
     *
     * @param raw path of the raw CSV file to read
     * @return the retained lines, in file order, with the entity URL prefix removed
     * @throws IOException if the file cannot be read
     */
    public static List<String> cleanRawCvs(Path raw) throws IOException {

        final Set<String> seenKeys = new HashSet<>(); // first-column values already encountered
        final List<String> cleaned = new ArrayList<>();
        int ignoreCount = 0;
        int totalCount = 0;

        for (var line : Files.readAllLines(raw)) {
            totalCount++;

            // A blank line carries no record; skipping it also avoids treating "" as a key.
            if (line.isBlank()) {
                ignoreCount++;
                continue;
            }

            // indexOf returns -1 when there is no comma; substring(0, -1) would throw,
            // so fall back to using the whole line as the key in that case.
            final int comma = line.indexOf(',');
            final String key = comma < 0 ? line : line.substring(0, comma);

            if (seenKeys.add(key)) {
                cleaned.add(line.replace(ENTITY_PREFIX, ""));
            } else {
                ignoreCount++;
            }
        }

        System.out.println("TOTAL   : " + totalCount);
        System.out.println("IGNORED : " + ignoreCount);
        System.out.println("UNIQUE  :  " + cleaned.size());
        assert seenKeys.size() == cleaned.size();

        return cleaned;
    }


    /**
     * Cleans a raw CSV file and writes the result to a destination file.
     *
     * @param args optional: {@code args[0]} is the input file (default
     *             {@code /opt/tmp/query.csv}), {@code args[1]} is the output file
     *             (default {@code /opt/tmp/cleaned.csv})
     * @throws IOException if reading the input or writing the output fails
     */
    public static void main(String[] args) throws IOException {
        final Path source = Path.of(args.length > 0 ? args[0] : DEFAULT_SOURCE);
        final Path dest = Path.of(args.length > 1 ? args[1] : DEFAULT_DESTINATION);
        final List<String> cleanedLines = cleanRawCvs(source);
        Files.write(dest, cleanedLines);
    }

}
