From 6857229658b74c99b08cf619971c6984fa7617fb Mon Sep 17 00:00:00 2001 From: lukstbit Date: Mon, 9 May 2022 08:06:23 +0300 Subject: [PATCH] Migrate CsvSniffer.java to kotlin --- AnkiDroid/kotlinMigration.gradle | 2 +- .../libanki/importer/python/CsvSniffer.kt | 557 ++++++++---------- 2 files changed, 252 insertions(+), 307 deletions(-) diff --git a/AnkiDroid/kotlinMigration.gradle b/AnkiDroid/kotlinMigration.gradle index 5975a72a03..a2e505b236 100644 --- a/AnkiDroid/kotlinMigration.gradle +++ b/AnkiDroid/kotlinMigration.gradle @@ -43,7 +43,7 @@ permission notice: // Example of class name: "/com/ichi2/anki/UIUtils.kt" // Ensure that it starts with '/' (slash) def source = Source.MAIN -def className = "/com/ichi2/libanki/importer/python/CsvSniffer.kt" +def className = "" enum Source { MAIN("/src/main/java"), diff --git a/AnkiDroid/src/main/java/com/ichi2/libanki/importer/python/CsvSniffer.kt b/AnkiDroid/src/main/java/com/ichi2/libanki/importer/python/CsvSniffer.kt index 7800e1595b..ada9ea9f24 100644 --- a/AnkiDroid/src/main/java/com/ichi2/libanki/importer/python/CsvSniffer.kt +++ b/AnkiDroid/src/main/java/com/ichi2/libanki/importer/python/CsvSniffer.kt @@ -19,433 +19,378 @@ Ported from https://github.com/python/cpython/blob/a74eea238f5baba15797e2e8b570d153bc8690a7/Lib/csv.py#L159 */ +package com.ichi2.libanki.importer.python -package com.ichi2.libanki.importer.python; - -import android.annotation.SuppressLint; -import android.os.Build; - -import com.ichi2.libanki.importer.CsvException; -import com.ichi2.utils.HashUtil; - -import java.util.AbstractMap; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import androidx.annotation.Nullable; -import androidx.annotation.RequiresApi; +import android.annotation.SuppressLint +import android.os.Build +import androidx.annotation.RequiresApi +import com.ichi2.libanki.importer.CsvException +import com.ichi2.utils.HashUtil.HashMapInit +import com.ichi2.utils.KotlinCleanup +import java.util.* +import java.util.regex.Matcher +import java.util.regex.Pattern @SuppressLint("NonPublicNonStaticFieldName") @RequiresApi(Build.VERSION_CODES.O) // Regex group(str) -public class CsvSniffer { +@KotlinCleanup("fix IDE lint issues") +class CsvSniffer { + private val preferred: CharArray - - private final char[] preferred; - - - public CsvSniffer() { + init { // in case there is more than one possible delimiter - preferred = new char[] {',', '\t', ';', ' ', ':'}; + preferred = charArrayOf(',', '\t', ';', ' ', ':') } - - - public CsvDialect sniff(String sample, char[] delimiters) { - - List delimiterList = toList(delimiters); - GuessQuoteAndDelimiter result = _guess_quote_and_delimiter(sample, delimiterList); - char quotechar = result.quotechar; - boolean doublequote = result.doublequote; - char delimiter = result.delimiter; - boolean skipinitialspace = result.skipinitialspace; - - if (delimiter == '\0') { - Guess g = _guess_delimiter(sample, delimiterList); - delimiter = g.delimiter; - skipinitialspace = g.skipinitialspace; + fun sniff(sample: String, delimiters: CharArray?): CsvDialect { + val delimiterList = toList(delimiters) + val result = _guess_quote_and_delimiter(sample, delimiterList) + val quotechar = result.quotechar + val doublequote = result.doublequote + var delimiter = result.delimiter + var skipinitialspace = result.skipinitialspace + if (delimiter == '\u0000') { + val g = _guess_delimiter(sample, delimiterList) + delimiter = g.delimiter + skipinitialspace = g.skipinitialspace } - - if (delimiter == '\0') { - throw new CsvException("Could not determine delimiter"); + if (delimiter == '\u0000') { + throw CsvException("Could not determine delimiter") } - - CsvDialect dialect = new CsvDialect("sniffed"); - - dialect.mDoublequote = doublequote; - dialect.mDelimiter = delimiter; + @KotlinCleanup("use a scope function") + val dialect = CsvDialect("sniffed") + dialect.mDoublequote = doublequote + dialect.mDelimiter = delimiter // _csv.reader won't accept a quotechar of '' - dialect.mQuotechar = quotechar == '\0' ? '"' : quotechar; - dialect.mSkipInitialSpace = skipinitialspace; - - return dialect; + dialect.mQuotechar = if (quotechar == '\u0000') '"' else quotechar + dialect.mSkipInitialSpace = skipinitialspace + return dialect } - - private List toList(@Nullable char[] delimiters) { + @KotlinCleanup("could be further simplified: return if/else, use delimiters.toList()") + private fun toList(delimiters: CharArray?): List { if (delimiters == null) { - return new ArrayList<>(0); + return ArrayList(0) } - ArrayList ret = new ArrayList<>(delimiters.length); - for (char delimiter : delimiters) { - ret.add(delimiter); + val ret = ArrayList(delimiters.size) + for (delimiter in delimiters) { + ret.add(delimiter) } - return ret; + return ret } - /** - * Looks for text enclosed between two identical quotes - * (the probable quotechar) which are preceded and followed - * by the same character (the probable delimiter). - * For example: - * ,'some text', - * The quote with the most wins, same with the delimiter. - * If there is no quotechar the delimiter can't be determined - * this way. + * Looks for text enclosed between two identical quotes + * (the probable quotechar) which are preceded and followed + * by the same character (the probable delimiter). + * For example: + * ,'some text', + * The quote with the most wins, same with the delimiter. + * If there is no quotechar the delimiter can't be determined + * this way. */ - private GuessQuoteAndDelimiter _guess_quote_and_delimiter(String data, List delimiters) { - ArrayList regexes = new ArrayList<>(4); - regexes.add("(?[^\\w\\n\"'])(? ?)(?[\"']).*?\\k\\k"); // ,".*?", - regexes.add("(?:^|\\n)(?[\"']).*?\\k(?[^\\w\\n\"'])(? ?)"); // ".*?", - regexes.add("(?[^\\w\\n\"'])(? ?)(?[\"']).*?\\k(?:$|\\n)"); // ,".*?" - regexes.add("(?:^|\\n)(?[\"']).*?\\k(?:$|\\n)"); // ".*?" (no delim, no space) - - - List matches = new ArrayList<>(); - - for(String regex : regexes) { - Pattern p = Pattern.compile(regex, Pattern.MULTILINE | Pattern.DOTALL); - Matcher m = p.matcher(data); + private fun _guess_quote_and_delimiter(data: String, delimiters: List?): GuessQuoteAndDelimiter { + val regexes = ArrayList(4) + regexes.add("(?[^\\w\\n\"'])(? ?)(?[\"']).*?\\k\\k") // ,".*?", + regexes.add("(?:^|\\n)(?[\"']).*?\\k(?[^\\w\\n\"'])(? ?)") // ".*?", + regexes.add("(?[^\\w\\n\"'])(? ?)(?[\"']).*?\\k(?:$|\\n)") // ,".*?" + regexes.add("(?:^|\\n)(?[\"']).*?\\k(?:$|\\n)") // ".*?" (no delim, no space) + val matches: MutableList = ArrayList() + for (regex in regexes) { + val p = Pattern.compile(regex, Pattern.MULTILINE or Pattern.DOTALL) + val m = p.matcher(data) while (m.find()) { - Group g = new Group(); - g.delim = getCharOrNull(m, "delim"); - g.quote = getCharOrNull(m, "quote"); - g.space = m.group("space"); - matches.add(g); + val g = Group() + g.delim = getCharOrNull(m, "delim") + g.quote = getCharOrNull(m, "quote") + g.space = m.group("space") + matches.add(g) } if (!matches.isEmpty()) { - break; + break } } if (matches.isEmpty()) { - return new GuessQuoteAndDelimiter('\0', false, '\0', false); + return GuessQuoteAndDelimiter('\u0000', false, '\u0000', false) } - - - Map quotes = HashUtil.HashMapInit(matches.size()); - Map delims = new HashMap<>(); - int spaces = 0; - for (Group m : matches) { - char key = m.quote; - if (key != '\0') { - quotes.put(key, quotes.getOrDefault(key, 0) + 1); + val quotes: MutableMap = HashMapInit(matches.size) + val delims: MutableMap = HashMap() + var spaces = 0 + for (m in matches) { + var key = m.quote + if (key != '\u0000') { + quotes[key] = quotes.getOrDefault(key, 0) + 1 } - - key = m.delim; - - if (key != '\0' && (delimiters == null || delimiters.isEmpty() || delimiters.contains(key))) { - delims.put(key, delims.getOrDefault(key, 0) + 1); + key = m.delim + if (key != '\u0000' && (delimiters == null || delimiters.isEmpty() || delimiters.contains(key))) { + delims[key] = delims.getOrDefault(key, 0) + 1 } - - if (m.space != null && m.space.length() > 0) { - spaces += 1; + if (m.space != null && m.space!!.length > 0) { + spaces += 1 } } - - Character quotechar = max(quotes); - - Character delim; - boolean skipinitialspace; + val quotechar = max(quotes)!! + var delim: Char + val skipinitialspace: Boolean if (!delims.isEmpty()) { - delim = max(delims); - skipinitialspace = delims.get(delim) == spaces; + delim = max(delims)!! + skipinitialspace = delims[delim] == spaces if (delim == '\n') { // most likely a file with a single column - delim = '\0'; + delim = '\u0000' } } else { // there is *no* delimiter, it's a single column of quoted data - delim = '\0'; - skipinitialspace = false; + delim = '\u0000' + skipinitialspace = false } - // if we see an extra quote between delimiters, we've got a // double quoted format - String regex = String.format("((%s)|^)\\W*%s[^%s\\n]*%s[^%s\\n]*%s\\W*((%s)|$)", delim, quotechar, delim, quotechar, delim, quotechar, delim); - Pattern dq_regexp = Pattern.compile(regex, Pattern.MULTILINE); - - - boolean doublequote = dq_regexp.matcher(data).find(); - - return new GuessQuoteAndDelimiter(quotechar, doublequote, delim, skipinitialspace); + val regex = String.format( + "((%s)|^)\\W*%s[^%s\\n]*%s[^%s\\n]*%s\\W*((%s)|$)", + delim, + quotechar, + delim, + quotechar, + delim, + quotechar, + delim + ) + val dq_regexp = Pattern.compile(regex, Pattern.MULTILINE) + val doublequote = dq_regexp.matcher(data).find() + return GuessQuoteAndDelimiter(quotechar, doublequote, delim, skipinitialspace) } - - private char getCharOrNull(Matcher m, String delim) { - String group = m.group(delim); - if (group == null || group.length() == 0) { - return '\0'; - } - return group.charAt(0); + @KotlinCleanup("method name?! the method can't return null") + private fun getCharOrNull(m: Matcher, delim: String): Char { + val group = m.group(delim) + return if (group == null || group.length == 0) { + '\u0000' + } else group[0] } - /** * The delimiter /should/ occur the same number of times on * each row. However, due to malformed data, it may not. We don't want * an all or nothing approach, so we allow for small variations in this * number. - * 1) build a table of the frequency of each character on every line. - * 2) build a table of frequencies of this frequency (meta-frequency?), - * e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows, - * 7 times in 2 rows' - * 3) use the mode of the meta-frequency to determine the /expected/ - * frequency for that character - * 4) find out how often the character actually meets that goal - * 5) the character that best meets its goal is the delimiter + * 1) build a table of the frequency of each character on every line. + * 2) build a table of frequencies of this frequency (meta-frequency?), + * e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows, + * 7 times in 2 rows' + * 3) use the mode of the meta-frequency to determine the /expected/ + * frequency for that character + * 4) find out how often the character actually meets that goal + * 5) the character that best meets its goal is the delimiter * For performance reasons, the data is evaluated in chunks, so it can * try and evaluate the smallest portion of the data possible, evaluating * additional chunks as necessary. */ - private Guess _guess_delimiter(String input, List delimiters) { + private fun _guess_delimiter(input: String, delimiters: List?): Guess { // remove falsey values - String[] samples = input.split("\n"); - List data = new ArrayList<>(samples.length); - for (String s : samples) { - if (s == null || s.length() == 0) { - continue; + val samples = input.split("\n").toTypedArray() + val data: MutableList = ArrayList(samples.size) + for (s in samples) { + if (s.length == 0) { + continue } - data.add(s); + data.add(s) } - - char[] ascii = new char[128]; // 7-bit ASCII - for(char i = 0; i < 128; i++) { - ascii[i] = i; + val ascii = CharArray(128) // 7-bit ASCII + for (i in 0..127) { + ascii[i] = i.toChar() } // build frequency tables - int chunkLength = Math.min(10, data.size()); - int iteration = 0; - Map> charFrequency = new HashMap<>(); - Map modes = new HashMap<>(); - Map delims = new HashMap<>(); - int start = 0; - int end = chunkLength; - - while (start < data.size()) { - iteration++; - for (String line : data.subList(start, end)) { - for (char c : ascii) { - Map metaFrequency = charFrequency.getOrDefault(c, new HashMap<>()); + val chunkLength = Math.min(10, data.size) + var iteration = 0 + val charFrequency: MutableMap> = HashMap() + val modes: MutableMap = HashMap() + val delims: MutableMap = HashMap() + var start = 0 + var end = chunkLength + while (start < data.size) { + iteration++ + for (line in data.subList(start, end)) { + for (c in ascii) { + val metaFrequency = charFrequency.getOrDefault(c, HashMap()) // must count even if frequency is 0 - int freq = countInString(line, c); + val freq = countInString(line, c) // value is the mode - metaFrequency.put(freq, metaFrequency.getOrDefault(freq, 0) + 1); - charFrequency.put(c, metaFrequency); + metaFrequency[freq] = metaFrequency.getOrDefault(freq, 0) + 1 + charFrequency[c] = metaFrequency } } - for (Map.Entry> e : charFrequency.entrySet()) { - char c = e.getKey(); - Set> bareList = e.getValue().entrySet(); - - List items = new ArrayList<>(bareList.size()); - - for (Map.Entry entry : bareList) { - items.add(new Tuple(entry)); + for ((c, value) in charFrequency) { + val bareList = value.entries + val items: MutableList = ArrayList(bareList.size) + for (entry in bareList) { + items.add(Tuple(entry)) } - - if (items.size() == 1 && items.get(0).second == 0) { - continue; + if (items.size == 1 && items[0].second == 0) { + continue } // get the mode of the frequencies - if (items.size() > 1) { - modes.put(c, maxSecond(items)); + if (items.size > 1) { + val toRemove = maxSecond(items) // adjust the mode - subtract the sum of all // other frequencies - Tuple toRemove = modes.get(c); - items.remove(toRemove); - modes.put(c, new Tuple(toRemove.first, toRemove.second - sumSecond(items))); + items.remove(toRemove) + modes[c] = Tuple(toRemove!!.first, toRemove.second - sumSecond(items)) } else { - modes.put(c, items.get(0)); + modes[c] = items[0] } } // build a list of possible delimiters - Set> modeList = modes.entrySet(); - float total = Math.min(chunkLength * iteration, data.size()); + val modeList: Set> = modes.entries + val total = Math.min(chunkLength * iteration, data.size).toFloat() // (rows of consistent data) / (number of rows) = 100% - double consistency = 1.0; + var consistency = 1.0 // minimum consistency threshold - double threshold = 0.9; + val threshold = 0.9 while (delims.isEmpty() && consistency >= threshold) { - for (Map.Entry entry : modeList) { - Tuple value = entry.getValue(); + for ((key, value) in modeList) { if (value.first > 0 && value.second > 0) { - if (((double) value.second / total) >= consistency && (delimiters == null || delimiters.contains(entry.getKey()))) { - delims.put(entry.getKey(), value); + if (value.second.toDouble() / total >= consistency && (delimiters == null || delimiters.contains(key))) { + delims[key] = value } } } - consistency -= 0.01; + consistency -= 0.01 } - - if (delims.size() == 1) { - Character delim = new ArrayList<>(delims.keySet()).get(0); - boolean skipinitialspace = countInString(data.get(0), delim) == countInString(data.get(0), delim + " "); - return new Guess(delim, skipinitialspace); + if (delims.size == 1) { + val delim = ArrayList(delims.keys)[0] + val skipinitialspace = countInString(data[0], delim) == countInString( + data[0], "$delim " + ) + return Guess(delim, skipinitialspace) } // analyze another chunkLength lines - start = end; - end += chunkLength; + start = end + end += chunkLength } - if (delims.isEmpty()) { - return new Guess('\0', false); + return Guess('\u0000', false) } // if there's more than one, fall back to a 'preferred' list - if (delims.size() > 1) { - for (char d : preferred) { + if (delims.size > 1) { + for (d in preferred) { if (delims.containsKey(d)) { - boolean skipinitialspace = countInString(data.get(0), d) == countInString(data.get(0), d + " "); - return new Guess(d, skipinitialspace); + val skipinitialspace = countInString(data[0], d) == countInString( + data[0], "$d " + ) + return Guess(d, skipinitialspace) } } } // nothing else indicates a preference, pick the character that // dominates(?) - ArrayList> items = new ArrayList<>(delims.size()); - for(Map.Entry i : delims.entrySet()) { - items.add(new AbstractMap.SimpleEntry<>(i.getValue(), i.getKey())); + val items = ArrayList>(delims.size) + for ((key, value) in delims) { + items.add(AbstractMap.SimpleEntry(value, key)) } - items.sort((o1, o2) -> { - int compare = Integer.compare(o1.getKey().first, o2.getKey().first); - if (compare != 0) { - return compare; + items.sortWith( + kotlin.Comparator { o1: Map.Entry, o2: Map.Entry -> + val compare = o1.key.first.compareTo(o2.key.first) + if (compare != 0) { + compare + } else { + o1.key.second.compareTo(o2.key.second) + } } - return Integer.compare(o1.getKey().second, o2.getKey().second); - }); - char delim = items.get(items.size() - 1).getValue(); - - boolean skipinitialspace = countInString(data.get(0), delim) == countInString(data.get(0), delim + " "); - return new Guess(delim, skipinitialspace); - + ) + val delim = items[items.size - 1].value + val skipinitialspace = countInString(data[0], delim) == countInString( + data[0], "$delim " + ) + return Guess(delim, skipinitialspace) } - - private int sumSecond(List items) { - int total = 0; - for (Tuple item : items) { - total += item.second; + private fun sumSecond(items: List): Int { + var total = 0 + for (item in items) { + total += item!!.second } - return total; + return total } - - private T max(Map histogram) { - T max = null; - int maximum = 0; - for (Map.Entry entry : histogram.entrySet()) { - if (entry.getValue() > maximum) { - maximum = entry.getValue(); - max = entry.getKey(); + private fun max(histogram: Map): T? { + var max: T? = null + var maximum = 0 + for ((key, value) in histogram) { + if (value > maximum) { + maximum = value + max = key } } - return max; + return max } - - /** max(items, key = lambda x:x[1]) */ - private Tuple maxSecond(List items) { + /** max(items, key = lambda x:x[1]) */ + private fun maxSecond(items: List): Tuple? { // items = [(1,1), (2,1)] // pp(max(items, key = lambda x:x[1])) // (1,1) - the first is picked, so use > max - int max = 0; - Tuple bestMax = null; - for (Tuple item : items) { - if (item.second > max) { - bestMax = item; - max = item.second; + var max = 0 + var bestMax: Tuple? = null + for (item in items) { + if (item!!.second > max) { + bestMax = item + max = item.second } } - return bestMax; + return bestMax } - - private static class Tuple { - public final int first; - public final int second; - - - public Tuple(Integer key, Integer value) { - first = key; - second = value; - } - - - public Tuple(Map.Entry entry) { - this(entry.getKey(), entry.getValue()); - } + private class Tuple(val first: Int, val second: Int) { + constructor(entry: Map.Entry) : this(entry.key, entry.value) {} } - private static int countInString(String s, char c) { - int count = 0; - for (int i = 0; i < s.length(); i++) { - if (s.charAt(i) == c) { - count++; + protected class GuessQuoteAndDelimiter( + val quotechar: Char, + val doublequote: Boolean, + delimiter: Char, + skipinitialspace: Boolean + ) : Guess(delimiter, skipinitialspace) + + @KotlinCleanup("check: values were assigned by the migration tool, seems ok from where class it's used") + protected class Group { + var quote = 0.toChar() + var delim = 0.toChar() + var space: String? = null + } + + protected open class Guess(val delimiter: Char, val skipinitialspace: Boolean) + + companion object { + @JvmStatic + private fun countInString(s: String, c: Char): Int { + var count = 0 + for (i in 0 until s.length) { + if (s[i] == c) { + count++ + } } + return count } - return count; - } - private static int countInString(String haystack, String needle) { - int idx = 0; - int count = 0; - - while (idx != -1) { - idx = haystack.indexOf(needle, idx); - if (idx != -1) { - count++; - idx += needle.length(); + @JvmStatic + private fun countInString(haystack: String, needle: String): Int { + var idx = 0 + var count = 0 + while (idx != -1) { + idx = haystack.indexOf(needle, idx) + if (idx != -1) { + count++ + idx += needle.length + } } - } - return count; - } - - protected static class GuessQuoteAndDelimiter extends Guess { - public final char quotechar; - public final boolean doublequote; - - - public GuessQuoteAndDelimiter(char quotechar, boolean doublequote, char delimiter, boolean skipinitialspace) { - super(delimiter, skipinitialspace); - this.quotechar = quotechar; - this.doublequote = doublequote; - } - } - - protected static class Group { - public char quote; - public char delim; - public String space; - } - - protected static class Guess { - public final char delimiter; - public final boolean skipinitialspace; - - - public Guess(char delimiter, boolean skipinitialspace) { - this.delimiter = delimiter; - this.skipinitialspace = skipinitialspace; + return count } } }