0
0
mirror of https://github.com/ankidroid/Anki-Android.git synced 2024-09-20 03:52:15 +02:00

Migrate CsvSniffer.java to Kotlin

This commit is contained in:
lukstbit 2022-05-09 08:06:23 +03:00 committed by Mike Hardy
parent 640ef9fec8
commit 6857229658
2 changed files with 252 additions and 307 deletions

View File

@ -43,7 +43,7 @@ permission notice:
// Example of class name: "/com/ichi2/anki/UIUtils.kt"
// Ensure that it starts with '/' (slash)
def source = Source.MAIN
def className = "/com/ichi2/libanki/importer/python/CsvSniffer.kt"
def className = ""
enum Source {
MAIN("/src/main/java"),

View File

@ -19,433 +19,378 @@
Ported from https://github.com/python/cpython/blob/a74eea238f5baba15797e2e8b570d153bc8690a7/Lib/csv.py#L159
*/
package com.ichi2.libanki.importer.python
package com.ichi2.libanki.importer.python;
import android.annotation.SuppressLint;
import android.os.Build;
import com.ichi2.libanki.importer.CsvException;
import com.ichi2.utils.HashUtil;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import androidx.annotation.Nullable;
import androidx.annotation.RequiresApi;
import android.annotation.SuppressLint
import android.os.Build
import androidx.annotation.RequiresApi
import com.ichi2.libanki.importer.CsvException
import com.ichi2.utils.HashUtil.HashMapInit
import com.ichi2.utils.KotlinCleanup
import java.util.*
import java.util.regex.Matcher
import java.util.regex.Pattern
@SuppressLint("NonPublicNonStaticFieldName")
@RequiresApi(Build.VERSION_CODES.O) // Regex group(str)
public class CsvSniffer {
@KotlinCleanup("fix IDE lint issues")
class CsvSniffer {
private val preferred: CharArray
private final char[] preferred;
public CsvSniffer() {
init {
// in case there is more than one possible delimiter
preferred = new char[] {',', '\t', ';', ' ', ':'};
preferred = charArrayOf(',', '\t', ';', ' ', ':')
}
public CsvDialect sniff(String sample, char[] delimiters) {
List<Character> delimiterList = toList(delimiters);
GuessQuoteAndDelimiter result = _guess_quote_and_delimiter(sample, delimiterList);
char quotechar = result.quotechar;
boolean doublequote = result.doublequote;
char delimiter = result.delimiter;
boolean skipinitialspace = result.skipinitialspace;
if (delimiter == '\0') {
Guess g = _guess_delimiter(sample, delimiterList);
delimiter = g.delimiter;
skipinitialspace = g.skipinitialspace;
fun sniff(sample: String, delimiters: CharArray?): CsvDialect {
val delimiterList = toList(delimiters)
val result = _guess_quote_and_delimiter(sample, delimiterList)
val quotechar = result.quotechar
val doublequote = result.doublequote
var delimiter = result.delimiter
var skipinitialspace = result.skipinitialspace
if (delimiter == '\u0000') {
val g = _guess_delimiter(sample, delimiterList)
delimiter = g.delimiter
skipinitialspace = g.skipinitialspace
}
if (delimiter == '\0') {
throw new CsvException("Could not determine delimiter");
if (delimiter == '\u0000') {
throw CsvException("Could not determine delimiter")
}
CsvDialect dialect = new CsvDialect("sniffed");
dialect.mDoublequote = doublequote;
dialect.mDelimiter = delimiter;
@KotlinCleanup("use a scope function")
val dialect = CsvDialect("sniffed")
dialect.mDoublequote = doublequote
dialect.mDelimiter = delimiter
// _csv.reader won't accept a quotechar of ''
dialect.mQuotechar = quotechar == '\0' ? '"' : quotechar;
dialect.mSkipInitialSpace = skipinitialspace;
return dialect;
dialect.mQuotechar = if (quotechar == '\u0000') '"' else quotechar
dialect.mSkipInitialSpace = skipinitialspace
return dialect
}
private List<Character> toList(@Nullable char[] delimiters) {
@KotlinCleanup("could be further simplified: return if/else, use delimiters.toList()")
private fun toList(delimiters: CharArray?): List<Char> {
if (delimiters == null) {
return new ArrayList<>(0);
return ArrayList(0)
}
ArrayList<Character> ret = new ArrayList<>(delimiters.length);
for (char delimiter : delimiters) {
ret.add(delimiter);
val ret = ArrayList<Char>(delimiters.size)
for (delimiter in delimiters) {
ret.add(delimiter)
}
return ret;
return ret
}
/**
* Looks for text enclosed between two identical quotes
* (the probable quotechar) which are preceded and followed
* by the same character (the probable delimiter).
* For example:
* ,'some text',
* The quote with the most wins, same with the delimiter.
* If there is no quotechar the delimiter can't be determined
* this way.
* Looks for text enclosed between two identical quotes
* (the probable quotechar) which are preceded and followed
* by the same character (the probable delimiter).
* For example:
* ,'some text',
* The quote with the most wins, same with the delimiter.
* If there is no quotechar the delimiter can't be determined
* this way.
*/
private GuessQuoteAndDelimiter _guess_quote_and_delimiter(String data, List<Character> delimiters) {
ArrayList<String> regexes = new ArrayList<>(4);
regexes.add("(?<delim>[^\\w\\n\"'])(?<space> ?)(?<quote>[\"']).*?\\k<quote>\\k<delim>"); // ,".*?",
regexes.add("(?:^|\\n)(?<quote>[\"']).*?\\k<quote>(?<delim>[^\\w\\n\"'])(?<space> ?)"); // ".*?",
regexes.add("(?<delim>[^\\w\\n\"'])(?<space> ?)(?<quote>[\"']).*?\\k<quote>(?:$|\\n)"); // ,".*?"
regexes.add("(?:^|\\n)(?<quote>[\"']).*?\\k<quote>(?:$|\\n)"); // ".*?" (no delim, no space)
List<Group> matches = new ArrayList<>();
for(String regex : regexes) {
Pattern p = Pattern.compile(regex, Pattern.MULTILINE | Pattern.DOTALL);
Matcher m = p.matcher(data);
private fun _guess_quote_and_delimiter(data: String, delimiters: List<Char>?): GuessQuoteAndDelimiter {
val regexes = ArrayList<String>(4)
regexes.add("(?<delim>[^\\w\\n\"'])(?<space> ?)(?<quote>[\"']).*?\\k<quote>\\k<delim>") // ,".*?",
regexes.add("(?:^|\\n)(?<quote>[\"']).*?\\k<quote>(?<delim>[^\\w\\n\"'])(?<space> ?)") // ".*?",
regexes.add("(?<delim>[^\\w\\n\"'])(?<space> ?)(?<quote>[\"']).*?\\k<quote>(?:$|\\n)") // ,".*?"
regexes.add("(?:^|\\n)(?<quote>[\"']).*?\\k<quote>(?:$|\\n)") // ".*?" (no delim, no space)
val matches: MutableList<Group> = ArrayList()
for (regex in regexes) {
val p = Pattern.compile(regex, Pattern.MULTILINE or Pattern.DOTALL)
val m = p.matcher(data)
while (m.find()) {
Group g = new Group();
g.delim = getCharOrNull(m, "delim");
g.quote = getCharOrNull(m, "quote");
g.space = m.group("space");
matches.add(g);
val g = Group()
g.delim = getCharOrNull(m, "delim")
g.quote = getCharOrNull(m, "quote")
g.space = m.group("space")
matches.add(g)
}
if (!matches.isEmpty()) {
break;
break
}
}
if (matches.isEmpty()) {
return new GuessQuoteAndDelimiter('\0', false, '\0', false);
return GuessQuoteAndDelimiter('\u0000', false, '\u0000', false)
}
Map<Character, Integer> quotes = HashUtil.HashMapInit(matches.size());
Map<Character, Integer> delims = new HashMap<>();
int spaces = 0;
for (Group m : matches) {
char key = m.quote;
if (key != '\0') {
quotes.put(key, quotes.getOrDefault(key, 0) + 1);
val quotes: MutableMap<Char, Int> = HashMapInit(matches.size)
val delims: MutableMap<Char, Int> = HashMap()
var spaces = 0
for (m in matches) {
var key = m.quote
if (key != '\u0000') {
quotes[key] = quotes.getOrDefault(key, 0) + 1
}
key = m.delim;
if (key != '\0' && (delimiters == null || delimiters.isEmpty() || delimiters.contains(key))) {
delims.put(key, delims.getOrDefault(key, 0) + 1);
key = m.delim
if (key != '\u0000' && (delimiters == null || delimiters.isEmpty() || delimiters.contains(key))) {
delims[key] = delims.getOrDefault(key, 0) + 1
}
if (m.space != null && m.space.length() > 0) {
spaces += 1;
if (m.space != null && m.space!!.length > 0) {
spaces += 1
}
}
Character quotechar = max(quotes);
Character delim;
boolean skipinitialspace;
val quotechar = max(quotes)!!
var delim: Char
val skipinitialspace: Boolean
if (!delims.isEmpty()) {
delim = max(delims);
skipinitialspace = delims.get(delim) == spaces;
delim = max(delims)!!
skipinitialspace = delims[delim] == spaces
if (delim == '\n') { // most likely a file with a single column
delim = '\0';
delim = '\u0000'
}
} else {
// there is *no* delimiter, it's a single column of quoted data
delim = '\0';
skipinitialspace = false;
delim = '\u0000'
skipinitialspace = false
}
// if we see an extra quote between delimiters, we've got a
// double quoted format
String regex = String.format("((%s)|^)\\W*%s[^%s\\n]*%s[^%s\\n]*%s\\W*((%s)|$)", delim, quotechar, delim, quotechar, delim, quotechar, delim);
Pattern dq_regexp = Pattern.compile(regex, Pattern.MULTILINE);
boolean doublequote = dq_regexp.matcher(data).find();
return new GuessQuoteAndDelimiter(quotechar, doublequote, delim, skipinitialspace);
val regex = String.format(
"((%s)|^)\\W*%s[^%s\\n]*%s[^%s\\n]*%s\\W*((%s)|$)",
delim,
quotechar,
delim,
quotechar,
delim,
quotechar,
delim
)
val dq_regexp = Pattern.compile(regex, Pattern.MULTILINE)
val doublequote = dq_regexp.matcher(data).find()
return GuessQuoteAndDelimiter(quotechar, doublequote, delim, skipinitialspace)
}
private char getCharOrNull(Matcher m, String delim) {
String group = m.group(delim);
if (group == null || group.length() == 0) {
return '\0';
}
return group.charAt(0);
@KotlinCleanup("method name?! the method can't return null")
/**
 * Reads the first character of the named capture group [delim] from the current match of [m].
 *
 * Despite the name, this never returns `null`: when the group did not take part in the
 * match, or matched the empty string, the NUL character (`'\u0000'`) is returned instead.
 *
 * @param m matcher positioned on a successful match
 * @param delim name of the capture group to read
 * @return the first character of the group, or `'\u0000'` if there is none
 */
private fun getCharOrNull(m: Matcher, delim: String): Char {
    // safe call handles an absent group, firstOrNull() handles an empty one
    val captured: String? = m.group(delim)
    return captured?.firstOrNull() ?: '\u0000'
}
/**
* The delimiter /should/ occur the same number of times on
* each row. However, due to malformed data, it may not. We don't want
* an all or nothing approach, so we allow for small variations in this
* number.
* 1) build a table of the frequency of each character on every line.
* 2) build a table of frequencies of this frequency (meta-frequency?),
* e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
* 7 times in 2 rows'
* 3) use the mode of the meta-frequency to determine the /expected/
* frequency for that character
* 4) find out how often the character actually meets that goal
* 5) the character that best meets its goal is the delimiter
* 1) build a table of the frequency of each character on every line.
* 2) build a table of frequencies of this frequency (meta-frequency?),
* e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
* 7 times in 2 rows'
* 3) use the mode of the meta-frequency to determine the /expected/
* frequency for that character
* 4) find out how often the character actually meets that goal
* 5) the character that best meets its goal is the delimiter
* For performance reasons, the data is evaluated in chunks, so it can
* try and evaluate the smallest portion of the data possible, evaluating
* additional chunks as necessary.
*/
private Guess _guess_delimiter(String input, List<Character> delimiters) {
private fun _guess_delimiter(input: String, delimiters: List<Char>?): Guess {
// remove falsey values
String[] samples = input.split("\n");
List<String> data = new ArrayList<>(samples.length);
for (String s : samples) {
if (s == null || s.length() == 0) {
continue;
val samples = input.split("\n").toTypedArray()
val data: MutableList<String> = ArrayList(samples.size)
for (s in samples) {
if (s.length == 0) {
continue
}
data.add(s);
data.add(s)
}
char[] ascii = new char[128]; // 7-bit ASCII
for(char i = 0; i < 128; i++) {
ascii[i] = i;
val ascii = CharArray(128) // 7-bit ASCII
for (i in 0..127) {
ascii[i] = i.toChar()
}
// build frequency tables
int chunkLength = Math.min(10, data.size());
int iteration = 0;
Map<Character, Map<Integer, Integer>> charFrequency = new HashMap<>();
Map<Character, Tuple> modes = new HashMap<>();
Map<Character, Tuple> delims = new HashMap<>();
int start = 0;
int end = chunkLength;
while (start < data.size()) {
iteration++;
for (String line : data.subList(start, end)) {
for (char c : ascii) {
Map<Integer, Integer> metaFrequency = charFrequency.getOrDefault(c, new HashMap<>());
val chunkLength = Math.min(10, data.size)
var iteration = 0
val charFrequency: MutableMap<Char, MutableMap<Int, Int>> = HashMap()
val modes: MutableMap<Char, Tuple> = HashMap()
val delims: MutableMap<Char, Tuple> = HashMap()
var start = 0
var end = chunkLength
while (start < data.size) {
iteration++
for (line in data.subList(start, end)) {
for (c in ascii) {
val metaFrequency = charFrequency.getOrDefault(c, HashMap())
// must count even if frequency is 0
int freq = countInString(line, c);
val freq = countInString(line, c)
// value is the mode
metaFrequency.put(freq, metaFrequency.getOrDefault(freq, 0) + 1);
charFrequency.put(c, metaFrequency);
metaFrequency[freq] = metaFrequency.getOrDefault(freq, 0) + 1
charFrequency[c] = metaFrequency
}
}
for (Map.Entry<Character, Map<Integer, Integer>> e : charFrequency.entrySet()) {
char c = e.getKey();
Set<Map.Entry<Integer, Integer>> bareList = e.getValue().entrySet();
List<Tuple> items = new ArrayList<>(bareList.size());
for (Map.Entry<Integer, Integer> entry : bareList) {
items.add(new Tuple(entry));
for ((c, value) in charFrequency) {
val bareList = value.entries
val items: MutableList<Tuple> = ArrayList(bareList.size)
for (entry in bareList) {
items.add(Tuple(entry))
}
if (items.size() == 1 && items.get(0).second == 0) {
continue;
if (items.size == 1 && items[0].second == 0) {
continue
}
// get the mode of the frequencies
if (items.size() > 1) {
modes.put(c, maxSecond(items));
if (items.size > 1) {
val toRemove = maxSecond(items)
// adjust the mode - subtract the sum of all
// other frequencies
Tuple toRemove = modes.get(c);
items.remove(toRemove);
modes.put(c, new Tuple(toRemove.first, toRemove.second - sumSecond(items)));
items.remove(toRemove)
modes[c] = Tuple(toRemove!!.first, toRemove.second - sumSecond(items))
} else {
modes.put(c, items.get(0));
modes[c] = items[0]
}
}
// build a list of possible delimiters
Set<Map.Entry<Character, Tuple>> modeList = modes.entrySet();
float total = Math.min(chunkLength * iteration, data.size());
val modeList: Set<Map.Entry<Char, Tuple>> = modes.entries
val total = Math.min(chunkLength * iteration, data.size).toFloat()
// (rows of consistent data) / (number of rows) = 100%
double consistency = 1.0;
var consistency = 1.0
// minimum consistency threshold
double threshold = 0.9;
val threshold = 0.9
while (delims.isEmpty() && consistency >= threshold) {
for (Map.Entry<Character, Tuple> entry : modeList) {
Tuple value = entry.getValue();
for ((key, value) in modeList) {
if (value.first > 0 && value.second > 0) {
if (((double) value.second / total) >= consistency && (delimiters == null || delimiters.contains(entry.getKey()))) {
delims.put(entry.getKey(), value);
if (value.second.toDouble() / total >= consistency && (delimiters == null || delimiters.contains(key))) {
delims[key] = value
}
}
}
consistency -= 0.01;
consistency -= 0.01
}
if (delims.size() == 1) {
Character delim = new ArrayList<>(delims.keySet()).get(0);
boolean skipinitialspace = countInString(data.get(0), delim) == countInString(data.get(0), delim + " ");
return new Guess(delim, skipinitialspace);
if (delims.size == 1) {
val delim = ArrayList(delims.keys)[0]
val skipinitialspace = countInString(data[0], delim) == countInString(
data[0], "$delim "
)
return Guess(delim, skipinitialspace)
}
// analyze another chunkLength lines
start = end;
end += chunkLength;
start = end
end += chunkLength
}
if (delims.isEmpty()) {
return new Guess('\0', false);
return Guess('\u0000', false)
}
// if there's more than one, fall back to a 'preferred' list
if (delims.size() > 1) {
for (char d : preferred) {
if (delims.size > 1) {
for (d in preferred) {
if (delims.containsKey(d)) {
boolean skipinitialspace = countInString(data.get(0), d) == countInString(data.get(0), d + " ");
return new Guess(d, skipinitialspace);
val skipinitialspace = countInString(data[0], d) == countInString(
data[0], "$d "
)
return Guess(d, skipinitialspace)
}
}
}
// nothing else indicates a preference, pick the character that
// dominates(?)
ArrayList<Map.Entry<Tuple, Character>> items = new ArrayList<>(delims.size());
for(Map.Entry<Character, Tuple> i : delims.entrySet()) {
items.add(new AbstractMap.SimpleEntry<>(i.getValue(), i.getKey()));
val items = ArrayList<Map.Entry<Tuple, Char>>(delims.size)
for ((key, value) in delims) {
items.add(AbstractMap.SimpleEntry(value, key))
}
items.sort((o1, o2) -> {
int compare = Integer.compare(o1.getKey().first, o2.getKey().first);
if (compare != 0) {
return compare;
items.sortWith(
kotlin.Comparator { o1: Map.Entry<Tuple, Char>, o2: Map.Entry<Tuple, Char> ->
val compare = o1.key.first.compareTo(o2.key.first)
if (compare != 0) {
compare
} else {
o1.key.second.compareTo(o2.key.second)
}
}
return Integer.compare(o1.getKey().second, o2.getKey().second);
});
char delim = items.get(items.size() - 1).getValue();
boolean skipinitialspace = countInString(data.get(0), delim) == countInString(data.get(0), delim + " ");
return new Guess(delim, skipinitialspace);
)
val delim = items[items.size - 1].value
val skipinitialspace = countInString(data[0], delim) == countInString(
data[0], "$delim "
)
return Guess(delim, skipinitialspace)
}
private int sumSecond(List<Tuple> items) {
int total = 0;
for (Tuple item : items) {
total += item.second;
private fun sumSecond(items: List<Tuple?>): Int {
var total = 0
for (item in items) {
total += item!!.second
}
return total;
return total
}
private <T> T max(Map<T, Integer> histogram) {
T max = null;
int maximum = 0;
for (Map.Entry<T, Integer> entry : histogram.entrySet()) {
if (entry.getValue() > maximum) {
maximum = entry.getValue();
max = entry.getKey();
private fun <T> max(histogram: Map<T, Int>): T? {
var max: T? = null
var maximum = 0
for ((key, value) in histogram) {
if (value > maximum) {
maximum = value
max = key
}
}
return max;
return max
}
/** max(items, key = lambda x:x[1]) */
private Tuple maxSecond(List<Tuple> items) {
/** max(items, key = lambda x:x[1]) */
private fun maxSecond(items: List<Tuple?>): Tuple? {
// items = [(1,1), (2,1)]
// pp(max(items, key = lambda x:x[1]))
// (1,1) - the first is picked, so use > max
int max = 0;
Tuple bestMax = null;
for (Tuple item : items) {
if (item.second > max) {
bestMax = item;
max = item.second;
var max = 0
var bestMax: Tuple? = null
for (item in items) {
if (item!!.second > max) {
bestMax = item
max = item.second
}
}
return bestMax;
return bestMax
}
private static class Tuple {
public final int first;
public final int second;
public Tuple(Integer key, Integer value) {
first = key;
second = value;
}
public Tuple(Map.Entry<Integer, Integer> entry) {
this(entry.getKey(), entry.getValue());
}
/**
 * Immutable pair of integers.
 *
 * Deliberately a plain class rather than a data class, so equality stays identity-based.
 */
private class Tuple(val first: Int, val second: Int) {
    /** Builds the pair from a map entry: the key becomes [first], the value becomes [second]. */
    constructor(entry: Map.Entry<Int, Int>) : this(first = entry.key, second = entry.value)
}
private static int countInString(String s, char c) {
int count = 0;
for (int i = 0; i < s.length(); i++) {
if (s.charAt(i) == c) {
count++;
/**
 * A [Guess] (delimiter plus skip-initial-space flag) extended with the detected
 * quote character and a flag telling whether a doubled quote was seen.
 *
 * Within this file a [quotechar] or delimiter of `'\u0000'` means "not determined".
 */
protected class GuessQuoteAndDelimiter(
val quotechar: Char,
val doublequote: Boolean,
delimiter: Char,
skipinitialspace: Boolean
) : Guess(delimiter, skipinitialspace)
/**
 * Mutable holder for the named groups captured by a single regex match.
 *
 * [quote] and [delim] start out as the NUL character, which this file uses as the
 * "absent" marker; [space] stays `null` until it is assigned.
 */
@KotlinCleanup("check: values were assigned by the migration tool, seems ok from where class it's used")
protected class Group {
    var quote: Char = '\u0000'
    var delim: Char = '\u0000'
    var space: String? = null
}
/**
 * A delimiter guess: the candidate [delimiter] and the [skipinitialspace] flag that
 * `sniff` copies into the resulting dialect.
 *
 * A [delimiter] of `'\u0000'` is used in this file to signal that none was found.
 */
protected open class Guess(val delimiter: Char, val skipinitialspace: Boolean)
companion object {
/**
 * Counts the occurrences of the character [c] in [s].
 *
 * @return the number of positions in [s] holding [c]; `0` for an empty string
 */
@JvmStatic
private fun countInString(s: String, c: Char): Int = s.count { ch -> ch == c }
return count;
}
private static int countInString(String haystack, String needle) {
int idx = 0;
int count = 0;
while (idx != -1) {
idx = haystack.indexOf(needle, idx);
if (idx != -1) {
count++;
idx += needle.length();
@JvmStatic
private fun countInString(haystack: String, needle: String): Int {
var idx = 0
var count = 0
while (idx != -1) {
idx = haystack.indexOf(needle, idx)
if (idx != -1) {
count++
idx += needle.length
}
}
}
return count;
}
/**
 * A {@link Guess} extended with the detected quote character and a flag telling
 * whether a doubled quote was seen.
 */
protected static class GuessQuoteAndDelimiter extends Guess {
// detected quote character
public final char quotechar;
// whether a doubled quote was seen
public final boolean doublequote;
/** Stores the quote information here and passes delimiter and skipinitialspace to {@link Guess}. */
public GuessQuoteAndDelimiter(char quotechar, boolean doublequote, char delimiter, boolean skipinitialspace) {
super(delimiter, skipinitialspace);
this.quotechar = quotechar;
this.doublequote = doublequote;
}
}
/**
 * Mutable holder for the named groups captured by a single regex match.
 * Fields keep their Java defaults (NUL characters and null) until assigned.
 */
protected static class Group {
// captured quote character
public char quote;
// captured delimiter character
public char delim;
// text captured by the "space" group; may be null
public String space;
}
protected static class Guess {
public final char delimiter;
public final boolean skipinitialspace;
public Guess(char delimiter, boolean skipinitialspace) {
this.delimiter = delimiter;
this.skipinitialspace = skipinitialspace;
return count
}
}
}