mirror of https://github.com/florisboard/florisboard.git synced 2024-09-19 19:42:20 +02:00
florisboard/utils/convert_fcitx5_sqlite.py
Waelwindows a5dab5fb5a
Add Chinese Shape Based Layouts (#2054)
* feat(ime/nlp): Add `HanShapeBasedLanguageProvider`

* feat: Manually set default NLP to be HanShapeBased

* feat: Temporarily disable adding spaces

This commit should give insight into how the keyboard adds spaces; this
should then be refined so that no space is added after committing a CJK
text suggestion.

* fix(ime/nlp): Remove empty str suggest in HanShape

* feat(ime/nlp): Handle locale variants in HanShape

This should facilitate multiple layouts in the zh locale

* fix(ime/nlp): Handle query params in HanShape

This also helps performance, as the database connection doesn't have to
recompile the query for every string the user writes
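
A minimal Python sqlite3 sketch of the idea (the provider itself is Kotlin,
so the table name and query below are purely illustrative; the database is
the han.sqlite3 produced by utils/convert_fcitx5_sqlite.py):

    import sqlite3

    con = sqlite3.connect('han.sqlite3')
    cur = con.cursor()
    # One parameterized statement; sqlite3 caches the compiled form, so only
    # the bound prefix changes as the user keeps typing.
    query = 'select text from zhengma where code like ? order by weight desc limit 10'
    for prefix in ('a', 'ab', 'abc'):
        cur.execute(query, (prefix + '%',))
        print(cur.fetchall())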

* Space behavior QoL updates for Han shape-based layout (#1)

* Separate space behavior for zh* and Latin, and allow a space when there is no suggestion.

Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>

* Add checking if locale is CJK

Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>

* refactor: Change predicate to a getter & rename

* chore: Remove TODO `supportsAutoSpace` message

* fix: Fix spaces after sugg. in non-space subtypes

* fix: Fix auto space predicate in `PhantomSpace`

Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>
Co-authored-by: waelwindows <waelwindows9922@gmail.com>

* Draft: editor screen exposes nlpProviders and shape-based Chinese input methods as variants

Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>

* Fix defaults for zhengma preset

Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>

* Add word tables for added input methods

Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>

* Fix: bug in zhengma preset

* Draft: support composing with special characters by letting the nlpProvider decide the composing range.

* Catch SQLite errors such as the layout (locale variant) not being found (e.g. using HanShapeBased with JIS)

* fixup: remove TODO

* fix: partly addresses 2101; allow searching for a locale in English on phones lacking the system locale IME

* Adds support for importing "language packs" (sqlite3 db for HanShapeBased for now)

* Changes language pack to zip files. Adds a basic language pack class for storing metadata of IMEs.

Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>

* Implement language pack as a type of Flex extension, and draft its import and view UI

Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>

* fix: input method name translation

Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>

* Trim down to zhengma, quickclassic, and cangjie for the barebones Chinese shape-based pack. Polish extension user documentation.

* Fix hack to allow multiple language pack extensions to co-exist.

Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>

* Replace quickclassic with boshiamy

* Fix href in LANGUAGEPACKS.md

* build(nix): Clean up nix flake

* refactor: Encapsulate language pack query in HSB

* feat(ime/nlp): Implement `getListOfWords` in HSB

* feat(ime/nlp): Implement `getFrequencyForWord`

* chore: Normalize weights for freq in `han.sqlite3`

* chore(ime/nlp): Add some logging for HSB

* Update app/src/main/assets/ime/keyboard/org.florisboard.localization/extension.json

Co-authored-by: Patrick Goldinger <patrick@patrickgold.dev>

Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>
Co-authored-by: moonbeamcelery <114041522+moonbeamcelery@users.noreply.github.com>
Co-authored-by: moonbeamcelery <moonbeamcelery@proton.me>
Co-authored-by: Patrick Goldinger <patrick@patrickgold.dev>
2023-01-15 17:22:10 +01:00


#!/usr/bin/env python3
# Execute in this folder to convert to sqlite:
# python3 convert_fcitx5_sqlite.py
# Or for a subset of tables only:
# python3 convert_fcitx5_sqlite.py cangjie-large.txt quick-classic.txt wubi-large.txt zhengma.txt
# https://github.com/fcitx/fcitx5-table-extra/tree/master/tables
# The tables are in public domain per their README.
import os
import sys
import re
import json
import glob
import sqlite3
import collections

def put_table(database, schema, table):
    length, table = table['LengthReal'], table['Data']
    assert re.fullmatch('[a-zA-Z0-9_]+', schema) is not None
    columns = len(table[0])
    assert all(len(x) == columns for x in table)
    with sqlite3.connect(database) as con:
        cur = con.cursor()
        if columns == 3:
            cur.execute(f'create table {schema}(code VARCHAR({length}), text TEXT, weight INT)')
            cur.executemany(f'insert into {schema} values(?, ?, ?)', table)
        elif columns == 4:
            # stem length derived from the longest stem present in the table
            length_stem = max(len(x[3]) for x in table if x[3] is not None)
            cur.execute(f'create table {schema}(code VARCHAR({length}), text TEXT, weight INT, stem VARCHAR({length_stem}))')
            cur.executemany(f'insert into {schema} values(?, ?, ?, ?)', table)
        else:
            raise ValueError(f'Number of columns ({columns}) not supported')
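
# Example of how a consumer might look up candidates in one of these tables
# (illustrative only; the query actually used by FlorisBoard's
# HanShapeBasedLanguageProvider lives in the Kotlin code, not in this script):
#
#   select text from zhengma where code like 'ab%' order by weight desc limit 8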

fcitx_fields_translate = {
    '组词规则': 'Rule',
    '数据': 'Data',
    '提示': 'Prompt',
    '拼音长度': 'PinyinLength',
    '键码': 'KeyCode',
    '拼音': 'Pinyin',
    '码长': 'Length',
    '构词': 'ConstructPhrase',
}
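
# For reference, a raw fcitx5 table file is plain text: a header of key=value
# fields using the Chinese names above, followed by a [数据] section of
# "code text" rows, e.g. (values illustrative only):
#
#   键码=abcdefghijklmnopqrstuvwxy
#   码长=4
#   [数据]
#   a 工
#   aaaa 工
#
# Lines starting with ';' are comments; data rows are split at the first
# space or tab (see parse_fcitx_table below).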

def parse_fcitx_table(table):
    with open(table, 'rt') as f:
        lines = [line.strip('\n') for line in f.readlines()]
    parsed = dict()
    field_now = ''
    for idx, line in enumerate(lines):
        if '\ufeff' in line:
            line = line.replace('\ufeff', '')
        if not line or line.startswith(';'):
            continue
        if line.startswith('[') and line.endswith(']'):
            # starting a table
            field_now = line[1:-1]
            field_now = fcitx_fields_translate.get(field_now, field_now)
            table_now = parsed[field_now] = []
        else:
            if field_now:
                # appending to a table
                if field_now == 'Data':
                    # Parse first ' ' or '\t' as splitting point.
                    # Assume ' ' and '\t' may be in the text.
                    split = len(line)
                    for x in ' \t':
                        try:
                            split = min(split, line.index(x))
                        except ValueError:
                            pass
                    if split == len(line):
                        print(f'Throwing away row with one column:')
                        print(repr(line))
                        line = None
                    else:
                        line = (line[:split], line[split+1:])
                # elif field_now == 'Rule':
                else:
                    line = line.split('=')
                    assert len(line) == 2
                # else:
                #     raise ValueError(f'Table field {field_now} not recognized')
                if line is not None:
                    table_now.append(line)
            else:
                # parsing other settings
                assert '=' in line, f'{table} has line without "=":\n{line}'
                split = line.index('=')
                field = line[:split]
                field = fcitx_fields_translate.get(field, field)
                parsed[field] = line[split+1:]
    return parsed
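
# For a table whose header declares 键码=abcdefghijklmnopqrstuvwxy and 码长=4
# and whose [数据] rows are 'a 工' and 'aaaa 工', parse_fcitx_table returns
# roughly:
#   {'KeyCode': 'abcdefghijklmnopqrstuvwxy', 'Length': '4',
#    'Data': [('a', '工'), ('aaaa', '工')]}
# i.e. header values stay strings, field names are mapped to English via
# fcitx_fields_translate, and each data row becomes a (code, text) tuple.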

def clean_fcitx_table(table):
    # process Data with special field.
    out = dict(table)
    # compute actual KeyCode used.
    keycode_real = set()
    for x in out['Data']:
        keycode_real |= set(x[0])
    # Prompt: just add to word list and KeyCode.
    if 'Prompt' in out and out['Prompt'] in keycode_real:
        out['KeyCode'] += out['Prompt']
    # Pinyin: just add to word list and KeyCode.
    if 'Pinyin' in out and out['Pinyin'] in keycode_real:
        out['KeyCode'] += out['Pinyin']
    # ConstructPhrase: add to "stem" column. (for zhengma_large)
    if 'ConstructPhrase' in out and out['ConstructPhrase'] in keycode_real:
        conchar = out['ConstructPhrase']
        # separate constructing and non-constructing parts of the table
        table_noncon = [x for x in out['Data'] if conchar not in x[0]]
        table_con = [(x[0][1:], x[1]) for x in out['Data'] if conchar in x[0]]
        # do a join on text
        dict_con = {x[1]: x[0] for x in table_con}
        assert len(table_con) == len(dict_con), \
            'ConstructPhrase entries not unique'
        assert all(conchar not in x for x in dict_con.values()), \
            'ConstructPhrase appearing after starts'
        out['Data'] = [(x[0], x[1], dict_con.get(x[1], None))
                       for x in table_noncon]
    # Weight: just use order.
    counter = collections.Counter(x[0] for x in out['Data'])
    for idx, x in enumerate(out['Data']):
        weight = counter[x[0]]
        counter.subtract((x[0],))
        x = x[:2] + (weight,) + x[2:]
        out['Data'][idx] = x
    assert not len(list(counter.elements()))
    # compute KeyCodeReal one more time after trimming table
    keycode_real = set()
    for x in out['Data']:
        keycode_real |= set(x[0])
    out['KeyCodeReal'] = keycode_real
    # actual seek length
    out['LengthReal'] = max(len(x[0]) for x in out['Data'])
    return out
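
# Weight assignment in clean_fcitx_table, worked through: Counter counts how
# often each code occurs, so if code 'ab' appears three times in file order,
# its rows receive weights 3, 2, 1 in that order; sorting by weight descending
# therefore preserves the original table order for a given code.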

# Loading
tables = dict()
file_list = sys.argv[1:] if len(sys.argv) > 1 else glob.glob('[a-z]*.txt')
assert all(x.endswith('.txt') for x in file_list)
for x in file_list:
    print(f'Processing {x}...')
    schema = x[:-4].replace('-', '').replace('_', '')
    tables[schema] = parse_fcitx_table(x)
    conf = parse_fcitx_table(x[:-4] + '.conf.in')
    conf = {k: {x[0]: x[1] for x in v} for k, v in conf.items()}
    tables[schema]['.conf.in'] = conf
    tables[schema]['FlorisLocale'] = f"{conf['InputMethod']['LangCode']}_{schema}"

# Fixing
if 'wubi98pinyin' in tables:
    tables['wubi98pinyin']['KeyCode'] += 'z'
    keycode = set(tables['wubi98pinyin']['KeyCode']) | set(tables['wubi98pinyin']['Pinyin'])
    for idx, x in enumerate(tables['wubi98pinyin']['Data']):
        if not all(ch in keycode for ch in x[0]):
            x = list(x)
            x[0] = ''.join(ch for ch in x[0] if ch in keycode)
            tables['wubi98pinyin']['Data'][idx] = tuple(x)
if 'easylarge' in tables:
    tables['easylarge']['KeyCode'] += '|'

# Cleaning
for schema, table in tables.items():
    print(f'Cleaning {schema}, with {len(table["Data"])} items...', end='')
    tables[schema] = clean_fcitx_table(table)
    print(f' Done, with {len(tables[schema]["Data"])} items.')

# Analysis
if True:
    for schema, table in tables.items():
        print(f'Analyzing {schema}... LengthReal = {table["LengthReal"]}')
        specials = ["Prompt", "Pinyin", "ConstructPhrase"]
        for field in specials:
            if field in table:
                has = [x for x in table['Data'] if table[field] in x[0]]
                if has:
                    print(f'There are {len(has)}/{len(table["Data"])} with {field}={table[field]}')
        keycode = set(table['KeyCode'])
        keycode_real = set(table['KeyCodeReal'])
        if keycode != keycode_real:
            print(f'KeyCode mismatch:')
            print(f'Claimed not used: ' + ''.join(sorted(keycode - keycode_real)))
            print(f'Exists unclaimed: ' + ''.join(sorted(keycode_real - keycode)))

# Writing
language_pack = [dict(id=table['FlorisLocale'], hanShapeBasedKeyCode=table['KeyCode'])
                 for schema, table in tables.items()]
with open('./extension-draft.json', 'wt') as f:
    json.dump({'$': 'ime.extension.languagepack',
               'items': sorted(language_pack, key=lambda x: x['id'])}, f, indent=2)
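
# The emitted extension-draft.json has roughly this shape (the ids depend on
# which tables were converted; the entry below is illustrative):
#   {
#     "$": "ime.extension.languagepack",
#     "items": [
#       {"id": "zh_CN_zhengma", "hanShapeBasedKeyCode": "..."},
#       ...
#     ]
#   }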

database = './han.sqlite3'
if os.path.exists(database):
    os.remove(database)
for schema, table in tables.items():
    put_table(database, schema, table)
    # put_table(database, table['FlorisLocale'], table)
print({schema: table['KeyCode'] for schema, table in tables.items()})

# Final display
with sqlite3.connect(database) as con:
    cur = con.cursor()
    # for schema in ['zh_CN_zhengmapinyin', 'zh_CN_zhengmalarge', 'zh_CN_wubilarge', 'zh_CN_wubi98', 'zh_TW_cangjie5', 'zh_HK_stroke5']:
    for schema in ['zhengmapinyin', 'zhengmalarge', 'wubilarge', 'wubi98', 'cangjie5', 'stroke5']:
        if schema not in tables:
            continue
        cur.execute(f'select * from {schema} order by length(code) desc')
        print(cur.fetchmany(10))