florisboard/utils/convert_fcitx5_sqlite.py

#!/usr/bin/env python3

# Execute in this folder to convert to sqlite:
#     python3 convert_fcitx5_sqlite.py
# Or for a subset of tables only:
#     python3 convert_fcitx5_sqlite.py cangjie-large.txt quick-classic.txt wubi-large.txt zhengma.txt
# https://github.com/fcitx/fcitx5-table-extra/tree/master/tables
# The tables are in public domain per their README.

import os
import sys
import re
import json
import glob
import sqlite3
import collections


def put_table(database, schema, table):
    """Create the SQLite table `schema` in `database` and insert all rows.

    Args:
        database: Path of the SQLite database file, passed to
            ``sqlite3.connect``.
        schema: Name of the table to create. It is interpolated into the SQL
            text, so only names matching ``[a-zA-Z0-9_]+`` are accepted.
        table: Mapping with ``'LengthReal'`` (declared width of the ``code``
            column) and ``'Data'`` (rows of 3 items ``code, text, weight`` or
            4 items ``code, text, weight, stem``; ``stem`` may be ``None``).

    Raises:
        ValueError: If `schema` is not a plain identifier, if there are no
            rows, if the rows differ in length, or if the row length is
            neither 3 nor 4.
    """
    length, rows = table['LengthReal'], table['Data']
    # Identifiers cannot be bound as `?` parameters, so the name is validated
    # explicitly (not with `assert`, which `python -O` would strip).
    if re.fullmatch('[a-zA-Z0-9_]+', schema) is None:
        raise ValueError(f'Invalid table name: {schema!r}')
    if not rows:
        raise ValueError(f'Table {schema} has no rows')
    columns = len(rows[0])
    if any(len(x) != columns for x in rows):
        raise ValueError(f'Table {schema} has rows of differing lengths')

    if columns == 3:
        create = f'create table {schema}(code VARCHAR({length}), text TEXT, weight INT)'
    elif columns == 4:
        # Width of the stem column is the longest stem actually present;
        # `default=0` covers a table in which every stem is None.
        length_stem = max((len(x[3]) for x in rows if x[3] is not None), default=0)
        create = f'create table {schema}(code VARCHAR({length}), text TEXT, weight INT, stem VARCHAR({length_stem}))'
    else:
        raise ValueError(f'Number of columns ({columns}) not supported')
    placeholders = ', '.join('?' * columns)

    con = sqlite3.connect(database)
    try:
        # `with con` only commits or rolls back the transaction; the
        # connection itself has to be closed separately.
        with con:
            con.execute(create)
            con.executemany(f'insert into {schema} values({placeholders})', rows)
    finally:
        con.close()


# Maps the Chinese section/setting names found in the parsed files to the
# English names used by the rest of this script. Names that are not listed
# here are kept as they are.
fcitx_fields_translate = {
    '组词规则': 'Rule',
    '数据': 'Data',
    '提示': 'Prompt',
    '拼音长度': 'PinyinLength',
    '键码': 'KeyCode',
    '拼音': 'Pinyin',
    '码长': 'Length',
    '构词': 'ConstructPhrase',
}


def parse_fcitx_table(table):
    """Parse an fcitx table (or `.conf.in`) text file into a dict.

    Lines before the first ``[section]`` header are ``key=value`` settings
    and are stored as strings. Each ``[section]`` header starts a list:
    rows of the ``Data`` section become ``(code, text)`` tuples split at the
    first space or tab, rows of every other section become ``[key, value]``
    lists split at the first ``=``. Empty lines and lines starting with
    ``;`` are skipped, and byte-order marks are removed. Section and setting
    names are translated through `fcitx_fields_translate`.

    Args:
        table: Path of the file to read. It is decoded as UTF-8.

    Returns:
        dict mapping setting names to strings and section names to lists of
        rows.

    Raises:
        ValueError: If a setting line or a non-``Data`` section row contains
            no ``=``.
    """
    # The files contain non-ASCII text, so do not rely on the locale's
    # default encoding.
    with open(table, 'rt', encoding='utf-8') as f:
        lines = [line.strip('\n') for line in f]
    parsed = dict()
    field_now = ''
    table_now = None
    for line in lines:
        line = line.replace('\ufeff', '')
        if not line or line.startswith(';'):
            continue
        if line.startswith('[') and line.endswith(']'):
            # starting a table
            field_now = line[1:-1]
            field_now = fcitx_fields_translate.get(field_now, field_now)
            table_now = parsed[field_now] = []
            continue
        if not field_now:
            # parsing other settings
            field, sep, value = line.partition('=')
            if not sep:
                raise ValueError(f'{table} has line without "=":\n{line}')
            field = fcitx_fields_translate.get(field, field)
            parsed[field] = value
            continue
        # appending to a table
        if field_now == 'Data':
            # Parse first ' ' or '\t' as splitting point.
            # Assume ' ' and '\t' may be in the text.
            found = [pos for pos in (line.find(' '), line.find('\t')) if pos != -1]
            if not found:
                print(f'Throwing away row with one column:')
                print(repr(line))
                continue
            split = min(found)
            table_now.append((line[:split], line[split+1:]))
        else:
            # Split at the first '=' only, like the top-level settings, so
            # that a value may itself contain '='.
            key, sep, value = line.partition('=')
            if not sep:
                raise ValueError(f'{table} has line without "=":\n{line}')
            table_now.append([key, value])
    return parsed


def clean_fcitx_table(table):
    """Return a cleaned copy of a parsed table, ready for `put_table`.

    The input mapping and its ``'Data'`` list are left unmodified. In the
    returned copy:

    * ``'KeyCode'`` additionally contains the ``'Prompt'`` and ``'Pinyin'``
      characters when they occur in a code of ``'Data'``.
    * If the ``'ConstructPhrase'`` character occurs in a code, rows whose
      code contains it are removed from ``'Data'`` and their code (minus its
      first character) is attached as a third "stem" item to the remaining
      rows with the same text (``None`` when there is no match).
    * Every row gets a weight inserted at index 2: rows sharing a code are
      numbered in descending order, so the first row of a code has the
      highest weight and the last one has weight 1.
    * ``'KeyCodeReal'`` is the set of characters used by the remaining codes
      and ``'LengthReal'`` the length of the longest code (0 if no rows).

    Args:
        table: Mapping with at least ``'Data'`` (rows whose first two items
            are code and text); ``'KeyCode'`` is required when ``'Prompt'``
            or ``'Pinyin'`` applies.

    Returns:
        A new dict with the fields described above.

    Raises:
        ValueError: If two ConstructPhrase rows share the same text, or if
            the ConstructPhrase character is still present in a code after
            its first character has been dropped.
    """
    out = dict(table)
    # dict() copies only the top level; work on a private list so that the
    # caller's rows are never rewritten.
    rows = list(out['Data'])

    # compute actual KeyCode used.
    keycode_real = {ch for x in rows for ch in x[0]}

    # Prompt: just add to word list and KeyCode.
    if 'Prompt' in out and out['Prompt'] in keycode_real:
        out['KeyCode'] += out['Prompt']
    # Pinyin: just add to word list and KeyCode.
    if 'Pinyin' in out and out['Pinyin'] in keycode_real:
        out['KeyCode'] += out['Pinyin']
    # ConstructPhrase: add to "stem" column. (for zhengma_large)
    if 'ConstructPhrase' in out and out['ConstructPhrase'] in keycode_real:
        conchar = out['ConstructPhrase']
        # separate constructing and non-constructing parts of the table
        table_noncon = [x for x in rows if conchar not in x[0]]
        table_con = [(x[0][1:], x[1]) for x in rows if conchar in x[0]]
        # do a join on text
        dict_con = {text: code for code, text in table_con}
        if len(table_con) != len(dict_con):
            raise ValueError('ConstructPhrase entries not unique')
        if any(conchar in code for code in dict_con.values()):
            raise ValueError('ConstructPhrase appearing after starts')
        rows = [(x[0], x[1], dict_con.get(x[1], None)) for x in table_noncon]

    # Weight: just use order. `remaining` counts how many rows of each code
    # are still to come, including the current one.
    remaining = collections.Counter(x[0] for x in rows)
    weighted = []
    for x in rows:
        code = x[0]
        weighted.append(tuple(x[:2]) + (remaining[code],) + tuple(x[2:]))
        remaining[code] -= 1
    out['Data'] = weighted

    # compute KeyCodeReal one more time after trimming table
    out['KeyCodeReal'] = {ch for x in weighted for ch in x[0]}

    # actual seek length
    out['LengthReal'] = max((len(x[0]) for x in weighted), default=0)
    return out


# Loading: parse every requested table together with its sibling `.conf.in`.
tables = dict()
# Sort the glob result so the processing order does not depend on the
# order in which the filesystem lists the files.
file_list = sys.argv[1:] if len(sys.argv) > 1 else sorted(glob.glob('[a-z]*.txt'))
not_txt = [path for path in file_list if not path.endswith('.txt')]
if not_txt:
    raise ValueError(f'Expected only .txt table files, got: {not_txt}')
for path in file_list:
    print(f'Processing {path}...')
    base = path[:-4]
    # '-' and '_' are dropped, so schema names never contain either.
    schema = base.replace('-', '').replace('_', '')
    tables[schema] = parse_fcitx_table(path)
    conf = parse_fcitx_table(base + '.conf.in')
    # Turn each section's list of [key, value] rows into a dict.
    conf = {k: {row[0]: row[1] for row in v} for k, v in conf.items()}
    tables[schema]['.conf.in'] = conf
    tables[schema]['FlorisLocale'] = f"{conf['InputMethod']['LangCode']}_{schema}"

# Fixing: per-table corrections applied before cleaning.
# NOTE: the key must be spelled without '_' because loading strips it; the
# previous check for 'wubi98_pinyin' could never match.
if 'wubi98pinyin' in tables:
    wubi = tables['wubi98pinyin']
    wubi['KeyCode'] += 'z'
    keycode = set(wubi['KeyCode']) | set(wubi.get('Pinyin', ''))
    # Drop every code character that is neither a key code nor the Pinyin
    # character.
    wubi['Data'] = [
        (''.join(ch for ch in row[0] if ch in keycode),) + tuple(row[1:])
        for row in wubi['Data']
    ]
if 'easylarge' in tables:
    tables['easylarge']['KeyCode'] += '|'

# Cleaning
for schema, table in tables.items():
    print(f'Cleaning {schema}, with {len(table["Data"])} items...', end='')
    tables[schema] = clean_fcitx_table(table)
    print(f' Done, with {len(tables[schema]["Data"])} items.')

# Analysis: report use of the special characters and compare the declared
# KeyCode with the characters that actually occur in the codes.
for schema, table in tables.items():
    print(f'Analyzing {schema}... LengthReal = {table["LengthReal"]}')
    specials = ["Prompt", "Pinyin", "ConstructPhrase"]
    for field in specials:
        if field in table:
            has = [row for row in table['Data'] if table[field] in row[0]]
            if has:
                print(f'There are {len(has)}/{len(table["Data"])} with {field}={table[field]}')
    keycode = set(table['KeyCode'])
    keycode_real = set(table['KeyCodeReal'])
    if keycode != keycode_real:
        print('KeyCode mismatch:')
        print('Claimed not used: ' + ''.join(sorted(keycode - keycode_real)))
        print('Exists unclaimed: ' + ''.join(sorted(keycode_real - keycode)))

# Writing: a draft of the language pack metadata, then the database itself.
language_pack = [dict(id=table['FlorisLocale'], hanShapeBasedKeyCode=table['KeyCode']) for table in tables.values()]
with open('./extension-draft.json', 'wt', encoding='utf-8') as f:
    json.dump({'$': 'ime.extension.languagepack', 'items': sorted(language_pack, key=lambda item: item['id'])}, f, indent=2)
database = './han.sqlite3'
# Start from an empty database; put_table creates tables and would fail on
# ones that already exist.
if os.path.exists(database):
    os.remove(database)
for schema, table in tables.items():
    put_table(database, schema, table)
    # put_table(database, table['FlorisLocale'], table)
print({schema: table['KeyCode'] for schema, table in tables.items()})

# Final display: print the ten longest codes of a few well-known tables.
con = sqlite3.connect(database)
try:
    cur = con.cursor()
    # for schema in ['zh_CN_zhengmapinyin', 'zh_CN_zhengmalarge', 'zh_CN_wubilarge', 'zh_CN_wubi98', 'zh_TW_cangjie5', 'zh_HK_stroke5']:
    for schema in ['zhengmapinyin', 'zhengmalarge', 'wubilarge', 'wubi98', 'cangjie5', 'stroke5']:
        if schema not in tables:
            continue
        cur.execute(f'select * from {schema} order by length(code) desc')
        print(cur.fetchmany(10))
finally:
    # Using the connection as a context manager would not close it.
    con.close()
Add Chinese Shape Based Layouts (#2054) * feat(ime/nlp): Add `HanShapeBasedLanguageProvider` * feat: Manually set default NLP to be HanShapeBased * feat: Temporarily disable adding spaces This commit should give insight into how the keyboard adds spaces, this should then be refined into not adding a space after commiting a CJK text suggestion * fix(ime/nlp): Remove empty str suggest in HanShape * feat(ime/nlp): Handle locale variants in HanShape this should facilitate multiple layouts in the zh locale * fix(ime/nlp): Handle query params in HanShape This also helps performance as the DBC doesn't have to compile the query for every string the user writes * Space behavior QoL updates for Han shape-based layout (#1) * Separate space behavior for zh* and latin, and allow space when there is no suggestion. Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me> * Add checking if locale is CJK Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me> * refactor: Change predicate to a getter & rename * chore: Remove TODO `supportsAutoSpace` message * fix: Fix spaces after sugg. in non-space subtypes * fix: Fix auto space predicate in `PhantomSpace` Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me> Co-authored-by: waelwindows <waelwindows9922@gmail.com> * Draft: editor screen exposes nlpProviders and shape-based Chinese input methods as variants Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me> * Fix defaults for zhengma preset Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me> * Add word tables for added input methods Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me> * Fix: bug in zhengma preset * Draft: support composing with special characters by delegating nlpProvider to decide composing range. * Catch SQLite errors such as layout (locale variant) not found (e.g. 
using HanShapeBased with JIS) * fixup: remove TODO * fix: partly addresses 2101, allow searching for locale in English for phones lacking system locale IME * Adds support for importing "language packs" (sqlite3 db for HanShapeBased for now) * Changes language pack to zip files. Adds a basic language pack class for storing metadata of IMEs. Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me> * Implement language pack as a type of Flex extension, and draft its import and view UI Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me> * fix: input method name translation Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me> * Trim down to zhengma, quickclassic, and cangjie for the barebones Chinese shape-based pack. Polish extension user documentation. * Fix hack to allow multiple language pack extensions to co-exist. Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me> * Replace quickclassic with boshiamy * Fix href in LANGUAGEPACKS.md * build(nix): Clean up nix flake * refactor: Encapsulate lanaguage pack query in HSB * feat(ime/nlp): Implement `getListOfWords` in HSB * feat(ime/nlp): Implement `getFrequencyForWord` * chore: Normalize weights for freq in `han.sqlite3` * chore(ime/nlp): Add some logging for HSB * Update app/src/main/assets/ime/keyboard/org.florisboard.localization/extension.json Co-authored-by: Patrick Goldinger <patrick@patrickgold.dev> Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me> Co-authored-by: moonbeamcelery <114041522+moonbeamcelery@users.noreply.github.com> Co-authored-by: moonbeamcelery <moonbeamcelery@proton.me> Co-authored-by: Patrick Goldinger <patrick@patrickgold.dev> 2023-01-15 17:22:10 +01:00			`#!/usr/bin/env python3`

			`# Execute in this folder to convert to sqlite:`
			`# python3 convert_fcitx5_sqlite.py`
			`# Or for a subset of tables only:`
			`# python3 convert_fcitx5_sqlite.py cangjie-large.txt quick-classic.txt wubi-large.txt zhengma.txt`
			`# https://github.com/fcitx/fcitx5-table-extra/tree/master/tables`
			`# The tables are in public domain per their README.`

			`import os`
			`import sys`
			`import re`
			`import json`
			`import glob`
			`import sqlite3`
			`import collections`


			`def put_table(database, schema, table):`
			`length, table = table['LengthReal'], table['Data']`
			`assert re.fullmatch('[a-zA-Z0-9_]+', schema) is not None`
			`columns = len(table[0])`
			`assert all(len(x) == columns for x in table)`
			`with sqlite3.connect(database) as con:`
			`cur = con.cursor()`
			`if columns == 3:`
			`cur.execute(f'create table {schema}(code VARCHAR({length}), text TEXT, weight INT)')`
			`cur.executemany(f'insert into {schema} values(?, ?, ?)', table)`
			`elif columns == 4:`
			`# hard-coded 5-long stem`
			`length_stem = max(len(x[3]) for x in table if x[3] is not None)`
			`cur.execute(f'create table {schema}(code VARCHAR({length}), text TEXT, weight INT, stem VARCHAR({length_stem}))')`
			`cur.executemany(f'insert into {schema} values(?, ?, ?, ?)', table)`
			`else:`
			`raise ValueError(f'Number of columns ({columns}) not supported')`


			`fcitx_fields_translate = {`
			`'组词规则': 'Rule',`
			`'数据': 'Data',`
			`'提示': 'Prompt',`
			`'拼音长度': 'PinyinLength',`
			`'键码': 'KeyCode',`
			`'拼音': 'Pinyin',`
			`'码长': 'Length',`
			`'构词': 'ConstructPhrase',`
			`}`


			`def parse_fcitx_table(table):`
			`with open(table, 'rt') as f:`
			`lines = [line.strip('\n') for line in f.readlines()]`
			`parsed = dict()`
			`field_now = ''`
			`for idx, line in enumerate(lines):`
			`if '\ufeff' in line:`
			`line = line.replace('\ufeff', '')`
			`if not line or line.startswith(';'):`
			`continue`
			`if line.startswith('[') and line.endswith(']'):`
			`# starting a table`
			`field_now = line[1:-1]`
			`field_now = fcitx_fields_translate.get(field_now, field_now)`
			`table_now = parsed[field_now] = []`
			`else:`
			`if field_now:`
			`# appending to a table`
			`if field_now == 'Data':`
			`# Parse first ' ' or '\t' as splitting point.`
			`# Assume ' ' and '\t' may be in the text.`
			`split = len(line)`
			`for x in ' \t':`
			`try:`
			`split = min(split, line.index(x))`
			`except ValueError:`
			`pass`
			`if split == len(line):`
			`print(f'Throwing away row with one column:')`
			`print(repr(line))`
			`line = None`
			`else:`
			`line = (line[:split], line[split+1:])`
			`# elif field_now == 'Rule':`
			`else:`
			`line = line.split('=')`
			`assert len(line) == 2`
			`# else:`
			`# raise ValueError(f'Table field {field_now} not recognized')`
			`if line is not None:`
			`table_now.append(line)`
			`else:`
			`# parsing other settings`
			`assert '=' in line, f'{table} has line without "=":\n{line}'`
			`split = line.index('=')`
			`field = line[:split]`
			`field = fcitx_fields_translate.get(field, field)`
			`parsed[field] = line[split+1:]`
			`return parsed`


			`def clean_fcitx_table(table):`
			`# process Data with special field.`
			`out = dict(table)`

			`# compute actual KeyCode used.`
			`keycode_real = set()`
			`for x in out['Data']:`
			`keycode_real \|= set(x[0])`

			`# Prompt: just add to word list and KeyCode.`
			`if 'Prompt' in out and out['Prompt'] in keycode_real:`
			`out['KeyCode'] += out['Prompt']`
			`# Pinyin: just add to word list and KeyCode.`
			`if 'Pinyin' in out and out['Pinyin'] in keycode_real:`
			`out['KeyCode'] += out['Pinyin']`
			`# ConstructPhrase: add to "stem" column. (for zhengma_large)`
			`if 'ConstructPhrase' in out and out['ConstructPhrase'] in keycode_real:`
			`conchar = out['ConstructPhrase']`
			`# separate constructing and non-constructing parts of the table`
			`table_noncon = [x for x in out['Data'] if conchar not in x[0]]`
			`table_con = [(x[0][1:], x[1]) for x in out['Data'] if conchar in x[0]]`
			`# do a join on text`
			`dict_con = {x[1]: x[0] for x in table_con}`
			`assert len(table_con) == len(dict_con), \`
			`'ConstructPhrase entries not unique'`
			`assert all(not conchar in x for x in dict_con.values()), \`
			`'ConstructPhrase appearing after starts'`
			`out['Data'] = [(x[0], x[1], dict_con.get(x[1], None))`
			`for x in table_noncon]`

			`# Weight: just use order.`
			`counter = collections.Counter(x[0] for x in out['Data'])`
			`for idx, x in enumerate(out['Data']):`
			`weight = counter[x[0]]`
			`counter.subtract((x[0],))`
			`x = x[:2] + (weight,) + x[2:]`
			`out['Data'][idx] = x`
			`assert not len(list(counter.elements()))`

			`# compute KeyCodeReal one more time after trimming table`
			`keycode_real = set()`
			`for x in out['Data']:`
			`keycode_real \|= set(x[0])`
			`out['KeyCodeReal'] = keycode_real`

			`# actual seek length`
			`out['LengthReal'] = max(len(x[0]) for x in out['Data'])`
			`return out`


			`# Loading`
			`tables = dict()`
			`file_list = sys.argv[1:] if len(sys.argv) > 1 else glob.glob('[a-z]*.txt')`
			`assert all(x.endswith('.txt') for x in file_list)`
			`for x in file_list:`
			`print(f'Processing {x}...')`
			`schema = x[:-4].replace('-', '').replace('_', '')`
			`tables[schema] = parse_fcitx_table(x)`
			`conf = parse_fcitx_table(x[:-4] + '.conf.in')`
			`conf = {k: {x[0]: x[1] for x in v} for k, v in conf.items()}`
			`tables[schema]['.conf.in'] = conf`
			`tables[schema]['FlorisLocale'] = f"{conf['InputMethod']['LangCode']}_{schema}"`

			`# Fixing`
			`if 'wubi98_pinyin' in tables:`
			`tables['wubi98pinyin']['KeyCode'] += 'z'`
			`keycode = set(tables['wubi98pinyin']['KeyCode']) \| set(tables['wubi98pinyin']['Pinyin'])`
			`for idx, x in enumerate(tables['wubi98pinyin']['Data']):`
			`if not all(ch in keycode for ch in x[0]):`
			`x = list(x)`
			`x[0] = ''.join(ch for ch in x[0] if ch in keycode)`
			`tables['wubi98pinyin']['Data'][idx] = tuple(x)`
			`if 'easylarge' in tables:`
			`tables['easylarge']['KeyCode'] += '\|'`

			`# Cleaning`
			`for schema, table in tables.items():`
			`print(f'Cleaning {schema}, with {len(table["Data"])} items...', end='')`
			`tables[schema] = clean_fcitx_table(table)`
			`print(f' Done, with {len(tables[schema]["Data"])} items.')`

			`# Analysis`
			`if True:`
			`for schema, table in tables.items():`
			`print(f'Analyzing {schema}... LengthReal = {table["LengthReal"]}')`
			`specials = ["Prompt", "Pinyin", "ConstructPhrase"]`
			`for field in specials:`
			`if field in table:`
			`has = [x for x in table['Data'] if table[field] in x[0]]`
			`if has:`
			`print(f'There are {len(has)}/{len(table["Data"])} with {field}={table[field]}')`
			`keycode = set(table['KeyCode'])`
			`keycode_real = set(table['KeyCodeReal'])`
			`if keycode != keycode_real:`
			`print(f'KeyCode mismatch:')`
			`print(f'Claimed not used: ' + ''.join(sorted(keycode - keycode_real)))`
			`print(f'Exists unclaimed: ' + ''.join(sorted(keycode_real - keycode)))`

			`# Writing`
			`language_pack = [dict(id=table['FlorisLocale'], hanShapeBasedKeyCode=table['KeyCode']) for schema, table in tables.items()]`
			`with open('./extension-draft.json', 'wt') as f:`
			`json.dump({'$': 'ime.extension.languagepack', 'items': sorted(language_pack, key=lambda x: x['id'])}, f, indent=2)`
			`database = './han.sqlite3'`
			`if os.path.exists(database):`
			`os.remove(database)`
			`for schema, table in tables.items():`
			`put_table(database, schema, table)`
			`# put_table(database, table['FlorisLocale'], table)`
			`print({schema: table['KeyCode'] for schema, table in tables.items()})`

			`# Final display`
			`with sqlite3.connect(database) as con:`
			`cur = con.cursor()`
			`# for schema in ['zh_CN_zhengmapinyin', 'zh_CN_zhengmalarge', 'zh_CN_wubilarge', 'zh_CN_wubi98', 'zh_TW_cangjie5', 'zh_HK_stroke5']:`
			`for schema in ['zhengmapinyin', 'zhengmalarge', 'wubilarge', 'wubi98', 'cangjie5', 'stroke5']:`
			`if schema not in tables: continue`
			`cur.execute(f'select * from {schema} order by length(code) desc')`
			`print(cur.fetchmany(10))`