0
0
mirror of https://github.com/florisboard/florisboard.git synced 2024-09-20 03:52:18 +02:00
florisboard/utils/convert_fcitx5_sqlite.py

220 lines
8.5 KiB
Python
Raw Normal View History

Add Chinese Shape Based Layouts (#2054) * feat(ime/nlp): Add `HanShapeBasedLanguageProvider` * feat: Manually set default NLP to be HanShapeBased * feat: Temporarily disable adding spaces This commit should give insight into how the keyboard adds spaces, this should then be refined into not adding a space after commiting a CJK text suggestion * fix(ime/nlp): Remove empty str suggest in HanShape * feat(ime/nlp): Handle locale variants in HanShape this should facilitate multiple layouts in the zh locale * fix(ime/nlp): Handle query params in HanShape This also helps performance as the DBC doesn't have to compile the query for every string the user writes * Space behavior QoL updates for Han shape-based layout (#1) * Separate space behavior for zh* and latin, and allow space when there is no suggestion. Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me> * Add checking if locale is CJK Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me> * refactor: Change predicate to a getter & rename * chore: Remove TODO `supportsAutoSpace` message * fix: Fix spaces after sugg. in non-space subtypes * fix: Fix auto space predicate in `PhantomSpace` Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me> Co-authored-by: waelwindows <waelwindows9922@gmail.com> * Draft: editor screen exposes nlpProviders and shape-based Chinese input methods as variants Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me> * Fix defaults for zhengma preset Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me> * Add word tables for added input methods Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me> * Fix: bug in zhengma preset * Draft: support composing with special characters by delegating nlpProvider to decide composing range. * Catch SQLite errors such as layout (locale variant) not found (e.g. 
using HanShapeBased with JIS) * fixup: remove TODO * fix: partly addresses 2101, allow searching for locale in English for phones lacking system locale IME * Adds support for importing "language packs" (sqlite3 db for HanShapeBased for now) * Changes language pack to zip files. Adds a basic language pack class for storing metadata of IMEs. Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me> * Implement language pack as a type of Flex extension, and draft its import and view UI Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me> * fix: input method name translation Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me> * Trim down to zhengma, quickclassic, and cangjie for the barebones Chinese shape-based pack. Polish extension user documentation. * Fix hack to allow multiple language pack extensions to co-exist. Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me> * Replace quickclassic with boshiamy * Fix href in LANGUAGEPACKS.md * build(nix): Clean up nix flake * refactor: Encapsulate lanaguage pack query in HSB * feat(ime/nlp): Implement `getListOfWords` in HSB * feat(ime/nlp): Implement `getFrequencyForWord` * chore: Normalize weights for freq in `han.sqlite3` * chore(ime/nlp): Add some logging for HSB * Update app/src/main/assets/ime/keyboard/org.florisboard.localization/extension.json Co-authored-by: Patrick Goldinger <patrick@patrickgold.dev> Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me> Co-authored-by: moonbeamcelery <114041522+moonbeamcelery@users.noreply.github.com> Co-authored-by: moonbeamcelery <moonbeamcelery@proton.me> Co-authored-by: Patrick Goldinger <patrick@patrickgold.dev>
2023-01-15 17:22:10 +01:00
#!/usr/bin/env python3
# Execute in this folder to convert to sqlite:
# python3 convert_fcitx5_sqlite.py
# Or for a subset of tables only:
# python3 convert_fcitx5_sqlite.py cangjie-large.txt quick-classic.txt wubi-large.txt zhengma.txt
# https://github.com/fcitx/fcitx5-table-extra/tree/master/tables
# The tables are in public domain per their README.
import os
import sys
import re
import json
import glob
import sqlite3
import collections
def put_table(database, schema, table):
    """Write one cleaned fcitx table into the sqlite database.

    Creates table `schema` in `database` and inserts every row of
    table['Data'].  Rows must be uniform 3-tuples (code, text, weight)
    or 4-tuples (code, text, weight, stem).

    Raises ValueError for any other column count, and AssertionError if
    `schema` is not a plain identifier (it is interpolated into SQL).
    """
    length, table = table['LengthReal'], table['Data']
    # `schema` is interpolated into SQL below; restrict it to a safe identifier.
    assert re.fullmatch('[a-zA-Z0-9_]+', schema) is not None
    columns = len(table[0])
    assert all(len(x) == columns for x in table)
    con = sqlite3.connect(database)
    try:
        # `with con` scopes a transaction (commit on success, rollback on
        # error); it does NOT close the connection.
        with con:
            cur = con.cursor()
            if columns == 3:
                cur.execute(f'create table {schema}(code VARCHAR({length}), text TEXT, weight INT)')
                cur.executemany(f'insert into {schema} values(?, ?, ?)', table)
            elif columns == 4:
                # Declared stem width is the widest non-NULL stem; default=0
                # keeps an all-NULL stem column from raising on empty max().
                length_stem = max((len(x[3]) for x in table if x[3] is not None), default=0)
                cur.execute(f'create table {schema}(code VARCHAR({length}), text TEXT, weight INT, stem VARCHAR({length_stem}))')
                cur.executemany(f'insert into {schema} values(?, ?, ?, ?)', table)
            else:
                raise ValueError(f'Number of columns ({columns}) not supported')
    finally:
        con.close()  # sqlite3's context manager leaves the connection open
# Maps the Chinese section/field names used in fcitx5 table files to the
# English keys this script uses internally; unknown names pass through as-is.
fcitx_fields_translate = {
    '组词规则': 'Rule',  # phrase-construction rules section
    '数据': 'Data',  # the code/text table body
    '提示': 'Prompt',  # prompt/hint marker character
    '拼音长度': 'PinyinLength',
    '键码': 'KeyCode',  # alphabet of valid code characters
    '拼音': 'Pinyin',  # pinyin marker character
    '码长': 'Length',  # declared maximum code length
    '构词': 'ConstructPhrase',  # word-construction marker character
}
def parse_fcitx_table(table):
    """Parse an fcitx5 table file (or .conf.in file) into a dict.

    Plain `key=value` lines outside any section become string entries.
    A `[section]` header starts a list entry: in the Data section each
    line is split at the first space/tab into a (code, text) tuple
    (one-column rows are reported and dropped); every other section
    keeps its `key=value` lines as [key, value] lists.  Section and
    field names are translated from Chinese via `fcitx_fields_translate`.
    """
    # The fcitx table files are UTF-8; be explicit so the platform default
    # encoding cannot mis-decode them.
    with open(table, 'rt', encoding='utf-8') as f:
        lines = [line.strip('\n') for line in f.readlines()]
    parsed = dict()
    field_now = ''
    for line in lines:
        if '\ufeff' in line:
            line = line.replace('\ufeff', '')  # drop BOM remnants
        if not line or line.startswith(';'):
            continue  # blank or comment line
        if line.startswith('[') and line.endswith(']'):
            # starting a table section
            field_now = line[1:-1]
            field_now = fcitx_fields_translate.get(field_now, field_now)
            table_now = parsed[field_now] = []
        else:
            if field_now:
                # appending to the current section
                if field_now == 'Data':
                    # Parse first ' ' or '\t' as splitting point.
                    # Assume ' ' and '\t' may be in the text.
                    split = len(line)
                    for x in ' \t':
                        try:
                            split = min(split, line.index(x))
                        except ValueError:
                            pass
                    if split == len(line):
                        print(f'Throwing away row with one column:')
                        print(repr(line))
                        line = None
                    else:
                        line = (line[:split], line[split+1:])
                else:
                    # non-Data sections (e.g. Rule): keep as [key, value]
                    line = line.split('=')
                    assert len(line) == 2
                if line is not None:
                    table_now.append(line)
            else:
                # top-level setting before any section header
                assert '=' in line, f'{table} has line without "=":\n{line}'
                split = line.index('=')
                field = line[:split]
                field = fcitx_fields_translate.get(field, field)
                parsed[field] = line[split+1:]
    return parsed
def clean_fcitx_table(table):
    """Post-process a parsed table: resolve special fields, add weights.

    Returns a new dict; the input `table` (including its Data list) is
    left unmodified.  The result gains:
      - a weight column derived from row order,
      - an optional stem column when ConstructPhrase is in use,
      - 'KeyCodeReal': set of characters actually used in codes,
      - 'LengthReal': length of the longest code.
    """
    out = dict(table)
    # Work on a copy so the caller's Data list is never mutated in place.
    out['Data'] = list(table['Data'])
    # compute actual KeyCode used.
    keycode_real = set()
    for x in out['Data']:
        keycode_real |= set(x[0])
    # Prompt: just add to word list and KeyCode.
    if 'Prompt' in out and out['Prompt'] in keycode_real:
        out['KeyCode'] += out['Prompt']
    # Pinyin: just add to word list and KeyCode.
    if 'Pinyin' in out and out['Pinyin'] in keycode_real:
        out['KeyCode'] += out['Pinyin']
    # ConstructPhrase: add to "stem" column. (for zhengma_large)
    if 'ConstructPhrase' in out and out['ConstructPhrase'] in keycode_real:
        conchar = out['ConstructPhrase']
        # separate constructing and non-constructing parts of the table
        table_noncon = [x for x in out['Data'] if conchar not in x[0]]
        table_con = [(x[0][1:], x[1]) for x in out['Data'] if conchar in x[0]]
        # join the stems back onto the main rows by text
        dict_con = {x[1]: x[0] for x in table_con}
        assert len(table_con) == len(dict_con), \
            'ConstructPhrase entries not unique'
        assert all(not conchar in x for x in dict_con.values()), \
            'ConstructPhrase appearing after starts'
        out['Data'] = [(x[0], x[1], dict_con.get(x[1], None))
                       for x in table_noncon]
    # Weight: derive from row order.  Among rows sharing a code, earlier
    # rows receive the larger remaining count, so file order becomes rank.
    counter = collections.Counter(x[0] for x in out['Data'])
    for idx, x in enumerate(out['Data']):
        weight = counter[x[0]]
        counter.subtract((x[0],))
        x = x[:2] + (weight,) + x[2:]
        out['Data'][idx] = x
    assert not len(list(counter.elements()))  # every occurrence consumed
    # compute KeyCodeReal one more time after trimming table
    keycode_real = set()
    for x in out['Data']:
        keycode_real |= set(x[0])
    out['KeyCodeReal'] = keycode_real
    # actual seek length
    out['LengthReal'] = max(len(x[0]) for x in out['Data'])
    return out
# Loading: parse every requested table file plus its .conf.in companion.
tables = dict()
if len(sys.argv) > 1:
    file_list = sys.argv[1:]
else:
    file_list = glob.glob('[a-z]*.txt')
assert all(name.endswith('.txt') for name in file_list)
for name in file_list:
    print(f'Processing {name}...')
    stem = name[:-4]
    # schema name is the file stem with '-' and '_' stripped out
    schema = stem.replace('-', '').replace('_', '')
    table = parse_fcitx_table(name)
    conf = parse_fcitx_table(stem + '.conf.in')
    # flatten each conf section's [key, value] pairs into a dict
    conf = {section: {pair[0]: pair[1] for pair in entries}
            for section, entries in conf.items()}
    table['.conf.in'] = conf
    table['FlorisLocale'] = f"{conf['InputMethod']['LangCode']}_{schema}"
    tables[schema] = table
# Fixing
# NOTE: schema keys have '-' and '_' stripped during loading, so the old
# check for 'wubi98_pinyin' could never match and this fix-up was dead
# code; test the stripped name instead.
if 'wubi98pinyin' in tables:
    # wubi98_pinyin mixes pinyin codes into the table; allow 'z' as well.
    tables['wubi98pinyin']['KeyCode'] += 'z'
    keycode = set(tables['wubi98pinyin']['KeyCode']) | set(tables['wubi98pinyin']['Pinyin'])
    for idx, x in enumerate(tables['wubi98pinyin']['Data']):
        if not all(ch in keycode for ch in x[0]):
            # strip characters outside the declared key codes
            x = list(x)
            x[0] = ''.join(ch for ch in x[0] if ch in keycode)
            tables['wubi98pinyin']['Data'][idx] = tuple(x)
if 'easylarge' in tables:
    tables['easylarge']['KeyCode'] += '|'  # easy-large uses '|' in codes
# Cleaning: run every parsed table through clean_fcitx_table in place.
for schema in list(tables):
    raw = tables[schema]
    print(f'Cleaning {schema}, with {len(raw["Data"])} items...', end='')
    cleaned = clean_fcitx_table(raw)
    tables[schema] = cleaned
    print(f' Done, with {len(cleaned["Data"])} items.')
# Analysis: report special-field usage and KeyCode discrepancies.
# (The pointless `if True:` debug scaffolding has been removed.)
for schema, table in tables.items():
    print(f'Analyzing {schema}... LengthReal = {table["LengthReal"]}')
    # count how many rows still contain each special marker character
    for field in ("Prompt", "Pinyin", "ConstructPhrase"):
        if field in table:
            has = [x for x in table['Data'] if table[field] in x[0]]
            if has:
                print(f'There are {len(has)}/{len(table["Data"])} with {field}={table[field]}')
    # compare the declared key-code alphabet against what the data uses
    keycode = set(table['KeyCode'])
    keycode_real = set(table['KeyCodeReal'])
    if keycode != keycode_real:
        print(f'KeyCode mismatch:')
        print(f'Claimed not used: ' + ''.join(sorted(keycode - keycode_real)))
        print(f'Exists unclaimed: ' + ''.join(sorted(keycode_real - keycode)))
# Writing: emit the language-pack manifest draft and the sqlite database.
language_pack = []
for table in tables.values():
    language_pack.append(dict(id=table['FlorisLocale'], hanShapeBasedKeyCode=table['KeyCode']))
language_pack.sort(key=lambda item: item['id'])
with open('./extension-draft.json', 'wt') as f:
    json.dump({'$': 'ime.extension.languagepack', 'items': language_pack}, f, indent=2)
database = './han.sqlite3'
# start from a fresh database file on every run
if os.path.exists(database):
    os.remove(database)
for schema, table in tables.items():
    put_table(database, schema, table)
    # put_table(database, table['FlorisLocale'], table)
print({schema: table['KeyCode'] for schema, table in tables.items()})
# Final display: sample the longest-code rows from a few known schemas.
con = sqlite3.connect(database)
try:
    cur = con.cursor()
    # for schema in ['zh_CN_zhengmapinyin', 'zh_CN_zhengmalarge', 'zh_CN_wubilarge', 'zh_CN_wubi98', 'zh_TW_cangjie5', 'zh_HK_stroke5']:
    for schema in ['zhengmapinyin', 'zhengmalarge', 'wubilarge', 'wubi98', 'cangjie5', 'stroke5']:
        if schema not in tables: continue
        cur.execute(f'select * from {schema} order by length(code) desc')
        print(cur.fetchmany(10))
finally:
    # `with sqlite3.connect(...)` would only scope a transaction and leave
    # the connection open; close it explicitly.
    con.close()