mirror of https://github.com/florisboard/florisboard.git synced 2024-09-19 19:42:20 +02:00
florisboard/utils/convert_fcitx5_sqlite.py
Waelwindows a5dab5fb5a
Add Chinese Shape Based Layouts (#2054)
* feat(ime/nlp): Add `HanShapeBasedLanguageProvider`

* feat: Manually set default NLP to be HanShapeBased

* feat: Temporarily disable adding spaces

This commit should give insight into how the keyboard adds spaces; this
should then be refined so that no space is added after committing a CJK
text suggestion.

* fix(ime/nlp): Remove empty str suggest in HanShape

* feat(ime/nlp): Handle locale variants in HanShape

This should facilitate multiple layouts in the zh locale

* fix(ime/nlp): Handle query params in HanShape

This also helps performance, as the database connection doesn't have to
recompile the query for every string the user writes
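
A minimal Python sqlite3 sketch of the idea (the provider itself is Kotlin,
so the table name and query below are purely illustrative; the database is
the han.sqlite3 produced by utils/convert_fcitx5_sqlite.py):

    import sqlite3

    con = sqlite3.connect('han.sqlite3')
    cur = con.cursor()
    # One parameterized statement; sqlite3 caches the compiled form, so only
    # the bound prefix changes as the user keeps typing.
    query = 'select text from zhengma where code like ? order by weight desc limit 10'
    for prefix in ('a', 'ab', 'abc'):
        cur.execute(query, (prefix + '%',))
        print(cur.fetchall())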

* Space behavior QoL updates for Han shape-based layout (#1)

* Separate space behavior for zh* and Latin, and allow a space when there is no suggestion.

Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>

* Add checking if locale is CJK

Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>

* refactor: Change predicate to a getter & rename

* chore: Remove TODO `supportsAutoSpace` message

* fix: Fix spaces after sugg. in non-space subtypes

* fix: Fix auto space predicate in `PhantomSpace`

Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>
Co-authored-by: waelwindows <waelwindows9922@gmail.com>

* Draft: editor screen exposes nlpProviders and shape-based Chinese input methods as variants

Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>

* Fix defaults for zhengma preset

Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>

* Add word tables for added input methods

Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>

* Fix: bug in zhengma preset

* Draft: support composing with special characters by letting the nlpProvider decide the composing range.

* Catch SQLite errors such as the layout (locale variant) not being found (e.g. using HanShapeBased with JIS)

* fixup: remove TODO

* fix: partly addresses 2101; allow searching for a locale in English on phones lacking the system locale IME

* Adds support for importing "language packs" (sqlite3 db for HanShapeBased for now)

* Changes language pack to zip files. Adds a basic language pack class for storing metadata of IMEs.

Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>

* Implement language pack as a type of Flex extension, and draft its import and view UI

Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>

* fix: input method name translation

Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>

* Trim down to zhengma, quickclassic, and cangjie for the barebones Chinese shape-based pack. Polish extension user documentation.

* Fix hack to allow multiple language pack extensions to co-exist.

Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>

* Replace quickclassic with boshiamy

* Fix href in LANGUAGEPACKS.md

* build(nix): Clean up nix flake

* refactor: Encapsulate language pack query in HSB

* feat(ime/nlp): Implement `getListOfWords` in HSB

* feat(ime/nlp): Implement `getFrequencyForWord`

* chore: Normalize weights for freq in `han.sqlite3`

* chore(ime/nlp): Add some logging for HSB

* Update app/src/main/assets/ime/keyboard/org.florisboard.localization/extension.json

Co-authored-by: Patrick Goldinger <patrick@patrickgold.dev>

Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>
Co-authored-by: moonbeamcelery <114041522+moonbeamcelery@users.noreply.github.com>
Co-authored-by: moonbeamcelery <moonbeamcelery@proton.me>
Co-authored-by: Patrick Goldinger <patrick@patrickgold.dev>
2023-01-15 17:22:10 +01:00


#!/usr/bin/env python3
# Execute in this folder to convert to sqlite:
# python3 convert_fcitx5_sqlite.py
# Or for a subset of tables only:
# python3 convert_fcitx5_sqlite.py cangjie-large.txt quick-classic.txt wubi-large.txt zhengma.txt
# https://github.com/fcitx/fcitx5-table-extra/tree/master/tables
# The tables are in public domain per their README.
import os
import sys
import re
import json
import glob
import sqlite3
import collections

def put_table(database, schema, table):
    length, table = table['LengthReal'], table['Data']
    assert re.fullmatch('[a-zA-Z0-9_]+', schema) is not None
    columns = len(table[0])
    assert all(len(x) == columns for x in table)
    with sqlite3.connect(database) as con:
        cur = con.cursor()
        if columns == 3:
            cur.execute(f'create table {schema}(code VARCHAR({length}), text TEXT, weight INT)')
            cur.executemany(f'insert into {schema} values(?, ?, ?)', table)
        elif columns == 4:
            # stem length derived from the longest stem present in the table
            length_stem = max(len(x[3]) for x in table if x[3] is not None)
            cur.execute(f'create table {schema}(code VARCHAR({length}), text TEXT, weight INT, stem VARCHAR({length_stem}))')
            cur.executemany(f'insert into {schema} values(?, ?, ?, ?)', table)
        else:
            raise ValueError(f'Number of columns ({columns}) not supported')
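
# Example of how a consumer might look up candidates in one of these tables
# (illustrative only; the query actually used by FlorisBoard's
# HanShapeBasedLanguageProvider lives in the Kotlin code, not in this script):
#
#   select text from zhengma where code like 'ab%' order by weight desc limit 8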

fcitx_fields_translate = {
    '组词规则': 'Rule',
    '数据': 'Data',
    '提示': 'Prompt',
    '拼音长度': 'PinyinLength',
    '键码': 'KeyCode',
    '拼音': 'Pinyin',
    '码长': 'Length',
    '构词': 'ConstructPhrase',
}
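
# For reference, a raw fcitx5 table file is plain text: a header of key=value
# fields using the Chinese names above, followed by a [数据] section of
# "code text" rows, e.g. (values illustrative only):
#
#   键码=abcdefghijklmnopqrstuvwxy
#   码长=4
#   [数据]
#   a 工
#   aaaa 工
#
# Lines starting with ';' are comments; data rows are split at the first
# space or tab (see parse_fcitx_table below).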

def parse_fcitx_table(table):
    with open(table, 'rt') as f:
        lines = [line.strip('\n') for line in f.readlines()]
    parsed = dict()
    field_now = ''
    for idx, line in enumerate(lines):
        if '\ufeff' in line:
            line = line.replace('\ufeff', '')
        if not line or line.startswith(';'):
            continue
        if line.startswith('[') and line.endswith(']'):
            # starting a table
            field_now = line[1:-1]
            field_now = fcitx_fields_translate.get(field_now, field_now)
            table_now = parsed[field_now] = []
        else:
            if field_now:
                # appending to a table
                if field_now == 'Data':
                    # Parse first ' ' or '\t' as splitting point.
                    # Assume ' ' and '\t' may be in the text.
                    split = len(line)
                    for x in ' \t':
                        try:
                            split = min(split, line.index(x))
                        except ValueError:
                            pass
                    if split == len(line):
                        print(f'Throwing away row with one column:')
                        print(repr(line))
                        line = None
                    else:
                        line = (line[:split], line[split+1:])
                # elif field_now == 'Rule':
                else:
                    line = line.split('=')
                    assert len(line) == 2
                # else:
                #     raise ValueError(f'Table field {field_now} not recognized')
                if line is not None:
                    table_now.append(line)
            else:
                # parsing other settings
                assert '=' in line, f'{table} has line without "=":\n{line}'
                split = line.index('=')
                field = line[:split]
                field = fcitx_fields_translate.get(field, field)
                parsed[field] = line[split+1:]
    return parsed
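
# For a table whose header declares 键码=abcdefghijklmnopqrstuvwxy and 码长=4
# and whose [数据] rows are 'a 工' and 'aaaa 工', parse_fcitx_table returns
# roughly:
#   {'KeyCode': 'abcdefghijklmnopqrstuvwxy', 'Length': '4',
#    'Data': [('a', '工'), ('aaaa', '工')]}
# i.e. header values stay strings, field names are mapped to English via
# fcitx_fields_translate, and each data row becomes a (code, text) tuple.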

def clean_fcitx_table(table):
    # process Data with special field.
    out = dict(table)
    # compute actual KeyCode used.
    keycode_real = set()
    for x in out['Data']:
        keycode_real |= set(x[0])
    # Prompt: just add to word list and KeyCode.
    if 'Prompt' in out and out['Prompt'] in keycode_real:
        out['KeyCode'] += out['Prompt']
    # Pinyin: just add to word list and KeyCode.
    if 'Pinyin' in out and out['Pinyin'] in keycode_real:
        out['KeyCode'] += out['Pinyin']
    # ConstructPhrase: add to "stem" column. (for zhengma_large)
    if 'ConstructPhrase' in out and out['ConstructPhrase'] in keycode_real:
        conchar = out['ConstructPhrase']
        # separate constructing and non-constructing parts of the table
        table_noncon = [x for x in out['Data'] if conchar not in x[0]]
        table_con = [(x[0][1:], x[1]) for x in out['Data'] if conchar in x[0]]
        # do a join on text
        dict_con = {x[1]: x[0] for x in table_con}
        assert len(table_con) == len(dict_con), \
            'ConstructPhrase entries not unique'
        assert all(conchar not in x for x in dict_con.values()), \
            'ConstructPhrase appearing after starts'
        out['Data'] = [(x[0], x[1], dict_con.get(x[1], None))
                       for x in table_noncon]
    # Weight: just use order.
    counter = collections.Counter(x[0] for x in out['Data'])
    for idx, x in enumerate(out['Data']):
        weight = counter[x[0]]
        counter.subtract((x[0],))
        x = x[:2] + (weight,) + x[2:]
        out['Data'][idx] = x
    assert not len(list(counter.elements()))
    # compute KeyCodeReal one more time after trimming table
    keycode_real = set()
    for x in out['Data']:
        keycode_real |= set(x[0])
    out['KeyCodeReal'] = keycode_real
    # actual seek length
    out['LengthReal'] = max(len(x[0]) for x in out['Data'])
    return out
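
# Weight assignment in clean_fcitx_table, worked through: Counter counts how
# often each code occurs, so if code 'ab' appears three times in file order,
# its rows receive weights 3, 2, 1 in that order; sorting by weight descending
# therefore preserves the original table order for a given code.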

# Loading
tables = dict()
file_list = sys.argv[1:] if len(sys.argv) > 1 else glob.glob('[a-z]*.txt')
assert all(x.endswith('.txt') for x in file_list)
for x in file_list:
    print(f'Processing {x}...')
    schema = x[:-4].replace('-', '').replace('_', '')
    tables[schema] = parse_fcitx_table(x)
    conf = parse_fcitx_table(x[:-4] + '.conf.in')
    conf = {k: {x[0]: x[1] for x in v} for k, v in conf.items()}
    tables[schema]['.conf.in'] = conf
    tables[schema]['FlorisLocale'] = f"{conf['InputMethod']['LangCode']}_{schema}"

# Fixing
if 'wubi98pinyin' in tables:
    tables['wubi98pinyin']['KeyCode'] += 'z'
    keycode = set(tables['wubi98pinyin']['KeyCode']) | set(tables['wubi98pinyin']['Pinyin'])
    for idx, x in enumerate(tables['wubi98pinyin']['Data']):
        if not all(ch in keycode for ch in x[0]):
            x = list(x)
            x[0] = ''.join(ch for ch in x[0] if ch in keycode)
            tables['wubi98pinyin']['Data'][idx] = tuple(x)
if 'easylarge' in tables:
    tables['easylarge']['KeyCode'] += '|'

# Cleaning
for schema, table in tables.items():
    print(f'Cleaning {schema}, with {len(table["Data"])} items...', end='')
    tables[schema] = clean_fcitx_table(table)
    print(f' Done, with {len(tables[schema]["Data"])} items.')

# Analysis
if True:
    for schema, table in tables.items():
        print(f'Analyzing {schema}... LengthReal = {table["LengthReal"]}')
        specials = ["Prompt", "Pinyin", "ConstructPhrase"]
        for field in specials:
            if field in table:
                has = [x for x in table['Data'] if table[field] in x[0]]
                if has:
                    print(f'There are {len(has)}/{len(table["Data"])} with {field}={table[field]}')
        keycode = set(table['KeyCode'])
        keycode_real = set(table['KeyCodeReal'])
        if keycode != keycode_real:
            print(f'KeyCode mismatch:')
            print(f'Claimed not used: ' + ''.join(sorted(keycode - keycode_real)))
            print(f'Exists unclaimed: ' + ''.join(sorted(keycode_real - keycode)))

# Writing
language_pack = [dict(id=table['FlorisLocale'], hanShapeBasedKeyCode=table['KeyCode'])
                 for schema, table in tables.items()]
with open('./extension-draft.json', 'wt') as f:
    json.dump({'$': 'ime.extension.languagepack',
               'items': sorted(language_pack, key=lambda x: x['id'])}, f, indent=2)
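
# The emitted extension-draft.json has roughly this shape (the ids depend on
# which tables were converted; the entry below is illustrative):
#   {
#     "$": "ime.extension.languagepack",
#     "items": [
#       {"id": "zh_CN_zhengma", "hanShapeBasedKeyCode": "..."},
#       ...
#     ]
#   }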

database = './han.sqlite3'
if os.path.exists(database):
    os.remove(database)
for schema, table in tables.items():
    put_table(database, schema, table)
    # put_table(database, table['FlorisLocale'], table)
print({schema: table['KeyCode'] for schema, table in tables.items()})

# Final display
with sqlite3.connect(database) as con:
    cur = con.cursor()
    # for schema in ['zh_CN_zhengmapinyin', 'zh_CN_zhengmalarge', 'zh_CN_wubilarge', 'zh_CN_wubi98', 'zh_TW_cangjie5', 'zh_HK_stroke5']:
    for schema in ['zhengmapinyin', 'zhengmalarge', 'wubilarge', 'wubi98', 'cangjie5', 'stroke5']:
        if schema not in tables:
            continue
        cur.execute(f'select * from {schema} order by length(code) desc')
        print(cur.fetchmany(10))