weblate/scripts/generate-specialchars.py
Michal Čihař 4cc1e17f42 chore: rename Python scripts to have py extension
This makes it easier to apply linting to them.
2025-07-18 12:33:16 +02:00

424 lines
7.5 KiB
Python
Executable file
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# Copyright © Michal Čihař <michal@weblate.org>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""
Parser for CLDR proposal (exported as CSV).
http://cldr.unicode.org/development/development-process/design-proposals/delimiter-quotation-mark-proposal
"""
import csv
import pprint
import sys
from weblate_language_data.aliases import ALIASES
from weblate_language_data.languages import LANGUAGES
# Historical values used as base data.
# Base table of opening alternate quote characters, keyed by language code
# (plus an "ALL" entry). The CSV import further down overwrites entries with
# the first part of each row's `single` column.
# NOTE(review): many values are empty strings in the reviewed text; this may
# be an artifact of rendering that dropped non-ASCII quote characters --
# verify against the repository before relying on them.
ALT_OPEN = {
"ja": "",
"zh": "",
"ar": "",
"fi": "",
"fo": "", # codespell:ignore fo
"lag": "",
"rn": "",
"se": "",
"sn": "",
"sv": "",
"ur": "",
"eo": "",
"vo": "",
"ALL": "",
"agq": "",
"bs": "",
"cs": "",
"de": "",
"dsb": "",
"et": "",
"ff": "",
"hr": "",
"hsb": "",
"is": "",
"ksh": "",
"lb": "",
"luy": "",
"mk": "",
"sk": "",
"sl": "",
"ast": "",
"bm": "",
"ca": "",
"cy": "",
"dyo": "",
"es": "",
"ewo": "",
"fur": "",
"ia": "",
"it": "",
"kab": "",
"mg": "",
"mua": "",
"nnh": "",
"nr": "",
"nso": "",
"pt": "",
"sg": "",
"sq": "",
"ss": "",
"ti": "",
"tn": "",
"ts": "",
"ve": "",
"bas": "",
"bg": "",
"ky": "",
"lt": "",
"os": "",
"ru": "",
"shi": "",
"uk": "",
"zgh": "",
"el": '"',
"eu": '"',
"uz": "'",
"yi": "'",
"hy": "«",
"ka": "«",
"nmg": "«",
"pl": "«",
"ro": "«",
"yav": "«",
"he": "׳",
"am": "",
"az": "",
"be": "",
"br": "",
"fa": "",
"fr": "",
"gsw": "",
"jgo": "",
"kkj": "",
"rm": "",
"wae": "",
"hu": "»",
"kl": "",
"ug": "",
}
# Base table of closing alternate quote characters, keyed by language code
# (plus an "ALL" entry). The CSV import further down overwrites entries with
# the second part of each row's `single` column.
# NOTE(review): many values are empty strings in the reviewed text; this may
# be an artifact of rendering that dropped non-ASCII quote characters --
# verify against the repository before relying on them.
ALT_CLOSE = {
"ja": "",
"zh": "",
"eo": "",
"vo": "",
"ALL": "",
"ar": "",
"bs": "",
"cs": "",
"de": "",
"dsb": "",
"et": "",
"hr": "",
"hsb": "",
"is": "",
"ksh": "",
"lb": "",
"luy": "",
"mk": "",
"sk": "",
"sl": "",
"sr": "",
"ur": "",
"ast": "",
"bm": "",
"ca": "",
"cy": "",
"dyo": "",
"es": "",
"ewo": "",
"fur": "",
"ia": "",
"it": "",
"kab": "",
"mg": "",
"mua": "",
"nnh": "",
"nr": "",
"nso": "",
"pt": "",
"sg": "",
"shi": "",
"sq": "",
"ss": "",
"ti": "",
"tn": "",
"ts": "",
"ve": "",
"zgh": "",
"bas": "",
"bg": "",
"ky": "",
"lt": "",
"os": "",
"ru": "",
"uk": "",
"el": '"',
"eu": '"',
"uz": "'",
"yi": "'",
"hu": "«",
"he": "׳",
"kl": "",
"ug": "",
"hy": "»",
"ka": "»",
"nmg": "»",
"pl": "»",
"ro": "»",
"yav": "»",
"am": "",
"az": "",
"be": "",
"br": "",
"fa": "",
"fr": "",
"gsw": "",
"jgo": "",
"kkj": "",
"rm": "",
"wae": "",
}
# Base table of opening main quote characters, keyed by language code (plus
# an "ALL" entry). The CSV import further down overwrites entries with the
# first part of each row's `double` column.
# NOTE(review): many values are empty strings in the reviewed text; this may
# be an artifact of rendering that dropped non-ASCII quote characters --
# verify against the repository before relying on them.
MAIN_OPEN = {
"eu": '"',
"uz": '"',
"yi": '"',
"ja": "",
"zh": "",
"cy": "",
"fur": "",
"ia": "",
"nr": "",
"nso": "",
"ss": "",
"ti": "",
"tn": "",
"ts": "",
"ve": "",
"am": "«",
"ast": "«",
"az": "«",
"bas": "«",
"be": "«",
"bm": "«",
"br": "«",
"ca": "«",
"dua": "«",
"dyo": "«",
"el": "«",
"es": "«",
"ewo": "«",
"fa": "«",
"fr": "«",
"gsw": "«",
"hy": "«",
"it": "«",
"jgo": "«",
"kab": "«",
"kkj": "«",
"ksf": "«",
"ky": "«",
"mg": "«",
"mua": "«",
"nb": "«",
"nn": "«",
"nnh": "«",
"os": "«",
"pt": "«",
"rm": "«",
"ru": "«",
"rw": "«",
"sg": "«",
"shi": "«",
"sq": "«",
"uk": "«",
"wae": "«",
"yav": "«",
"zgh": "«",
"he": "״",
"ar": "",
"fi": "",
"fo": "", # codespell:ignore fo
"lag": "",
"rn": "",
"se": "",
"sn": "",
"sv": "",
"ur": "",
"eo": "",
"vo": "",
"ALL": "",
"kl": "»",
"ug": "»",
"agq": "",
"bg": "",
"bs": "",
"cs": "",
"de": "",
"dsb": "",
"et": "",
"ff": "",
"hr": "",
"hsb": "",
"hu": "",
"is": "",
"ka": "",
"ksh": "",
"lb": "",
"lt": "",
"luy": "",
"mk": "",
"nmg": "",
"pl": "",
"sk": "",
"sl": "",
"sr": "",
}
# Base table of closing main quote characters, keyed by language code (plus
# an "ALL" entry). The CSV import further down overwrites entries with the
# second part of each row's `double` column.
# NOTE(review): many values are empty strings in the reviewed text; this may
# be an artifact of rendering that dropped non-ASCII quote characters --
# verify against the repository before relying on them.
MAIN_CLOSE = {
"eu": '"',
"kk": '"',
"uz": '"',
"yi": '"',
"he": "״",
"cy": "",
"fur": "",
"ia": "",
"nr": "",
"nso": "",
"ss": "",
"ti": "",
"tn": "",
"ts": "",
"ve": "",
"ja": "",
"zh": "",
"kl": "«",
"ug": "«",
"eo": "",
"vo": "",
"ALL": "",
"ar": "",
"bg": "",
"bs": "",
"cs": "",
"de": "",
"dsb": "",
"et": "",
"hr": "",
"hsb": "",
"is": "",
"ka": "",
"ksh": "",
"lb": "",
"lt": "",
"luy": "",
"mk": "",
"sk": "",
"sl": "",
"sr": "",
"ur": "",
"am": "»",
"ast": "»",
"az": "»",
"bas": "»",
"be": "»",
"bm": "»",
"br": "»",
"ca": "»",
"dua": "»",
"dyo": "»",
"el": "»",
"es": "»",
"ewo": "»",
"fa": "»",
"fr": "»",
"gsw": "»",
"hy": "»",
"it": "»",
"jgo": "»",
"kab": "»",
"kkj": "»",
"ksf": "»",
"ky": "»",
"mg": "»",
"mua": "»",
"nb": "»",
"nn": "»",
"nnh": "»",
"os": "»",
"pt": "»",
"rm": "»",
"ru": "»",
"rw": "»",
"sg": "»",
"shi": "»",
"sq": "»",
"uk": "»",
"wae": "»",
"yav": "»",
"zgh": "»",
}
# Set of known language codes: the first item of every LANGUAGES entry.
CODES = {lang[0] for lang in LANGUAGES}

# Parse CSV passed on the command line.
#
# The file is opened with an explicit UTF-8 encoding (the quote samples are
# non-ASCII, so the locale default must not be relied upon; assumes the export
# is UTF-8) and with newline="" as the csv module requires for correct
# handling of quoted fields that contain line breaks.
with open(sys.argv[1], encoding="utf-8", newline="") as handle:
    # Every row has to consist of exactly these nine columns, otherwise the
    # unpacking raises ValueError. Only the double/single quote samples and
    # the locale list are used; underscore-prefixed columns are ignored.
    for (
        pos,
        _change,
        double,
        single,
        locales,
        _style,
        _alt,
        _old_double,
        _old_single,
    ) in csv.reader(handle):
        # Skip the header row.
        if pos == "No.":
            continue
        # A single row applies to several locales separated by semicolons.
        for locale in locales.split(";"):
            locale = locale.strip()
            if not locale:
                continue
            # The code is the text inside the last pair of parentheses; an
            # entry without parentheses is used as the code unchanged.
            code = locale.split("(")[-1].split(")")[0]
            if code in ALIASES:
                code = ALIASES[code]
            if code not in CODES:
                sys.stderr.write(f"Skipping not known {code}\n")
                continue
            # Each sample is "<open><placeholder><close>". The separator
            # literal was empty in the reviewed text, and str.split("") always
            # raises "ValueError: empty separator". U+2026 (horizontal
            # ellipsis) is restored here as the placeholder and written as an
            # escape so it cannot be silently dropped again.
            # NOTE(review): confirm U+2026 against the exported CSV.
            ALT_OPEN[code], ALT_CLOSE[code] = single.split("\u2026")
            MAIN_OPEN[code], MAIN_CLOSE[code] = double.split("\u2026")
# Manual overrides, applied after the CSV import so they take precedence.
# Hebrew, see https://github.com/WeblateOrg/weblate/issues/4772
# NOTE(review): all four override values are empty strings in the reviewed
# text; confirm the intended characters were not lost in rendering.
ALT_OPEN["he"], ALT_CLOSE["he"] = "", ""
MAIN_OPEN["he"], MAIN_CLOSE["he"] = "", ""

# Write every table to standard output as "NAME = " followed by the
# pretty-printed dictionary.
for prefix, table in (
    ("ALT_OPEN = ", ALT_OPEN),
    ("ALT_CLOSE = ", ALT_CLOSE),
    ("MAIN_OPEN = ", MAIN_OPEN),
    ("MAIN_CLOSE = ", MAIN_CLOSE),
):
    sys.stdout.write(prefix)
    pprint.pprint(table)