weblate/scripts/generate-specialchars
Michal Čihař 8cb9c923ce Misc: Use python3 in shebang
Just in case there is somebody not using virtualenv and still having
Python 2 as default.

See #4982
2020-12-07 08:58:33 +01:00

419 lines
7.4 KiB
Python
Executable file
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Parser for CLDR proposal (exported as CSV).
http://cldr.unicode.org/development/development-process/design-proposals/delimiter-quotation-mark-proposal
"""
import csv
import pprint
import sys
from weblate_language_data.aliases import ALIASES
from weblate_language_data.languages import LANGUAGES
# Historical values used as base data; rows from the CSV parsed below
# overwrite them per language code.
# Base table keyed by language code (plus the special key "ALL"); judging by
# the name, each value is the opening single quotation mark for that language.
# The CSV loop further down assigns into this dict, replacing or adding entries.
# NOTE(review): many values are empty strings in this copy. The file is flagged
# as containing unusual Unicode characters, so they may have been lost in
# transit - confirm against the upstream script before relying on them.
SINGLE_OPEN = {
    "ja": "",
    "zh": "",
    "ar": "",
    "fi": "",
    "fo": "",
    "lag": "",
    "rn": "",
    "se": "",
    "sn": "",
    "sv": "",
    "ur": "",
    "eo": "",
    "vo": "",
    "ALL": "",
    "agq": "",
    "bs": "",
    "cs": "",
    "de": "",
    "dsb": "",
    "et": "",
    "ff": "",
    "hr": "",
    "hsb": "",
    "is": "",
    "ksh": "",
    "lb": "",
    "luy": "",
    "mk": "",
    "sk": "",
    "sl": "",
    "ast": "",
    "bm": "",
    "ca": "",
    "cy": "",
    "dyo": "",
    "es": "",
    "ewo": "",
    "fur": "",
    "ia": "",
    "it": "",
    "kab": "",
    "mg": "",
    "mua": "",
    "nnh": "",
    "nr": "",
    "nso": "",
    "pt": "",
    "sg": "",
    "sq": "",
    "ss": "",
    "ti": "",
    "tn": "",
    "ts": "",
    "ve": "",
    "bas": "",
    "bg": "",
    "ky": "",
    "lt": "",
    "os": "",
    "ru": "",
    "shi": "",
    "uk": "",
    "zgh": "",
    "el": '"',
    "eu": '"',
    "uz": "'",
    "yi": "'",
    "hy": "«",
    "ka": "«",
    "nmg": "«",
    "pl": "«",
    "ro": "«",
    "yav": "«",
    "he": "׳",
    "am": "",
    "az": "",
    "be": "",
    "br": "",
    "fa": "",
    "fr": "",
    "gsw": "",
    "jgo": "",
    "kkj": "",
    "rm": "",
    "wae": "",
    "hu": "»",
    "kl": "",
    "ug": "",
}
# Base table keyed by language code (plus the special key "ALL"); judging by
# the name, each value is the closing single quotation mark for that language.
# The CSV loop further down assigns into this dict, replacing or adding entries.
# NOTE(review): many values are empty strings in this copy. The file is flagged
# as containing unusual Unicode characters, so they may have been lost in
# transit - confirm against the upstream script before relying on them.
SINGLE_CLOSE = {
    "ja": "",
    "zh": "",
    "eo": "",
    "vo": "",
    "ALL": "",
    "ar": "",
    "bs": "",
    "cs": "",
    "de": "",
    "dsb": "",
    "et": "",
    "hr": "",
    "hsb": "",
    "is": "",
    "ksh": "",
    "lb": "",
    "luy": "",
    "mk": "",
    "sk": "",
    "sl": "",
    "sr": "",
    "ur": "",
    "ast": "",
    "bm": "",
    "ca": "",
    "cy": "",
    "dyo": "",
    "es": "",
    "ewo": "",
    "fur": "",
    "ia": "",
    "it": "",
    "kab": "",
    "mg": "",
    "mua": "",
    "nnh": "",
    "nr": "",
    "nso": "",
    "pt": "",
    "sg": "",
    "shi": "",
    "sq": "",
    "ss": "",
    "ti": "",
    "tn": "",
    "ts": "",
    "ve": "",
    "zgh": "",
    "bas": "",
    "bg": "",
    "ky": "",
    "lt": "",
    "os": "",
    "ru": "",
    "uk": "",
    "el": '"',
    "eu": '"',
    "uz": "'",
    "yi": "'",
    "hu": "«",
    "he": "׳",
    "kl": "",
    "ug": "",
    "hy": "»",
    "ka": "»",
    "nmg": "»",
    "pl": "»",
    "ro": "»",
    "yav": "»",
    "am": "",
    "az": "",
    "be": "",
    "br": "",
    "fa": "",
    "fr": "",
    "gsw": "",
    "jgo": "",
    "kkj": "",
    "rm": "",
    "wae": "",
}
# Base table keyed by language code (plus the special key "ALL"); judging by
# the name, each value is the opening double quotation mark for that language.
# The CSV loop further down assigns into this dict, replacing or adding entries.
# NOTE(review): many values are empty strings in this copy. The file is flagged
# as containing unusual Unicode characters, so they may have been lost in
# transit - confirm against the upstream script before relying on them.
DOUBLE_OPEN = {
    "eu": '"',
    "uz": '"',
    "yi": '"',
    "ja": "",
    "zh": "",
    "cy": "",
    "fur": "",
    "ia": "",
    "nr": "",
    "nso": "",
    "ss": "",
    "ti": "",
    "tn": "",
    "ts": "",
    "ve": "",
    "am": "«",
    "ast": "«",
    "az": "«",
    "bas": "«",
    "be": "«",
    "bm": "«",
    "br": "«",
    "ca": "«",
    "dua": "«",
    "dyo": "«",
    "el": "«",
    "es": "«",
    "ewo": "«",
    "fa": "«",
    "fr": "«",
    "gsw": "«",
    "hy": "«",
    "it": "«",
    "jgo": "«",
    "kab": "«",
    "kkj": "«",
    "ksf": "«",
    "ky": "«",
    "mg": "«",
    "mua": "«",
    "nb": "«",
    "nn": "«",
    "nnh": "«",
    "os": "«",
    "pt": "«",
    "rm": "«",
    "ru": "«",
    "rw": "«",
    "sg": "«",
    "shi": "«",
    "sq": "«",
    "uk": "«",
    "wae": "«",
    "yav": "«",
    "zgh": "«",
    "he": "״",
    "ar": "",
    "fi": "",
    "fo": "",
    "lag": "",
    "rn": "",
    "se": "",
    "sn": "",
    "sv": "",
    "ur": "",
    "eo": "",
    "vo": "",
    "ALL": "",
    "kl": "»",
    "ug": "»",
    "agq": "",
    "bg": "",
    "bs": "",
    "cs": "",
    "de": "",
    "dsb": "",
    "et": "",
    "ff": "",
    "hr": "",
    "hsb": "",
    "hu": "",
    "is": "",
    "ka": "",
    "ksh": "",
    "lb": "",
    "lt": "",
    "luy": "",
    "mk": "",
    "nmg": "",
    "pl": "",
    "sk": "",
    "sl": "",
    "sr": "",
}
# Base table keyed by language code (plus the special key "ALL"); judging by
# the name, each value is the closing double quotation mark for that language.
# The CSV loop further down assigns into this dict, replacing or adding entries.
# NOTE(review): many values are empty strings in this copy. The file is flagged
# as containing unusual Unicode characters, so they may have been lost in
# transit - confirm against the upstream script before relying on them.
DOUBLE_CLOSE = {
    "eu": '"',
    "kk": '"',
    "uz": '"',
    "yi": '"',
    "he": "״",
    "cy": "",
    "fur": "",
    "ia": "",
    "nr": "",
    "nso": "",
    "ss": "",
    "ti": "",
    "tn": "",
    "ts": "",
    "ve": "",
    "ja": "",
    "zh": "",
    "kl": "«",
    "ug": "«",
    "eo": "",
    "vo": "",
    "ALL": "",
    "ar": "",
    "bg": "",
    "bs": "",
    "cs": "",
    "de": "",
    "dsb": "",
    "et": "",
    "hr": "",
    "hsb": "",
    "is": "",
    "ka": "",
    "ksh": "",
    "lb": "",
    "lt": "",
    "luy": "",
    "mk": "",
    "sk": "",
    "sl": "",
    "sr": "",
    "ur": "",
    "am": "»",
    "ast": "»",
    "az": "»",
    "bas": "»",
    "be": "»",
    "bm": "»",
    "br": "»",
    "ca": "»",
    "dua": "»",
    "dyo": "»",
    "el": "»",
    "es": "»",
    "ewo": "»",
    "fa": "»",
    "fr": "»",
    "gsw": "»",
    "hy": "»",
    "it": "»",
    "jgo": "»",
    "kab": "»",
    "kkj": "»",
    "ksf": "»",
    "ky": "»",
    "mg": "»",
    "mua": "»",
    "nb": "»",
    "nn": "»",
    "nnh": "»",
    "os": "»",
    "pt": "»",
    "rm": "»",
    "ru": "»",
    "rw": "»",
    "sg": "»",
    "shi": "»",
    "sq": "»",
    "uk": "»",
    "wae": "»",
    "yav": "»",
    "zgh": "»",
}
# Set of known language codes: element 0 of every LANGUAGES entry.
CODES = {lang[0] for lang in LANGUAGES}

# Character separating the opening mark from the closing mark inside the
# "double" and "single" CSV cells.
# NOTE(review): the separator literal was empty in this copy of the script,
# and str.split("") always raises ValueError. U+2026 (horizontal ellipsis) is
# the presumed original - confirm against the CSV export.
QUOTE_SEPARATOR = "\u2026"

# Parse CSV passed on the command line.
# The encoding is explicit because the cells hold non-ASCII quotation marks;
# newline="" is what the csv module asks for when it is handed a file object.
with open(sys.argv[1], encoding="utf-8", newline="") as handle:
    # Every row must have exactly nine columns, otherwise the unpacking below
    # raises ValueError. Underscore-prefixed columns are not used.
    for (
        pos,
        _change,
        double,
        single,
        locales,
        _style,
        _alt,
        _old_double,
        _old_single,
    ) in csv.reader(handle):
        # Rows whose first column is "No." are skipped (presumably the header).
        if pos == "No.":
            continue
        # One row may list several locales separated by semicolons.
        for locale in locales.split(";"):
            locale = locale.strip()
            if not locale:
                continue
            # Keep the text between the last "(" and the following ")";
            # an entry without parentheses is used as the code unchanged.
            code = locale.split("(")[-1].split(")")[0]
            if code in ALIASES:
                code = ALIASES[code]
            if code not in CODES:
                sys.stderr.write(f"Skipping not known {code}\n")
                continue
            # Each cell must split into exactly two parts (open, close);
            # anything else raises ValueError on unpacking.
            SINGLE_OPEN[code], SINGLE_CLOSE[code] = single.split(QUOTE_SEPARATOR)
            DOUBLE_OPEN[code], DOUBLE_CLOSE[code] = double.split(QUOTE_SEPARATOR)
# Manual overrides follow
# Hebrew, see https://github.com/WeblateOrg/weblate/issues/4772
# These assignments run after the CSV loop, so for "he" they replace both the
# base table values and anything a CSV row may have set.
# NOTE(review): all four values are empty strings in this copy. The file is
# flagged as containing unusual Unicode characters, so the intended quotation
# marks may have been lost in transit - confirm against the linked issue.
SINGLE_OPEN["he"] = ""
SINGLE_CLOSE["he"] = ""
DOUBLE_OPEN["he"] = ""
DOUBLE_CLOSE["he"] = ""
# Emit every table on standard output as "<NAME> = <pretty-printed dict>",
# in the fixed order single-open, single-close, double-open, double-close.
for table_name, table in (
    ("SINGLE_OPEN", SINGLE_OPEN),
    ("SINGLE_CLOSE", SINGLE_CLOSE),
    ("DOUBLE_OPEN", DOUBLE_OPEN),
    ("DOUBLE_CLOSE", DOUBLE_CLOSE),
):
    # write() adds no newline, so the dict starts on the same line as the name.
    sys.stdout.write(f"{table_name} = ")
    pprint.pprint(table)