language-data/scripts/export-cldr.py

292 lines
9 KiB
Python
Raw Normal View History

#! /usr/bin/env python3
2023-01-13 09:57:01 +01:00
2023-01-10 09:51:06 +01:00
# Copyright © Michal Čihař <michal@weblate.org>
#
2023-01-13 09:57:01 +01:00
# SPDX-License-Identifier: MIT
2025-02-05 14:40:41 +01:00
from __future__ import annotations
import json
import re
# Locale aliases, written as ``alias: source``.  Once the plural rules have
# been parsed, the script copies the "plurals" and "formula" values of the
# source entry in LANGUAGES onto the alias entry.
# The entries are applied in insertion order, so "pt_BR" receives the data of
# "pt" before "pt" itself is overwritten with the data of "pt_PT".
MAPPINGS = {
    "ar_001": "ar",
    "de_AT": "de",
    "de_CH": "de",
    "en_AU": "en",
    "en_CA": "en",
    "en_GB": "en",
    "en_US": "en",
    "es_419": "es",
    "es_ES": "es",
    "es_MX": "es",
    "fa_AF": "fa",
    "fr_CA": "fr",
    "fr_CH": "fr",
    "hi_Latn": "hi",
    "nl_BE": "nl",
    "pt_BR": "pt",
    "pt": "pt_PT",
    "ro_MD": "ro",
    "sr_ME": "sr",
    "sw_CD": "sw",
    "zh_Hans": "zh",
    "zh_Hant": "zh",
}
# Exact-match rewrites applied by reduce_formula(): when a generated gettext
# expression is equal to a key, the shorter value is used in its place.
# NOTE(review): the pairs look equivalent only for a non-negative integer n
# (e.g. "n != 0 && n != 1" -> "n > 1") - confirm before adding new entries.
SIMPLIFICATIONS = {
    "n >= 0 && n <= 2 && n != 2": "n == 0 || n == 1",
    "n != 0 && n != 1": "n > 1",
    "(n == 0 || n == 1) && n != 0": "n == 1",
}
def map_code(code: str) -> str:
    """Return *code* with every hyphen replaced by an underscore.

    For example ``pt-BR`` becomes ``pt_BR``; a code without hyphens is
    returned unchanged.
    """
    return code.translate({ord("-"): "_"})
def reduce_formula(formula: str) -> str:
    """Return the registered shorter form of *formula*, if there is one.

    The whole string is looked up in ``SIMPLIFICATIONS``; a formula that is
    not listed there is handed back unchanged.
    """
    if formula in SIMPLIFICATIONS:
        return SIMPLIFICATIONS[formula]
    return formula
def expand_chunk(what: str, op: str, value: str) -> str:
    """Expand a single CLDR value (a number or an ``a..b`` range) into gettext.

    Args:
        what: Left-hand operand of the comparison, e.g. ``n`` or ``n % 10``.
        op: Comparison operator; ``==`` selects the positive form, anything
            else is treated as the negated form.
        value: Either a plain integer (``3``) or an inclusive range (``3..7``).

    Returns:
        A gettext expression testing *what* against *value*.

    Raises:
        ValueError: If *value* is neither an integer nor an ``a..b`` range.
    """
    if re.match(r"^\d+$", value):
        return f"{what} {op} {value}"

    bounds = re.match(r"^(\d+)\.\.(\d+)$", value)
    if bounds is None:
        raise ValueError(f"Unhandled range '{value}'")
    start, end = int(bounds.group(1)), int(bounds.group(2))
    # A range of exactly two values is spelled out value by value.
    two_values = (end - start) == 1

    if op == "==":
        if two_values:
            return f"({what} == {start} || {what} == {end})"
        return f"{what} >= {start} && {what} <= {end}"

    if two_values:
        # Being outside the range means differing from BOTH values; the second
        # comparison used to be emitted as "==", which inverted its meaning.
        return f"{what} != {start} && {what} != {end}"
    if what == "n" and start <= 0:
        # Nothing lies below the range (n is presumably never negative), so
        # only the upper bound needs testing.
        return f"{what} > {end}"
    return f"({what} < {start} || {what} > {end})"
def expand_atom(atom: str) -> str:
    """Expand an atom whose right-hand side is a list and/or range of values.

    Args:
        atom: A comparison such as ``n % 10 == 2..4,7`` - operand ``n`` with
            an optional modulo, ``==`` or ``!=``, then comma-separated
            values where each value is handled by ``expand_chunk``.

    Returns:
        The per-value expressions joined with ``||`` (wrapped in parentheses)
        for ``==`` and with ``&&`` for ``!=``; a single value is returned
        without any joining.

    Raises:
        ValueError: If *atom* does not have the expected shape, or if
            ``expand_chunk`` rejects one of the values.
    """
    parsed = re.match(r"^(n(?: % \d+)?) (==|!=) (\d+(?:\.\.\d+|,\d+)+)$", atom)
    if parsed is None:
        raise ValueError(f"Unable to expand '{atom}'")
    # The pattern only admits "==" and "!=", so no further operator check is
    # needed here.
    what, op, values = parsed.groups()
    chunks = [expand_chunk(what, op, value) for value in values.split(",")]
    if len(chunks) == 1:
        return chunks[0]
    if op == "==":
        # Any one of the alternatives may match.
        return f"({' || '.join(chunks)})"
    # Every alternative has to be excluded.
    return " && ".join(chunks)
def convert_atom(atom: str) -> str | bool:
    """Translate one CLDR comparison into gettext, or into a constant.

    The atom is first normalised: `` = `` becomes `` == `` and every ``i``
    character becomes ``n``.

    Returns:
        A gettext expression (string) for comparisons on ``n``, or a boolean
        for comparisons on ``v``, ``w``, ``f``, ``t``, ``c`` and ``e``, which
        are evaluated as if the operand were 0.

    Raises:
        ValueError: If the atom matches none of the supported shapes.
    """
    result = atom.replace(" = ", " == ").replace("i", "n")

    # Plain comparison against one number: already valid gettext.
    if re.match(r"^n( % \d+)? (!=|==) \d+$", result):
        return result
    # Lists / ranges on the right-hand side need expanding.
    if re.match(r"^n( % \d+)? (!=|==) \d+(,\d+|\.\.\d+)+$", result):
        return expand_atom(result)

    # For gettext: v == 0, w == 0 and f, t, c, e are empty, so comparing them
    # with a number gives a constant.  Each entry is (pattern, True when the
    # atom holds for a compared value of 0).
    constant_rules = (
        (r"^(?:v|w)(?: % 10+)? == (\d+)(?:\.\.\d+)?$", True),
        (r"^(?:v|w)(?: % 10+)? != (\d+)(?:\.\.\d+)?$", False),
        (r"^(?:f|t|c|e)(?: % 10+)? == (\d+)(?:\.\.\d+)?$", True),
        (r"^(?:f|t|c|e)(?: % 10+)? != (\d+)(?:\.\.\d+)?$", False),
    )
    for pattern, true_for_zero in constant_rules:
        found = re.match(pattern, result)
        if found:
            compares_with_zero = int(found.group(1)) == 0
            return compares_with_zero if true_for_zero else not compares_with_zero

    raise ValueError(
        f"Unable to convert the formula chunk '{atom}' from CLDR to gettext",
    )
def convert_formula(cldr_formula_and_examples: str) -> str | bool:
    """Convert one CLDR plural category rule into a gettext expression.

    Args:
        cldr_formula_and_examples: The rule text, optionally followed by
            ``@integer`` and ``@decimal`` example sections.

    Returns:
        ``False`` when the rule has no ``@integer`` section or when every
        ``or`` alternative is always false; ``True`` when the rule is blank
        or one alternative is always true; otherwise the gettext expression.

    Raises:
        ValueError: If the text does not have the expected layout, if the
            rule contains parentheses, or if ``convert_atom`` cannot
            translate one of its atoms.
    """
    # Skip formulas which do not trigger integer
    if "@integer" not in cldr_formula_and_examples:
        return False
    # Normalize whitespace
    cldr_formula_and_examples = " ".join(cldr_formula_and_examples.split())
    # Extract formula from examples
    layout = re.match(
        r"^([^@]*)(?:@integer([^@]+))?(?:@decimal(?:[^@]+))?$",
        cldr_formula_and_examples,
    )
    if layout is None:
        raise ValueError(f"Invalid CLDR category rule: {cldr_formula_and_examples}")
    cldr_formula = layout.group(1).strip()
    # Sanity checking
    if "(" in cldr_formula or ")" in cldr_formula:
        raise ValueError(
            f"Unable to convert the formula '{cldr_formula}': parenthesis handling not implemented",
        )
    # Blank formula for other
    if not cldr_formula:
        return True

    chunks = []
    for or_part in cldr_formula.split(" or "):
        and_chunks = []
        for atom in or_part.split(" and "):
            gettext = convert_atom(atom)
            if gettext is False:
                # One atom joined by 'and' always evaluates to false
                # => the whole 'and' group is always false, drop it
                break
            if gettext is not True:
                # Always-true atoms contribute nothing to an 'and' group
                and_chunks.append(gettext)
        else:
            if not and_chunks:
                # All the atoms joined by 'and' always evaluate to true, and
                # one always-true part joined by 'or' makes the whole formula
                # always true
                return True
            chunks.append(reduce_formula(" && ".join(and_chunks)))
    if not chunks:
        # All the parts joined by 'or' always evaluate to false
        # => the whole formula always evaluates to false
        return False
    return " || ".join(chunks)
def reverse_formula(formula: str) -> str:
    """Return the logical negation of a gettext plural expression.

    Only a handful of shapes are supported: a single ``==`` / ``!=``
    comparison on ``n``, ``n == a || n == b``, ``x == a && y != b``, and two
    hard-coded formulas.

    Raises:
        ValueError: If *formula* has none of the supported shapes.
    """
    # Formulas that are negated by hand rather than by pattern.
    hardcoded = {
        "(n == 1 || n == 2 || n == 3) || n % 10 != 4 && n % 10 != 6 && n % 10 != 9": (
            "n != 1 && n != 2 && n != 3 && (n % 10 == 4 || n % 10 == 6 || n % 10 == 9)"
        ),
        "(n == 0 || n == 1) || n >= 11 && n <= 99": "n >= 2 && (n < 11 || n > 99)",
    }

    # Single comparison: flip the operator.
    if re.match(r"^n( % \d+)? == \d+(\.\.\d+|,\d+)*?$", formula):
        return formula.replace(" == ", " != ")
    if re.match(r"^n( % \d+)? != \d+(\.\.\d+|,\d+)*?$", formula):
        return formula.replace(" != ", " == ")

    # "a or b" turns into "not a and not b"; the parentheses are dropped.
    if re.match(r"^\(?n == \d+ \|\| n == \d+\)?$", formula):
        negated = formula.replace(" == ", " != ")
        negated = negated.replace(" || ", " && ")
        return negated.strip("()")

    # "x == a and y != b" turns into "x != a or y == b".
    pair = re.match(
        r"^(n(?: % \d+)?) == (\d+) && (n(?: % \d+)?) != (\d+)$",
        formula,
    )
    if pair:
        left, left_value, right, right_value = pair.groups()
        return f"{left} != {left_value} || {right} == {right_value}"

    if formula in hardcoded:
        return hardcoded[formula]
    raise ValueError(f"Unable to reverse the formula '{formula}'")
def merge_formulas(formulas: list[str]) -> str:
    """Chain per-category conditions into one nested ternary expression.

    For ``[a, b, c]`` the result is ``(a) ? 0 : ((b) ? 1 : 2)``: each
    condition selects its own index and the last index is the fallback, so
    the last condition itself is never emitted.
    """
    last = len(formulas) - 1
    merged = str(last)
    # Build from the innermost (highest index) ternary outwards.
    for index in reversed(range(last)):
        condition = formulas[index]
        # Wrap the condition unless it is already one parenthesised group.
        if re.match(r"^\([^()]+\)$", condition) is None:
            condition = f"({condition})"
        merged = f"{reduce_formula(condition)} ? {index} : {merged}"
        # Every ternary except the outermost one is parenthesised.
        if index > 0:
            merged = f"({merged})"
    return merged
# Load language names
# The encoding is given explicitly: without it open() falls back to the
# locale's preferred encoding, which is not necessarily UTF-8.
with open(
    "modules/cldr-json/cldr-json/cldr-localenames-full/main/en/languages.json",
    encoding="utf-8",
) as handle:
    data = json.load(handle)

# code (with underscores) -> {"name": English display name}
LANGUAGES = {
    map_code(cldr_code): {"name": name}
    for cldr_code, name in data["main"]["en"]["localeDisplayNames"][
        "languages"
    ].items()
}

# Languages added by hand because the loaded file does not list them.
missing = {
    "guw": "Gun",
    "nah": "Nahuatl",
    "smi": "Sami",
    "lld": "Ladin",
}
for code, name in missing.items():
    # Fail loudly once the loaded data gains the entry, so the manual one
    # can be dropped.
    if code in LANGUAGES:
        raise ValueError(f"{code} is no longer missing!")
    LANGUAGES[code] = {"name": name}

# former Javanese
LANGUAGES["jw"] = LANGUAGES["jv"].copy()
# former Moldavian
LANGUAGES["mo"] = LANGUAGES["ro"].copy()
LANGUAGES["mo"]["name"] = "Moldavian"
# Parse plurals
# The encoding is given explicitly: without it open() falls back to the
# locale's preferred encoding, which is not necessarily UTF-8.
with open(
    "modules/cldr-json/cldr-json/cldr-core/supplemental/plurals.json",
    encoding="utf-8",
) as handle:
    data = json.load(handle)

for cldr_code, categories in data["supplemental"]["plurals-type-cardinal"].items():
    code = map_code(cldr_code)
    if len(categories) == 1:
        # Just one category
        LANGUAGES[code]["plurals"] = 1
        LANGUAGES[code]["formula"] = "0"
        continue
    formulas = [convert_formula(category) for category in categories.values()]
    if len(categories) == 2:  # noqa: PLR2004
        # The gettext formula is the negation of the first category's rule.
        LANGUAGES[code]["plurals"] = 2
        LANGUAGES[code]["formula"] = reduce_formula(reverse_formula(formulas[0]))
    else:
        # Categories whose rule converted to False are dropped before merging.
        cleaned_up_formula = [
            formula for formula in formulas if formula is not False
        ]
        LANGUAGES[code]["plurals"] = len(cleaned_up_formula)
        LANGUAGES[code]["formula"] = merge_formulas(cleaned_up_formula)
# Add aliases: give each alias the plural data of the entry it maps to
for alias, origin in MAPPINGS.items():
    for field in ("plurals", "formula"):
        LANGUAGES[alias][field] = LANGUAGES[origin][field]

# Remove the languages for which we don't have plurals
# (the codes are collected first so the dict is not changed while iterating)
without_plurals = [
    language_code
    for language_code, details in LANGUAGES.items()
    if "plurals" not in details
]
for code in without_plurals:
    del LANGUAGES[code]

# Remove languages we do not want
del LANGUAGES["und"]  # Unknown language
# Dump as CSV
# The encoding is given explicitly so the file is written as UTF-8 whatever
# the locale's preferred encoding is.
# NOTE(review): fields are written unquoted, so a value containing a comma
# would yield a malformed row - confirm no name or formula can contain one.
with open("cldr.csv", "w", encoding="utf-8") as handle:
    handle.write("code,name,nplurals,formula\n")
    for code in sorted(LANGUAGES):
        data = LANGUAGES[code]
        handle.write(
            f"{code},{data['name']},{data['plurals']},{data['formula']}\n",
        )