mirror of
https://gh.llkk.cc/https://github.com/WeblateOrg/language-data.git
synced 2025-10-04 15:12:29 +08:00
cldr: use own conversion instead of relying on cldr-to-gettext-plural-rules
- cldr-to-gettext-plural-rules seems unmaintained and hasn't received recent CLDR updates - this removes the extra dependency on PHP for the conversion - uses the same CLDR data as we already use - the code is pretty much ported from cldr-to-gettext-plural-rules to produce the same output with the same input
This commit is contained in:
parent
00deaac38a
commit
0dab099cd2
6 changed files with 301 additions and 23 deletions
3
.gitmodules
vendored
3
.gitmodules
vendored
|
@ -1,6 +1,3 @@
|
|||
[submodule "cldr-to-gettext-plural-rules"]
|
||||
path = modules/cldr-to-gettext-plural-rules
|
||||
url = https://github.com/mlocati/cldr-to-gettext-plural-rules.git
|
||||
[submodule "modules/gettext"]
|
||||
path = modules/gettext
|
||||
url = https://git.savannah.gnu.org/git/gettext.git
|
||||
|
|
2
Makefile
2
Makefile
|
@ -11,7 +11,7 @@ PLURALS_DIFF.md: languages.csv cldr.csv gettext.csv l10n-guide.csv translate.csv
|
|||
./scripts/list-diff
|
||||
pre-commit run --files PLURALS_DIFF.md || true
|
||||
|
||||
cldr.csv: modules/cldr-to-gettext-plural-rules/bin/export-plural-rules scripts/export-cldr
|
||||
cldr.csv: modules/cldr-json/cldr-json/cldr-core/supplemental/plurals.json modules/cldr-json/cldr-json/cldr-localenames-full/main/en/languages.json scripts/export-cldr
|
||||
./scripts/export-cldr
|
||||
|
||||
qt.csv: modules/qttools/src/linguist/shared/numerus.cpp scripts/export-qt languages.csv
|
||||
|
|
9
cldr.csv
9
cldr.csv
|
@ -11,11 +11,12 @@ asa,Asu,2,n != 1
|
|||
ast,Asturian,2,n != 1
|
||||
az,Azerbaijani,2,n != 1
|
||||
bal,Baluchi,2,n != 1
|
||||
be,Belarusian,3,(n % 10 == 1 && n % 100 != 11) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : 2)
|
||||
be,Belarusian,4,(n % 10 == 1 && n % 100 != 11) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : ((n % 10 == 0 || n % 10 >= 5 && n % 10 <= 9 || n % 100 >= 11 && n % 100 <= 14) ? 2 : 3))
|
||||
bem,Bemba,2,n != 1
|
||||
bez,Bena,2,n != 1
|
||||
bg,Bulgarian,2,n != 1
|
||||
bho,Bhojpuri,2,n > 1
|
||||
blo,Anii,3,(n == 0) ? 0 : ((n == 1) ? 1 : 2)
|
||||
bm,Bambara,1,0
|
||||
bn,Bangla,2,n > 1
|
||||
bo,Tibetan,1,0
|
||||
|
@ -154,7 +155,7 @@ osa,Osage,1,0
|
|||
pa,Punjabi,2,n > 1
|
||||
pap,Papiamento,2,n != 1
|
||||
pcm,Nigerian Pidgin,2,n > 1
|
||||
pl,Polish,3,(n == 1) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : 2)
|
||||
pl,Polish,4,(n == 1) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : ((n != 1 && (n % 10 == 0 || n % 10 == 1) || n % 10 >= 5 && n % 10 <= 9 || n % 100 >= 12 && n % 100 <= 14) ? 2 : 3))
|
||||
prg,Prussian,3,(n % 10 == 0 || n % 100 >= 11 && n % 100 <= 19) ? 0 : ((n % 10 == 1 && n % 100 != 11) ? 1 : 2)
|
||||
ps,Pashto,2,n != 1
|
||||
pt,Portuguese,3,(n == 0 || n == 1) ? 0 : ((n != 0 && n % 1000000 == 0) ? 1 : 2)
|
||||
|
@ -164,7 +165,7 @@ rm,Romansh,2,n != 1
|
|||
ro,Romanian,3,(n == 1) ? 0 : ((n == 0 || n != 1 && n % 100 >= 1 && n % 100 <= 19) ? 1 : 2)
|
||||
ro_MD,Moldavian,3,(n == 1) ? 0 : ((n == 0 || n != 1 && n % 100 >= 1 && n % 100 <= 19) ? 1 : 2)
|
||||
rof,Rombo,2,n != 1
|
||||
ru,Russian,3,(n % 10 == 1 && n % 100 != 11) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : 2)
|
||||
ru,Russian,4,(n % 10 == 1 && n % 100 != 11) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : ((n % 10 == 0 || n % 10 >= 5 && n % 10 <= 9 || n % 100 >= 11 && n % 100 <= 14) ? 2 : 3))
|
||||
rwk,Rwa,2,n != 1
|
||||
sah,Yakut,1,0
|
||||
saq,Samburu,2,n != 1
|
||||
|
@ -215,7 +216,7 @@ tr,Turkish,2,n != 1
|
|||
ts,Tsonga,2,n != 1
|
||||
tzm,Central Atlas Tamazight,2,n >= 2 && (n < 11 || n > 99)
|
||||
ug,Uyghur,2,n != 1
|
||||
uk,Ukrainian,3,(n % 10 == 1 && n % 100 != 11) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : 2)
|
||||
uk,Ukrainian,4,(n % 10 == 1 && n % 100 != 11) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : ((n % 10 == 0 || n % 10 >= 5 && n % 10 <= 9 || n % 100 >= 11 && n % 100 <= 14) ? 2 : 3))
|
||||
ur,Urdu,2,n != 1
|
||||
uz,Uzbek,2,n != 1
|
||||
ve,Venda,2,n != 1
|
||||
|
|
|
|
@ -1 +0,0 @@
|
|||
Subproject commit 4d61d67fe83a2ad85959fe6133d6d9ba7dddd1ab
|
|
@ -5,24 +5,269 @@
|
|||
# SPDX-License-Identifier: MIT
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import re
|
||||
|
||||
subprocess.check_call(
|
||||
[
|
||||
"php",
|
||||
"./modules/cldr-to-gettext-plural-rules/bin/export-plural-rules",
|
||||
"--reduce=no",
|
||||
"--output=cldr.json",
|
||||
"json",
|
||||
]
|
||||
)
|
||||
MAPPINGS = {
|
||||
"ar_001": "ar",
|
||||
"de_AT": "de",
|
||||
"de_CH": "de",
|
||||
"en_AU": "en",
|
||||
"en_CA": "en",
|
||||
"en_GB": "en",
|
||||
"en_US": "en",
|
||||
"es_419": "es",
|
||||
"es_ES": "es",
|
||||
"es_MX": "es",
|
||||
"fa_AF": "fa",
|
||||
"fr_CA": "fr",
|
||||
"fr_CH": "fr",
|
||||
"hi_Latn": "hi",
|
||||
"nl_BE": "nl",
|
||||
"pt_BR": "pt",
|
||||
"ro_MD": "ro",
|
||||
"sr_ME": "sr",
|
||||
"sw_CD": "sw",
|
||||
"zh_Hans": "zh",
|
||||
"zh_Hant": "zh",
|
||||
}
|
||||
|
||||
with open("cldr.json", "rb") as handle:
|
||||
LANGUAGES = json.load(handle)
|
||||
SIMPLIFICATIONS = {
|
||||
"n >= 0 && n <= 2 && n != 2": "n == 0 || n == 1",
|
||||
"n != 0 && n != 1": "n > 1",
|
||||
"(n == 0 || n == 1) && n != 0": "n == 1",
|
||||
}
|
||||
|
||||
os.unlink("cldr.json")
|
||||
|
||||
def map_code(code: str) -> str:
    """Return *code* with every hyphen turned into an underscore."""
    return "_".join(code.split("-"))
|
||||
|
||||
|
||||
def reduce_formula(formula: str) -> str:
    """Return the simplified spelling of *formula* when one is registered.

    The formula is looked up verbatim in ``SIMPLIFICATIONS``; anything not
    listed there is handed back unchanged.
    """
    if formula in SIMPLIFICATIONS:
        return SIMPLIFICATIONS[formula]
    return formula
|
||||
|
||||
|
||||
def expand_chunk(what: str, op: str, value: str) -> str:
    """Expand one CLDR value or ``start..end`` range into a gettext comparison.

    Args:
        what: Left-hand operand, e.g. ``"n"`` or ``"n % 10"``.
        op: Comparison operator; anything other than ``"=="`` is treated as
            a negated comparison.
        value: Either a plain integer (``"3"``) or an inclusive range
            (``"2..4"``).

    Returns:
        The gettext expression. Results that contain ``||`` are wrapped in
        parentheses so they can be safely joined with ``&&`` by the caller.

    Raises:
        ValueError: If *value* is neither an integer nor a range.
    """
    if re.match(r"^\d+$", value):
        return f"{what} {op} {value}"

    if match := re.match(r"^(\d+)\.\.(\d+)$", value):
        start = int(match.group(1))
        end = int(match.group(2))

        if (end - start) == 1:
            # A two-value range is spelled out as explicit comparisons.
            if op == "==":
                return f"({what} == {start} || {what} == {end})"
            # Being outside the range means differing from BOTH values; the
            # previous "!= start && == end" form was only true for `end`,
            # i.e. for a value that is inside the range.
            return f"{what} != {start} && {what} != {end}"

        if op == "==":
            return f"{what} >= {start} && {what} <= {end}"
        if what == "n" and start <= 0:
            # With a lower bound of zero (or less) only the upper bound is
            # emitted for a bare "n".
            return f"{what} > {end}"
        return f"({what} < {start} || {what} > {end})"

    raise ValueError(f"Unhandled range '{value}'")
|
||||
|
||||
|
||||
def expand_atom(atom: str) -> str:
    """Expand a comparison against a list of values/ranges into gettext syntax.

    *atom* must look like ``n == 1,2,3`` or ``n % 100 != 11..19`` (operand
    ``n`` with an optional modulo, ``==`` or ``!=``, then at least one
    ``,value`` or ``..end`` continuation). Every comma-separated item is
    expanded by ``expand_chunk``.

    Returns:
        The single expanded chunk when there is only one; otherwise the
        chunks joined with ``||`` inside parentheses for ``==``, or joined
        with ``&&`` for ``!=``.

    Raises:
        ValueError: If *atom* does not have the expected shape.
    """
    match = re.match(r"^(n(?: % \d+)?) (==|!=) (\d+(?:\.\.\d+|,\d+)+)$", atom)
    if match is None:
        raise ValueError(f"Unable to expand '{atom}'")

    # The pattern only admits "==" and "!=", so no further operator check is
    # needed here.
    what, op, values = match.groups()
    chunks = [expand_chunk(what, op, value) for value in values.split(",")]

    if len(chunks) == 1:
        return chunks[0]
    if op == "==":
        return f"({' || '.join(chunks)})"
    return " && ".join(chunks)
|
||||
|
||||
|
||||
def convert_atom(atom: str) -> str | bool:
|
||||
result = atom.replace(" = ", " == ").replace("i", "n")
|
||||
|
||||
if re.match("^n( % \d+)? (!=|==) \d+$", result):
|
||||
return result
|
||||
|
||||
if re.match("^n( % \d+)? (!=|==) \d+(,\d+|\.\.\d+)+$", result):
|
||||
return expand_atom(result)
|
||||
|
||||
if match := re.match("^(?:v|w)(?: % 10+)? == (\d+)(?:\.\.\d+)?$", result):
|
||||
# For gettext: v == 0, w == 0
|
||||
return int(match.group(1)) == 0
|
||||
if match := re.match("^(?:v|w)(?: % 10+)? != (\d+)(?:\.\.\d+)?$", result):
|
||||
# For gettext: v == 0, w == 0
|
||||
return int(match.group(1)) != 0
|
||||
if match := re.match("^(?:f|t|c|e)(?: % 10+)? == (\d+)(?:\.\.\d+)?$", result):
|
||||
# For gettext: f == empty, t == empty, c == empty, e == empty
|
||||
return int(match.group(1)) == 0
|
||||
if match := re.match("^(?:f|t|c|e)(?: % 10+)? != (\d+)(?:\.\.\d+)?$", result):
|
||||
# For gettext: f == empty, t == empty, c == empty, e == empty
|
||||
return int(match.group(1)) != 0
|
||||
raise ValueError(
|
||||
f"Unable to convert the formula chunk '{atom}' from CLDR to gettext"
|
||||
)
|
||||
|
||||
|
||||
def convert_formula(cldr_formula_and_examples: str) -> str:
|
||||
# Normalize whitespace
|
||||
cldr_formula_and_examples = " ".join(cldr_formula_and_examples.split())
|
||||
|
||||
# Extract formula from examples
|
||||
if not (
|
||||
match := re.match(
|
||||
"^([^@]*)(?:@integer([^@]+))?(?:@decimal(?:[^@]+))?$",
|
||||
cldr_formula_and_examples,
|
||||
)
|
||||
):
|
||||
raise ValueError(f"Invalid CLDR category rule: {cldr_formula_and_examples}")
|
||||
cldr_formula = match.group(1).strip()
|
||||
|
||||
# Sanity checkign
|
||||
if "(" in cldr_formula or ")" in cldr_formula:
|
||||
raise ValueError(
|
||||
f"Unable to convert the formula '{cldr_formula}': parenthesis handling not implemented"
|
||||
)
|
||||
|
||||
# Blank formula for other
|
||||
if not cldr_formula:
|
||||
return True
|
||||
|
||||
chunks = []
|
||||
|
||||
for chunk in cldr_formula.split(" or "):
|
||||
output = None
|
||||
and_chunks = []
|
||||
for atom in chunk.split(" and "):
|
||||
gettext = convert_atom(atom)
|
||||
if gettext is False:
|
||||
# One atom joined by 'and' always evaluates to false => the whole 'and' group is always false
|
||||
output = False
|
||||
break
|
||||
if gettext is not True:
|
||||
and_chunks.append(gettext)
|
||||
|
||||
if output is not False:
|
||||
if not and_chunks:
|
||||
# All the atoms joined by 'and' always evaluate to true => the whole 'and' group is always true
|
||||
# One part of the formula joined with the others by 'or' always evaluates to true => the whole formula always evaluates to true
|
||||
return True
|
||||
|
||||
chunks.append(reduce_formula(" && ".join(and_chunks)))
|
||||
|
||||
if not chunks:
|
||||
# All the parts joined by 'or' always evaluate to false => the whole formula always evaluates to false
|
||||
return False
|
||||
|
||||
return " || ".join(chunks)
|
||||
|
||||
|
||||
def reverse_formula(formula: str) -> str:
    """Return the logical negation of a gettext plural expression.

    Only a fixed set of shapes is supported: a single ``==`` / ``!=``
    comparison on ``n`` (optionally with a modulo), a pair of ``n == x``
    comparisons joined by ``||``, an ``a == x && b != y`` pair, and two
    hard-coded formulas that are matched verbatim.

    Raises:
        ValueError: If *formula* is none of the supported shapes.
    """
    # Single comparison: flip the operator.
    if re.match(r"^n( % \d+)? == \d+(\.\.\d+|,\d+)*?$", formula):
        return formula.replace(" == ", " != ")
    if re.match(r"^n( % \d+)? != \d+(\.\.\d+|,\d+)*?$", formula):
        return formula.replace(" != ", " == ")

    # "n == a || n == b" (optionally parenthesized): De Morgan.
    if re.match(r"^\(?n == \d+ \|\| n == \d+\)?$", formula):
        return formula.replace(" == ", " != ").replace(" || ", " && ").strip("()")

    # "a == x && b != y": De Morgan.
    if match := re.match(
        r"^(n(?: % \d+)?) == (\d+) && (n(?: % \d+)?) != (\d+)$", formula
    ):
        return f"{match.group(1)} != {match.group(2)} || {match.group(3)} == {match.group(4)}"

    # Formulas too complex for the patterns above, negated by hand.
    hand_written = {
        "(n == 1 || n == 2 || n == 3) || n % 10 != 4 && n % 10 != 6 && n % 10 != 9": (
            "n != 1 && n != 2 && n != 3 && (n % 10 == 4 || n % 10 == 6 || n % 10 == 9)"
        ),
        "(n == 0 || n == 1) || n >= 11 && n <= 99": "n >= 2 && (n < 11 || n > 99)",
    }
    if formula in hand_written:
        return hand_written[formula]

    raise ValueError(f"Unable to reverse the formula '{formula}'")
|
||||
|
||||
|
||||
def merge_formulas(formulas: list[str]) -> str:
    """Chain per-category formulas into one nested gettext ternary.

    The last entry is never inspected: its index is used as the fallback
    value. The remaining entries are processed from the second-to-last down
    to the first, each becoming ``(<formula>) ? <index> : <rest>``.

    Args:
        formulas: One formula per plural category, in category order.

    Returns:
        The combined expression, e.g. ``(a) ? 0 : ((b) ? 1 : 2)``.
    """
    max_n = len(formulas) - 1
    formula = f"{max_n}"

    for n in range(max_n - 1, -1, -1):
        part = formulas[n]

        # Parenthesize the condition unless it already is exactly one
        # parenthesized group with no nested parentheses.
        if not re.match(r"^\([^()]+\)$", part):
            part = f"({part})"

        formula = f"{reduce_formula(part)} ? {n} : {formula}"

        # Every level except the outermost is wrapped so it can be nested
        # as the "else" branch of the next one.
        if n > 0:
            formula = f"({formula})"

    return formula
|
||||
|
||||
|
||||
# Load language names
|
||||
with open(
|
||||
"modules/cldr-json/cldr-json/cldr-localenames-full/main/en/languages.json"
|
||||
) as handle:
|
||||
data = json.load(handle)
|
||||
LANGUAGES = {
|
||||
map_code(cldr_code): {"name": name}
|
||||
for cldr_code, name in data["main"]["en"]["localeDisplayNames"][
|
||||
"languages"
|
||||
].items()
|
||||
}
|
||||
|
||||
missing = {
|
||||
"guw": "Gun",
|
||||
"nah": "Nahuatl",
|
||||
"smi": "Sami",
|
||||
}
|
||||
|
||||
for code, name in missing.items():
|
||||
if code in LANGUAGES:
|
||||
raise ValueError(f"{code} is no longer missing!")
|
||||
LANGUAGES[code] = {"name": name}
|
||||
|
||||
# former Javanese
|
||||
LANGUAGES["jw"] = LANGUAGES["jv"].copy()
|
||||
# former Moldavian
|
||||
LANGUAGES["mo"] = LANGUAGES["ro"].copy()
|
||||
LANGUAGES["mo"]["name"] = "Moldavian"
|
||||
|
||||
|
||||
# Parse plurals
|
||||
with open("modules/cldr-json/cldr-json/cldr-core/supplemental/plurals.json") as handle:
|
||||
data = json.load(handle)
|
||||
for cldr_code, categories in data["supplemental"]["plurals-type-cardinal"].items():
|
||||
code = map_code(cldr_code)
|
||||
if len(categories) == 1:
|
||||
# Just one category
|
||||
LANGUAGES[code]["plurals"] = 1
|
||||
LANGUAGES[code]["formula"] = "0"
|
||||
continue
|
||||
formulas = [convert_formula(category) for category in categories.values()]
|
||||
if len(categories) == 2: # noqa: PLR2004
|
||||
LANGUAGES[code]["plurals"] = 2
|
||||
LANGUAGES[code]["formula"] = reduce_formula(reverse_formula(formulas[0]))
|
||||
else:
|
||||
cleaned_up_formula = [
|
||||
formula for formula in formulas if formula is not False
|
||||
]
|
||||
LANGUAGES[code]["plurals"] = len(cleaned_up_formula)
|
||||
LANGUAGES[code]["formula"] = merge_formulas(cleaned_up_formula)
|
||||
|
||||
# Add aliases
|
||||
for new, old in MAPPINGS.items():
|
||||
for key in ("plurals", "formula"):
|
||||
LANGUAGES[new][key] = LANGUAGES[old][key]
|
||||
|
||||
# Remove the languages for which we don't have plurals
|
||||
for code in sorted(LANGUAGES.keys()):
|
||||
if "plurals" not in LANGUAGES[code]:
|
||||
del LANGUAGES[code]
|
||||
|
||||
# Remove languages we do not want
|
||||
del LANGUAGES["und"] # Unknown language
|
||||
|
||||
# Dump as CSV
|
||||
with open("cldr.csv", "w") as handle:
|
||||
handle.write("code,name,nplurals,formula\n")
|
||||
for code in sorted(LANGUAGES):
|
||||
|
|
|
@ -209,6 +209,15 @@ EXTRAPLURALS = (
|
|||
)
|
||||
|
||||
CLDRPLURALS = (
|
||||
(
|
||||
"be",
|
||||
# Translators: Language name for ISO code "be". The parenthesis clarifies
|
||||
# variant of the language. It could contain a region, age (Old, Middle, ...)
|
||||
# or other variant.
|
||||
_("Belarusian"),
|
||||
4,
|
||||
"(n % 10 == 1 && n % 100 != 11) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : ((n % 10 == 0 || n % 10 >= 5 && n % 10 <= 9 || n % 100 >= 11 && n % 100 <= 14) ? 2 : 3))",
|
||||
),
|
||||
(
|
||||
"ca",
|
||||
# Translators: Language name for ISO code "ca". The parenthesis clarifies
|
||||
|
@ -299,6 +308,15 @@ CLDRPLURALS = (
|
|||
5,
|
||||
"(n == 1) ? 0 : ((n == 2) ? 1 : ((n == 0 || n % 100 >= 3 && n % 100 <= 10) ? 2 : ((n % 100 >= 11 && n % 100 <= 19) ? 3 : 4)))",
|
||||
),
|
||||
(
|
||||
"pl",
|
||||
# Translators: Language name for ISO code "pl". The parenthesis clarifies
|
||||
# variant of the language. It could contain a region, age (Old, Middle, ...)
|
||||
# or other variant.
|
||||
_("Polish"),
|
||||
4,
|
||||
"(n == 1) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : ((n != 1 && (n % 10 == 0 || n % 10 == 1) || n % 10 >= 5 && n % 10 <= 9 || n % 100 >= 12 && n % 100 <= 14) ? 2 : 3))",
|
||||
),
|
||||
(
|
||||
"pt",
|
||||
# Translators: Language name for ISO code "pt". The parenthesis clarifies
|
||||
|
@ -326,6 +344,24 @@ CLDRPLURALS = (
|
|||
3,
|
||||
"(n == 1) ? 0 : ((n != 0 && n % 1000000 == 0) ? 1 : 2)",
|
||||
),
|
||||
(
|
||||
"ru",
|
||||
# Translators: Language name for ISO code "ru". The parenthesis clarifies
|
||||
# variant of the language. It could contain a region, age (Old, Middle, ...)
|
||||
# or other variant.
|
||||
_("Russian"),
|
||||
4,
|
||||
"(n % 10 == 1 && n % 100 != 11) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : ((n % 10 == 0 || n % 10 >= 5 && n % 10 <= 9 || n % 100 >= 11 && n % 100 <= 14) ? 2 : 3))",
|
||||
),
|
||||
(
|
||||
"uk",
|
||||
# Translators: Language name for ISO code "uk". The parenthesis clarifies
|
||||
# variant of the language. It could contain a region, age (Old, Middle, ...)
|
||||
# or other variant.
|
||||
_("Ukrainian"),
|
||||
4,
|
||||
"(n % 10 == 1 && n % 100 != 11) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : ((n % 10 == 0 || n % 10 >= 5 && n % 10 <= 9 || n % 100 >= 11 && n % 100 <= 14) ? 2 : 3))",
|
||||
),
|
||||
(
|
||||
"vec",
|
||||
# Translators: Language name for ISO code "vec". The parenthesis clarifies
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue