cldr: use own conversion instead of relying on cldr-to-gettext-plural-rules

- cldr-to-gettext-plural-rules seems unmaintained and hasn't received recent CLDR updates - this removes the extra dependency on PHP for the conversion - uses the same CLDR data as we already use - the code is pretty much ported from cldr-to-gettext-plural-rules to produce the same output with the same input
2025-10-04 15:12:29 +08:00 · 2024-01-29 12:06:44 +01:00 · 2024-01-29 12:06:44 +01:00 · 0dab099cd2
commit 0dab099cd2
parent 00deaac38a
6 changed files with 301 additions and 23 deletions
--- a/.gitmodules
+++ b/.gitmodules
@ -1,6 +1,3 @@
-[submodule "cldr-to-gettext-plural-rules"]
-	path = modules/cldr-to-gettext-plural-rules
-	url = https://github.com/mlocati/cldr-to-gettext-plural-rules.git
 [submodule "modules/gettext"]
 	path = modules/gettext
 	url = https://git.savannah.gnu.org/git/gettext.git
--- a/2
+++ b/2
@ -11,7 +11,7 @@ PLURALS_DIFF.md: languages.csv cldr.csv gettext.csv l10n-guide.csv translate.csv
 	./scripts/list-diff
 	pre-commit run --files PLURALS_DIFF.md || true

-cldr.csv: modules/cldr-to-gettext-plural-rules/bin/export-plural-rules scripts/export-cldr
+cldr.csv: modules/cldr-json/cldr-json/cldr-core/supplemental/plurals.json modules/cldr-json/cldr-json/cldr-localenames-full/main/en/languages.json scripts/export-cldr
 	./scripts/export-cldr

 qt.csv: modules/qttools/src/linguist/shared/numerus.cpp scripts/export-qt languages.csv
--- a/cldr.csv
+++ b/cldr.csv
@ -11,11 +11,12 @@ asa,Asu,2,n != 1
 ast,Asturian,2,n != 1
 az,Azerbaijani,2,n != 1
 bal,Baluchi,2,n != 1
-be,Belarusian,3,(n % 10 == 1 && n % 100 != 11) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : 2)
+be,Belarusian,4,(n % 10 == 1 && n % 100 != 11) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : ((n % 10 == 0 || n % 10 >= 5 && n % 10 <= 9 || n % 100 >= 11 && n % 100 <= 14) ? 2 : 3))
 bem,Bemba,2,n != 1
 bez,Bena,2,n != 1
 bg,Bulgarian,2,n != 1
 bho,Bhojpuri,2,n > 1
+blo,Anii,3,(n == 0) ? 0 : ((n == 1) ? 1 : 2)
 bm,Bambara,1,0
 bn,Bangla,2,n > 1
 bo,Tibetan,1,0
@ -154,7 +155,7 @@ osa,Osage,1,0
 pa,Punjabi,2,n > 1
 pap,Papiamento,2,n != 1
 pcm,Nigerian Pidgin,2,n > 1
-pl,Polish,3,(n == 1) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : 2)
+pl,Polish,4,(n == 1) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : ((n != 1 && (n % 10 == 0 || n % 10 == 1) || n % 10 >= 5 && n % 10 <= 9 || n % 100 >= 12 && n % 100 <= 14) ? 2 : 3))
 prg,Prussian,3,(n % 10 == 0 || n % 100 >= 11 && n % 100 <= 19) ? 0 : ((n % 10 == 1 && n % 100 != 11) ? 1 : 2)
 ps,Pashto,2,n != 1
 pt,Portuguese,3,(n == 0 || n == 1) ? 0 : ((n != 0 && n % 1000000 == 0) ? 1 : 2)
@ -164,7 +165,7 @@ rm,Romansh,2,n != 1
 ro,Romanian,3,(n == 1) ? 0 : ((n == 0 || n != 1 && n % 100 >= 1 && n % 100 <= 19) ? 1 : 2)
 ro_MD,Moldavian,3,(n == 1) ? 0 : ((n == 0 || n != 1 && n % 100 >= 1 && n % 100 <= 19) ? 1 : 2)
 rof,Rombo,2,n != 1
-ru,Russian,3,(n % 10 == 1 && n % 100 != 11) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : 2)
+ru,Russian,4,(n % 10 == 1 && n % 100 != 11) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : ((n % 10 == 0 || n % 10 >= 5 && n % 10 <= 9 || n % 100 >= 11 && n % 100 <= 14) ? 2 : 3))
 rwk,Rwa,2,n != 1
 sah,Yakut,1,0
 saq,Samburu,2,n != 1
@ -215,7 +216,7 @@ tr,Turkish,2,n != 1
 ts,Tsonga,2,n != 1
 tzm,Central Atlas Tamazight,2,n >= 2 && (n < 11 || n > 99)
 ug,Uyghur,2,n != 1
-uk,Ukrainian,3,(n % 10 == 1 && n % 100 != 11) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : 2)
+uk,Ukrainian,4,(n % 10 == 1 && n % 100 != 11) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : ((n % 10 == 0 || n % 10 >= 5 && n % 10 <= 9 || n % 100 >= 11 && n % 100 <= 14) ? 2 : 3))
 ur,Urdu,2,n != 1
 uz,Uzbek,2,n != 1
 ve,Venda,2,n != 1
--- a/modules/cldr-to-gettext-plural-rules
+++ b/modules/cldr-to-gettext-plural-rules
@ -1 +0,0 @@
-Subproject commit 4d61d67fe83a2ad85959fe6133d6d9ba7dddd1ab
--- a/scripts/export-cldr
+++ b/scripts/export-cldr
@ -5,24 +5,269 @@
 # SPDX-License-Identifier: MIT

 import json
-import os
-import subprocess
+import re

-subprocess.check_call(
-    [
-        "php",
-        "./modules/cldr-to-gettext-plural-rules/bin/export-plural-rules",
-        "--reduce=no",
-        "--output=cldr.json",
-        "json",
-    ]
-)
+MAPPINGS = {
+    "ar_001": "ar",
+    "de_AT": "de",
+    "de_CH": "de",
+    "en_AU": "en",
+    "en_CA": "en",
+    "en_GB": "en",
+    "en_US": "en",
+    "es_419": "es",
+    "es_ES": "es",
+    "es_MX": "es",
+    "fa_AF": "fa",
+    "fr_CA": "fr",
+    "fr_CH": "fr",
+    "hi_Latn": "hi",
+    "nl_BE": "nl",
+    "pt_BR": "pt",
+    "ro_MD": "ro",
+    "sr_ME": "sr",
+    "sw_CD": "sw",
+    "zh_Hans": "zh",
+    "zh_Hant": "zh",
+}

-with open("cldr.json", "rb") as handle:
-    LANGUAGES = json.load(handle)
+SIMPLIFICATIONS = {
+    "n >= 0 && n <= 2 && n != 2": "n == 0 || n == 1",
+    "n != 0 && n != 1": "n > 1",
+    "(n == 0 || n == 1) && n != 0": "n == 1",
+}

-os.unlink("cldr.json")

+def map_code(code: str) -> str:
+    return code.replace("-", "_")
+
+
+def reduce_formula(formula: str) -> str:
+    return SIMPLIFICATIONS.get(formula, formula)
+
+
+def expand_chunk(what: str, op: str, value: str) -> str:
+    if re.match("^\d+$", value):
+        return f"{what} {op} {value}"
+    if match := re.match("^(\d+)\.\.(\d+)$", value):
+        start = int(match.group(1))
+        end = int(match.group(2))
+        if (end - start) == 1:
+            if op == "==":
+                return f"({what} == {start} || {what} == {end})"
+            return f"{what} != {start} && {what} == {end}"
+        if op == "==":
+            return f"{what} >= {start} && {what} <= {end}"
+        if what == "n" and start <= 0:
+            return f"{what} > {end}"
+        return f"({what} < {start} || {what} > {end})"
+    raise ValueError(f"Unhandled range '{value}'")
+
+
+def expand_atom(atom: str) -> str:
+    if match := re.match("^(n(?: % \d+)?) (==|!=) (\d+(?:\.\.\d+|,\d+)+)$", atom):
+        what = match.group(1)
+        op = match.group(2)
+        if op not in ("==", "!="):
+            raise ValueError(f"Unsupported operator {op} in {atom}")
+        chunks = []
+        for value in match.group(3).split(","):
+            chunks.append(expand_chunk(what, op, value))
+
+        if len(chunks) == 1:
+            return chunks[0]
+
+        if op == "==":
+            return f"({' || '.join(chunks)})"
+        return " && ".join(chunks)
+    raise ValueError(f"Unable to expand '{atom}'")
+
+
+def convert_atom(atom: str) -> str | bool:
+    result = atom.replace(" = ", " == ").replace("i", "n")
+
+    if re.match("^n( % \d+)? (!=|==) \d+$", result):
+        return result
+
+    if re.match("^n( % \d+)? (!=|==) \d+(,\d+|\.\.\d+)+$", result):
+        return expand_atom(result)
+
+    if match := re.match("^(?:v|w)(?: % 10+)? == (\d+)(?:\.\.\d+)?$", result):
+        # For gettext: v == 0, w == 0
+        return int(match.group(1)) == 0
+    if match := re.match("^(?:v|w)(?: % 10+)? != (\d+)(?:\.\.\d+)?$", result):
+        # For gettext: v == 0, w == 0
+        return int(match.group(1)) != 0
+    if match := re.match("^(?:f|t|c|e)(?: % 10+)? == (\d+)(?:\.\.\d+)?$", result):
+        # For gettext: f == empty, t == empty, c == empty, e == empty
+        return int(match.group(1)) == 0
+    if match := re.match("^(?:f|t|c|e)(?: % 10+)? != (\d+)(?:\.\.\d+)?$", result):
+        # For gettext: f == empty, t == empty, c == empty, e == empty
+        return int(match.group(1)) != 0
+    raise ValueError(
+        f"Unable to convert the formula chunk '{atom}' from CLDR to gettext"
+    )
+
+
+def convert_formula(cldr_formula_and_examples: str) -> str:
+    # Normalize whitespace
+    cldr_formula_and_examples = " ".join(cldr_formula_and_examples.split())
+
+    # Extract formula from examples
+    if not (
+        match := re.match(
+            "^([^@]*)(?:@integer([^@]+))?(?:@decimal(?:[^@]+))?$",
+            cldr_formula_and_examples,
+        )
+    ):
+        raise ValueError(f"Invalid CLDR category rule: {cldr_formula_and_examples}")
+    cldr_formula = match.group(1).strip()
+
+    # Sanity checking
+    if "(" in cldr_formula or ")" in cldr_formula:
+        raise ValueError(
+            f"Unable to convert the formula '{cldr_formula}': parenthesis handling not implemented"
+        )
+
+    # Blank formula for other
+    if not cldr_formula:
+        return True
+
+    chunks = []
+
+    for chunk in cldr_formula.split(" or "):
+        output = None
+        and_chunks = []
+        for atom in chunk.split(" and "):
+            gettext = convert_atom(atom)
+            if gettext is False:
+                # One atom joined by 'and' always evaluates to false => the whole 'and' group is always false
+                output = False
+                break
+            if gettext is not True:
+                and_chunks.append(gettext)
+
+        if output is not False:
+            if not and_chunks:
+                # All the atoms joined by 'and' always evaluate to true => the whole 'and' group is always true
+                # One part of the formula joined with the others by 'or' always evaluates to true => the whole formula always evaluates to true
+                return True
+
+            chunks.append(reduce_formula(" && ".join(and_chunks)))
+
+    if not chunks:
+        # All the parts joined by 'or' always evaluate to false => the whole formula always evaluates to false
+        return False
+
+    return " || ".join(chunks)
+
+
+def reverse_formula(formula: str) -> str:
+    if re.match("^n( % \d+)? == \d+(\.\.\d+|,\d+)*?$", formula):
+        return formula.replace(" == ", " != ")
+    if re.match("^n( % \d+)? != \d+(\.\.\d+|,\d+)*?$", formula):
+        return formula.replace(" != ", " == ")
+    if re.match("^\(?n == \d+ \|\| n == \d+\)?$", formula):
+        return formula.replace(" == ", " != ").replace(" || ", " && ").strip("()")
+
+    if match := re.match(
+        "^(n(?: % \d+)?) == (\d+) && (n(?: % \d+)?) != (\d+)$", formula
+    ):
+        return f"{match.group(1)} != {match.group(2)} || {match.group(3)} == {match.group(4)}"
+
+    if (
+        formula
+        == "(n == 1 || n == 2 || n == 3) || n % 10 != 4 && n % 10 != 6 && n % 10 != 9"
+    ):
+        return (
+            "n != 1 && n != 2 && n != 3 && (n % 10 == 4 || n % 10 == 6 || n % 10 == 9)"
+        )
+    if formula == "(n == 0 || n == 1) || n >= 11 && n <= 99":
+        return "n >= 2 && (n < 11 || n > 99)"
+
+    raise ValueError(f"Unable to reverse the formula '{formula}'")
+
+
+def merge_formulas(formulas: list[str]) -> str:
+    max_n = len(formulas) - 1
+    formula = f"{max_n}"
+    for n in range(max_n - 1, -1, -1):
+        part = formulas[n]
+
+        if not re.match("^\([^()]+\)$", part):
+            part = f"({part})"
+        formula = f"{reduce_formula(part)} ? {n} : {formula}"
+        if n > 0:
+            formula = f"({formula})"
+
+    return formula
+
+
+# Load language names
+with open(
+    "modules/cldr-json/cldr-json/cldr-localenames-full/main/en/languages.json"
+) as handle:
+    data = json.load(handle)
+    LANGUAGES = {
+        map_code(cldr_code): {"name": name}
+        for cldr_code, name in data["main"]["en"]["localeDisplayNames"][
+            "languages"
+        ].items()
+    }
+
+missing = {
+    "guw": "Gun",
+    "nah": "Nahuatl",
+    "smi": "Sami",
+}
+
+for code, name in missing.items():
+    if code in LANGUAGES:
+        raise ValueError(f"{code} is no longer missing!")
+    LANGUAGES[code] = {"name": name}
+
+# former Javanese
+LANGUAGES["jw"] = LANGUAGES["jv"].copy()
+# former Moldavian
+LANGUAGES["mo"] = LANGUAGES["ro"].copy()
+LANGUAGES["mo"]["name"] = "Moldavian"
+
+
+# Parse plurals
+with open("modules/cldr-json/cldr-json/cldr-core/supplemental/plurals.json") as handle:
+    data = json.load(handle)
+    for cldr_code, categories in data["supplemental"]["plurals-type-cardinal"].items():
+        code = map_code(cldr_code)
+        if len(categories) == 1:
+            # Just one category
+            LANGUAGES[code]["plurals"] = 1
+            LANGUAGES[code]["formula"] = "0"
+            continue
+        formulas = [convert_formula(category) for category in categories.values()]
+        if len(categories) == 2:  # noqa: PLR2004
+            LANGUAGES[code]["plurals"] = 2
+            LANGUAGES[code]["formula"] = reduce_formula(reverse_formula(formulas[0]))
+        else:
+            cleaned_up_formula = [
+                formula for formula in formulas if formula is not False
+            ]
+            LANGUAGES[code]["plurals"] = len(cleaned_up_formula)
+            LANGUAGES[code]["formula"] = merge_formulas(cleaned_up_formula)
+
+# Add aliases
+for new, old in MAPPINGS.items():
+    for key in ("plurals", "formula"):
+        LANGUAGES[new][key] = LANGUAGES[old][key]
+
+# Remove the languages for which we don't have plurals
+for code in sorted(LANGUAGES.keys()):
+    if "plurals" not in LANGUAGES[code]:
+        del LANGUAGES[code]
+
+# Remove languages we do not want
+del LANGUAGES["und"]  # Unknown language
+
+# Dump as CSV
 with open("cldr.csv", "w") as handle:
    handle.write("code,name,nplurals,formula\n")
    for code in sorted(LANGUAGES):
--- a/weblate_language_data/plurals.py
+++ b/weblate_language_data/plurals.py
@ -209,6 +209,15 @@ EXTRAPLURALS = (
 )

 CLDRPLURALS = (
+    (
+        "be",
+        # Translators: Language name for ISO code "be". The parenthesis clarifies
+        # variant of the language. It could contain a region, age (Old, Middle, ...)
+        # or other variant.
+        _("Belarusian"),
+        4,
+        "(n % 10 == 1 && n % 100 != 11) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : ((n % 10 == 0 || n % 10 >= 5 && n % 10 <= 9 || n % 100 >= 11 && n % 100 <= 14) ? 2 : 3))",
+    ),
    (
        "ca",
        # Translators: Language name for ISO code "ca". The parenthesis clarifies
@ -299,6 +308,15 @@ CLDRPLURALS = (
        5,
        "(n == 1) ? 0 : ((n == 2) ? 1 : ((n == 0 || n % 100 >= 3 && n % 100 <= 10) ? 2 : ((n % 100 >= 11 && n % 100 <= 19) ? 3 : 4)))",
    ),
+    (
+        "pl",
+        # Translators: Language name for ISO code "pl". The parenthesis clarifies
+        # variant of the language. It could contain a region, age (Old, Middle, ...)
+        # or other variant.
+        _("Polish"),
+        4,
+        "(n == 1) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : ((n != 1 && (n % 10 == 0 || n % 10 == 1) || n % 10 >= 5 && n % 10 <= 9 || n % 100 >= 12 && n % 100 <= 14) ? 2 : 3))",
+    ),
    (
        "pt",
        # Translators: Language name for ISO code "pt". The parenthesis clarifies
@ -326,6 +344,24 @@ CLDRPLURALS = (
        3,
        "(n == 1) ? 0 : ((n != 0 && n % 1000000 == 0) ? 1 : 2)",
    ),
+    (
+        "ru",
+        # Translators: Language name for ISO code "ru". The parenthesis clarifies
+        # variant of the language. It could contain a region, age (Old, Middle, ...)
+        # or other variant.
+        _("Russian"),
+        4,
+        "(n % 10 == 1 && n % 100 != 11) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : ((n % 10 == 0 || n % 10 >= 5 && n % 10 <= 9 || n % 100 >= 11 && n % 100 <= 14) ? 2 : 3))",
+    ),
+    (
+        "uk",
+        # Translators: Language name for ISO code "uk". The parenthesis clarifies
+        # variant of the language. It could contain a region, age (Old, Middle, ...)
+        # or other variant.
+        _("Ukrainian"),
+        4,
+        "(n % 10 == 1 && n % 100 != 11) ? 0 : ((n % 10 >= 2 && n % 10 <= 4 && (n % 100 < 12 || n % 100 > 14)) ? 1 : ((n % 10 == 0 || n % 10 >= 5 && n % 10 <= 9 || n % 100 >= 11 && n % 100 <= 14) ? 2 : 3))",
+    ),
    (
        "vec",
        # Translators: Language name for ISO code "vec". The parenthesis clarifies
				`@ -1 +0,0 @@`
				`Subproject commit 4d61d67fe83a2ad85959fe6133d6d9ba7dddd1ab`