language-data/scripts/export-cldr.py

#! /usr/bin/env python3

# Copyright © Michal Čihař <michal@weblate.org>
#
# SPDX-License-Identifier: MIT

from __future__ import annotations

import json
import re

# Alias table consumed by the "Add aliases" step near the end of this script:
# each key receives the "plurals" and "formula" values of the language named
# by its value.
#
# Entries are applied in insertion order, which matters for Portuguese:
# "pt_BR" copies the data parsed for "pt" *before* the "pt" entry overwrites
# "pt" with the data of "pt_PT".
MAPPINGS = {
    "ar_001": "ar",
    "de_AT": "de",
    "de_CH": "de",
    "en_AU": "en",
    "en_CA": "en",
    "en_GB": "en",
    "en_US": "en",
    "es_419": "es",
    "es_ES": "es",
    "es_MX": "es",
    "fa_AF": "fa",
    "fr_CA": "fr",
    "fr_CH": "fr",
    "hi_Latn": "hi",
    "nl_BE": "nl",
    "pt_BR": "pt",
    "pt": "pt_PT",
    "ro_MD": "ro",
    "sr_ME": "sr",
    "sw_CD": "sw",
    "zh_Hans": "zh",
    "zh_Hant": "zh",
}

# Exact-match rewrites used by reduce_formula(): a generated expression that
# is equal to a key is replaced by the shorter value. Each pair is equivalent
# for non-negative integer n.
SIMPLIFICATIONS = {
    "n >= 0 && n <= 2 && n != 2": "n == 0 || n == 1",
    "n != 0 && n != 1": "n > 1",
    "(n == 0 || n == 1) && n != 0": "n == 1",
}


def map_code(code: str) -> str:
    """Return *code* with every hyphen turned into an underscore."""
    return "_".join(code.split("-"))


def reduce_formula(formula: str) -> str:
    """Return the registered shorter form of *formula*, if there is one.

    The lookup in ``SIMPLIFICATIONS`` is an exact string match; a formula
    without an entry is handed back unchanged.
    """
    if formula in SIMPLIFICATIONS:
        return SIMPLIFICATIONS[formula]
    return formula


def expand_chunk(what: str, op: str, value: str) -> str:
    """Translate one CLDR value or inclusive range into a gettext comparison.

    Args:
        what: Left-hand operand, e.g. ``n`` or ``n % 10``.
        op: Comparison operator, either ``==`` or ``!=``.
        value: A single integer (``3``) or an inclusive range (``2..4``).

    Returns:
        The equivalent C-like expression.

    Raises:
        ValueError: If *value* is neither an integer nor a range.
    """
    if re.match(r"^\d+$", value):
        return f"{what} {op} {value}"
    if match := re.match(r"^(\d+)\.\.(\d+)$", value):
        start = int(match.group(1))
        end = int(match.group(2))
        if (end - start) == 1:
            # Two-element range: both members are listed explicitly.
            if op == "==":
                return f"({what} == {start} || {what} == {end})"
            # Being outside the range means differing from *both* members.
            return f"{what} != {start} && {what} != {end}"
        if op == "==":
            return f"{what} >= {start} && {what} <= {end}"
        if what == "n" and start <= 0:
            # The lower-bound test is dropped for a range starting at 0;
            # presumably because the plural operand n is never negative.
            return f"{what} > {end}"
        return f"({what} < {start} || {what} > {end})"
    raise ValueError(f"Unhandled range '{value}'")


def expand_atom(atom: str) -> str:
    """Expand an atom whose right-hand side is a value list and/or range.

    The atom must look like ``n == 1,3..5`` or ``n % 10 != 2..4``. Each
    comma-separated value is expanded by ``expand_chunk``; the pieces are
    OR-ed together (in parentheses) for ``==`` and AND-ed for ``!=``.

    Raises:
        ValueError: If *atom* does not have the expected shape.
    """
    parsed = re.match(r"^(n(?: % \d+)?) (==|!=) (\d+(?:\.\.\d+|,\d+)+)$", atom)
    if parsed is None:
        raise ValueError(f"Unable to expand '{atom}'")

    what, op, values = parsed.groups()
    if op not in ("==", "!="):
        raise ValueError(f"Unsupported operator {op} in {atom}")

    chunks = [expand_chunk(what, op, value) for value in values.split(",")]

    # A lone chunk needs no joining (and no extra parentheses).
    if len(chunks) == 1:
        return chunks[0]
    if op == "==":
        return f"({' || '.join(chunks)})"
    return " && ".join(chunks)


def convert_atom(atom: str) -> str | bool:
    """Convert a single CLDR comparison into its gettext counterpart.

    Returns:
        A C-like expression on ``n`` when the atom depends on the number, or
        a constant ``True``/``False`` when the atom tests one of the operands
        ``v``, ``w``, ``f``, ``t``, ``c`` or ``e``.

    Raises:
        ValueError: If the atom has a shape that is not handled here.
    """
    # CLDR writes equality as "="; every "i" is rewritten to "n".
    normalized = atom.replace(" = ", " == ").replace("i", "n")

    # Comparison against a single number: already valid as is.
    if re.match(r"^n( % \d+)? (!=|==) \d+$", normalized):
        return normalized

    # Comparison against a list of numbers and/or ranges.
    if re.match(r"^n( % \d+)? (!=|==) \d+(,\d+|\.\.\d+)+$", normalized):
        return expand_atom(normalized)

    # For gettext the operands v, w, f, t, c and e are all treated as 0, so
    # the comparison is a constant. Only the first number of a range is
    # looked at.
    constant = re.match(
        r"^(?:v|w|f|t|c|e)(?: % 10+)? (==|!=) (\d+)(?:\.\.\d+)?$",
        normalized,
    )
    if constant is not None:
        op, first = constant.groups()
        is_zero = int(first) == 0
        return is_zero if op == "==" else not is_zero

    raise ValueError(
        f"Unable to convert the formula chunk '{atom}' from CLDR to gettext",
    )


def convert_formula(cldr_formula_and_examples: str) -> str | bool:
    """Convert one CLDR plural category rule into a gettext expression.

    Args:
        cldr_formula_and_examples: The rule text, i.e. the condition
            optionally followed by ``@integer`` / ``@decimal`` samples.

    Returns:
        A C-like expression string, ``True`` when the rule holds for every
        integer, or ``False`` when it can never hold for an integer.

    Raises:
        ValueError: If the rule cannot be parsed, uses parentheses, or
            contains an atom that cannot be converted.
    """
    # Skip formulas which do not trigger for integers
    if "@integer" not in cldr_formula_and_examples:
        return False

    # Normalize whitespace
    cldr_formula_and_examples = " ".join(cldr_formula_and_examples.split())

    # Extract formula from examples
    match = re.match(
        r"^([^@]*)(?:@integer([^@]+))?(?:@decimal(?:[^@]+))?$",
        cldr_formula_and_examples,
    )
    if not match:
        raise ValueError(f"Invalid CLDR category rule: {cldr_formula_and_examples}")
    cldr_formula = match.group(1).strip()

    # Sanity checking
    if "(" in cldr_formula or ")" in cldr_formula:
        raise ValueError(
            f"Unable to convert the formula '{cldr_formula}': parenthesis handling not implemented",
        )

    # Blank formula for other
    if not cldr_formula:
        return True

    chunks = []

    for chunk in cldr_formula.split(" or "):
        and_chunks = []
        for atom in chunk.split(" and "):
            gettext = convert_atom(atom)
            if gettext is False:
                # One atom joined by 'and' always evaluates to false
                # => the whole 'and' group is always false, so drop it
                break
            if gettext is not True:
                and_chunks.append(gettext)
        else:
            # Reached only when no atom of the group was constantly false
            if not and_chunks:
                # All the atoms joined by 'and' always evaluate to true
                # => the group is always true, and since the groups are
                # joined by 'or', so is the whole formula
                return True

            chunks.append(reduce_formula(" && ".join(and_chunks)))

    if not chunks:
        # All the parts joined by 'or' always evaluate to false
        # => the whole formula always evaluates to false
        return False

    return " || ".join(chunks)


def reverse_formula(formula: str) -> str:
    """Return the logical negation of a gettext plural *formula*.

    Only a fixed set of shapes is understood: a single comparison, a pair of
    ``n == X`` alternatives, an ``==``/``!=`` conjunction, and two literal
    formulas that are negated by table lookup.

    Raises:
        ValueError: If *formula* has none of the supported shapes.
    """
    # Single comparison (optionally against a list/range): flip the operator.
    for pattern, old, new in (
        (r"^n( % \d+)? == \d+(\.\.\d+|,\d+)*?$", " == ", " != "),
        (r"^n( % \d+)? != \d+(\.\.\d+|,\d+)*?$", " != ", " == "),
    ):
        if re.match(pattern, formula):
            return formula.replace(old, new)

    # "n == a || n == b" (with or without parentheses), negated by De Morgan.
    if re.match(r"^\(?n == \d+ \|\| n == \d+\)?$", formula):
        negated = formula.replace(" == ", " != ")
        return negated.replace(" || ", " && ").strip("()")

    # "x == a && y != b" becomes "x != a || y == b".
    conjunction = re.match(
        r"^(n(?: % \d+)?) == (\d+) && (n(?: % \d+)?) != (\d+)$",
        formula,
    )
    if conjunction is not None:
        left, left_value, right, right_value = conjunction.groups()
        return f"{left} != {left_value} || {right} == {right_value}"

    # Formulas too irregular for the patterns above, negated by hand.
    known = {
        "(n == 1 || n == 2 || n == 3) || n % 10 != 4 && n % 10 != 6 && n % 10 != 9": (
            "n != 1 && n != 2 && n != 3 && (n % 10 == 4 || n % 10 == 6 || n % 10 == 9)"
        ),
        "(n == 0 || n == 1) || n >= 11 && n <= 99": "n >= 2 && (n < 11 || n > 99)",
    }
    if formula in known:
        return known[formula]

    raise ValueError(f"Unable to reverse the formula '{formula}'")


def merge_formulas(formulas: list[str]) -> str:
    """Chain per-category formulas into one nested ternary expression.

    The result has the shape ``(f0) ? 0 : ((f1) ? 1 : ... : N)`` where ``N``
    is the index of the last entry; the last formula itself is not used, its
    index simply serves as the fallback value.
    """
    last_index = len(formulas) - 1
    merged = str(last_index)

    # Build from the innermost (highest index) condition outwards.
    for index in reversed(range(last_index)):
        condition = formulas[index]

        # Wrap the condition unless it already is one parenthesised group.
        if re.match(r"^\([^()]+\)$", condition) is None:
            condition = f"({condition})"
        merged = f"{reduce_formula(condition)} ? {index} : {merged}"

        # Every nested ternary is parenthesised; the outermost one is not.
        if index > 0:
            merged = f"({merged})"

    return merged


# Load the English display names of the languages.
# The encoding is given explicitly so that the JSON file is decoded the same
# way regardless of the platform's default locale.
with open(
    "modules/cldr-json/cldr-json/cldr-localenames-full/main/en/languages.json",
    encoding="utf-8",
) as handle:
    data = json.load(handle)
    LANGUAGES = {
        map_code(cldr_code): {"name": name}
        for cldr_code, name in data["main"]["en"]["localeDisplayNames"][
            "languages"
        ].items()
    }

# Names supplied by hand for codes absent from the loaded file
missing = {
    "guw": "Gun",
    "nah": "Nahuatl",
    "smi": "Sami",
    "lld": "Ladin",
}

for code, name in missing.items():
    # Fail loudly once the loaded data starts providing the name itself, so
    # that the manual entry gets removed.
    if code in LANGUAGES:
        raise ValueError(f"{code} is no longer missing!")
    LANGUAGES[code] = {"name": name}

# former Javanese
LANGUAGES["jw"] = LANGUAGES["jv"].copy()
# former Moldavian
LANGUAGES["mo"] = LANGUAGES["ro"].copy()
LANGUAGES["mo"]["name"] = "Moldavian"

# Parse plurals: attach the number of plural forms and the gettext formula to
# every language listed in the cardinal plural rules.
# The encoding is given explicitly so that the JSON file is decoded the same
# way regardless of the platform's default locale.
with open(
    "modules/cldr-json/cldr-json/cldr-core/supplemental/plurals.json",
    encoding="utf-8",
) as handle:
    data = json.load(handle)
    for cldr_code, categories in data["supplemental"]["plurals-type-cardinal"].items():
        code = map_code(cldr_code)
        if len(categories) == 1:
            # Just one category
            LANGUAGES[code]["plurals"] = 1
            LANGUAGES[code]["formula"] = "0"
            continue
        formulas = [convert_formula(category) for category in categories.values()]
        if len(categories) == 2:  # noqa: PLR2004
            # Two categories: the expression is the negation of the rule of
            # the first category.
            LANGUAGES[code]["plurals"] = 2
            LANGUAGES[code]["formula"] = reduce_formula(reverse_formula(formulas[0]))
        else:
            # Categories that can never apply to an integer are dropped
            # before the remaining ones are chained together.
            cleaned_up_formula = [
                formula for formula in formulas if formula is not False
            ]
            LANGUAGES[code]["plurals"] = len(cleaned_up_formula)
            LANGUAGES[code]["formula"] = merge_formulas(cleaned_up_formula)

# Add aliases: every key of MAPPINGS takes over the plural data of the
# language named by its value. Entries are processed in the order in which
# they are listed in MAPPINGS, and both languages must already be present in
# LANGUAGES (a KeyError is raised otherwise).
for new, old in MAPPINGS.items():
    for key in ("plurals", "formula"):
        LANGUAGES[new][key] = LANGUAGES[old][key]

# Remove the languages for which we don't have plurals
# (sorted() returns a new list, so deleting while looping is safe)
for code in sorted(LANGUAGES.keys()):
    if "plurals" not in LANGUAGES[code]:
        del LANGUAGES[code]

# Remove languages we do not want
del LANGUAGES["und"]  # Unknown language

# Dump as CSV, one language per line and ordered by code.
# The encoding is given explicitly because the names may contain non-ASCII
# characters and the default encoding depends on the platform.
# Fields are written verbatim, without any CSV quoting.
with open("cldr.csv", "w", encoding="utf-8") as handle:
    handle.write("code,name,nplurals,formula\n")
    for code in sorted(LANGUAGES):
        data = LANGUAGES[code]
        handle.write(
            f"{code},{data['name']},{data['plurals']},{data['formula']}\n",
        )