feat: automate text direction extraction from CLDR

This fills in the data for all CLDR languages, making issues like #1477
less likely to happen.
Michal Čihař 2024-09-30 11:22:59 +02:00
parent 826f086770
commit af29b70621
5 changed files with 146 additions and 4 deletions
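
For context, here is a minimal sketch of how the generated RTL data might be consumed downstream. The weblate_language_data.rtl module and its RTL_LANGS set are introduced by the diff below; the fallback to the base language code is only an illustrative assumption, not something this commit implements.

# Minimal consumer sketch (not part of this commit); the fallback to the
# base language code is an assumption added for illustration.
from weblate_language_data.rtl import RTL_LANGS


def text_direction(code: str) -> str:
    """Return "rtl" or "ltr" for a language code such as "ar_EG"."""
    if code in RTL_LANGS:
        return "rtl"
    # Assumed fallback: strip the country part and retry with the base code.
    base = code.split("_")[0]
    return "rtl" if base in RTL_LANGS else "ltr"


print(text_direction("fa_AF"))  # rtl
print(text_direction("cs"))  # ltr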

Makefile

@@ -4,7 +4,7 @@

all: weblate_language_data/languages.py weblate_language_data/plural_tags.py PLURALS_DIFF.md $(wildcard weblate_language_data/locale/*/LC_MESSAGES/django.po) $(filter-out $(patsubst modules/cldr-json/cldr-json/cldr-localenames-full/main/%/languages.json,languages-po/%.po,$(wildcard modules/cldr-json/cldr-json/cldr-localenames-full/main/*/languages.json)),languages-po/en.po)

weblate_language_data/languages.py: languages.csv aliases.csv cldr.csv extraplurals.csv default_countries.csv population.csv qt.csv $(wildcard modules/iso-codes/data/iso_*.json) scripts/generate-language-data
weblate_language_data/languages.py: languages.csv aliases.csv cldr.csv extraplurals.csv default_countries.csv population.csv qt.csv rtl.csv $(wildcard modules/iso-codes/data/iso_*.json) scripts/generate-language-data
./scripts/generate-language-data

PLURALS_DIFF.md: languages.csv cldr.csv gettext.csv l10n-guide.csv translate.csv scripts/list-diff
@@ -14,6 +14,9 @@ PLURALS_DIFF.md: languages.csv cldr.csv gettext.csv l10n-guide.csv translate.csv scripts/list-diff
cldr.csv: modules/cldr-json/cldr-json/cldr-core/supplemental/plurals.json modules/cldr-json/cldr-json/cldr-localenames-full/main/en/languages.json scripts/export-cldr
./scripts/export-cldr

rtl.csv: modules/cldr-json/cldr-json/cldr-misc-full/main/*/layout.json scripts/export-cldr-orientation languages.csv
./scripts/export-cldr-orientation

qt.csv: modules/qttools/src/linguist/shared/numerus.cpp scripts/export-qt languages.csv
./scripts/export-qt


rtl.csv (new file, 53 lines)

@@ -0,0 +1,53 @@
code
ae
aii
ajp
apc
ar
ar_BH
ar_DZ
ar_EG
ar_KW
ar_LY
ar_MA
ar_SA
ar_YE
ara
arc
ave
bal
bgn
bqi
ckb
ckb_IR
dv
egy
fa
fa_AF
fas
ha
he
heb
khw
ks
lrc
luz
ms_Arab
mzn
nqo
pal
per
phn
ps
rhg
sam
sd
sdh
skr
syc
syr
ug
ur
ur_IN
urd
yi

scripts/export-cldr-orientation (new executable file, 45 lines)

@@ -0,0 +1,45 @@
#! /usr/bin/env python3

# Copyright © Michal Čihař <michal@weblate.org>
#
# SPDX-License-Identifier: MIT

import csv
import json
from pathlib import Path

# Read the list of known language codes
with open("languages.csv") as csvfile:
    reader = csv.reader(csvfile, delimiter=",")
    next(reader)
    LANGUAGES = list(reader)
LANGUAGE_CODES = {lang[0] for lang in LANGUAGES}

# Read the existing RTL codes
with open("rtl.csv") as csvfile:
    reader = csv.reader(csvfile, delimiter=",")
    next(reader)
    RTLS = list(reader)
RTL_CODES = {lang[0] for lang in RTLS}

LAYOUTDIR = Path("modules/cldr-json/cldr-json/cldr-misc-full/main/")

# Collect right-to-left languages from the CLDR layout data
for layout_file in LAYOUTDIR.glob("*/layout.json"):
    json_text = layout_file.read_text()
    data = json.loads(json_text)
    for key, value in data["main"].items():
        code = key.replace("-", "_")
        if code not in LANGUAGE_CODES:
            continue
        character_order = value["layout"]["orientation"]["characterOrder"]
        if character_order == "right-to-left":
            RTL_CODES.add(code)
        elif character_order != "left-to-right":
            print(f"Unknown order for {code}: {character_order}")

print(RTL_CODES)

# Write the sorted list of RTL codes back to rtl.csv
with open("rtl.csv", "w") as handle:
    handle.write("code\n")
    for code in sorted(RTL_CODES):
        handle.write(f"{code}\n")
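
For reference, an illustrative, hand-trimmed approximation of the layout.json structure the exporter above reads (only the keys it actually accesses are shown; real files in cldr-misc-full carry additional metadata):

# Hand-written approximation of a CLDR layout.json payload, trimmed to the
# keys the exporter accesses; real files include further metadata.
import json

SAMPLE = json.loads(
    """
    {
      "main": {
        "ar": {
          "layout": {
            "orientation": {
              "characterOrder": "right-to-left"
            }
          }
        }
      }
    }
    """
)

for key, value in SAMPLE["main"].items():
    code = key.replace("-", "_")
    order = value["layout"]["orientation"]["characterOrder"]
    print(code, order)  # prints: ar right-to-left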

scripts/generate-language-data

@@ -132,12 +132,19 @@ with open("cldr.csv") as csvfile:
if existing != number:
CLDRPLURALS.append((code, LANGUAGE_NAMES[code], number, equation))

# Read extra plurals
# Read default countries
with open("default_countries.csv") as csvfile:
reader = csv.reader(csvfile, delimiter=",")
next(reader)
DEFAULT_COUNTRIES = list(reader)

# Read RTL
with open("rtl.csv") as csvfile:
reader = csv.reader(csvfile, delimiter=",")
next(reader)
RTLS = list(reader)
RTL_CODES = {lang[0] for lang in RTLS}

# Write language definitions
with open("weblate_language_data/languages.py", "w") as output:
output.write(HEADER)
@@ -195,6 +202,13 @@ with open("weblate_language_data/countries.py", "w") as output:
for row in DEFAULT_COUNTRIES:
output.write(" '{}',\n".format(*row))
output.write(")\n")
with open("weblate_language_data/rtl.py", "w") as output:
output.write(HEADER)
output.write("# List of RTL languages\n")
output.write("RTL_LANGS = {\n")
for code in sorted(RTL_CODES):
output.write(f' "{code}",\n')
output.write("}\n")

# Generate same check blacklist
words = set()
@@ -322,6 +336,7 @@ subprocess.run(
"pre-commit",
"run",
"--files",
"weblate_language_data/rtl.py",
"weblate_language_data/countries.py",
"weblate_language_data/aliases.py",
"weblate_language_data/plurals.py",

weblate_language_data/rtl.py

@@ -2,21 +2,43 @@
#
# SPDX-License-Identifier: MIT

"""
Language data definitions.

This is an automatically generated file, see scripts/generate-language-data

Do not edit, please adjust language definitions in following repository:
https://github.com/WeblateOrg/language-data
"""
# pylint: disable=line-too-long,too-many-lines

# List of RTL languages
RTL_LANGS = {
"ae",
"aii",
"ajp",
"apc",
"ar",
"ar_BH",
"ar_DZ",
"ar_EG",
"ar_KW",
"ar_LY",
"ar_MA",
"ar_SA",
"ar_YE",
"ara",
"arc",
"ae",
"aii",
"ave",
"bal",
"bgn",
"bqi",
"ckb",
"ckb_IR",
"dv",
"egy",
"fa",
"fa_AF",
"fas",
"ha",
"he",
@@ -26,18 +48,22 @@ RTL_LANGS = {
"lrc",
"luz",
"ms_Arab",
"mzn",
"nqo",
"pal",
"per",
"phn",
"ps",
"rhg",
"sam",
"sd",
"sdh",
"skr",
"syc",
"syr",
"ug",
"ur",
"ur_IN",
"urd",
"yi",
}