language-data/scripts/export-macrolanguages.py
Michal Čihař 4d3db52919
Some checks are pending
mypy / mypy (push) Waiting to run
Pre-commit check / pre-commit (push) Waiting to run
Distribution / Build packages (push) Waiting to run
Distribution / Publish release to PyPI (push) Blocked by required conditions
Distribution / Create release on GitHub (push) Blocked by required conditions
Languages / lint (push) Waiting to run
Update generated files / submodule-update (push) Waiting to run
feat: use prek instead of pre-commit
2026-02-03 17:29:25 +01:00

82 lines
2.2 KiB
Python
Executable file

#!/usr/bin/env python3
# Copyright © Michal Čihař <michal@weblate.org>
#
# SPDX-License-Identifier: MIT
"""Helper script to generate Python code with ambiguous."""
import csv
import subprocess
from collections import defaultdict
from io import StringIO
from pprint import pprint
from urllib.request import Request, urlopen
BASE_URL = "https://iso639-3.sil.org/sites/iso639-3/files/downloads/"
URL = f"{BASE_URL}iso-639-3-macrolanguages.tab"
# List of ambiguous codes to include, this is opt-in as we don't want to push
# disambiguation for languages where it doesn't seem to be needed (for example Arabic)
LANGUAGES = {"kur"}
HEADER = '''# Copyright © Michal Čihař <michal@weblate.org>
#
# SPDX-License-Identifier: MIT
"""
Language data definitions.
This is an automatically generated file, see scripts/export-macrolanguages.py
Do not edit, please adjust language definitions in following repository:
https://github.com/WeblateOrg/language-data
"""
# pylint: disable=line-too-long,too-many-lines
'''
request = Request(URL, headers={"User-Agent": "Mozilla/5.0"})
# Read data from URL
with urlopen(request) as handle:
content = handle.read().decode()
# Parse tsv
reader = csv.reader(StringIO(content), delimiter="\t")
macrolanguages = list(reader)[1:]
# Generate dict
result = defaultdict(list)
for macro, lang, _kind in macrolanguages:
if macro not in LANGUAGES:
continue
result[macro].append(lang)
# Read aliases
with open("aliases.csv") as csvfile:
reader = csv.reader(csvfile, delimiter=",")
next(reader)
ALIASES = dict([alias for alias in reader if alias[0] != "#"])
# Add aliased entries, needs to happen on snapshot of keys as
# we are updating the dict
for macro in list(result.keys()):
alias = ALIASES.get(macro)
if alias and len(alias) == 2: # noqa: PLR2004
result[ALIASES[macro]] = result[macro]
# Generate output
with open("weblate_language_data/ambiguous.py", "w") as output:
output.write(HEADER)
output.write("# List of ambiguous codes based on ISO-639-3 macrolanguages\n")
output.write("AMBIGUOUS = ")
pprint(dict(result), output)
output.write("\n")
# Black format code
subprocess.run(
["prek", "run", "--files", "weblate_language_data/ambiguous.py"],
check=False,
)