language-data/scripts/export-macrolanguages.py
Michal Čihař 3c04fcc193 chore: include extension in scritps filename
This allows to better lint the files.
2025-04-01 09:51:55 +02:00

82 lines
2.2 KiB
Python
Executable file

#!/usr/bin/env python3
# Copyright © Michal Čihař <michal@weblate.org>
#
# SPDX-License-Identifier: MIT
"""Helper script to generate Python code with ambiguous."""
import csv
import subprocess
from collections import defaultdict
from io import StringIO
from pprint import pprint
from urllib.request import Request, urlopen
BASE_URL = "https://iso639-3.sil.org/sites/iso639-3/files/downloads/"
URL = f"{BASE_URL}iso-639-3-macrolanguages.tab"
# List of ambiguous codes to include, this is opt-in as we don't want to push
# disambiguation for languages where it doesn't seem to be needed (for example Arabic)
LANGUAGES = {"kur"}
HEADER = '''# Copyright © Michal Čihař <michal@weblate.org>
#
# SPDX-License-Identifier: MIT
"""
Language data definitions.
This is an automatically generated file, see scripts/export-macrolanguages.py
Do not edit, please adjust language definitions in following repository:
https://github.com/WeblateOrg/language-data
"""
# pylint: disable=line-too-long,too-many-lines
'''
request = Request(URL, headers={"User-Agent": "Mozilla/5.0"})
# Read data from URL
with urlopen(request) as handle:
content = handle.read().decode()
# Parse tsv
reader = csv.reader(StringIO(content), delimiter="\t")
macrolanguages = list(reader)[1:]
# Generate dict
result = defaultdict(list)
for macro, lang, _kind in macrolanguages:
if macro not in LANGUAGES:
continue
result[macro].append(lang)
# Read aliases
with open("aliases.csv") as csvfile:
reader = csv.reader(csvfile, delimiter=",")
next(reader)
ALIASES = dict([alias for alias in reader if alias[0] != "#"])
# Add aliased entries, needs to happen on snapshot of keys as
# we are updating the dict
for macro in list(result.keys()):
alias = ALIASES.get(macro)
if alias and len(alias) == 2: # noqa: PLR2004
result[ALIASES[macro]] = result[macro]
# Generate output
with open("weblate_language_data/ambiguous.py", "w") as output:
output.write(HEADER)
output.write("# List of ambiguous codes based on ISO-639-3 macrolanguages\n")
output.write("AMBIGUOUS = ")
pprint(dict(result), output)
output.write("\n")
# Black format code
subprocess.run(
["pre-commit", "run", "--files", "weblate_language_data/ambiguous.py"],
check=False,
)