weblate/scripts/generate-language-data
2020-04-06 13:49:10 +02:00

246 lines
6.8 KiB
Python
Executable file
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
#
# Copyright © 2012 - 2020 Michal Čihař <michal@cihar.com>
#
# This file is part of Weblate <https://weblate.org/>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
"""
Helper script to generate Python code from language-data repository.
See https://github.com/WeblateOrg/language-data
"""
import csv
import json
import re
import subprocess
from textwrap import TextWrapper
# Word separator: one or more consecutive HTML entities (from a fixed list),
# punctuation characters, spaces, CR/LF characters or ASCII digits.
SPLIT_RE = re.compile(
    r"(?:\&(?:nbsp|rsaquo|lt|gt|amp|ldquo|rdquo|times|quot);|"
    r'[() ,.^`"\'\\/_<>!?;:|{}*^@%#&~=+\r\n✓—\[\]0-9-])+'
)
# Text written first into every generated module: the license notice
# followed by a module docstring stating that the file is auto-generated.
HEADER = '''#
# Copyright © 2012 - 2020 Michal Čihař <michal@cihar.com>
#
# This file is part of Weblate <https://weblate.org/>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
"""Language data definitions.
This is an automatically generated file, see scripts/generate-language-data
Do not edit, please adjust language definitions in following repository:
https://github.com/WeblateOrg/language-data
"""
# pylint: disable=line-too-long,too-many-lines
'''
# Source text for one entry of a generated tuple, filled positionally with
# str.format from the first four CSV columns of a row:
#   {0} - code (also repeated in the translator comment as the ISO code)
#   {1} - language name with single quotes escaped, marked for translation
#   {2}, {3} - emitted unquoted and quoted respectively; presumably the plural
#              count and plural formula - TODO confirm against languages.csv
TEMPLATE = """ (
'{0}',
# Translators: Language name, ISO code: {0}
_('{1}'),
{2},
'{3}'
),
"""
# Wraps generated text at 79 columns, indenting every line by four spaces.
WRAPPER = TextWrapper(
    width=79,
    initial_indent="    ",
    subsequent_indent="    ",
)
def _read_csv(filename):
    """Return the non-empty rows of a semicolon-delimited CSV file.

    The file is opened with ``newline=""`` as the csv module requires, and
    decoded as UTF-8 instead of the locale's default encoding so the result
    does not depend on the machine running the script (the data files are
    assumed to be UTF-8). Blank lines, which the csv reader reports as
    empty lists, are skipped so later column indexing cannot fail on them.
    """
    with open(filename, "r", encoding="utf-8", newline="") as csvfile:
        return [row for row in csv.reader(csvfile, delimiter=";") if row]


# Read languages
LANGUAGES = _read_csv("scripts/language-data/languages.csv")
# Read aliases, leaving out rows whose first field is exactly "#"
ALIASES = [
    alias
    for alias in _read_csv("scripts/language-data/aliases.csv")
    if alias[0] != "#"
]
# Read extra plurals
EXTRAPLURALS = _read_csv("scripts/language-data/extraplurals.csv")
# Read default countries
DEFAULT_COUNTRIES = _read_csv("scripts/language-data/default_countries.csv")
# Write language definitions
# UTF-8 is requested explicitly: HEADER contains non-ASCII characters, so the
# locale's default encoding must not decide how (or whether) they are written.
with open("weblate/langdata/languages.py", "w", encoding="utf-8") as output:
    output.write(HEADER)
    output.write("from django.utils.translation import gettext_noop as _\n\n")
    output.write("# Language definitions\n")
    output.write("LANGUAGES = (\n")
    for row in LANGUAGES:
        # Single quotes in the name are escaped because TEMPLATE places it
        # inside a single-quoted string literal.
        output.write(
            TEMPLATE.format(row[0], row[1].replace("'", "\\'"), row[2], row[3])
        )
    output.write(")\n")
# Write additional plural definitions
# UTF-8 is requested explicitly: HEADER contains non-ASCII characters, so the
# locale's default encoding must not decide how (or whether) they are written.
with open("weblate/langdata/plurals.py", "w", encoding="utf-8") as output:
    output.write(HEADER)
    output.write("from django.utils.translation import gettext_noop as _\n\n")
    output.write("# Additional plural rules definitions\n")
    output.write("EXTRAPLURALS = (\n")
    for row in EXTRAPLURALS:
        # Single quotes in the name are escaped because TEMPLATE places it
        # inside a single-quoted string literal.
        output.write(
            TEMPLATE.format(row[0], row[1].replace("'", "\\'"), row[2], row[3])
        )
    output.write(")\n")
# Write language aliases as a dictionary literal
# UTF-8 is requested explicitly: HEADER contains non-ASCII characters, so the
# locale's default encoding must not decide how (or whether) they are written.
with open("weblate/langdata/aliases.py", "w", encoding="utf-8") as output:
    output.write(HEADER)
    output.write("# Language aliases\n")
    output.write("ALIASES = {\n")
    for row in ALIASES:
        # The first two columns of each row become the key and the value.
        output.write(" '{}': '{}',\n".format(*row))
    output.write("}\n")
# Write the default countries list
# UTF-8 is requested explicitly: HEADER contains non-ASCII characters, so the
# locale's default encoding must not decide how (or whether) they are written.
with open("weblate/langdata/countries.py", "w", encoding="utf-8") as output:
    output.write(HEADER)
    output.write("# List of defaul languages, omitting country code should be okay\n")
    output.write("DEFAULT_LANGS = (\n")
    for row in DEFAULT_COUNTRIES:
        # Only the first column of each row is emitted.
        output.write(" '{}',\n".format(*row))
    output.write(")\n")
# Generate same check blacklist
words = set()


def add_word(word):
    """Lowercase *word*, split it with SPLIT_RE and store each piece in ``words``."""
    for piece in SPLIT_RE.split(word.lower()):
        words.add(piece)
def process_iso(name):
    """Feed every name from one iso-codes JSON data file to ``add_word``.

    *name* selects the file ``scripts/iso-codes/data/iso_<name>.json`` and is
    also the top-level key under which that file stores its list of entries.
    For each entry the ``name`` value is added, and the ``common_name`` value
    as well when the entry has one.
    """
    filename = "scripts/iso-codes/data/iso_{}.json".format(name)
    # JSON text is UTF-8; decode it explicitly instead of relying on the
    # locale's default encoding, which would garble non-ASCII names.
    with open(filename, "r", encoding="utf-8") as handle:
        entries = json.load(handle)[name]
    for item in entries:
        add_word(item["name"])
        if "common_name" in item:
            add_word(item["common_name"])
# Our languages data: the second column of every language row
for row in LANGUAGES:
    add_word(row[1])
# iso-codes: each listed data file is processed in this order
for iso_name in (
    "639-2",
    "639-3",
    "639-5",
    "15924",
    "3166-1",
    "3166-2",
    "3166-3",
    "4217",
):
    process_iso(iso_name)
# Remove these general English words from the collected set (in place) so
# that they do not end up in the generated list.
words -= {
    "administered",
    "administrative",
    "air",
    "and",
    "are",
    "association",
    "autonomous",
    "auxiliary",
    "based",
    "bassin",
    "bath",
    "bay",
    "big",
    "canal",
    "canton",
    "country",
    "county",
    "early",
    "east",
    "eastern",
    "family",
    "language",
    "languages",
    "long",
    "metropolitan",
    "miscellaneous",
    "neutral",
    "new",
    "north",
    "northeast",
    "northeastern",
    "northern",
    "northwest",
    "northwestern",
    "region",
    "see",
    "small",
    "south",
    "southeast",
    "southeastern",
    "southern",
    "southwest",
    "southwestern",
    "state",
    "states",
    "testing",
    "transactions",
    "trust",
    "use",
    "west",
    "western",
}
# Write same check blacklist
# UTF-8 is requested explicitly: HEADER contains non-ASCII characters, so the
# locale's default encoding must not decide how (or whether) they are written.
with open("weblate/checks/languages.py", "w", encoding="utf-8") as output:
    output.write(HEADER)
    output.write("# Language names to ignore in same check\n")
    output.write("LANGUAGES = {\n")
    # Emit the words in sorted order as quoted literals; words of one or two
    # characters (including the empty strings produced by splitting) are
    # left out.
    content = ", ".join(
        "'{}'".format(word.replace("'", "\\'"))
        for word in sorted(words)
        if len(word) > 2
    )
    output.write("\n".join(WRAPPER.wrap(content)))
    output.write("\n}\n")
# Apply coding style
# check=True makes a non-zero exit status from black raise CalledProcessError
# so that a formatter failure stops the script instead of passing silently.
subprocess.run(
    [
        "black",
        "weblate/langdata/countries.py",
        "weblate/langdata/aliases.py",
        "weblate/langdata/plurals.py",
        "weblate/langdata/languages.py",
        "weblate/checks/languages.py",
    ],
    check=True,
)