weblate/scripts/generate-language-data
Michal Čihař 2b62af5c4a Languages: Use module for iso-codes data
Otherwise it depends on the system-installed version, which is not reproducible.

Signed-off-by: Michal Čihař <michal@cihar.com>
2019-08-29 14:50:29 +02:00

192 lines
5.3 KiB
Python
Executable file
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright © 2012 - 2019 Michal Čihař <michal@cihar.com>
#
# This file is part of Weblate <https://weblate.org/>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# Helper script to generate Python code from language-data repository
# see https://github.com/WeblateOrg/language-data
import csv
import json
import re
from textwrap import TextWrapper
# Separator pattern used to cut names into individual words. A separator is a
# run of one or more of the alternatives assembled below.
SPLIT_RE = re.compile(
    "".join(
        (
            # Alternative 1: one of the listed HTML entities (e.g. &nbsp;)
            r"(?:\&(?:nbsp|rsaquo|lt|gt|amp|ldquo|rdquo|times|quot);|",
            # Alternative 2: a single punctuation, whitespace or digit character
            r'[() ,.^`"\'\\/_<>!?;:|{}*^@%#&~=+\r\n✓—\[\]0-9-])+',
        )
    )
)
# Text written verbatim at the top of each generated module: coding cookie,
# GPL license notice, module docstring, pylint directive and the
# unicode_literals import.
HEADER = '''# -*- coding: utf-8 -*-
#
# Copyright © 2012 - 2019 Michal Čihař <michal@cihar.com>
#
# This file is part of Weblate <https://weblate.org/>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
"""
Language data definitions.
This is an automatically generated file, see scripts/generate-language-data
Do not edit, please adjust language definitions in following repository:
https://github.com/WeblateOrg/language-data
"""
# pylint: disable=line-too-long,too-many-lines
from __future__ import unicode_literals
'''
# Source template for one generated tuple entry, preceded by a translator
# comment. The four placeholders are filled from the first four columns of a
# CSV row (single quotes in the second column are backslash-escaped first).
TEMPLATE = """ # Translators: Language name, ISO code: {0}
('{0}', _('{1}'), {2}, '{3}'),
"""
# Wraps the generated word list at 79 columns, indenting every output line.
WRAPPER = TextWrapper(width=79, initial_indent=" " * 4, subsequent_indent=" " * 4)
def _read_rows(path):
    """Return the non-empty rows of a semicolon-delimited CSV file.

    The file is decoded as UTF-8 instead of the locale's default encoding so
    the result does not depend on the machine running the script (the data
    files are expected to be UTF-8). It is opened with ``newline=""`` as the
    csv module documentation requires. Blank lines, which the reader yields
    as empty lists, are dropped so later column indexing cannot fail on them.
    """
    with open(path, "r", encoding="utf-8", newline="") as csvfile:
        return [row for row in csv.reader(csvfile, delimiter=";") if row]


# Read languages
LANGUAGES = _read_rows("scripts/language-data/languages.csv")

# Read aliases, leaving out rows whose first column is exactly "#"
ALIASES = [
    alias
    for alias in _read_rows("scripts/language-data/aliases.csv")
    if alias[0] != "#"
]

# Read extra plurals
EXTRAPLURALS = _read_rows("scripts/language-data/extraplurals.csv")
# Write language definitions
# HEADER declares the generated module as UTF-8 and contains non-ASCII
# characters, so the file is encoded explicitly instead of relying on the
# locale's default encoding.
with open("weblate/langdata/languages.py", "w", encoding="utf-8") as output:
    output.write(HEADER)
    output.write("from django.utils.translation import ugettext_noop as _\n\n")

    # One TEMPLATE entry per language row; single quotes in the name are
    # escaped because the template wraps it in a single-quoted literal.
    output.write("# Language definitions\n")
    output.write("LANGUAGES = (\n")
    for row in LANGUAGES:
        output.write(
            TEMPLATE.format(row[0], row[1].replace("'", "\\'"), row[2], row[3])
        )
    output.write(")\n\n")

    # Extra plural rows use the same four-column layout as the languages.
    output.write("# Additional plural rules definitions\n")
    output.write("EXTRAPLURALS = (\n")
    for row in EXTRAPLURALS:
        output.write(
            TEMPLATE.format(row[0], row[1].replace("'", "\\'"), row[2], row[3])
        )
    output.write(")\n")

    # Each alias row supplies the key and the value of one dict entry.
    output.write("# Language aliases\n")
    output.write("ALIASES = {\n")
    for row in ALIASES:
        output.write(" '{}': '{}',\n".format(*row))
    output.write("}\n")
# Generate same check blacklist
# Accumulates every word fragment collected from the language data and the
# iso-codes files.
words = set()


def add_word(word):
    """Lower-case ``word``, split it with SPLIT_RE and store each piece.

    Every piece produced by the split is added to the module-level ``words``
    set, including empty strings produced at the edges of the split.
    """
    for piece in SPLIT_RE.split(word.lower()):
        words.add(piece)
def process_iso(name):
    """Feed the names from one iso-codes JSON data file to ``add_word``.

    Loads ``scripts/iso-codes/data/iso_<name>.json``, takes the list stored
    under the top-level key ``name`` and passes each item's ``name`` value,
    plus its ``common_name`` value when that key is present, to ``add_word``.
    """
    path = "scripts/iso-codes/data/iso_{}.json".format(name)
    # Decode as UTF-8 explicitly: with the locale's default encoding,
    # non-ASCII names could be misread or fail to decode on some systems.
    with open(path, "r", encoding="utf-8") as handle:
        items = json.load(handle)[name]
    for item in items:
        add_word(item["name"])
        if "common_name" in item:
            add_word(item["common_name"])
# Our languages data: the second column of every row goes through add_word
for row in LANGUAGES:
    add_word(row[1])

# iso-codes: data files are processed in this fixed order
for standard in (
    "639-2",
    "639-3",
    "639-5",
    "15924",
    "3166-1",
    "3166-2",
    "3166-3",
    "4217",
):
    process_iso(standard)
# Words removed from the collected set before it is written out.
SKIP = (
    "administered",
    "administrative",
    "and",
    "are",
    "association",
    "autonomous",
    "auxiliary",
    "bay",
    "based",
    "bassin",
    "bath",
    "canal",
    "canton",
    "country",
    "county",
    "family",
    "language",
    "languages",
    "long",
    "metropolitan",
    "miscellaneous",
    "neutral",
    "region",
    "see",
    "state",
    "states",
    "transactions",
    "trust",
    "testing",
    "use",
)
# Remove them all in one call; entries not present in the set are ignored.
words.difference_update(SKIP)
# Write same check blacklist
# HEADER and the collected words contain non-ASCII characters and the
# generated module declares itself as UTF-8, so the encoding is set
# explicitly instead of relying on the locale's default.
with open("weblate/checks/languages.py", "w", encoding="utf-8") as output:
    output.write(HEADER)
    output.write("# Language names to ignore in same check\n")
    output.write("LANGUAGES = frozenset((\n")
    # Only words longer than two characters are emitted, sorted, each one
    # as a single-quoted literal with embedded single quotes escaped.
    content = ", ".join(
        "'{}'".format(word.replace("'", "\\'"))
        for word in sorted(words)
        if len(word) > 2
    )
    output.write("\n".join(WRAPPER.wrap(content)))
    output.write("\n))\n")