weblate/scripts/generate-language-data

#!/usr/bin/env python3
#
# Copyright © 2012 - 2020 Michal Čihař <michal@cihar.com>
#
# This file is part of Weblate <https://weblate.org/>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
"""
Helper script to generate Python code from language-data repository.

See https://github.com/WeblateOrg/language-data
"""

import csv
import json
import re
import subprocess
from textwrap import TextWrapper

# Pattern pieces for a run of separators.  Concatenated, they match one or
# more consecutive occurrences of either a listed HTML entity (first piece)
# or a single punctuation, whitespace or digit character (second piece).
_SPLIT_PARTS = (
    r"(?:\&(?:nbsp|rsaquo|lt|gt|amp|ldquo|rdquo|times|quot);|",
    r'[() ,.^`"\'\\/_<>!?;:|{}*^@%#&~=+\r\n✓—‑…\[\]0-9-])+',
)
SPLIT_RE = re.compile("".join(_SPLIT_PARTS))

# Text written at the very top of every generated module: the license banner
# followed by a module docstring stating that the file is generated, plus a
# pylint directive.  It contains non-ASCII characters.
HEADER = '''#
# Copyright © 2012 - 2020 Michal Čihař <michal@cihar.com>
#
# This file is part of Weblate <https://weblate.org/>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#

"""Language data definitions.

This is an automatically generated file, see scripts/generate-language-data

Do not edit, please adjust language definitions in following repository:
https://github.com/WeblateOrg/language-data
"""
# pylint: disable=line-too-long,too-many-lines


'''

# One tuple entry of the generated LANGUAGES / EXTRAPLURALS definitions,
# filled from the first four columns of a CSV row:
#   {0} - emitted as a single-quoted string and repeated in the translator
#         comment as the ISO code
#   {1} - emitted as a single-quoted string wrapped in _()
#   {2} - emitted verbatim, without quotes
#   {3} - emitted as a single-quoted string
# NOTE(review): {2} and {3} are presumably the plural count and the plural
# formula - confirm against the language-data CSV layout.
TEMPLATE = """    (
        '{0}',
        # Translators: Language name, ISO code: {0}
        _('{1}'),
        {2},
        '{3}'
    ),
"""

# Wraps text to 79 columns, indenting the first and every following line by
# four spaces.
WRAPPER = TextWrapper(
    width=79,
    initial_indent="    ",
    subsequent_indent="    ",
)

def _read_rows(filename):
    """Return every row of a semicolon-delimited CSV file.

    Each row is a list of strings as produced by ``csv.reader``.
    """
    # Decode explicitly as UTF-8 instead of the locale's default encoding so
    # the result is the same on every machine, and pass newline="" as the csv
    # module documentation requires for files handed to csv.reader.
    with open(filename, "r", encoding="utf-8", newline="") as csvfile:
        return list(csv.reader(csvfile, delimiter=";"))


# Read languages
LANGUAGES = _read_rows("scripts/language-data/languages.csv")

# Read aliases, dropping rows whose first column is exactly "#"
# NOTE(review): a row whose first column merely starts with "#" is kept, and
# an empty row raises IndexError here - confirm this matches the data file.
ALIASES = [
    alias
    for alias in _read_rows("scripts/language-data/aliases.csv")
    if alias[0] != "#"
]

# Read extra plurals
EXTRAPLURALS = _read_rows("scripts/language-data/extraplurals.csv")

# Read default countries
DEFAULT_COUNTRIES = _read_rows("scripts/language-data/default_countries.csv")

def _write_definitions(filename, comment, variable, rows):
    """Write a generated module that defines a tuple of TEMPLATE entries.

    ``comment`` becomes the ``# ...`` line above the assignment and
    ``variable`` the name the tuple is assigned to.  The first four columns
    of every row fill TEMPLATE; single quotes in the second column are
    backslash-escaped because TEMPLATE places it inside a single-quoted
    literal.
    """
    # HEADER contains non-ASCII characters, so the encoding is fixed to UTF-8
    # rather than inherited from the locale.
    with open(filename, "w", encoding="utf-8") as output:
        output.write(HEADER)
        output.write("from django.utils.translation import gettext_noop as _\n\n")
        output.write("# {}\n".format(comment))
        output.write("{} = (\n".format(variable))
        for row in rows:
            output.write(
                TEMPLATE.format(row[0], row[1].replace("'", "\\'"), row[2], row[3])
            )
        output.write(")\n")


# Write language definitions
_write_definitions(
    "weblate/langdata/languages.py", "Language definitions", "LANGUAGES", LANGUAGES
)

# Write additional plural rules
_write_definitions(
    "weblate/langdata/plurals.py",
    "Additional plural rules definitions",
    "EXTRAPLURALS",
    EXTRAPLURALS,
)

# Write language aliases; the first two columns of each row become one
# 'key': 'value' entry of the generated dict
with open("weblate/langdata/aliases.py", "w", encoding="utf-8") as output:
    output.write(HEADER)
    output.write("# Language aliases\n")
    output.write("ALIASES = {\n")
    for row in ALIASES:
        output.write("    '{}': '{}',\n".format(*row))
    output.write("}\n")

# Write default languages; the first column of each row becomes one tuple item
with open("weblate/langdata/countries.py", "w", encoding="utf-8") as output:
    output.write(HEADER)
    output.write("# List of defaul languages, omitting country code should be okay\n")
    output.write("DEFAULT_LANGS = (\n")
    for row in DEFAULT_COUNTRIES:
        output.write("    '{}',\n".format(*row))
    output.write(")\n")

# Generate same check blacklist
# Lowercased name fragments gathered by add_word(); after filtering, the set
# is written out to weblate/checks/languages.py further below.
words = set()


def add_word(word):
    """Record the lowercase fragments of ``word`` in the ``words`` set.

    The text is lowercased and cut at every ``SPLIT_RE`` match.  Every
    resulting piece is stored, including empty strings that the split yields
    when the text starts or ends with a separator.
    """
    lowered = word.lower()
    for fragment in SPLIT_RE.split(lowered):
        words.add(fragment)


def process_iso(name):
    """Feed the names from one iso-codes JSON file to ``add_word``.

    Reads ``scripts/iso-codes/data/iso_<name>.json`` and walks the list stored
    under the top-level key ``name``.  Each item's ``"name"`` is added, and
    its ``"common_name"`` as well when the item has one.

    Raises whatever ``open`` / ``json.load`` raise for a missing or malformed
    file, and ``KeyError`` when the expected keys are absent.
    """
    filename = "scripts/iso-codes/data/iso_{}.json".format(name)
    # JSON text is UTF-8; decode it explicitly so that non-ASCII names do not
    # depend on the locale's default encoding.
    with open(filename, "r", encoding="utf-8") as handle:
        items = json.load(handle)[name]
    for item in items:
        add_word(item["name"])
        if "common_name" in item:
            add_word(item["common_name"])


# Our languages data: the second CSV column is passed to add_word()
for language in LANGUAGES:
    add_word(language[1])

# iso-codes: process each data set in turn
for iso_name in (
    "639-2",
    "639-3",
    "639-5",
    "15924",
    "3166-1",
    "3166-2",
    "3166-3",
    "4217",
):
    process_iso(iso_name)

# Drop these terms from the collected set so they are not written to the
# generated blacklist.
for generic in (
    "administered",
    "administrative",
    "air",
    "and",
    "are",
    "association",
    "autonomous",
    "auxiliary",
    "based",
    "bassin",
    "bath",
    "bay",
    "big",
    "canal",
    "canton",
    "country",
    "county",
    "early",
    "east",
    "eastern",
    "family",
    "language",
    "languages",
    "long",
    "metropolitan",
    "miscellaneous",
    "neutral",
    "new",
    "north",
    "northeast",
    "northeastern",
    "northern",
    "northwest",
    "northwestern",
    "region",
    "see",
    "small",
    "south",
    "southeast",
    "southeastern",
    "southern",
    "southwest",
    "southwestern",
    "state",
    "states",
    "testing",
    "transactions",
    "trust",
    "use",
    "west",
    "western",
):
    words.discard(generic)

# Write same check blacklist
# The encoding is fixed to UTF-8 because HEADER contains non-ASCII characters
# and the output must not depend on the locale.
with open("weblate/checks/languages.py", "w", encoding="utf-8") as output:
    output.write(HEADER)
    output.write("# Language names to ignore in same check\n")
    output.write("LANGUAGES = {\n")
    # Emit the words in sorted order as single-quoted literals, skipping
    # fragments of two characters or fewer (this also drops the empty strings
    # produced by splitting).
    content = ", ".join(
        "'{}'".format(word.replace("'", "\\'"))
        for word in sorted(words)
        if len(word) > 2
    )
    # Wrap the comma-separated list into indented lines of at most 79 columns.
    output.write("\n".join(WRAPPER.wrap(content)))
    output.write("\n}\n")

# Apply coding style
# check=True makes the script fail with CalledProcessError when black exits
# with a non-zero status, instead of silently leaving unformatted output.
subprocess.run(
    [
        "black",
        "weblate/langdata/countries.py",
        "weblate/langdata/aliases.py",
        "weblate/langdata/plurals.py",
        "weblate/langdata/languages.py",
        "weblate/checks/languages.py",
    ],
    check=True,
)