weblate/scripts/generate-language-data

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright © 2012 - 2019 Michal Čihař <michal@cihar.com>
#
# This file is part of Weblate <https://weblate.org/>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
# Helper script to generate Python code from language-data repository
# see https://github.com/WeblateOrg/language-data

import csv
import json
import re
from textwrap import TextWrapper

# Separator pattern used to break a name into words.  A separator is a run
# of one or more of the alternatives below.
SPLIT_RE = re.compile(
    # Alternative 1: one of the listed HTML entities (e.g. &nbsp; or &amp;)
    r"(?:\&(?:nbsp|rsaquo|lt|gt|amp|ldquo|rdquo|times|quot);|"
    # Alternative 2: a single punctuation, whitespace, digit or dash character
    r'[() ,.^`"\'\\/_<>!?;:|{}*^@%#&~=+\r\n✓—‑…\[\]0-9-])+'
)

HEADER = '''# -*- coding: utf-8 -*-
#
# Copyright © 2012 - 2019 Michal Čihař <michal@cihar.com>
#
# This file is part of Weblate <https://weblate.org/>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#

"""
Language data definitions.

This is an automatically generated file, see scripts/generate-language-data

Do not edit, please adjust language definitions in following repository:
https://github.com/WeblateOrg/language-data
"""
# pylint: disable=line-too-long,too-many-lines

from __future__ import unicode_literals

'''

TEMPLATE = """    # Translators: Language name, ISO code: {0}
    ('{0}', _('{1}'), {2}, '{3}'),
"""

WRAPPER = TextWrapper(width=79, initial_indent=" " * 4, subsequent_indent=" " * 4)

# The CSV sources are read as UTF-8 explicitly so that the result does not
# depend on the locale of the machine running the script.  newline="" is what
# the csv module requires of file objects passed to csv.reader.

# Read languages
with open(
    "scripts/language-data/languages.csv", "r", encoding="utf-8", newline=""
) as csvfile:
    reader = csv.reader(csvfile, delimiter=";")
    LANGUAGES = list(reader)

# Read aliases, dropping rows whose first field is exactly "#"
with open(
    "scripts/language-data/aliases.csv", "r", encoding="utf-8", newline=""
) as csvfile:
    reader = csv.reader(csvfile, delimiter=";")
    ALIASES = [alias for alias in reader if alias[0] != "#"]

# Read extra plurals
with open(
    "scripts/language-data/extraplurals.csv", "r", encoding="utf-8", newline=""
) as csvfile:
    reader = csv.reader(csvfile, delimiter=";")
    EXTRAPLURALS = list(reader)

# Write language definitions
# HEADER contains non-ASCII characters and declares the generated module as
# UTF-8, so the output encoding is set explicitly instead of being left to
# the locale default.
with open("weblate/langdata/languages.py", "w", encoding="utf-8") as output:
    output.write(HEADER)
    output.write("from django.utils.translation import ugettext_noop as _\n\n")
    output.write("# Language definitions\n")
    output.write("LANGUAGES = (\n")
    for row in LANGUAGES:
        # Escape single quotes as the name ends up inside a '...' literal
        output.write(
            TEMPLATE.format(row[0], row[1].replace("'", "\\'"), row[2], row[3])
        )
    output.write(")\n\n")
    output.write("# Additional plural rules definitions\n")
    output.write("EXTRAPLURALS = (\n")
    for row in EXTRAPLURALS:
        output.write(
            TEMPLATE.format(row[0], row[1].replace("'", "\\'"), row[2], row[3])
        )
    # Blank line after the tuple, consistent with the LANGUAGES section above
    output.write(")\n\n")
    output.write("# Language aliases\n")
    output.write("ALIASES = {\n")
    for row in ALIASES:
        # The first two fields of each alias row become key and value
        output.write("    '{}': '{}',\n".format(*row))
    output.write("}\n")

# Generate same check blacklist
# Lower-cased fragments collected from every processed name
words = set()


def add_word(word):
    """Lower-case *word*, split it on SPLIT_RE and collect the fragments.

    Every fragment, including empty strings produced when the text starts
    or ends with a separator, is added to the module-level ``words`` set.
    """
    for fragment in SPLIT_RE.split(word.lower()):
        words.add(fragment)


def process_iso(name):
    """Collect words from one iso-codes JSON data file.

    Loads ``scripts/iso-codes/data/iso_<name>.json``, takes the list stored
    under the top-level key *name* and passes each item's ``name`` (and
    ``common_name``, when present) to add_word().

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the locale of the machine running the script.

    Raises whatever open() or json.load() raise for a missing or malformed
    file, and KeyError if the expected keys are absent.
    """
    filename = "scripts/iso-codes/data/iso_{}.json".format(name)
    with open(filename, "r", encoding="utf-8") as handle:
        entries = json.load(handle)[name]
    for item in entries:
        add_word(item["name"])
        if "common_name" in item:
            add_word(item["common_name"])


# Names from our own language definitions (second CSV column)
for language in LANGUAGES:
    add_word(language[1])

# Names from the iso-codes data files, processed in this order
for iso_standard in (
    "639-2",
    "639-3",
    "639-5",
    "15924",
    "3166-1",
    "3166-2",
    "3166-3",
    "4217",
):
    process_iso(iso_standard)

# Words that must not end up in the generated list even though they occur
# in the processed names
SKIP = (
    "administered",
    "administrative",
    "and",
    "are",
    "association",
    "autonomous",
    "auxiliary",
    "bay",
    "based",
    "bassin",
    "bath",
    "canal",
    "canton",
    "country",
    "county",
    "family",
    "language",
    "languages",
    "long",
    "metropolitan",
    "miscellaneous",
    "neutral",
    "region",
    "see",
    "state",
    "states",
    "transactions",
    "trust",
    "testing",
    "use",
)

# Drop all skipped words at once; entries not present are simply ignored
words.difference_update(SKIP)

# Write same check blacklist
# HEADER contains non-ASCII characters and declares the generated module as
# UTF-8, so the output encoding is set explicitly instead of being left to
# the locale default.
with open("weblate/checks/languages.py", "w", encoding="utf-8") as output:
    output.write(HEADER)
    output.write("# Language names to ignore in same check\n")
    output.write("LANGUAGES = frozenset((\n")
    # Quoted, comma separated and sorted words; fragments of one or two
    # characters (including the empty string) are left out
    content = ", ".join(
        "'{}'".format(word.replace("'", "\\'"))
        for word in sorted(words)
        if len(word) > 2
    )
    output.write("\n".join(WRAPPER.wrap(content)))
    output.write("\n))\n")