weblate/scripts/generate-language-data

#!/usr/bin/env python3
#
# Copyright © 2012 - 2018 Michal Čihař <michal@cihar.com>
#
# This file is part of Weblate <https://weblate.org/>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
# Helper script to generate Python code from language-data repository
# see https://github.com/WeblateOrg/language-data

import csv
import json
import re
from textwrap import TextWrapper

# Pieces of the separator pattern: a handful of HTML entities, and a class
# of whitespace, punctuation, symbol and digit characters.
_ENTITY = r'\&(?:nbsp|rsaquo|lt|gt|amp|ldquo|rdquo|times|quot);'
_SEPARATOR_CHARS = r'[() ,.^`"\'\\/_<>!?;:|{}*^@%#&~=+\r\n✓—‑…\[\]0-9-]'

# Matches one or more consecutive separators; used to split names into words.
SPLIT_RE = re.compile('(?:' + _ENTITY + '|' + _SEPARATOR_CHARS + ')+')

# Preamble written verbatim at the top of every module this script generates:
# encoding cookie, license notice, module docstring, pylint pragma and the
# unicode_literals import.
HEADER = '''# -*- coding: utf-8 -*-
#
# Copyright © 2012 - 2018 Michal Čihař <michal@cihar.com>
#
# This file is part of Weblate <https://weblate.org/>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#

"""
Language data definitions.

This is an automatically generated file, see scripts/generate-language-data

Do not edit, please adjust language definitions in following repository:
https://github.com/WeblateOrg/language-data
"""
# pylint: disable=line-too-long,too-many-lines

from __future__ import unicode_literals
'''

# One generated tuple entry; the four placeholders are filled from the first
# four columns of a CSV row, the second one being wrapped in _() for gettext.
TEMPLATE = "    ('{}', _('{}'), {}, '{}'),\n"

# Wraps the generated word list to 79 columns with a four space indent.
WRAPPER = TextWrapper(width=79, initial_indent='    ', subsequent_indent='    ')

# All input files are opened as UTF-8 explicitly so that parsing does not
# depend on the locale of the machine running the script, and with newline=''
# as the csv module documentation asks for.

# Read languages
with open(
    'scripts/language-data/languages.csv', 'r',
    encoding='utf-8', newline=''
) as csvfile:
    reader = csv.reader(csvfile, delimiter=';')
    LANGUAGES = list(reader)

# Read aliases
with open(
    "scripts/language-data/aliases.csv", "r",
    encoding="utf-8", newline=""
) as csvfile:
    reader = csv.reader(csvfile, delimiter=";")
    # Rows whose first field is exactly "#" are dropped. Blank lines, which
    # csv.reader yields as empty rows, are dropped too instead of raising
    # IndexError on alias[0].
    ALIASES = [alias for alias in reader if alias and alias[0] != "#"]

# Read extra plurals
with open(
    'scripts/language-data/extraplurals.csv', 'r',
    encoding='utf-8', newline=''
) as csvfile:
    reader = csv.reader(csvfile, delimiter=';')
    EXTRAPLURALS = list(reader)

# Write language definitions
# The output is encoded as UTF-8 explicitly: HEADER declares that coding and
# contains non-ASCII characters, so the locale default must not be used.
with open('weblate/langdata/languages.py', 'w', encoding='utf-8') as output:
    output.write(HEADER)
    output.write(
        'from django.utils.translation import ugettext_noop as _\n\n'
    )
    # Both tuples share the same row layout, so they are emitted by one loop:
    # (leading comment, generated variable name, source rows).
    sections = (
        ('# Language definitions\n', 'LANGUAGES', LANGUAGES),
        (
            '# Additional plural rules definitions\n',
            'EXTRAPLURALS',
            EXTRAPLURALS,
        ),
    )
    for heading, variable, rows in sections:
        output.write(heading)
        output.write('{} = (\n'.format(variable))
        for row in rows:
            output.write(TEMPLATE.format(
                row[0],
                # Escape single quotes as the name lands in a '...' literal
                row[1].replace("'", "\\'"),
                row[2],
                row[3],
            ))
        # Blank line after every tuple so the next section is separated
        output.write(')\n\n')
    output.write('# Language aliases\n')
    output.write('ALIASES = {\n')
    for row in ALIASES:
        output.write("    '{}': '{}',\n".format(*row))
    output.write('}\n')

# Generate same check blacklist
words = set()


def add_word(word):
    """Lowercase ``word``, split it on SPLIT_RE and collect the pieces."""
    for fragment in SPLIT_RE.split(word.lower()):
        words.add(fragment)

def process_iso(name, directory='/usr/share/iso-codes/json'):
    """Feed the names from one iso-codes JSON file to ``add_word``.

    Loads ``iso_<name>.json`` from ``directory``, takes the list stored under
    the top-level key ``name`` and passes the ``name`` of every entry and,
    when present, its ``common_name`` to ``add_word``.

    :param name: identifier of the standard, for example ``'639-2'``
    :param directory: directory holding the JSON files; the default is the
        location this script has always read from
    :raises OSError: when the file cannot be opened
    :raises KeyError: when the file has no top-level ``name`` key or an
        entry has no ``name``
    """
    filename = '{}/iso_{}.json'.format(directory, name)
    # Decode as UTF-8 explicitly instead of relying on the locale default
    with open(filename, 'r', encoding='utf-8') as handle:
        items = json.load(handle)[name]
    for item in items:
        add_word(item['name'])
        if 'common_name' in item:
            add_word(item['common_name'])


# Our languages data: the second column of every row is split into words
for language in LANGUAGES:
    add_word(language[1])

# iso-codes: the same standards, processed in the same order
for iso_name in (
    '639-2',
    '639-3',
    '639-5',
    '15924',
    '3166-1',
    '3166-2',
    '3166-3',
    '4217',
):
    process_iso(iso_name)

# Common words that are removed from the collected set again, so they do not
# end up in the generated blacklist.
# Every entry needs its trailing comma: adjacent string literals are silently
# concatenated, which previously merged 'autonomous' and 'auxiliary' into a
# single bogus entry.
SKIP = (
    'administered',
    'administrative',
    'and',
    'are',
    'association',
    'autonomous',
    'auxiliary',
    'bay',
    'based',
    'bassin',
    'bath',
    'canal',
    'canton',
    'country',
    'county',
    'family',
    'language',
    'languages',
    'long',
    'metropolitan',
    'miscellaneous',
    'neutral',
    'region',
    'see',
    'state',
    'states',
    'transactions',
    'trust',
    'testing',
    'use',
)

# Drop every skipped word from the collected set; absent ones are ignored
words.difference_update(SKIP)

# Write same check blacklist
# The output is encoded as UTF-8 explicitly: HEADER declares that coding and
# contains non-ASCII characters, so the locale default must not be used.
with open('weblate/checks/languages.py', 'w', encoding='utf-8') as output:
    output.write(HEADER)
    output.write('\n\n')
    output.write('# Language names to ignore in same check\n')
    output.write('LANGUAGES = frozenset((\n')
    # Only words longer than two characters are emitted, sorted so that the
    # generated file is stable between runs.
    content = ', '.join(
        '\'{}\''.format(word.replace("'", "\\'"))
        for word in sorted(words)
        if len(word) > 2
    )
    output.write('\n'.join(WRAPPER.wrap(content)))
    output.write('\n))\n')