weblate/scripts/generate-non-word-chars.py
Michal Čihař 4cc1e17f42 chore: rename Python scripts to have py extension
This makes it easier to apply linting to them.
2025-07-18 12:33:16 +02:00

46 lines
1,003 B
Python
Executable file

#!/usr/bin/env python
# Copyright © Michal Čihař <michal@weblate.org>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""
Generate the NON_WORD_CHARS and COMPOSITING_CHARS character listings.

Both are printed to standard output; the result is used in weblate/checks/data.py.
"""
import pprint
import sys
import unicodedata
# Unicode general categories (as reported by unicodedata.category) whose
# members count as non-word characters:
#   Cc - control, Po - other punctuation, Ps - opening punctuation,
#   Sk - modifier symbol, Zs - space separator
CATEGORIES = {"Cc", "Po", "Ps", "Sk", "Zs"}

# Characters that are never emitted into NON_WORD_CHARS
EXCLUDES = {
    # Middle dot, used in Catalan (ŀ)
    "·",
    # Sentence-boundary punctuation: the same word may legitimately
    # appear on both sides of these
    ".",
    ",",
    ":",
    ";",
    # Intentionally skipped
    "-",
    # Left out so that it cannot break regexp syntax
    "]",
    # NOTE(review): an empty string never equals a single character, so this
    # entry has no effect; possibly a character was lost here - confirm.
    "",
}
# Emit the assignment prefix WITHOUT a trailing newline: the list literal
# written by pprint then starts on the same line, so the combined output
# ("NON_WORD_CHARS = [...]") is a valid Python statement. A line break right
# after "=" would be a syntax error when the output is pasted into a module.
print("NON_WORD_CHARS = ", end="")
pprint.pprint(
    [
        char
        # Walk every code point the interpreter supports
        for char in map(chr, range(sys.maxunicode + 1))
        # Keep characters of the selected categories unless explicitly excluded
        if char not in EXCLUDES and unicodedata.category(char) in CATEGORIES
    ]
)
# Print a set literal holding every character of category "Mn", one escaped
# entry per line.
print("COMPOSITING_CHARS = {")
entry_template = ' "{}",'
for codepoint in range(sys.maxunicode + 1):
    mark = chr(codepoint)
    if unicodedata.category(mark) != "Mn":
        continue
    # Write the character as an ASCII escape sequence rather than literally
    escaped = mark.encode("unicode-escape").decode()
    print(entry_template.format(escaped))
print("}")