weblate/scripts/generate-non-word-chars
Michal Čihař 46af586419 checks: Allow duplicate words at sentence boundary
This is less likely an error and more likely an intention.

Fixes #9351
2023-06-06 15:03:02 +02:00

40 lines
768 B
Python
Executable file

#!/usr/bin/env python
# Copyright © Michal Čihař <michal@weblate.org>
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""
Generates list of non-word chars.
Used in weblate/checks/data.py
"""
import pprint
import sys
import unicodedata
# Unicode general categories whose members are treated as non-word chars
CATEGORIES = {
    "Cc",
    "Po",
    "Ps",
    "Sk",
    "Zs",
}

# Chars kept out of the generated list even when their category matches
EXCLUDES = {
    # Left out so the result does not break regexp syntax
    "]",
    # Skipped on purpose
    "-",
    # Sentence boundary punctuation; same words around it are allowed
    ".",
    ",",
    ":",
    ";",
    # Used in Catalan ŀ
    "·",
    # NOTE(review): an empty string never equals a single char, so this
    # entry has no effect as written; possibly a char that got lost in
    # transit. Confirm against the upstream file.
    "",
}

# Walk every code point in order and collect the matching chars, so the
# printed list is sorted by code point.
non_word_chars = []
for codepoint in range(sys.maxunicode + 1):
    candidate = chr(codepoint)
    if candidate in EXCLUDES:
        continue
    if unicodedata.category(candidate) in CATEGORIES:
        non_word_chars.append(candidate)

pprint.pprint(non_word_chars)