mirror of
https://gh.wpcy.net/https://github.com/WeblateOrg/weblate.git
synced 2026-05-28 20:24:13 +08:00
218 lines
5.7 KiB
Python
Vendored
218 lines
5.7 KiB
Python
Vendored
# -*- coding: utf-8 -*-
|
|
#
|
|
# Copyright © 2012 - 2013 Michal Čihař <michal@cihar.com>
|
|
#
|
|
# This file is part of Weblate <http://weblate.org/>
|
|
#
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
#
|
|
|
|
'''
|
|
Whoosh based full text search.
|
|
'''
|
|
|
|
from whoosh.fields import Schema, TEXT, ID
|
|
from whoosh.filedb.filestore import FileStorage
|
|
from whoosh import qparser
|
|
from django.db.models.signals import post_syncdb
|
|
from weblate import appsettings
|
|
from whoosh.writing import AsyncWriter, BufferedWriter
|
|
from django.dispatch import receiver
|
|
from lang.models import Language
|
|
|
|
# Schema of the per-language target (translation) indices: documents are
# keyed by the unit checksum and carry the translated text.
TARGET_SCHEMA = Schema(
    checksum=ID(stored=True, unique=True),
    target=TEXT,
)
|
|
|
|
# Schema of the source string index: documents are keyed by the unit
# checksum and carry the source text together with its context.
SOURCE_SCHEMA = Schema(
    checksum=ID(stored=True, unique=True),
    source=TEXT,
    context=TEXT,
)
|
|
|
|
# File based Whoosh storage located at appsettings.WHOOSH_INDEX; the index
# helpers in this module create and open their indices through it.
STORAGE = FileStorage(appsettings.WHOOSH_INDEX)
|
|
|
|
|
|
@receiver(post_syncdb)
def create_index(sender=None, **kwargs):
    '''
    Creates the storage directory for the fulltext indices.

    Registered as a post_syncdb signal receiver; the signal arguments
    are accepted but not used.
    '''
    STORAGE.create()
|
|
|
|
|
|
def create_source_index():
    '''
    Creates the index named 'source' using SOURCE_SCHEMA and returns it.
    '''
    source_index = STORAGE.create_index(SOURCE_SCHEMA, 'source')
    return source_index
|
|
|
|
|
|
def create_target_index(lang):
    '''
    Creates target string index for given language and returns it.

    The index is named 'target-<lang>' and uses TARGET_SCHEMA.
    '''
    index_name = 'target-%s' % lang
    return STORAGE.create_index(TARGET_SCHEMA, index_name)
|
|
|
|
|
|
def update_source_unit_index(writer, unit):
    '''
    Stores checksum, source and context of the unit through the writer.

    All three values are converted to unicode before being handed to
    writer.update_document().
    '''
    document = {
        'checksum': unicode(unit.checksum),
        'source': unicode(unit.source),
        'context': unicode(unit.context),
    }
    writer.update_document(**document)
|
|
|
|
|
|
def update_target_unit_index(writer, unit):
    '''
    Stores checksum and target of the unit through the writer.

    Both values are converted to unicode before being handed to
    writer.update_document().
    '''
    document = {
        'checksum': unicode(unit.checksum),
        'target': unicode(unit.target),
    }
    writer.update_document(**document)
|
|
|
|
|
|
def get_source_index():
    '''
    Opens the source index, creating it first when it does not exist.
    '''
    index_missing = not STORAGE.index_exists('source')
    if index_missing:
        create_source_index()
    return STORAGE.open_index('source')
|
|
|
|
|
|
def get_target_index(lang):
    '''
    Opens the target index for given language, creating it first when
    it does not exist.
    '''
    index_name = 'target-%s' % lang
    index_missing = not STORAGE.index_exists(index_name)
    if index_missing:
        create_target_index(lang)
    return STORAGE.open_index(index_name)
|
|
|
|
|
|
def update_index(units, source_units=None):
    '''
    Rebuilds fulltext index entries for given set of units.

    The source index is fed from source_units (falling back to units when
    not given); then, for every language, the units translated into that
    language with a non-empty target are written to its target index.
    '''
    languages = Language.objects.all()

    # Without an explicit source set, index sources of the same units
    if source_units is None:
        source_units = units

    # Source strings
    source_writer = BufferedWriter(get_source_index())
    try:
        for source_unit in source_units.iterator():
            update_source_unit_index(source_writer, source_unit)
    finally:
        # Close the writer even if indexing failed part way through
        source_writer.close()

    # Translations, one index per language
    for language in languages:
        target_writer = BufferedWriter(get_target_index(language.code))
        try:
            translated = units.filter(translation__language=language)
            translated = translated.exclude(target='')

            for target_unit in translated.iterator():
                update_target_unit_index(target_writer, target_unit)
        finally:
            target_writer.close()
|
|
|
|
|
|
def update_index_unit(unit, source=True):
    '''
    Indexes a single unit.

    When appsettings.OFFLOAD_INDEXING is set, only an IndexUpdate record
    is created and nothing is written to the index here. Otherwise the
    source index is updated when source is true, and the target index of
    the unit's language is updated when the unit has a non-empty target.
    '''
    # Indexing is offloaded: just record the request and leave
    if appsettings.OFFLOAD_INDEXING:
        from trans.models.unitdata import IndexUpdate
        IndexUpdate.objects.create(unit=unit, source=source)
        return

    # Source string
    if source:
        source_index = get_source_index()
        with AsyncWriter(source_index) as source_writer:
            update_source_unit_index(source_writer, unit)

    # Translation
    if unit.target != '':
        language_code = unit.translation.language.code
        target_index = get_target_index(language_code)
        with AsyncWriter(target_index) as target_writer:
            update_target_unit_index(target_writer, unit)
|
|
|
|
|
|
def base_search(searcher, field, schema, query):
    '''
    Wrapper for fulltext search.

    Parses query as a search on the given field of schema, runs it on
    the searcher and returns list of checksums of all matching documents.
    '''
    parser = qparser.QueryParser(field, schema)
    parsed = parser.parse(query)
    # Whoosh returns only the ten best hits by default; every matching
    # checksum is needed here, so the limit is disabled explicitly.
    hits = searcher.search(parsed, limit=None)
    return [hit['checksum'] for hit in hits]
|
|
|
|
|
|
def fulltext_search(query, lang, source=True, context=True, target=True):
    '''
    Runs fulltext search in the enabled areas, returns set of checksums.

    The source and context flags select fields of the source index, the
    target flag selects the target index of language lang.
    '''
    checksums = set()

    if source or context:
        with get_source_index().searcher() as searcher:
            # Both fields live in the same index, so share one searcher
            for enabled, field in ((source, 'source'), (context, 'context')):
                if enabled:
                    checksums.update(
                        base_search(searcher, field, SOURCE_SCHEMA, query)
                    )

    if target:
        with get_target_index(lang).searcher() as searcher:
            checksums.update(
                base_search(searcher, 'target', TARGET_SCHEMA, query)
            )

    return checksums
|
|
|
|
|
|
def more_like(checksum, source, top=5):
    '''
    Finds units with source similar to the given text.

    Looks up the document with given checksum in the source index and
    returns set of checksums of up to top similar documents; the set is
    empty when no document with that checksum is indexed.
    '''
    with get_source_index().searcher() as searcher:
        docnum = searcher.document_number(checksum=checksum)
        if docnum is None:
            return set()

        similar = searcher.more_like(docnum, 'source', source, top)

        # Collect checksums while the searcher is still open
        return set(hit['checksum'] for hit in similar)
|