import json
import os
import re
import shutil
import threading
import warnings
from django.conf import settings
from django.core.exceptions import ImproperlyConfigured
from django.utils.datetime_safe import date, datetime
from django.utils.encoding import force_str
from haystack.backends import (
BaseEngine,
BaseSearchBackend,
BaseSearchQuery,
EmptyResults,
log_query,
)
from haystack.constants import (
DJANGO_CT,
DJANGO_ID,
FUZZY_WHOOSH_MAX_EDITS,
FUZZY_WHOOSH_MIN_PREFIX,
ID,
)
from haystack.exceptions import MissingDependency, SearchBackendError, SkipDocument
from haystack.inputs import Clean, Exact, PythonData, Raw
from haystack.models import SearchResult
from haystack.utils import get_identifier, get_model_ct
from haystack.utils import log as logging
from haystack.utils.app_loading import haystack_get_model
from .chinese_anlyzer import ChineseAnalyzer
try:
import whoosh
except ImportError:
raise MissingDependency(
"The 'whoosh' backend requires the installation of 'Whoosh'. Please refer to the documentation."
)
# Handle minimum requirement.
if not hasattr(whoosh, "__version__") or whoosh.__version__ < (2, 5, 0):
raise MissingDependency("The 'whoosh' backend requires version 2.5.0 or greater.")
# Bubble up the correct error.
from whoosh import index
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import BOOLEAN, DATETIME
from whoosh.fields import ID as WHOOSH_ID
from whoosh.fields import IDLIST, KEYWORD, NGRAM, NGRAMWORDS, NUMERIC, TEXT, Schema
from whoosh.filedb.filestore import FileStorage, RamStorage
from whoosh.highlight import ContextFragmenter, HtmlFormatter
from whoosh.highlight import highlight as whoosh_highlight
from whoosh.qparser import FuzzyTermPlugin, QueryParser
from whoosh.searching import ResultsPage
from whoosh.sorting import Count, DateRangeFacet, FieldFacet
from whoosh.support.relativedelta import relativedelta as RelativeDelta
from whoosh.writing import AsyncWriter
DATETIME_REGEX = re.compile(
r"^(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})T(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})(\.\d{3,"
r"6}Z?)?$"
)
LOCALS = threading.local()
LOCALS.RAM_STORE = None
class WhooshHtmlFormatter(HtmlFormatter):
"""
This is a HtmlFormatter simpler than the whoosh.HtmlFormatter.
We use it to have consistent results across backends. Specifically,
Solr, Xapian and Elasticsearch are using this formatting.
"""
template = "<%(tag)s>%(t)s</%(tag)s>"
class WhooshSearchBackend(BaseSearchBackend):
# Word reserved by Whoosh for special use.
RESERVED_WORDS = ("AND", "NOT", "OR", "TO")
# Characters reserved by Whoosh for special use.
# The '\\' must come first, so as not to overwrite the other slash replacements.
RESERVED_CHARACTERS = (
"\\",
"+",
"-",
"&&",
"||",
"!",
"(",
")",
"{",
"}",
"[",
"]",
"^",
'"',
"~",
"*",
"?",
":",
".",
)
def __init__(self, connection_alias, **connection_options):
super().__init__(connection_alias, **connection_options)
self.setup_complete = False
self.use_file_storage = True
self.post_limit = getattr(connection_options, "POST_LIMIT", 128 * 1024 * 1024)
self.path = connection_options.get("PATH")
if connection_options.get("STORAGE", "file") != "file":
self.use_file_storage = False
if self.use_file_storage and not self.path:
raise ImproperlyConfigured(
"You must specify a 'PATH' in your settings for connection '%s'."
% connection_alias
)
self.log = logging.getLogger("haystack")
def setup(self):
"""
Defers loading until needed.
"""
from haystack import connections
new_index = False
# Make sure the index is there.
if self.use_file_storage and not os.path.exists(self.path):
os.makedirs(self.path)
new_index = True
if self.use_file_storage and not os.access(self.path, os.W_OK):
raise IOError(
"The path to your Whoosh index '%s' is not writable for the current user/group."
% self.path
)
if self.use_file_storage:
self.storage = FileStorage(self.path)
else:
global LOCALS
if getattr(LOCALS, "RAM_STORE", None) is None:
LOCALS.RAM_STORE = RamStorage()
self.storage = LOCALS.RAM_STORE
self.content_field_name, self.schema = self.build_schema(
connections[self.connection_alias].get_unified_index().all_searchfields()
)
self.parser = QueryParser(self.content_field_name, schema=self.schema)
self.parser.add_plugins([FuzzyTermPlugin])
if new_index is True:
self.index = self.storage.create_index(self.schema)
else:
try:
self.index = self.storage.open_index(schema=self.schema)
except index.EmptyIndexError:
self.index = self.storage.create_index(self.schema)
self.setup_complete = True
def build_schema(self, fields):
schema_fields = {
ID: WHOOSH_ID(stored=True, unique=True),
DJANGO_CT: WHOOSH_ID(stored=True),
DJANGO_ID: WHOOSH_ID(stored=True),
}
# Grab the number of keys that are hard-coded into Haystack.
# We'll use this to (possibly) fail slightly more gracefully later.
initial_key_count = len(schema_fields)
content_field_name = ""
for _, field_class in fields.items():
if field_class.is_multivalued:
if field_class.indexed is False:
schema_fields[field_class.index_fieldname] = IDLIST(
stored=True, field_boost=field_class.boost
)
else:
schema_fields[field_class.index_fieldname] = KEYWORD(
stored=True,
commas=True,
scorable=True,
field_boost=field_class.boost,
)
elif field_class.field_type in ["date", "datetime"]:
schema_fields[field_class.index_fieldname] = DATETIME(
stored=field_class.stored, sortable=True
)
elif field_class.field_type == "integer":
schema_fields[field_class.index_fieldname] = NUMERIC(
stored=field_class.stored,
numtype=int,
field_boost=field_class.boost,
)
elif field_class.field_type == "float":
schema_fields[field_class.index_fieldname] = NUMERIC(
stored=field_class.stored,
numtype=float,
field_boost=field_class.boost,
)
elif field_class.field_type == "boolean":
# Field boost isn't supported on BOOLEAN as of 1.8.2.
schema_fields[field_class.index_fieldname] = BOOLEAN(
stored=field_class.stored
)
elif field_class.field_type == "ngram":
schema_fields[field_class.index_fieldname] = NGRAM(
minsize=3,
maxsize=15,
stored=field_class.stored,
field_boost=field_class.boost,
)
elif field_class.field_type == "edge_ngram":
schema_fields[field_class.index_fieldname] = NGRAMWORDS(
minsize=2,
maxsize=15,
at="start",
stored=field_class.stored,
field_boost=field_class.boost,
)
else: