赞
踩
1、什么是haystack?
1. haystack是django的开源搜索框架,该框架支持Solr,Elasticsearch,Whoosh, *Xapian*搜索引擎,不用更改代码,直接切换引擎,减少代码量。
2. 搜索引擎使用Whoosh,这是一个由纯Python实现的全文搜索引擎,没有二进制文件等,比较小巧,配置比较简单,当然性能自然略低。
3. 中文分词Jieba,由于Whoosh自带的是英文分词,对中文的分词支持不是太好,故用jieba替换whoosh的分词组件。
1、很多搜索引擎对中文的支持不够友好,jieba 作为一个中文分词器,可以加强对中文的检索功能
1、Python的全文搜索库,Whoosh是索引文本及搜索文本的类和函数库
2、Whoosh 自带的是英文分词,对中文分词支持不太好,使用 jieba 替换 whoosh 的分词组件。
pip install django-haystack
pip install whoosh
pip install jieba
在settings.py中配置
# --- Register apps ---
INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    # haystack must be listed above the project's own apps
    'haystack',
    'jsapp',  # jsapp is the app created for this project
]

# --- Template path ---
TEMPLATES = [
    {
        'DIRS': [os.path.join(BASE_DIR, 'templates')],
    },
]

# --- Haystack (full-text search framework) configuration ---
HAYSTACK_CONNECTIONS = {
    'default': {
        # Use the Whoosh engine
        'ENGINE': 'haystack.backends.whoosh_backend.WhooshEngine',
        # 'ENGINE': 'jsapp.whoosh_cn_backend.WhooshEngine',  # whoosh_cn_backend is a renamed copy of haystack's whoosh_backend.py, used to enable jieba tokenization
        # Directory that holds the index files
        'PATH': os.path.join(BASE_DIR, 'whoosh_index'),
    }
}
# With this setting the index is updated automatically whenever the database changes
HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.RealtimeSignalProcessor'
定义数据库 ( jsapp/models.py)
- from django.db import models
-
- # Create your models here.
class UserInfo(models.Model):
    """Author record; ``ArticlePost.author`` points at this model."""

    name = models.CharField(max_length=254)
    age = models.IntegerField()
-
-
class ArticlePost(models.Model):
    """Article whose title, author name and body feed the search index template."""

    # Deleting the author deletes their articles as well.
    author = models.ForeignKey(UserInfo, on_delete=models.CASCADE)
    title = models.CharField(max_length=200)
    # NOTE(review): SlugField only validates letters, digits, underscores and
    # hyphens -- confirm this is intended for a free-text description.
    desc = models.SlugField(max_length=500)
    body = models.TextField()
1)在子应用下创建索引文件
在子应用的目录下,创建一个名为 jsapp/search_indexes.py 的文件
- from haystack import indexes
- from .models import ArticlePost
-
- # 修改此处,类名为模型类的名称+Index,比如模型类为GoodsInfo,则这里类名为GoodsInfoIndex(其实可以随便写)
class ArticlePostIndex(indexes.SearchIndex, indexes.Indexable):
    """Haystack search index for the ``ArticlePost`` model."""

    # ``text`` is the index field.
    # document=True: haystack and the search engine use this field's content
    # as the primary document to search against.
    # use_template=True: the model fields that make up the document are
    # listed in a separate template file.
    text = indexes.CharField(document=True, use_template=True)

    def get_model(self):
        """Return the model class this index covers (this override is required)."""
        return ArticlePost

    def index_queryset(self, using=None):
        """Return the objects to index; here, every ``ArticlePost`` row."""
        model = self.get_model()
        return model.objects.all()
2)指定索引模板文件
# 文件路径的命名必须遵循这个规范:templates/search/indexes/应用名称/模型类名称_text.txt
# 例如:templates/search/indexes/jsapp/articlepost_text.txt
templates/search/indexes/jsapp/articlepost_text.txt
- {{ object.title }}
- {{ object.author.name }}
- {{ object.body }}
3)使用命令创建索引
python manage.py rebuild_index # 建立索引文件
1)将haystack源码复制到项目中并改名
'''1.复制源码中文件并改名 '''
将 C:\python37\Lib\site-packages\haystack\backends\whoosh_backend.py 文件复制到项目中,
并将 whoosh_backend.py 改名为 whoosh_cn_backend.py,放在APP中,如:jsapp\whoosh_cn_backend.py
'''2.修改源码中文件'''
# 在全局引入的最后一行加入jieba分词器
from jieba.analyse import ChineseAnalyzer
# 修改为中文分词法
查找 analyzer=StemmingAnalyzer()
改为 analyzer=ChineseAnalyzer()
- # encoding: utf-8
-
- from __future__ import absolute_import, division, print_function, unicode_literals
-
- import json
- import os
- import re
- import shutil
- import threading
- import warnings
-
- from django.conf import settings
- from django.core.exceptions import ImproperlyConfigured
- from django.utils import six
- from django.utils.datetime_safe import datetime
- from django.utils.encoding import force_text
-
- from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, EmptyResults, log_query
- from haystack.constants import DJANGO_CT, DJANGO_ID, ID
- from haystack.exceptions import MissingDependency, SearchBackendError, SkipDocument
- from haystack.inputs import Clean, Exact, PythonData, Raw
- from haystack.models import SearchResult
- from haystack.utils import log as logging
- from haystack.utils import get_identifier, get_model_ct
- from haystack.utils.app_loading import haystack_get_model
-
- try:
- import whoosh
- except ImportError:
- raise MissingDependency(
- "The 'whoosh' backend requires the installation of 'Whoosh'. Please refer to the documentation.")
-
- # Handle minimum requirement.
- if not hasattr(whoosh, '__version__') or whoosh.__version__ < (2, 5, 0):
- raise MissingDependency("The 'whoosh' backend requires version 2.5.0 or greater.")
-
- # Bubble up the correct error.
- from whoosh import index
- from whoosh.analysis import StemmingAnalyzer
- from whoosh.fields import ID as WHOOSH_ID
- from whoosh.fields import BOOLEAN, DATETIME, IDLIST, KEYWORD, NGRAM, NGRAMWORDS, NUMERIC, Schema, TEXT
- from whoosh.filedb.filestore import FileStorage, RamStorage
- from whoosh.highlight import highlight as whoosh_highlight
- from whoosh.highlight import ContextFragmenter, HtmlFormatter
- from whoosh.qparser import QueryParser
- from whoosh.searching import ResultsPage
- from whoosh.writing import AsyncWriter
-
# Matches timestamps of the form YYYY-MM-DDTHH:MM:SS with an optional
# fractional-seconds part (3-6 digits, optionally followed by "Z").
# A raw string is used so that ``\d`` and ``\.`` reach the regex engine
# untouched instead of being treated as (invalid) string escape sequences.
DATETIME_REGEX = re.compile(
    r'^(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})T(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})(\.\d{3,6}Z?)?$')
# Thread-local holder for the RAM storage used when the backend is not
# configured for file storage.
LOCALS = threading.local()
LOCALS.RAM_STORE = None
-
-
class WhooshHtmlFormatter(HtmlFormatter):
    """
    A pared-down HtmlFormatter that wraps each highlighted term in a bare tag.

    It exists so that highlighting output looks the same across backends:
    Solr, Xapian and Elasticsearch use this formatting.
    """

    template = '<%(tag)s>%(t)s</%(tag)s>'
-
-
- class WhooshSearchBackend(BaseSearchBackend):
- # Word reserved by Whoosh for special use.
- RESERVED_WORDS = (
- 'AND',
- 'NOT',
- 'OR',
- 'TO',
- )
-
- # Characters reserved by Whoosh for special use.
- # The '\\' must come first, so as not to overwrite the other slash replacements.
- RESERVED_CHARACTERS = (
- '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
- '[', ']', '^', '"', '~', '*', '?', ':', '.',
- )
-
def __init__(self, connection_alias, **connection_options):
    """
    Record the connection settings; the index itself is opened lazily by ``setup()``.

    Options read from ``connection_options``:
      * ``POST_LIMIT`` -- stored as ``self.post_limit`` (default 128 MiB).
      * ``PATH``       -- index directory; required when file storage is used.
      * ``STORAGE``    -- anything other than ``'file'`` disables file storage.

    Raises ``ImproperlyConfigured`` when file storage is selected without a ``PATH``.
    """
    super(WhooshSearchBackend, self).__init__(connection_alias, **connection_options)
    self.setup_complete = False
    self.use_file_storage = True
    # ``connection_options`` is a plain dict, so the option has to be read
    # with .get(); getattr() on a dict never finds the key and always
    # returned the default.
    self.post_limit = connection_options.get('POST_LIMIT', 128 * 1024 * 1024)
    self.path = connection_options.get('PATH')

    if connection_options.get('STORAGE', 'file') != 'file':
        self.use_file_storage = False

    if self.use_file_storage and not self.path:
        raise ImproperlyConfigured(
            "You must specify a 'PATH' in your settings for connection '%s'." % connection_alias)

    self.log = logging.getLogger('haystack')
-
def setup(self):
    """
    Open (or create) the storage and index on first use.

    Builds the schema from the unified index's search fields, creates the
    query parser, and sets ``self.setup_complete`` when done. Raises
    ``IOError`` when the file-storage path is not writable.
    """
    from haystack import connections

    created_directory = False

    if self.use_file_storage:
        # Make sure the index directory is there.
        if not os.path.exists(self.path):
            os.makedirs(self.path)
            created_directory = True

        if not os.access(self.path, os.W_OK):
            raise IOError("The path to your Whoosh index '%s' is not writable for the current user/group." % self.path)

        self.storage = FileStorage(self.path)
    else:
        # Reuse the RAM store kept on the thread-local, creating it on demand.
        if getattr(LOCALS, 'RAM_STORE', None) is None:
            LOCALS.RAM_STORE = RamStorage()

        self.storage = LOCALS.RAM_STORE

    unified_index = connections[self.connection_alias].get_unified_index()
    self.content_field_name, self.schema = self.build_schema(unified_index.all_searchfields())
    self.parser = QueryParser(self.content_field_name, schema=self.schema)

    if created_directory:
        # A directory we just created cannot contain an index yet.
        self.index = self.storage.create_index(self.schema)
    else:
        try:
            self.index = self.storage.open_index(schema=self.schema)
        except index.EmptyIndexError:
            self.index = self.storage.create_index(self.schema)

    self.setup_complete = True
-
def build_schema(self, fields):
    """
    Translate haystack search fields into a Whoosh schema.

    ``fields`` maps field names to haystack field objects. Returns a tuple
    ``(content_field_name, Schema)`` where ``content_field_name`` is the
    index name of the field flagged ``document=True`` (``''`` if none).
    Plain text fields are analyzed with ``ChineseAnalyzer``.

    Raises ``SearchBackendError`` when ``fields`` contributes nothing beyond
    the three built-in ID fields.
    """
    schema_fields = {
        ID: WHOOSH_ID(stored=True, unique=True),
        DJANGO_CT: WHOOSH_ID(stored=True),
        DJANGO_ID: WHOOSH_ID(stored=True),
    }
    # Remember how many keys are hard-coded into Haystack so an empty
    # ``fields`` mapping can be detected below.
    builtin_count = len(schema_fields)
    content_field_name = ''

    for field_class in fields.values():
        name = field_class.index_fieldname

        if field_class.is_multivalued:
            if field_class.indexed is False:
                whoosh_field = IDLIST(stored=True, field_boost=field_class.boost)
            else:
                whoosh_field = KEYWORD(stored=True, commas=True, scorable=True,
                                       field_boost=field_class.boost)
        elif field_class.field_type in ('date', 'datetime'):
            whoosh_field = DATETIME(stored=field_class.stored, sortable=True)
        elif field_class.field_type == 'integer':
            whoosh_field = NUMERIC(stored=field_class.stored, numtype=int,
                                   field_boost=field_class.boost)
        elif field_class.field_type == 'float':
            whoosh_field = NUMERIC(stored=field_class.stored, numtype=float,
                                   field_boost=field_class.boost)
        elif field_class.field_type == 'boolean':
            # Field boost isn't supported on BOOLEAN as of 1.8.2.
            whoosh_field = BOOLEAN(stored=field_class.stored)
        elif field_class.field_type == 'ngram':
            whoosh_field = NGRAM(minsize=3, maxsize=15, stored=field_class.stored,
                                 field_boost=field_class.boost)
        elif field_class.field_type == 'edge_ngram':
            whoosh_field = NGRAMWORDS(minsize=2, maxsize=15, at='start',
                                      stored=field_class.stored,
                                      field_boost=field_class.boost)
        else:
            # Default: full-text field tokenized by jieba's analyzer.
            whoosh_field = TEXT(stored=True, analyzer=ChineseAnalyzer(),
                                field_boost=field_class.boost, sortable=True)

        schema_fields[name] = whoosh_field

        if field_class.document is True:
            content_field_name = name
            schema_fields[name].spelling = True

    # Fail more gracefully than relying on the backend to die if no fields
    # are found.
    if len(schema_fields) <= builtin_count:
        raise SearchBackendError(
            "No fields were found in any search_indexes. Please correct this before attempting to search.")

    return (content_field_name, Schema(**schema_fields))
-
def update(self, index, iterable, commit=True):
    """
    Write every object in ``iterable`` to the Whoosh index.

    ``index`` is the haystack search index used to prepare each object.
    Objects whose preparation raises ``SkipDocument`` are skipped. Write
    errors are re-raised unless ``self.silently_fail`` is set, in which
    case they are logged. The writer is committed whenever ``iterable`` is
    non-empty, regardless of ``commit``.
    """
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    writer = AsyncWriter(self.index)

    for obj in iterable:
        try:
            doc = index.full_prepare(obj)
        except SkipDocument:
            self.log.debug(u"Indexing for object `%s` skipped", obj)
            continue

        # Really make sure it's unicode, because Whoosh won't have it any
        # other way.
        for key in doc:
            doc[key] = self._from_python(doc[key])

        # Document boosts aren't supported in Whoosh 2.5.0+.
        doc.pop('boost', None)

        try:
            writer.update_document(**doc)
        except Exception as e:
            if not self.silently_fail:
                raise

            # Log only the object identifier, not the object itself, so the
            # log message cannot trip over encoding errors.
            self.log.error(u"%s while preparing object for update" % e.__class__.__name__,
                           exc_info=True, extra={"data": {"index": index,
                                                          "object": get_identifier(obj)}})

    if len(iterable) > 0:
        # For now, commit no matter what, as we run into locking issues otherwise.
        writer.commit()
-
def remove(self, obj_or_string, commit=True):
    """
    Delete the document identified by ``obj_or_string`` from the index.

    Failures are re-raised unless ``self.silently_fail`` is set, in which
    case they are logged.
    """
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    whoosh_id = get_identifier(obj_or_string)

    try:
        id_query = self.parser.parse(u'%s:"%s"' % (ID, whoosh_id))
        self.index.delete_by_query(q=id_query)
    except Exception as e:
        if not self.silently_fail:
            raise

        self.log.error("Failed to remove document '%s' from Whoosh: %s", whoosh_id, e, exc_info=True)
-
def clear(self, models=None, commit=True):
    """
    Remove documents from the index.

    With ``models=None`` the whole index is deleted and recreated;
    otherwise ``models`` must be a list or tuple and only documents of
    those models are deleted. Failures are re-raised unless
    ``self.silently_fail`` is set, in which case they are logged.
    """
    if not self.setup_complete:
        self.setup()

    self.index = self.index.refresh()
    clear_everything = models is None

    if not clear_everything:
        assert isinstance(models, (list, tuple))

    try:
        if clear_everything:
            self.delete_index()
        else:
            # Built incrementally so the error handler below can report
            # whatever was collected before a failure.
            models_to_delete = []

            for model in models:
                models_to_delete.append(u"%s:%s" % (DJANGO_CT, get_model_ct(model)))

            self.index.delete_by_query(q=self.parser.parse(u" OR ".join(models_to_delete)))
    except Exception as e:
        if not self.silently_fail:
            raise

        if clear_everything:
            self.log.error("Failed to clear Whoosh index: %s", e, exc_info=True)
        else:
            self.log.error("Failed to clear Whoosh index of models '%s': %s", ','.join(models_to_delete),
                           e, exc_info=True)
-
def delete_index(self):
    """Wipe the index storage and rebuild an empty index via ``setup()``."""
    # Per the Whoosh mailing list, if wiping out everything from the index,
    # it's much more efficient to simply delete the index files.
    if self.use_file_storage:
        if os.path.exists(self.path):
            shutil.rmtree(self.path)
    else:
        self.storage.clean()

    # Recreate everything.
    self.setup()
-
def optimize(self):
    """Refresh the index handle and ask Whoosh to optimize the index."""
    if not self.setup_complete:
        self.setup()

    refreshed = self.index.refresh()
    self.index = refreshed
    refreshed.optimize()
-
def calculate_page(self, start_offset=0, end_offset=None):
    """
    Convert a ``[start_offset, end_offset)`` slice into Whoosh paging terms.

    Returns ``(page_num, page_length)`` where ``page_num`` is 1-based.
    ``end_offset=None`` is treated as 1000000, a non-positive
    ``end_offset`` as 1, and ``start_offset=None`` as 0.
    """
    if end_offset is None:
        end_offset = 1000000
    elif end_offset <= 0:
        # Whoosh throws an error unless the end offset is greater than 0.
        end_offset = 1

    if start_offset is None:
        start_offset = 0

    page_length = end_offset - start_offset

    # Whoosh uses 1-based page numbers, so start from 1 and add the number
    # of whole pages that precede ``start_offset``.
    page_num = 1

    if page_length > 0:
        page_num += int(start_offset / page_length)

    return page_num, page_length
-
@log_query
def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
           fields='', highlight=False, facets=None, date_facets=None, query_facets=None,
           narrow_queries=None, spelling_query=None, within=None,
           dwithin=None, distance_point=None, models=None,
           limit_to_registered_models=None, result_class=None, **kwargs):
    """
    Run ``query_string`` against the Whoosh index.

    Returns a dict that always contains ``results`` and ``hits``; a full
    result set (from ``_process_results``) also carries ``facets`` and
    ``spelling_suggestion``.

    * Empty and one-character (non-``*``) queries return no results.
    * ``sort_by`` entries may be prefixed with ``-`` for descending order,
      but all entries must share one direction, otherwise
      ``SearchBackendError`` is raised.
    * ``facets`` / ``date_facets`` / ``query_facets`` only trigger a warning.
    * ``narrow_queries`` (a set; it is added to in place) and the model
      restriction are applied as a Whoosh filter.

    Both searchers opened here are closed on every exit path, including
    the early returns and exceptions.
    """
    if not self.setup_complete:
        self.setup()

    # A zero length query should return no results.
    if len(query_string) == 0:
        return {
            'results': [],
            'hits': 0,
        }

    query_string = force_text(query_string)

    # A one-character query (non-wildcard) gets nabbed by a stopwords
    # filter and should yield zero results.
    if len(query_string) <= 1 and query_string != u'*':
        return {
            'results': [],
            'hits': 0,
        }

    reverse = False

    if sort_by is not None:
        # Reversing is an all-or-nothing action in Whoosh: either every
        # field is descending or none is.
        descending_count = sum(1 for order_by in sort_by if order_by.startswith('-'))

        if descending_count and descending_count != len(sort_by):
            raise SearchBackendError("Whoosh requires all order_by fields"
                                     " to use the same sort direction")

        reverse = descending_count > 0
        # Strip the direction prefix; Whoosh wants bare field names.
        sort_by = [order_by[1:] if order_by.startswith('-') else order_by
                   for order_by in sort_by]

    if facets is not None:
        warnings.warn("Whoosh does not handle faceting.", Warning, stacklevel=2)

    if date_facets is not None:
        warnings.warn("Whoosh does not handle date faceting.", Warning, stacklevel=2)

    if query_facets is not None:
        warnings.warn("Whoosh does not handle query faceting.", Warning, stacklevel=2)

    narrowed_results = None
    self.index = self.index.refresh()

    if limit_to_registered_models is None:
        limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

    if models and len(models):
        model_choices = sorted(get_model_ct(model) for model in models)
    elif limit_to_registered_models:
        # Using narrow queries, limit the results to only models handled
        # with the current routers.
        model_choices = self.build_models_list()
    else:
        model_choices = []

    if len(model_choices) > 0:
        if narrow_queries is None:
            narrow_queries = set()

        narrow_queries.add(' OR '.join(['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

    narrow_searcher = None
    searcher = None

    try:
        if narrow_queries is not None:
            # Potentially expensive? I don't see another way to do it in Whoosh...
            narrow_searcher = self.index.searcher()

            for nq in narrow_queries:
                recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_text(nq)),
                                                                 limit=None)

                if len(recent_narrowed_results) <= 0:
                    return {
                        'results': [],
                        'hits': 0,
                    }

                if narrowed_results:
                    narrowed_results.filter(recent_narrowed_results)
                else:
                    narrowed_results = recent_narrowed_results

        self.index = self.index.refresh()

        if not self.index.doc_count():
            # Empty index: nothing to search, only a spelling suggestion.
            if self.include_spelling:
                spelling_suggestion = self.create_spelling_suggestion(spelling_query or query_string)
            else:
                spelling_suggestion = None

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': spelling_suggestion,
            }

        searcher = self.index.searcher()
        parsed_query = self.parser.parse(query_string)

        # In the event of an invalid/stopworded query, recover gracefully.
        if parsed_query is None:
            return {
                'results': [],
                'hits': 0,
            }

        page_num, page_length = self.calculate_page(start_offset, end_offset)

        search_kwargs = {
            'pagelen': page_length,
            'sortedby': sort_by,
            'reverse': reverse,
        }

        # Handle the case where the results have been narrowed.
        if narrowed_results is not None:
            search_kwargs['filter'] = narrowed_results

        try:
            raw_page = searcher.search_page(
                parsed_query,
                page_num,
                **search_kwargs
            )
        except ValueError:
            if not self.silently_fail:
                raise

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        # Because as of Whoosh 2.5.1, it will return the wrong page of
        # results if you request something too high. :(
        if raw_page.pagenum < page_num:
            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        # Results must be materialized while the searchers are still open.
        return self._process_results(raw_page, highlight=highlight, query_string=query_string,
                                     spelling_query=spelling_query, result_class=result_class)
    finally:
        # Release the searchers on every exit path, not just the success path.
        if searcher is not None:
            searcher.close()

        if narrow_searcher is not None:
            narrow_searcher.close()
-
def more_like_this(self, model_instance, additional_query_string=None,
                   start_offset=0, end_offset=None, models=None,
                   limit_to_registered_models=None, result_class=None, **kwargs):
    """
    Find documents similar to ``model_instance`` based on the content field.

    Results are narrowed to ``models`` (or the registered models) and, when
    given and not ``'*'``, to ``additional_query_string``. Returns the same
    dict shape as ``search``.

    Both searchers opened here are closed on every exit path, including
    the early returns and exceptions.
    """
    if not self.setup_complete:
        self.setup()

    field_name = self.content_field_name
    narrow_queries = set()
    narrowed_results = None
    self.index = self.index.refresh()

    if limit_to_registered_models is None:
        limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

    if models and len(models):
        model_choices = sorted(get_model_ct(model) for model in models)
    elif limit_to_registered_models:
        # Using narrow queries, limit the results to only models handled
        # with the current routers.
        model_choices = self.build_models_list()
    else:
        model_choices = []

    if len(model_choices) > 0:
        narrow_queries.add(' OR '.join(['%s:%s' % (DJANGO_CT, rm) for rm in model_choices]))

    if additional_query_string and additional_query_string != '*':
        narrow_queries.add(additional_query_string)

    narrow_searcher = None
    searcher = None

    try:
        # Potentially expensive? I don't see another way to do it in Whoosh...
        narrow_searcher = self.index.searcher()

        for nq in narrow_queries:
            recent_narrowed_results = narrow_searcher.search(self.parser.parse(force_text(nq)),
                                                             limit=None)

            if len(recent_narrowed_results) <= 0:
                return {
                    'results': [],
                    'hits': 0,
                }

            if narrowed_results:
                narrowed_results.filter(recent_narrowed_results)
            else:
                narrowed_results = recent_narrowed_results

        page_num, page_length = self.calculate_page(start_offset, end_offset)

        self.index = self.index.refresh()
        raw_results = EmptyResults()

        if self.index.doc_count():
            # Look up the source document, then ask Whoosh for its neighbours.
            query = "%s:%s" % (ID, get_identifier(model_instance))
            searcher = self.index.searcher()
            parsed_query = self.parser.parse(query)
            results = searcher.search(parsed_query)

            if len(results):
                raw_results = results[0].more_like_this(field_name, top=end_offset)

            # Handle the case where the results have been narrowed.
            if narrowed_results is not None and hasattr(raw_results, 'filter'):
                raw_results.filter(narrowed_results)

        try:
            raw_page = ResultsPage(raw_results, page_num, page_length)
        except ValueError:
            if not self.silently_fail:
                raise

            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        # Because as of Whoosh 2.5.1, it will return the wrong page of
        # results if you request something too high. :(
        if raw_page.pagenum < page_num:
            return {
                'results': [],
                'hits': 0,
                'spelling_suggestion': None,
            }

        # Results must be materialized while the searchers are still open.
        return self._process_results(raw_page, result_class=result_class)
    finally:
        # Release the searchers on every exit path, not just the success path.
        if searcher is not None:
            searcher.close()

        if narrow_searcher is not None:
            narrow_searcher.close()
-
def _process_results(self, raw_page, highlight=False, query_string='', spelling_query=None, result_class=None):
    """
    Convert a Whoosh results page into haystack's result dict.

    Each hit whose model is among the indexed models becomes a
    ``result_class`` instance (``SearchResult`` by default); other hits are
    dropped and subtracted from ``hits``. When ``highlight`` is set, a
    ``highlighted`` entry for the content field is added to each result.

    Returns a dict with ``results``, ``hits``, ``facets`` (always empty)
    and ``spelling_suggestion``.
    """
    from haystack import connections
    results = []

    # It's important to grab the hits first before slicing. Otherwise, this
    # can cause pagination failures.
    hits = len(raw_page)

    if result_class is None:
        result_class = SearchResult

    facets = {}
    spelling_suggestion = None
    unified_index = connections[self.connection_alias].get_unified_index()
    indexed_models = unified_index.get_indexed_models()

    for doc_offset, raw_result in enumerate(raw_page):
        score = raw_page.score(doc_offset) or 0
        app_label, model_name = raw_result[DJANGO_CT].split('.')
        additional_fields = {}
        model = haystack_get_model(app_label, model_name)

        if not (model and model in indexed_models):
            hits -= 1
            continue

        # Looked up once per hit rather than once per stored field; named
        # ``search_index`` so it does not shadow the whoosh ``index`` module.
        search_index = unified_index.get_index(model)

        for key, value in raw_result.items():
            string_key = str(key)

            if string_key in search_index.fields and hasattr(search_index.fields[string_key], 'convert'):
                field = search_index.fields[string_key]

                # Special-cased due to the nature of KEYWORD fields.
                if field.is_multivalued:
                    # Compare by value: ``len(value) is 0`` relied on
                    # small-int identity and is a SyntaxWarning on
                    # current Python versions.
                    if value is None or len(value) == 0:
                        additional_fields[string_key] = []
                    else:
                        additional_fields[string_key] = value.split(',')
                else:
                    additional_fields[string_key] = field.convert(value)
            else:
                additional_fields[string_key] = self._to_python(value)

        del (additional_fields[DJANGO_CT])
        del (additional_fields[DJANGO_ID])

        if highlight:
            # NOTE(review): highlighting still tokenizes with
            # StemmingAnalyzer while the schema uses ChineseAnalyzer --
            # confirm whether Chinese text should be highlighted with the
            # same analyzer.
            sa = StemmingAnalyzer()
            formatter = WhooshHtmlFormatter('em')
            terms = [token.text for token in sa(query_string)]

            whoosh_result = whoosh_highlight(
                additional_fields.get(self.content_field_name),
                terms,
                sa,
                ContextFragmenter(),
                formatter
            )
            additional_fields['highlighted'] = {
                self.content_field_name: [whoosh_result],
            }

        result = result_class(app_label, model_name, raw_result[DJANGO_ID], score, **additional_fields)
        results.append(result)

    if self.include_spelling:
        if spelling_query:
            spelling_suggestion = self.create_spelling_suggestion(spelling_query)
        else:
            spelling_suggestion = self.create_spelling_suggestion(query_string)

    return {
        'results': results,
        'hits': hits,
        'facets': facets,
        'spelling_suggestion': spelling_suggestion,
    }
-
def create_spelling_suggestion(self, query_string):
    """
    Build a spelling suggestion for ``query_string`` from the content field.

    Reserved words and characters are stripped, then each remaining word is
    replaced by the corrector's top suggestion (words without a suggestion
    are dropped). Returns the suggestions joined by spaces, or ``None``
    when ``query_string`` is empty.
    """
    # Bail out before touching the index: an empty query needs no reader.
    if not query_string:
        return None

    cleaned_query = force_text(query_string)

    # Clean the string.
    for rev_word in self.RESERVED_WORDS:
        cleaned_query = cleaned_query.replace(rev_word, '')

    for rev_char in self.RESERVED_CHARACTERS:
        cleaned_query = cleaned_query.replace(rev_char, '')

    # Break it down.
    query_words = cleaned_query.split()
    suggested_words = []

    reader = self.index.reader()

    try:
        corrector = reader.corrector(self.content_field_name)

        for word in query_words:
            suggestions = corrector.suggest(word, limit=1)

            if len(suggestions) > 0:
                suggested_words.append(suggestions[0])
    finally:
        # The reader was previously never closed.
        reader.close()

    return ' '.join(suggested_words)
-
- def _from_python(self, value):
- """
- Converts Python values to a string for Whoosh.
- Code courtesy of pysolr.
- """
- if hasattr(value, 'strftime'):
- if not hasattr(value, 'hour'):
- value = datetime(value.year, value.month, value.day, 0, 0, 0)
- elif isinstance(value, bool):
- if value:
- value = 'true'
- else:
- value = 'false'
- elif isinstance(value, (list, tuple)):
- value = u','.join([force_text(v) for v in value])
- elif isinstance(value, (six.integer_types, float)):
- # Leave it alone.
- pass
- else:
- value = force_text(value)
- return value
-
- def _to_python(self, value):
- """
- Converts values from Whoosh to native Python values.
- A port of the same method in pysolr, as they deal with data the same way.
- """
- if value == 'true':
- return True
- elif value == 'false':
- return False
-
- if value and isinstance(value, six.string_types):
- possible_datetime = DATETIME_REGEX.search(value)
-
- if possible_datetime:
- date_values = possible_datetime.groupdict()
-
- for dk, dv in date_values.items():
- date_values[dk] = int(dv)
-
- return datetime(date_values['year'], date_values['month'], date_values['day'], date_values['hour'],
- date_values['minute'], date_values['second'])
-
- try:
- # Attempt to use json to load the values.
- converted_value = json.loads(value)
-
- # Try to handle most built-in types.
- if isinstance(converted_value, (list, tuple, set, dict, six.integer_types, float, complex)):
- return converted_value
- except:
- # If it fails (SyntaxError or its ilk) or we don't trust it,
- # continue on.
- pass
-
- return value
-
-
- class WhooshSearchQuery(BaseSearchQuery):
def _convert_datetime(self, date):
    """Format a date or datetime as ``YYYYmmddHHMMSS`` text; dates get a zeroed time part."""
    pattern = '%Y%m%d%H%M%S' if hasattr(date, 'hour') else '%Y%m%d000000'
    return force_text(date.strftime(pattern))
-
def clean(self, query_fragment):
    """
    Sanitize user input before it is handed to the backend.

    Reserved words are lower-cased, and any word containing a reserved
    character is wrapped in single quotes -- Whoosh 1.X no longer accepts
    a backslash to escape reserved characters, so the whole word is quoted
    instead. Words are re-joined with single spaces.
    """
    reserved_words = self.backend.RESERVED_WORDS
    reserved_chars = self.backend.RESERVED_CHARACTERS
    sanitized = []

    for token in query_fragment.split():
        if token in reserved_words:
            token = token.lower()

        if any(char in token for char in reserved_chars):
            token = "'%s'" % token

        sanitized.append(token)

    return ' '.join(sanitized)
-
def build_query_fragment(self, field, filter_type, value):
    """
    Build the Whoosh query text for one ``field`` / ``filter_type`` / ``value`` filter.

    Plain values are wrapped in ``Clean`` (strings) or ``PythonData``
    (everything else) before being prepared. The result is the index field
    name followed by ``:`` and the fragment; the special field ``content``
    yields the fragment alone.
    """
    from haystack import connections
    query_frag = ''
    is_datetime = False

    if not hasattr(value, 'input_type_name'):
        # Handle when we've got a ``ValuesListQuerySet``...
        if hasattr(value, 'values_list'):
            value = list(value)

        if hasattr(value, 'strftime'):
            is_datetime = True

        if isinstance(value, six.string_types) and value != ' ':
            # It's not an ``InputType``. Assume ``Clean``.
            value = Clean(value)
        else:
            value = PythonData(value)

    # Prepare the query using the InputType.
    prepared_value = value.prepare(self)

    if not isinstance(prepared_value, (set, list, tuple)):
        # Then convert whatever we get back to what pysolr wants if needed.
        prepared_value = self.backend._from_python(prepared_value)

    # 'content' is a special reserved word, much like 'pk' in
    # Django's ORM layer. It indicates 'no special field'.
    if field == 'content':
        index_fieldname = ''
    else:
        index_fieldname = u'%s:' % connections[self._using].get_unified_index().get_index_fieldname(field)

    filter_types = {
        'content': '%s',
        'contains': '*%s*',
        'endswith': "*%s",
        'startswith': "%s*",
        'exact': '%s',
        'gt': "{%s to}",
        'gte': "[%s to]",
        'lt': "{to %s}",
        'lte': "[to %s]",
        'fuzzy': u'%s~',
    }

    if value.post_process is False:
        query_frag = prepared_value
    elif filter_type in ['content', 'contains', 'startswith', 'endswith', 'fuzzy']:
        if value.input_type_name == 'exact':
            query_frag = prepared_value
        else:
            # Iterate over terms & incorporate the converted form of each into the query.
            if isinstance(prepared_value, six.string_types):
                possible_values = prepared_value.split(' ')
            else:
                if is_datetime is True:
                    prepared_value = self._convert_datetime(prepared_value)

                possible_values = [prepared_value]

            template = filter_types[filter_type]
            terms = [template % self.backend._from_python(possible_value)
                     for possible_value in possible_values]

            if len(terms) == 1:
                query_frag = terms[0]
            else:
                query_frag = u"(%s)" % " AND ".join(terms)
    elif filter_type == 'in':
        in_options = []

        for possible_value in prepared_value:
            is_datetime = hasattr(possible_value, 'strftime')
            pv = self.backend._from_python(possible_value)

            if is_datetime is True:
                pv = self._convert_datetime(pv)

            # Only non-datetime strings are quoted.
            if isinstance(pv, six.string_types) and not is_datetime:
                in_options.append('"%s"' % pv)
            else:
                in_options.append('%s' % pv)

        query_frag = "(%s)" % " OR ".join(in_options)
    elif filter_type == 'range':
        start = self.backend._from_python(prepared_value[0])
        end = self.backend._from_python(prepared_value[1])

        if hasattr(prepared_value[0], 'strftime'):
            start = self._convert_datetime(start)

        if hasattr(prepared_value[1], 'strftime'):
            end = self._convert_datetime(end)

        query_frag = u"[%s to %s]" % (start, end)
    elif filter_type == 'exact':
        if value.input_type_name == 'exact':
            query_frag = prepared_value
        else:
            prepared_value = Exact(prepared_value).prepare(self)
            query_frag = filter_types[filter_type] % prepared_value
    else:
        if is_datetime is True:
            prepared_value = self._convert_datetime(prepared_value)

        query_frag = filter_types[filter_type] % prepared_value

    # Parenthesize non-raw fragments unless they already start with '('
    # or end with ')'.
    if (len(query_frag) and not isinstance(value, Raw)
            and not query_frag.startswith('(') and not query_frag.endswith(')')):
        query_frag = "(%s)" % query_frag

    return u"%s%s" % (index_fieldname, query_frag)
-
-
class WhooshEngine(BaseEngine):
    """Engine entry point that pairs the Whoosh backend with its query class."""

    backend = WhooshSearchBackend
    query = WhooshSearchQuery
-
-
# --- Step 2: changes to make in the copied backend file ---
# Add the jieba analyzer import as the last line of the module-level imports.
# The import must remain in this module: build_schema() refers to
# ChineseAnalyzer by name.
from jieba.analyse import ChineseAnalyzer

# Switch the schema to Chinese tokenization: find
#     analyzer=StemmingAnalyzer()
# and change it to
#     analyzer=ChineseAnalyzer()
# (This is an instruction, not a statement to execute: a stray module-level
# ``analyzer = ChineseAnalyzer()`` would only create an unused global.)
2)在 settings.py 中配置 haystack 使用的搜索引擎
# Full-text search framework configuration
HAYSTACK_CONNECTIONS = {
    'default': {
        # Use the jieba-enabled Whoosh engine: jsapp/whoosh_cn_backend.py is
        # the copy of haystack's whoosh_backend.py that was just modified.
        # Leaving the stock engine here would mean jieba is never used.
        'ENGINE': 'jsapp.whoosh_cn_backend.WhooshEngine',
        # 'ENGINE': 'haystack.backends.whoosh_backend.WhooshEngine',  # stock Whoosh engine without jieba
        # Directory that holds the index files
        'PATH': os.path.join(BASE_DIR, 'whoosh_index'),
    }
}
# With this setting the index is updated automatically whenever the database changes
HAYSTACK_SIGNAL_PROCESSOR = 'haystack.signals.RealtimeSignalProcessor'
jsapp/urls.py
- from django.conf.urls import url
- from . import views as view
-
urlpatterns = [
    # Anchored with ``^`` so that only ``abc/`` matches; the unanchored
    # pattern also matched any path that merely ended in ``abc/``.
    url(r'^abc/$', view.basic_search),
]
jsapp/views.py
- from django.shortcuts import render
-
- # Create your views here.
- import json
- from django.conf import settings
- from django.core.paginator import InvalidPage, Paginator
- from django.http import Http404, HttpResponse,JsonResponse
- from haystack.forms import ModelSearchForm
- from haystack.query import EmptySearchQuerySet
- RESULTS_PER_PAGE = getattr(settings, 'HAYSTACK_SEARCH_RESULTS_PER_PAGE', 20)
-
-
-
def basic_search(request, load_all=True, form_class=ModelSearchForm, searchqueryset=None, extra_context=None, results_per_page=None):
    """
    Search view that returns one page of results as JSON.

    Query parameters:
      * ``q``    -- the search text; when missing or invalid the result set is empty.
      * ``page`` -- 1-based page number (default 1).

    Responds with ``{"code": 200, "msg": ..., "data": [...]}`` where each
    item has ``pk``, ``title`` and ``content``, or with
    ``{"code": 404, ...}`` when the requested page is invalid or is not a
    number.

    ``extra_context`` is accepted for signature compatibility only; it is
    unused because no template is rendered.
    """
    results = EmptySearchQuerySet()

    if request.GET.get('q'):
        form = form_class(request.GET, searchqueryset=searchqueryset, load_all=load_all)

        if form.is_valid():
            results = form.search()

    paginator = Paginator(results, results_per_page or RESULTS_PER_PAGE)

    try:
        page = paginator.page(int(request.GET.get('page', 1)))
    except (InvalidPage, ValueError):
        # ValueError covers a non-numeric ``page`` parameter, which
        # previously escaped as an unhandled server error.
        return JsonResponse({"code": 404, "msg": 'No file found!', "data": []})

    # A hit's ``object`` is None when the index still references a row that
    # no longer exists in the database; skip those instead of crashing.
    jsondata = [
        {
            'pk': hit.object.pk,
            'title': hit.object.title,
            'content': hit.object.body,
        }
        for hit in page.object_list
        if hit.object is not None
    ]
    result = {"code": 200, "msg": 'Search successfully!', "data": jsondata}
    return JsonResponse(result, content_type="application/json")
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。