第8章:结果处理与展示
8.1 搜索结果高亮显示
8.1.1 什么是高亮显示?
高亮显示是指在搜索结果中标记出匹配的查询词,帮助用户快速定位相关信息。
高亮的作用:
- 提升用户体验
- 让用户一眼看到匹配位置
- 提高搜索结果的可读性
8.1.2 基本高亮方法
方法1:使用 highlights() 方法
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

# Open an existing on-disk index and query its "content" field.
ix = open_dir("my_index")
with ix.searcher() as searcher:
    parser = QueryParser("content", ix.schema)
    query = parser.parse(u"python")
    results = searcher.search(query)
    # Hits must be read while the searcher is still open.
    for hit in results:
        print(hit['title'])
        print(hit.highlights("content"))

代码示例:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import OrGroup, QueryParser
import os
import shutil

# Create the index in a fresh directory so reruns do not accumulate documents.
index_dir = "highlight_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

schema = Schema(
    title=TEXT(stored=True),
    content=TEXT(stored=True)
)
ix = create_in(index_dir, schema)

# Add documents
writer = ix.writer()
docs = [
    {
        "title": "Python 编程教程",
        "content": "Python 是一门简洁的编程语言,Python 广泛应用于数据分析、人工智能等领域。学习 Python 可以让你快速开发应用程序。"
    },
    {
        "title": "数据分析实战",
        "content": "数据分析是当前热门的技术方向。使用 Python 进行数据分析,可以处理大量数据,提取有价值的信息。Python 在数据科学领域非常流行。"
    },
    {
        "title": "Web 开发指南",
        "content": "Web 开发需要掌握 HTML、CSS、JavaScript 等技术。Python 也可以用于 Web 后端开发,如 Django 和 Flask 框架。"
    }
]
for doc in docs:
    writer.add_document(**doc)
writer.commit()

print("=== 搜索结果高亮演示 ===\n")

# Search and highlight
with ix.searcher() as searcher:
    # The default analyzer does not segment Chinese text, so "数据" is never
    # indexed as a term of its own. With the parser's default AND grouping the
    # query would match nothing; OR the terms so the "python" hits are shown.
    parser = QueryParser("content", ix.schema, group=OrGroup)
    query = parser.parse(u"python 数据")
    results = searcher.search(query)
    print(f"搜索 'python 数据',命中 {len(results)} 篇:\n")
    for i, hit in enumerate(results, 1):
        print(f"【结果 {i}】{hit['title']}")
        print(f"高亮: {hit.highlights('content')}")
        print()

8.1.3 自定义高亮格式
使用 HtmlFormatter
from whoosh.highlight import HtmlFormatter
# Build a formatter that wraps matches in <strong> with the CSS class "highlight"
formatter = HtmlFormatter(tagname="strong", classname="highlight")
# Attach the formatter to the results object
results.formatter = formatter
for hit in results:
    print(hit.highlights("content"))

代码示例:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
from whoosh.highlight import Formatter, HtmlFormatter, WholeFragmenter, get_text
import os
import shutil


class CustomFormatter(Formatter):
    """Formatter that wraps every matched term in 【】 instead of an HTML tag."""

    def format_token(self, text, token, replace=False):
        # get_text() returns the piece of the original text covered by the token.
        return "【%s】" % get_text(text, token, replace)


# Create the index in a fresh directory
index_dir = "highlight_custom_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

schema = Schema(
    title=TEXT(stored=True),
    content=TEXT(stored=True)
)
ix = create_in(index_dir, schema)

# Add a document
writer = ix.writer()
writer.add_document(
    title="Python 教程",
    content="Python 是一门优秀的编程语言。Python 在数据分析、人工智能、Web 开发等领域都有广泛应用。学习 Python 可以快速上手。"
)
writer.commit()

print("=== 自定义高亮格式演示 ===\n")

with ix.searcher() as searcher:
    parser = QueryParser("content", ix.schema)
    query = parser.parse(u"python")
    results = searcher.search(query)

    # Example 1: default highlighting
    print("【示例1】默认高亮")
    for hit in results:
        print(f"标题: {hit['title']}")
        print(f"高亮: {hit.highlights('content')}")
    print()

    # Example 2: custom tag
    print("【示例2】使用 <b> 标签")
    results.formatter = HtmlFormatter(tagname="b")
    for hit in results:
        print(f"高亮: {hit.highlights('content')}")
    print()

    # Example 3: custom CSS class name
    print("【示例3】使用 CSS 类名 'highlight'")
    results.formatter = HtmlFormatter(tagname="span", classname="highlight")
    for hit in results:
        print('HTML: <span class="highlight">python</span>')
        print(f"高亮: {hit.highlights('content')}")
    print()

    # Example 4: custom markers around each match
    print("【示例4】自定义前后标记")
    results.formatter = CustomFormatter()
    for hit in results:
        print(f"高亮: {hit.highlights('content')}")
    print()

    # Example 5: use the whole field text as one fragment (maximum context)
    print("【示例5】设置片段大小(更多上下文)")
    results.fragmenter = WholeFragmenter()
    for hit in results:
        print(f"高亮: {hit.highlights('content')}")
    print()
print("✅ 自定义高亮格式演示完成!")

8.1.4 高亮配置参数
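| 参数 | 所属对象 | 说明 | 默认值 |
|---|---|---|---|
| tagname | HtmlFormatter | 包裹匹配词的 HTML 标签名 | strong |
| classname | HtmlFormatter | 标签的 CSS 类名 | match |
| between | HtmlFormatter | 多个片段之间的分隔符 | ... |
| maxchars | ContextFragmenter | 单个片段的最大字符数 | 200 |
| surround | ContextFragmenter | 匹配词前后保留的上下文字符数 | 20 |
| top | hit.highlights() | 最多返回的片段数 | 3 |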
8.2 摘要生成与片段提取
8.2.1 摘要生成
什么是摘要?
摘要是从文档中提取的简短描述,通常包含关键词的上下文,帮助用户快速了解文档内容。
基本摘要生成:
# 使用 highlights() 生成摘要
summary = hit.highlights("content", top=2)

代码示例:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import OrGroup, QueryParser
import os
import re
import shutil

# Create the index in a fresh directory
index_dir = "summary_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

schema = Schema(
    title=TEXT(stored=True),
    content=TEXT(stored=True)
)
ix = create_in(index_dir, schema)

# Add one long document
writer = ix.writer()
writer.add_document(
    title="Python 编程完整指南",
    content="""
Python 是一门高级编程语言,由 Guido van Rossum 于 1991 年首次发布。
Python 设计简洁明了,易于学习和使用。Python 支持多种编程范式,包括面向对象、命令式、函数式和过程式编程。
Python 拥有庞大的标准库和第三方生态系统,可以轻松完成各种任务。
Python 在数据分析、人工智能、Web 开发、自动化脚本等领域都有广泛应用。
NumPy、Pandas、Matplotlib 等库使 Python 成为数据分析的首选工具。
TensorFlow、PyTorch 等框架推动了 Python 在人工智能领域的普及。
Django、Flask 等 Web 框架让 Python Web 开发变得简单高效。
Python 的语法简洁优雅,代码可读性高,适合团队协作开发。
Python 社区活跃,学习资源丰富,新手可以快速入门。
Python 的持续发展和完善,使其成为最受欢迎的编程语言之一。
"""
)
writer.commit()

print("=== 摘要生成演示 ===\n")

with ix.searcher() as searcher:
    # The default analyzer does not segment Chinese, so "数据" and "分析" are
    # not indexed as separate terms. OR the terms together; with the default
    # AND grouping the query would return no hits at all.
    parser = QueryParser("content", ix.schema, group=OrGroup)
    query = parser.parse(u"python 数据 分析")

    # Example 1: summary with the default settings
    print("【示例1】单片段摘要(默认)")
    results = searcher.search(query)
    for hit in results:
        print(f"标题: {hit['title']}")
        print(f"摘要: {hit.highlights('content')}")
    print()

    # Example 2: summary built from up to three fragments
    print("【示例2】多片段摘要(top=3)")
    results = searcher.search(query)
    for hit in results:
        summary = hit.highlights('content', top=3)
        print(f"标题: {hit['title']}")
        print(f"摘要:\n{summary}")
    print()

    # Example 3: limit the fragment length. The limit lives on the fragmenter
    # (not the formatter) and must be set on the results object that is
    # actually highlighted, i.e. after the search.
    print("【示例3】摘要字符数控制")
    results = searcher.search(query)
    results.fragmenter.maxchars = 100
    for hit in results:
        print(f"标题: {hit['title']}")
        print(f"摘要(最多100字符): {hit.highlights('content')}")
    print()

    # Example 4: plain-text summary with the highlight markup removed
    print("【示例4】纯文本摘要(去除高亮标记)")
    results = searcher.search(query)
    for hit in results:
        summary = hit.highlights('content')
        # Strip HTML tags
        plain_text = re.sub(r'<[^>]+>', '', summary)
        print(f"纯文本摘要: {plain_text}")
    print()
print("✅ 摘要生成演示完成!")

8.2.2 片段提取
使用 Fragmenter
from whoosh.highlight import WholeFragmenter, SentenceFragmenter
# Treat the whole document as a single fragment
results.fragmenter = WholeFragmenter()
# Split into fragments by sentence
results.fragmenter = SentenceFragmenter()

代码示例:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
from whoosh.highlight import WholeFragmenter, SentenceFragmenter, ContextFragmenter
import os
import shutil

# Create the index in a fresh directory
index_dir = "fragment_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

schema = Schema(
    title=TEXT(stored=True),
    content=TEXT(stored=True)
)
ix = create_in(index_dir, schema)

# Add a document
writer = ix.writer()
writer.add_document(
    title="技术文章",
    content="Python 是一门编程语言。Python 广泛应用于数据分析。Web 开发也使用 Python。人工智能领域 Python 是主流。"
)
writer.commit()

print("=== 片段提取演示 ===\n")

with ix.searcher() as searcher:
    parser = QueryParser("content", ix.schema)
    query = parser.parse(u"python")
    # One results object is reused below: the fragmenter is an attribute of
    # the results, so searching again would discard whatever was just set.
    results = searcher.search(query)

    # Example 1: default fragmenter
    print("【示例1】默认片段(ContextFragmenter)")
    for hit in results:
        fragments = hit.highlights('content', top=2)
        print(f"片段: {fragments}")
    print()

    # Example 2: the whole document as one fragment
    print("【示例2】整个文档(WholeFragmenter)")
    results.fragmenter = WholeFragmenter()
    for hit in results:
        fragment = hit.highlights('content')
        print(f"片段: {fragment}")
    print()

    # Example 3: split by sentence. The default sentence-ending characters are
    # ".!?", so the full-width Chinese marks are added explicitly.
    print("【示例3】按句子分割(SentenceFragmenter)")
    results.fragmenter = SentenceFragmenter(sentencechars=".!?。!?")
    for hit in results:
        fragments = hit.highlights('content', top=3)
        print(f"片段:\n{fragments}")
    print()

    # Example 4: custom context size; `surround` is the number of characters
    # kept before and after each match.
    print("【示例4】自定义上下文大小")
    results.fragmenter = ContextFragmenter(surround=5)
    for hit in results:
        fragment = hit.highlights('content')
        print(f"片段(前后5字符): {fragment}")
print("\n✅ 片段提取演示完成!")

8.3 结果过滤与分组
8.3.1 结果过滤
方法1:使用 Filter
from whoosh.query import Term
# Build the query
query = parser.parse(u"python")
# Build the filter
filter_query = Term("category", "教程")
# Apply the filter
results = searcher.search(query, filter=filter_query)

代码示例:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, KEYWORD, NUMERIC
from whoosh.qparser import QueryParser
from whoosh.query import NumericRange, Term
import os
import shutil

# Create the index in a fresh directory
index_dir = "filter_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

schema = Schema(
    title=TEXT(stored=True),
    content=TEXT(stored=True),
    category=KEYWORD(stored=True),
    # Numeric field: a text field would compare prices as strings,
    # where "99" sorts after "100".
    price=NUMERIC(stored=True)
)
ix = create_in(index_dir, schema)

# Add documents
writer = ix.writer()
docs = [
    {"title": "Python 教程", "content": "学习 Python", "category": "教程", "price": 99},
    {"title": "Java 教程", "content": "学习 Java", "category": "教程", "price": 89},
    {"title": "数据分析实战", "content": "数据分析", "category": "实战", "price": 129},
    {"title": "Web 开发", "content": "Web 技术", "category": "教程", "price": 79},
    {"title": "机器学习", "content": "机器学习", "category": "实战", "price": 149}
]
for doc in docs:
    writer.add_document(**doc)
writer.commit()

print("=== 结果过滤演示 ===\n")

with ix.searcher() as searcher:
    parser = QueryParser("content", ix.schema)
    # "*" matches every document that has a content field. No document
    # contains the term "教程" in its content, so a query for that word would
    # return nothing and the effect of the filter could not be seen.
    query = parser.parse(u"*")

    # Example 1: no filter
    print("【示例1】无过滤 - 搜索全部文档")
    results = searcher.search(query)
    print(f"命中 {len(results)} 篇")
    for hit in results:
        print(f" - {hit['title']} ({hit['category']})")
    print()

    # Example 2: filter by category
    print("【示例2】只显示 '教程' 类别")
    filter_query = Term("category", "教程")
    results = searcher.search(query, filter=filter_query)
    print(f"命中 {len(results)} 篇")
    for hit in results:
        print(f" - {hit['title']} ({hit['category']})")
    print()

    # Example 3: numeric range filter (price strictly below 100)
    print("【示例3】价格 < 100 的文档")
    filter_query = NumericRange("price", None, 100, endexcl=True)
    results = searcher.search(query, filter=filter_query)
    print(f"命中 {len(results)} 篇")
    for hit in results:
        print(f" - {hit['title']} (价格: ¥{hit['price']})")
print("\n✅ 结果过滤演示完成!")

8.3.2 结果分组
使用 Facets 进行分组
from whoosh.sorting import Facets
# Build the set of facets; add_field() creates a facet on a field's values
facets = Facets()
facets.add_field("category")
# Run the search, grouping the hits by the facets
results = searcher.search(query, groupedby=facets)
# Read the groups back: each facet value maps to a list of document numbers
for group_name, group_docs in results.groups("category").items():
    print(f"{group_name}: {len(group_docs)} 篇")

代码示例:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, KEYWORD
from whoosh.qparser import QueryParser
from whoosh.sorting import Facets
import os
import shutil

# Create the index in a fresh directory
index_dir = "group_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

schema = Schema(
    title=TEXT(stored=True),
    content=TEXT(stored=True),
    category=KEYWORD(stored=True),
    author=KEYWORD(stored=True)
)
ix = create_in(index_dir, schema)

# Add documents
writer = ix.writer()
docs = [
    {"title": "Python 教程", "content": "Python 基础", "category": "教程", "author": "张三"},
    {"title": "Java 教程", "content": "Java 基础", "category": "教程", "author": "李四"},
    {"title": "数据分析实战", "content": "数据分析", "category": "实战", "author": "张三"},
    {"title": "Web 开发", "content": "Web 技术", "category": "教程", "author": "王五"},
    {"title": "机器学习", "content": "机器学习", "category": "实战", "author": "李四"},
    {"title": "算法导论", "content": "算法", "category": "书籍", "author": "赵六"}
]
for doc in docs:
    writer.add_document(**doc)
writer.commit()

print("=== 结果分组演示 ===\n")

with ix.searcher() as searcher:
    parser = QueryParser("content", ix.schema)
    query = parser.parse(u"*")

    # Example 1: group by category
    print("【示例1】按 category 分组")
    results = searcher.search(query, groupedby="category")
    groups = results.groups("category")
    for group_name, group_docs in groups.items():
        print(f" {group_name}: {len(group_docs)} 篇")
    print()

    # Example 2: group by author
    print("【示例2】按 author 分组")
    results = searcher.search(query, groupedby="author")
    groups = results.groups("author")
    for author, group_docs in groups.items():
        print(f" {author}: {len(group_docs)} 篇")
    print()

    # Example 3: group by several fields in a single search
    print("【示例3】同时按 category 和 author 分组")
    facets = Facets()
    facets.add_field("category")
    facets.add_field("author")
    results = searcher.search(query, groupedby=facets)
    # groups() maps each facet value to a list of document numbers,
    # so the count is the length of that list.
    cat_groups = results.groups("category")
    print("按分类:")
    for cat, cat_docs in cat_groups.items():
        print(f" {cat}: {len(cat_docs)} 篇")
    auth_groups = results.groups("author")
    print("\n按作者:")
    for auth, auth_docs in auth_groups.items():
        print(f" {auth}: {len(auth_docs)} 篇")

    # Example 4: look at the documents inside one group
    print("\n【示例4】查看 '教程' 分组内的文档")
    results = searcher.search(query, groupedby="category")
    cat_groups = results.groups("category")
    if "教程" in cat_groups:
        doc_ids = cat_groups["教程"]
        print(f"教程分组包含 {len(doc_ids)} 篇文档")
        for doc_id in doc_ids[:3]:  # show the first three
            # Resolve the document number to its stored fields
            fields = searcher.stored_fields(doc_id)
            print(f" - {fields['title']}")
print("\n✅ 结果分组演示完成!")

8.3.3 结果聚合
聚合统计:
# Aggregate by field
from whoosh.sorting import Facets
facets = Facets()
facets.add_field("category")
results = searcher.search(query, groupedby=facets)
# Get the number of documents in each group
for group_name, group_docs in results.groups("category").items():
    print(f"{group_name}: {len(group_docs)}")

8.4 自定义评分算法
8.4.1 BM25 评分
Whoosh 默认使用 BM25 评分算法:
BM25 公式:
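$$\mathrm{score}(D,Q)=\sum_{i=1}^{n}\mathrm{IDF}(q_i)\cdot\frac{f(q_i,D)\,(k_1+1)}{f(q_i,D)+k_1\left(1-b+b\cdot\frac{|D|}{\mathrm{avgdl}}\right)}$$

其中 $f(q_i,D)$ 是查询词 $q_i$ 在文档 $D$ 中的词频,$|D|$ 是文档长度,$\mathrm{avgdl}$ 是索引中文档的平均长度。

参数说明:
- k1:词频饱和度参数(默认 1.2)
- b:长度归一化参数(默认 0.75)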
修改 BM25 参数:
from whoosh.scoring import BM25F
# 创建自定义评分器
scorer = BM25F(B=0.5, K1=1.5)
# 使用自定义评分器
results = searcher.search(query, weighting=scorer)代码示例:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
from whoosh.scoring import BM25F
import os
import shutil

# Create the index in a fresh directory
index_dir = "scoring_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

schema = Schema(
    title=TEXT(stored=True),
    content=TEXT(stored=True)
)
ix = create_in(index_dir, schema)

# Add documents
writer = ix.writer()
docs = [
    {"title": "Python", "content": "Python 是一门编程语言"},
    {"title": "Python 教程", "content": "学习 Python 编程,Python 是最好的选择"},
    {"title": "Java", "content": "Java 是面向对象的编程语言"},
    {"title": "编程语言", "content": "编程语言包括 Python、Java 等"}
]
for doc in docs:
    writer.add_document(**doc)
writer.commit()

print("=== BM25 评分演示 ===\n")

# Parsing does not need an open searcher, so the query is built once.
query = QueryParser("content", ix.schema).parse(u"python")

# The weighting model is fixed when a searcher is opened (search() has no
# weighting argument), so each example opens its own searcher.

# Example 1: default BM25
print("【示例1】默认 BM25 评分 (B=0.75, K1=1.2)")
with ix.searcher() as searcher:
    for hit in searcher.search(query):
        print(f" {hit['title']}: 评分 {hit.score:.4f}")
print()

# Example 2: weaker length normalisation (B=0.5)
print("【示例2】调整长度归一化 (B=0.5)")
with ix.searcher(weighting=BM25F(B=0.5)) as searcher:
    for hit in searcher.search(query):
        print(f" {hit['title']}: 评分 {hit.score:.4f}")
print()

# Example 3: different term-frequency saturation (K1=2.0)
print("【示例3】调整词频饱和度 (K1=2.0)")
with ix.searcher(weighting=BM25F(K1=2.0)) as searcher:
    for hit in searcher.search(query):
        print(f" {hit['title']}: 评分 {hit.score:.4f}")
print("\n✅ BM25 评分演示完成!")

8.4.2 自定义评分器
创建自定义评分类:
from whoosh.scoring import WeightingModel


class CustomScorer(WeightingModel):
    """Skeleton of a custom weighting model."""

    def __init__(self, boost_title=1.5):
        self.boost_title = boost_title

    def scorer(self, searcher, fieldname, text, qf=1):
        """Return the scorer object for one query term; custom logic goes here."""
        pass

代码示例:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
from whoosh.scoring import FunctionWeighting
import os
import shutil

# Create the index in a fresh directory
index_dir = "custom_scoring_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

schema = Schema(
    title=TEXT(stored=True),
    content=TEXT(stored=True),
    price=TEXT(stored=True)
)
ix = create_in(index_dir, schema)

# Add documents
writer = ix.writer()
docs = [
    {"title": "Python 教程", "content": "Python 入门", "price": "99"},
    {"title": "Python 高级", "content": "Python 进阶", "price": "199"},
    {"title": "Java 教程", "content": "Java 入门", "price": "89"},
    {"title": "Python 实战", "content": "Python 项目", "price": "149"}
]
for doc in docs:
    writer.add_document(**doc)
writer.commit()

print("=== 自定义评分演示 ===\n")


# Custom scorer: adjusts the weight by price
class PriceBoostScorer(FunctionWeighting):
    """Price-based weighting: the higher the price, the lower the score.

    Built on FunctionWeighting, which calls a function once per matching
    term/document pair to obtain the score.
    """

    def __init__(self, price_factor=0.001):
        super().__init__(self._score)
        self.price_factor = price_factor

    def _score(self, searcher, fieldname, text, matcher):
        """Return the term weight of the current document scaled by a price discount."""
        # Look up the stored price of the document the matcher points at
        doc = searcher.stored_fields(matcher.id())
        price = float(doc.get('price', 100))
        # Discount factor, clamped to the range [0.5, 1.0]
        price_discount = 1 - (price * self.price_factor)
        price_discount = max(0.5, min(1.0, price_discount))
        return matcher.weight() * price_discount


# Parsing does not need an open searcher, so the query is built once.
query = QueryParser("content", ix.schema).parse(u"python")

# Example 1: default scoring
print("【示例1】默认评分")
with ix.searcher() as searcher:
    for hit in searcher.search(query):
        print(f" {hit['title']} (¥{hit['price']}): {hit.score:.4f}")
print()

# Example 2: price-adjusted scoring. The weighting model is passed when the
# searcher is opened; search() has no weighting argument.
print("【示例2】基于价格调整评分(价格低优先)")
scorer = PriceBoostScorer(price_factor=0.002)
with ix.searcher(weighting=scorer) as searcher:
    for hit in searcher.search(query):
        print(f" {hit['title']} (¥{hit['price']}): {hit.score:.4f}")
print("\n✅ 自定义评分演示完成!")

8.4.3 多字段加权评分
使用 BM25F:
from whoosh.scoring import BM25F
# 为不同字段设置权重
field_boosts = {
"title": 2.0,
"content": 1.0
}
scorer = BM25F(field_boosts=field_boosts)
results = searcher.search(query, weighting=scorer)代码示例:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser, MultifieldParser
from whoosh.scoring import BM25F
import os
import shutil

# Create the index in a fresh directory
index_dir = "field_weight_scoring_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

schema = Schema(
    title=TEXT(stored=True),
    content=TEXT(stored=True)
)
ix = create_in(index_dir, schema)

# Add documents
writer = ix.writer()
docs = [
    {"title": "Python 教程", "content": "学习编程"},
    {"title": "编程入门", "content": "Python 基础"},
    {"title": "数据分析", "content": "Python 数据分析"}
]
for doc in docs:
    writer.add_document(**doc)
writer.commit()

print("=== 多字段加权评分演示 ===\n")

with ix.searcher(weighting=BM25F()) as searcher:
    # Example 1: both fields weighted equally
    print("【示例1】无加权评分")
    parser = MultifieldParser(["title", "content"], ix.schema)
    query = parser.parse(u"python")
    results = searcher.search(query)
    for hit in results:
        print(f" {hit['title']}: {hit.score:.4f}")
    print()

    # Example 2: title matches count double. BM25F's title_B keyword only
    # changes length normalisation for the title field; the field weight is
    # set through the parser's fieldboosts.
    print("【示例2】标题权重 2.0")
    boosted_parser = MultifieldParser(
        ["title", "content"],
        ix.schema,
        fieldboosts={"title": 2.0, "content": 1.0},
    )
    query = boosted_parser.parse(u"python")
    results = searcher.search(query)
    for hit in results:
        print(f" {hit['title']}: {hit.score:.4f}")
print("\n✅ 多字段加权评分演示完成!")

8.5 综合示例
8.5.1 完整的搜索结果处理器
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, KEYWORD, NUMERIC
from whoosh.qparser import QueryParser
from whoosh.highlight import HtmlFormatter
from whoosh.query import Term
from whoosh.scoring import BM25F
import os
import shutil

# Create the index in a fresh directory
index_dir = "result_processor_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

schema = Schema(
    title=TEXT(stored=True),
    content=TEXT(stored=True),
    category=KEYWORD(stored=True),
    price=NUMERIC(stored=True, sortable=True),
    # Ratings are fractional; the default numeric type is int.
    rating=NUMERIC(float, stored=True, sortable=True)
)
ix = create_in(index_dir, schema)

# Add documents
writer = ix.writer()
products = [
    {"title": "Python 编程入门", "content": "Python 是一门简洁的编程语言,适合初学者", "category": "教程", "price": 99, "rating": 4.5},
    {"title": "数据分析实战", "content": "使用 Python 进行数据分析,包含大量实战案例", "category": "实战", "price": 129, "rating": 4.7},
    {"title": "Java 高级编程", "content": "Java 企业级开发,面向对象深入", "category": "教程", "price": 89, "rating": 4.3},
    {"title": "Web 开发指南", "content": "Web 全栈开发,HTML CSS JavaScript", "category": "教程", "price": 79, "rating": 4.2},
    {"title": "机器学习入门", "content": "Python 机器学习基础,神经网络", "category": "实战", "price": 149, "rating": 4.8}
]
for p in products:
    writer.add_document(**p)
writer.commit()

print("=== 搜索结果处理器演示 ===\n")


class SearchResultProcessor:
    """Runs queries against an index and returns the hits as plain dicts."""

    # Fields that search() can sort by
    SORT_FIELDS = ("rating", "price")

    def __init__(self, index):
        self.index = index

    def search(self, query_str, filters=None, sort_by="rating", highlight=True):
        """Run a search and return a list of result dicts.

        Args:
            query_str: query text; unqualified terms search the "content"
                field, and "field:term" targets another field.
            filters: optional query object restricting which documents may match.
            sort_by: "rating" or "price" sorts by that field, highest first;
                any other value keeps the default relevance order.
            highlight: when true, "highlight" holds a highlighted excerpt;
                otherwise the first 100 characters of the content.

        Returns:
            A list of dicts with the keys title, category, price, rating,
            score and highlight.
        """
        sort_field = sort_by if sort_by in self.SORT_FIELDS else None
        with self.index.searcher() as searcher:
            # Parse the query
            parser = QueryParser("content", self.index.schema)
            query = parser.parse(query_str)
            # Run the search. search() takes no weighting argument, so the
            # unsorted case simply uses the searcher's default scoring.
            if sort_field:
                results = searcher.search(
                    query, filter=filters or None, sortedby=sort_field, reverse=True
                )
            else:
                results = searcher.search(query, filter=filters or None)
            # Convert the hits while the searcher is still open
            processed_results = []
            for hit in results:
                result = {
                    'title': hit['title'],
                    'category': hit['category'],
                    'price': hit['price'],
                    'rating': hit['rating'],
                    'score': hit.score
                }
                if highlight:
                    result['highlight'] = hit.highlights('content')
                else:
                    result['highlight'] = hit['content'][:100] + "..."
                processed_results.append(result)
            return processed_results

    def search_with_facets(self, query_str):
        """Search and group the hits by category.

        Returns a dict with "results" (stored fields of each hit) and "groups"
        (category value -> list of document numbers).
        """
        with self.index.searcher() as searcher:
            parser = QueryParser("content", self.index.schema)
            query = parser.parse(query_str)
            results = searcher.search(query, groupedby="category")
            return {
                'results': [hit.fields() for hit in results],
                'groups': results.groups("category")
            }


# Use the processor
processor = SearchResultProcessor(ix)

# Example 1: basic search
print("【示例1】基本搜索 - 'python'")
results = processor.search("python")
for r in results:
    print(f" {r['title']} (¥{r['price']}, {r['rating']}分)")
    print(f" 摘要: {r['highlight']}")
print()

# Example 2: search with a filter
print("【示例2】带过滤 - 'python' 且 category='实战'")
results = processor.search("python", filters=Term("category", "实战"))
for r in results:
    print(f" {r['title']} (¥{r['price']}, {r['rating']}分)")
print()

# Example 3: sort by price. No document contains the term "教程" in its
# content, so the query targets the category field explicitly.
print("【示例3】按价格排序 - '教程'")
results = processor.search("category:教程", sort_by="price")
for r in results:
    print(f" {r['title']} (¥{r['price']}, {r['rating']}分)")
print()

# Example 4: grouped search
print("【示例4】带分组搜索 - '教程'")
facet_results = processor.search_with_facets("category:教程")
print(f"总结果: {len(facet_results['results'])} 篇")
print("分组统计:")
# Each group is a list of document numbers; its length is the count.
for category, doc_ids in facet_results['groups'].items():
    print(f" {category}: {len(doc_ids)} 篇")
print("\n✅ 搜索结果处理器演示完成!")

本章小结
本章我们学习了搜索结果的处理与展示:
- 搜索结果高亮显示:使用 highlights() 方法,自定义高亮格式
- 摘要生成与片段提取:生成文档摘要,提取关键片段
- 结果过滤与分组:使用 Filter 过滤结果,使用 Facets 分组结果
- 自定义评分算法:理解 BM25 评分,创建自定义评分器
通过本章的学习,你应该能够:
- 实现搜索结果的高亮显示
- 生成文档摘要和片段
- 实现结果的过滤和分组
- 自定义评分算法
在下一章中,我们将学习多语言与中文支持,包括中文分词器集成、停用词过滤等。