第8章:结果处理与展示

8.1 搜索结果高亮显示

8.1.1 什么是高亮显示?

高亮显示是指在搜索结果中标记出匹配的查询词,帮助用户快速定位相关信息。

高亮的作用

  • 提升用户体验
  • 让用户一眼看到匹配位置
  • 提高搜索结果的可读性

8.1.2 基本高亮方法

方法1:使用 highlights() 方法

from whoosh.index import open_dir
from whoosh.qparser import QueryParser

# Open the index stored in the "my_index" directory.
ix = open_dir("my_index")

# The parser only needs the schema, so the query is built before searching.
parser = QueryParser("content", ix.schema)
query = parser.parse(u"python")

with ix.searcher() as searcher:
    results = searcher.search(query)

    # For every hit, show the stored title and then the highlighted excerpt
    # taken from the "content" field.
    for hit in results:
        title = hit['title']
        print(title)
        excerpt = hit.highlights("content")
        print(excerpt)

代码示例

from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
import os
import shutil

# Recreate the index directory from scratch on every run.
index_dir = "highlight_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

# Two stored text fields: the title and the body.
schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True))
ix = create_in(index_dir, schema)

# Sample documents, indexed one by one and committed together.
docs = [
    dict(
        title="Python 编程教程",
        content="Python 是一门简洁的编程语言,Python 广泛应用于数据分析、人工智能等领域。学习 Python 可以让你快速开发应用程序。",
    ),
    dict(
        title="数据分析实战",
        content="数据分析是当前热门的技术方向。使用 Python 进行数据分析,可以处理大量数据,提取有价值的信息。Python 在数据科学领域非常流行。",
    ),
    dict(
        title="Web 开发指南",
        content="Web 开发需要掌握 HTML、CSS、JavaScript 等技术。Python 也可以用于 Web 后端开发,如 Django 和 Flask 框架。",
    ),
]
writer = ix.writer()
for doc in docs:
    writer.add_document(**doc)
writer.commit()

print("=== 搜索结果高亮演示 ===\n")

# Search the "content" field and print a highlighted excerpt per hit.
with ix.searcher() as searcher:
    parser = QueryParser("content", ix.schema)
    query = parser.parse(u"python 数据")
    results = searcher.search(query)

    hit_count = len(results)
    print(f"搜索 'python 数据',命中 {hit_count} 篇:\n")
    for rank, hit in enumerate(results, start=1):
        print(f"【结果 {rank}】{hit['title']}")
        excerpt = hit.highlights('content')
        print(f"高亮: {excerpt}")
        print()

8.1.3 自定义高亮格式

使用 HtmlFormatter

from whoosh.highlight import HtmlFormatter

# Formatter configured with a <strong> tag and the CSS class name "highlight".
formatter = HtmlFormatter(classname="highlight", tagname="strong")

# Attach the formatter to the result set; highlights() on its hits will use it.
results.formatter = formatter

for hit in results:
    excerpt = hit.highlights("content")
    print(excerpt)

代码示例

from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
from whoosh.highlight import Formatter, HtmlFormatter, WholeFragmenter, get_text
import os
import shutil


class CustomFormatter(Formatter):
    """Plain-text formatter that surrounds every matched term with 【 and 】.

    Whoosh formatters customise output by overriding ``format_token``; the
    base ``Formatter`` takes care of stitching the unmatched text around it.
    """

    def format_token(self, text, token, replace=False):
        # get_text() returns the matched token's text (from the original
        # string, or the analysed form when replace is true).
        return "【" + get_text(text, token, replace) + "】"


# Create the index (start from an empty directory on every run)
index_dir = "highlight_custom_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

schema = Schema(
    title=TEXT(stored=True),
    content=TEXT(stored=True)
)

ix = create_in(index_dir, schema)

# Add a document
writer = ix.writer()
writer.add_document(
    title="Python 教程",
    content="Python 是一门优秀的编程语言。Python 在数据分析、人工智能、Web 开发等领域都有广泛应用。学习 Python 可以快速上手。"
)
writer.commit()

print("=== 自定义高亮格式演示 ===\n")

with ix.searcher() as searcher:
    parser = QueryParser("content", ix.schema)
    query = parser.parse(u"python")
    results = searcher.search(query)

    # Example 1: default highlighting
    print("【示例1】默认高亮")
    for hit in results:
        print(f"标题: {hit['title']}")
        print(f"高亮: {hit.highlights('content')}")
        print()

    # Example 2: custom tag
    print("【示例2】使用 <b> 标签")
    results.formatter = HtmlFormatter(tagname="b")
    for hit in results:
        print(f"高亮: {hit.highlights('content')}")
        print()

    # Example 3: custom CSS class name
    print("【示例3】使用 CSS 类名 'highlight'")
    results.formatter = HtmlFormatter(tagname="span", classname="highlight")
    for hit in results:
        print('HTML: <span class="highlight">python</span>')
        print(f"高亮: {hit.highlights('content')}")
        print()

    # Example 4: custom markers around each match (no HTML involved)
    print("【示例4】自定义前后标记")
    results.formatter = CustomFormatter()
    for hit in results:
        print(f"高亮: {hit.highlights('content')}")
        print()

    # Example 5: use the whole field as a single fragment
    print("【示例5】设置片段大小(更多上下文)")
    results.fragmenter = WholeFragmenter()
    for hit in results:
        print(f"高亮: {hit.highlights('content')}")
        print()

print("✅ 自定义高亮格式演示完成!")

8.1.4 高亮配置参数

参数 说明 默认值
tagname HTML 标签名(HtmlFormatter) strong
classname CSS 类名(HtmlFormatter) match
between 片段分隔符(HtmlFormatter) ...
maxchars 单个片段的最大字符数(属于分片器,如 ContextFragmenter) 200
top highlights() 最多返回的片段数 3

8.2 摘要生成与片段提取

8.2.1 摘要生成

什么是摘要?
摘要是从文档中提取的简短描述,通常包含关键词的上下文,帮助用户快速了解文档内容。

基本摘要生成

# Build a summary from the highlighted excerpts of the "content" field;
# top=2 asks highlights() for up to two fragments.
summary = hit.highlights(fieldname="content", top=2)

代码示例

from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
import os
import re
import shutil

# Create the index (start from an empty directory on every run)
index_dir = "summary_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

schema = Schema(
    title=TEXT(stored=True),
    content=TEXT(stored=True)
)

ix = create_in(index_dir, schema)

# Add one long document
writer = ix.writer()
writer.add_document(
    title="Python 编程完整指南",
    content="""
    Python 是一门高级编程语言,由 Guido van Rossum 于 1991 年首次发布。
    Python 设计简洁明了,易于学习和使用。Python 支持多种编程范式,包括面向对象、命令式、函数式和过程式编程。
    Python 拥有庞大的标准库和第三方生态系统,可以轻松完成各种任务。
    Python 在数据分析、人工智能、Web 开发、自动化脚本等领域都有广泛应用。
    NumPy、Pandas、Matplotlib 等库使 Python 成为数据分析的首选工具。
    TensorFlow、PyTorch 等框架推动了 Python 在人工智能领域的普及。
    Django、Flask 等 Web 框架让 Python Web 开发变得简单高效。
    Python 的语法简洁优雅,代码可读性高,适合团队协作开发。
    Python 社区活跃,学习资源丰富,新手可以快速入门。
    Python 的持续发展和完善,使其成为最受欢迎的编程语言之一。
    """
)
writer.commit()

print("=== 摘要生成演示 ===\n")

with ix.searcher() as searcher:
    parser = QueryParser("content", ix.schema)
    query = parser.parse(u"python 数据 分析")

    # Example 1: single-fragment summary.
    # highlights() returns up to three fragments by default, so a single
    # fragment has to be requested explicitly with top=1.
    print("【示例1】单片段摘要(top=1)")
    results = searcher.search(query)
    for hit in results:
        print(f"标题: {hit['title']}")
        print(f"摘要: {hit.highlights('content', top=1)}")
        print()

    # Example 2: multi-fragment summary
    print("【示例2】多片段摘要(top=3)")
    results = searcher.search(query)
    for hit in results:
        summary = hit.highlights('content', top=3)
        print(f"标题: {hit['title']}")
        print(f"摘要:\n{summary}")
        print()

    # Example 3: limit the fragment length.
    # maxchars is a fragmenter setting (not a formatter one) and has to be
    # applied to the Results object whose hits are about to be highlighted.
    print("【示例3】摘要字符数控制")
    results = searcher.search(query)
    results.fragmenter.maxchars = 100
    for hit in results:
        print(f"标题: {hit['title']}")
        print(f"摘要(最多100字符): {hit.highlights('content')}")
        print()

    # Example 4: plain-text summary with the highlight markup removed
    print("【示例4】纯文本摘要(去除高亮标记)")
    results = searcher.search(query)
    for hit in results:
        summary = hit.highlights('content')
        # Strip the HTML tags emitted by the default formatter
        plain_text = re.sub(r'<[^>]+>', '', summary)
        print(f"纯文本摘要: {plain_text}")
        print()

print("✅ 摘要生成演示完成!")

8.2.2 片段提取

使用 Fragmenter

from whoosh.highlight import SentenceFragmenter, WholeFragmenter

# Option 1: treat the whole field text as one fragment.
whole_document = WholeFragmenter()
results.fragmenter = whole_document

# Option 2: split the text into sentence-based fragments
# (this assignment replaces the fragmenter set above).
by_sentence = SentenceFragmenter()
results.fragmenter = by_sentence

代码示例

from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
from whoosh.highlight import WholeFragmenter, SentenceFragmenter, ContextFragmenter
import os
import shutil

# Create the index (start from an empty directory on every run)
index_dir = "fragment_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

schema = Schema(
    title=TEXT(stored=True),
    content=TEXT(stored=True)
)

ix = create_in(index_dir, schema)

# Add a document
writer = ix.writer()
writer.add_document(
    title="技术文章",
    content="Python 是一门编程语言。Python 广泛应用于数据分析。Web 开发也使用 Python。人工智能领域 Python 是主流。"
)
writer.commit()

print("=== 片段提取演示 ===\n")

with ix.searcher() as searcher:
    parser = QueryParser("content", ix.schema)
    query = parser.parse(u"python")

    # The fragmenter belongs to a Results object, so the same result set is
    # reused below: assigning a fragmenter and then searching again would
    # throw the assignment away together with the old Results.
    results = searcher.search(query)

    # Example 1: default fragmenter
    print("【示例1】默认片段(ContextFragmenter)")
    for hit in results:
        fragments = hit.highlights('content', top=2)
        print(f"片段: {fragments}")
        print()

    # Example 2: the whole field as a single fragment
    print("【示例2】整个文档(WholeFragmenter)")
    results.fragmenter = WholeFragmenter()
    for hit in results:
        fragment = hit.highlights('content')
        print(f"片段: {fragment}")
        print()

    # Example 3: split on sentence boundaries.
    # The default sentence characters are ".!?"; the Chinese full stop and
    # full-width marks are added so this text can be split.
    print("【示例3】按句子分割(SentenceFragmenter)")
    results.fragmenter = SentenceFragmenter(sentencechars=".!?。!?")
    for hit in results:
        fragments = hit.highlights('content', top=3)
        print(f"片段:\n{fragments}")
        print()

    # Example 4: custom context size.
    # ContextFragmenter takes `surround` (characters of context on each side).
    print("【示例4】自定义上下文大小")
    results.fragmenter = ContextFragmenter(surround=5)
    for hit in results:
        fragment = hit.highlights('content')
        print(f"片段(前后5字符): {fragment}")

print("\n✅ 片段提取演示完成!")

8.3 结果过滤与分组

8.3.1 结果过滤

方法1:使用 Filter

from whoosh.query import Term

# Filter: a term query on the "category" field for the value "教程".
filter_query = Term(fieldname="category", text="教程")

# Main query, parsed from the user's search text.
query = parser.parse(u"python")

# Run the main query with the filter passed through the `filter` argument.
results = searcher.search(query, filter=filter_query)

代码示例

from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, KEYWORD, NUMERIC
from whoosh.qparser import QueryParser
from whoosh.query import Term, NumericRange
import os
import shutil

# Create the index (start from an empty directory on every run)
index_dir = "filter_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

# price is NUMERIC so range filters compare numbers. With a TEXT field the
# comparison is lexicographic, where "99" sorts after "100".
schema = Schema(
    title=TEXT(stored=True),
    content=TEXT(stored=True),
    category=KEYWORD(stored=True),
    price=NUMERIC(stored=True)
)

ix = create_in(index_dir, schema)

# Add documents
writer = ix.writer()
docs = [
    {"title": "Python 教程", "content": "学习 Python", "category": "教程", "price": 99},
    {"title": "Java 教程", "content": "学习 Java", "category": "教程", "price": 89},
    {"title": "数据分析实战", "content": "数据分析", "category": "实战", "price": 129},
    {"title": "Web 开发", "content": "Web 技术", "category": "教程", "price": 79},
    {"title": "机器学习", "content": "机器学习", "category": "实战", "price": 149}
]
for doc in docs:
    writer.add_document(**doc)
writer.commit()

print("=== 结果过滤演示 ===\n")

with ix.searcher() as searcher:
    parser = QueryParser("content", ix.schema)

    # Example 1: no filter
    print("【示例1】无过滤 - 搜索 '教程'")
    query = parser.parse(u"教程")
    results = searcher.search(query)
    print(f"命中 {len(results)} 篇")
    for hit in results:
        print(f"  - {hit['title']} ({hit['category']})")
    print()

    # Example 2: filter by category
    print("【示例2】只显示 '教程' 类别")
    query = parser.parse(u"教程")
    filter_query = Term("category", "教程")
    results = searcher.search(query, filter=filter_query)
    print(f"命中 {len(results)} 篇")
    for hit in results:
        print(f"  - {hit['title']} ({hit['category']})")
    print()

    # Example 3: numeric range filter (price strictly below 100)
    print("【示例3】价格 < 100 的文档")
    query = parser.parse(u"*")
    filter_query = NumericRange("price", None, 100, endexcl=True)
    results = searcher.search(query, filter=filter_query)
    print(f"命中 {len(results)} 篇")
    for hit in results:
        print(f"  - {hit['title']} (价格: ¥{hit['price']})")

print("\n✅ 结果过滤演示完成!")

8.3.2 结果分组

使用 Facets 分组(groupedby 参数)

# Facets is defined in whoosh.sorting (not whoosh.searching).
from whoosh.sorting import Facets

# Describe the grouping: one facet over the "category" field.
# add_field() builds the field facet; add_facet() expects a facet object.
facets = Facets()
facets.add_field("category")

# Run the search and group the hits with the facets defined above
results = searcher.search(query, groupedby=facets)

# Each group maps a category value to the list of matching document numbers
for group_name, group_docs in results.groups("category").items():
    print(f"{group_name}: {len(group_docs)} 篇")

代码示例

from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, KEYWORD
from whoosh.qparser import QueryParser
# Facets is defined in whoosh.sorting (not whoosh.searching).
from whoosh.sorting import Facets
import os
import shutil

# Create the index (start from an empty directory on every run)
index_dir = "group_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

schema = Schema(
    title=TEXT(stored=True),
    content=TEXT(stored=True),
    category=KEYWORD(stored=True),
    author=KEYWORD(stored=True)
)

ix = create_in(index_dir, schema)

# Add documents
writer = ix.writer()
docs = [
    {"title": "Python 教程", "content": "Python 基础", "category": "教程", "author": "张三"},
    {"title": "Java 教程", "content": "Java 基础", "category": "教程", "author": "李四"},
    {"title": "数据分析实战", "content": "数据分析", "category": "实战", "author": "张三"},
    {"title": "Web 开发", "content": "Web 技术", "category": "教程", "author": "王五"},
    {"title": "机器学习", "content": "机器学习", "category": "实战", "author": "李四"},
    {"title": "算法导论", "content": "算法", "category": "书籍", "author": "赵六"}
]
for doc in docs:
    writer.add_document(**doc)
writer.commit()

print("=== 结果分组演示 ===\n")

with ix.searcher() as searcher:
    parser = QueryParser("content", ix.schema)
    query = parser.parse(u"*")

    # Example 1: group by category
    print("【示例1】按 category 分组")
    results = searcher.search(query, groupedby="category")
    groups = results.groups("category")
    for group_name, group_docs in groups.items():
        print(f"  {group_name}: {len(group_docs)} 篇")
    print()

    # Example 2: group by author
    print("【示例2】按 author 分组")
    results = searcher.search(query, groupedby="author")
    groups = results.groups("author")
    for author, group_docs in groups.items():
        print(f"  {author}: {len(group_docs)} 篇")
    print()

    # Example 3: group by several fields in one search
    print("【示例3】同时按 category 和 author 分组")
    facets = Facets()
    facets.add_field("category")
    facets.add_field("author")
    results = searcher.search(query, groupedby=facets)

    # groups() maps each value to a list of document numbers, so the count
    # is the length of that list.
    cat_groups = results.groups("category")
    print("按分类:")
    for cat, cat_docs in cat_groups.items():
        print(f"  {cat}: {len(cat_docs)} 篇")

    auth_groups = results.groups("author")
    print("\n按作者:")
    for auth, auth_docs in auth_groups.items():
        print(f"  {auth}: {len(auth_docs)} 篇")

    # Example 4: look at the documents inside one group
    print("\n【示例4】查看 '教程' 分组内的文档")
    results = searcher.search(query, groupedby="category")
    cat_groups = results.groups("category")
    if "教程" in cat_groups:
        doc_ids = cat_groups["教程"]
        print(f"教程分组包含 {len(doc_ids)} 篇文档")
        for doc_id in doc_ids[:3]:  # show the first three
            # Resolve the document number to its stored fields
            fields = searcher.stored_fields(doc_id)
            print(f"  - {fields['title']}")

print("\n✅ 结果分组演示完成!")

8.3.3 结果聚合

聚合统计

# Aggregate by field.
# Facets is defined in whoosh.sorting (not whoosh.searching).
from whoosh.sorting import Facets

facets = Facets()
facets.add_field("category")
results = searcher.search(query, groupedby=facets)

# The size of each group's document-number list is the per-group count
for group_name, group_docs in results.groups("category").items():
    print(f"{group_name}: {len(group_docs)}")

8.4 自定义评分算法

8.4.1 BM25 评分

Whoosh 默认使用 BM25F(BM25 的多字段变体)评分算法:

BM25 公式

score(D,Q) = sum IDF(qi) * (f(qi,D) * (k1 + 1)) / (f(qi,D) + k1 * (1 - b + b * |D|/avgdl))

参数说明

  • k1:词频饱和度参数(默认 1.2)
  • b:长度归一化参数(默认 0.75)

修改 BM25 参数

from whoosh.scoring import BM25F

# Build a scorer with custom BM25F parameters
scorer = BM25F(B=0.5, K1=1.5)

# The weighting model is chosen when the searcher is opened;
# Searcher.search() has no `weighting` argument.
with ix.searcher(weighting=scorer) as searcher:
    results = searcher.search(query)
    # Read hits from `results` inside this block, while the searcher is open.

代码示例

from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
from whoosh.scoring import BM25F
import os
import shutil

# Create the index (start from an empty directory on every run)
index_dir = "scoring_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

schema = Schema(
    title=TEXT(stored=True),
    content=TEXT(stored=True)
)

ix = create_in(index_dir, schema)

# Add documents
writer = ix.writer()
docs = [
    {"title": "Python", "content": "Python 是一门编程语言"},
    {"title": "Python 教程", "content": "学习 Python 编程,Python 是最好的选择"},
    {"title": "Java", "content": "Java 是面向对象的编程语言"},
    {"title": "编程语言", "content": "编程语言包括 Python、Java 等"}
]
for doc in docs:
    writer.add_document(**doc)
writer.commit()

print("=== BM25 评分演示 ===\n")

parser = QueryParser("content", ix.schema)
query = parser.parse(u"python")

# (label, weighting model) for each example. The weighting model is fixed
# when a searcher is opened — Searcher.search() takes no `weighting`
# argument — so every example opens its own searcher.
examples = [
    ("【示例1】默认 BM25 评分 (B=0.75, K1=1.2)", BM25F()),
    ("【示例2】调整长度归一化 (B=0.5)", BM25F(B=0.5)),
    ("【示例3】调整词频饱和度 (K1=2.0)", BM25F(K1=2.0)),
]

for position, (label, scorer) in enumerate(examples):
    if position:
        print()  # blank line between examples
    print(label)
    with ix.searcher(weighting=scorer) as searcher:
        results = searcher.search(query)
        for hit in results:
            print(f"  {hit['title']}: 评分 {hit.score:.4f}")

print("\n✅ BM25 评分演示完成!")

8.4.2 自定义评分器

创建自定义评分类

from whoosh.scoring import WeightingModel


class CustomScorer(WeightingModel):
    """Skeleton of a custom weighting model.

    Pass an instance to ``ix.searcher(weighting=...)`` to use it.

    Args:
        boost_title: Multiplier kept on the instance for use by the scoring
            logic (unused until ``scorer`` is implemented).
    """

    def __init__(self, boost_title=1.5):
        self.boost_title = boost_title

    def scorer(self, searcher, fieldname, text, qf=1):
        """Return the scorer object for one term of one field.

        Whoosh expects a ``whoosh.scoring.BaseScorer`` instance here.
        Returning None would only fail later, deep inside the search, so the
        placeholder fails loudly instead.
        """
        raise NotImplementedError("CustomScorer.scorer() must return a BaseScorer")

代码示例

from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
from whoosh.scoring import BM25F, BaseScorer, WeightingModel
import os
import shutil

# Create the index (start from an empty directory on every run)
index_dir = "custom_scoring_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

schema = Schema(
    title=TEXT(stored=True),
    content=TEXT(stored=True),
    price=TEXT(stored=True)
)

ix = create_in(index_dir, schema)

# Add documents
writer = ix.writer()
docs = [
    {"title": "Python 教程", "content": "Python 入门", "price": "99"},
    {"title": "Python 高级", "content": "Python 进阶", "price": "199"},
    {"title": "Java 教程", "content": "Java 入门", "price": "89"},
    {"title": "Python 实战", "content": "Python 项目", "price": "149"}
]
for doc in docs:
    writer.add_document(**doc)
writer.commit()

print("=== 自定义评分演示 ===\n")


class _PriceDiscountScorer(BaseScorer):
    """Per-term scorer: the wrapped scorer's score times a price discount."""

    def __init__(self, base_scorer, searcher, price_factor):
        self._base_scorer = base_scorer
        self._searcher = searcher
        self._price_factor = price_factor

    def supports_block_quality(self):
        # The discount depends on each document's stored price, so no
        # block-level quality estimate is offered.
        return False

    def score(self, matcher):
        # Read the stored price of the document the matcher is positioned on
        stored = self._searcher.stored_fields(matcher.id())
        price = float(stored.get('price', 100))

        # Discount factor, clamped to the range [0.5, 1.0]
        price_discount = 1 - (price * self._price_factor)
        price_discount = max(0.5, min(1.0, price_discount))

        return self._base_scorer.score(matcher) * price_discount


# Custom weighting model: price-based score adjustment
class PriceBoostScorer(WeightingModel):
    """Price-aware weighting: the higher the price, the lower the score.

    The relevance part is delegated to BM25F; each term score is then
    multiplied by ``1 - price * price_factor`` clamped to [0.5, 1.0].

    Args:
        price_factor: How strongly the price reduces the score.
    """

    def __init__(self, price_factor=0.001):
        self.price_factor = price_factor
        self._base_model = BM25F()

    def scorer(self, searcher, fieldname, text, qf=1):
        """Return the scorer Whoosh calls for one term of one field."""
        base_scorer = self._base_model.scorer(searcher, fieldname, text, qf=qf)
        return _PriceDiscountScorer(base_scorer, searcher, self.price_factor)


parser = QueryParser("content", ix.schema)
query = parser.parse(u"python")

# Example 1: default scoring
print("【示例1】默认评分")
with ix.searcher() as searcher:
    results = searcher.search(query)
    for hit in results:
        print(f"  {hit['title']} (¥{hit['price']}): {hit.score:.4f}")
print()

# Example 2: price-adjusted scoring.
# The weighting model is passed when the searcher is opened;
# Searcher.search() has no `weighting` argument.
print("【示例2】基于价格调整评分(价格低优先)")
scorer = PriceBoostScorer(price_factor=0.002)
with ix.searcher(weighting=scorer) as searcher:
    results = searcher.search(query)
    for hit in results:
        print(f"  {hit['title']} (¥{hit['price']}): {hit.score:.4f}")

print("\n✅ 自定义评分演示完成!")

8.4.3 多字段加权评分

使用 BM25F

from whoosh.qparser import MultifieldParser
from whoosh.scoring import BM25F

# Per-field weights.
# BM25F only reads keyword arguments of the form `<field>_B` (per-field
# length normalisation), so a `field_boosts` argument would be ignored.
# Field weights are applied by the query parser instead.
field_boosts = {
    "title": 2.0,
    "content": 1.0
}

parser = MultifieldParser(list(field_boosts), ix.schema, fieldboosts=field_boosts)
query = parser.parse(u"python")

# The weighting model is chosen when the searcher is opened
with ix.searcher(weighting=BM25F()) as searcher:
    results = searcher.search(query)
    # Read hits from `results` inside this block, while the searcher is open.

代码示例

from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser, MultifieldParser
from whoosh.scoring import BM25F
import os
import shutil

# Create the index (start from an empty directory on every run)
index_dir = "field_weight_scoring_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

schema = Schema(
    title=TEXT(stored=True),
    content=TEXT(stored=True)
)

ix = create_in(index_dir, schema)

# Add documents
writer = ix.writer()
docs = [
    {"title": "Python 教程", "content": "学习编程"},
    {"title": "编程入门", "content": "Python 基础"},
    {"title": "数据分析", "content": "Python 数据分析"}
]
for doc in docs:
    writer.add_document(**doc)
writer.commit()

print("=== 多字段加权评分演示 ===\n")

search_fields = ["title", "content"]

# The weighting model is chosen when the searcher is opened
with ix.searcher(weighting=BM25F()) as searcher:
    # Example 1: no field weighting
    print("【示例1】无加权评分")
    parser = MultifieldParser(search_fields, ix.schema)
    query = parser.parse(u"python")
    results = searcher.search(query)
    for hit in results:
        print(f"  {hit['title']}: {hit.score:.4f}")
    print()

    # Example 2: title matches weighted 2.0.
    # BM25F(title_B=...) would change the title's length normalisation, not
    # its weight; the weight is given to the parser through fieldboosts.
    print("【示例2】标题权重 2.0")
    boosted_parser = MultifieldParser(
        search_fields, ix.schema, fieldboosts={"title": 2.0, "content": 1.0}
    )
    boosted_query = boosted_parser.parse(u"python")
    results = searcher.search(boosted_query)
    for hit in results:
        print(f"  {hit['title']}: {hit.score:.4f}")

print("\n✅ 多字段加权评分演示完成!")

8.5 综合示例

8.5.1 完整的搜索结果处理器

from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, KEYWORD, NUMERIC
from whoosh.qparser import QueryParser
from whoosh.highlight import HtmlFormatter
from whoosh.query import Term
from whoosh.scoring import BM25F
import os
import shutil

# Create the index (start from an empty directory on every run)
index_dir = "result_processor_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

# rating holds fractional values (4.5, 4.7, ...), so it is declared as a
# float field; NUMERIC defaults to int, which would truncate the sort keys.
schema = Schema(
    title=TEXT(stored=True),
    content=TEXT(stored=True),
    category=KEYWORD(stored=True),
    price=NUMERIC(stored=True, sortable=True),
    rating=NUMERIC(float, stored=True, sortable=True)
)

ix = create_in(index_dir, schema)

# Add documents
writer = ix.writer()
products = [
    {"title": "Python 编程入门", "content": "Python 是一门简洁的编程语言,适合初学者", "category": "教程", "price": 99, "rating": 4.5},
    {"title": "数据分析实战", "content": "使用 Python 进行数据分析,包含大量实战案例", "category": "实战", "price": 129, "rating": 4.7},
    {"title": "Java 高级编程", "content": "Java 企业级开发,面向对象深入", "category": "教程", "price": 89, "rating": 4.3},
    {"title": "Web 开发指南", "content": "Web 全栈开发,HTML CSS JavaScript", "category": "教程", "price": 79, "rating": 4.2},
    {"title": "机器学习入门", "content": "Python 机器学习基础,神经网络", "category": "实战", "price": 149, "rating": 4.8}
]
for p in products:
    writer.add_document(**p)
writer.commit()

print("=== 搜索结果处理器演示 ===\n")

class SearchResultProcessor:
    """Runs searches against an index and returns plain-dict results."""

    # Fields that search() can sort on; any other sort_by value falls back
    # to relevance order.
    _SORT_FIELDS = ("rating", "price")

    def __init__(self, index):
        self.index = index

    def search(self, query_str, filters=None, sort_by="rating", highlight=True):
        """Search the "content" field and return a list of result dicts.

        Args:
            query_str: Query text handed to the query parser.
            filters: Optional query passed as the search `filter`.
            sort_by: "rating" or "price" sorts by that field in descending
                order; any other value keeps relevance order.
            highlight: When true, 'highlight' holds the highlighted excerpt;
                otherwise the first 100 characters of the content plus "...".

        Returns:
            A list of dicts with the keys 'title', 'category', 'price',
            'rating', 'score' and 'highlight'.
        """
        sort_field = sort_by if sort_by in self._SORT_FIELDS else None
        filter_query = filters if filters else None

        # The weighting model is chosen when the searcher is opened;
        # Searcher.search() has no `weighting` argument.
        with self.index.searcher(weighting=BM25F()) as searcher:
            parser = QueryParser("content", self.index.schema)
            query = parser.parse(query_str)

            if sort_field:
                results = searcher.search(query, filter=filter_query, sortedby=sort_field, reverse=True)
            else:
                results = searcher.search(query, filter=filter_query)

            # Copy everything out of the hits while the searcher is open
            processed_results = []
            for hit in results:
                result = {
                    'title': hit['title'],
                    'category': hit['category'],
                    'price': hit['price'],
                    'rating': hit['rating'],
                    'score': hit.score
                }

                if highlight:
                    result['highlight'] = hit.highlights('content')
                else:
                    result['highlight'] = hit['content'][:100] + "..."

                processed_results.append(result)

            return processed_results

    def search_with_facets(self, query_str):
        """Search the "content" field and group the hits by category.

        Returns:
            A dict with 'results' (the stored fields of each hit) and
            'groups' (the value of ``results.groups("category")``).
        """
        with self.index.searcher() as searcher:
            parser = QueryParser("content", self.index.schema)
            query = parser.parse(query_str)

            results = searcher.search(query, groupedby="category")

            return {
                'results': [hit.fields() for hit in results],
                'groups': results.groups("category")
            }

# Use the processor
processor = SearchResultProcessor(ix)

# Example 1: basic search
print("【示例1】基本搜索 - 'python'")
results = processor.search("python")
for r in results:
    print(f"  {r['title']} (¥{r['price']}, {r['rating']}分)")
    print(f"    摘要: {r['highlight']}")
    print()

# Example 2: search with a filter
print("【示例2】带过滤 - 'python' 且 category='实战'")
results = processor.search("python", filters=Term("category", "实战"))
for r in results:
    print(f"  {r['title']} (¥{r['price']}, {r['rating']}分)")
    print()

# Example 3: sort by price
print("【示例3】按价格排序 - '教程'")
results = processor.search("教程", sort_by="price")
for r in results:
    print(f"  {r['title']} (¥{r['price']}, {r['rating']}分)")
    print()

# Example 4: grouped search
print("【示例4】带分组搜索 - '教程'")
facet_results = processor.search_with_facets("教程")
print(f"总结果: {len(facet_results['results'])} 篇")
print("分组统计:")
# Each group value is a list of document numbers; its length is the count
for category, doc_ids in facet_results['groups'].items():
    print(f"  {category}: {len(doc_ids)} 篇")

print("\n✅ 搜索结果处理器演示完成!")

本章小结

本章我们学习了搜索结果的处理与展示:

  1. 搜索结果高亮显示:使用 highlights() 方法,自定义高亮格式
  2. 摘要生成与片段提取:生成文档摘要,提取关键片段
  3. 结果过滤与分组:使用 Filter 过滤结果,使用 Facets 分组结果
  4. 自定义评分算法:理解 BM25 评分,创建自定义评分器

通过本章的学习,你应该能够:

  • 实现搜索结果的高亮显示
  • 生成文档摘要和片段
  • 实现结果的过滤和分组
  • 自定义评分算法

在下一章中,我们将学习多语言与中文支持,包括中文分词器集成、停用词过滤等。

« 上一篇 索引优化与管理 下一篇 » 多语言与中文支持