第6章:高级查询技巧
6.1 布尔查询(AND/OR/NOT)
6.1.1 什么是布尔查询?
布尔查询是全文检索中最常用的高级查询方式,它通过逻辑运算符组合多个查询条件,实现精确的检索需求。
Whoosh 支持三种基本的布尔运算:
- AND:同时满足所有条件
- OR:满足任一条件即可
- NOT:排除满足某个条件的结果
6.1.2 使用 BooleanQuery 类
基本语法:
from whoosh.query import And, Or, Not
# AND 查询:必须同时匹配所有子查询
query = And([Term("title", "python"), Term("category", "教程")])
# OR 查询:匹配任一子查询即可
query = Or([Term("title", "python"), Term("title", "java")])
# NOT 查询:排除匹配的子查询
query = And([Term("content", "python"), Not(Term("category", "广告"))])
# 复杂嵌套查询
query = And([
Or([Term("title", "python"), Term("title", "java")]),
Not(Term("category", "教程"))
])代码示例:
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID
from whoosh.query import And, Or, Not, Term
import os
import shutil
# 创建示例索引
index_dir = "boolean_demo"
if os.path.exists(index_dir):
shutil.rmtree(index_dir)
os.makedirs(index_dir)
schema = Schema(
title=TEXT(stored=True),
content=TEXT(stored=True),
category=TEXT(stored=True),
tags=TEXT(stored=True)
)
ix = create_in(index_dir, schema)
# 添加文档
writer = ix.writer()
docs = [
{"title": "Python 入门教程", "content": "Python 是一门简洁的编程语言", "category": "教程", "tags": "python 编程"},
{"title": "Java 实战指南", "content": "Java 企业级开发实践", "category": "教程", "tags": "java 编程"},
{"title": "Python 高级编程", "content": "深入理解 Python 高级特性", "category": "进阶", "tags": "python 高级"},
{"title": "数据分析实战", "content": "使用 Python 进行数据分析", "category": "实战", "tags": "python 数据"},
{"title": "Web 开发技巧", "content": "Web 开发最佳实践", "category": "技巧", "tags": "web 开发"},
]
for doc in docs:
writer.add_document(**doc)
writer.commit()
# 执行布尔查询
with ix.searcher() as searcher:
# 示例1: AND 查询 - 同时包含 python 和教程
print("=== AND 查询:python AND 教程 ===")
query = And([Term("title", "python"), Term("category", "教程")])
results = searcher.search(query)
for hit in results:
print(f" {hit['title']} (分类: {hit['category']})")
# 示例2: OR 查询 - 标题包含 python 或 java
print("\n=== OR 查询:python OR java ===")
query = Or([Term("title", "python"), Term("title", "java")])
results = searcher.search(query)
for hit in results:
print(f" {hit['title']}")
# 示例3: NOT 查询 - 包含 python 但不是教程
print("\n=== NOT 查询:python NOT 教程 ===")
query = And([Term("title", "python"), Not(Term("category", "教程"))])
results = searcher.search(query)
for hit in results:
print(f" {hit['title']} (分类: {hit['category']})")
# 示例4: 复杂嵌套 - (python OR java) AND NOT 教程
print("\n=== 复杂嵌套:(python OR java) AND NOT 教程 ===")
query = And([
Or([Term("title", "python"), Term("title", "java")]),
Not(Term("category", "教程"))
])
results = searcher.search(query)
for hit in results:
print(f" {hit['title']}")
6.1.3 使用查询语法
通过 QueryParser 也可以直接使用布尔运算符:
from whoosh.qparser import QueryParser
parser = QueryParser("content", ix.schema)
# AND 查询(空格或 AND)
query1 = parser.parse(u"python 检索")
query2 = parser.parse(u"python AND 检索")
# OR 查询
query = parser.parse(u"python OR java")
# NOT 查询
query = parser.parse(u"python NOT java")
# 组合查询
query = parser.parse(u"(python OR java) AND 检索 NOT 广告")
6.1.4 布尔运算符优先级
在组合查询中,AND 的优先级高于 OR:
# 以下两个查询等价
query1 = parser.parse(u"a AND b OR c") # (a AND b) OR c
query2 = Or([And([Term("a"), Term("b")]), Term("c")])
# 使用括号改变优先级
query3 = parser.parse(u"a AND (b OR c)") # a AND (b OR c)
query4 = And([Term("a"), Or([Term("b"), Term("c")])])6.2 范围查询与日期查询
6.2.1 什么是范围查询?
范围查询用于查找字段值在指定范围内的文档,常见应用场景包括:
- 价格区间筛选(100-500元)
- 日期范围查询(2024-01-01 到 2024-12-31)
- 评分范围(4.0-5.0分)
- 数值范围(数量 > 10)
6.2.2 使用 TermRange 查询
基本语法:
from whoosh.query import TermRange
# 包含边界值(默认)
query = TermRange("price", "100", "500")
# 排除边界值
query = TermRange("price", "100", "500", startexcl=True, endexcl=True)
# 只排除上边界
query = TermRange("price", "100", "500", endexcl=True)
# 只排除下边界
query = TermRange("price", "100", "500", startexcl=True)
代码示例:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID, NUMERIC
from whoosh.query import TermRange
import os
import shutil
# 创建包含数值字段的索引
index_dir = "range_demo"
if os.path.exists(index_dir):
shutil.rmtree(index_dir)
os.makedirs(index_dir)
schema = Schema(
    title=TEXT(stored=True),
    price=NUMERIC(stored=True, sortable=True),
    # NUMERIC indexes ints by default; ratings such as 4.5 need a float field,
    # otherwise every 4.x rating is indexed as 4.
    rating=NUMERIC(float, stored=True),
    category=TEXT(stored=True),
)
ix = create_in(index_dir, schema)
# 添加商品数据
writer = ix.writer()
products = [
{"title": "Python 编程入门", "price": 59, "rating": 4.5, "category": "书籍"},
{"title": "Java 实战教程", "price": 89, "rating": 4.2, "category": "书籍"},
{"title": "数据分析实战", "price": 129, "rating": 4.8, "category": "书籍"},
{"title": "Web 开发基础", "price": 45, "rating": 3.9, "category": "书籍"},
{"title": "算法导论", "price": 199, "rating": 4.9, "category": "书籍"},
]
for p in products:
writer.add_document(**p)
writer.commit()
# 范围查询示例
with ix.searcher() as searcher:
# 示例1: 价格范围 50-100
print("=== 价格范围:50-100 元 ===")
query = TermRange("price", 50, 100)
results = searcher.search(query)
for hit in results:
print(f" {hit['title']} - ¥{hit['price']}")
# 示例2: 评分 4.0 以上
print("\n=== 评分:4.0 分以上 ===")
query = TermRange("rating", 4.0, 5.0, startexcl=False, endexcl=False)
results = searcher.search(query)
for hit in results:
print(f" {hit['title']} - {hit['rating']} 分")
# 示例3: 价格 100 以上(排除100)
print("\n=== 价格:100 元以上 ===")
query = TermRange("price", 100, 99999, startexcl=True)
results = searcher.search(query)
for hit in results:
print(f" {hit['title']} - ¥{hit['price']}")
6.2.3 使用 DateRange 查询
基本语法:
from whoosh.query import DateRange
from datetime import datetime
# 日期范围查询
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 12, 31)
query = DateRange("date", start_date, end_date)
代码示例:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID, DATETIME
from whoosh.query import DateRange
from datetime import datetime
import os
import shutil
# 创建包含日期字段的索引
index_dir = "date_range_demo"
if os.path.exists(index_dir):
shutil.rmtree(index_dir)
os.makedirs(index_dir)
schema = Schema(
title=TEXT(stored=True),
publish_date=DATETIME(stored=True, sortable=True),
author=TEXT(stored=True)
)
ix = create_in(index_dir, schema)
# 添加带日期的文档
writer = ix.writer()
articles = [
{"title": "Python 3.12 新特性", "publish_date": datetime(2024, 1, 15), "author": "张三"},
{"title": "深入理解异步编程", "publish_date": datetime(2024, 3, 20), "author": "李四"},
{"title": "数据分析实战指南", "publish_date": datetime(2024, 6, 10), "author": "王五"},
{"title": "Web 开发最佳实践", "publish_date": datetime(2024, 8, 5), "author": "张三"},
{"title": "2024 年度总结", "publish_date": datetime(2024, 12, 25), "author": "赵六"},
]
for article in articles:
writer.add_document(**article)
writer.commit()
# 日期范围查询示例
with ix.searcher() as searcher:
# 示例1: 2024 年上半年发布的文章
print("=== 2024 年上半年(1-6月)===")
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 6, 30, 23, 59, 59)
query = DateRange("publish_date", start_date, end_date)
results = searcher.search(query)
for hit in results:
print(f" {hit['title']} - {hit['publish_date'].strftime('%Y-%m-%d')}")
# 示例2: 最近3个月
print("\n=== 最近 3 个月 ===")
start_date = datetime(2024, 6, 1)
end_date = datetime(2024, 12, 31)
query = DateRange("publish_date", start_date, end_date)
results = searcher.search(query, sortedby="publish_date", reverse=True)
for hit in results:
print(f" {hit['title']} - {hit['publish_date'].strftime('%Y-%m-%d')}")
# 示例3: 指定日期之后
print("\n=== 2024年6月之后 ===")
start_date = datetime(2024, 6, 1)
query = DateRange("publish_date", start_date, None)
results = searcher.search(query)
for hit in results:
print(f" {hit['title']} - {hit['publish_date'].strftime('%Y-%m-%d')}")
6.2.4 使用 NumericRange 查询
对于数值类型字段,可以使用 NumericRange 进行范围查询:
from whoosh.query import NumericRange
# 整数范围
query = NumericRange("price", 100, 500)
# 浮点数范围
query = NumericRange("rating", 4.0, 5.0)
6.3 多字段查询与权重设置
6.3.1 什么是多字段查询?
多字段查询允许同时在多个字段中搜索关键词,并为不同字段设置不同的权重,从而优化搜索结果的排序。
6.3.2 使用 MultifieldParser
基本用法:
from whoosh.qparser import MultifieldParser
# 创建多字段查询解析器
parser = MultifieldParser(["title", "content"], ix.schema)
query = parser.parse(u"python 搜索")
代码示例:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import MultifieldParser
import os
import shutil
# 创建索引
index_dir = "multifield_demo"
if os.path.exists(index_dir):
shutil.rmtree(index_dir)
os.makedirs(index_dir)
schema = Schema(
title=TEXT(stored=True),
content=TEXT(stored=True),
author=TEXT(stored=True)
)
ix = create_in(index_dir, schema)
# 添加文档
writer = ix.writer()
docs = [
{"title": "Python 编程", "content": "学习 Python 编程语言", "author": "张三"},
{"title": "Java 入门", "content": "Java 编程基础教程", "author": "李四"},
{"title": "数据分析", "content": "使用 Python 进行数据分析", "author": "王五"},
{"title": "搜索算法", "content": "全文搜索算法详解", "author": "赵六"},
]
for doc in docs:
writer.add_document(**doc)
writer.commit()
# 多字段查询示例
with ix.searcher() as searcher:
# 示例1: 在标题和内容中搜索
print("=== 在 title 和 content 中搜索 'python' ===")
parser = MultifieldParser(["title", "content"], ix.schema)
query = parser.parse(u"python")
results = searcher.search(query)
for hit in results:
print(f" {hit['title']} (评分: {hit.score:.2f})")
# 示例2: 在所有字段中搜索
print("\n=== 在所有字段中搜索 '张三' ===")
parser = MultifieldParser(["title", "content", "author"], ix.schema)
query = parser.parse(u"张三")
results = searcher.search(query)
for hit in results:
print(f" {hit['title']} - 作者: {hit['author']}")
6.3.3 设置字段权重
通过为不同字段设置权重,可以影响搜索结果的排序:
代码示例:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import MultifieldParser
from whoosh.qparser import BoostPlugin
import os
import shutil
# 创建索引
index_dir = "weight_demo"
if os.path.exists(index_dir):
shutil.rmtree(index_dir)
os.makedirs(index_dir)
schema = Schema(
title=TEXT(stored=True),
content=TEXT(stored=True),
tags=TEXT(stored=True)
)
ix = create_in(index_dir, schema)
# 添加文档
writer = ix.writer()
docs = [
{"title": "Python 教程", "content": "学习 Python 编程语言", "tags": "python 教程"},
{"title": "数据分析入门", "content": "本教程详细介绍数据分析方法,使用 Python 实现数据分析", "tags": "python 数据"},
{"title": "Web 开发指南", "content": "Web 开发需要掌握 HTML、CSS、JavaScript 等技术", "tags": "web 开发"},
{"title": "Python 实战", "content": "通过实战项目学习 Python", "tags": "python 实战"},
]
for doc in docs:
writer.add_document(**doc)
writer.commit()
# 权重查询示例
with ix.searcher() as searcher:
# 示例1: 标题权重更高
print("=== 标题权重 2.0,内容权重 1.0 ===")
parser = MultifieldParser(["title^2.0", "content^1.0"], ix.schema)
query = parser.parse(u"python")
results = searcher.search(query)
for i, hit in enumerate(results, 1):
print(f" {i}. {hit['title']} (评分: {hit.score:.2f})")
# 示例2: 使用 BoostPlugin 设置权重
print("\n=== 使用 BoostPlugin ===")
parser = MultifieldParser(["title", "content"], ix.schema)
parser.add_plugin(BoostPlugin())
# 在查询字符串中直接指定权重
query = parser.parse(u"title:python^2.0 content:python^1.0")
results = searcher.search(query)
for i, hit in enumerate(results, 1):
print(f" {i}. {hit['title']} (评分: {hit.score:.2f})")6.3.4 字段权重应用场景
| 场景 | 推荐权重 | 说明 |
|---|---|---|
| 电商商品搜索 | 标题 2.0,描述 1.0 | 标题匹配更重要 |
| 文档搜索 | 标题 3.0,内容 1.0 | 标题完全匹配应优先 |
| 新闻搜索 | 标题 2.0,正文 1.0 | 标题直接影响点击率 |
| 论文搜索 | 标题 2.5,摘要 1.5,正文 1.0 | 摘要的重要性介于标题和正文之间 |
6.4 查询结果排序与评分
6.4.1 评分机制
Whoosh 使用 BM25 算法计算文档的相关性评分,评分考虑因素包括:
- 词频(Term Frequency):词在文档中出现的次数
- 文档频率(Document Frequency):词在整个索引中出现的文档数
- 字段长度:文档长度越长,权重越低
- 字段权重:不同字段的重要性
查看评分:
with ix.searcher() as searcher:
query = parser.parse(u"python")
results = searcher.search(query)
for hit in results:
print(f"标题: {hit['title']}")
print(f"评分: {hit.score}")
print(f"排名: {hit.rank}")
6.4.2 结果排序
默认排序:按评分降序(相关性从高到低)
按字段排序:
# 按价格升序
results = searcher.search(query, sortedby="price")
# 按价格降序
results = searcher.search(query, sortedby="price", reverse=True)
# 按日期排序
results = searcher.search(query, sortedby="date", reverse=True)
代码示例:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, NUMERIC, DATETIME
from whoosh.qparser import QueryParser
from datetime import datetime
import os
import shutil
# 创建索引
index_dir = "sort_demo"
if os.path.exists(index_dir):
shutil.rmtree(index_dir)
os.makedirs(index_dir)
schema = Schema(
    title=TEXT(stored=True),
    price=NUMERIC(stored=True, sortable=True),
    publish_date=DATETIME(stored=True, sortable=True),
    # NUMERIC indexes ints by default; a float field keeps 4.5 / 4.8 / 4.2
    # distinct so that sorting by rating is meaningful.
    rating=NUMERIC(float, stored=True, sortable=True),
)
ix = create_in(index_dir, schema)
# 添加文档
writer = ix.writer()
products = [
{"title": "Python 入门", "price": 59, "publish_date": datetime(2024, 1, 10), "rating": 4.5},
{"title": "Java 进阶", "price": 129, "publish_date": datetime(2024, 3, 15), "rating": 4.8},
{"title": "数据分析", "price": 89, "publish_date": datetime(2024, 2, 20), "rating": 4.2},
{"title": "Web 开发", "price": 45, "publish_date": datetime(2024, 4, 5), "rating": 3.9},
]
for p in products:
writer.add_document(**p)
writer.commit()
# 排序示例
with ix.searcher() as searcher:
parser = QueryParser("title", ix.schema)
query = parser.parse(u"*") # 匹配所有文档
# 示例1: 默认按评分排序
print("=== 默认按相关性评分排序 ===")
results = searcher.search(query)
for i, hit in enumerate(results, 1):
print(f" {i}. {hit['title']} (评分: {hit.score:.2f})")
# 示例2: 按价格升序
print("\n=== 按价格升序排序 ===")
results = searcher.search(query, sortedby="price")
for i, hit in enumerate(results, 1):
print(f" {i}. {hit['title']} - ¥{hit['price']}")
# 示例3: 按价格降序
print("\n=== 按价格降序排序 ===")
results = searcher.search(query, sortedby="price", reverse=True)
for i, hit in enumerate(results, 1):
print(f" {i}. {hit['title']} - ¥{hit['price']}")
# 示例4: 按日期降序
print("\n=== 按发布日期降序排序 ===")
results = searcher.search(query, sortedby="publish_date", reverse=True)
for i, hit in enumerate(results, 1):
print(f" {i}. {hit['title']} - {hit['publish_date'].strftime('%Y-%m-%d')}")
# 示例5: 按评分降序
print("\n=== 按用户评分降序排序 ===")
results = searcher.search(query, sortedby="rating", reverse=True)
for i, hit in enumerate(results, 1):
print(f" {i}. {hit['title']} - {hit['rating']} 分")
6.4.3 多字段排序
from whoosh.sorting import FieldFacet, MultiFacet
# 按多个字段排序
# 先按价格升序,价格相同时按评分降序
facet = MultiFacet()
facet.add_field("price")
facet.add_field("rating", reverse=True)
results = searcher.search(query, sortedby=facet)
6.4.4 自定义评分函数
可以通过自定义函数修改评分:
def custom_score(searcher, docnum, score):
# 自定义评分逻辑
# 例如:结合价格因素
return score * 0.8 + (1 - price / max_price) * 0.26.5 分页与限制结果数量
6.5.1 限制结果数量
使用 limit 参数:
# 最多返回 10 条结果
results = searcher.search(query, limit=10)
# 只返回前 5 条
results = searcher.search(query, limit=5)
6.5.2 分页查询
使用 search_page 方法:
# 获取第一页,每页 10 条
page_num = 1
page_size = 10
results = searcher.search_page(query, page_num, pagelen=page_size)
# 获取总页数
total_pages = results.pagecount
# 获取总结果数
total_results = results.total_length()代码示例:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
import os
import shutil
# 创建索引并添加大量文档
index_dir = "pagination_demo"
if os.path.exists(index_dir):
shutil.rmtree(index_dir)
os.makedirs(index_dir)
schema = Schema(
title=TEXT(stored=True),
content=TEXT(stored=True)
)
ix = create_in(index_dir, schema)
# 添加 35 篇文档
writer = ix.writer()
for i in range(1, 36):
doc = {
"title": f"文档 {i:02d} - Python 编程",
"content": f"这是第 {i} 篇关于 Python 编程的文档"
}
writer.add_document(**doc)
writer.commit()
# 分页查询示例
with ix.searcher() as searcher:
parser = QueryParser("content", ix.schema)
query = parser.parse(u"python")
# 获取总结果数
all_results = searcher.search(query)
total = len(all_results)
print(f"总结果数: {total}")
# 示例1: 分页显示,每页 10 条
page_size = 10
total_pages = (total + page_size - 1) // page_size
print(f"\n总页数: {total_pages} (每页 {page_size} 条)")
# 显示所有页
for page_num in range(1, total_pages + 1):
results = searcher.search_page(query, page_num, pagelen=page_size)
print(f"\n--- 第 {page_num} 页 ---")
for hit in results:
print(f" [{hit.rank + 1}] {hit['title']}")
# 示例2: 使用 search_page 对象的方法
print(f"\n--- 分页对象信息 ---")
results = searcher.search_page(query, 1, pagelen=10)
print(f"当前页: {results.pagenumber}")
print(f"总页数: {results.pagecount}")
print(f"总结果: {results.total_length()}")
# 示例3: 上一页和下一页
print(f"\n--- 翻页演示 ---")
for page_num in range(1, total_pages + 1):
results = searcher.search_page(query, page_num, pagelen=10)
print(f"\n第 {page_num} 页: {', '.join([hit['title'] for hit in results])}")6.5.3 分页实现
完整的分页类:
class Pagination:
    """One page of search results plus the navigation data around it.

    Args:
        searcher: object providing ``search_page(query, page, pagelen=...)``
            whose return value exposes ``total`` and ``pagecount``.
        query: query handed to ``search_page`` unchanged.
        page: page number to fetch; page 1 is the first page.
        per_page: maximum number of hits per page.
    """

    def __init__(self, searcher, query, page, per_page=10):
        self.searcher = searcher
        self.query = query
        self.page = page
        self.per_page = per_page
        self.results = searcher.search_page(query, page, pagelen=per_page)
        # Read the counters from the page that was just fetched
        self.total = self.results.total
        self.pages = self.results.pagecount

    def __iter__(self):
        """Iterate over the hits on the current page."""
        return iter(self.results)

    def has_prev(self):
        """Return True when a page precedes the current one."""
        return self.page > 1

    def has_next(self):
        """Return True when a page follows the current one."""
        return self.page < self.pages

    def prev_num(self):
        """Return the previous page number, or None on the first page."""
        return self.page - 1 if self.has_prev() else None

    def next_num(self):
        """Return the next page number, or None on the last page."""
        return self.page + 1 if self.has_next() else None
# 使用示例
pagination = Pagination(searcher, query, page=1, per_page=10)
print(f"第 {pagination.page} 页 / 共 {pagination.pages} 页")
for hit in pagination:
print(hit['title'])
6.5.4 搜索建议(Suggest)
使用 Prefix 查询实现:
from whoosh.query import Prefix
def get_suggestions(searcher, field, prefix, limit=10):
    """Return the stored ``field`` value of up to ``limit`` prefix matches."""
    prefix_query = Prefix(field, prefix)
    suggestions = []
    for match in searcher.search(prefix_query, limit=limit):
        suggestions.append(match[field])
    return suggestions
# 使用示例
suggestions = get_suggestions(searcher, "title", "py")
print(f"搜索建议: {', '.join(suggestions)}")
6.6 综合示例
6.6.1 完整的高级查询示例
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, NUMERIC, DATETIME
from whoosh.qparser import QueryParser, MultifieldParser
from whoosh.query import And, Or, Not, Term, TermRange, DateRange
from whoosh.sorting import FieldFacet, MultiFacet
from datetime import datetime
import os
import shutil
# 创建商品搜索索引
index_dir = "advanced_query_demo"
if os.path.exists(index_dir):
shutil.rmtree(index_dir)
os.makedirs(index_dir)
schema = Schema(
    title=TEXT(stored=True),
    description=TEXT(stored=True),
    category=TEXT(stored=True),
    price=NUMERIC(stored=True, sortable=True),
    # NUMERIC indexes ints by default; a float field is required so that
    # ratings like 4.5 are not truncated to 4 for range queries and sorting.
    rating=NUMERIC(float, stored=True, sortable=True),
    publish_date=DATETIME(stored=True, sortable=True),
)
ix = create_in(index_dir, schema)
# 添加商品数据
writer = ix.writer()
products = [
{"title": "Python 编程入门", "description": "零基础学习 Python 编程", "category": "教程", "price": 59, "rating": 4.5, "publish_date": datetime(2024, 1, 10)},
{"title": "Java 高级编程", "description": "Java 企业级开发实战", "category": "教程", "price": 89, "rating": 4.8, "publish_date": datetime(2024, 2, 15)},
{"title": "数据分析实战", "description": "使用 Python 进行数据分析", "category": "实战", "price": 129, "rating": 4.7, "publish_date": datetime(2024, 3, 20)},
{"title": "Web 开发指南", "description": "全栈 Web 开发教程", "category": "教程", "price": 79, "rating": 4.3, "publish_date": datetime(2024, 4, 10)},
{"title": "算法导论", "description": "经典算法与数据结构", "category": "书籍", "price": 199, "rating": 4.9, "publish_date": datetime(2024, 5, 5)},
{"title": "Python 机器学习", "description": "机器学习基础与实践", "category": "实战", "price": 149, "rating": 4.6, "publish_date": datetime(2024, 6, 15)},
{"title": "Go 语言实战", "description": "Go 语言编程指南", "category": "教程", "price": 69, "rating": 4.4, "publish_date": datetime(2024, 7, 20)},
]
for p in products:
writer.add_document(**p)
writer.commit()
print("=== 高级查询综合示例 ===\n")
with ix.searcher() as searcher:
    # Example 1: boolean query - title contains "python" and price below 100
    print("【示例1】包含 Python 且价格 < 100")
    query = And([
        Term("title", "python"),
        TermRange("price", None, 100, endexcl=True),
    ])
    results = searcher.search(query, sortedby="rating", reverse=True)
    for hit in results:
        print(f" {hit['title']} - ¥{hit['price']} - {hit['rating']}分")

    # Example 2: weighted multi-field search for "python", title boosted
    print("\n【示例2】标题权重 2.0,描述权重 1.0,搜索 Python")
    # Field names must be plain schema names; boosts go in `fieldboosts`.
    parser = MultifieldParser(
        ["title", "description"],
        ix.schema,
        fieldboosts={"title": 2.0, "description": 1.0},
    )
    query = parser.parse(u"python")
    results = searcher.search(query)
    for i, hit in enumerate(results, 1):
        print(f" {i}. {hit['title']} (评分: {hit.score:.2f})")

    # Example 3: range query - price 50-150 and rating at least 4.5
    print("\n【示例3】价格 50-150 元且评分 ≥ 4.5")
    query = And([
        TermRange("price", 50, 150),
        TermRange("rating", 4.5, 5.0),
    ])
    results = searcher.search(query, sortedby="rating", reverse=True)
    for hit in results:
        print(f" {hit['title']} - ¥{hit['price']} - {hit['rating']}分")

    # Example 4: date range - Python-related items published Jan-Jun 2024
    print("\n【示例4】2024 年 1-6 月发布的 Python 相关内容")
    query = And([
        Or([Term("title", "python"), Term("description", "python")]),
        DateRange("publish_date", datetime(2024, 1, 1), datetime(2024, 6, 30, 23, 59, 59)),
    ])
    results = searcher.search(query, sortedby="publish_date", reverse=True)
    for hit in results:
        print(f" {hit['title']} - {hit['publish_date'].strftime('%Y-%m-%d')}")

    # Example 5: (python OR java), price < 100, rating > 4.0, category not "书籍"
    print("\n【示例5】(Python 或 Java) 且价格 < 100 且评分 > 4.0,排除 '书籍'")
    query = And([
        Or([Term("title", "python"), Term("title", "java")]),
        TermRange("price", None, 100, endexcl=True),
        TermRange("rating", 4.0, 5.0, startexcl=True),
        Not(Term("category", "书籍")),
    ])
    results = searcher.search(query)
    for hit in results:
        print(f" {hit['title']} - ¥{hit['price']} - {hit['rating']}分 - {hit['category']}")

    # Example 6: multi-key sort - price ascending, then rating descending
    print("\n【示例6】多字段排序:价格升序,评分降序")
    parser = QueryParser("title", ix.schema)
    query = parser.parse(u"*")
    facet = MultiFacet()
    facet.add_field("price")
    facet.add_field("rating", reverse=True)
    results = searcher.search(query, sortedby=facet)
    for i, hit in enumerate(results, 1):
        print(f" {i}. {hit['title']} - ¥{hit['price']} - {hit['rating']}分")

    # Example 7: paged query, 3 hits per page
    print("\n【示例7】分页查询(每页 3 条)")
    query = parser.parse(u"*")
    page_size = 3
    total_pages = (len(searcher.search(query)) + page_size - 1) // page_size
    for page_num in range(1, total_pages + 1):
        results = searcher.search_page(query, page_num, pagelen=page_size)
        print(f"\n 第 {page_num} 页:")
        for hit in results:
            print(f" {hit['title']} - ¥{hit['price']}")

print("\n✅ 高级查询示例演示完成!")
本章小结
本章我们学习了 Whoosh 的高级查询技巧:
- 布尔查询:使用 AND、OR、NOT 组合多个查询条件
- 范围查询:使用 TermRange、NumericRange、DateRange 查询词项、数值和日期范围
- 多字段查询:使用 MultifieldParser 在多个字段中搜索,并设置字段权重
- 结果排序与评分:理解 BM25 评分机制,实现自定义排序
- 分页查询:使用 search_page 方法实现分页显示
通过本章的学习,你应该能够:
- 构建复杂的组合查询条件
- 实现范围筛选功能
- 优化搜索结果的排序
- 实现分页浏览功能
在下一章中,我们将学习索引优化与管理,包括索引合并、增量更新、删除文档等操作。