第9章:多语言与中文支持
9.1 中文分词器集成
9.1.1 Whoosh 默认分词器
Whoosh 默认使用的分词器主要针对英文等西文语言:
- StandardAnalyzer:标准分析器,适用于大多数西文
- SimpleAnalyzer:简单分析器,仅按非字母数字字符分割
- StemmingAnalyzer:词干提取分析器,将单词还原为词干
问题:这些分词器无法识别中文词语,只能按标点、空白等做粗粒度切分,对中文检索支持有限。
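下面用一个小片段直观感受这个问题(输出仅作示意,具体切分结果取决于所用的 Whoosh 版本):
from whoosh.analysis import StandardAnalyzer

analyzer = StandardAnalyzer()
# 连续的中英文混排文本往往被当作一个整体 token,切不出"编程语言"这样的中文词
print([t.text for t in analyzer("Python是一门优秀的编程语言,适合数据分析。")])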
9.1.2 集成 jieba 分词器
jieba 是目前最常用的中文分词库之一,支持三种分词模式:
- 精确模式:试图将句子最精确地切开,适合文本分析和建立索引(默认模式)
- 全模式:把句子中所有可能成词的词语都扫描出来,速度快,但存在冗余和歧义
- 搜索引擎模式:在精确模式基础上,对长词再切分,提高召回率,适合搜索引擎分词
安装 jieba:
pip install jieba
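顺带一提,jieba 自带了一个适配 Whoosh 的中文分析器 ChineseAnalyzer(位于 jieba.analyse,需要同时安装 whoosh),可以直接作为 TEXT 字段的 analyzer 使用,下面是一个最小示意:
# 直接使用 jieba 自带的 Whoosh 中文分析器
from jieba.analyse import ChineseAnalyzer
from whoosh.fields import Schema, TEXT

analyzer = ChineseAnalyzer()
schema = Schema(
    title=TEXT(stored=True, analyzer=analyzer),
    content=TEXT(stored=True, analyzer=analyzer)
)
如果需要更细粒度地控制分词行为,也可以像下面这样基于 jieba 创建自定义分词器: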
import jieba
from whoosh.analysis import Tokenizer, Token

class JiebaTokenizer(Tokenizer):
    """基于 jieba 的中文分词器"""
    def __init__(self, cut_all=False):
        self.cut_all = cut_all

    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        assert isinstance(value, str)
        t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
        pos = start_pos
        # 使用 jieba 分词,逐词生成 Token
        for word in jieba.cut(value, cut_all=self.cut_all):
            if not word.strip():
                start_char += len(word)  # 跳过空白,但仍要推进字符偏移
                continue
            t.text = word
            t.boost = 1.0
            if keeporiginal:
                t.original = word
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                # Whoosh 的 Token 使用 startchar / endchar 属性记录字符位置
                t.startchar = start_char
                t.endchar = start_char + len(word)
            yield t
            start_char += len(word)
代码示例:
import jieba
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.analysis import Tokenizer, Token
from whoosh.qparser import QueryParser
import os
import shutil
# 自定义 jieba 分词器
class JiebaTokenizer(Tokenizer):
    def __init__(self, cut_all=False):
        self.cut_all = cut_all

    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        assert isinstance(value, str)
        t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
        pos = start_pos
        for word in jieba.cut(value, cut_all=self.cut_all):
            if not word.strip():
                start_char += len(word)
                continue
            t.text = word
            t.boost = 1.0
            if keeporiginal:
                t.original = word
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(word)
            yield t
            start_char += len(word)
print("=== jieba 分词器演示 ===\n")
# 示例1:jieba 分词测试
print("【示例1】jieba 分词测试")
text = "Python是一门优秀的编程语言,广泛应用于数据分析、人工智能等领域"
print(f"原文: {text}")
print("\n精确模式:")
print(" ".join(jieba.cut(text)))
print("\n全模式:")
print(" ".join(jieba.cut(text, cut_all=True)))
print("\n搜索引擎模式:")
print(" ".join(jieba.cut_for_search(text)))
# 创建索引
index_dir = "jieba_demo"
if os.path.exists(index_dir):
shutil.rmtree(index_dir)
os.makedirs(index_dir)
# 使用自定义 jieba 分词器创建分析器和 Schema
from whoosh.analysis import LowercaseFilter

analyzer = JiebaTokenizer() | LowercaseFilter()
schema = Schema(
title=TEXT(stored=True, analyzer=analyzer),
content=TEXT(stored=True, analyzer=analyzer)
)
ix = create_in(index_dir, schema)
# 添加中文文档
print("\n\n【示例2】索引中文文档")
writer = ix.writer()
docs = [
{
"title": "Python编程入门",
"content": "Python是一门简洁的编程语言,适合初学者学习。Python在数据分析、人工智能等领域有广泛应用。"
},
{
"title": "数据分析实战",
"content": "数据分析是当前热门的技术方向。使用Python进行数据处理和分析,可以提取有价值的信息。"
},
{
"title": "机器学习基础",
"content": "机器学习是人工智能的核心技术之一。Python的机器学习库如scikit-learn、TensorFlow等非常强大。"
}
]
for doc in docs:
writer.add_document(**doc)
print(f"添加: {doc['title']}")
writer.commit()
# 搜索中文
print("\n\n【示例3】搜索中文")
with ix.searcher() as searcher:
parser = QueryParser("content", ix.schema)
# 搜索 "数据分析"
query = parser.parse(u"数据分析")
results = searcher.search(query)
print(f"搜索 '数据分析': 命中 {len(results)} 篇")
for hit in results:
print(f" - {hit['title']}")
# 搜索 "人工智能"
print(f"\n搜索 '人工智能': 命中 {len(results)} 篇")
query = parser.parse(u"人工智能")
results = searcher.search(query)
for hit in results:
print(f" - {hit['title']}")
print("\n✅ jieba 分词器演示完成!")9.1.3 集成其他中文分词器
HanLP 分词器:
import hanlp
class HanLPTokenizer(Tokenizer):
def __call__(self, value, positions=False, chars=False, **kwargs):
        # 使用 HanLP 进行分词(结构与 JiebaTokenizer 相同,此处省略实现)
        pass

pkuseg 分词器:
import pkuseg
class PKUSegTokenizer(Tokenizer):
def __call__(self, value, positions=False, chars=False, **kwargs):
        # 使用 pkuseg 进行分词(结构与 JiebaTokenizer 相同,此处省略实现)
        pass

9.1.4 分词器性能对比
| 分词器 | 速度 | 准确率 | 内存占用 |
|---|---|---|---|
| jieba | 快 | 高 | 低 |
| HanLP | 中 | 很高 | 中 |
| pkuseg | 快 | 高 | 低 |
注:以上为经验性对比,实际速度与准确率取决于所用版本、模型和语料。
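一个简单的分词速度计时思路示意(这里只演示 jieba;HanLP、pkuseg 可按各自的 API 替换,结果受机器和文本影响,仅供参考):
import time
import jieba

text = "Python是一门优秀的编程语言,广泛应用于数据分析、人工智能等领域。" * 1000

jieba.initialize()  # 预先加载词典,避免把加载时间算进分词耗时
start = time.perf_counter()
tokens = list(jieba.cut(text))
elapsed = time.perf_counter() - start
print(f"jieba 切出 {len(tokens)} 个词,耗时 {elapsed:.3f} 秒")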
9.2 停用词过滤配置
9.2.1 什么是停用词?
停用词是指在搜索中频繁出现但实际意义很小的词,如:
- 中文:的、了、和、是、在、我、你、他...
- 英文:the, a, an, is, are, was, were, to, of...
9.2.2 配置停用词
方法1:使用 StopFilter
from whoosh.analysis import StandardAnalyzer, StopFilter
# 创建自定义分析器
analyzer = StandardAnalyzer() | StopFilter(stoplist=custom_stopwords)
代码示例:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.analysis import StandardAnalyzer, StopFilter
from whoosh.qparser import QueryParser
import os
import shutil
# 创建索引
index_dir = "stopwords_demo"
if os.path.exists(index_dir):
shutil.rmtree(index_dir)
os.makedirs(index_dir)
# 自定义中文停用词
chinese_stopwords = {
"的", "了", "和", "是", "在", "我", "你", "他", "她", "它",
"我们", "你们", "他们", "这", "那", "就", "都", "而", "及",
"与", "或", "等", "等", "但", "不", "没有", "有", "一个"
}
# 创建带停用词过滤的分析器
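# 注:StandardAnalyzer 不会对中文按词切分;实际中文场景应把 StopFilter 接在
# 中文分词器(如 9.1 节的 JiebaTokenizer)之后,这里仅演示 StopFilter 的用法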
analyzer = StandardAnalyzer() | StopFilter(stoplist=chinese_stopwords)
schema = Schema(
title=TEXT(stored=True, analyzer=analyzer),
content=TEXT(stored=True, analyzer=analyzer)
)
ix = create_in(index_dir, schema)
print("=== 停用词过滤演示 ===\n")
# 添加文档
print("【步骤1】添加文档")
writer = ix.writer()
docs = [
{
"title": "Python编程语言",
"content": "Python是一门优秀的编程语言。Python在人工智能、数据分析等领域有广泛应用。"
},
{
"title": "Web开发技术",
"content": "Web开发需要掌握HTML、CSS、JavaScript等技术。这些技术在现代网站开发中非常重要。"
}
]
for doc in docs:
writer.add_document(**doc)
print(f" 添加: {doc['title']}")
writer.commit()
# 测试停用词过滤
print("\n\n【步骤2】搜索包含停用词的查询")
with ix.searcher() as searcher:
parser = QueryParser("content", ix.schema)
# 搜索 "的"(停用词,应该不返回结果)
query = parser.parse(u"的")
results = searcher.search(query)
print(f"搜索 '的': 命中 {len(results)} 篇")
# 搜索 "Python的"(停用词被过滤)
query = parser.parse(u"Python的")
results = searcher.search(query)
print(f"搜索 'Python的': 命中 {len(results)} 篇")
for hit in results:
print(f" - {hit['title']}")
# 搜索 "人工智能"
query = parser.parse(u"人工智能")
results = searcher.search(query)
print(f"\n搜索 '人工智能': 命中 {len(results)} 篇")
for hit in results:
print(f" - {hit['title']}")
# 对比:无停用词过滤
print("\n\n【步骤3】对比:无停用词过滤")
index_dir2 = "no_stopwords_demo"
if os.path.exists(index_dir2):
shutil.rmtree(index_dir2)
os.makedirs(index_dir2)
# 不使用停用词过滤
analyzer2 = StandardAnalyzer()
schema2 = Schema(
title=TEXT(stored=True, analyzer=analyzer2),
content=TEXT(stored=True, analyzer=analyzer2)
)
ix2 = create_in(index_dir2, schema2)
writer2 = ix2.writer()
for doc in docs:
writer2.add_document(**doc)
writer2.commit()
with ix2.searcher() as searcher:
parser = QueryParser("content", ix2.schema)
query = parser.parse(u"的")
results = searcher.search(query)
print(f"无停用词过滤搜索 '的': 命中 {len(results)} 篇")
print("\n✅ 停用词过滤演示完成!")9.2.3 内置停用词列表
Whoosh 内置了英文停用词列表 STOP_WORDS;whoosh.lang 模块中还提供了其他一些西方语言的停用词表(不包含中文):
from whoosh.analysis import STOP_WORDS

# 获取内置的英文停用词
english_stopwords = STOP_WORDS
print(english_stopwords)
支持的语言:
- 英语
- 法语
- 德语
- 西班牙语
- 等等
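把内置英文停用词与自定义中文停用词合并使用的一个最小示意(StopFilter 还有 minsize 等参数,默认会过滤过短的 token):
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter, STOP_WORDS

# 合并内置英文停用词与自定义中文停用词
combined_stopwords = STOP_WORDS | {"的", "了", "和", "是"}
analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter(stoplist=combined_stopwords)

# "the" 和 "的" 会被过滤掉
print([t.text for t in analyzer("the Python 的 入门 tutorial")])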
9.2.4 动态添加停用词
from whoosh.analysis import StandardAnalyzer, StopFilter
# 创建基础分析器
analyzer = StandardAnalyzer()
# 动态添加停用词
new_stopwords = {"新", "停用", "词"}
analyzer = analyzer | StopFilter(stoplist=new_stopwords)
9.3 同义词扩展与处理
9.3.1 什么是同义词扩展?
同义词扩展是指在搜索时,自动将查询词扩展为其同义词,提高召回率。
示例:
- 搜索 "Python" → 匹配 "Python"、"py"、"蟒蛇"
- 搜索 "AI" → 匹配 "AI"、"人工智能"、"机器学习"
9.3.2 创建同义词映射
方法1:使用同义词映射表
# 同义词映射
synonyms = {
"python": ["py", "蟒蛇"],
"人工智能": ["ai", "AI", "机器学习", "深度学习"],
"数据": ["data"]
}
方法2:查询重写
from whoosh.qparser import QueryParser
def expand_query(query_str, synonyms):
"""扩展查询"""
expanded_terms = []
for word in query_str.split():
if word in synonyms:
expanded_terms.extend(synonyms[word])
else:
expanded_terms.append(word)
    # 构建扩展后的查询
    from whoosh.query import Or, Term
    terms = [Term("content", term) for term in expanded_terms]
    return Or(terms)
代码示例:
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
from whoosh.query import Or, Term
import os
import shutil
print("=== 同义词扩展演示 ===\n")
# 创建同义词映射
synonyms = {
"python": ["py", "蟒蛇", "PY"],
"人工智能": ["ai", "AI", "机器学习"],
"数据": ["data"],
"编程": ["开发", "coding"]
}
# 创建索引
index_dir = "synonym_demo"
if os.path.exists(index_dir):
shutil.rmtree(index_dir)
os.makedirs(index_dir)
schema = Schema(
title=TEXT(stored=True),
content=TEXT(stored=True)
)
ix = create_in(index_dir, schema)
# 添加文档
print("【步骤1】添加文档")
writer = ix.writer()
docs = [
{
"title": "Python编程教程",
"content": "Python是一门流行的编程语言。Python在AI领域应用广泛。"
},
{
"title": "数据分析技术",
"content": "数据分析是数据科学的重要组成部分。使用Python可以方便地进行数据处理。"
},
{
"title": "人工智能入门",
"content": "AI技术正在改变世界。人工智能包括机器学习和深度学习。"
},
{
"title": "Web开发指南",
"content": "Web开发是现代编程的重要组成部分。"
}
]
for doc in docs:
writer.add_document(**doc)
print(f" 添加: {doc['title']}")
writer.commit()
# 查询重写函数
def expand_query(query_str, synonyms, field="content"):
"""扩展查询字符串"""
words = query_str.split()
expanded_terms = []
for word in words:
word_lower = word.lower()
# 检查是否在同义词映射中
found_synonym = False
for key, syns in synonyms.items():
if word_lower == key.lower() or word_lower in [s.lower() for s in syns]:
# 添加所有同义词
expanded_terms.extend(synonyms[key])
found_synonym = True
break
if not found_synonym:
expanded_terms.append(word)
# 构建查询
terms = [Term(field, term) for term in expanded_terms]
if len(terms) == 1:
return terms[0]
else:
return Or(terms)
# 搜索示例
with ix.searcher() as searcher:
# 示例1:不使用同义词扩展
print("\n\n【示例1】不使用同义词扩展")
parser = QueryParser("content", ix.schema)
query = parser.parse(u"python")
results = searcher.search(query)
print(f"搜索 'python': 命中 {len(results)} 篇")
for hit in results:
print(f" - {hit['title']}")
# 示例2:使用同义词扩展
print("\n【示例2】使用同义词扩展")
expanded_query = expand_query("python", synonyms)
results = searcher.search(expanded_query)
print(f"扩展搜索 'python' (python, py, 蟒蛇): 命中 {len(results)} 篇")
for hit in results:
print(f" - {hit['title']}")
# 示例3:同义词扩展 "AI"
print("\n【示例3】同义词扩展 'AI'")
expanded_query = expand_query("AI", synonyms)
results = searcher.search(expanded_query)
print(f"扩展搜索 'AI' (AI, 人工智能, 机器学习): 命中 {len(results)} 篇")
for hit in results:
print(f" - {hit['title']}")
# 示例4:多词同义词扩展
print("\n【示例4】多词同义词扩展 'Python 编程'")
expanded_query = expand_query("Python 编程", synonyms)
results = searcher.search(expanded_query)
print(f"扩展搜索 'Python 编程': 命中 {len(results)} 篇")
for hit in results:
print(f" - {hit['title']}")
print("\n✅ 同义词扩展演示完成!")9.3.3 同义词文件管理
创建同义词文件(synonyms.txt):
python,py,蟒蛇
AI,人工智能,机器学习,深度学习
数据,data
读取同义词文件:
def load_synonyms(filepath):
"""从文件加载同义词"""
synonyms = {}
with open(filepath, 'r', encoding='utf-8') as f:
for line in f:
words = line.strip().split(',')
if len(words) > 1:
main_word = words[0]
synonyms[main_word] = words[1:]
return synonyms
# 使用
synonyms = load_synonyms("synonyms.txt")
9.4 多语言混合检索方案
9.4.1 多语言索引设计
方案1:单索引多字段
from whoosh.fields import Schema, TEXT, KEYWORD
schema = Schema(
title_zh=TEXT(stored=True), # 中文标题
title_en=TEXT(stored=True), # 英文标题
content_zh=TEXT(stored=True),
content_en=TEXT(stored=True),
lang=KEYWORD(stored=True) # 语言标记
)
方案2:分语言索引
my_index/
    zh/    # 中文索引
    en/    # 英文索引
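方案2(分语言索引)的一个最小示意:为每种语言建立独立的索引目录,并可按语言配置不同的分析器。目录名、字段名均为示例假设,这里为简洁起见沿用默认分析器:
import os
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT

def get_lang_index(base_dir, lang):
    """按语言获取(或创建)独立索引,目录形如 my_index/zh、my_index/en"""
    path = os.path.join(base_dir, lang)
    if os.path.exists(path):
        return open_dir(path)
    os.makedirs(path)
    # 实际使用时可按语言配置不同的 analyzer
    schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True))
    return create_in(path, schema)

# 写入时按文档语言选择对应的索引
ix_zh = get_lang_index("my_index", "zh")
writer = ix_zh.writer()
writer.add_document(title="Python编程教程", content="Python是一门流行的编程语言。")
writer.commit()
下面的代码示例演示方案1(单索引多字段):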
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, KEYWORD
from whoosh.qparser import QueryParser, MultifieldParser
import os
import shutil
print("=== 多语言混合检索演示 ===\n")
# 创建索引
index_dir = "multilang_demo"
if os.path.exists(index_dir):
shutil.rmtree(index_dir)
os.makedirs(index_dir)
# 单索引多字段方案:中英文内容统一用 jieba 自带的 ChineseAnalyzer 分词,
# 它对中文按词语切分、对英文按单词切分,保证中英文查询都能命中
from jieba.analyse import ChineseAnalyzer

analyzer = ChineseAnalyzer()
schema = Schema(
    title=TEXT(stored=True, analyzer=analyzer),
    content=TEXT(stored=True, analyzer=analyzer),
    lang=KEYWORD(stored=True),  # 语言字段
    tags=KEYWORD(stored=True)
)
ix = create_in(index_dir, schema)
# 添加多语言文档
print("【步骤1】添加多语言文档")
writer = ix.writer()
docs = [
{
"title": "Python编程教程",
"content": "Python是一门流行的编程语言,适用于数据分析和人工智能。",
"lang": "zh",
"tags": "python, 编程, 中文"
},
{
"title": "Python Programming Tutorial",
"content": "Python is a popular programming language for data analysis and AI.",
"lang": "en",
"tags": "python, programming, english"
},
{
"title": "数据分析实战",
"content": "使用Python进行数据处理和分析,提取有价值的信息。",
"lang": "zh",
"tags": "数据, 分析, 中文"
},
{
"title": "Data Analysis Guide",
"content": "Learn data analysis with Python and extract valuable insights.",
"lang": "en",
"tags": "data, analysis, english"
},
{
"title": "人工智能入门",
"content": "人工智能包括机器学习和深度学习技术。",
"lang": "zh",
"tags": "AI, 人工智能, 中文"
}
]
for doc in docs:
writer.add_document(**doc)
print(f" 添加: {doc['title']} ({doc['lang']})")
writer.commit()
# 搜索示例
with ix.searcher() as searcher:
# 示例1:中文搜索
print("\n\n【示例1】中文搜索 - 'Python'")
parser = QueryParser("content", ix.schema)
query = parser.parse(u"Python")
results = searcher.search(query)
print(f"命中 {len(results)} 篇")
for hit in results:
print(f" - {hit['title']} ({hit['lang']})")
# 示例2:按语言过滤
print("\n\n【示例2】只搜索中文文档 - 'Python'")
from whoosh.query import Term
query = parser.parse(u"Python")
lang_filter = Term("lang", "zh")
results = searcher.search(query, filter=lang_filter)
print(f"命中 {len(results)} 篇(仅中文)")
for hit in results:
print(f" - {hit['title']} ({hit['lang']})")
# 示例3:英文搜索
print("\n\n【示例3】英文搜索 - 'Python'")
lang_filter = Term("lang", "en")
results = searcher.search(query, filter=lang_filter)
print(f"命中 {len(results)} 篇(仅英文)")
for hit in results:
print(f" - {hit['title']} ({hit['lang']})")
# 示例4:多语言关键词搜索
print("\n\n【示例4】多语言关键词搜索")
parser = MultifieldParser(["title", "content"], ix.schema)
# 搜索 "数据"(中)
query = parser.parse(u"数据")
results = searcher.search(query)
print(f"搜索 '数据': 命中 {len(results)} 篇")
for hit in results:
print(f" - {hit['title']} ({hit['lang']})")
# 搜索 "analysis"(英)
query = parser.parse(u"analysis")
results = searcher.search(query)
print(f"\n搜索 'analysis': 命中 {len(results)} 篇")
for hit in results:
print(f" - {hit['title']} ({hit['lang']})")
# 按语言分组统计
print("\n\n【示例5】按语言分组统计")
with ix.searcher() as searcher:
parser = QueryParser("content", ix.schema)
query = parser.parse(u"*")
results = searcher.search(query, groupedby="lang")
groups = results.groups("lang")
for lang, count in groups.items():
print(f"{lang}: {count} 篇文档")
print("\n✅ 多语言混合检索演示完成!")9.4.2 语言识别
使用 langdetect 库识别语言(先通过 pip install langdetect 安装):
from langdetect import detect
text = "Python是一门编程语言"
lang = detect(text)  # 返回 'zh-cn'
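建索引时也可以用 langdetect 自动给 lang 字段打标签,下面是一个最小示意(沿用 9.4.1 示例中的字段设计,guess_lang 为假设的辅助函数):
from langdetect import detect

def guess_lang(text):
    """粗略判断文档语言,映射到索引中使用的 zh / en 标记"""
    try:
        code = detect(text)      # 如 'zh-cn'、'en'
    except Exception:            # 空文本等无法识别时回退为英文
        return "en"
    return "zh" if code.startswith("zh") else "en"

# 写入文档时自动填充 lang 字段
doc = {"title": "Python Programming Tutorial",
       "content": "Python is a popular programming language."}
doc["lang"] = guess_lang(doc["content"])
9.4.3 自动分语言检索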
def detect_language(text):
"""检测文本语言"""
# 简单的字符编码检测
for char in text:
if '\u4e00' <= char <= '\u9fff':
return 'zh'
return 'en'
def auto_search(searcher, query_str):
"""自动分语言搜索"""
lang = detect_language(query_str)
parser = QueryParser("content", searcher.schema)
query = parser.parse(query_str)
# 添加语言过滤
from whoosh.query import Term
lang_filter = Term("lang", lang)
    return searcher.search(query, filter=lang_filter)
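auto_search 的一个调用示意(假设 ix 是 9.4.1 示例中创建的多语言索引):
# 中文查询会自动只搜 lang=zh 的文档,英文查询则只搜 lang=en 的文档
with ix.searcher() as searcher:
    results = auto_search(searcher, "数据分析")
    for hit in results:
        print(hit["title"], hit["lang"])
9.5 综合示例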
9.5.1 完整的中文搜索系统
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, KEYWORD, ID
from whoosh.qparser import QueryParser
from whoosh.query import Term, Or
from whoosh.highlight import HtmlFormatter
import jieba
import os
import shutil
print("=== 完整中文搜索系统演示 ===\n")
# 同义词映射
synonyms = {
"python": ["py", "蟒蛇"],
"人工智能": ["ai", "AI", "机器学习"],
"数据": ["data"]
}
# 中文停用词
chinese_stopwords = {
"的", "了", "和", "是", "在", "我", "你", "他", "这", "那",
"就", "都", "而", "及", "与", "或", "等", "但", "不"
}
# 创建索引
index_dir = "chinese_search_system"
if os.path.exists(index_dir):
shutil.rmtree(index_dir)
os.makedirs(index_dir)
from jieba.analyse import ChineseAnalyzer  # jieba 自带的 Whoosh 中文分析器

analyzer = ChineseAnalyzer()
schema = Schema(
    id=ID(stored=True, unique=True),
    title=TEXT(stored=True, analyzer=analyzer),
    content=TEXT(stored=True, analyzer=analyzer),
    category=KEYWORD(stored=True)
)
ix = create_in(index_dir, schema)
# 添加文档
print("【步骤1】索引文档")
writer = ix.writer()
docs = [
{
"id": "1",
"title": "Python编程入门教程",
"content": "Python是一门简洁的编程语言。Python在数据分析、人工智能等领域有广泛应用。Python适合初学者学习。",
"category": "教程"
},
{
"id": "2",
"title": "数据分析实战指南",
"content": "数据分析是热门技术。使用Python进行数据处理和分析,可以提取有价值的信息。Python在数据科学领域非常重要。",
"category": "实战"
},
{
"id": "3",
"title": "机器学习入门",
"content": "机器学习是人工智能的核心技术。Python的机器学习库如scikit-learn、TensorFlow等非常强大。AI正在改变世界。",
"category": "入门"
},
{
"id": "4",
"title": "Web开发技术",
"content": "Web开发需要掌握HTML、CSS、JavaScript等技术。Python也可以用于Web后端开发,如Django框架。",
"category": "技术"
}
]
for doc in docs:
writer.add_document(**doc)
print(f" 添加: {doc['title']}")
writer.commit()
# 中文搜索引擎类
class ChineseSearchEngine:
def __init__(self, index_dir):
self.ix = open_dir(index_dir)
    def expand_query(self, query_str):
        """扩展查询(jieba 分词 + 同义词;统一转小写,与索引中的小写词形保持一致)"""
        expanded_terms = []
        for word in jieba.cut(query_str):
            word = word.strip().lower()
            if not word or word in chinese_stopwords:
                continue
            # 命中主词或任一同义词时,扩展为整组同义词
            found = False
            for key, syns in synonyms.items():
                if word == key.lower() or word in [s.lower() for s in syns]:
                    expanded_terms.extend([key.lower()] + [s.lower() for s in syns])
                    found = True
                    break
            if not found:
                expanded_terms.append(word)
        return expanded_terms
def search(self, query_str, category=None, topn=10):
"""搜索"""
with self.ix.searcher() as searcher:
# 扩展查询
expanded_terms = self.expand_query(query_str)
# 构建查询
from whoosh.query import Or, Term
terms = [Term("content", term) for term in expanded_terms]
if len(terms) == 1:
query = terms[0]
else:
query = Or(terms)
# 应用分类过滤
filter_query = None
if category:
filter_query = Term("category", category)
# 执行搜索
results = searcher.search(query, filter=filter_query, limit=topn)
# 处理结果
output = []
for hit in results:
# 高亮
highlighted = hit.highlights('content')
output.append({
'id': hit['id'],
'title': hit['title'],
'category': hit['category'],
'highlight': highlighted,
'score': hit.score
})
return output
# 使用搜索引擎
engine = ChineseSearchEngine(index_dir)
# 示例1:基本搜索
print("\n\n【示例2】基本搜索 - 'Python'")
results = engine.search("Python")
print(f"命中 {len(results)} 篇:")
for r in results:
print(f" - {r['title']} ({r['category']})")
print(f" 摘要: {r['highlight']}")
print()
# 示例2:同义词扩展搜索
print("【示例3】同义词扩展搜索 - 'AI'")
results = engine.search("AI")
print(f"命中 {len(results)} 篇(扩展为: AI, 人工智能, 机器学习):")
for r in results:
print(f" - {r['title']}")
# 示例3:带分类过滤
print("\n【示例4】分类过滤 - 'Python' 且 category='实战'")
results = engine.search("Python", category="实战")
print(f"命中 {len(results)} 篇:")
for r in results:
print(f" - {r['title']}")
# 示例4:长词搜索
print("\n【示例5】长词搜索 - '数据分析人工智能'")
results = engine.search("数据分析人工智能")
print(f"命中 {len(results)} 篇:")
for r in results:
print(f" - {r['title']} (评分: {r['score']:.2f})")
print("\n✅ 完整中文搜索系统演示完成!")本章小结
本章我们学习了 Whoosh 的多语言与中文支持:
- 中文分词器集成:集成 jieba 等中文分词库
- 停用词过滤配置:配置和使用停用词列表
- 同义词扩展与处理:实现查询的同义词扩展
- 多语言混合检索方案:设计和实现多语言索引
通过本章的学习,你应该能够:
- 集成中文分词器到 Whoosh
- 配置和使用停用词过滤
- 实现同义词扩展功能
- 设计多语言混合检索方案
下一章我们将学习性能调优,包括索引性能优化、查询性能监控等。