Chapter 9: Multilingual and Chinese Language Support

9.1 Integrating Chinese Tokenizers

9.1.1 Whoosh's Default Analyzers

The analyzers Whoosh uses by default are aimed mainly at English and other Western languages:

  • StandardAnalyzer: the standard analyzer, suitable for most Western-language text
  • SimpleAnalyzer: a simple analyzer that splits only on non-alphanumeric characters
  • StemmingAnalyzer: a stemming analyzer that reduces words to their stems

The problem: these analyzers have very limited Chinese support. At best they split on punctuation; the standard regex tokenizer actually keeps an entire run of Chinese characters together as a single token, so individual words cannot be matched.
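A quick check makes the problem concrete (a minimal sketch; the exact token output can vary by Whoosh version):

from whoosh.analysis import StandardAnalyzer

# The standard regex tokenizer matches \w+, and CJK characters count as
# word characters, so the whole Chinese run survives as one token
print([t.text for t in StandardAnalyzer()(u"Python是一门优秀的编程语言")])
# Expected: ['python是一门优秀的编程语言'] -- one unsearchable blob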

9.1.2 Integrating the jieba Tokenizer

jieba is the most popular Chinese word-segmentation library. It supports three segmentation modes:

  • Accurate mode: the most precise segmentation, best for indexing
  • Full mode: scans out every word that could possibly be formed, fast but noisy
  • Search-engine mode: accurate mode plus a second pass that re-splits long words, improving recall

Installing jieba

pip install jieba

Creating a Custom Tokenizer

import jieba
from whoosh.analysis import Tokenizer, Token

class JiebaTokenizer(Tokenizer):
    """A Whoosh tokenizer that delegates word segmentation to jieba."""
    
    def __init__(self, cut_all=False):
        self.cut_all = cut_all
    
    def __call__(self, value, positions=False, chars=False, 
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        assert isinstance(value, str)
        t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
        
        # Segment with jieba. The character offsets computed below are only
        # reliable in accurate mode; full mode (cut_all=True) yields
        # overlapping words. Use jieba.tokenize() if you need exact offsets.
        words = jieba.cut(value, cut_all=self.cut_all)
        
        pos = start_pos
        for word in words:
            if not word.strip():
                # Skip whitespace-only tokens, but still advance the
                # character offset so later tokens stay aligned
                start_char += len(word)
                continue
            
            t.original = t.text = word
            t.boost = 1.0
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                # Whoosh expects startchar/endchar on the token
                t.startchar = start_char
                t.endchar = start_char + len(word)
            start_char += len(word)
            yield t
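To turn the tokenizer into a full analyzer, pipe filters onto it with |, the same composition mechanism Whoosh's built-in analyzers use. A minimal sketch (note minsize=1 on the StopFilter: the default of 2 would silently drop every single-character Chinese word):

from whoosh.analysis import LowercaseFilter, StopFilter

chinese_stopwords = {"的", "了", "和", "是", "在"}  # example stoplist

analyzer = (JiebaTokenizer()
            | LowercaseFilter()
            | StopFilter(stoplist=chinese_stopwords, minsize=1))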

Code Example

import jieba
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.analysis import Tokenizer, Token
from whoosh.qparser import QueryParser
import os
import shutil

# Custom jieba tokenizer (same as above, with the offset fixes)
class JiebaTokenizer(Tokenizer):
    def __init__(self, cut_all=False):
        self.cut_all = cut_all
    
    def __call__(self, value, positions=False, chars=False, 
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        assert isinstance(value, str)
        t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
        
        words = jieba.cut(value, cut_all=self.cut_all)
        
        pos = start_pos
        for word in words:
            if not word.strip():
                start_char += len(word)  # keep offsets aligned
                continue
            
            t.original = t.text = word
            t.boost = 1.0
            if positions:
                t.pos = pos
                pos += 1
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(word)
            start_char += len(word)
            yield t

# Filter used to build the jieba-based analyzer below
from whoosh.analysis import LowercaseFilter

print("=== jieba tokenizer demo ===\n")

# Example 1: jieba segmentation test
print("[Example 1] jieba segmentation test")
text = "Python是一门优秀的编程语言,广泛应用于数据分析、人工智能等领域"
print(f"Original: {text}")
print("\nAccurate mode:")
print(" ".join(jieba.cut(text)))
print("\nFull mode:")
print(" ".join(jieba.cut(text, cut_all=True)))
print("\nSearch-engine mode:")
print(" ".join(jieba.cut_for_search(text)))

# Create the index
index_dir = "jieba_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

# Build the Schema with the jieba-backed analyzer: piping filters onto a
# tokenizer with | yields a composite analyzer, so the index actually
# segments Chinese instead of falling back to StandardAnalyzer
analyzer = JiebaTokenizer() | LowercaseFilter()

schema = Schema(
    title=TEXT(stored=True, analyzer=analyzer),
    content=TEXT(stored=True, analyzer=analyzer)
)

ix = create_in(index_dir, schema)

# Index some Chinese documents
print("\n\n[Example 2] Indexing Chinese documents")
writer = ix.writer()
docs = [
    {
        "title": "Python编程入门",
        "content": "Python是一门简洁的编程语言,适合初学者学习。Python在数据分析、人工智能等领域有广泛应用。"
    },
    {
        "title": "数据分析实战",
        "content": "数据分析是当前热门的技术方向。使用Python进行数据处理和分析,可以提取有价值的信息。"
    },
    {
        "title": "机器学习基础",
        "content": "机器学习是人工智能的核心技术之一。Python的机器学习库如scikit-learn、TensorFlow等非常强大。"
    }
]
for doc in docs:
    writer.add_document(**doc)
    print(f"添加: {doc['title']}")
writer.commit()

# Search Chinese text
print("\n\n[Example 3] Searching Chinese text")
with ix.searcher() as searcher:
    parser = QueryParser("content", ix.schema)
    
    # 搜索 "数据分析"
    query = parser.parse(u"数据分析")
    results = searcher.search(query)
    print(f"搜索 '数据分析': 命中 {len(results)} 篇")
    for hit in results:
        print(f"  - {hit['title']}")
    
    # 搜索 "人工智能"
    print(f"\n搜索 '人工智能': 命中 {len(results)} 篇")
    query = parser.parse(u"人工智能")
    results = searcher.search(query)
    for hit in results:
        print(f"  - {hit['title']}")

print("\n✅ jieba 分词器演示完成!")

9.1.3 Integrating Other Chinese Tokenizers

HanLP Tokenizer

import hanlp

class HanLPTokenizer(Tokenizer):
    def __call__(self, value, positions=False, chars=False, **kwargs):
        # Load a pretrained HanLP tokenization model, segment `value`,
        # then yield Token objects using the same loop as JiebaTokenizer
        pass

pkuseg Tokenizer

import pkuseg

class PKUSegTokenizer(Tokenizer):
    def __call__(self, value, positions=False, chars=False, **kwargs):
        # words = pkuseg.pkuseg().cut(value), then yield Token objects
        # using the same loop as JiebaTokenizer
        pass

9.1.4 Tokenizer Performance Comparison

A rough qualitative comparison (actual figures depend on the model, dictionary, and corpus):

Tokenizer   Speed      Accuracy    Memory footprint
jieba       fast       good        low
HanLP       slower     very high   high (loads pretrained models)
pkuseg      moderate   high        moderate
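Figures like these are easy to reproduce for your own corpus. A rough timing sketch for jieba (the same harness works for any tokenizer that exposes a cut-style callable):

import time
import jieba

def benchmark(cut, text, rounds=100):
    list(cut(text))  # warm up: jieba loads its dictionary lazily
    start = time.perf_counter()
    for _ in range(rounds):
        list(cut(text))
    return (time.perf_counter() - start) / rounds

text = "Python是一门优秀的编程语言,广泛应用于数据分析、人工智能等领域" * 50
print(f"jieba accurate mode: {benchmark(jieba.cut, text) * 1000:.2f} ms/round")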

9.2 Configuring Stopword Filtering

9.2.1 What Are Stopwords?

Stopwords are words that appear frequently in text but carry little meaning for search, for example:

  • Chinese: 的、了、和、是、在、我、你、他...
  • English: the, a, an, is, are, was, were, to, of...

9.2.2 Configuring Stopwords

Method 1: Using StopFilter

from whoosh.analysis import StandardAnalyzer, StopFilter

# Pipe an extra StopFilter onto an existing analyzer; minsize=1 keeps
# single-character Chinese stopwords in play (the default of 2 drops
# every one-character token outright)
custom_stopwords = {"的", "了", "和"}
analyzer = StandardAnalyzer() | StopFilter(stoplist=custom_stopwords, minsize=1)

Code Example

Note: StandardAnalyzer does not split runs of Chinese characters, so the stopword filter below only affects the tokens it actually produces; in practice you would place the StopFilter after a Chinese tokenizer such as the JiebaTokenizer from 9.1.

from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.analysis import StandardAnalyzer, StopFilter
from whoosh.qparser import QueryParser
import os
import shutil

# Create the index
index_dir = "stopwords_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

# A custom Chinese stopword set
chinese_stopwords = {
    "的", "了", "和", "是", "在", "我", "你", "他", "她", "它",
    "我们", "你们", "他们", "这", "那", "就", "都", "而", "及",
    "与", "或", "等", "但", "不", "没有", "有", "一个"
}

# Build an analyzer with stopword filtering (minsize=1 so that
# single-character stopwords are actually considered)
analyzer = StandardAnalyzer() | StopFilter(stoplist=chinese_stopwords, minsize=1)

schema = Schema(
    title=TEXT(stored=True, analyzer=analyzer),
    content=TEXT(stored=True, analyzer=analyzer)
)

ix = create_in(index_dir, schema)

print("=== 停用词过滤演示 ===\n")

# Add documents
print("[Step 1] Adding documents")
writer = ix.writer()
docs = [
    {
        "title": "Python编程语言",
        "content": "Python是一门优秀的编程语言。Python在人工智能、数据分析等领域有广泛应用。"
    },
    {
        "title": "Web开发技术",
        "content": "Web开发需要掌握HTML、CSS、JavaScript等技术。这些技术在现代网站开发中非常重要。"
    }
]
for doc in docs:
    writer.add_document(**doc)
    print(f"  添加: {doc['title']}")
writer.commit()

# Test stopword filtering
print("\n\n[Step 2] Queries containing stopwords")
with ix.searcher() as searcher:
    parser = QueryParser("content", ix.schema)
    
    # 搜索 "的"(停用词,应该不返回结果)
    query = parser.parse(u"的")
    results = searcher.search(query)
    print(f"搜索 '的': 命中 {len(results)} 篇")
    
    # 搜索 "Python的"(停用词被过滤)
    query = parser.parse(u"Python的")
    results = searcher.search(query)
    print(f"搜索 'Python的': 命中 {len(results)} 篇")
    for hit in results:
        print(f"  - {hit['title']}")
    
    # 搜索 "人工智能"
    query = parser.parse(u"人工智能")
    results = searcher.search(query)
    print(f"\n搜索 '人工智能': 命中 {len(results)} 篇")
    for hit in results:
        print(f"  - {hit['title']}")

# Comparison: no stopword filtering
print("\n\n[Step 3] Comparison: no stopword filtering")
index_dir2 = "no_stopwords_demo"
if os.path.exists(index_dir2):
    shutil.rmtree(index_dir2)
os.makedirs(index_dir2)

# No stopword filtering this time
analyzer2 = StandardAnalyzer()
schema2 = Schema(
    title=TEXT(stored=True, analyzer=analyzer2),
    content=TEXT(stored=True, analyzer=analyzer2)
)
ix2 = create_in(index_dir2, schema2)

writer2 = ix2.writer()
for doc in docs:
    writer2.add_document(**doc)
writer2.commit()

with ix2.searcher() as searcher:
    parser = QueryParser("content", ix2.schema)
    query = parser.parse(u"的")
    results = searcher.search(query)
    print(f"无停用词过滤搜索 '的': 命中 {len(results)} 篇")

print("\n✅ 停用词过滤演示完成!")

9.2.3 Built-in Stopword Lists

Whoosh ships with stopword lists for common languages:

from whoosh.analysis import StandardAnalyzer, STOP_WORDS

# STOP_WORDS is the default English stoplist used by StandardAnalyzer
english_stopwords = STOP_WORDS
print(english_stopwords)

Supported Languages

  • English
  • French
  • German
  • Spanish
  • and more
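For languages beyond English, Whoosh exposes per-language analysis through whoosh.lang; a sketch (the set of available languages depends on your Whoosh version):

from whoosh.lang import languages
from whoosh.analysis import LanguageAnalyzer

print(languages)  # e.g. ('ar', 'da', 'nl', 'en', 'fi', 'fr', ...)

# Tokenizer + stopword filter + stemmer for the given language
ana = LanguageAnalyzer("fr")
print([t.text for t in ana(u"Nous avons mangé des pommes")])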

9.2.4 Adding Stopwords Dynamically

from whoosh.analysis import StandardAnalyzer, StopFilter

# Start from a base analyzer
analyzer = StandardAnalyzer()

# Piping on another StopFilter produces a new composite analyzer;
# documents that are already indexed are unaffected until you reindex
new_stopwords = {"新", "停用", "词"}
analyzer = analyzer | StopFilter(stoplist=new_stopwords, minsize=1)

9.3 Synonym Expansion and Handling

9.3.1 What Is Synonym Expansion?

Synonym expansion automatically widens a query to cover synonyms of its terms, improving recall.

Examples

  • 搜索 "Python" → 匹配 "Python"、"py"、"蟒蛇"
  • 搜索 "AI" → 匹配 "AI"、"人工智能"、"机器学习"

9.3.2 Building a Synonym Map

Method 1: A synonym mapping table

# Synonym map
synonyms = {
    "python": ["py", "蟒蛇"],
    "人工智能": ["ai", "AI", "机器学习", "深度学习"],
    "数据": ["data"]
}

Method 2: Query rewriting

from whoosh.qparser import QueryParser
from whoosh.query import Or, Term

def expand_query(query_str, synonyms):
    """Expand each query word with its synonyms."""
    expanded_terms = []
    
    for word in query_str.split():
        expanded_terms.append(word)        # always keep the original word
        if word in synonyms:
            expanded_terms.extend(synonyms[word])
    
    # Build the expanded query as a disjunction of terms
    terms = [Term("content", term) for term in expanded_terms]
    return Or(terms)

Code Example

(The schema below keeps the default analyzer for brevity; for mixed Chinese/English content you would normally plug in the jieba-based analyzer from 9.1 so that terms like "python" are separated out at indexing time.)

from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.qparser import QueryParser
from whoosh.query import Or, Term
import os
import shutil

print("=== 同义词扩展演示 ===\n")

# Synonym map
synonyms = {
    "python": ["py", "蟒蛇", "PY"],
    "人工智能": ["ai", "AI", "机器学习"],
    "数据": ["data"],
    "编程": ["开发", "coding"]
}

# Create the index
index_dir = "synonym_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

schema = Schema(
    title=TEXT(stored=True),
    content=TEXT(stored=True)
)

ix = create_in(index_dir, schema)

# Add documents
print("[Step 1] Adding documents")
writer = ix.writer()
docs = [
    {
        "title": "Python编程教程",
        "content": "Python是一门流行的编程语言。Python在AI领域应用广泛。"
    },
    {
        "title": "数据分析技术",
        "content": "数据分析是数据科学的重要组成部分。使用Python可以方便地进行数据处理。"
    },
    {
        "title": "人工智能入门",
        "content": "AI技术正在改变世界。人工智能包括机器学习和深度学习。"
    },
    {
        "title": "Web开发指南",
        "content": "Web开发是现代编程的重要组成部分。"
    }
]
for doc in docs:
    writer.add_document(**doc)
    print(f"  添加: {doc['title']}")
writer.commit()

# Query rewriting helper
def expand_query(query_str, synonyms, field="content"):
    """Expand a query string using the synonym map."""
    words = query_str.split()
    expanded_terms = []
    
    for word in words:
        word_lower = word.lower()
        # Look the word up as a key or as one of the listed synonyms
        found_synonym = False
        for key, syns in synonyms.items():
            if word_lower == key.lower() or word_lower in [s.lower() for s in syns]:
                # Add the key itself plus all of its synonyms
                expanded_terms.extend([key] + syns)
                found_synonym = True
                break
        
        if not found_synonym:
            expanded_terms.append(word)
    
    # Build the query
    terms = [Term(field, term) for term in expanded_terms]
    if len(terms) == 1:
        return terms[0]
    else:
        return Or(terms)

# Search examples
with ix.searcher() as searcher:
    # Example 1: without synonym expansion
    print("\n\n[Example 1] Without synonym expansion")
    parser = QueryParser("content", ix.schema)
    
    query = parser.parse(u"python")
    results = searcher.search(query)
    print(f"搜索 'python': 命中 {len(results)} 篇")
    for hit in results:
        print(f"  - {hit['title']}")
    
    # Example 2: with synonym expansion
    print("\n[Example 2] With synonym expansion")
    expanded_query = expand_query("python", synonyms)
    results = searcher.search(expanded_query)
    print(f"扩展搜索 'python' (python, py, 蟒蛇): 命中 {len(results)} 篇")
    for hit in results:
        print(f"  - {hit['title']}")
    
    # Example 3: expanding 'AI'
    print("\n[Example 3] Synonym expansion for 'AI'")
    expanded_query = expand_query("AI", synonyms)
    results = searcher.search(expanded_query)
    print(f"扩展搜索 'AI' (AI, 人工智能, 机器学习): 命中 {len(results)} 篇")
    for hit in results:
        print(f"  - {hit['title']}")
    
    # Example 4: multi-word expansion
    print("\n[Example 4] Multi-word expansion for 'Python 编程'")
    expanded_query = expand_query("Python 编程", synonyms)
    results = searcher.search(expanded_query)
    print(f"扩展搜索 'Python 编程': 命中 {len(results)} 篇")
    for hit in results:
        print(f"  - {hit['title']}")

print("\n✅ 同义词扩展演示完成!")

9.3.3 Managing Synonyms in Files

Create a synonym file (synonyms.txt)

python,py,蟒蛇
AI,人工智能,机器学习,深度学习
数据,data

Loading the synonym file

def load_synonyms(filepath):
    """Load synonyms from a comma-separated file."""
    synonyms = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            words = line.strip().split(',')
            if len(words) > 1:
                main_word = words[0]
                synonyms[main_word] = words[1:]
    return synonyms

# Usage
synonyms = load_synonyms("synonyms.txt")
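One limitation of this loader: only the first word of each line becomes a key, so a query for "py" would not expand. A sketch of a symmetric variant that maps every word in a group to the rest of its group:

def load_synonym_groups(filepath):
    """Map every word in each line to its full synonym group."""
    groups = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            words = [w.strip() for w in line.strip().split(',') if w.strip()]
            for word in words:
                groups[word] = [w for w in words if w != word]
    return groups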

9.4 Mixed Multilingual Retrieval

9.4.1 Designing a Multilingual Index

Option 1: One index with per-language fields

from whoosh.fields import Schema, TEXT, KEYWORD

schema = Schema(
    title_zh=TEXT(stored=True),   # Chinese title
    title_en=TEXT(stored=True),   # English title
    content_zh=TEXT(stored=True),
    content_en=TEXT(stored=True),
    lang=KEYWORD(stored=True)     # language tag
)

Option 2: One index per language

my_index/
  zh/  # Chinese index
  en/  # English index
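A sketch of option 2 (the directory layout above is our own convention): each language gets its own index directory, so each can use its own analyzer, and a search fans out only to the index matching the query language:

import os
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT

# One schema (and, in practice, one language-specific analyzer) per language
for lang in ("zh", "en"):
    path = os.path.join("my_index", lang)
    os.makedirs(path, exist_ok=True)
    create_in(path, Schema(title=TEXT(stored=True), content=TEXT(stored=True)))

zh_ix = open_dir(os.path.join("my_index", "zh"))  # query only the Chinese index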

Code Example

from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, KEYWORD
from whoosh.qparser import QueryParser, MultifieldParser
import os
import shutil

print("=== 多语言混合检索演示 ===\n")

# Create the index
index_dir = "multilang_demo"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

# Option 1: a single index with shared fields plus a language tag
schema = Schema(
    title=TEXT(stored=True),
    content=TEXT(stored=True),
    lang=KEYWORD(stored=True),  # language field
    tags=KEYWORD(stored=True)
)

ix = create_in(index_dir, schema)

# Add multilingual documents
print("[Step 1] Adding multilingual documents")
writer = ix.writer()
docs = [
    {
        "title": "Python编程教程",
        "content": "Python是一门流行的编程语言,适用于数据分析和人工智能。",
        "lang": "zh",
        "tags": "python, 编程, 中文"
    },
    {
        "title": "Python Programming Tutorial",
        "content": "Python is a popular programming language for data analysis and AI.",
        "lang": "en",
        "tags": "python, programming, english"
    },
    {
        "title": "数据分析实战",
        "content": "使用Python进行数据处理和分析,提取有价值的信息。",
        "lang": "zh",
        "tags": "数据, 分析, 中文"
    },
    {
        "title": "Data Analysis Guide",
        "content": "Learn data analysis with Python and extract valuable insights.",
        "lang": "en",
        "tags": "data, analysis, english"
    },
    {
        "title": "人工智能入门",
        "content": "人工智能包括机器学习和深度学习技术。",
        "lang": "zh",
        "tags": "AI, 人工智能, 中文"
    }
]
for doc in docs:
    writer.add_document(**doc)
    print(f"  添加: {doc['title']} ({doc['lang']})")
writer.commit()

# Search examples
with ix.searcher() as searcher:
    # Example 1: no language filter
    print("\n\n[Example 1] No language filter - 'Python'")
    parser = QueryParser("content", ix.schema)
    query = parser.parse(u"Python")
    results = searcher.search(query)
    print(f"{len(results)} hits")
    for hit in results:
        print(f"  - {hit['title']} ({hit['lang']})")
    
    # Example 2: filter to Chinese documents only
    print("\n\n[Example 2] Chinese documents only - 'Python'")
    from whoosh.query import Term
    query = parser.parse(u"Python")
    lang_filter = Term("lang", "zh")
    results = searcher.search(query, filter=lang_filter)
    print(f"{len(results)} hits (Chinese only)")
    for hit in results:
        print(f"  - {hit['title']} ({hit['lang']})")
    
    # Example 3: filter to English documents only
    print("\n\n[Example 3] English documents only - 'Python'")
    lang_filter = Term("lang", "en")
    results = searcher.search(query, filter=lang_filter)
    print(f"{len(results)} hits (English only)")
    for hit in results:
        print(f"  - {hit['title']} ({hit['lang']})")
    
    # Example 4: keyword search across languages
    print("\n\n[Example 4] Keyword search across languages")
    parser = MultifieldParser(["title", "content"], ix.schema)
    
    # Search "数据" (Chinese)
    query = parser.parse(u"数据")
    results = searcher.search(query)
    print(f"Search '数据': {len(results)} hits")
    for hit in results:
        print(f"  - {hit['title']} ({hit['lang']})")
    
    # 搜索 "analysis"(英)
    query = parser.parse(u"analysis")
    results = searcher.search(query)
    print(f"\n搜索 'analysis': 命中 {len(results)} 篇")
    for hit in results:
        print(f"  - {hit['title']} ({hit['lang']})")

# Group result counts by language
print("\n\n[Example 5] Grouping by language")
with ix.searcher() as searcher:
    parser = QueryParser("content", ix.schema)
    query = parser.parse(u"*")
    results = searcher.search(query, groupedby="lang")
    
    # groups() maps each language value to a list of matching docnums
    groups = results.groups("lang")
    for lang, docnums in groups.items():
        print(f"{lang}: {len(docnums)} documents")

print("\n✅ 多语言混合检索演示完成!")

9.4.2 Language Identification

Identifying languages with the langdetect library

from langdetect import detect

text = "Python是一门编程语言"
lang = detect(text)  # 'zh-cn'
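Note that langdetect returns region-qualified codes such as 'zh-cn', while the index in 9.4.1 stores the plain tag 'zh', so a small normalization step helps (a sketch):

def normalize_lang(code):
    # Collapse region variants (zh-cn, zh-tw, ...) onto the index's tags
    return "zh" if code.startswith("zh") else code.split("-")[0]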

9.4.3 Automatic Per-Language Search

def detect_language(text):
    """Detect the language of a text."""
    # Simple heuristic: any CJK character means Chinese
    for char in text:
        if '\u4e00' <= char <= '\u9fff':
            return 'zh'
    return 'en'

def auto_search(searcher, query_str):
    """Search, filtered to the detected query language."""
    lang = detect_language(query_str)
    
    parser = QueryParser("content", searcher.schema)
    query = parser.parse(query_str)
    
    # Restrict results to the detected language
    from whoosh.query import Term
    lang_filter = Term("lang", lang)
    
    return searcher.search(query, filter=lang_filter)
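A usage sketch, assuming the multilang_demo index built in 9.4.1:

with ix.searcher() as searcher:
    for hit in auto_search(searcher, u"人工智能"):  # detected as 'zh'
        print(hit['title'], hit['lang'])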

9.5 A Complete Example

9.5.1 A Complete Chinese Search System

from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, KEYWORD, ID
from whoosh.query import Term, Or
import jieba
import os
import shutil

print("=== Complete Chinese search system demo ===\n")

# Synonym map
synonyms = {
    "python": ["py", "蟒蛇"],
    "人工智能": ["ai", "AI", "机器学习"],
    "数据": ["data"]
}

# Chinese stopwords
chinese_stopwords = {
    "的", "了", "和", "是", "在", "我", "你", "他", "这", "那",
    "就", "都", "而", "及", "与", "或", "等", "但", "不"
}

# Create the index
index_dir = "chinese_search_system"
if os.path.exists(index_dir):
    shutil.rmtree(index_dir)
os.makedirs(index_dir)

schema = Schema(
    id=ID(stored=True, unique=True),
    title=TEXT(stored=True),
    content=TEXT(stored=True),
    category=KEYWORD(stored=True)
)

ix = create_in(index_dir, schema)

# Add documents
print("[Step 1] Indexing documents")
writer = ix.writer()
docs = [
    {
        "id": "1",
        "title": "Python编程入门教程",
        "content": "Python是一门简洁的编程语言。Python在数据分析、人工智能等领域有广泛应用。Python适合初学者学习。",
        "category": "教程"
    },
    {
        "id": "2",
        "title": "数据分析实战指南",
        "content": "数据分析是热门技术。使用Python进行数据处理和分析,可以提取有价值的信息。Python在数据科学领域非常重要。",
        "category": "实战"
    },
    {
        "id": "3",
        "title": "机器学习入门",
        "content": "机器学习是人工智能的核心技术。Python的机器学习库如scikit-learn、TensorFlow等非常强大。AI正在改变世界。",
        "category": "入门"
    },
    {
        "id": "4",
        "title": "Web开发技术",
        "content": "Web开发需要掌握HTML、CSS、JavaScript等技术。Python也可以用于Web后端开发,如Django框架。",
        "category": "技术"
    }
]
for doc in docs:
    writer.add_document(**doc)
    print(f"  添加: {doc['title']}")
writer.commit()

# Chinese search engine class
class ChineseSearchEngine:
    def __init__(self, index_dir):
        self.ix = open_dir(index_dir)
    
    def expand_query(self, query_str):
        """Segment with jieba, drop stopwords, expand synonyms."""
        words = list(jieba.cut(query_str))
        expanded_terms = []
        
        for word in words:
            if word in chinese_stopwords:
                continue
            
            # Look the word up as a key or as one of the listed synonyms
            word_lower = word.lower()
            found = False
            for key, syns in synonyms.items():
                if word_lower == key.lower() or word_lower in [s.lower() for s in syns]:
                    expanded_terms.extend([key] + syns)
                    found = True
                    break
            
            if not found:
                expanded_terms.append(word_lower)
        
        return expanded_terms
    
    def search(self, query_str, category=None, topn=10):
        """Run the expanded query, optionally filtered by category."""
        with self.ix.searcher() as searcher:
            # Expand the query
            expanded_terms = self.expand_query(query_str)
            
            # Build the query; lowercase terms to match what the default
            # analyzer put into the index
            terms = [Term("content", term.lower()) for term in expanded_terms]
            if len(terms) == 1:
                query = terms[0]
            else:
                query = Or(terms)
            
            # Optional category filter
            filter_query = None
            if category:
                filter_query = Term("category", category)
            
            # Execute the search
            results = searcher.search(query, filter=filter_query, limit=topn)
            
            # Collect results while the searcher is still open
            output = []
            for hit in results:
                # Highlighted snippet
                highlighted = hit.highlights('content')
                
                output.append({
                    'id': hit['id'],
                    'title': hit['title'],
                    'category': hit['category'],
                    'highlight': highlighted,
                    'score': hit.score
                })
            
            return output

# Using the search engine
engine = ChineseSearchEngine(index_dir)

# Example 2: basic search
print("\n\n[Example 2] Basic search - 'Python'")
results = engine.search("Python")
print(f"{len(results)} hits:")
for r in results:
    print(f"  - {r['title']} ({r['category']})")
    print(f"    snippet: {r['highlight']}")
    print()

# Example 3: synonym-expanded search
print("[Example 3] Synonym-expanded search - 'AI'")
results = engine.search("AI")
print(f"{len(results)} hits (expanded to: 人工智能, ai, AI, 机器学习):")
for r in results:
    print(f"  - {r['title']}")

# Example 4: with a category filter
print("\n[Example 4] Category filter - 'Python' with category='实战'")
results = engine.search("Python", category="实战")
print(f"{len(results)} hits:")
for r in results:
    print(f"  - {r['title']}")

# Example 5: a long, multi-word query
print("\n[Example 5] Long query - '数据分析人工智能'")
results = engine.search("数据分析人工智能")
print(f"{len(results)} hits:")
for r in results:
    print(f"  - {r['title']} (score: {r['score']:.2f})")

print("\n✅ Chinese search system demo complete!")

Chapter Summary

In this chapter we covered Whoosh's multilingual and Chinese support:

  1. Integrating Chinese tokenizers: plugging jieba and other segmentation libraries into Whoosh
  2. Configuring stopword filtering: building and applying stopword lists
  3. Synonym expansion and handling: rewriting queries with synonyms
  4. Mixed multilingual retrieval: designing and implementing multilingual indexes

After working through this chapter, you should be able to:

  • Integrate a Chinese tokenizer into Whoosh
  • Configure and apply stopword filtering
  • Implement synonym expansion
  • Design a mixed multilingual retrieval scheme

The next chapter covers performance tuning, including index performance optimization and query performance monitoring.
