第73集：集合推导式

学习目标

理解集合推导式的概念和优势
掌握基本集合推导式的语法
学会使用条件表达式过滤集合
了解集合推导式与列表推导式的区别
掌握集合推导式的实际应用场景

集合推导式概念

什么是集合推导式

集合推导式（Set Comprehension）是Python中创建集合的简洁语法，它允许我们通过一个表达式和一个可迭代对象来快速生成集合。它的写法与列表推导式类似，只是使用花括号；由于结果是集合，重复元素会被自动去除。

集合推导式的优势

自动去重：自动去除重复元素
代码简洁：一行代码替代多行循环
性能更好：通常比等价的for循环加add()调用执行得更快
可读性强：意图明确，易于理解
集合运算：生成的结果是集合，可直接参与交集、并集等集合运算

基本集合推导式

基本语法结构

# Basic syntax: evaluate `expression` for every `item` and collect the results in a set.
# (`expression` and `iterable` are placeholders here, not defined names.)
{expression for item in iterable}

# The same thing written as a traditional for loop
result = set()
for item in iterable:
    result.add(expression)

基本示例

# Note: sets are unordered, so the element order in the printed output may
# differ from the "Output" comments below.

# 1. Build a set from a list; repeated values collapse into one
numbers = [1, 2, 3, 2, 4, 3, 5]
unique_numbers = {n for n in numbers}
print(f"唯一数字集合: {unique_numbers}")
# Output: 唯一数字集合: {1, 2, 3, 4, 5}

# 2. Build a set of squares
squares = {base * base for base in range(6)}
print(f"平方数集合: {squares}")
# Output: 平方数集合: {0, 1, 4, 9, 16, 25}

# 3. Build a set of the characters in a string, skipping spaces
text = "hello world"
chars = {ch for ch in text if ch != ' '}
print(f"字符集合: {chars}")
# Output: 字符集合: {'d', 'e', 'h', 'l', 'o', 'r', 'w'}

# 4. Build a set of upper-cased first letters
words = ["apple", "banana", "cherry", "apricot", "blueberry"]
first_letters = {w[0].upper() for w in words}
print(f"首字母集合: {first_letters}")
# Output: 首字母集合: {'A', 'B', 'C'}

# 5. Build a set of word lengths
words = ["apple", "banana", "cherry", "date", "fig", "grape"]
lengths = {len(w) for w in words}
print(f"长度集合: {lengths}")
# Output: 长度集合: {3, 4, 5, 6}

使用函数和复杂表达式

# 1. Call math functions inside the expression
import math
angles = [0, 30, 45, 60, 90]
sin_values = {round(math.sin(math.radians(deg)), 4) for deg in angles}
print(f"正弦值集合: {sin_values}")
# Output: 正弦值集合: {0.0, 0.5, 0.7071, 0.866, 1.0}

# 2. Process a list of strings; equal lengths are stored only once
words = ["hello", "world", "python"]
word_lengths = {len(w) for w in words}
print(f"单词长度集合: {word_lengths}")
# Output: 单词长度集合: {5, 6}

# 3. Collect the distinct values of a dict
data = {"a": 1, "b": 2, "c": 3, "d": 2}
unique_values = {val for val in data.values()}
print(f"唯一值集合: {unique_values}")
# Output: 唯一值集合: {1, 2, 3}

# 4. A slightly more involved calculation: n squared plus n
numbers = [1, 2, 3, 4, 5]
calculated = {n * n + n for n in numbers}
print(f"计算结果集合: {calculated}")
# Output: 计算结果集合: {2, 6, 12, 20, 30}

带条件的集合推导式

基本条件语法

# Filtering syntax: only items for which `condition` is true contribute a value.
# (`expression`, `iterable` and `condition` are placeholders, not defined names.)
{expression for item in iterable if condition}

# The same thing written as a traditional for loop
result = set()
for item in iterable:
    if condition:
        result.add(expression)

条件筛选示例

# Note: sets are unordered, so the element order in the printed output may
# differ from the "Output" comments below.

# 1. Keep only the even numbers
numbers = range(1, 21)
even_numbers = {n for n in numbers if n % 2 == 0}
print(f"偶数集合: {even_numbers}")
# Output: 偶数集合: {2, 4, 6, 8, 10, 12, 14, 16, 18, 20}

# 2. Keep only the strings of at most four characters
words = ["apple", "banana", "cherry", "date", "fig", "grape"]
short_words = {w for w in words if len(w) < 5}
print(f"短单词集合: {short_words}")
# Output: 短单词集合: {'date', 'fig'}

# 3. Keep only the numbers inside a closed range
numbers = range(1, 101)
filtered = {n for n in numbers if n >= 20 and n <= 30}
print(f"20-30之间的数字: {filtered}")
# Output: 20-30之间的数字: {20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30}

# 4. Keep only specific characters (the vowels)
text = "Python is awesome!"
vowels = {ch for ch in text.lower() if ch in 'aeiou'}
print(f"元音字母集合: {vowels}")
# Output: 元音字母集合: {'a', 'e', 'i', 'o'}

使用复杂条件

# 1. Combine several conditions with `and`
numbers = range(1, 31)
filtered = {n for n in numbers if n % 3 == 0 and n % 2 == 0}
print(f"能被2和3整除的数: {filtered}")
# Output: 能被2和3整除的数: {6, 12, 18, 24, 30}

# 2. Filter strings by prefix and length
#    (str.startswith accepts a tuple of alternative prefixes)
words = ["apple", "banana", "cherry", "apricot", "blueberry", "coconut"]
filtered_words = {
    w
    for w in words
    if w.startswith(('a', 'b')) and len(w) >= 6
}
print(f"长度≥6且以a或b开头的单词: {filtered_words}")
# Output (order may vary): 长度≥6且以a或b开头的单词: {'banana', 'apricot', 'blueberry'}

# 3. Use a function as the filter condition
def is_prime(n):
    """Return True if ``n`` is a prime number, otherwise False.

    Trial division by every integer ``d`` with ``d * d <= n``. The bound is
    checked with integer arithmetic instead of ``int(n**0.5)``, so very large
    integers neither raise ``OverflowError`` in the float conversion nor get
    a rounded square root.

    Args:
        n: The number to test. Values below 2 are never prime.

    Returns:
        bool: True when no ``d`` in ``2 .. sqrt(n)`` divides ``n``.
    """
    if n < 2:
        return False
    divisor = 2
    while divisor * divisor <= n:
        if n % divisor == 0:
            return False
        divisor += 1
    return True

# Keep every candidate in 2..30 that is_prime() accepts
numbers = range(2, 31)
prime_numbers = {candidate for candidate in numbers if is_prime(candidate)}
print(f"质数集合: {prime_numbers}")
# Output: 质数集合: {2, 3, 5, 7, 11, 13, 17, 19, 23, 29}

# 4. Conditions in text processing
sentences = [
    "Python is powerful",
    "List comprehensions are useful",
    "Code should be readable",
    "Programming is fun",
]

# Collect the lower-cased words of every sentence that mentions "python".
# The `if` sits between the two `for` clauses, so it filters whole sentences
# before their words are iterated.
python_words = {
    token.lower()
    for line in sentences
    if "python" in line.lower()
    for token in line.split()
}
print(f"包含Python的句子中的单词: {python_words}")
# Output (order may vary): 包含Python的句子中的单词: {'python', 'is', 'powerful'}

集合推导式的高级用法

条件表达式与集合推导式

# Note: sets are unordered, so the element order in the printed output may
# differ from the "Output" comments below.

# 1. Produce a different value depending on a condition
numbers = range(-5, 6)
absolute_values = {-n if n < 0 else n for n in numbers}
print(f"绝对值集合: {absolute_values}")
# Output: 绝对值集合: {0, 1, 2, 3, 4, 5}

# 2. Map each score to a grade label (chained conditional expression)
scores = [95, 67, 82, 45, 73, 88, 52]
grades = {
    "优秀" if mark >= 85
    else "良好" if mark >= 70
    else "及格" if mark >= 60
    else "不及格"
    for mark in scores
}
print(f"成绩等级集合: {grades}")
# Output: 成绩等级集合: {'优秀', '良好', '及格', '不及格'}

# 3. Handle an edge case: zero has no reciprocal, so it maps to 0
numbers = [1, 2, 0, -1, -2, 3, -3]
safe_division = {0 if n == 0 else 1 / n for n in numbers}
print(f"安全除法结果集合: {safe_division}")
# Output: 安全除法结果集合: {0, 0.3333333333333333, 0.5, 1.0, -0.3333333333333333, -0.5, -1.0}

# 4. String handling: empty strings are replaced by a marker
words = ["", "hello", "world", "", "python", ""]
processed_words = {"EMPTY" if not w else w.upper() for w in words}
print(f"处理后的单词集合: {processed_words}")
# Output: 处理后的单词集合: {'EMPTY', 'HELLO', 'WORLD', 'PYTHON'}

集合运算

# 1. Union
set1 = {1, 2, 3, 4}
set2 = {3, 4, 5, 6}
union_set = set1 | set2
print(f"并集: {union_set}")
# Output: 并集: {1, 2, 3, 4, 5, 6}

# 2. Intersection
intersection_set = set1 & set2
print(f"交集: {intersection_set}")
# Output: 交集: {3, 4}

# 3. Difference
difference_set = set1 - set2
print(f"差集: {difference_set}")
# Output: 差集: {1, 2}

# 4. Symmetric difference
symmetric_diff = set1 ^ set2
print(f"对称差集: {symmetric_diff}")
# Output: 对称差集: {1, 2, 5, 6}

# 5. Set operations expressed with set comprehensions
list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
# Build the lookup set once: `x in a_set` is O(1) on average, whereas
# `x in list2` would rescan the whole list for every element of list1.
list2_members = set(list2)

# Union
union = {x for x in list1} | {x for x in list2}
print(f"使用推导式的并集: {union}")
# Output: 使用推导式的并集: {1, 2, 3, 4, 5, 6, 7, 8}

# Intersection
intersection = {x for x in list1 if x in list2_members}
print(f"使用推导式的交集: {intersection}")
# Output: 使用推导式的交集: {4, 5}

# Difference
difference = {x for x in list1 if x not in list2_members}
print(f"使用推导式的差集: {difference}")
# Output: 使用推导式的差集: {1, 2, 3}

集合推导式的应用案例

案例1：数据去重

# 1. Deduplicate a list
numbers = [1, 2, 3, 2, 4, 3, 5, 1, 6]
# sorted() rather than list(): a set has no defined iteration order, so
# list(a_set) would not guarantee the ascending result shown below.
unique_numbers = sorted({num for num in numbers})
print(f"去重后的列表: {unique_numbers}")
# Output: 去重后的列表: [1, 2, 3, 4, 5, 6]

# 2. Deduplicate the characters of a string (spaces skipped)
text = "hello world"
unique_chars = sorted({char for char in text if char != ' '})
print(f"去重后的字符: {unique_chars}")
# Output: 去重后的字符: ['d', 'e', 'h', 'l', 'o', 'r', 'w']

# 3. Deduplicate one field across a list of dicts
data = [
    {"name": "张三", "city": "北京"},
    {"name": "李四", "city": "上海"},
    {"name": "王五", "city": "北京"},
    {"name": "赵六", "city": "广州"}
]
unique_cities = {person["city"] for person in data}
print(f"唯一城市: {unique_cities}")
# Output (order may vary): 唯一城市: {'北京', '上海', '广州'}

# 4. Same technique on another record list: distinct classes
students = [
    {"name": "张三", "class": "1班"},
    {"name": "李四", "class": "2班"},
    {"name": "王五", "class": "1班"},
    {"name": "赵六", "class": "2班"}
]
unique_classes = {student["class"] for student in students}
print(f"唯一班级: {unique_classes}")
# Output (order may vary): 唯一班级: {'1班', '2班'}

案例2：数据分析

# 1. Count distinct values and the share of duplicates
data = [1, 2, 3, 2, 4, 3, 5, 1, 6, 2, 3]
unique_values = {x for x in data}
count_unique = len(unique_values)
count_total = len(data)
print(f"唯一值数量: {count_unique}")
print(f"总数量: {count_total}")
print(f"重复率: {(count_total - count_unique) / count_total * 100:.1f}%")
# Output:
# 唯一值数量: 6
# 总数量: 11
# 重复率: 45.5%

# 2. Find the elements two lists have in common
list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
# Test membership against a set built once: `x in a_set` is O(1) on average,
# whereas `x in list2` would rescan the list for every element of list1.
list2_members = set(list2)
common_elements = {x for x in list1 if x in list2_members}
print(f"共同元素: {common_elements}")
# Output: 共同元素: {4, 5}

# 3. Find the elements that appear in only one of the lists
list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
list1_members = set(list1)
list2_members = set(list2)
unique_in_list1 = {x for x in list1 if x not in list2_members}
unique_in_list2 = {x for x in list2 if x not in list1_members}
print(f"list1独有的元素: {unique_in_list1}")
print(f"list2独有的元素: {unique_in_list2}")
# Output:
# list1独有的元素: {1, 2, 3}
# list2独有的元素: {6, 7, 8}

# 4. Data validation: report every value outside the allowed set
valid_values = {1, 2, 3, 4, 5}
test_data = [1, 2, 3, 6, 7, 8]
invalid_values = {x for x in test_data if x not in valid_values}
print(f"无效值: {invalid_values}")
# Output: 无效值: {6, 7, 8}

案例3：文本处理

# Note: sets are unordered, so the element order in the printed output may
# differ from the "Output" comments below.

# 1. Extract the distinct words (trailing periods stripped, lower-cased)
text = "Python is great. Python is popular. Python is powerful."
words = [token.strip(". ").lower() for token in text.split()]
unique_words = {w for w in words}
print(f"唯一单词: {unique_words}")
# Output: 唯一单词: {'python', 'is', 'great', 'popular', 'powerful'}

# 2. Extract the distinct letters (case-sensitive; non-letters dropped)
sentence = "Hello World!"
unique_chars = {ch for ch in sentence if ch.isalpha()}
print(f"唯一字符: {unique_chars}")
# Output: 唯一字符: {'d', 'e', 'H', 'l', 'o', 'r', 'W'}

# 3. Extract the upper-cased first letters
words = ["apple", "banana", "cherry", "apricot", "blueberry", "coconut"]
first_letters = {w[0].upper() for w in words}
print(f"首字母: {first_letters}")
# Output: 首字母: {'A', 'B', 'C'}

# 4. Extract the vowels that occur in the text
text = "Python programming"
vowels = {ch for ch in text.lower() if ch in 'aeiou'}
print(f"元音字母: {vowels}")
# Output: 元音字母: {'a', 'i', 'o'}

集合推导式与列表推导式的区别

去重特性

# 1. A list comprehension keeps duplicates
numbers = [1, 2, 3, 2, 4, 3, 5]
list_result = [n ** 2 for n in numbers]
print(f"列表推导式结果: {list_result}")
# Output: 列表推导式结果: [1, 4, 9, 4, 16, 9, 25]

# 2. A set comprehension over the same input removes them
set_result = {n ** 2 for n in numbers}
print(f"集合推导式结果: {set_result}")
# Output: 集合推导式结果: {1, 4, 9, 16, 25}

# 3. Ordering
# A list comprehension preserves the order in which values are produced
list_result = [i % 3 for i in range(10)]
print(f"列表推导式(有序): {list_result}")
# Output: 列表推导式(有序): [0, 1, 2, 0, 1, 2, 0, 1, 2, 0]

# A set comprehension gives no ordering guarantee
set_result = {i % 3 for i in range(10)}
print(f"集合推导式(无序): {set_result}")
# Output: 集合推导式(无序): {0, 1, 2}

性能对比

import time

# All intervals below are measured with time.perf_counter(): it is a
# high-resolution clock intended for short durations. time.time() can be too
# coarse for these loops and report 0.0 elapsed, which would make the ratio
# prints divide by zero.

# 1. Deduplication timing
data = list(range(10000)) * 10  # 100,000 elements, every value repeated 10 times

# List comprehension followed by a set() conversion
start_time = time.perf_counter()
unique_list = list(set([x for x in data]))
list_time = time.perf_counter() - start_time

# Set comprehension
start_time = time.perf_counter()
unique_set = {x for x in data}
set_time = time.perf_counter() - start_time

print(f"列表推导式+set转换时间: {list_time:.6f}秒")
print(f"集合推导式时间: {set_time:.6f}秒")
print(f"性能提升: {list_time/set_time:.2f}倍")

# 2. Membership-test timing
# Build a set and a list holding the same 10,000 values
lookup_set = {x for x in range(10000)}
lookup_list = [x for x in range(10000)]

# Look up 1,000 values in the set
start_time = time.perf_counter()
for i in range(1000):
    _ = i in lookup_set
set_lookup_time = time.perf_counter() - start_time

# Look up the same 1,000 values in the list
start_time = time.perf_counter()
for i in range(1000):
    _ = i in lookup_list
list_lookup_time = time.perf_counter() - start_time

print(f"\n集合查找时间: {set_lookup_time:.6f}秒")
print(f"列表查找时间: {list_lookup_time:.6f}秒")
print(f"性能提升: {list_lookup_time/set_lookup_time:.2f}倍")

集合推导式的局限性

何时不应使用集合推导式

# 1. When element order must be preserved
# Discouraged: a set gives no ordering guarantee
result = {x for x in range(10)}

# Preferred: use a list comprehension
result = [x for x in range(10)]

# 2. When repeated elements must be kept
# Discouraged: a set discards duplicates
result = {x % 3 for x in range(10)}

# Preferred: use a list comprehension
result = [x % 3 for x in range(10)]

# 3. When the logic is too complex for a comprehension
# Discouraged: two loops, a filter and a conditional expression in one comprehension
complex_result = {
    x**2 + y**2 if (x + y) % 2 == 0 else (x - y)**2 
    for x in range(5) 
    for y in range(5) 
    if x != y and (x * y) % 3 == 0
}

# Preferred: the same computation as a traditional for loop
complex_result = set()
for x in range(5):
    for y in range(5):
        # Same filter as the `if` clause of the comprehension above
        if x != y and (x * y) % 3 == 0:
            # Same two branches as the conditional expression above
            if (x + y) % 2 == 0:
                complex_result.add(x**2 + y**2)
            else:
                complex_result.add((x - y)**2)

最佳实践

使用集合推导式的建议

# NOTE: the "discouraged" lines below are illustrative only; names such as
# condition, default, filter_condition, condition1..3, data and y are
# placeholders that this snippet does not define.

# 1. Keep it concise
# Preferred: short and clear
squares = {x**2 for x in range(10)}

# Discouraged: too much going on in one expression
result = {x if condition(x) else default(x) for x in data if filter_condition(x)}

# 2. Use meaningful variable names
# Preferred: descriptive names
word_lengths = {len(word) for word in words}

# Discouraged: vague names
s = {len(x) for x in y}

# 3. Use conditions sparingly
# Preferred: a simple filter
even_numbers = {x for x in range(10) if x % 2 == 0}

# Discouraged: mixed and/or conditions (`and` binds tighter than `or`, which is easy to misread)
result = {x for x in data if condition1(x) and condition2(x) or condition3(x)}

# 4. Think about readability
# Preferred: easy to understand
unique_words = {word.lower() for word in words}

# Discouraged: hard to understand
result = {x.strip().lower() for x in y if x and not x.startswith('#')}

总结

集合推导式是Python中创建集合和去重的强大工具，它具有以下特点：

自动去重：自动去除重复元素
简洁性：一行代码完成复杂的集合创建和去重
高效性：查找和去重操作性能优异
可读性：意图明确，易于理解
集合运算：生成的结果是集合，可直接参与交集、并集等集合运算

在使用集合推导式时，应该：

保持代码简洁明了
使用有意义的变量名
避免过于复杂的逻辑
注意集合无序的特性
在需要去重时优先使用

通过合理使用集合推导式，可以写出更加Pythonic和高效的代码。