第73集:集合推导式
学习目标
- 理解集合推导式的概念和优势
- 掌握基本集合推导式的语法
- 学会使用条件表达式过滤集合
- 了解集合推导式与列表推导式的区别
- 掌握集合推导式的实际应用场景
集合推导式概念
什么是集合推导式
集合推导式(Set Comprehension)是Python中创建集合的简洁语法,它允许我们通过一个表达式和一个可迭代对象来快速生成集合。它与列表推导式类似,但使用花括号书写,并且生成的集合会自动去除重复元素。
集合推导式的优势
- 自动去重:自动去除重复元素
- 代码简洁:一行代码替代多行循环
- 性能更好:比传统for循环执行更快
- 可读性强:意图明确,易于理解
- 集合运算:生成的结果是集合,可直接参与交集、并集等集合运算
基本集合推导式
基本语法结构
# 基本语法
{expression for item in iterable}
# 传统for循环方式
result = set()
for item in iterable:
    result.add(expression)
基本示例
# 1. 从列表创建集合
numbers = [1, 2, 3, 2, 4, 3, 5]
unique_numbers = {num for num in numbers}
print(f"唯一数字集合: {unique_numbers}")
# 输出: 唯一数字集合: {1, 2, 3, 4, 5}
# 2. 创建平方数集合
squares = {x**2 for x in range(6)}
print(f"平方数集合: {squares}")
# 输出: 平方数集合: {0, 1, 4, 9, 16, 25}
# 3. 从字符串创建字符集合
text = "hello world"
chars = {char for char in text if char != ' '}
print(f"字符集合: {chars}")
# 输出: 字符集合: {'d', 'e', 'h', 'l', 'o', 'r', 'w'}
# 4. 创建单词首字母集合
words = ["apple", "banana", "cherry", "apricot", "blueberry"]
first_letters = {word[0].upper() for word in words}
print(f"首字母集合: {first_letters}")
# 输出: 首字母集合: {'A', 'B', 'C'}
# 5. 创建长度集合
words = ["apple", "banana", "cherry", "date", "fig", "grape"]
lengths = {len(word) for word in words}
print(f"长度集合: {lengths}")
# 输出: 长度集合: {3, 4, 5, 6}
使用函数和复杂表达式
# 1. 使用数学函数
import math
angles = [0, 30, 45, 60, 90]
sin_values = {round(math.sin(math.radians(angle)), 4) for angle in angles}
print(f"正弦值集合: {sin_values}")
# 输出: 正弦值集合: {0.0, 0.5, 0.7071, 0.866, 1.0}
# 2. 处理字符串列表
words = ["hello", "world", "python"]
word_lengths = {len(word) for word in words}
print(f"单词长度集合: {word_lengths}")
# 输出: 单词长度集合: {5, 6}
# 3. 处理字典值
data = {"a": 1, "b": 2, "c": 3, "d": 2}
unique_values = {v for v in data.values()}
print(f"唯一值集合: {unique_values}")
# 输出: 唯一值集合: {1, 2, 3}
# 4. 复杂计算
numbers = [1, 2, 3, 4, 5]
calculated = {num**2 + num for num in numbers}
print(f"计算结果集合: {calculated}")
# 输出: 计算结果集合: {2, 6, 12, 20, 30}
带条件的集合推导式
基本条件语法
# 条件筛选语法
{expression for item in iterable if condition}
# 传统for循环方式
result = set()
for item in iterable:
    if condition:
        result.add(expression)
条件筛选示例
# 1. 筛选偶数
numbers = range(1, 21)
even_numbers = {num for num in numbers if num % 2 == 0}
print(f"偶数集合: {even_numbers}")
# 输出: 偶数集合: {2, 4, 6, 8, 10, 12, 14, 16, 18, 20}
# 2. 筛选特定长度的字符串
words = ["apple", "banana", "cherry", "date", "fig", "grape"]
short_words = {word for word in words if len(word) <= 4}
print(f"短单词集合: {short_words}")
# 输出: 短单词集合: {'date', 'fig'}
# 3. 筛选特定范围的数字
numbers = range(1, 101)
filtered = {num for num in numbers if 20 <= num <= 30}
print(f"20-30之间的数字: {filtered}")
# 输出: 20-30之间的数字: {20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30}
# 4. 筛选特定字符
text = "Python is awesome!"
vowels = {char for char in text.lower() if char in 'aeiou'}
print(f"元音字母集合: {vowels}")
# 输出: 元音字母集合: {'a', 'e', 'i', 'o'}
使用复杂条件
# 1. 多条件筛选
numbers = range(1, 31)
filtered = {num for num in numbers if num % 2 == 0 and num % 3 == 0}
print(f"能被2和3整除的数: {filtered}")
# 输出: 能被2和3整除的数: {6, 12, 18, 24, 30}
# 2. 字符串条件筛选
words = ["apple", "banana", "cherry", "apricot", "blueberry", "coconut"]
filtered_words = {
word for word in words
if len(word) >= 6 and word.startswith(('a', 'b'))
}
print(f"长度≥6且以a或b开头的单词: {filtered_words}")
# 输出: 长度≥6且以a或b开头的单词: {'banana', 'apricot', 'blueberry'}
# 3. 使用函数作为条件
def is_prime(n):
    """Return True if ``n`` is a prime number, otherwise False.

    Values below 2 (including 0, 1 and negatives) are never prime.
    Primality is decided by trial division with every candidate
    divisor ``i`` such that ``i * i <= n``.
    """
    if n < 2:
        return False
    # Compare i * i with n instead of computing int(n ** 0.5): the bound
    # stays in exact integer arithmetic, so it cannot be off by one for
    # very large n the way a floating-point square root can.
    i = 2
    while i * i <= n:
        if n % i == 0:
            return False
        i += 1
    return True
numbers = range(2, 31)
prime_numbers = {num for num in numbers if is_prime(num)}
print(f"质数集合: {prime_numbers}")
# 输出: 质数集合: {2, 3, 5, 7, 11, 13, 17, 19, 23, 29}
# 4. 文本处理条件
sentences = [
"Python is powerful",
"List comprehensions are useful",
"Code should be readable",
"Programming is fun"
]
# 提取包含特定单词的句子中的单词
python_words = {
word.lower()
for sentence in sentences
if "python" in sentence.lower()
for word in sentence.split()
}
print(f"包含Python的句子中的单词: {python_words}")
# 输出: 包含Python的句子中的单词: {'python', 'is', 'powerful'}
集合推导式的高级用法
条件表达式与集合推导式
# 1. 根据条件生成不同值
numbers = range(-5, 6)
absolute_values = {num if num >= 0 else -num for num in numbers}
print(f"绝对值集合: {absolute_values}")
# 输出: 绝对值集合: {0, 1, 2, 3, 4, 5}
# 2. 分类标签
scores = [95, 67, 82, 45, 73, 88, 52]
grades = {
"优秀" if score >= 85
else "良好" if score >= 70
else "及格" if score >= 60
else "不及格"
for score in scores
}
print(f"成绩等级集合: {grades}")
# 输出: 成绩等级集合: {'优秀', '良好', '及格', '不及格'}
# 3. 处理边界情况
numbers = [1, 2, 0, -1, -2, 3, -3]
safe_division = {
1/num if num != 0 else 0
for num in numbers
}
print(f"安全除法结果集合: {safe_division}")
# 输出: 安全除法结果集合: {0, 0.3333333333333333, 0.5, 1.0, -0.3333333333333333, -0.5, -1.0}
# 4. 字符串处理
words = ["", "hello", "world", "", "python", ""]
processed_words = {
word.upper() if word else "EMPTY"
for word in words
}
print(f"处理后的单词集合: {processed_words}")
# 输出: 处理后的单词集合: {'EMPTY', 'HELLO', 'WORLD', 'PYTHON'}
集合运算
# 1. 并集运算
set1 = {1, 2, 3, 4}
set2 = {3, 4, 5, 6}
union_set = set1 | set2
print(f"并集: {union_set}")
# 输出: 并集: {1, 2, 3, 4, 5, 6}
# 2. 交集运算
intersection_set = set1 & set2
print(f"交集: {intersection_set}")
# 输出: 交集: {3, 4}
# 3. 差集运算
difference_set = set1 - set2
print(f"差集: {difference_set}")
# 输出: 差集: {1, 2}
# 4. 对称差集运算
symmetric_diff = set1 ^ set2
print(f"对称差集: {symmetric_diff}")
# 输出: 对称差集: {1, 2, 5, 6}
# 5. 使用集合推导式进行集合运算
list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
# 并集
union = {x for x in list1} | {x for x in list2}
print(f"使用推导式的并集: {union}")
# 交集
intersection = {x for x in list1 if x in list2}
print(f"使用推导式的交集: {intersection}")
# 差集
difference = {x for x in list1 if x not in list2}
print(f"使用推导式的差集: {difference}")
集合推导式的应用案例
案例1:数据去重
# 1. 列表去重
numbers = [1, 2, 3, 2, 4, 3, 5, 1, 6]
unique_numbers = list({num for num in numbers})
print(f"去重后的列表: {unique_numbers}")
# 输出: 去重后的列表: [1, 2, 3, 4, 5, 6](集合是无序的,转换成列表后元素顺序不保证;需要固定顺序时请使用 sorted())
# 2. 字符串去重
text = "hello world"
unique_chars = sorted({char for char in text if char != ' '})
print(f"去重后的字符: {unique_chars}")
# 输出: 去重后的字符: ['d', 'e', 'h', 'l', 'o', 'r', 'w']
# 3. 字典值去重
data = [
{"name": "张三", "city": "北京"},
{"name": "李四", "city": "上海"},
{"name": "王五", "city": "北京"},
{"name": "赵六", "city": "广州"}
]
unique_cities = {person["city"] for person in data}
print(f"唯一城市: {unique_cities}")
# 输出: 唯一城市: {'北京', '上海', '广州'}
# 4. 按班级字段去重
students = [
{"name": "张三", "class": "1班"},
{"name": "李四", "class": "2班"},
{"name": "王五", "class": "1班"},
{"name": "赵六", "class": "2班"}
]
unique_classes = {student["class"] for student in students}
print(f"唯一班级: {unique_classes}")
# 输出: 唯一班级: {'1班', '2班'}
案例2:数据分析
# 1. 统计唯一值
data = [1, 2, 3, 2, 4, 3, 5, 1, 6, 2, 3]
unique_values = {x for x in data}
count_unique = len(unique_values)
count_total = len(data)
print(f"唯一值数量: {count_unique}")
print(f"总数量: {count_total}")
print(f"重复率: {(count_total - count_unique) / count_total * 100:.1f}%")
# 2. 查找共同元素
list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
common_elements = {x for x in list1 if x in list2}
print(f"共同元素: {common_elements}")
# 输出: 共同元素: {4, 5}
# 3. 查找唯一元素
list1 = [1, 2, 3, 4, 5]
list2 = [4, 5, 6, 7, 8]
unique_in_list1 = {x for x in list1 if x not in list2}
unique_in_list2 = {x for x in list2 if x not in list1}
print(f"list1独有的元素: {unique_in_list1}")
print(f"list2独有的元素: {unique_in_list2}")
# 输出:
# list1独有的元素: {1, 2, 3}
# list2独有的元素: {6, 7, 8}
# 4. 数据验证
valid_values = {1, 2, 3, 4, 5}
test_data = [1, 2, 3, 6, 7, 8]
invalid_values = {x for x in test_data if x not in valid_values}
print(f"无效值: {invalid_values}")
# 输出: 无效值: {6, 7, 8}
案例3:文本处理
# 1. 提取唯一单词
text = "Python is great. Python is popular. Python is powerful."
words = [word.strip(". ").lower() for word in text.split()]
unique_words = {word for word in words}
print(f"唯一单词: {unique_words}")
# 输出: 唯一单词: {'python', 'is', 'great', 'popular', 'powerful'}
# 2. 提取唯一字符
sentence = "Hello World!"
unique_chars = {char for char in sentence if char.isalpha()}
print(f"唯一字符: {unique_chars}")
# 输出: 唯一字符: {'d', 'e', 'H', 'l', 'o', 'r', 'W'}
# 3. 提取首字母
words = ["apple", "banana", "cherry", "apricot", "blueberry", "coconut"]
first_letters = {word[0].upper() for word in words}
print(f"首字母: {first_letters}")
# 输出: 首字母: {'A', 'B', 'C'}
# 4. 提取元音字母
text = "Python programming"
vowels = {char for char in text.lower() if char in 'aeiou'}
print(f"元音字母: {vowels}")
# 输出: 元音字母: {'a', 'i', 'o'}
集合推导式与列表推导式的区别
去重特性
# 1. 列表推导式不去重
numbers = [1, 2, 3, 2, 4, 3, 5]
list_result = [x**2 for x in numbers]
print(f"列表推导式结果: {list_result}")
# 输出: 列表推导式结果: [1, 4, 9, 4, 16, 9, 25]
# 2. 集合推导式自动去重
set_result = {x**2 for x in numbers}
print(f"集合推导式结果: {set_result}")
# 输出: 集合推导式结果: {1, 4, 9, 16, 25}
# 3. 顺序问题
# 列表推导式保持顺序
list_result = [x % 3 for x in range(10)]
print(f"列表推导式(有序): {list_result}")
# 输出: 列表推导式(有序): [0, 1, 2, 0, 1, 2, 0, 1, 2, 0]
# 集合推导式不保证顺序
set_result = {x % 3 for x in range(10)}
print(f"集合推导式(无序): {set_result}")
# 输出: 集合推导式(无序): {0, 1, 2}
性能对比
import time

# 1. De-duplication benchmark
data = list(range(10000)) * 10  # 100,000 elements with many duplicates

# List comprehension followed by a set() conversion.
# time.perf_counter() is used rather than time.time(): it is the clock
# intended for measuring short intervals, so the elapsed values below are
# not rounded down to zero on platforms with a coarse wall clock (which
# would make the ratio computations divide by zero).
start_time = time.perf_counter()
unique_list = list(set([x for x in data]))
list_time = time.perf_counter() - start_time

# Set comprehension
start_time = time.perf_counter()
unique_set = {x for x in data}
set_time = time.perf_counter() - start_time

print(f"列表推导式+set转换时间: {list_time:.6f}秒")
print(f"集合推导式时间: {set_time:.6f}秒")
print(f"性能提升: {list_time/set_time:.2f}倍")

# 2. Membership-test benchmark
# Build a set and a list that hold the same 10,000 integers.
lookup_set = {x for x in range(10000)}
lookup_list = [x for x in range(10000)]

# Look up 1,000 values in the set
start_time = time.perf_counter()
for i in range(1000):
    _ = i in lookup_set
set_lookup_time = time.perf_counter() - start_time

# Look up the same 1,000 values in the list
start_time = time.perf_counter()
for i in range(1000):
    _ = i in lookup_list
list_lookup_time = time.perf_counter() - start_time

print(f"\n集合查找时间: {set_lookup_time:.6f}秒")
print(f"列表查找时间: {list_lookup_time:.6f}秒")
print(f"性能提升: {list_lookup_time/set_lookup_time:.2f}倍")
集合推导式的局限性
何时不应使用集合推导式
# 1. 需要保持顺序的情况
# 不推荐:集合不保证顺序
result = {x for x in range(10)}
# 推荐:使用列表推导式
result = [x for x in range(10)]
# 2. 需要重复元素的情况
# 不推荐:集合会去重
result = {x % 3 for x in range(10)}
# 推荐:使用列表推导式
result = [x % 3 for x in range(10)]
# 3. 复杂逻辑不适合集合推导式
# 不推荐:过于复杂的集合推导式
complex_result = {
x**2 + y**2 if (x + y) % 2 == 0 else (x - y)**2
for x in range(5)
for y in range(5)
if x != y and (x * y) % 3 == 0
}
# 推荐:使用传统for循环
complex_result = set()
for x in range(5):
    for y in range(5):
        if x != y and (x * y) % 3 == 0:
            if (x + y) % 2 == 0:
                complex_result.add(x**2 + y**2)
            else:
                complex_result.add((x - y)**2)
最佳实践
使用集合推导式的建议
# 1. 保持简洁
# 推荐:简单明了
squares = {x**2 for x in range(10)}
# 不推荐:过于复杂
result = {x if condition(x) else default(x) for x in data if filter_condition(x)}
# 2. 使用有意义的变量名
# 推荐:清晰的变量名
word_lengths = {len(word) for word in words}
# 不推荐:模糊的变量名
s = {len(x) for x in y}
# 3. 合理使用条件
# 推荐:简单的条件筛选
even_numbers = {x for x in range(10) if x % 2 == 0}
# 不推荐:复杂的嵌套条件
result = {x for x in data if condition1(x) and condition2(x) or condition3(x)}
# 4. 考虑可读性
# 推荐:易于理解
unique_words = {word.lower() for word in words}
# 不推荐:难以理解
result = {x.strip().lower() for x in y if x and not x.startswith('#')}
总结
集合推导式是Python中创建集合和去重的强大工具,它具有以下特点:
- 自动去重:自动去除重复元素
- 简洁性:一行代码完成复杂的集合创建和去重
- 高效性:查找和去重操作性能优异
- 可读性:意图明确,易于理解
- 集合运算:生成的结果是集合,可直接参与交集、并集等集合运算
在使用集合推导式时,应该:
- 保持代码简洁明了
- 使用有意义的变量名
- 避免过于复杂的逻辑
- 注意集合无序的特性
- 在需要去重时优先使用
通过合理使用集合推导式,可以写出更加Pythonic和高效的代码。