Python查找优化案例实操?

wen python案例 2

本文目录导读:

  1. 案例1:列表查找 vs 集合查找
  2. 案例2:字典查找优化
  3. 案例3:二分查找优化
  4. 案例4:字符串查找优化
  5. 案例5:实战:用户系统查找优化
  6. 案例6:高级:LRU缓存优化
  7. 性能优化总结表
  8. 选择建议

我来分享几个Python查找优化的实战案例,从基础到进阶。

案例1:列表查找 vs 集合查找

import time
import random
# 生成测试数据
n = 1000000
data_list = list(range(n))
data_set = set(data_list)
search_items = [random.randint(0, n*2) for _ in range(10000)]
# 列表查找
start = time.time()
for item in search_items:
    item in data_list
list_time = time.time() - start
# 集合查找
start = time.time()
for item in search_items:
    item in data_set
set_time = time.time() - start
print(f"列表查找时间: {list_time:.4f}秒")
print(f"集合查找时间: {set_time:.4f}秒")
print(f"集合比列表快了 {list_time/set_time:.0f} 倍")

案例2:字典查找优化

from collections import defaultdict, Counter
import time
# 场景:统计单词频率并快速查找
words = ["apple", "banana", "apple", "orange", "banana"] * 100000
# 方法1:普通字典
def count_with_dict():
    count = {}
    for word in words:
        if word in count:
            count[word] += 1
        else:
            count[word] = 1
    return count
# 方法2:defaultdict
def count_with_defaultdict():
    count = defaultdict(int)
    for word in words:
        count[word] += 1
    return count
# 方法3:Counter(最快)
def count_with_counter():
    return Counter(words)
# 性能对比
for func in [count_with_dict, count_with_defaultdict, count_with_counter]:
    start = time.time()
    result = func()
    print(f"{func.__name__}: {time.time() - start:.4f}秒")

案例3:二分查找优化

import bisect
import time
import random
# 有序列表查找优化
data = sorted([random.randint(0, 1000000) for _ in range(100000)])
search_items = [random.randint(0, 1000000) for _ in range(1000)]
# 线性查找
def linear_search(arr, targets):
    results = []
    for target in targets:
        found = False
        for item in arr:
            if item == target:
                found = True
                break
        results.append(found)
    return results
# 二分查找
def binary_search(arr, targets):
    results = []
    for target in targets:
        idx = bisect.bisect_left(arr, target)
        results.append(idx < len(arr) and arr[idx] == target)
    return results
# 性能对比
start = time.time()
linear_search(data, search_items)
print(f"线性查找: {time.time() - start:.4f}秒")
start = time.time()
binary_search(data, search_items)
print(f"二分查找: {time.time() - start:.4f}秒")

案例4:字符串查找优化

import re
import time
# 场景:在大量文本中查找特定模式
text = "Python" * 100000 + "优化" * 100000
# 方法1:使用in操作符
def find_with_in(text, pattern):
    return pattern in text
# 方法2:使用find方法
def find_with_find(text, pattern):
    return text.find(pattern) != -1
# 方法3:使用正则表达式(最适合复杂模式)
def find_with_regex(text, pattern):
    return bool(re.search(pattern, text))
# 测试性能
patterns = ["Python", "优化", "不存在"]
for pattern in patterns:
    print(f"\n查找 '{pattern}':")
    for func in [find_with_in, find_with_find, find_with_regex]:
        start = time.time()
        for _ in range(1000):
            func(text, pattern)
        print(f"{func.__name__}: {time.time() - start:.4f}秒")

案例5:实战:用户系统查找优化

import hashlib
import time
from collections import defaultdict
# 模拟用户数据
class UserDatabase:
    def __init__(self, size=100000):
        self.users = []
        self.id_index = {}
        self.email_index = defaultdict(list)
        self.name_index = {}
        self._generate_users(size)
    def _generate_users(self, size):
        for i in range(size):
            user = {
                'id': i,
                'name': f'user_{i}',
                'email': f'user{i}@example.com',
                'phone': f'138{i:08d}'
            }
            self.users.append(user)
            self.id_index[i] = user
            domain = user['email'].split('@')[1]
            self.email_index[domain].append(user)
    # 无索引查找
    def find_by_id_linear(self, user_id):
        for user in self.users:
            if user['id'] == user_id:
                return user
        return None
    # 有索引查找
    def find_by_id_indexed(self, user_id):
        return self.id_index.get(user_id)
    # 按域名查找(多值索引)
    def find_by_domain(self, domain):
        return self.email_index.get(domain, [])
# 性能测试
db = UserDatabase(100000)
# 测试ID查找
user_id = 50000
start = time.time()
for _ in range(1000):
    db.find_by_id_linear(user_id)
print(f"线性查找: {time.time() - start:.4f}秒")
start = time.time()
for _ in range(1000):
    db.find_by_id_indexed(user_id)
print(f"索引查找: {time.time() - start:.4f}秒")
# 测试域名查找
start = time.time()
domain_users = db.find_by_domain('example.com')
print(f"域名查找: 找到 {len(domain_users)} 个用户")

案例6:高级:LRU缓存优化

from functools import lru_cache
import time
# 模拟计算密集型函数
def expensive_function(n):
    """模拟耗时计算"""
    time.sleep(0.001)  # 模拟计算延迟
    return n * n + n
# 不带缓存的版本
def find_without_cache(numbers):
    results = []
    for n in numbers:
        results.append(expensive_function(n))
    return results
# 带LRU缓存的版本
@lru_cache(maxsize=1000)
def cached_expensive_function(n):
    return expensive_function(n)
def find_with_cache(numbers):
    results = []
    for n in numbers:
        results.append(cached_expensive_function(n))
    return results
# 测试数据(包含重复值)
test_data = [i % 500 for i in range(1000)]
# 性能对比
start = time.time()
find_without_cache(test_data)
print(f"无缓存: {time.time() - start:.4f}秒")
start = time.time()
find_with_cache(test_data)
print(f"LRU缓存: {time.time() - start:.4f}秒")
# 查看缓存信息
print(f"缓存命中率: {cached_expensive_function.cache_info()}")

性能优化总结表

查找场景 优化前 优化后 速度提升
成员检查 列表(list) 集合(set) 100-1000x
频率统计 普通字典 Counter 2-5x
有序查找 线性搜索 二分查找 100-1000x
字符串查找 复杂模式手写 正则表达式 10-100x
重复计算 每次重新计算 LRU缓存 100-10000x

选择建议

  1. 小数据量 (<100):不必过度优化,列表查找足够
  2. 大数据量:优先考虑集合(set)、字典(dict)
  3. 频繁查找:建立索引(字典、集合)
  4. 有序数据:使用二分查找(bisect)
  5. 重复计算:添加缓存(lru_cache)

这些实战案例覆盖了日常开发中最常见的查找优化场景,你可以根据实际需求选择合适的优化策略。

标签: 哈希索引

抱歉,评论功能暂时关闭!