散列表(Hash Table)是一种高效的键值对数据结构,具有快速的查找、插入和删除操作。它通过哈希函数将键映射到散列表的索引,实现对数据的快速访问。
散列表的实现
- 哈希函数:将键转换为整数索引。
- 冲突处理 :解决哈希冲突(不同的键映射到同一索引)。
- 链地址法(拉链法):每个索引位置用链表存储多个键值对。
- 开放寻址法:当冲突发生时,使用线性探测、二次探测或双重散列等策略寻找新的空闲位置。
Python 实现:链地址法(拉链法)
我们将使用链地址法来实现一个简单的散列表。
散列表类实现:
python
class HashTable:
def __init__(self, size=10):
"""初始化散列表"""
self.size = size
self.table = [[] for _ in range(size)]
def _hash_function(self, key):
"""哈希函数:将键转换为表索引"""
return hash(key) % self.size
def insert(self, key, value):
"""插入键值对"""
index = self._hash_function(key)
# 检查键是否已经存在
for pair in self.table[index]:
if pair[0] == key:
pair[1] = value # 更新现有键的值
return
# 插入新键值对
self.table[index].append([key, value])
def search(self, key):
"""根据键查找值"""
index = self._hash_function(key)
for pair in self.table[index]:
if pair[0] == key:
return pair[1]
return None # 找不到对应键
def delete(self, key):
"""删除键值对"""
index = self._hash_function(key)
for i, pair in enumerate(self.table[index]):
if pair[0] == key:
del self.table[index][i]
return True
return False # 找不到对应键
def display(self):
"""显示散列表"""
for i, pairs in enumerate(self.table):
print(f"Index {i}: {pairs}")
# 示例使用
hash_table = HashTable(size=7)
# 插入键值对
hash_table.insert("Alice", 25)
hash_table.insert("Bob", 30)
hash_table.insert("Charlie", 35)
hash_table.insert("Diana", 40)
hash_table.insert("Eve", 45)
hash_table.insert("Frank", 50)
hash_table.insert("Grace", 55)
hash_table.insert("Heidi", 60)
# 显示散列表
print("Initial Hash Table:")
hash_table.display()
# 查找键
print("\nSearching for 'Diana':", hash_table.search("Diana"))
print("Searching for 'Eve':", hash_table.search("Eve"))
print("Searching for 'Zara':", hash_table.search("Zara"))
# 删除键值对
hash_table.delete("Alice")
hash_table.delete("Charlie")
hash_table.delete("Heidi")
# 显示删除后的散列表
print("\nHash Table after deletions:")
hash_table.display()
Python 实现:开放寻址法
我们还可以使用开放寻址法来实现散列表。这里使用线性探测作为冲突处理策略。
散列表类实现:
python
class OpenAddressingHashTable:
def __init__(self, size=10):
"""初始化散列表"""
self.size = size
self.table = [None] * size
self.deleted = object() # 标记删除位置
def _hash_function(self, key):
"""哈希函数:将键转换为表索引"""
return hash(key) % self.size
def _probe(self, key, index):
"""线性探测"""
return (index + 1) % self.size
def insert(self, key, value):
"""插入键值对"""
index = self._hash_function(key)
while self.table[index] is not None and self.table[index] != self.deleted:
if self.table[index][0] == key:
self.table[index] = (key, value) # 更新现有键的值
return
index = self._probe(key, index)
self.table[index] = (key, value)
def search(self, key):
"""根据键查找值"""
index = self._hash_function(key)
start_index = index
while self.table[index] is not None:
if self.table[index] != self.deleted and self.table[index][0] == key:
return self.table[index][1]
index = self._probe(key, index)
if index == start_index:
break
return None # 找不到对应键
def delete(self, key):
"""删除键值对"""
index = self._hash_function(key)
start_index = index
while self.table[index] is not None:
if self.table[index] != self.deleted and self.table[index][0] == key:
self.table[index] = self.deleted
return True
index = self._probe(key, index)
if index == start_index:
break
return False # 找不到对应键
def display(self):
"""显示散列表"""
for i, pair in enumerate(self.table):
print(f"Index {i}: {pair}")
# 示例使用
open_addressing_hash_table = OpenAddressingHashTable(size=7)
# 插入键值对
open_addressing_hash_table.insert("Alice", 25)
open_addressing_hash_table.insert("Bob", 30)
open_addressing_hash_table.insert("Charlie", 35)
open_addressing_hash_table.insert("Diana", 40)
open_addressing_hash_table.insert("Eve", 45)
open_addressing_hash_table.insert("Frank", 50)
open_addressing_hash_table.insert("Grace", 55)
open_addressing_hash_table.insert("Heidi", 60)
# 显示散列表
print("Initial Open Addressing Hash Table:")
open_addressing_hash_table.display()
# 查找键
print("\nSearching for 'Diana':", open_addressing_hash_table.search("Diana"))
print("Searching for 'Eve':", open_addressing_hash_table.search("Eve"))
print("Searching for 'Zara':", open_addressing_hash_table.search("Zara"))
# 删除键值对
open_addressing_hash_table.delete("Alice")
open_addressing_hash_table.delete("Charlie")
open_addressing_hash_table.delete("Heidi")
# 显示删除后的散列表
print("\nOpen Addressing Hash Table after deletions:")
open_addressing_hash_table.display()
总结
- 链地址法(拉链法):使用链表解决哈希冲突,简单易用。
- 开放寻址法:通过线性探测或其他策略处理哈希冲突,但需保持装载因子不过高。
- Python 实现 :Python 中可以使用
dict
类实现高效的键值对存储,但手动实现散列表有助于更好理解其原理和应用。
我们继续扩展和深入探讨散列表的实现与应用,可以考虑:
- 改进的开放寻址法:实现更复杂的探测策略,如二次探测和双重散列。
- 扩展的链地址法:使用红黑树或 AVL 树作为链表中的数据结构,提高查询效率。
- 实际应用案例:例如,统计英文文章中每个单词的出现次数。
改进的开放寻址法:二次探测和双重散列
二次探测实现
在二次探测中,探测位置依次为:
其中, ℎ(𝑘)为哈希函数, 𝑖是探测次数, 𝑚为表的大小。
Python 实现:
python
class QuadraticProbingHashTable:
def __init__(self, size=10):
"""初始化散列表"""
self.size = size
self.table = [None] * size
self.deleted = object()
def _hash_function(self, key):
"""哈希函数:将键转换为表索引"""
return hash(key) % self.size
def _probe(self, key, index, attempt):
"""二次探测"""
return (index + attempt**2) % self.size
def insert(self, key, value):
"""插入键值对"""
index = self._hash_function(key)
attempt = 0
while self.table[index] is not None and self.table[index] != self.deleted:
if self.table[index][0] == key:
self.table[index] = (key, value) # 更新现有键的值
return
attempt += 1
index = self._probe(key, index, attempt)
self.table[index] = (key, value)
def search(self, key):
"""根据键查找值"""
index = self._hash_function(key)
start_index = index
attempt = 0
while self.table[index] is not None:
if self.table[index] != self.deleted and self.table[index][0] == key:
return self.table[index][1]
attempt += 1
index = self._probe(key, index, attempt)
if index == start_index:
break
return None # 找不到对应键
def delete(self, key):
"""删除键值对"""
index = self._hash_function(key)
start_index = index
attempt = 0
while self.table[index] is not None:
if self.table[index] != self.deleted and self.table[index][0] == key:
self.table[index] = self.deleted
return True
attempt += 1
index = self._probe(key, index, attempt)
if index == start_index:
break
return False # 找不到对应键
def display(self):
"""显示散列表"""
for i, pair in enumerate(self.table):
print(f"Index {i}: {pair}")
# 示例使用
quadratic_probing_hash_table = QuadraticProbingHashTable(size=7)
# 插入键值对
quadratic_probing_hash_table.insert("Alice", 25)
quadratic_probing_hash_table.insert("Bob", 30)
quadratic_probing_hash_table.insert("Charlie", 35)
quadratic_probing_hash_table.insert("Diana", 40)
quadratic_probing_hash_table.insert("Eve", 45)
quadratic_probing_hash_table.insert("Frank", 50)
quadratic_probing_hash_table.insert("Grace", 55)
quadratic_probing_hash_table.insert("Heidi", 60)
# 显示散列表
print("Initial Quadratic Probing Hash Table:")
quadratic_probing_hash_table.display()
# 查找键
print("\nSearching for 'Diana':", quadratic_probing_hash_table.search("Diana"))
print("Searching for 'Eve':", quadratic_probing_hash_table.search("Eve"))
print("Searching for 'Zara':", quadratic_probing_hash_table.search("Zara"))
# 删除键值对
quadratic_probing_hash_table.delete("Alice")
quadratic_probing_hash_table.delete("Charlie")
quadratic_probing_hash_table.delete("Heidi")
# 显示删除后的散列表
print("\nQuadratic Probing Hash Table after deletions:")
quadratic_probing_hash_table.display()
双重散列实现
在双重散列中,我们使用两个不同的哈希函数来确定探测位置:
其中, ℎ1(𝑘)和 ℎ2(𝑘)分别是两个不同的哈希函数。
Python 实现:
python
class DoubleHashingHashTable:
def __init__(self, size=10):
"""初始化散列表"""
self.size = size
self.table = [None] * size
self.deleted = object()
def _hash_function1(self, key):
"""主哈希函数:将键转换为表索引"""
return hash(key) % self.size
def _hash_function2(self, key):
"""第二哈希函数:确保非零返回值"""
return 1 + (hash(key) % (self.size - 1))
def _probe(self, key, index, attempt):
"""双重散列探测"""
return (index + attempt * self._hash_function2(key)) % self.size
def insert(self, key, value):
"""插入键值对"""
index = self._hash_function1(key)
attempt = 0
while self.table[index] is not None and self.table[index] != self.deleted:
if self.table[index][0] == key:
self.table[index] = (key, value) # 更新现有键的值
return
attempt += 1
index = self._probe(key, index, attempt)
self.table[index] = (key, value)
def search(self, key):
"""根据键查找值"""
index = self._hash_function1(key)
start_index = index
attempt = 0
while self.table[index] is not None:
if self.table[index] != self.deleted and self.table[index][0] == key:
return self.table[index][1]
attempt += 1
index = self._probe(key, index, attempt)
if index == start_index:
break
return None # 找不到对应键
def delete(self, key):
"""删除键值对"""
index = self._hash_function1(key)
start_index = index
attempt = 0
while self.table[index] is not None:
if self.table[index] != self.deleted and self.table[index][0] == key:
self.table[index] = self.deleted
return True
attempt += 1
index = self._probe(key, index, attempt)
if index == start_index:
break
return False # 找不到对应键
def display(self):
"""显示散列表"""
for i, pair in enumerate(self.table):
print(f"Index {i}: {pair}")
# 示例使用
double_hashing_hash_table = DoubleHashingHashTable(size=7)
# 插入键值对
double_hashing_hash_table.insert("Alice", 25)
double_hashing_hash_table.insert("Bob", 30)
double_hashing_hash_table.insert("Charlie", 35)
double_hashing_hash_table.insert("Diana", 40)
double_hashing_hash_table.insert("Eve", 45)
double_hashing_hash_table.insert("Frank", 50)
double_hashing_hash_table.insert("Grace", 55)
double_hashing_hash_table.insert("Heidi", 60)
# 显示散列表
print("Initial Double Hashing Hash Table:")
double_hashing_hash_table.display()
# 查找键
print("\nSearching for 'Diana':", double_hashing_hash_table.search("Diana"))
print("Searching for 'Eve':", double_hashing_hash_table.search("Eve"))
print("Searching for 'Zara':", double_hashing_hash_table.search("Zara"))
# 删除键值对
double_hashing_hash_table.delete("Alice")
double_hashing_hash_table.delete("Charlie")
double_hashing_hash_table.delete("Heidi")
# 显示删除后的散列表
print("\nDouble Hashing Hash Table after deletions:")
double_hashing_hash_table.display()
实际应用案例:单词计数
利用散列表来统计英文文章中每个单词的出现次数。
Python 实现:
python
import string
class WordCounter:
def __init__(self, size=100):
"""初始化散列表"""
self.size = size
self.table = [None] * size
self.deleted = object()
def _hash_function(self, key):
"""哈希函数:将键转换为表索引"""
return hash(key) % self.size
def insert(self, key, value):
"""插入键值对"""
index = self._hash_function(key)
while self.table[index] is not None and self.table[index] != self.deleted:
if self.table[index][0] == key:
self.table[index] = (key, value) # 更新现有键的值
return
index = (index + 1) % self.size
self.table[index] = (key, value)
def search(self, key):
"""根据键查找值"""
index = self._hash_function(key)
start_index = index
while self.table[index] is not None:
if self.table[index] != self.deleted and self.table[index][0] == key:
return self.table[index][1]
index = (index + 1) % self.size
if index == start_index:
break
return None # 找不到对应键
def increment(self, key):
"""增加单词的计数"""
count = self.search(key)
if count is None:
self.insert(key, 1)
else:
self.insert(key, count + 1)
def display(self):
"""显示单词计数"""
for pair in self.table:
if pair is not None and pair != self.deleted:
print(f"{pair[0]}: {pair[1]}")
# 示例使用
text = """
This is a sample text. It is designed to test the hash table implementation.
The hash table is supposed to count the occurrence of each word.
"""
# 清理文本,转换为小写
translator = str.maketrans('', '', string.punctuation)
clean_text = text.translate(translator).lower()
words = clean_text.split()
# 使用散列表计数单词
word_counter = WordCounter(size=50)
for word in words:
word_counter.increment(word)
# 显示单词计数结果
print("Word Count:")
word_counter.display()
总结
- 开放寻址法改进:二次探测和双重散列是两种有效的开放寻址方法。
- 链地址法扩展:可以用红黑树、AVL 树等数据结构代替链表,提高查找效率。
- 实际应用案例:散列表在单词计数、字典存储等应用中具有广泛的实际价值。
让我们继续深入探讨散列表的其他改进和实际应用案例,包括:
- 基于红黑树或 AVL 树的链地址法实现:提高冲突处理效率。
- Bloom Filter:利用散列表和布隆过滤器进行快速集合成员测试。
- LRU Cache:结合散列表与双向链表实现的缓存系统。
基于红黑树的链地址法实现
链地址法通常使用链表来处理冲突。我们可以使用更高效的红黑树或 AVL 树来替代链表,提高冲突处理效率。
Python 实现:红黑树节点类
python
class RedBlackTreeNode:
def __init__(self, key, value, color='RED', left=None, right=None, parent=None):
"""初始化红黑树节点"""
self.key = key
self.value = value
self.color = color # 'RED' or 'BLACK'
self.left = left
self.right = right
self.parent = parent
class RedBlackTree:
def __init__(self):
"""初始化红黑树"""
self.NIL = RedBlackTreeNode(key=None, value=None, color='BLACK')
self.root = self.NIL
def _rotate_left(self, node):
"""左旋转"""
right = node.right
node.right = right.left
if right.left != self.NIL:
right.left.parent = node
right.parent = node.parent
if node.parent is None:
self.root = right
elif node == node.parent.left:
node.parent.left = right
else:
node.parent.right = right
right.left = node
node.parent = right
def _rotate_right(self, node):
"""右旋转"""
left = node.left
node.left = left.right
if left.right != self.NIL:
left.right.parent = node
left.parent = node.parent
if node.parent is None:
self.root = left
elif node == node.parent.right:
node.parent.right = left
else:
node.parent.left = left
left.right = node
node.parent = left
def _fix_insert(self, node):
"""修复插入操作"""
while node != self.root and node.parent.color == 'RED':
if node.parent == node.parent.parent.left:
uncle = node.parent.parent.right
if uncle.color == 'RED':
node.parent.color = 'BLACK'
uncle.color = 'BLACK'
node.parent.parent.color = 'RED'
node = node.parent.parent
else:
if node == node.parent.right:
node = node.parent
self._rotate_left(node)
node.parent.color = 'BLACK'
node.parent.parent.color = 'RED'
self._rotate_right(node.parent.parent)
else:
uncle = node.parent.parent.left
if uncle.color == 'RED':
node.parent.color = 'BLACK'
uncle.color = 'BLACK'
node.parent.parent.color = 'RED'
node = node.parent.parent
else:
if node == node.parent.left:
node = node.parent
self._rotate_right(node)
node.parent.color = 'BLACK'
node.parent.parent.color = 'RED'
self._rotate_left(node.parent.parent)
self.root.color = 'BLACK'
def insert(self, key, value):
"""插入键值对"""
new_node = RedBlackTreeNode(key, value, color='RED', left=self.NIL, right=self.NIL, parent=None)
parent = None
current = self.root
while current != self.NIL:
parent = current
if key < current.key:
current = current.left
else:
current = current.right
new_node.parent = parent
if parent is None:
self.root = new_node
elif key < parent.key:
parent.left = new_node
else:
parent.right = new_node
self._fix_insert(new_node)
def search(self, key):
"""根据键查找值"""
current = self.root
while current != self.NIL and key != current.key:
if key < current.key:
current = current.left
else:
current = current.right
if current == self.NIL:
return None
return current.value
def display(self, node=None, depth=0):
"""显示红黑树"""
if node is None:
node = self.root
if node != self.NIL:
print(" " * 4 * depth + f"Key: {node.key}, Value: {node.value}, Color: {node.color}")
self.display(node.left, depth + 1)
self.display(node.right, depth + 1)
# 示例使用
rbt = RedBlackTree()
rbt.insert("Alice", 25)
rbt.insert("Bob", 30)
rbt.insert("Charlie", 35)
rbt.insert("Diana", 40)
rbt.insert("Eve", 45)
print("Initial Red-Black Tree:")
rbt.display()
Python 实现:基于红黑树的链地址法
python
class RedBlackTreeHashTable:
def __init__(self, size=10):
"""初始化散列表"""
self.size = size
self.table = [RedBlackTree() for _ in range(size)]
def _hash_function(self, key):
"""哈希函数:将键转换为表索引"""
return hash(key) % self.size
def insert(self, key, value):
"""插入键值对"""
index = self._hash_function(key)
self.table[index].insert(key, value)
def search(self, key):
"""根据键查找值"""
index = self._hash_function(key)
return self.table[index].search(key)
def display(self):
"""显示散列表"""
for i, tree in enumerate(self.table):
print(f"Index {i}:")
tree.display()
# 示例使用
rbt_hash_table = RedBlackTreeHashTable(size=7)
rbt_hash_table.insert("Alice", 25)
rbt_hash_table.insert("Bob", 30)
rbt_hash_table.insert("Charlie", 35)
rbt_hash_table.insert("Diana", 40)
rbt_hash_table.insert("Eve", 45)
print("Initial Red-Black Tree Hash Table:")
rbt_hash_table.display()
布隆过滤器
布隆过滤器是一种基于散列表的概率性数据结构,用于测试集合成员资格。它具有空间效率高的特点,但可能会产生误报。
Python 实现:
python
from bitarray import bitarray
import hashlib
class BloomFilter:
def __init__(self, size, hash_count):
"""初始化布隆过滤器"""
self.size = size
self.hash_count = hash_count
self.bit_array = bitarray(size)
self.bit_array.setall(0)
def _hashes(self, item):
"""计算哈希函数值"""
result = []
for i in range(self.hash_count):
hash_result = int(hashlib.md5((str(i) + item).encode()).hexdigest(), 16)
result.append(hash_result % self.size)
return result
def add(self, item):
"""添加元素到布隆过滤器"""
for index in self._hashes(item):
self.bit_array[index] = 1
def __contains__(self, item):
"""检查元素是否在布隆过滤器中"""
return all(self.bit_array[index] for index in self._hashes(item))
# 示例使用
bloom_filter = BloomFilter(size=1000, hash_count=7)
bloom_filter.add("Alice")
bloom_filter.add("Bob")
bloom_filter.add("Charlie")
print("Checking membership in Bloom Filter:")
print("Alice in filter:", "Alice" in bloom_filter)
print("Bob in filter:", "Bob" in bloom_filter)
print("Charlie in filter:", "Charlie" in bloom_filter)
print("Diana in filter:", "Diana" in bloom_filter)
LRU Cache
LRU Cache(Least Recently Used Cache)是一种常见的缓存策略,它通过淘汰最少使用的数据保持缓存的最新状态。结合散列表和双向链表,可以有效实现该策略。
Python 实现:
python
class ListNode:
"""双向链表节点类"""
def __init__(self, key=None, value=None):
self.key = key
self.value = value
self.prev = None
self.next = None
class LRUCache:
"""LRU Cache 实现类"""
def __init__(self, capacity=10):
self.capacity = capacity
self.cache = {}
self.head = ListNode()
self.tail = ListNode()
self.head.next = self.tail
self.tail.prev = self.head
def _remove(self, node):
"""移除节点"""
prev = node.prev
next = node.next
prev.next = next
next.prev = prev
def _add_to_head(self, node):
"""添加节点到头部"""
node.next = self.head.next
node.prev = self.head
self.head.next.prev = node
self.head.next = node
def _move_to_head(self, node):
"""移动节点到头部"""
self._remove(node)
self._add_to_head(node)
def _pop_tail(self):
"""弹出尾部节点"""
node = self.tail.prev
self._remove(node)
return node
def get(self, key):
"""获取缓存中的值"""
node = self.cache.get(key)
if not node:
return -1
self._move_to_head(node)
return node.value
def put(self, key, value):
"""存储键值对"""
node = self.cache.get(key)
if not node:
new_node = ListNode(key, value)
self.cache[key] = new_node
self._add_to_head(new_node)
if len(self.cache) > self.capacity:
tail = self._pop_tail()
del self.cache[tail.key]
else:
node.value = value
self._move_to_head(node)
def display(self):
"""显示缓存状态"""
current = self.head.next
while current != self.tail:
print(f"({current.key}, {current.value})", end=" ")
current = current.next
print()
# 示例使用
lru_cache = LRUCache(capacity=3)
lru_cache.put("Alice", 25)
lru_cache.put("Bob", 30)
lru_cache.put("Charlie", 35)
print("Initial Cache:")
lru_cache.display()
# 更新并添加新的键值对
lru_cache.put("Diana", 40)
lru_cache.get("Alice")
lru_cache.put("Eve", 45)
print("Cache after updates:")
lru_cache.display()
总结
- 红黑树链地址法:使用红黑树替代链表,提高冲突处理效率。
- 布隆过滤器:高效的集合成员测试结构。
- LRU Cache:结合散列表与双向链表,实现高效缓存策略。
通过这些改进与应用案例,我们可以更深入地了解散列表的实际使用场景与优化策略。