文本是《数据结构(共13篇)》专题的第 6 篇。阅读本文前,建议先阅读前面的文章:
一、什么是集合和映射?
集合是高级数据结构的一种,它们的底层可以用多种方式来实现,就像之前的栈和队列一样。集合就像一个容器一样,我们这里实现的集合里边存储的数据是不能重复的,这样的数据结构可以帮助我们更加快速的进行统计。讲到这里是不是发现和我们之前的二分搜索树非常相似,其实这一次我们就会使用二分搜索树作为集合的底层实现。
二、二分搜索树实现集合Set
1️⃣ 实现集合Set接口
from abc import ABC, abstractmethod
class Set(ABC):
"""集合抽象类"""
@abstractmethod
def add(self, e):
"""
向集合中添加元素(实现去重的操作)
"""
pass
@abstractmethod
def remove(self, e):
"""
从集合中删除元素
"""
pass
@abstractmethod
def contains(self, e):
"""
查询集合中是否有相同元素
"""
pass
@abstractmethod
def get_size(self):
"""
获取集合中元素的个数
"""
pass
@abstractmethod
def is_empty(self):
"""
查看集合是否为空
"""
pass
2️⃣ 实现二分搜索树BST类
class TreeNode:
"""二分搜索树节点"""
def __init__(self, e):
self.e = e
self.left = None
self.right = None
class BST:
"""二分搜索树"""
def __init__(self):
self.root = None
self.size = 0
def get_size(self):
return self.size
def is_empty(self):
return self.size == 0
def add(self, e):
"""向二分搜索树中添加元素e"""
self.root = self._add(self.root, e)
def _add(self, node, e):
"""向以node为根的二分搜索树中插入元素e"""
if node is None:
self.size += 1
return TreeNode(e)
if e < node.e:
node.left = self._add(node.left, e)
elif e > node.e:
node.right = self._add(node.right, e)
return node
def contains(self, e):
"""查看二分搜索树中是否包含元素e"""
return self._contains(self.root, e)
def _contains(self, node, e):
"""查看以node为根的二分搜索树中是否包含元素e"""
if node is None:
return False
if e == node.e:
return True
elif e < node.e:
return self._contains(node.left, e)
else:
return self._contains(node.right, e)
def remove(self, e):
"""从二分搜索树中删除元素e"""
self.root = self._remove(self.root, e)
def _remove(self, node, e):
"""删除以node为根的二分搜索树中的元素e"""
if node is None:
return None
if e < node.e:
node.left = self._remove(node.left, e)
return node
elif e > node.e:
node.right = self._remove(node.right, e)
return node
else: # e == node.e
if node.left is None:
right_node = node.right
self.size -= 1
return right_node
if node.right is None:
left_node = node.left
self.size -= 1
return left_node
# 待删除节点左右子树均不为空的情况
successor = self._minimum(node.right)
successor.right = self._remove_min(node.right)
successor.left = node.left
return successor
def _minimum(self, node):
"""返回以node为根的二分搜索树的最小值所在的节点"""
if node.left is None:
return node
return self._minimum(node.left)
def _remove_min(self, node):
"""删除掉以node为根的二分搜索树中的最小节点"""
if node.left is None:
right_node = node.right
self.size -= 1
return right_node
node.left = self._remove_min(node.left)
return node
3️⃣ 实现具体的BSTSet类
class BSTSet(Set):
"""基于二分搜索树的集合实现"""
def __init__(self):
"""初始化BST"""
self.bst = BST()
def add(self, e):
"""向集合中添加元素(实现去重添加)"""
self.bst.add(e)
def remove(self, e):
"""从集合中删除一个元素"""
self.bst.remove(e)
def contains(self, e):
"""查看集合中是否包含元素e"""
return self.bst.contains(e)
def get_size(self):
"""获取集合中元素的个数"""
return self.bst.get_size()
def is_empty(self):
"""查看集合是否为空"""
return self.bst.is_empty()
其实不难发现,在使用二分搜索树实现集合的时候,我们没有做多余的操作,因为我们自己实现的二分搜索树已经完全包含了集合的所有特性;
三、测试
1️⃣ 编写测试工具类
import re
import os
class FileOperation:
"""文件操作工具类"""
@staticmethod
def read_file(filename):
"""
读取文件名称为filename中的内容,并将其中包含的所有词语返回
"""
words = []
if not filename:
print("filename is None")
return words
try:
# 检查文件是否存在
if not os.path.exists(filename):
print(f"File {filename} does not exist")
return words
# 读取文件内容
with open(filename, 'r', encoding='utf-8') as file:
content = file.read()
# 简单分词 - 提取所有字母组成的单词
words = re.findall(r'[a-zA-Z]+', content.lower())
except IOError:
print(f"Cannot open {filename}")
return words
2️⃣ 编写测试程序
def main():
print("Pride and Prejudice")
words1 = FileOperation.read_file("pride-and-prejudice.txt")
if words1:
print(f"Total words: {len(words1)}")
set1 = BSTSet()
for word in words1:
set1.add(word)
print(f"Total different words: {set1.get_size()}")
print()
print("A Tale of Two Cities")
words2 = FileOperation.read_file("a-tale-of-two-cities.txt")
if words2:
print(f"Total words: {len(words2)}")
set2 = BSTSet()
for word in words2:
set2.add(word)
print(f"Total different words: {set2.get_size()}")
if __name__ == "__main__":
main()
四、链表实现集合Set
1️⃣ 实现链表类
class ListNode:
"""链表节点"""
def __init__(self, e=None):
self.e = e
self.next = None
class LinkedList:
"""链表"""
def __init__(self):
self.dummy_head = ListNode() # 虚拟头节点
self.size = 0
def get_size(self):
return self.size
def is_empty(self):
return self.size == 0
def add_first(self, e):
"""在链表头添加元素"""
self.add(0, e)
def add(self, index, e):
"""在链表的index位置添加新的元素e"""
if index < 0 or index > self.size:
raise ValueError("Add failed. Illegal index.")
prev = self.dummy_head
for i in range(index):
prev = prev.next
node = ListNode(e)
node.next = prev.next
prev.next = node
self.size += 1
def contains(self, e):
"""查找链表中是否有元素e"""
cur = self.dummy_head.next
while cur is not None:
if cur.e == e:
return True
cur = cur.next
return False
def remove_element(self, e):
"""从链表中删除元素e"""
prev = self.dummy_head
while prev.next is not None:
if prev.next.e == e:
del_node = prev.next
prev.next = del_node.next
self.size -= 1
return
prev = prev.next
2️⃣ 实现LinkedListSet类
class LinkedListSet(Set):
"""基于链表的集合实现"""
def __init__(self):
"""初始化"""
self.linked_list = LinkedList()
def add(self, e):
"""向集合中添加元素(去重添加)"""
# 实现去重添加元素e
if not self.linked_list.contains(e):
self.linked_list.add_first(e)
def remove(self, e):
"""从集合中删除元素"""
self.linked_list.remove_element(e)
def contains(self, e):
"""查看集合中是否包含元素e"""
return self.linked_list.contains(e)
def get_size(self):
"""获取集合中元素的个数"""
return self.linked_list.get_size()
def is_empty(self):
"""查看集合是否为空"""
return self.linked_list.is_empty()
五、测试
def test_linkedlist_set():
print("Pride and Prejudice")
words1 = FileOperation.read_file("pride-and-prejudice.txt")
if words1:
print(f"Total words: {len(words1)}")
set1 = LinkedListSet()
for word in words1:
set1.add(word)
print(f"Total different words: {set1.get_size()}")
print()
print("A Tale of Two Cities")
words2 = FileOperation.read_file("a-tale-of-two-cities.txt")
if words2:
print(f"Total words: {len(words2)}")
set2 = LinkedListSet()
for word in words2:
set2.add(word)
print(f"Total different words: {set2.get_size()}")
if __name__ == "__main__":
test_linkedlist_set()
六、两种实现方式的性能对比
1️⃣ 测试类
import time
def test_set(set_impl, filename):
"""测试集合性能"""
start_time = time.time()
print(filename)
words = FileOperation.read_file(filename)
if words:
print(f"Total words: {len(words)}")
for word in words:
set_impl.add(word)
print(f"Total different words: {set_impl.get_size()}")
end_time = time.time()
return end_time - start_time
def performance_comparison():
filename = "pride-and-prejudice.txt"
# 测试BST Set
bst_set = BSTSet()
time1 = test_set(bst_set, filename)
print(f"BST Set: {time1:.6f} s")
print()
# 测试Linked List Set
linkedlist_set = LinkedListSet()
time2 = test_set(linkedlist_set, filename)
print(f"Linked List Set: {time2:.6f} s")
if __name__ == "__main__":
performance_comparison()
时间复杂度分析
BST Set (二分搜索树实现)
- 添加操作:平均 O(log n),最坏 O(n)
- 查询操作:平均 O(log n),最坏 O(n)
- 删除操作:平均 O(log n),最坏 O(n)
Linked List Set (链表实现)
- 添加操作:O(n) (需要先查询是否存在)
- 查询操作:O(n)
- 删除操作:O(n)
从时间复杂度可以看出,BST Set在平均情况下性能更好,特别是在处理大量数据时优势明显。