当前位置:首页>文章>数据结构>Python集合Set实现详解:二分搜索树vs链表性能对比 | 数据结构教程

Python集合Set实现详解:二分搜索树vs链表性能对比 | 数据结构教程

一、什么是集合和映射?

集合是高级数据结构的一种,它们的底层可以用多种方式来实现,就像之前的栈和队列一样。集合就像一个容器一样,我们这里实现的集合里边存储的数据是不能重复的,这样的数据结构可以帮助我们更加快速的进行统计。讲到这里是不是发现和我们之前的二分搜索树非常相似,其实这一次我们就会使用二分搜索树作为集合的底层实现。

二、二分搜索树实现集合Set

1️⃣ 实现集合Set接口

from abc import ABC, abstractmethod

class Set(ABC):
    """集合抽象类"""

    @abstractmethod
    def add(self, e):
        """
        向集合中添加元素(实现去重的操作)
        """
        pass

    @abstractmethod
    def remove(self, e):
        """
        从集合中删除元素
        """
        pass

    @abstractmethod
    def contains(self, e):
        """
        查询集合中是否有相同元素
        """
        pass

    @abstractmethod
    def get_size(self):
        """
        获取集合中元素的个数
        """
        pass

    @abstractmethod
    def is_empty(self):
        """
        查看集合是否为空
        """
        pass

2️⃣ 实现二分搜索树BST类

class TreeNode:
    """二分搜索树节点"""
    def __init__(self, e):
        self.e = e
        self.left = None
        self.right = None

class BST:
    """二分搜索树"""

    def __init__(self):
        self.root = None
        self.size = 0

    def get_size(self):
        return self.size

    def is_empty(self):
        return self.size == 0

    def add(self, e):
        """向二分搜索树中添加元素e"""
        self.root = self._add(self.root, e)

    def _add(self, node, e):
        """向以node为根的二分搜索树中插入元素e"""
        if node is None:
            self.size += 1
            return TreeNode(e)

        if e < node.e:
            node.left = self._add(node.left, e)
        elif e > node.e:
            node.right = self._add(node.right, e)

        return node

    def contains(self, e):
        """查看二分搜索树中是否包含元素e"""
        return self._contains(self.root, e)

    def _contains(self, node, e):
        """查看以node为根的二分搜索树中是否包含元素e"""
        if node is None:
            return False

        if e == node.e:
            return True
        elif e < node.e:
            return self._contains(node.left, e)
        else:
            return self._contains(node.right, e)

    def remove(self, e):
        """从二分搜索树中删除元素e"""
        self.root = self._remove(self.root, e)

    def _remove(self, node, e):
        """删除以node为根的二分搜索树中的元素e"""
        if node is None:
            return None

        if e < node.e:
            node.left = self._remove(node.left, e)
            return node
        elif e > node.e:
            node.right = self._remove(node.right, e)
            return node
        else:  # e == node.e
            if node.left is None:
                right_node = node.right
                self.size -= 1
                return right_node

            if node.right is None:
                left_node = node.left
                self.size -= 1
                return left_node

            # 待删除节点左右子树均不为空的情况
            successor = self._minimum(node.right)
            successor.right = self._remove_min(node.right)
            successor.left = node.left
            return successor

    def _minimum(self, node):
        """返回以node为根的二分搜索树的最小值所在的节点"""
        if node.left is None:
            return node
        return self._minimum(node.left)

    def _remove_min(self, node):
        """删除掉以node为根的二分搜索树中的最小节点"""
        if node.left is None:
            right_node = node.right
            self.size -= 1
            return right_node

        node.left = self._remove_min(node.left)
        return node

3️⃣ 实现具体的BSTSet类

class BSTSet(Set):
    """基于二分搜索树的集合实现"""

    def __init__(self):
        """初始化BST"""
        self.bst = BST()

    def add(self, e):
        """向集合中添加元素(实现去重添加)"""
        self.bst.add(e)

    def remove(self, e):
        """从集合中删除一个元素"""
        self.bst.remove(e)

    def contains(self, e):
        """查看集合中是否包含元素e"""
        return self.bst.contains(e)

    def get_size(self):
        """获取集合中元素的个数"""
        return self.bst.get_size()

    def is_empty(self):
        """查看集合是否为空"""
        return self.bst.is_empty()

其实不难发现,在使用二分搜索树实现集合的时候,我们没有做多余的操作,因为我们自己实现的二分搜索树已经完全包含了集合的所有特性;

三、测试

1️⃣ 编写测试工具类

import re
import os

class FileOperation:
    """文件操作工具类"""

    @staticmethod
    def read_file(filename):
        """
        读取文件名称为filename中的内容,并将其中包含的所有词语返回
        """
        words = []

        if not filename:
            print("filename is None")
            return words

        try:
            # 检查文件是否存在
            if not os.path.exists(filename):
                print(f"File {filename} does not exist")
                return words

            # 读取文件内容
            with open(filename, 'r', encoding='utf-8') as file:
                content = file.read()

            # 简单分词 - 提取所有字母组成的单词
            words = re.findall(r'[a-zA-Z]+', content.lower())

        except IOError:
            print(f"Cannot open {filename}")

        return words

2️⃣ 编写测试程序

def main():
    print("Pride and Prejudice")

    words1 = FileOperation.read_file("pride-and-prejudice.txt")
    if words1:
        print(f"Total words: {len(words1)}")

        set1 = BSTSet()
        for word in words1:
            set1.add(word)
        print(f"Total different words: {set1.get_size()}")

    print()

    print("A Tale of Two Cities")

    words2 = FileOperation.read_file("a-tale-of-two-cities.txt")
    if words2:
        print(f"Total words: {len(words2)}")

        set2 = BSTSet()
        for word in words2:
            set2.add(word)
        print(f"Total different words: {set2.get_size()}")

if __name__ == "__main__":
    main()

四、链表实现集合Set

1️⃣ 实现链表类

class ListNode:
    """链表节点"""
    def __init__(self, e=None):
        self.e = e
        self.next = None

class LinkedList:
    """链表"""

    def __init__(self):
        self.dummy_head = ListNode()  # 虚拟头节点
        self.size = 0

    def get_size(self):
        return self.size

    def is_empty(self):
        return self.size == 0

    def add_first(self, e):
        """在链表头添加元素"""
        self.add(0, e)

    def add(self, index, e):
        """在链表的index位置添加新的元素e"""
        if index < 0 or index > self.size:
            raise ValueError("Add failed. Illegal index.")

        prev = self.dummy_head
        for i in range(index):
            prev = prev.next

        node = ListNode(e)
        node.next = prev.next
        prev.next = node
        self.size += 1

    def contains(self, e):
        """查找链表中是否有元素e"""
        cur = self.dummy_head.next
        while cur is not None:
            if cur.e == e:
                return True
            cur = cur.next
        return False

    def remove_element(self, e):
        """从链表中删除元素e"""
        prev = self.dummy_head
        while prev.next is not None:
            if prev.next.e == e:
                del_node = prev.next
                prev.next = del_node.next
                self.size -= 1
                return
            prev = prev.next

2️⃣ 实现LinkedListSet类

class LinkedListSet(Set):
    """基于链表的集合实现"""

    def __init__(self):
        """初始化"""
        self.linked_list = LinkedList()

    def add(self, e):
        """向集合中添加元素(去重添加)"""
        # 实现去重添加元素e
        if not self.linked_list.contains(e):
            self.linked_list.add_first(e)

    def remove(self, e):
        """从集合中删除元素"""
        self.linked_list.remove_element(e)

    def contains(self, e):
        """查看集合中是否包含元素e"""
        return self.linked_list.contains(e)

    def get_size(self):
        """获取集合中元素的个数"""
        return self.linked_list.get_size()

    def is_empty(self):
        """查看集合是否为空"""
        return self.linked_list.is_empty()

五、测试

def test_linkedlist_set():
    print("Pride and Prejudice")

    words1 = FileOperation.read_file("pride-and-prejudice.txt")
    if words1:
        print(f"Total words: {len(words1)}")

        set1 = LinkedListSet()
        for word in words1:
            set1.add(word)
        print(f"Total different words: {set1.get_size()}")

    print()

    print("A Tale of Two Cities")

    words2 = FileOperation.read_file("a-tale-of-two-cities.txt")
    if words2:
        print(f"Total words: {len(words2)}")

        set2 = LinkedListSet()
        for word in words2:
            set2.add(word)
        print(f"Total different words: {set2.get_size()}")

if __name__ == "__main__":
    test_linkedlist_set()

六、两种实现方式的性能对比

1️⃣ 测试类

import time

def test_set(set_impl, filename):
    """测试集合性能"""
    start_time = time.time()

    print(filename)
    words = FileOperation.read_file(filename)
    if words:
        print(f"Total words: {len(words)}")

        for word in words:
            set_impl.add(word)
        print(f"Total different words: {set_impl.get_size()}")

    end_time = time.time()
    return end_time - start_time

def performance_comparison():
    filename = "pride-and-prejudice.txt"

    # 测试BST Set
    bst_set = BSTSet()
    time1 = test_set(bst_set, filename)
    print(f"BST Set: {time1:.6f} s")

    print()

    # 测试Linked List Set
    linkedlist_set = LinkedListSet()
    time2 = test_set(linkedlist_set, filename)
    print(f"Linked List Set: {time2:.6f} s")

if __name__ == "__main__":
    performance_comparison()

时间复杂度分析

BST Set (二分搜索树实现)

  • 添加操作:平均 O(log n),最坏 O(n)
  • 查询操作:平均 O(log n),最坏 O(n)
  • 删除操作:平均 O(log n),最坏 O(n)

Linked List Set (链表实现)

  • 添加操作:O(n) (需要先查询是否存在)
  • 查询操作:O(n)
  • 删除操作:O(n)

从时间复杂度可以看出,BST Set在平均情况下性能更好,特别是在处理大量数据时优势明显。

数据结构

二分搜索树详解|定义、原理、代码实现与操作解析

2025-8-15 10:26:46

数据结构

映射(Map)数据结构详解:链表与二分搜索树实现(含 Python 代码与性能对比)

2025-8-17 16:21:25

搜索