哈希表实现与解析：从理论到实践

一、哈希表概念与特性

哈希表（Hash Table）是一种高效的数据结构，它通过键（Key）直接访问在内存存储位置的数据结构。它通过一个哈希函数将键映射到表中的某个位置，从而实现 O (1) 平均时间复杂度的插入、删除和查找操作。

1.1 核心概念

哈希函数：将键映射到哈希表中索引的函数，理想情况下每个键都能映射到唯一索引
哈希冲突：不同的键通过哈希函数得到相同索引的现象
负载因子：哈希表中存储的元素个数与哈希表大小的比值，通常用 α 表示
哈希桶：哈希表中的每个位置，当发生冲突时，通常通过链表或红黑树等数据结构存储多个元素

1.2 哈希表特性

高效性：平均情况下，插入、删除和查找操作的时间复杂度为 O (1)
空间换时间：通过额外的存储空间来换取高效的操作速度
无序性：哈希表中的元素是无序存储的，与插入顺序无关
冲突处理：需要专门的机制来处理哈希冲突，常见的有链地址法和开放地址法

二、哈希冲突解决：链地址法

本文实现的哈希表采用链地址法（Separate Chaining）来解决哈希冲突，这是一种非常常用的哈希冲突解决方法。

2.1 链地址法原理

链地址法的基本思想是：将哈希表中的每个桶（Bucket）作为一个链表的头节点，当发生哈希冲突时，将冲突的元素插入到对应桶的链表中。

2.2 链地址法优缺点

优点：

处理冲突简单，且无堆积现象
链表节点可以动态申请，适合无法确定表长的情况
删除节点操作易于实现

缺点：

存储指针需要额外空间
当链表较长时，查询效率会下降到 O (n)

三、哈希表完整实现代码

3.1 哈希表模板类 (Hash_Tables_template.h)

该文件实现了哈希表的核心功能，采用链地址法处理哈希冲突，以下是完整代码：

复制代码

#pragma once
#include <utility>  // 用于 std::pair
#include <vector>
#include <algorithm>
#include <string>
using namespace std;

// 链地址法(哈希桶)
namespace hash_bucket {
    template<class T>
    struct HashData {
        T _data;
        HashData<T>* _next;  // 下一个结点(单链表)
        HashData(const T& data)
            :_data(data)
            ,_next(nullptr)
        {
        }
    };

    // 前置声明
    template<class K, class T, class Hash, class KeyOfValue>
    class HashTable; 

    // 迭代器
    template<class K, class T, class Ptr, class Ref, class KeyOfValue, class Hash>
    struct HashIterator {
        typedef HashData<T> Node;
        typedef HashIterator<K, T, Ptr, Ref, KeyOfValue, Hash> Self;

        Node* _node;
        const HashTable<K, T, Hash, KeyOfValue>* _pht;  // 传进来的表对象

        HashIterator(Node* node, const HashTable<K, T, Hash, KeyOfValue>* pht)
            :_node(node)
            ,_pht(pht)
        {}

        Ref operator*() {
            return _node->_data;
        }

        Ptr operator->()
        {
            return &_node->_data;
        }

        bool operator!=(const Self& s)
        {
            return _node != s._node;
        }

        bool operator==(const Self& s) {
            return !(*this != s);
        }

        Self& operator++() {
            if (_node->_next) {   // 当前桶还有元素就继续往下遍历
                _node = _node->_next;
            }
            else {   // 找不为空的桶
                Hash hash;
                KeyOfValue kof;
                size_t hashi = hash(kof(_node->_data)) % _pht->_tables.size();  // 计算进来的值相对映射的位置
                hashi++; // 从下一个桶开始寻找
                while (hashi < _pht->_tables.size()) {  // 哈希表没走完就继续找
                    // 找到不为空的桶就跳出循环
                    if (_pht->_tables[hashi]) {
                        break;
                    }
                    hashi++;
                }
                if (hashi == _pht->_tables.size())  // 相等说明哈希表已经走完
                {
                    _node = nullptr;  // end
                }
                else {
                    _node = _pht->_tables[hashi];
                }
            }
            return *this;
        }
    };

    // 哈希表核心实现
    template<class K, class T, class Hash, class KeyOfValue>
    class HashTable {
        // 友元类
        template<class K, class T, class Ptr, class Ref, class KeyOfValue, class Hash>
        friend struct HashIterator;

        typedef HashData<T> Node;
    public:
        // 迭代器
        typedef HashIterator<const K, T, T*, T&, KeyOfValue, Hash>  Iterator;
        typedef HashIterator<const K, T, const T*, const T&, KeyOfValue, Hash>  ConstIterator;

        Iterator Begin() {
            if (_n == 0) return End();
            for (size_t i = 0; i < _tables.size(); i++) {  // 找到第一个不为空的表
                if (_tables[i]) {
                    return Iterator(_tables[i], this);
                }
            }
            return End();
        }

        Iterator End() {
            return Iterator(nullptr, this);
        }

        ConstIterator Begin() const {
            if (_n == 0) return End();
            for (size_t i = 0; i < _tables.size(); i++) {  // 找到第一个不为空的表
                if (_tables[i]) {
                    return ConstIterator(_tables[i], this);
                }
            }
            return End();
        }

        ConstIterator End() const {
            return ConstIterator(nullptr, this);
        }

        // 扩容时，确保容量总为距离2的幂次方较远的素数
        inline unsigned long __stl_next_prime(unsigned long n)
        {
            // Note: assumes long is at least 32 bits.
            static const int __stl_num_primes = 28;
            static const unsigned long __stl_prime_list[__stl_num_primes] =
            {
                53,         97,         193,       389,       769,
                1543,       3079,       6151,      12289,     24593,
                49157,      98317,      196613,    393241,    786433,
                1572869,    3145739,    6291469,   12582917,  25165843,
                50331653,   100663319,  201326611, 402653189, 805306457,
                1610612741, 3221225473, 4294967291
            };

            const unsigned long* first = __stl_prime_list;
            const unsigned long* last = __stl_prime_list + __stl_num_primes;
            const unsigned long* pos = lower_bound(first, last, n);
            return pos == last ? *(last - 1) : *pos;
        }

        HashTable()
        {
            _tables.resize(__stl_next_prime(0));
            _n = 0;
        }

        ~HashTable() {
            for (size_t i = 0; i < _tables.size(); i++) {
                Node* cur = _tables[i];
                while (cur) {
                    Node* next = cur->_next;  // 保存下一个待删除的节点
                    delete cur;
                    cur = next;
                }
                _tables[i] = nullptr;
            }
            _n = 0;
        }

        pair<Iterator,bool> Insert(const T& data) {
            KeyOfValue kof;
            Iterator it = Find(kof(data));
            if (it != End()) {
                return make_pair(it, false);  // 已经存在就直接返回
            }
            Hash hash;
            // 负载因子等于1时扩容
            if (_n == _tables.size()) {
                vector<Node*> newtables;
                newtables.resize(__stl_next_prime(static_cast<unsigned long>(_tables.size() + 1)), nullptr);  // 扩容
                // 把结点挂到新表上
                for (size_t i = 0; i < _tables.size(); i++) {
                    Node* cur = _tables[i];
                    while (cur) {
                        Node* next = cur->_next;  // 保存当前结点的下一个结点
                        size_t hashi = hash(kof(cur->_data)) % newtables.size();  // 计算进来的值相对映射的位置
                        cur->_next = newtables[hashi];
                        newtables[hashi] = cur;
                        cur = next;
                    }
                    _tables[i] = nullptr;
                }
                _tables.swap(newtables);
            }

            // 头插
            size_t hashi = hash(kof(data)) % _tables.size();  // 计算进来的值相对映射的位置
            Node* newnode = new Node(data);  // 创建新结点
            newnode->_next = _tables[hashi];  // 链接原来结点
            _tables[hashi] = newnode;  // 替换原来结点(头插)
            _n++;
            return  make_pair(Iterator(newnode, this), true);
        }

        Iterator Find(const K& key) {
            Hash hash;
            KeyOfValue kof;
            size_t hashi = hash(key) % _tables.size();  // 计算进来的值相对映射的位置
            Node* cur = _tables[hashi];
            while (cur) {
                if (kof(cur->_data) == key) {
                    return Iterator(cur, this);
                }
                cur = cur->_next;
            }
            return End();
        }

        bool Erase(const K& key) {
            Hash hash;
            KeyOfValue kof;
            size_t hashi = hash(key) % _tables.size();  // 计算进来的值相对映射的位置
            Node* cur = _tables[hashi];
            Node* prev = nullptr;
            while (cur) {
                if (kof(cur->_data) == key) {
                    if (prev == nullptr) {  // 头结点的情况
                        _tables[hashi] = cur->_next;
                    }
                    else {   // 中间节点或尾结点的情况
                        prev->_next = cur->_next;
                    }
                    delete cur;
                    _n--;
                    return true;
                }
                prev = cur;
                cur = cur->_next;
            }
            return false;  // 找不到
        }

    private:
        vector<Node*> _tables;  // 哈希表
        size_t _n = 0;  // 有效数据个数
    };
}

代码解析

HashData 结构体：哈希表的节点结构，包含数据域和指向下一个节点的指针，用于构建单链表
HashIterator 迭代器 ：
- 实现了哈希表的遍历功能，支持解引用、箭头访问、不等于 / 等于比较、自增操作
- 自增操作是核心难点：当前桶遍历完后，自动寻找下一个非空桶
HashTable 核心类 ：
- 素数容量设计：使用素数作为哈希表容量，减少哈希冲突
- 扩容机制：负载因子达到 1 时触发扩容，重新计算所有元素的哈希值
- 插入操作：头插法插入新元素，保证 O (1) 插入效率
- 查找操作：先哈希定位桶，再遍历链表查找
- 删除操作：处理头节点、中间节点、尾节点三种情况

3.2 哈希函数实现 (HashFun)

哈希函数是哈希表的关键组成部分，以下是完整的哈希函数实现：

复制代码

// 哈希函数模板
template<class K>
struct HashFun {
    size_t operator()(const K& key) const {
        return (size_t)key;
    }
};

// string类型特化版本
template<>
struct HashFun<string> {
    size_t operator()(const string& key) const {
        size_t hash = 0;
        for (auto e : key) {
            hash *= 131;  // 131是经验值，能提供较好的哈希分布
            hash += e;
        }
        return hash;
    }
};

代码解析

通用版本：对于整数类型，直接将其转换为 size_t 类型作为哈希值
string 特化版本 ：
- 采用多项式哈希算法：hash = hash * 131 + char_code
- 131 是经验值，能有效减少字符串哈希冲突
- 能区分不同顺序的相同字符组成的字符串（如 "abc" 和 "cba"）

3.3 unordered_map 完整实现 (My_unordered_map.h)

复制代码

#pragma once
#include "Hash_Tables_template.h"

// 哈希函数模板
template<class K>
struct HashFun {
    size_t operator()(const K& key) const {
        return (size_t)key;
    }
};

// string类型特化
template<>
struct HashFun<string> {
    size_t operator()(const string& key) const {
        size_t hash = 0;
        for (auto e : key) {
            hash *= 131;  // 让不同顺序但相同的字母分布在其他地方，防止堆积
            hash += e;
        }
        return hash;
    }
};

namespace yzq {
    template<class K, class V, class Hash = HashFun<K>>
    class unordered_map {
    public:
        struct MapKeyOfValue
        {
            const K& operator()(const pair<K, V>& kv)
            {
                return kv.first;
            }
        };
        typedef typename hash_bucket::HashTable<const K, pair<K, V>, Hash, MapKeyOfValue>::Iterator iterator;
        typedef typename hash_bucket::HashTable<const K, pair<K, V>, Hash, MapKeyOfValue>::ConstIterator const_iterator;
        
        iterator begin() {
            return _ht.Begin();
        }

        iterator end() {
            return _ht.End();
        }

        const_iterator begin() const
        {
            return _ht.Begin();
        }

        const_iterator end() const
        {
            return _ht.End();
        }

        pair<iterator, bool> insert(const pair<K, V>& kv)
        {
            return _ht.Insert(kv);
        }

        V& operator[](const K& key) {
            pair<iterator, bool> ret = _ht.Insert(make_pair(key, V()));
            return ret.first->second;
        }

        bool erase(const K& key) {
            return _ht.Erase(key);
        }

        iterator find(const K& key) {
            return _ht.Find(key);
        }

        const_iterator find(const K& key) const {
            return _ht.Find(key);
        }

    private:
        hash_bucket::HashTable<const K, pair<K, V>, Hash, MapKeyOfValue> _ht;
    };
}

代码解析

MapKeyOfValue 结构体：从键值对中提取键，供哈希表使用
迭代器类型定义：复用哈希表的迭代器
核心接口 ：
- begin ()/end ()：提供迭代器访问
- insert ()：插入键值对，返回迭代器和插入是否成功
- operator []：便捷访问，不存在则插入默认值
- erase ()/find ()：删除和查找操作

3.4 unordered_set 完整实现 (My_unordered_set.h)

复制代码

#pragma once
#include "Hash_Tables_template.h"

// 哈希函数模板
template<class K>
struct HashFun {
    size_t operator()(const K& key) const {
        return (size_t)key;
    }
};

// string类型特化
template<>
struct HashFun<string> {
    size_t operator()(const string& key) const {
        size_t hash = 0;
        for (auto e : key) {
            hash *= 131;  // 让不同顺序但相同的字母分布在其他地方，防止堆积
            hash += e;
        }
        return hash;
    }
};

namespace yzq {
    template<class K, class Hash = HashFun<K>>
    class unordered_set {
    public:
        struct SetKeyOfValue
        {
            const K& operator()(const K& key)
            {
                return key;
            }
        };
        typedef typename hash_bucket::HashTable<const K, const K, Hash, SetKeyOfValue>::Iterator iterator;
        typedef typename hash_bucket::HashTable<const K, const K, Hash, SetKeyOfValue>::ConstIterator const_iterator;

        iterator begin() {
            return _ht.Begin();
        }

        iterator end() {
            return _ht.End();
        }

        const_iterator begin() const
        {
            return _ht.Begin();
        }

        const_iterator end() const
        {
            return _ht.End();
        }

        pair<iterator, bool> insert(const K& key)
        {
            return _ht.Insert(key);
        }

        bool erase(const K& key) {
            return _ht.Erase(key);
        }

        iterator find(const K& key) {
            return _ht.Find(key);
        }

        const_iterator find(const K& key) const {
            return _ht.Find(key);
        }

    private:
        hash_bucket::HashTable<const K, const K, Hash, SetKeyOfValue> _ht;
    };
}

代码解析

SetKeyOfValue 结构体：直接返回元素本身作为键
模板参数：只需要键类型，不需要值类型
元素特性：存储 const K 类型，保证元素不可修改（避免哈希值变化）
接口设计：与 unordered_map 类似，但操作的是单个元素而非键值对

四、完整测试代码

4.1 unordered_set 测试代码

复制代码

#include <iostream>
#include <cassert>
#include "My_unordered_set.h"

using namespace std;
using namespace yzq;

// 基础功能测试
void test_set_basic() {
    cout << "=== Testing unordered_set basic operations ===" << endl;

    unordered_set<int> s;

    // 测试插入
    assert(s.insert(1).second == true);
    assert(s.insert(2).second == true);
    assert(s.insert(3).second == true);
    assert(s.insert(1).second == false); // 重复插入

    // 测试范围for
    cout << "Set elements: ";
    for (auto e : s) {
        cout << e << " ";
    }
    cout << endl;

    // 测试迭代器
    unordered_set<int>::iterator it = s.begin();
    int sum = 0;
    while (it != s.end()) {
        sum += *it;
        ++it;
    }
    cout << "Sum of elements: " << sum << endl;
    assert(sum == 6);

    // 测试查找
    assert(s.find(2) != s.end());
    assert(s.find(4) == s.end());

    // 测试删除
    assert(s.erase(2) == true);
    assert(s.erase(4) == false);

    // 验证删除结果
    sum = 0;
    for (auto e : s) {
        sum += e;
    }
    cout << "Sum after erase 2: " << sum << endl;
    assert(sum == 4);

    cout << "Set basic tests passed!" << endl << endl;
}

// 字符串元素测试
void test_set_string() {
    cout << "=== Testing unordered_set with string ===" << endl;

    unordered_set<string> s;

    s.insert("apple");
    s.insert("banana");
    s.insert("cherry");
    s.insert("apple"); // 重复

    // 遍历
    cout << "String set elements: ";
    for (const auto& str : s) {
        cout << str << " ";
    }
    cout << endl;

    // 查找
    assert(s.find("banana") != s.end());
    assert(s.find("date") == s.end());

    // 删除
    s.erase("cherry");
    assert(s.find("cherry") == s.end());

    cout << "Set string tests passed!" << endl << endl;
}

// 自定义类型测试
struct Person {
    std::string name;
    int age;

    Person() : name(""), age(0) {}
    Person(const std::string& n, int a) : name(n), age(a) {}

    bool operator==(const Person& other) const {
        return name == other.name && age == other.age;
    }
};

// 自定义哈希函数
struct PersonHash {
    size_t operator()(const Person& p) const {
        size_t nameHash = 0;
        for (char c : p.name) {
            nameHash = nameHash * 131 + c;
        }
        return nameHash ^ (p.age * 2654435761U);
    }
};

void test_set_custom_type() {
    cout << "=== Testing unordered_set with custom type ===" << endl;

    unordered_set<Person, PersonHash> s;

    Person p1("Alice", 25);
    Person p2("Bob", 30);
    Person p3("Charlie", 35);

    s.insert(p1);
    s.insert(p2);
    s.insert(p3);
    s.insert(p1); // 重复

    // 遍历
    cout << "Custom type set elements: " << endl;
    for (const auto& p : s) {
        cout << "Name: " << p.name << ", Age: " << p.age << endl;
    }

    // 查找
    assert(s.find(p2) != s.end());
    
    Person p4("David", 40);
    assert(s.find(p4) == s.end());

    // 删除
    s.erase(p3);
    assert(s.find(p3) == s.end());

    cout << "Set custom type tests passed!" << endl << endl;
}

// 边界情况测试
void test_set_edge_cases() {
    cout << "=== Testing unordered_set edge cases ===" << endl;

    // 空集合测试
    unordered_set<int> s;
    assert(s.begin() == s.end());
    assert(s.erase(1) == false);
    assert(s.find(1) == s.end());

    // 大量元素测试（触发扩容）
    for (int i = 0; i < 100; ++i) {
        s.insert(i);
    }

    // 验证所有元素都存在
    for (int i = 0; i < 100; ++i) {
        assert(s.find(i) != s.end());
    }

    // 删除所有元素
    for (int i = 0; i < 100; ++i) {
        assert(s.erase(i) == true);
    }

    // 验证为空
    assert(s.begin() == s.end());

    cout << "Set edge cases tests passed!" << endl << endl;
}

int main() {
    test_set_basic();
    test_set_string();
    test_set_custom_type();
    test_set_edge_cases();

    cout << "All unordered_set tests passed successfully!" << endl;

    return 0;
}

4.2 unordered_map 测试代码

复制代码

#include <iostream>
#include <cassert>
#include "My_unordered_map.h"

using namespace std;
using namespace yzq;

// 基础功能测试
void test_map_basic() {
    cout << "=== Testing unordered_map basic operations ===" << endl;

    unordered_map<int, string> dict;

    // 测试insert
    assert(dict.insert({ 1, "one" }).second == true);
    assert(dict.insert({ 2, "two" }).second == true);
    assert(dict.insert({ 3, "three" }).second == true);
    assert(dict.insert({ 1, "ONE" }).second == false); // 重复键

    // 测试operator[]
    dict[4] = "four";
    dict[5] = "five";
    dict[2] = "TWO"; // 修改已存在的值

    // 测试访问不存在的键（应创建默认值）
    dict[6]; // 应创建空字符串

    // 遍历并验证
    cout << "Map elements:" << endl;
    int count = 0;
    for (auto& kv : dict) {
        cout << kv.first << " -> " << kv.second << endl;
        count++;
    }

    assert(count == 6);
    assert(dict[1] == "one");
    assert(dict[2] == "TWO");
    assert(dict[6] == "");

    // 测试查找
    auto it = dict.find(3);
    assert(it != dict.end());
    assert(it->second == "three");
    assert(dict.find(7) == dict.end());

    // 测试删除
    assert(dict.erase(4) == true);
    assert(dict.erase(7) == false);
    assert(dict.find(4) == dict.end());

    cout << "Map basic tests passed!" << endl << endl;
}

// 字符串键测试
void test_map_string_key() {
    cout << "=== Testing unordered_map with string key ===" << endl;

    unordered_map<string, int> scoreMap;

    scoreMap["Alice"] = 90;
    scoreMap["Bob"] = 85;
    scoreMap["Charlie"] = 95;

    // 遍历
    cout << "Score map:" << endl;
    for (const auto& pair : scoreMap) {
        cout << pair.first << ": " << pair.second << endl;
    }

    // 验证值
    assert(scoreMap["Alice"] == 90);
    assert(scoreMap["Bob"] == 85);

    // 修改值
    scoreMap["Bob"] = 88;
    assert(scoreMap["Bob"] == 88);

    // 查找
    assert(scoreMap.find("Charlie") != scoreMap.end());
    assert(scoreMap.find("David") == scoreMap.end());

    // 删除
    scoreMap.erase("Charlie");
    assert(scoreMap.find("Charlie") == scoreMap.end());

    cout << "Map string key tests passed!" << endl << endl;
}

// 复杂值类型测试
struct Student {
    string name;
    int age;
    double gpa;

    Student() : name(""), age(0), gpa(0.0) {}
    Student(const string& n, int a, double g) : name(n), age(a), gpa(g) {}
};

void test_map_complex_value() {
    cout << "=== Testing unordered_map with complex value ===" << endl;

    unordered_map<int, Student> studentMap;

    // 插入
    studentMap[101] = Student("Alice", 20, 3.8);
    studentMap[102] = Student("Bob", 21, 3.5);
    studentMap[103] = Student("Charlie", 19, 3.9);

    // 访问和修改
    studentMap[102].gpa = 3.7;

    // 验证
    assert(studentMap[101].name == "Alice");
    assert(studentMap[102].gpa == 3.7);
    assert(studentMap[103].age == 19);

    // 遍历
    cout << "Student map:" << endl;
    for (const auto& pair : studentMap) {
        cout << "ID: " << pair.first 
             << ", Name: " << pair.second.name
             << ", Age: " << pair.second.age
             << ", GPA: " << pair.second.gpa << endl;
    }

    cout << "Map complex value tests passed!" << endl << endl;
}

// 边界情况测试
void test_map_edge_cases() {
    cout << "=== Testing unordered_map edge cases ===" << endl;

    // 空map测试
    unordered_map<int, string> emptyMap;
    assert(emptyMap.begin() == emptyMap.end());
    assert(emptyMap.find(1) == emptyMap.end());
    assert(emptyMap.erase(1) == false);

    // operator[] 插入空值
    emptyMap[1];
    assert(emptyMap.find(1) != emptyMap.end());
    assert(emptyMap[1] == "");

    // 大量元素测试（触发扩容）
    unordered_map<int, int> bigMap;
    for (int i = 0; i < 200; ++i) {
        bigMap[i] = i * 10;
    }

    // 验证所有元素
    for (int i = 0; i < 200; ++i) {
        assert(bigMap[i] == i * 10);
    }

    cout << "Map edge cases tests passed!" << endl << endl;
}

int main() {
    test_map_basic();
    test_map_string_key();
    test_map_complex_value();
    test_map_edge_cases();

    cout << "All unordered_map tests passed successfully!" << endl;

    return 0;
}

五、实现要点总结

5.1 核心设计思路

模板化设计：通过模板实现通用的哈希表，支持任意键值类型
分离关注点 ：
- HashTable：核心哈希表实现
- unordered_map/unordered_set：基于哈希表的具体容器
- HashFun：哈希函数策略
迭代器设计：支持标准的迭代器操作，符合 STL 容器规范

5.2 性能优化点

素数容量：使用素数作为哈希表容量，减少哈希冲突
负载因子控制：负载因子达到 1 时扩容，保证操作效率
头插法：链表头插，插入操作 O (1) 时间复杂度
高效哈希函数：字符串哈希采用 131 多项式算法，分布更均匀

5.3 注意事项

const 正确性：迭代器分为普通迭代器和 const 迭代器
内存管理：析构函数正确释放所有节点内存
异常安全：扩容时先创建新表，再迁移数据，最后交换
哈希冲突：链地址法处理冲突，保证稳定性