【数据结构与算法】第30篇：哈希表（Hash Table）

一、什么是哈希表

1.1 基本思想

哈希表通过哈希函数将关键字映射到数组的某个位置，实现快速访问。

text

复制代码

key → 哈希函数 → 数组下标 → 访问/存储

示例：hash(key) = key % 10

key=25 → 25%10=5 → 存入下标5
key=37 → 37%10=7 → 存入下标7

1.2 哈希冲突

不同的key映射到同一个位置，称为哈希冲突。

text

复制代码

key=25 → 5
key=35 → 5  // 冲突！

解决冲突的两种主要方法：

链地址法：每个位置是一个链表
开放地址法：冲突后找下一个空位

二、哈希函数

2.1 常用哈希函数

方法	公式	适用场景
直接定址法	H(key)=a×key+b	关键字分布连续
除留余数法	H(key)=key % p	最常用，p为质数
平方取中法	取平方后中间几位	关键字无规律
折叠法	分割后求和	关键字位数多

2.2 除留余数法

复制代码

int hash(int key, int size) {
    return key % size;  // size通常取质数
}

为什么size取质数：减少冲突概率，使分布更均匀。

三、链地址法（拉链法）

3.1 原理

每个数组元素是一个链表的头指针，冲突的元素放入同一链表。

text

复制代码

数组: [0] → [1] → [2] → [3] → ...
链表:      ↓
          key1 → key2 → ...

3.2 代码实现

复制代码

#include <stdio.h>
#include <stdlib.h>

#define SIZE 10

// 链表节点
typedef struct Node {
    int key;
    struct Node *next;
} Node;

// 哈希表
typedef struct {
    Node *buckets[SIZE];
} HashTable;

// 初始化
void initHashTable(HashTable *ht) {
    for (int i = 0; i < SIZE; i++) {
        ht->buckets[i] = NULL;
    }
}

// 哈希函数
int hash(int key) {
    return key % SIZE;
}

// 插入
void insert(HashTable *ht, int key) {
    int index = hash(key);
    Node *newNode = (Node*)malloc(sizeof(Node));
    newNode->key = key;
    newNode->next = ht->buckets[index];
    ht->buckets[index] = newNode;
}

// 查找
int search(HashTable *ht, int key) {
    int index = hash(key);
    Node *cur = ht->buckets[index];
    while (cur != NULL) {
        if (cur->key == key) return 1;
        cur = cur->next;
    }
    return 0;
}

// 删除
int delete(HashTable *ht, int key) {
    int index = hash(key);
    Node *cur = ht->buckets[index];
    Node *prev = NULL;
    
    while (cur != NULL) {
        if (cur->key == key) {
            if (prev == NULL) {
                ht->buckets[index] = cur->next;
            } else {
                prev->next = cur->next;
            }
            free(cur);
            return 1;
        }
        prev = cur;
        cur = cur->next;
    }
    return 0;
}

// 打印
void printHashTable(HashTable *ht) {
    for (int i = 0; i < SIZE; i++) {
        printf("[%d]: ", i);
        Node *cur = ht->buckets[i];
        while (cur != NULL) {
            printf("%d -> ", cur->key);
            cur = cur->next;
        }
        printf("NULL\n");
    }
}

int main() {
    HashTable ht;
    initHashTable(&ht);
    
    insert(&ht, 25);
    insert(&ht, 35);
    insert(&ht, 45);
    insert(&ht, 17);
    insert(&ht, 28);
    
    printf("哈希表（链地址法）:\n");
    printHashTable(&ht);
    
    printf("\n查找25: %s\n", search(&ht, 25) ? "找到" : "未找到");
    printf("查找99: %s\n", search(&ht, 99) ? "找到" : "未找到");
    
    delete(&ht, 35);
    printf("\n删除35后:\n");
    printHashTable(&ht);
    
    return 0;
}

运行结果：

text

复制代码

哈希表（链地址法）:
[0]: NULL
[1]: NULL
[2]: NULL
[3]: NULL
[4]: NULL
[5]: 45 -> 35 -> 25 -> NULL
[6]: NULL
[7]: 17 -> NULL
[8]: 28 -> NULL
[9]: NULL

查找25: 找到
查找99: 未找到

删除35后:
[5]: 45 -> 25 -> NULL

四、开放地址法

4.1 线性探测

冲突时，依次检查下一个位置：H = (H(key) + i) % SIZE

缺点：容易产生聚集（一连串被占用的位置）。

复制代码

#define SIZE 10
#define EMPTY -1

typedef struct {
    int data[SIZE];
} OpenHashTable;

void initOpenHash(OpenHashTable *ht) {
    for (int i = 0; i < SIZE; i++) {
        ht->data[i] = EMPTY;
    }
}

int hash(int key) {
    return key % SIZE;
}

// 线性探测插入
void linearInsert(OpenHashTable *ht, int key) {
    int index = hash(key);
    int i = 0;
    while (ht->data[(index + i) % SIZE] != EMPTY && i < SIZE) {
        i++;
    }
    if (i < SIZE) {
        ht->data[(index + i) % SIZE] = key;
    } else {
        printf("哈希表已满\n");
    }
}

// 线性探测查找
int linearSearch(OpenHashTable *ht, int key) {
    int index = hash(key);
    int i = 0;
    while (i < SIZE) {
        int pos = (index + i) % SIZE;
        if (ht->data[pos] == key) return pos;
        if (ht->data[pos] == EMPTY) return -1;  // 空位说明不存在
        i++;
    }
    return -1;
}

4.2 二次探测

解决线性探测的聚集问题：H = (H(key) + i²) % SIZE

复制代码

// 二次探测插入
void quadraticInsert(OpenHashTable *ht, int key) {
    int index = hash(key);
    int i = 0;
    while (ht->data[(index + i * i) % SIZE] != EMPTY && i < SIZE) {
        i++;
    }
    if (i < SIZE) {
        ht->data[(index + i * i) % SIZE] = key;
    } else {
        printf("哈希表已满\n");
    }
}

五、完整哈希表实现（链地址法+动态扩容）

复制代码

#include <stdio.h>
#include <stdlib.h>

#define INIT_SIZE 8
#define LOAD_FACTOR 0.75

typedef struct Node {
    int key;
    struct Node *next;
} Node;

typedef struct {
    Node **buckets;
    int size;       // 桶的数量
    int count;      // 元素个数
} HashTable;

// 哈希函数
int hash(HashTable *ht, int key) {
    return key % ht->size;
}

// 创建哈希表
HashTable* createHashTable() {
    HashTable *ht = (HashTable*)malloc(sizeof(HashTable));
    ht->size = INIT_SIZE;
    ht->count = 0;
    ht->buckets = (Node**)calloc(ht->size, sizeof(Node*));
    return ht;
}

// 插入（不检查扩容）
void insertNoResize(HashTable *ht, int key) {
    int index = hash(ht, key);
    Node *newNode = (Node*)malloc(sizeof(Node));
    newNode->key = key;
    newNode->next = ht->buckets[index];
    ht->buckets[index] = newNode;
    ht->count++;
}

// 查找
int search(HashTable *ht, int key) {
    int index = hash(ht, key);
    Node *cur = ht->buckets[index];
    while (cur != NULL) {
        if (cur->key == key) return 1;
        cur = cur->next;
    }
    return 0;
}

// 获取所有键
void getAllKeys(HashTable *ht, int *keys, int *len) {
    *len = 0;
    for (int i = 0; i < ht->size; i++) {
        Node *cur = ht->buckets[i];
        while (cur != NULL) {
            keys[(*len)++] = cur->key;
            cur = cur->next;
        }
    }
}

// 扩容
void resize(HashTable *ht) {
    int oldSize = ht->size;
    Node **oldBuckets = ht->buckets;
    
    // 创建新哈希表
    ht->size = oldSize * 2;
    ht->count = 0;
    ht->buckets = (Node**)calloc(ht->size, sizeof(Node*));
    
    // 重新插入所有元素
    for (int i = 0; i < oldSize; i++) {
        Node *cur = oldBuckets[i];
        while (cur != NULL) {
            insertNoResize(ht, cur->key);
            Node *temp = cur;
            cur = cur->next;
            free(temp);
        }
    }
    
    free(oldBuckets);
}

// 插入（带扩容）
void insert(HashTable *ht, int key) {
    if ((float)ht->count / ht->size >= LOAD_FACTOR) {
        printf("负载因子 %.2f >= %.2f，扩容到 %d\n", 
               (float)ht->count / ht->size, LOAD_FACTOR, ht->size * 2);
        resize(ht);
    }
    insertNoResize(ht, key);
}

// 打印
void printHashTable(HashTable *ht) {
    printf("哈希表 (size=%d, count=%d, load=%.2f):\n", 
           ht->size, ht->count, (float)ht->count / ht->size);
    for (int i = 0; i < ht->size; i++) {
        printf("[%d]: ", i);
        Node *cur = ht->buckets[i];
        while (cur != NULL) {
            printf("%d -> ", cur->key);
            cur = cur->next;
        }
        printf("NULL\n");
    }
}

// 销毁
void destroyHashTable(HashTable *ht) {
    for (int i = 0; i < ht->size; i++) {
        Node *cur = ht->buckets[i];
        while (cur != NULL) {
            Node *temp = cur;
            cur = cur->next;
            free(temp);
        }
    }
    free(ht->buckets);
    free(ht);
}

int main() {
    HashTable *ht = createHashTable();
    
    printf("=== 插入元素（观察扩容）===\n");
    int values[] = {25, 35, 45, 17, 28, 19, 33, 42, 51, 67, 73, 88};
    for (int i = 0; i < 12; i++) {
        insert(ht, values[i]);
        printf("插入 %d\n", values[i]);
    }
    
    printf("\n");
    printHashTable(ht);
    
    printf("\n=== 查找测试 ===\n");
    printf("查找25: %s\n", search(ht, 25) ? "找到" : "未找到");
    printf("查找99: %s\n", search(ht, 99) ? "找到" : "未找到");
    
    destroyHashTable(ht);
    return 0;
}

运行结果：

text

复制代码

=== 插入元素（观察扩容）===
插入 25
插入 35
插入 45
插入 17
插入 28
插入 19
插入 33
负载因子 0.75 >= 0.75，扩容到 16
插入 42
插入 51
插入 67
插入 73
插入 88

哈希表 (size=16, count=12, load=0.75):
[0]: 64 -> NULL
[1]: 33 -> 17 -> 25 -> NULL
[2]: 42 -> 18 -> NULL
[3]: 35 -> 19 -> 51 -> 67 -> NULL
...

六、两种冲突解决方法的对比

对比项	链地址法	开放地址法
原理	冲突元素用链表存储	找下一个空位
空间利用率	需要额外指针	100%利用
删除操作	简单	复杂（需标记删除）
聚集问题	无	有（线性探测）
性能稳定性	稳定	可能退化
实现复杂度	中等	较低
适用场景	通用	数据量可预估

七、复杂度分析

操作	平均时间复杂度	最坏时间复杂度
插入	O(1)	O(n)
查找	O(1)	O(n)
删除	O(1)	O(n)

最坏情况：所有key映射到同一个位置，退化成链表。

如何保证平均O(1)：

哈希函数分布均匀
负载因子控制在0.75以下
适时扩容

八、小结

这一篇我们学习了哈希表：

要点	说明
哈希函数	除留余数法最常用，size取质数
链地址法	数组+链表，实现简单，性能稳定
开放地址法	线性探测、二次探测，无额外指针
负载因子	扩容阈值，通常0.75
时间复杂度	平均O(1)，最坏O(n)

哈希表的核心：

好的哈希函数让数据分布均匀
合理的负载因子保证性能
冲突解决策略影响实现复杂度

下一篇我们讲排序算法概述与插入排序。

九、思考题

除留余数法中，为什么模数取质数能减少冲突？
链地址法和开放地址法，哪种删除操作更简单？为什么？
如果负载因子超过1，链地址法和开放地址法分别会发生什么？
如何设计一个字符串的哈希函数？

欢迎在评论区讨论你的答案。