一、基本概念
1.1 带权路径长度
在二叉树中:
-
路径长度:从一个节点到另一个节点经过的边数
-
带权路径长度(WPL):所有叶子节点的权重 × 路径长度 之和
示例:
text
叶子节点:A(7), B(5), C(2), D(4)
普通树(四个叶子都在第2层):
        18
       /  \
     12    6
    /  \  / \
   7   5 2   4
WPL = 7×2 + 5×2 + 2×2 + 4×2 = 14 + 10 + 4 + 8 = 36
哈夫曼树:
    18
   /  \
  7    11
      /  \
     5    6
         / \
        2   4
WPL = 7×1 + 5×2 + 2×3 + 4×3 = 7 + 10 + 6 + 12 = 35(更小)
1.2 哈夫曼树的定义
哈夫曼树:带权路径长度最小的二叉树。权值越大的叶子节点离根越近。
1.3 应用场景
-
数据压缩(哈夫曼编码)
-
文件压缩(ZIP、RAR)
-
多媒体编码(JPEG、MP3中的熵编码)
二、哈夫曼树的构造(贪心算法)
2.1 算法步骤
-
将每个权值看作一个只有根节点的二叉树
-
选择两个权值最小的树作为左右子树,构造新树(权值为二者之和)
-
从森林中删除这两棵树,加入新树
-
重复步骤2-3,直到只剩一棵树
示例:权值 [5, 4, 2, 7]
text
步骤1:森林 {5}, {4}, {2}, {7}
步骤2:取2和4 → 新树6,森林 {5}, {6}, {7}
步骤3:取5和6 → 新树11,森林 {7}, {11}
步骤4:取7和11 → 新树18,森林 {18}
2.2 手动构造
text
    18
   /  \
  7    11
      /  \
     5    6
         / \
        2   4
2.3 代码实现
c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_NODES 100

// A single Huffman tree node. Nodes live in one flat array and refer to
// each other by array index rather than by pointer.
typedef struct {
    int weight;  // weight of this node
    int parent;  // index of the parent node (-1 means none)
    int left;    // index of the left child (-1 means none)
    int right;   // index of the right child (-1 means none)
} HuffmanNode;

// Array-backed Huffman tree.
typedef struct {
    HuffmanNode nodes[MAX_NODES * 2];  // storage for all nodes
    int leafNum;                       // number of leaf nodes
    int nodeNum;                       // total number of nodes
} HuffmanTree;
// Initialize a Huffman tree from n leaf weights.
//
// Parameters:
//   tree    - tree to fill in
//   weights - array of n leaf weights; weights[i] becomes the weight of node i
//   n       - number of leaves, expected to be in [1, MAX_NODES]
//
// Slots [0, n) become the leaves; slots [n, 2n-1) are reserved for internal
// nodes and start with weight 0. Every node starts with parent, left and
// right set to -1.
//
// If weights is NULL or n is outside [1, MAX_NODES], the tree is left empty
// (leafNum = nodeNum = 0) instead of indexing outside the nodes array.
// A NULL tree is ignored.
void initHuffmanTree(HuffmanTree *tree, int *weights, int n) {
    if (tree == NULL) {
        return;
    }
    if (weights == NULL || n <= 0 || n > MAX_NODES) {
        tree->leafNum = 0;
        tree->nodeNum = 0;
        return;
    }

    tree->leafNum = n;
    tree->nodeNum = 2 * n - 1;  // a Huffman tree with n leaves has 2n-1 nodes

    for (int i = 0; i < tree->nodeNum; i++) {
        tree->nodes[i].weight = (i < n) ? weights[i] : 0;
        tree->nodes[i].parent = -1;
        tree->nodes[i].left = -1;
        tree->nodes[i].right = -1;
    }
}
// Among nodes with index in [0, range) whose parent is still -1, find the
// two with the smallest weights.
//
//   *s1 receives the index of the lightest such node,
//   *s2 receives the index of the second lightest.
//
// Either output is -1 when not enough parentless nodes exist in the range.
// Comparisons are strict, so on equal weights the node seen first keeps
// its place.
void selectMin(HuffmanTree *tree, int range, int *s1, int *s2) {
    const HuffmanNode *pool = tree->nodes;
    int smallest = -1;
    int second = -1;

    for (int idx = 0; idx < range; ++idx) {
        if (pool[idx].parent == -1) {  // only nodes not yet merged
            int w = pool[idx].weight;
            if (smallest < 0 || w < pool[smallest].weight) {
                second = smallest;  // old minimum becomes the runner-up
                smallest = idx;
            } else if (second < 0 || w < pool[second].weight) {
                second = idx;
            }
        }
    }

    *s1 = smallest;
    *s2 = second;
}
// Build the Huffman tree in place.
// For every internal slot from leafNum up to nodeNum-1, selectMin picks the
// two lightest parentless nodes among the slots before it; the slot takes
// their combined weight, records them as its left/right children, and
// becomes their parent.
void createHuffmanTree(HuffmanTree *tree) {
    const int end = tree->nodeNum;
    int slot = tree->leafNum;

    while (slot < end) {
        int first, second;
        selectMin(tree, slot, &first, &second);

        HuffmanNode *merged = &tree->nodes[slot];
        merged->weight = tree->nodes[first].weight + tree->nodes[second].weight;
        merged->left = first;
        merged->right = second;

        tree->nodes[first].parent = slot;
        tree->nodes[second].parent = slot;
        slot++;
    }
}
// Print the tree as a tab-separated table, one row per node:
// index, weight, parent index, left child index, right child index.
void printHuffmanTree(HuffmanTree *tree) {
    printf("索引\t权值\t父节点\t左孩子\t右孩子\n");

    const HuffmanNode *node = tree->nodes;
    for (int idx = 0; idx < tree->nodeNum; ++idx, ++node) {
        printf("%d\t%d\t%d\t%d\t%d\n",
               idx, node->weight, node->parent, node->left, node->right);
    }
}
三、哈夫曼编码
3.1 编码规则
从根到叶子节点的路径:
-
向左走 → 编码 '0'
-
向右走 → 编码 '1'
示例(以上面的树为例):
text
叶子节点及其路径:
7: 根 → 左 → "0"
5: 根 → 右 → 左 → "10"
2: 根 → 右 → 右 → 左 → "110"
4: 根 → 右 → 右 → 右 → "111"
编码结果:
7: 0
5: 10
2: 110
4: 111
3.2 编码特点
-
前缀编码:没有任何编码是另一个编码的前缀
-
变长编码:出现频率高的字符用短编码
-
唯一可解码:不会产生歧义
3.3 代码实现
c
#define MAX_CODE 100

// Generate the code string of every leaf by walking from the leaf up to the
// root: a step taken from a left child contributes '0', any other step '1'.
//
// Parameters:
//   tree  - a built Huffman tree
//   codes - output array with room for at least tree->leafNum pointers;
//           codes[i] receives a newly malloc'ed NUL-terminated string for
//           leaf i, which the caller must free
//
// codes[i] is set to NULL when allocation fails or when the path to the root
// has more than MAX_CODE - 1 steps and so cannot fit in the scratch buffer.
void getHuffmanCodes(HuffmanTree *tree, char **codes) {
    // Bits are collected root-last, so the buffer is filled from the end
    // backwards and the finished code starts at scratch[start].
    char scratch[MAX_CODE];

    for (int i = 0; i < tree->leafNum; i++) {
        int start = MAX_CODE - 1;
        scratch[start] = '\0';

        int child = i;
        int parent = tree->nodes[child].parent;
        while (parent != -1 && start > 0) {
            scratch[--start] = (tree->nodes[parent].left == child) ? '0' : '1';
            child = parent;
            parent = tree->nodes[child].parent;
        }

        if (parent != -1) {
            // Ran out of buffer before reaching the root.
            codes[i] = NULL;
            continue;
        }

        size_t size = (size_t)(MAX_CODE - start);  // code length plus the NUL
        codes[i] = malloc(size);
        if (codes[i] != NULL) {
            memcpy(codes[i], &scratch[start], size);
        }
    }
}
四、完整代码演示
c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_NODES 100
#define MAX_CODE 100

// A single Huffman tree node. Nodes live in one flat array and refer to
// each other by array index rather than by pointer.
typedef struct {
    int weight;  // weight of this node
    int parent;  // index of the parent node
    int left;    // index of the left child
    int right;   // index of the right child
} HuffmanNode;

// Array-backed Huffman tree.
typedef struct {
    HuffmanNode nodes[MAX_NODES * 2];  // storage for all nodes
    int leafNum;                       // number of leaf nodes
    int nodeNum;                       // total number of nodes
} HuffmanTree;
// Initialize a Huffman tree from n leaf weights.
//
// Parameters:
//   tree    - tree to fill in
//   weights - array of n leaf weights; weights[i] becomes the weight of node i
//   n       - number of leaves, expected to be in [1, MAX_NODES]
//
// Slots [0, n) become the leaves; slots [n, 2n-1) are reserved for internal
// nodes and start with weight 0. Every node starts with parent, left and
// right set to -1.
//
// If weights is NULL or n is outside [1, MAX_NODES], the tree is left empty
// (leafNum = nodeNum = 0) instead of indexing outside the nodes array.
// A NULL tree is ignored.
void initHuffmanTree(HuffmanTree *tree, int *weights, int n) {
    if (tree == NULL) {
        return;
    }
    if (weights == NULL || n <= 0 || n > MAX_NODES) {
        tree->leafNum = 0;
        tree->nodeNum = 0;
        return;
    }

    tree->leafNum = n;
    tree->nodeNum = 2 * n - 1;  // a Huffman tree with n leaves has 2n-1 nodes

    for (int i = 0; i < tree->nodeNum; i++) {
        tree->nodes[i].weight = (i < n) ? weights[i] : 0;
        tree->nodes[i].parent = -1;
        tree->nodes[i].left = -1;
        tree->nodes[i].right = -1;
    }
}
// Among nodes with index in [0, range) whose parent is still -1, find the
// two with the smallest weights.
//
//   *s1 receives the index of the lightest such node,
//   *s2 receives the index of the second lightest.
//
// Either output is -1 when not enough parentless nodes exist in the range.
// Comparisons are strict, so on equal weights the node seen first keeps
// its place.
void selectMin(HuffmanTree *tree, int range, int *s1, int *s2) {
    const HuffmanNode *pool = tree->nodes;
    int smallest = -1;
    int second = -1;

    for (int idx = 0; idx < range; ++idx) {
        if (pool[idx].parent == -1) {  // only nodes not yet merged
            int w = pool[idx].weight;
            if (smallest < 0 || w < pool[smallest].weight) {
                second = smallest;  // old minimum becomes the runner-up
                smallest = idx;
            } else if (second < 0 || w < pool[second].weight) {
                second = idx;
            }
        }
    }

    *s1 = smallest;
    *s2 = second;
}
// Build the Huffman tree in place.
// For every internal slot from leafNum up to nodeNum-1, selectMin picks the
// two lightest parentless nodes among the slots before it; the slot takes
// their combined weight, records them as its left/right children, and
// becomes their parent.
void createHuffmanTree(HuffmanTree *tree) {
    const int end = tree->nodeNum;
    int slot = tree->leafNum;

    while (slot < end) {
        int first, second;
        selectMin(tree, slot, &first, &second);

        HuffmanNode *merged = &tree->nodes[slot];
        merged->weight = tree->nodes[first].weight + tree->nodes[second].weight;
        merged->left = first;
        merged->right = second;

        tree->nodes[first].parent = slot;
        tree->nodes[second].parent = slot;
        slot++;
    }
}
// Generate the code string of every leaf by walking from the leaf up to the
// root: a step taken from a left child contributes '0', any other step '1'.
//
// Parameters:
//   tree  - a built Huffman tree
//   codes - output array with room for at least tree->leafNum pointers;
//           codes[i] receives a newly malloc'ed NUL-terminated string for
//           leaf i, which the caller must free
//
// codes[i] is set to NULL when allocation fails or when the path to the root
// has more than MAX_CODE - 1 steps and so cannot fit in the scratch buffer.
void getHuffmanCodes(HuffmanTree *tree, char **codes) {
    // Bits are collected root-last, so the buffer is filled from the end
    // backwards and the finished code starts at scratch[start].
    char scratch[MAX_CODE];

    for (int i = 0; i < tree->leafNum; i++) {
        int start = MAX_CODE - 1;
        scratch[start] = '\0';

        int child = i;
        int parent = tree->nodes[child].parent;
        while (parent != -1 && start > 0) {
            scratch[--start] = (tree->nodes[parent].left == child) ? '0' : '1';
            child = parent;
            parent = tree->nodes[child].parent;
        }

        if (parent != -1) {
            // Ran out of buffer before reaching the root.
            codes[i] = NULL;
            continue;
        }

        size_t size = (size_t)(MAX_CODE - start);  // code length plus the NUL
        codes[i] = malloc(size);
        if (codes[i] != NULL) {
            memcpy(codes[i], &scratch[start], size);
        }
    }
}
// Print a banner followed by the tree as a tab-separated table, one row per
// node: index, weight, parent index, left child index, right child index.
void printHuffmanTree(HuffmanTree *tree) {
    printf("\n=== 哈夫曼树 ===\n");
    printf("索引\t权值\t父节点\t左孩子\t右孩子\n");

    const HuffmanNode *node = tree->nodes;
    for (int idx = 0; idx < tree->nodeNum; ++idx, ++node) {
        printf("%d\t%d\t%d\t%d\t%d\n",
               idx, node->weight, node->parent, node->left, node->right);
    }
}
// Demo: build a Huffman tree for four characters, print the tree and the
// codes, then compare the encoded size with a fixed 8 bits per character.
int main() {
    // Sample input: characters and how often each occurs
    char chars[] = {'A', 'B', 'C', 'D'};
    int weights[] = {7, 5, 2, 4};
    int n = 4;

    HuffmanTree tree;
    initHuffmanTree(&tree, weights, n);
    createHuffmanTree(&tree);
    printHuffmanTree(&tree);

    char *codes[MAX_NODES];
    getHuffmanCodes(&tree, codes);

    // Print every code and tally both bit counts in the same pass.
    int originalBits = 0;
    int compressedBits = 0;
    printf("\n=== 哈夫曼编码 ===\n");
    for (int k = 0; k < n; k++) {
        printf("%c (权值=%d): %s\n", chars[k], weights[k], codes[k]);
        originalBits += weights[k] * 8;  // baseline: 8 bits per character
        compressedBits += weights[k] * strlen(codes[k]);
    }

    printf("\n原始总位数: %d\n", originalBits);
    printf("压缩后总位数: %d\n", compressedBits);
    printf("压缩率: %.1f%%\n", (1 - (float)compressedBits / originalBits) * 100);

    // The code strings were allocated by getHuffmanCodes; release them here.
    for (int k = 0; k < n; k++) {
        free(codes[k]);
    }
    return 0;
}
运行结果:
text
=== 哈夫曼树 ===
索引 权值 父节点 左孩子 右孩子
0 7 6 -1 -1
1 5 5 -1 -1
2 2 4 -1 -1
3 4 4 -1 -1
4 6 5 2 3
5 11 6 1 4
6 18 -1 0 5
=== 哈夫曼编码 ===
A (权值=7): 0
B (权值=5): 10
C (权值=2): 110
D (权值=4): 111
原始总位数: 144
压缩后总位数: 35
压缩率: 75.7%
五、哈夫曼编码的应用
5.1 数据压缩流程
text
原始数据 → 统计频率 → 构造哈夫曼树 → 生成编码 → 压缩数据
↓
存储编码表 + 编码数据
5.2 解压流程
text
压缩文件 → 读取编码表 → 重建哈夫曼树 → 解码数据 → 原始数据
5.3 实际应用
| 应用 | 说明 |
|---|---|
| ZIP压缩 | 结合LZ77和哈夫曼编码 |
| JPEG | 对DCT系数进行哈夫曼编码 |
| MP3 | 对量化后的频谱数据进行哈夫曼编码 |
| PNG | 使用DEFLATE算法(LZ77+哈夫曼) |
六、复杂度分析
| 操作 | 时间复杂度 | 说明 |
|---|---|---|
| 构造哈夫曼树 | O(n²) | 上文实现每次线性扫描找两个最小值,共合并 n-1 次;可用最小堆优化到O(n log n) |
| 生成编码 | O(n × 树高) | 最坏O(n²),平均O(n log n) |
| 编码数据 | O(m) | m为数据长度 |
| 解码数据 | O(m) | 从根到叶子,每字符走一次 |
七、小结
这一篇我们学习了哈夫曼树和哈夫曼编码:
| 要点 | 说明 |
|---|---|
| 哈夫曼树 | 带权路径长度最小的二叉树 |
| 构造算法 | 贪心:每次取两个最小的合并 |
| 哈夫曼编码 | 左0右1,频率高的用短码 |
| 特性 | 前缀编码,唯一可解码 |
| 应用 | 数据压缩(ZIP、JPEG、MP3) |
核心思想:让出现频率高的字符用最短的编码,从而实现整体压缩。
下一篇我们讲静态查找(顺序查找与折半查找)。
八、思考题
-
哈夫曼树是否唯一?权值相同的两个节点交换位置会怎样?
-
如果有n个叶子节点,哈夫曼树的总节点数是多少?为什么?
-
哈夫曼编码为什么不会产生歧义(即为什么是前缀编码)?
-
尝试用最小堆(优先队列)优化构造哈夫曼树的过程。
欢迎在评论区讨论你的答案。