【操作系统入门】文件系统

【操作系统入门】第九章：文件系统——数据的持久化宇宙

本系列共10篇，这是第9/10篇。在第八章，我们深入探讨了虚拟内存与页面置换。今天，我们将进入操作系统的持久化存储领域——文件系统，探索数据如何在磁盘上被组织、存储和检索。

开篇：从物理磁盘到逻辑文件

想象一个巨大的图书馆：数以百万计的书籍（文件）需要被有序存放，读者（进程）能够快速找到所需书籍，图书管理员（文件系统）负责管理所有空间和借阅记录。文件系统就是计算机世界的图书馆管理系统，将原始的磁盘扇区转换为用户熟悉的文件和目录。

第一部分：文件系统基础概念

1.1 文件系统的核心抽象

文件系统在物理存储之上提供了四个关键抽象：

文件：命名的数据集合
目录：包含文件和其他目录的容器
路径名：层次化定位文件的机制
元数据：描述文件属性的数据

1.2 文件类型与属性

c 复制代码

// File type enumeration.
// Declared ahead of file_control_block_t because the FCB's `type` member
// is of this type and C requires the typedef to be visible at the point of use.
typedef enum {
    FT_REGULAR = 0,     // regular file
    FT_DIRECTORY = 1,   // directory
    FT_SYMLINK = 2,     // symbolic link
    FT_CHAR_DEV = 3,    // character device
    FT_BLOCK_DEV = 4,   // block device
    FT_FIFO = 5,        // named pipe
    FT_SOCKET = 6       // socket
} file_type_t;

// Basic layout of a file control block (FCB): the metadata kept for one file.
typedef struct {
    uint32_t inode_number;          // index node (inode) number
    char filename[256];             // file name
    file_type_t type;               // file type (see file_type_t)
    uint64_t size;                  // file size in bytes
    uint32_t block_count;           // number of disk blocks occupied
    time_t created_time;            // creation time
    time_t modified_time;           // last modification time
    time_t accessed_time;           // last access time
    uint16_t permissions;           // permission bits
    uint32_t link_count;            // hard link count
    uint32_t owner_uid;             // owner user ID
    uint32_t group_gid;             // owning group ID
} file_control_block_t;

1.3 文件操作的系统调用实现

c 复制代码

// Skeleton of how file-related system calls are implemented in the kernel:
// a table of function pointers, one per file operation.
struct file_operations {
    // read: takes the file, a user-space destination buffer, a byte count and a position pointer
    ssize_t (*read)(struct file *, char __user *, size_t, loff_t *);
    // write: same shape as read, but the user-space buffer is const (source data)
    ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
    // open: receives the inode and the file object
    int (*open)(struct inode *, struct file *);
    // release: receives the inode and the file object
    int (*release)(struct inode *, struct file *);
    // mmap: receives the file and a vm_area_struct describing the mapping
    int (*mmap)(struct file *, struct vm_area_struct *);
    // ioctl: receives the file, a command number and an argument word
    long (*ioctl)(struct file *, unsigned int, unsigned long);
};

// open() system call: reserve a descriptor, resolve (or create) the file,
// then bind the two together. Returns the new descriptor, or a negative
// error code from whichever step failed.
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
    // Step 1: reserve a descriptor slot; a negative value is already an error code.
    int fd = get_unused_fd_flags(flags);
    if (fd < 0)
        return fd;

    // Step 2: look up or create the file named by the path.
    struct file *file = do_file_open(root, filename, flags, mode);
    if (!IS_ERR(file)) {
        // Step 3: publish the file object under the reserved descriptor.
        fd_install(fd, file);
        return fd;
    }

    // Lookup failed: hand the reserved descriptor back before reporting the error.
    put_unused_fd(fd);
    return PTR_ERR(file);
}

第二部分：磁盘空间分配策略

2.1 连续分配

早期的文件系统使用连续分配，类似内存管理的连续分配：

c 复制代码

// File control block for contiguous allocation: a file is described by
// where its run of blocks starts and how long that run is.
typedef struct {
    uint32_t start_block;    // starting disk block number
    uint32_t block_count;    // number of consecutive blocks
    uint32_t file_size;      // file size
} contiguous_allocation_t;

// Prints a fixed summary of the advantages and drawbacks of contiguous
// allocation to standard output, one line per point.
void analyze_contiguous_allocation(void)
{
    // The report is static text; keep it in a table and emit it in order.
    static const char *const report_lines[] = {
        "连续分配策略分析:\n",
        "优点:\n",
        "  - 顺序访问性能极佳\n",
        "  - 实现简单，寻道时间少\n",
        "缺点:\n",
        "  - 外部碎片严重\n",
        "  - 文件大小难以动态增长\n",
        "  - 需要预分配空间\n",
    };
    const size_t line_count = sizeof report_lines / sizeof report_lines[0];

    for (size_t line = 0; line < line_count; line++) {
        fputs(report_lines[line], stdout);
    }
}

2.2 链接分配

使用链表连接文件的磁盘块：

c 复制代码

// Disk block layout for linked allocation: each block carries its payload
// plus the number of the block that follows it in the file's chain.
typedef struct disk_block {
    uint32_t block_number;           // number of this block
    uint32_t data_blocks[DATA_PER_BLOCK]; // data area (DATA_PER_BLOCK is defined outside this snippet)
    uint32_t next_block;             // pointer (block number) of the next block
} disk_block_t;

// File Allocation Table (FAT) bookkeeping.
typedef struct {
    uint32_t cluster_count;          // total number of clusters (length of fat_table)
    uint32_t *fat_table;             // FAT entries; unsigned so the 0xFFFFFFxx markers below are representable
    uint32_t free_clusters;          // running count of free clusters
} fat_filesystem_t;

// Special values stored in FAT entries.
#define FAT_FREE       0x00000000u   // free cluster
#define FAT_RESERVED   0x00000001u   // reserved cluster
#define FAT_BAD        0xFFFFFFF7u   // bad cluster
#define FAT_EOF        0xFFFFFFFFu   // end of file (last cluster of a chain)

// Allocates one cluster from the FAT.
//
// Scans entries starting at index 2 (indices 0 and 1 are never handed out),
// claims the first entry equal to FAT_FREE by marking it FAT_EOF, and
// decrements the free-cluster counter.
//
// Returns the index of the claimed cluster, or 0 when `fat` or its table is
// NULL or no free cluster exists. 0 is unambiguous because the scan never
// returns an index below 2.
uint32_t fat_allocate_cluster(fat_filesystem_t *fat)
{
    if (fat == NULL || fat->fat_table == NULL) {
        return 0;
    }

    for (uint32_t i = 2; i < fat->cluster_count; i++) {
        if (fat->fat_table[i] == FAT_FREE) {
            fat->fat_table[i] = FAT_EOF;
            // Guard the counter: if it has drifted out of sync with the
            // table, do not let it wrap around to UINT32_MAX.
            if (fat->free_clusters > 0) {
                fat->free_clusters--;
            }
            return i;
        }
    }
    return 0; // allocation failed
}

2.3 索引分配

为每个文件创建索引块，存储所有数据块指针：

c 复制代码

// Multi-level index node (inode) structure, as used by Unix-like systems.
// Block addresses are held in 12 direct slots plus one pointer each for
// single, double and triple indirection.
typedef struct inode {
    uint16_t mode;                   // file type and permissions
    uint16_t link_count;             // hard link count
    uint32_t uid;                    // owner ID
    uint32_t gid;                    // group ID
    uint64_t size;                   // file size
    uint32_t block_count;            // number of disk blocks in use
    uint32_t direct_blocks[12];      // direct pointers (12 of them)
    uint32_t indirect_block;         // single-indirect pointer
    uint32_t double_indirect_block;  // double-indirect pointer
    uint32_t triple_indirect_block;  // triple-indirect pointer
    time_t atime, mtime, ctime;      // timestamps
} inode_t;

// Computes the largest file size, in bytes, addressable by an inode with
// 12 direct pointers and one single-, double- and triple-indirect pointer,
// where every pointer is a uint32_t block number.
//
// block_size: size of one disk block in bytes.
// Returns the total number of addressable bytes (0 when block_size is 0).
//
// All arithmetic is done in 64 bits: with 4 KiB blocks the double-indirect
// term alone is 1024 * 1024 * 4096 = 2^32, which wraps to 0 in uint32_t.
uint64_t calculate_max_file_size(uint32_t block_size)
{
    const uint64_t bytes_per_block = block_size;
    const uint64_t pointers_per_block = bytes_per_block / sizeof(uint32_t);

    // Count addressable data blocks level by level.
    uint64_t blocks = 12;                                                        // direct
    blocks += pointers_per_block;                                                // single indirect
    blocks += pointers_per_block * pointers_per_block;                           // double indirect
    blocks += pointers_per_block * pointers_per_block * pointers_per_block;      // triple indirect

    return blocks * bytes_per_block;
}

// Reads one entry from an indirect (pointer) block.
// Returns 0 when the pointer block itself is unallocated (0) or cannot be read.
static uint32_t inode_indirect_entry(uint32_t table_block, uint64_t slot)
{
    if (table_block == 0) {
        return 0;
    }
    // NOTE(review): this block uses the one-argument, pointer-returning form
    // of read_block(); ownership/lifetime of the returned buffer is not
    // visible here -- confirm whether it must be released.
    uint32_t *table = read_block(table_block);
    return table != NULL ? table[slot] : 0;
}

// Maps a byte offset within a file to the disk block that holds it.
//
// inode:      the file's inode.
// offset:     byte offset into the file.
// block_size: size of one disk block in bytes.
//
// Returns the disk block number, or 0 when the block is not allocated, the
// offset lies beyond what triple indirection can address, or the arguments
// are unusable (NULL inode, block_size smaller than one pointer).
//
// The logical block index is kept in 64 bits so that large offsets are not
// truncated and the per-level capacities (ppb, ppb^2, ppb^3) cannot wrap.
uint32_t inode_find_block(inode_t *inode, uint64_t offset, uint32_t block_size)
{
    if (inode == NULL || block_size < sizeof(uint32_t)) {
        return 0;
    }

    const uint64_t ppb = block_size / sizeof(uint32_t);  // pointers per block
    uint64_t index = offset / block_size;                // logical block index

    // Direct blocks.
    if (index < 12) {
        return inode->direct_blocks[index];
    }
    index -= 12;

    // Single indirect.
    if (index < ppb) {
        return inode_indirect_entry(inode->indirect_block, index);
    }
    index -= ppb;

    // Double indirect: first level selects a pointer block, second the data block.
    if (index < ppb * ppb) {
        uint32_t level1 = inode_indirect_entry(inode->double_indirect_block, index / ppb);
        return inode_indirect_entry(level1, index % ppb);
    }
    index -= ppb * ppb;

    // Triple indirect: same idea with one more level.
    if (index < ppb * ppb * ppb) {
        uint64_t within_l1 = index % (ppb * ppb);
        uint32_t level1 = inode_indirect_entry(inode->triple_indirect_block, index / (ppb * ppb));
        uint32_t level2 = inode_indirect_entry(level1, within_l1 / ppb);
        return inode_indirect_entry(level2, within_l1 % ppb);
    }

    return 0; // beyond the addressable range: block not allocated
}

第三部分：目录实现与路径解析

3.1 目录条目结构

c 复制代码

// Simple fixed-size directory entry (FAT style).
typedef struct {
    char filename[8];        // file name (8.3 format: the 8-character base name)
    char extension[3];       // extension (the 3-character part)
    uint8_t attributes;      // file attributes
    uint32_t first_cluster;  // starting cluster number
    uint32_t file_size;      // file size
} fat_directory_entry_t;

// Unix-style directory entry: a fixed header followed by a variable-length name.
typedef struct {
    uint32_t inode_number;   // inode number
    uint16_t entry_length;   // total length of this entry
    uint8_t name_length;     // length of the file name
    uint8_t file_type;       // file type
    char filename[];         // variable-length file name (flexible array member)
} unix_directory_entry_t;

// Directory entry with extended attributes (modern file systems).
// Same shape as the Unix-style entry but with wider fields and an extra
// length describing additional data.
typedef struct {
    uint64_t inode_number;       // inode number
    uint32_t entry_length;       // total length of this entry
    uint16_t name_length;        // length of the file name
    uint8_t file_type;           // file type
    uint16_t extra_field_length; // length of the extension fields
    char filename[];             // variable-length file name (flexible array member)
    // Extension fields: file version, encryption information, etc.
    // NOTE(review): nothing can be declared after a flexible array member, so
    // these fields are presumably laid out after the name -- confirm.
} ext_directory_entry_t;

3.2 目录操作实现

c 复制代码

// Looks up `name` in the directory described by `dir_inode`.
//
// Returns the inode obtained from get_inode() for the matching entry,
// ERR_PTR(-ENOTDIR) when dir_inode is not a directory, or ERR_PTR(-ENOENT)
// when no entry matches.
//
// A name matches only when both its length and its bytes equal the entry's;
// comparing just the first name_length bytes would let "foobar" match an
// entry called "foo".
inode_t *directory_lookup(inode_t *dir_inode, const char *name)
{
    if (!S_ISDIR(dir_inode->mode)) {
        return ERR_PTR(-ENOTDIR);
    }

    const size_t header = sizeof(unix_directory_entry_t);
    const size_t name_len = strlen(name);
    char buffer[BLOCK_SIZE];

    // Walk every data block of the directory.
    for (uint64_t offset = 0; offset < dir_inode->size; offset += BLOCK_SIZE) {
        uint32_t block_num = inode_find_block(dir_inode, offset, BLOCK_SIZE);
        if (block_num == 0) {
            continue; // inode_find_block reports unallocated blocks as 0
        }
        read_block(block_num, buffer);

        // Parse the entries packed into this block.
        size_t pos = 0;
        while (pos + header <= BLOCK_SIZE) {
            unix_directory_entry_t *entry = (unix_directory_entry_t *)(buffer + pos);

            // An empty entry marks the end of this block's entries.
            if (entry->inode_number == 0) {
                break;
            }

            // A length that is zero, smaller than the header, or that runs
            // past the block would make the walk loop forever or read out of
            // bounds; treat the rest of the block as unusable.
            if (entry->entry_length < header ||
                entry->entry_length > BLOCK_SIZE - pos) {
                break;
            }

            if (entry->name_length == name_len &&
                header + name_len <= entry->entry_length &&
                memcmp(entry->filename, name, name_len) == 0) {
                return get_inode(entry->inode_number);
            }

            pos += entry->entry_length;
        }
    }

    return ERR_PTR(-ENOENT); // not found
}

// Adds an entry (name -> inode_num, with the given type) to a directory.
//
// Placement rules, per directory block:
//   * An entry with inode_number == 0 marks the end of the used area; the
//     new entry is written there and takes the rest of the block, provided
//     the rest of the block is large enough.
//   * A live entry is never overwritten. If its entry_length is larger than
//     what its own header + name needs, the surplus is split off and the new
//     entry is written into that surplus.
// If no block has room, the work is handed to directory_extend().
//
// Returns 0 on success, -EINVAL for an empty name, -ENAMETOOLONG when the
// name does not fit the 8-bit name_length field, otherwise whatever
// directory_extend() returns.
//
// NOTE(review): entry_length is 16 bits wide; this assumes BLOCK_SIZE fits
// in a uint16_t -- confirm.
int directory_add_entry(inode_t *dir_inode, const char *name, uint32_t inode_num, uint8_t type)
{
    const size_t header = sizeof(unix_directory_entry_t);
    const size_t name_len = strlen(name);

    if (name_len == 0) {
        return -EINVAL;
    }
    if (name_len > UINT8_MAX) {
        return -ENAMETOOLONG; // would be truncated when stored in name_length
    }

    // Space the new entry needs, rounded up to a 4-byte boundary.
    const size_t entry_size = (header + name_len + 3) & ~(size_t)3;

    char buffer[BLOCK_SIZE];

    for (uint64_t offset = 0; offset < dir_inode->size; offset += BLOCK_SIZE) {
        uint32_t block_num = inode_find_block(dir_inode, offset, BLOCK_SIZE);
        if (block_num == 0) {
            continue; // inode_find_block reports unallocated blocks as 0
        }
        read_block(block_num, buffer);

        size_t pos = 0;
        while (pos + header <= BLOCK_SIZE) {
            unix_directory_entry_t *entry = (unix_directory_entry_t *)(buffer + pos);
            const size_t remaining = BLOCK_SIZE - pos;
            unix_directory_entry_t *slot = NULL;
            size_t slot_len = 0;

            if (entry->inode_number == 0) {
                // End-of-entries marker: everything from here to the end of
                // the block is unused.
                if (remaining < entry_size) {
                    break; // not enough room left in this block
                }
                slot = entry;
                slot_len = remaining;
            } else {
                // Refuse to walk through a malformed length (zero would spin
                // forever, too large would leave the buffer).
                if (entry->entry_length < header || entry->entry_length > remaining) {
                    break;
                }

                // Bytes this live entry actually needs, 4-byte aligned.
                const size_t used = (header + entry->name_length + 3) & ~(size_t)3;
                if (entry->entry_length >= used &&
                    entry->entry_length - used >= entry_size) {
                    // Split: shrink the live entry to what it needs and put
                    // the new entry in the freed tail.
                    slot = (unix_directory_entry_t *)(buffer + pos + used);
                    slot_len = entry->entry_length - used;
                    entry->entry_length = (uint16_t)used;
                }
            }

            if (slot != NULL) {
                slot->inode_number = inode_num;
                slot->entry_length = (uint16_t)slot_len;
                slot->name_length = (uint8_t)name_len;
                slot->file_type = type;
                memcpy(slot->filename, name, name_len);

                write_block(block_num, buffer);
                return 0;
            }

            pos += entry->entry_length;
        }
    }

    // No room in any existing block: the directory has to grow.
    return directory_extend(dir_inode, name, inode_num, type);
}

3.3 路径名解析

c 复制代码

// State carried along while a path name is being resolved.
typedef struct path_walk {
    inode_t *current;        // inode of the current directory
    inode_t *parent;         // inode of the parent directory
    char *component;         // path component currently being processed
    int symlink_depth;       // symbolic-link depth (to guard against loops)
} path_walk_t;

// 路径名解析核心算法
inode_t *path_lookup(const char *pathname)
{
    path_walk_t walk;
    walk.current = get_current_directory();
    walk.symlink_depth = 0;
    
    char *path = strdup(pathname);
    char *saveptr;
    char *component = strtok_r(path, "/", &saveptr);
    
    while (component != NULL) {
        walk.component = component;
        
        // 处理特殊目录
        if (strcmp(component, ".") == 0) {
            // 当前目录，继续
            component = strtok_r(NULL, "/", &saveptr);
            continue;
        }
        else if (strcmp(component, "..") == 0) {
            // 父目录
            walk.parent = walk.current;
            walk.current = directory_lookup(walk.current, "..");
            if (IS_ERR(walk.current)) {
                free(path);
                return walk.current;
            }
            iput(walk.parent); // 释放父目录引用
        }
        else {
            // 普通文件或目录
            walk.parent = walk.current;
            walk.current = directory_lookup(walk.current, component);
            if (IS_ERR(walk.current)) {
                free(path);
                return walk.current;
            }
            iput(walk.parent);
            
            // 处理符号链接
            if (S_ISLNK(walk.current->mode)) {
                walk.current = handle_symlink(&walk);
                if (IS_ERR(walk.current)) {
                    free(path);
                    return walk.current;
                }
            }
        }
        
        component = strtok_r(NULL, "/", &saveptr);
    }
    
    free(path);
    return walk.current;
}

第四部分：磁盘空间管理

4.1 空闲空间管理

c 复制代码

// Free-space management using a bitmap: one bit per disk block.
typedef struct {
    uint8_t *bitmap;         // bitmap array
    uint32_t bitmap_blocks;  // number of blocks occupied by the bitmap itself
    uint32_t total_blocks;   // total number of blocks
    uint32_t free_blocks;    // number of free blocks
    spinlock_t lock;         // lock protecting this structure
} bitmap_allocator_t;

// Allocates one free block using the bitmap.
//
// Under alloc->lock, finds the first clear bit, sets it, decrements the
// free-block counter and returns the corresponding block number. Bit i maps
// to block number i + 1 (0 is used as the failure value).
//
// Returns the allocated block number, or 0 when no block is free.
//
// The scan is limited to total_blocks bits: the bitmap occupies whole blocks,
// so bits past total_blocks are padding and must never be handed out. The
// capacity is computed in 64 bits so bitmap_blocks * BLOCK_SIZE * 8 cannot wrap.
// NOTE(review): assumes total_blocks counts exactly the blocks tracked by
// bits 0..total_blocks-1 -- confirm against the on-disk layout.
uint32_t bitmap_allocate_block(bitmap_allocator_t *alloc)
{
    uint32_t block = 0; // 0 = disk full / nothing found

    spin_lock(&alloc->lock);

    if (alloc->free_blocks != 0) {
        const uint64_t capacity = (uint64_t)alloc->bitmap_blocks * BLOCK_SIZE * 8;
        const uint64_t limit = alloc->total_blocks < capacity ? alloc->total_blocks : capacity;

        // Find the first clear bit.
        for (uint64_t i = 0; i < limit; i++) {
            const uint64_t byte_index = i / 8;
            const uint8_t bit_mask = (uint8_t)(1u << (i % 8));

            if (!(alloc->bitmap[byte_index] & bit_mask)) {
                // Free block found: mark it allocated.
                alloc->bitmap[byte_index] |= bit_mask;
                alloc->free_blocks--;
                block = (uint32_t)i + 1; // block numbers start at 1
                break;
            }
        }
    }

    spin_unlock(&alloc->lock);
    return block;
}

// Free-block management using the grouped-linking method:
// one group lists up to 50 free blocks and names the block holding the next group.
typedef struct free_block_group {
    uint32_t free_blocks[50];    // array of free block numbers
    uint32_t count;              // number of valid entries in free_blocks
    uint32_t next_group;         // block number of the next group
} free_block_group_t;

// Allocator state for the grouped-linking method.
typedef struct {
    uint32_t super_block;        // group recorded in the superblock
    free_block_group_t cache;    // cached group of free blocks
    uint32_t total_groups;       // total number of groups
} group_allocator_t;

// Allocates one block with the grouped-linking method.
//
// Blocks are served from the cached group. When the cache is empty the next
// group is loaded from disk, and the block that held that group is released
// through bitmap_free_block() because its contents now live in the cache.
//
// Returns the allocated block number, or 0 when the disk is full or the
// loaded group is malformed (count of 0 or larger than the array).
//
// NOTE(review): read_block() is given the address of a free_block_group_t;
// if it always transfers a full disk block, the destination is too small --
// confirm read_block's contract.
uint32_t group_allocate_block(group_allocator_t *alloc)
{
    if (alloc->cache.count == 0) {
        // Cache exhausted: load the next group.
        const uint32_t group_block = alloc->cache.next_group;
        if (group_block == 0) {
            return 0; // disk is full
        }

        free_block_group_t new_group;
        read_block(group_block, &new_group);

        // Reject a group whose count would underflow below or index past
        // the end of free_blocks[].
        const uint32_t capacity =
            (uint32_t)(sizeof new_group.free_blocks / sizeof new_group.free_blocks[0]);
        if (new_group.count == 0 || new_group.count > capacity) {
            return 0;
        }

        alloc->cache = new_group;

        // Release the block that stored the group just consumed. The number
        // was saved before the cache was overwritten; after the copy,
        // cache.next_group already names the following group.
        bitmap_free_block(group_block);
    }

    // Hand out one block from the cache.
    alloc->cache.count--;
    uint32_t block = alloc->cache.free_blocks[alloc->cache.count];

    // Cache is now empty: record the next group's location in the superblock field.
    if (alloc->cache.count == 0) {
        alloc->super_block = alloc->cache.next_group;
    }

    return block;
}

4.2 磁盘块预分配策略

c 复制代码

// Capacity of the preallocation array. It must be at least as large as the
// biggest per-policy cap used by file_preallocate_blocks() (64).
#define PREALLOC_MAX_BLOCKS 64

// Per-file preallocation state.
typedef struct {
    uint32_t prealloc_blocks[PREALLOC_MAX_BLOCKS]; // preallocated block numbers
    uint32_t prealloc_count;     // number of valid entries in prealloc_blocks
    uint32_t last_allocated;     // last block number allocated
    uint8_t allocation_policy;   // allocation policy in effect
} file_allocation_t;

// Preallocates a run of consecutive disk blocks for a file.
//
// The policy and the cap on the run length are chosen from the file's
// current size: < 1 MiB caps at 8 blocks, < 100 MiB at 32, otherwise 64.
// The request is additionally clamped to the capacity of prealloc_blocks[],
// so the recording loop can never write past the array.
//
// Returns 0 on success (including a request for 0 blocks, which records an
// empty preallocation) or -ENOSPC when no contiguous run is available. On
// failure the previous prealloc_count / prealloc_blocks are left untouched.
int file_preallocate_blocks(inode_t *inode, uint32_t block_count)
{
    file_allocation_t *fa = &inode->allocation_info;
    uint32_t policy_cap;

    // Pick the policy from the file size.
    if (inode->size < 1024 * 1024) {
        fa->allocation_policy = POLICY_SMALL_FILE;
        policy_cap = 8;
    } else if (inode->size < 100 * 1024 * 1024) {
        fa->allocation_policy = POLICY_MEDIUM_FILE;
        policy_cap = 32;
    } else {
        fa->allocation_policy = POLICY_LARGE_FILE;
        policy_cap = 64;
    }

    // Never record more blocks than the array can hold.
    const uint32_t capacity =
        (uint32_t)(sizeof fa->prealloc_blocks / sizeof fa->prealloc_blocks[0]);
    if (policy_cap > capacity) {
        policy_cap = capacity;
    }

    const uint32_t wanted = min(block_count, policy_cap);
    if (wanted == 0) {
        fa->prealloc_count = 0;
        return 0;
    }

    // Find a run of consecutive blocks.
    uint32_t start_block = find_contiguous_blocks(wanted);
    if (start_block == 0) {
        return -ENOSPC;
    }

    for (uint32_t i = 0; i < wanted; i++) {
        fa->prealloc_blocks[i] = start_block + i;
        mark_block_allocated(start_block + i);
    }

    // Commit the bookkeeping only after the blocks are actually reserved.
    fa->prealloc_count = wanted;
    fa->last_allocated = start_block + wanted - 1;
    return 0;
}

第五部分：文件系统性能优化

5.1 缓冲区缓存

c 复制代码

// Buffer cache entry: describes one cached disk block.
typedef struct buffer_head {
    uint32_t block_number;           // disk block number
    uint8_t *data;                   // pointer to the block's data
    uint8_t dirty : 1;               // dirty bit
    uint8_t valid : 1;               // valid bit
    uint8_t locked : 1;              // locked bit
    uint32_t use_count;              // use count
    struct list_head lru_list;       // link in the LRU list
    struct hash_list hash_list;      // link in the hash chain
} buffer_head_t;

// Buffer cache manager: owns the buffers and the structures used to find them.
typedef struct buffer_cache {
    buffer_head_t *buffers;          // array of buffers
    uint32_t buffer_count;           // number of buffers
    struct list_head lru_list;       // head of the LRU list
    struct hash_bucket *hash_table;  // hash table
    uint32_t hash_size;              // size of the hash table
    spinlock_t lock;                 // lock protecting the cache
} buffer_cache_t;

// Looks up the buffer caching `block_num`.
//
// The bucket is block_num modulo the table size. On a hit the buffer is
// moved to the front of the LRU list, its use count is incremented, and it
// is returned; on a miss NULL is returned. The search and the LRU update run
// with cache->lock held.
buffer_head_t *buffer_cache_get(buffer_cache_t *cache, uint32_t block_num)
{
    buffer_head_t *hit = NULL;
    buffer_head_t *cursor;
    uint32_t bucket = block_num % cache->hash_size;

    spin_lock(&cache->lock);

    // Scan the bucket's chain for the requested block.
    list_for_each_entry(cursor, &cache->hash_table[bucket], hash_list) {
        if (cursor->block_number != block_num) {
            continue;
        }

        // Hit: re-insert at the LRU head and take a use reference.
        list_del(&cursor->lru_list);
        list_add(&cursor->lru_list, &cache->lru_list);
        cursor->use_count++;
        hit = cursor;
        break;
    }

    spin_unlock(&cache->lock);
    return hit; // NULL when the block is not cached
}

// Obtains a buffer for `block_num` by recycling the least recently used one.
//
// Fast path: if the buffer at the tail of the LRU list is unused
// (use_count == 0) it is written back when dirty, removed from its old hash
// chain, re-initialised for block_num (dirty = 0, valid = 0, use_count = 1),
// hashed under the new block number and moved to the LRU head.
// Otherwise the request is passed to buffer_cache_alloc_slow().
//
// The lock is dropped while a dirty victim is written back, so the victim is
// re-validated after the lock is re-taken: if it was picked up or recycled
// in the meantime (use count no longer 0, or block number changed) it is
// left alone and the slow path is used instead.
// NOTE(review): a buffer that is taken, re-dirtied and released again during
// the write-back window is not detected by this check; the `locked` bit may
// be the intended guard -- confirm how write_buffer_to_disk() and the release
// path use it.
buffer_head_t *buffer_cache_alloc(buffer_cache_t *cache, uint32_t block_num)
{
    spin_lock(&cache->lock);

    // Try the buffer at the LRU tail.
    if (!list_empty(&cache->lru_list)) {
        buffer_head_t *bh = list_last_entry(&cache->lru_list, buffer_head_t, lru_list);
        int usable = (bh->use_count == 0);

        if (usable && bh->dirty) {
            // Dirty victim: write it back without holding the spinlock.
            const uint32_t victim_block = bh->block_number;

            spin_unlock(&cache->lock);
            write_buffer_to_disk(bh);
            spin_lock(&cache->lock);

            // Someone may have claimed or recycled it while we were unlocked.
            usable = (bh->use_count == 0 && bh->block_number == victim_block);
        }

        if (usable) {
            // Drop the old block -> buffer mapping.
            list_del(&bh->hash_list);

            // Re-initialise for the new block.
            bh->block_number = block_num;
            bh->dirty = 0;
            bh->valid = 0;
            bh->use_count = 1;

            // Hash under the new block number and mark as most recently used.
            uint32_t hash = block_num % cache->hash_size;
            list_add(&bh->hash_list, &cache->hash_table[hash]);
            list_del(&bh->lru_list);
            list_add(&bh->lru_list, &cache->lru_list);

            spin_unlock(&cache->lock);
            return bh;
        }
    }

    spin_unlock(&cache->lock);

    // No reusable buffer on the fast path: synchronous write-back or failure.
    return buffer_cache_alloc_slow(cache, block_num);
}

5.2 预读优化

c 复制代码

// State kept by the read-ahead algorithm.
typedef struct readahead_state {
    uint64_t start_offset;           // starting offset
    uint32_t ra_size;                // read-ahead size (window)
    uint32_t ra_index;               // read-ahead index
    uint8_t ra_flags;                // read-ahead flags
    uint32_t prev_page;              // previous page
    uint32_t next_page;              // next page
} readahead_state_t;

// Adaptive read-ahead.
//
// Adjusts the read-ahead window from the relation between ra->prev_page and
// ra->next_page, then queues asynchronous reads for the blocks in the window
// that are not already cached, and finally advances the state.
//
//   next == prev + 1  : sequential access, the window doubles (capped)
//   next >  prev + 1  : forward jump, the window becomes a quarter of the cap
//   otherwise         : backward access, the window is reset
//
// A window of 0 is re-seeded from get_init_ra_size() before doubling, since
// doubling 0 would keep read-ahead disabled forever.
//
// NOTE(review): buffer_cache_get()/buffer_cache_alloc() are called here with
// a single argument, while their definitions in this file take
// (cache, block_num). No cache object is in scope, so the calls are left as
// they were -- reconcile with the actual API.
void do_adaptive_readahead(struct file *file, readahead_state_t *ra)
{
    struct inode *inode = file->f_inode;
    uint32_t max_readahead = min(inode->i_sb->s_max_readahead, 256);

    // Classify the access pattern and size the window.
    if (ra->prev_page + 1 == ra->next_page) {
        // Sequential access: grow the window.
        if (ra->ra_size == 0) {
            ra->ra_size = get_init_ra_size(file->f_ra);
        }
        ra->ra_size = min(ra->ra_size * 2, max_readahead);
    } else if (ra->next_page > ra->prev_page + 1) {
        // Forward jump, possibly random access.
        ra->ra_size = max_readahead / 4;
    } else {
        // Backward access: reset read-ahead.
        ra->ra_size = get_init_ra_size(file->f_ra);
    }

    // Window is [block_start, block_end), clipped to the end of the file.
    uint32_t block_start = ra->next_page;
    uint32_t block_end = min(ra->next_page + ra->ra_size,
                            (inode->i_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
    if (block_end < block_start) {
        // next_page already lies past end of file: nothing to read, and the
        // position must not move backwards below.
        block_end = block_start;
    }

    for (uint32_t block = block_start; block < block_end; block++) {
        struct buffer_head *bh = buffer_cache_get(block);
        if (!bh) {
            bh = buffer_cache_alloc(block);
            if (!bh) {
                // No buffer available: read-ahead is best effort, stop here.
                break;
            }
            submit_bh(READ, bh); // asynchronous read
        }
    }

    ra->prev_page = ra->next_page;
    ra->next_page = block_end;
}

第六部分：日志与崩溃一致性

6.1 日志文件系统原理

c 复制代码

// Journal transaction record.
// NOTE(review): journal_commit_transaction() and journal_recovery() in this
// file access `blocks` and `next_transaction` members that are not declared
// here -- the structure shown appears to be abbreviated.
typedef struct journal_transaction {
    uint32_t transaction_id;         // transaction ID
    uint32_t start_block;            // starting block number
    uint32_t block_count;            // number of blocks
    uint8_t  commit_flag;            // commit flag
    uint32_t checksum;               // checksum
    journal_descriptor_t *descriptors; // list of descriptors
} journal_transaction_t;

// Kinds of blocks that appear in the journal.
typedef enum {
    JOURNAL_DESCRIPTOR_BLOCK = 1,    // descriptor block
    JOURNAL_COMMIT_BLOCK = 2,        // commit block
    JOURNAL_REVOKE_BLOCK = 3,        // revoke block
    JOURNAL_SUPERBLOCK_V2 = 4        // superblock
} journal_block_type_t;

// Commits one transaction to the journal.
//
// Steps, in order: descriptor block, each data block into the log area,
// commit record, checkpoint, release of the log space.
//
// Returns 0 on success and -EIO on any write failure. A failure after the
// descriptor block has been written also aborts the transaction; a failure
// of the descriptor write itself returns without aborting.
int journal_commit_transaction(journal_t *journal, journal_transaction_t *transaction)
{
    // Phase 1: descriptor block.
    journal_descriptor_t *descriptor = write_descriptor_block(journal, transaction);
    if (!descriptor) {
        return -EIO;
    }

    // Phase 2: copy every data block into the log area.
    uint32_t idx = 0;
    while (idx < transaction->block_count) {
        if (write_data_block_to_log(journal, transaction->blocks[idx]) < 0) {
            goto abort;
        }
        idx++;
    }

    // Phase 3: commit record.
    if (write_commit_block(journal, transaction) < 0) {
        goto abort;
    }

    // Phase 4: checkpoint -- write the logged data back to the file system.
    journal_checkpoint(journal, transaction);

    // Phase 5: release the log space.
    journal_free_transaction(journal, transaction);

    return 0;

abort:
    journal_abort_transaction(journal, transaction);
    return -EIO;
}

// Replays the journal after a crash.
//
// Starting from the transaction recorded in the journal superblock
// (s_start), each transaction is read and validated. Committed transactions
// are redone by writing their logged blocks to the file system; uncommitted
// ones are passed to journal_undo_transaction(). The walk ends at a
// transaction number of 0 or at the first unreadable/invalid transaction.
//
// Always returns 0.
int journal_recovery(journal_t *journal)
{
    uint32_t next_transaction = journal->superblock->s_start;

    while (next_transaction != 0) {
        journal_transaction_t *trans = read_transaction(journal, next_transaction);
        if (!trans) {
            // Could not read the transaction: stop recovery.
            break;
        }

        if (!validate_transaction(trans)) {
            // Corrupted transaction: stop recovery, but release the record
            // that read_transaction() handed back so it is not leaked.
            free(trans);
            break;
        }

        if (trans->commit_flag) {
            // Committed: redo every logged block.
            for (uint32_t i = 0; i < trans->block_count; i++) {
                write_block_to_filesystem(trans->blocks[i].fs_block,
                                        trans->blocks[i].log_data);
            }
        } else {
            // Not committed: undo.
            journal_undo_transaction(trans);
        }

        // Fetch the link before the record is released.
        next_transaction = trans->next_transaction;
        free(trans);
    }

    return 0;
}

6.2 写时复制（Copy-on-Write）

c 复制代码

// Copy-on-write bookkeeping for one block: maps an original block to its copy.
typedef struct cow_block {
    uint32_t original_block;         // original block number
    uint32_t new_block;              // newly allocated block number
    uint8_t  copied;                 // whether the data has been copied
    struct list_head list;           // list linkage
} cow_block_t;

// Starts a new copy-on-write transaction on `cow_fs`.
//
// Allocates the transaction, initialises its list of modified blocks, gives
// it the next transaction ID and installs it as the current transaction.
//
// Returns 0 on success or -ENOMEM when the transaction cannot be allocated;
// on failure neither the current transaction nor the ID counter is touched.
int cow_transaction_begin(cow_filesystem_t *cow_fs)
{
    cow_transaction_t *txn = kmalloc(sizeof(cow_transaction_t));
    if (!txn) {
        return -ENOMEM;
    }

    INIT_LIST_HEAD(&txn->modified_blocks);
    txn->transaction_id = cow_fs->next_transaction_id++;

    // Publish only once the transaction is fully initialised.
    cow_fs->current_transaction = txn;
    return 0;
}

// Prepares `block_num` for modification under copy-on-write.
//
// If the block is already tracked by the current transaction nothing is
// done. Otherwise a new block is allocated, the original contents are copied
// into it, and the original -> new mapping is added to the transaction.
//
// Returns 0 on success (or when the block was already tracked), -EINVAL when
// no transaction is active, -ENOMEM when the mapping record cannot be
// allocated, -ENOSPC when no block is available.
//
// The mapping record is allocated before the disk block, so a failed memory
// allocation can never strand a freshly allocated block.
int cow_modify_block(cow_filesystem_t *cow_fs, uint32_t block_num)
{
    if (!cow_fs->current_transaction) {
        return -EINVAL;
    }

    // Already modified within the current transaction?
    cow_block_t *cow_block;
    list_for_each_entry(cow_block, &cow_fs->current_transaction->modified_blocks, list) {
        if (cow_block->original_block == block_num) {
            return 0; // already part of the transaction
        }
    }

    // Reserve the bookkeeping record first.
    cow_block = kmalloc(sizeof(cow_block_t));
    if (!cow_block) {
        return -ENOMEM;
    }

    // Allocate the replacement block.
    uint32_t new_block = allocate_block(cow_fs);
    if (new_block == 0) {
        kfree(cow_block);
        return -ENOSPC;
    }

    // Copy the original data into the new block.
    read_block(block_num, temp_buffer);
    write_block(new_block, temp_buffer);

    // Record the copy-on-write mapping.
    cow_block->original_block = block_num;
    cow_block->new_block = new_block;
    cow_block->copied = 1;
    list_add(&cow_block->list, &cow_fs->current_transaction->modified_blocks);

    return 0;
}

// Commits the current copy-on-write transaction.
//
// For every tracked block the pointer is switched from the original block to
// its copy and the tracking record is freed; then the superblock root is
// updated and the transaction object is released. Always returns 0.
int cow_transaction_commit(cow_filesystem_t *cow_fs)
{
    struct list_head *pending = &cow_fs->current_transaction->modified_blocks;
    cow_block_t *mapping, *next_mapping;

    // Switch each pointer over, dropping the record as we go. The _safe
    // iterator is required because entries are unlinked inside the loop.
    list_for_each_entry_safe(mapping, next_mapping, pending, list) {
        atomic_update_block_pointer(mapping->original_block, mapping->new_block);
        list_del(&mapping->list);
        kfree(mapping);
    }

    // Make the superblock point at the new root.
    update_superblock_root(cow_fs);

    // The transaction is finished: release it and clear the handle.
    kfree(cow_fs->current_transaction);
    cow_fs->current_transaction = NULL;

    return 0;
}

第七部分：现代文件系统特性

7.1 扩展属性与访问控制列表

c 复制代码

// Extended attribute entry.
typedef struct xattr_entry {
    char name[XATTR_NAME_MAX];       // attribute name
    uint32_t value_length;           // length of the value
    uint32_t value_offset;           // offset of the value
    uint8_t name_len;                // length of the name
    uint8_t flags;                   // flag bits
} xattr_entry_t;

// One entry of an access control list.
typedef struct acl_entry {
    uint16_t tag;                    // entry type
    uint16_t perm;                   // permissions
    uint32_t id;                     // user or group ID
} acl_entry_t;

// Access control list: a small header followed by a variable number of entries.
// NOTE(review): acl_permission_check() in this file reads owner_perm,
// group_perm and other_perm from this structure, which are not declared
// here -- the structure shown appears to be abbreviated.
typedef struct acl {
    uint16_t version;                // ACL version
    uint16_t entry_count;            // number of entries
    acl_entry_t entries[];           // entry array (flexible array member)
} acl_t;

// Checks whether the current user may access `inode` with the rights in `mask`.
//
// Without an ACL the decision is delegated to traditional_permission_check().
// With an ACL, the first class the caller belongs to decides, in this order:
//   1. file owner            -> owner permissions
//   2. named user entry      -> that entry's permissions
//   3. owning group / named group entries the caller is a member of
//                            -> granted if any matching group grants the mask
//   4. everyone else         -> other permissions
// A caller who matches an earlier class is never granted access through a
// later one; e.g. an owner whose own permissions lack a right does not get it
// via "other". This is the class-precedence rule of Unix permissions and
// POSIX ACLs.
//
// Returns 0 when access is granted, -EACCES otherwise.
//
// NOTE(review): the ACL mask entry is not modelled here. This function reads
// inode->owner_uid / inode->group_gid and acl->owner_perm / group_perm /
// other_perm, which do not appear in the struct inode / acl_t definitions
// shown in this file -- confirm the real field names.
int acl_permission_check(struct inode *inode, int mask)
{
    acl_t *acl = get_inode_acl(inode);

    if (!acl) {
        // No ACL: fall back to traditional Unix permissions.
        return traditional_permission_check(inode, mask);
    }

    // 1. Owner.
    if (current_user() == inode->owner_uid) {
        return ((acl->owner_perm & mask) == mask) ? 0 : -EACCES;
    }

    // 2. Named users.
    for (int i = 0; i < acl->entry_count; i++) {
        const acl_entry_t *entry = &acl->entries[i];

        if (entry->tag == ACL_USER && entry->id == current_user()) {
            return ((entry->perm & mask) == mask) ? 0 : -EACCES;
        }
    }

    // 3. Owning group and named groups.
    int group_matched = 0;

    if (in_group_p(inode->group_gid)) {
        group_matched = 1;
        if ((acl->group_perm & mask) == mask) {
            return 0;
        }
    }

    for (int i = 0; i < acl->entry_count; i++) {
        const acl_entry_t *entry = &acl->entries[i];

        if (entry->tag == ACL_GROUP && in_group_p(entry->id)) {
            group_matched = 1;
            if ((entry->perm & mask) == mask) {
                return 0;
            }
        }
    }

    if (group_matched) {
        // Member of at least one listed group, none of which grants the mask.
        return -EACCES;
    }

    // 4. Other.
    return ((acl->other_perm & mask) == mask) ? 0 : -EACCES;
}

7.2 快照与数据去重

c 复制代码

// Copy-on-write snapshot descriptor.
typedef struct filesystem_snapshot {
    uint64_t snapshot_id;            // snapshot ID
    uint64_t create_time;            // creation time
    uint32_t root_inode;             // root inode of the snapshot
    uint32_t block_size;             // block size
    uint32_t referenced_blocks;      // number of referenced blocks
    struct list_head block_mappings; // block mapping table
} filesystem_snapshot_t;

// Data deduplication engine state.
typedef struct deduplication_engine {
    uint8_t *chunk_hashes;           // table of block hashes
    uint32_t *hash_to_block;         // mapping from hash to block
    uint32_t hash_table_size;        // size of the hash table
    uint64_t saved_blocks;           // number of blocks saved
} deduplication_engine_t;

// Attempts to deduplicate one data block.
//
// Hashes BLOCK_SIZE bytes at `data` with SHA-256 and looks the hash up in the
// dedup table. If a different block with the same hash is already known,
// `block_num` is turned into a reference to it and the saved-block counter
// is incremented. If the hash is unknown, `block_num` is recorded as its owner.
//
// Returns 1 when the block was deduplicated, 0 otherwise.
//
// A lookup that returns `block_num` itself means this block is already the
// recorded owner of the hash (e.g. it is being processed a second time); it
// must not be counted as a duplicate of itself.
int deduplicate_data(uint32_t block_num, const uint8_t *data)
{
    // Hash the block's contents.
    uint8_t hash[SHA256_DIGEST_SIZE];
    sha256_hash(data, BLOCK_SIZE, hash);

    // Is there already a block holding the same data?
    uint32_t existing_block = find_block_by_hash(dedup_engine, hash);

    if (existing_block == 0) {
        // New content: register this block in the table.
        add_block_to_dedup_table(dedup_engine, block_num, hash);
        return 0; // nothing deduplicated
    }

    if (existing_block == block_num) {
        return 0; // already indexed under this very block
    }

    // Genuine duplicate: reference the existing block.
    create_block_reference(block_num, existing_block);
    dedup_engine->saved_blocks++;
    return 1; // deduplicated
}

第八部分：虚拟文件系统（VFS）

8.1 VFS抽象层

c 复制代码

// Generic VFS inode: the file-system-independent view of a file.
struct inode {
    umode_t            i_mode;           // file type and permissions
    uid_t              i_uid;            // owner ID
    gid_t              i_gid;            // group ID
    const struct inode_operations *i_op; // inode operations
    const struct file_operations *i_fop; // file operations
    struct super_block *i_sb;            // superblock
    void              *i_private;        // file-system private data
};

// File operations table: function pointers a file system supplies for
// operations on open files.
struct file_operations {
    struct module *owner;
    // llseek: file, offset, and an int selector; returns a loff_t
    loff_t (*llseek) (struct file *, loff_t, int);
    // read / write: file, user-space buffer, byte count, position pointer
    ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
    ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
    // open / release: inode and file object
    int (*open) (struct inode *, struct file *);
    int (*release) (struct inode *, struct file *);
    // fsync: file, two loff_t values, and a datasync flag
    int (*fsync) (struct file *, loff_t, loff_t, int datasync);
    // unlocked_ioctl: file, command number, argument word
    long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
    // mmap: file and the vm_area_struct describing the mapping
    int (*mmap) (struct file *, struct vm_area_struct *);
};

// Inode operations table: function pointers a file system supplies for
// namespace operations (lookup, create, link, unlink, ...) and attribute access.
struct inode_operations {
    struct dentry * (*lookup) (struct inode *, struct dentry *, unsigned int);
    int (*create) (struct inode *, struct dentry *, umode_t, bool);
    int (*link) (struct dentry *, struct inode *, struct dentry *);
    int (*unlink) (struct inode *, struct dentry *);
    int (*symlink) (struct inode *, struct dentry *, const char *);
    int (*mkdir) (struct inode *, struct dentry *, umode_t);
    int (*rmdir) (struct inode *, struct dentry *);
    // rename takes two (inode, dentry) pairs plus an unsigned int argument
    int (*rename) (struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int);
    int (*setattr) (struct dentry *, struct iattr *);
    int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
};

8.2 文件系统注册与挂载

c 复制代码

// Description of a file system type, used for registration.
struct file_system_type {
    const char *name;                   // file system name
    int fs_flags;                       // file system flags
    // mount callback: receives the type, an int, a string and an opaque pointer; returns a dentry
    struct dentry *(*mount) (struct file_system_type *, int, const char *, void *);
    void (*kill_sb) (struct super_block *); // superblock teardown
    struct module *owner;               // owning module
    struct file_system_type * next;     // next file system in the list
    struct hlist_head fs_supers;        // list of superblocks
};

// Registers a file system type.
//
// The name must not contain '.' (enforced with BUG_ON). A type whose `next`
// pointer is already set is rejected with -EBUSY. Otherwise, under the write
// lock, the slot returned by find_filesystem() for this name is examined: if
// it is occupied the name is already registered (-EBUSY); if it is empty the
// type is stored there and 0 is returned.
int register_filesystem(struct file_system_type * fs)
{
    struct file_system_type **slot;
    int err;

    BUG_ON(strchr(fs->name, '.'));
    if (fs->next)
        return -EBUSY;

    write_lock(&file_systems_lock);
    slot = find_filesystem(fs->name, strlen(fs->name));
    if (!*slot) {
        *slot = fs;      // empty slot: link the new type in
        err = 0;
    } else {
        err = -EBUSY;    // a type with this name is already registered
    }
    write_unlock(&file_systems_lock);

    return err;
}

// mount() system call.
//
// Copies the user-space type string, device name and mount options into
// kernel memory, performs the mount with do_mount(), and releases the copies
// in reverse order of acquisition.
//
// Returns the result of do_mount(), or the error from whichever copy failed.
//
// The option data is copied with copy_mount_options() before use; passing an
// uninitialised data_page to do_mount() would hand it an arbitrary pointer
// and ignore the caller's `data` argument entirely.
// NOTE(review): copy_mount_options()/free_page() follow the classic Linux
// fs/namespace.c implementation of this syscall -- confirm they match the
// kernel version this snippet is modelled on.
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
                char __user *, type, unsigned long, flags, void __user *, data)
{
    int ret;
    char *kernel_type;
    char *kernel_dev;
    unsigned long data_page = 0;

    // Copy the user-space arguments into kernel space.
    kernel_type = copy_mount_string(type);
    if (IS_ERR(kernel_type))
        return PTR_ERR(kernel_type);

    kernel_dev = copy_mount_string(dev_name);
    if (IS_ERR(kernel_dev)) {
        ret = PTR_ERR(kernel_dev);
        goto out_type;
    }

    ret = copy_mount_options(data, &data_page);
    if (ret < 0)
        goto out_dev;

    // Perform the mount.
    ret = do_mount(kernel_dev, dir_name, kernel_type, flags,
                  (void *) data_page);

    free_page(data_page);
out_dev:
    kfree(kernel_dev);
out_type:
    kfree(kernel_type);
    return ret;
}

总结与展望

今天我们深入探讨了：

文件系统的基本抽象和核心数据结构
各种磁盘空间分配策略的优缺点
目录实现和路径名解析的复杂算法
文件系统性能优化技术，包括缓存和预读
崩溃一致性保障的日志和COW技术
现代文件系统的高级特性
虚拟文件系统的抽象层设计

文件系统是操作系统中最复杂、最精妙的子系统之一，它需要在性能、可靠性、功能和复杂性之间做出精细的平衡。

系列导航：

上一篇：[操作系统入门] 第八章：虚拟内存与页面置换——超越物理限制的魔法
下一篇：[操作系统入门] 第十章：设备管理与系统安全——硬件交互与安全防线