数据库中存储了大量 JSON 数据时，如何高效地读取和分页：利用文件缓存的办法，避免把全部数据载入内存

利用文件缓存的优势：

内存友好：使用文件流处理，不将整个大文件加载到内存
可扩展性：支持超大型JSON文件（GB级别）
缓存效率：文件缓存可以持久化，服务器重启后仍然有效
并发安全：每个分页请求生成独立的缓存文件
灵活性：可以根据需要调整缓存策略和分页大小

首先假设我的数据库里面有大量的 JSON 文档，我们现在把它们导出来，以 JSON 数组的格式写入同一个文件内

javascript 复制代码

const fs=require('fs');
const ClientDB=require('../db/db');
const db=new ClientDB('employees','mylog');
const path=require('path');

/**
 * Exports every document returned by `db.find({})` into a JSON array file
 * under `../public` and resolves with the generated file name.
 *
 * The promise only resolves after the write stream has emitted 'finish', so
 * callers can open the file immediately; a stream error rejects the promise.
 *
 * NOTE(review): `db.find({})` still materialises all documents in memory
 * before they are written — only the file output is streamed.
 *
 * @returns {Promise<string|undefined>} Name of the temporary file (not the
 *   JSON data itself), or `undefined` when `db.find` returned `false`.
 */
async function testBigJson(){
    try{
        const docs=await db.find({});
        if(docs===false)
        {
            return;
        }

        // Temporary file name and its location under ../public
        const outputFile=getRandomFileName();
        const outfile=path.join(__dirname,'..','public',outputFile);
        const writeableStream=fs.createWriteStream(outfile);

        // Settles once everything has been flushed to disk, or rejects on a stream error.
        const finished=new Promise((resolve,reject)=>{
            writeableStream.once('finish',resolve);
            writeableStream.once('error',reject);
        });

        try{
            // The documents are stored as one JSON array, so start with '['
            writeableStream.write('[');
            let isFirst=true;
            for(const doc of docs)
            {
                if(!isFirst)
                {
                    writeableStream.write(",\n");
                }
                // Pick the exported fields and serialise them as one JSON object.
                // NOTE(review): `doc.i` looks like it may be meant to be `doc.id` — confirm against the schema.
                const result={id:doc.i,username:doc.username,age:doc.age,createAt:doc.created};
                if(!writeableStream.write(JSON.stringify(result)))
                {
                    // Internal buffer is full: wait for 'drain' (or a stream failure)
                    // instead of queueing the whole export in memory.
                    await Promise.race([
                        new Promise((resolve)=>writeableStream.once('drain',resolve)),
                        finished
                    ]);
                }
                isFirst=false;
            }
            writeableStream.end(']');
            await finished;
        }catch(error){
            writeableStream.destroy();
            throw error;
        }

        // Return the temporary file name, not the JSON data
        return outputFile;
    }finally{
        await db.disconnect();
    }
}

/**
 * Builds a random name for a temporary JSON file.
 *
 * @returns {string} `output_<token>.json`, where the token is up to seven
 *   base-36 characters taken from `Math.random()`.
 */
function getRandomFileName(){
    const base36=Math.random().toString(36);
    // Characters 2..8 skip the leading "0." of the base-36 fraction.
    const token=base36.slice(2,9);
    return 'output_'+token+'.json';
}

// Run the export and print the generated file name. A callback must be passed to
// then(); `console.log()` would be invoked immediately and hand `undefined` to then().
testBigJson()
.then((fileName)=>console.log(fileName))
.catch((error)=>console.error(error));

第2步使用文件描述符的定位读取（fs.read 指定偏移量）和缓冲区操作，并为每个 JSON 对象建立字节偏移索引，这是真正高效的文件读取方案（注意：下面的代码并没有使用真正的内存映射 mmap，Node.js 标准库不提供 mmap）

javascript 复制代码

const fs=require('fs');
const {promisify}=require('util');
// Promise-returning wrappers around the callback-style fs functions used by the paginator below.
const open=promisify(fs.open);
// The promisified read resolves to an object; callers below destructure `bytesRead` from it.
const read=promisify(fs.read);
const close=promisify(fs.close);
const stat=promisify(fs.stat);

/**
 * Paginates a JSON file whose top level is an array of objects without loading
 * the whole file: a byte-offset index of every top-level object is built once
 * (and saved next to the data file as `<file>.index`), then each page is served
 * with positional reads of just the objects it contains.
 *
 * Only object elements (`{...}`) are indexed; other array elements are ignored.
 * Call `init()` before use and `close()` when done.
 */
class HighJsonPaginator{
    /**
     * @param {string} filePath Path of the JSON array file to paginate.
     */
    constructor(filePath)
    {
        this.filePath=filePath;
        this.fd=null;
        this.fileSize=0;
        this.fileMtimeMs=0; // recorded so a saved index can be detected as stale
        this.buffer=Buffer.alloc(64*1024);  // 64KB scan buffer used while indexing
    }

    /**
     * Opens the file and records its size and modification time.
     */
    async init(){
        this.fd=await open(this.filePath,'r');
        try{
            const stats=await stat(this.filePath);
            this.fileSize=stats.size;
            this.fileMtimeMs=stats.mtimeMs;
        }catch(error){
            // Do not leak the descriptor when the file cannot be inspected.
            await this.close();
            throw error;
        }
    }

    /**
     * Closes the file descriptor; safe to call more than once.
     */
    async close(){
        if(this.fd!==null)
        {
            const fd=this.fd;
            this.fd=null;
            await close(fd);
        }
    }

    /**
     * Locates the byte range between the array's opening '[' and closing ']'.
     *
     * Probes at most 1024 bytes at each end of the file. When no bracket is
     * found the range falls back to the start / end of the file.
     *
     * @returns {Promise<{dataStart:number,dataEnd:number}>} `dataStart` is the
     *   offset just after '[', `dataEnd` is the offset of the closing ']'.
     */
    async findArrayBounds(){
        if(this.fileSize===0)
        {
            return {dataStart:0,dataEnd:0};
        }

        // Never probe more bytes than the file holds: a negative read position is invalid.
        const probeSize=Math.min(1024,this.fileSize);
        const startBuffer=Buffer.alloc(probeSize);
        const endBuffer=Buffer.alloc(probeSize);

        // Head of the file: first '['
        const head=await read(this.fd,startBuffer,0,probeSize,0);
        const startIndex=startBuffer.subarray(0,head.bytesRead).indexOf('[');

        // Tail of the file: LAST ']' — earlier ones may belong to nested arrays or string values
        const tailPosition=this.fileSize-probeSize;
        const tail=await read(this.fd,endBuffer,0,probeSize,tailPosition);
        const endIndex=endBuffer.subarray(0,tail.bytesRead).lastIndexOf(']');

        return{
            dataStart:startIndex+1, // skip '[' (0 when no '[' was found)
            dataEnd:endIndex===-1 ? this.fileSize : tailPosition+endIndex
        };
    }

    /**
     * Returns one page of parsed objects plus pagination metadata.
     *
     * @param {number} [page=1] 1-based page number.
     * @param {number} [pageSize=10] Number of items per page.
     * @returns {Promise<{data:Array,pagination:{page:number,pageSize:number,total:number,totalPages:number}}>}
     * @throws {RangeError} When page or pageSize is not a positive integer.
     */
    async paginateEfficiently(page=1,pageSize=10)
    {
        const pageNumber=Number(page);
        const itemsPerPage=Number(pageSize);
        if(!Number.isInteger(pageNumber) || pageNumber<1 || !Number.isInteger(itemsPerPage) || itemsPerPage<1)
        {
            throw new RangeError('page and pageSize must be positive integers');
        }

        const bounds=await this.findArrayBounds();
        // Build the offset index, or reuse the saved one
        const index=await this.createOrLoadIndex(bounds);

        const total=index.positions.length;
        const pagination={
            page:pageNumber,
            pageSize:itemsPerPage,
            total,
            totalPages:Math.ceil(total / itemsPerPage)
        };

        const startItem=(pageNumber-1)*itemsPerPage;
        // Requested page lies beyond the last item: return an empty page
        if(startItem>=total)
        {
            return {data:[],pagination};
        }

        const endItem=Math.min(startItem+itemsPerPage,total);
        const items=[];
        for(let i=startItem;i<endItem;i++)
        {
            // Each index entry is the byte range of one top-level {...} object
            const position=index.positions[i];
            const item=await this.readJsonItem(position.start,position.end);
            if(item) items.push(item);
        }

        return {data:items,pagination};
    }

    /**
     * Returns the offset index for the file, loading `<file>.index` when it
     * matches the current file and rebuilding (and saving) it otherwise.
     *
     * @param {{dataStart:number,dataEnd:number}} bounds Result of findArrayBounds().
     * @returns {Promise<{positions:Array<{start:number,end:number}>}>}
     */
    async createOrLoadIndex(bounds)
    {
        const indexFile=this.filePath +'.index';

        const cached=await this.loadIndex(indexFile);
        if(cached)
        {
            return cached;
        }

        const index=await this.buildIndex(bounds);
        await fs.promises.writeFile(indexFile,JSON.stringify(index));
        return index;
    }

    /**
     * Helper: reads a saved index, returning null when it is missing, unreadable,
     * malformed, or was built for a different size / modification time of the file.
     */
    async loadIndex(indexFile)
    {
        try{
            const parsed=JSON.parse(await fs.promises.readFile(indexFile,'utf8'));
            const isUsable=parsed!==null
                && typeof parsed==='object'
                && Array.isArray(parsed.positions)
                && parsed.fileSize===this.fileSize
                && parsed.mtimeMs===this.fileMtimeMs;
            return isUsable ? parsed : null;
        }catch{
            return null;
        }
    }

    /**
     * Helper: scans the data range once and records the byte range of every
     * top-level object. Braces inside string values are ignored, and the scan
     * state is carried across buffer-sized chunks.
     */
    async buildIndex(bounds)
    {
        const QUOTE=0x22;       // "
        const BACKSLASH=0x5c;   // \
        const OPEN_BRACE=0x7b;  // {
        const CLOSE_BRACE=0x7d; // }

        const index={fileSize:this.fileSize,mtimeMs:this.fileMtimeMs,positions:[]};
        let position=bounds.dataStart;
        let depth=0;            // current {...} nesting depth
        let itemStart=position; // offset where the current top-level object began
        let inString=false;
        let escaped=false;

        while(position<bounds.dataEnd)
        {
            const bytesToRead=Math.min(this.buffer.length,bounds.dataEnd-position);
            const {bytesRead}=await read(this.fd,this.buffer,0,bytesToRead,position);
            if(bytesRead===0)
            {
                // File is shorter than expected; stop instead of spinning forever.
                break;
            }

            // These four ASCII bytes never occur inside a multi-byte UTF-8 sequence,
            // so comparing raw bytes is safe for non-ASCII text.
            for(let i=0;i<bytesRead;i++)
            {
                const byte=this.buffer[i];

                if(inString)
                {
                    if(escaped) escaped=false;
                    else if(byte===BACKSLASH) escaped=true;
                    else if(byte===QUOTE) inString=false;
                    continue;
                }

                if(byte===QUOTE)
                {
                    inString=true;
                }else if(byte===OPEN_BRACE)
                {
                    if(depth===0) itemStart=position+i;
                    depth++;
                }else if(byte===CLOSE_BRACE && depth>0)
                {
                    depth--;
                    if(depth===0)
                    {
                        index.positions.push({
                            start:itemStart,
                            end:position+i+1
                        });
                    }
                }
            }

            // Advance the file offset by what was actually read
            position+=bytesRead;
        }

        return index;
    }

    /**
     * Reads the bytes in [start, end) and parses them as one JSON value.
     *
     * @param {number} start Byte offset of the first character.
     * @param {number} end Byte offset just past the last character.
     * @returns {Promise<*>} The parsed value, or null when parsing fails (the error is logged).
     */
    async readJsonItem(start,end)
    {
        const length=end-start;
        const buffer=Buffer.alloc(length);
        const {bytesRead}=await read(this.fd,buffer,0,length,start);

        try{
            return JSON.parse(buffer.toString('utf-8',0,bytesRead));
        }catch(error)
        {
            console.error('解析json失败：',error);
            return null;
        }
    }
}

module.exports =HighJsonPaginator;

下面就是测试，比用readline=require('readline')按行读取json文件快10-100倍，读取1000个文档就在30毫秒左右，看你是什么硬盘，读100个以下几乎就是几毫秒

javascript 复制代码

const HighJsonPaginator=require('./highJsonPaginator');

/**
 * Times a single paginated read (page 1, 30 items) of the sample export file
 * and prints the result.
 *
 * NOTE(review): the relative path is resolved against the process working
 * directory, not against this script's location.
 */
async function benchmark() {
  const testFile = '../public/output_vzqfri8.json';
  console.time('高效二进制读取');
  const efficientPaginator = new HighJsonPaginator(testFile);
  let result;
  try {
    await efficientPaginator.init();
    result = await efficientPaginator.paginateEfficiently(1, 30);
  } finally {
    // Always release the file descriptor, even when init or pagination fails.
    await efficientPaginator.close();
    console.timeEnd('高效二进制读取');
  }

  console.log(result);
}

// Report failures explicitly instead of leaving the promise floating.
benchmark().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});

到这里就结束了，如果你前端要调用读取分页可以直接用第2步的HighJsonPaginator{}类，

如果你想要再中间写一个缓存也是可以的，下面的方案只是作为参考

首先创建一个文件映射，用来隐藏真实的 JSON 文件名

我这里是直接把条目写进 fileMap 中；实际使用时，你可以在第1个代码里导入这个文件，再用 set 把返回的临时文件名写入这个 fileMap

javascript 复制代码

// config/files-map.js
// Registry keyed by the real file name; each entry carries the public random ID,
// a display name, the on-disk path and whether the file may be served.
const fileMap = new Map([
  [
    'output_vzqfri8.json',
    {
      id: 'a1b2c3d4e5',
      name: 'large-data.json',
      path: './data/output_vzqfri8.json',
      accessible: true,
    },
  ],
  [
    'sensitive-data.json',
    {
      id: 'f6g7h8i9j0',
      name: 'sensitive-data.json',
      path: './data/sensitive-data.json',
      accessible: false, // access restricted
    },
  ],
]);

/**
 * Looks up a file entry by its public ID.
 *
 * @param {string} fileId Public ID stored in the entry's `id` field.
 * @returns {object|null} The first entry whose `id` matches, or null when none does.
 */
function getFileById(fileId) {
  const entries = Array.from(fileMap.values());
  const match = entries.find((info) => info.id === fileId);
  return match === undefined ? null : match;
}

/**
 * Looks up a file entry by its real file name (the map key).
 *
 * @param {string} filename Real file name used as the key in fileMap.
 * @returns {object|null} The entry, or null when the name is not registered
 *   (same "not found" value as getFileById).
 */
function getFileByName(filename) {
  return fileMap.has(filename) ? fileMap.get(filename) : null;
}

module.exports = { fileMap, getFileById, getFileByName };

接下来是供前端调用的后端路由（Express 风格），这里只写了个大概

javascript 复制代码

const { getFileById } = require('../config/files-map');

/**
 * GET /api/optimized-data/:fileId?page=&pageSize=
 *
 * Resolves the public file ID to a file entry, rejects unknown (404) or
 * restricted (403) files, normalises the paging parameters and responds with
 * the result of processFile().
 *
 * Example: /api/optimized-data/a1b2c3d4e5?page=2&pageSize=50
 * The client only ever sees the ID, never the real file name.
 */
router.get('/api/optimized-data/:fileId', async (req, res) => {
  try {
    const { fileId } = req.params;
    const { page = '1', pageSize = '20' } = req.query;

    // Resolve the public ID to the file entry
    const fileInfo = getFileById(fileId);
    if (!fileInfo) {
      return res.status(404).json({ error: '文件不存在或无权访问' });
    }

    if (!fileInfo.accessible) {
      return res.status(403).json({ error: '无权访问该文件' });
    }

    // Non-numeric input parses to NaN, which Math.max/Math.min would pass straight
    // through; fall back to the defaults instead. page >= 1, 1 <= pageSize <= 200.
    const requestedPage = Number.parseInt(page, 10);
    const requestedSize = Number.parseInt(pageSize, 10);
    const pageNum = Number.isNaN(requestedPage) ? 1 : Math.max(1, requestedPage);
    const pageSizeNum = Number.isNaN(requestedSize)
      ? 20
      : Math.min(Math.max(1, requestedSize), 200);

    // Key for a per-page cache entry; not consumed in this sketch.
    const cacheKey = `data:${fileId}:${pageNum}:${pageSizeNum}`;

    // ... remaining processing logic
    const result = await processFile(fileInfo.path, pageNum, pageSizeNum);

    res.json(result);

  } catch (error) {
    console.error('文件处理错误:', error);
    res.status(500).json({ error: '内部服务器错误' });
  }
});

或者用固定文件名，或jwt令牌也可以的

javascript 复制代码

// Fixed public keys mapped to the real file paths, so real file names never appear in URLs.
const allowedFiles = {
  'dataset-1': './data/large-data.json',
  'report-2024': './data/sensitive-report.json',
  'user-stats': './data/user-statistics.json'
};

/**
 * GET /api/optimized-data/:fileKey
 *
 * Example: /api/optimized-data/dataset-1?page=2&pageSize=50
 * The client sees "dataset-1", not large-data.json.
 */
router.get('/api/optimized-data/:fileKey', async (req, res) => {
  try {
    const { fileKey } = req.params;

    // Own-property check: a plain `allowedFiles[fileKey]` lookup would also accept
    // inherited keys such as "constructor" or "toString" coming from the URL.
    if (!Object.prototype.hasOwnProperty.call(allowedFiles, fileKey)) {
      return res.status(404).json({ error: '文件不存在' });
    }

    const filePath = allowedFiles[fileKey];

    // ... processing logic (left as a placeholder in this sketch)
  } catch (error) {
    // Report the failure instead of swallowing it and leaving the request hanging.
    console.error('文件处理错误:', error);
    res.status(500).json({ error: '内部服务器错误' });
  }
});

下面是文件缓存管理

javascript 复制代码

const crypto = require('crypto');
const fs = require('fs').promises;
const path = require('path');

/**
 * File-backed cache: each key is stored as one JSON file named after the
 * SHA-256 hash of the key, and an entry expires once the file's modification
 * time is older than the configured time-to-live.
 */
class OptimizedFileCache {
  /**
   * @param {string} [cacheDir='./cache'] Directory that holds the cache files.
   * @param {number} [ttlMs=3600000] Entry lifetime in milliseconds (default: 1 hour).
   */
  constructor(cacheDir = './cache', ttlMs = 3600000) {
    this.cacheDir = cacheDir;
    this.ttlMs = ttlMs;
    this.indexCache = new Map(); // in-memory index cache (not used by this class's own methods)
  }

  /**
   * Returns the cached value for `key`, or runs `generator`, stores its result
   * and returns it.
   *
   * @param {string} key Cache key.
   * @param {function(): Promise<*>} generator Produces the value on a cache miss.
   * @returns {Promise<*>} The cached or freshly generated value.
   */
  async getCache(key, generator) {
    const cachePath = this.getCachePath(key);

    // readCache() reports a miss as null, so compare against null explicitly:
    // cached falsy values such as 0, '' or false are real hits.
    const cachedData = await this.readCache(cachePath);
    if (cachedData !== null) {
      return cachedData;
    }

    // Generate fresh data and cache it
    const data = await generator();
    // JSON.stringify(undefined) is not a string, so there is nothing to write.
    if (data !== undefined) {
      await this.writeCache(cachePath, data);
    }
    return data;
  }

  /**
   * @param {string} key Cache key.
   * @returns {string} Path of the cache file for the key: `<cacheDir>/<sha256(key)>.cache`.
   */
  getCachePath(key) {
    const hash = crypto.createHash('sha256').update(key).digest('hex');
    return path.join(this.cacheDir, `${hash}.cache`);
  }

  /**
   * Reads one cache file.
   *
   * @param {string} filePath Path of the cache file.
   * @returns {Promise<*>} The parsed value, or null when the file is missing,
   *   expired or unreadable.
   */
  async readCache(filePath) {
    try {
      // Check freshness first so an expired file is never read at all.
      const stats = await fs.stat(filePath);
      if (Date.now() - stats.mtimeMs > this.ttlMs) {
        return null;
      }

      return JSON.parse(await fs.readFile(filePath, 'utf8'));
    } catch (error) {
      // A missing file is an ordinary miss; anything else is worth surfacing.
      if (error.code !== 'ENOENT') {
        console.warn('cache read failed:', error);
      }
      return null;
    }
  }

  /**
   * Serialises `data` as JSON into `filePath`, creating the directory if needed.
   *
   * @param {string} filePath Path of the cache file.
   * @param {*} data JSON-serialisable value.
   */
  async writeCache(filePath, data) {
    await fs.mkdir(path.dirname(filePath), { recursive: true });
    await fs.writeFile(filePath, JSON.stringify(data));
  }
}

这里是使用文件预读和缓冲区池

javascript 复制代码

/**
 * Pool of equally sized, reusable Buffers.
 */
class BufferPool {
  /**
   * @param {number} [poolSize=10] Number of buffers allocated up front.
   * @param {number} [bufferSize=65536] Size of every buffer in bytes.
   * @param {number} [maxPoolSize] Maximum number of idle buffers kept;
   *   defaults to 20, or to poolSize when that is larger.
   */
  constructor(poolSize = 10, bufferSize = 64 * 1024, maxPoolSize = Math.max(poolSize, 20)) {
    this.pool = [];
    this.bufferSize = bufferSize;
    this.maxPoolSize = maxPoolSize;

    for (let i = 0; i < poolSize; i++) {
      this.pool.push(Buffer.alloc(bufferSize));
    }
  }

  /**
   * @returns {Buffer} An idle buffer from the pool, or a newly allocated one
   *   when the pool is empty. Pooled buffers may still hold earlier contents.
   */
  acquire() {
    return this.pool.pop() || Buffer.alloc(this.bufferSize);
  }

  /**
   * Hands a buffer back to the pool.
   *
   * Ignored when the pool is full, when the value is not a Buffer of exactly
   * bufferSize bytes, or when that same buffer is already pooled — otherwise
   * acquire() could hand out a wrongly sized buffer or give one buffer to two users.
   *
   * @param {Buffer} buffer Buffer previously obtained from acquire().
   */
  release(buffer) {
    if (!Buffer.isBuffer(buffer) || buffer.length !== this.bufferSize) {
      return;
    }
    if (this.pool.includes(buffer)) {
      return;
    }
    if (this.pool.length < this.maxPoolSize) { // cap the number of idle buffers
      this.pool.push(buffer);
    }
  }
}

/**
 * Reads up to `length` bytes at `position` using a scratch buffer borrowed
 * from `bufferPool`.
 *
 * The bytes are copied out before the scratch buffer goes back to the pool;
 * returning a view onto the pooled buffer would let the next borrower
 * overwrite the caller's data.
 *
 * @param {number} fd Open file descriptor.
 * @param {number} position Byte offset to read from.
 * @param {number} length Number of bytes wanted; capped at the scratch buffer's size.
 * @param {{acquire: function(): Buffer, release: function(Buffer): void}} bufferPool
 * @returns {Promise<Buffer>} A buffer owned by the caller holding the bytes actually read.
 */
async function preReadOptimization(fd, position, length, bufferPool) {
  const buffer = bufferPool.acquire();
  try {
    const { bytesRead } = await read(fd, buffer, 0, Math.min(length, buffer.length), position);
    // Buffer.from(buffer) copies, so the result no longer shares memory with the pool.
    return Buffer.from(buffer.subarray(0, bytesRead));
  } finally {
    // Return the scratch buffer even when the read fails.
    bufferPool.release(buffer);
  }
}