记录一次 Linux/Android 读取大文件失败

背景

最近在做项目的过程中，有一个需求是连续读取 5 个以上 500MB+ 的文件，并对其进行解压缩。常见的做法是通过 std::ifstream 或者 fopen 来进行读取。但是在第二次读取的时候，获取到的文件大小就一直为 0。

代码

main.cpp

cpp 复制代码

#include <mutex>
#include <unordered_map>
#include <cstring>
#include <vector>
#include <iostream>
#include <fstream>
#include <sstream>
#include <chrono>
#include <unistd.h>
#include <dirent.h>
#include <sys/stat.h>
#include <algorithm>
#include "md5.h"

using namespace std;

// Maps a nibble value to its lowercase hexadecimal ASCII digit:
// 0..9 become '0'..'9', everything else is offset from 'a' - 10
// (so 10..15 become 'a'..'f'). The result wraps to unsigned char.
static inline unsigned char hex2char(unsigned char hex)
{
	const int offset = (hex <= 9) ? '0' : ('a' - 10);
	return static_cast<unsigned char>(hex + offset);
}

// Builds a fingerprint string for a memory buffer.
//
// buf: start of the data to hash.
// len: number of bytes at buf.
//
// Returns the 32 lowercase hex characters of the MD5 digest of buf[0..len),
// immediately followed by len written in decimal.
static std::string md5_digest(unsigned char *buf, size_t len)
{
	unsigned char digest[16];
	struct MD5Context ctx;
	MD5Init(&ctx);
	MD5Update(&ctx, buf, len);
	MD5Final(digest, &ctx);

	std::string result;
	// 32 hex characters plus room for the decimal length suffix.
	result.reserve(sizeof(digest) * 2 + 20);
	for (unsigned char byte : digest)
	{
		result.push_back(static_cast<char>(hex2char((byte >> 4) & 0xf)));
		result.push_back(static_cast<char>(hex2char(byte & 0xf)));
	}
	// std::to_string has a matching overload for size_t on every platform,
	// which avoids a printf format/type mismatch and any fixed-size scratch
	// buffer for the length text.
	result += std::to_string(len);
	return result;
}

// Reads the file named by argv[1] completely into memory, reopens the same
// file to query its size a second time, and prints the file name together
// with the md5_digest() fingerprint of the bytes that were read.
//
// Returns 0 on success, 1 on wrong usage and -1 on any I/O or allocation
// failure.
int main(int argc, char* argv[]) {
    if (argc != 2) {
        cout << "Usage: " << argv[0] << " <filename>" << endl;
        return 1;
    }

    std::ifstream input_file(argv[1], std::ios::binary);
    if (!input_file) {
        std::cout<<"Failed to open input file.\n"<<std::endl;
        return -1;
    }

    // Get the file size by seeking to the end and reading the position back.
    input_file.seekg(0, std::ios::end);
    const std::streamoff input_size = input_file.tellg();
    input_file.seekg(0, std::ios::beg);
    // tellg() reports -1 on failure; never use that as an allocation size.
    if (input_size < 0 || !input_file) {
        std::cout<<"Failed to get input file size."<<std::endl;
        return -1;
    }

    // Reject sizes that do not survive the conversion to size_t (possible
    // when size_t is narrower than the stream offset type).
    const size_t input_bytes = static_cast<size_t>(input_size);
    if (static_cast<std::streamoff>(input_bytes) != input_size) {
        std::cout<<"Input file is too large to load into memory."<<std::endl;
        return -1;
    }

    // The trailing () value-initializes the buffer, i.e. fills it with zeros.
    char* input_data = new (std::nothrow) char[input_bytes]();
    if (input_data == nullptr) {
        // Memory allocation failed.
        std::cout<<"ZIP Failed to allocate memory input_data"<<std::endl;
        return -1;
    }

    input_file.read(input_data, input_size);
    // A short or failed read would otherwise go unnoticed and the digest
    // would be computed over partly zero-filled memory.
    if (!input_file || input_file.gcount() != input_size) {
        std::cout<<"Failed to read input file."<<std::endl;
        delete[] input_data;
        return -1;
    }
    input_file.close();

    // Second open of the same file: query the size again so that it can be
    // compared with the size seen by the first stream.
    std::ifstream input_file2(argv[1], std::ios::binary);
    if (!input_file2) {
        std::cout<<"Failed to open input file.\n"<<std::endl;
        delete[] input_data;
        return -1;
    }
    input_file2.seekg(0, std::ios::end);
    const std::streamoff input_size2 = input_file2.tellg();
    input_file2.seekg(0, std::ios::beg);
    if (input_size2 != input_size) {
        // Report the discrepancy but keep going: the data of the first read
        // is complete, so the digest below is still meaningful.
        std::cout<<"Reopened file size mismatch, first : "<<input_size
                 <<" second : "<<input_size2<<std::endl;
    }

    std::string md5_model = md5_digest(reinterpret_cast<unsigned char*>(input_data), input_bytes);

    std::cout<< "model name : "<< argv[1] <<"model md5 : "<<md5_model.c_str()<<endl;

    delete[] input_data;

    return 0;

}

md5.h

cpp 复制代码

// Copyright 2007 Google Inc. All Rights Reserved.
// Author: liuli@google.com (Liu Li)
#ifndef COMMON_MD5_H__
#define COMMON_MD5_H__
#include <stddef.h>  /* size_t, used by MD5Update */
#include <stdint.h>
typedef uint32_t u32;
typedef uint8_t u8;
/* Running state of one MD5 computation. */
struct MD5Context {
  u32 buf[4];   /* current hash state (four 32-bit words) */
  u32 bits[2];  /* number of message bits processed so far, low word first */
  u8 in[64];    /* input bytes buffered until a full 64-byte block is available */
};
/* Prepares ctx for a new digest computation. */
void MD5Init(struct MD5Context *ctx);
/* Feeds len bytes starting at buf into the digest held in ctx. */
void MD5Update(struct MD5Context *ctx, unsigned char const *buf, size_t len);
/* Finishes the computation, stores the 16-byte result in digest and clears ctx. */
void MD5Final(unsigned char digest[16], struct MD5Context *ctx);
#endif  // COMMON_MD5_H__

md5.cpp

cpp 复制代码

/*
 * written by Colin Plumb in 1993, no copyright is claimed.
 * This code is in the public domain; do with it what you wish.
 *
 * Equivalent code is available from RSA Data Security, Inc.
 * This code has been tested against that, and is equivalent,
 * except that you don't need to include two pages of legalese
 * with every copy.
 *
 * To compute the message digest of a chunk of bytes, declare an
 * MD5Context structure, pass it to MD5Init, call MD5Update as
 * needed on buffers full of bytes, and then call MD5Final, which
 * will fill a supplied 16-byte array with the digest.
 */
#include <string.h>
#include "md5.h"
/*
 * byteReverse(buf, longs): rewrites `longs` consecutive 4-byte groups of buf
 * in place so that each group, read as a native u32, holds the value formed
 * by taking its bytes in little-endian order.
 * When WORDS_BIGENDIAN is not defined the macro form below expands to
 * nothing, so calls cost nothing.
 */
#ifndef WORDS_BIGENDIAN
#define byteReverse(buf, len)   /* Nothing */
#else
/*
 * Note: this code is harmless on little-endian machines.
 */
static void byteReverse(unsigned char *buf, unsigned longs)
{
  u32 t;
  /* do/while: `longs` must be at least 1, a count of 0 would wrap around. */
  do {
    /* Assemble the word from its bytes, buf[0] being the least significant. */
    t = (u32) ((unsigned) buf[3] << 8 | buf[2]) << 16 |
      ((unsigned) buf[1] << 8 | buf[0]);
    /* Store it back as a native word; NOTE(review): assumes buf is suitably
       aligned for u32 access - confirm for all callers. */
    *(u32 *) buf = t;
    buf += 4;
  } while (--longs);
}
#endif
/* Forward declaration: the block transform is defined at the end of the file. */
static void MD5Transform(u32 buf[4], u32 const in[16]);
/*
 * Start MD5 accumulation.  Set bit count to 0 and buffer to mysterious
 * initialization constants.
 */
void MD5Init(struct MD5Context *ctx)
{
  ctx->buf[0] = 0x67452301;
  ctx->buf[1] = 0xefcdab89;
  ctx->buf[2] = 0x98badcfe;
  ctx->buf[3] = 0x10325476;
  ctx->bits[0] = 0;
  ctx->bits[1] = 0;
}
/*
 * Update context to reflect the concatenation of another buffer full
 * of bytes.
 *
 * ctx: context previously set up by MD5Init.
 * buf: start of the bytes to add.
 * len: number of bytes at buf.
 *
 * Complete 64-byte blocks are hashed immediately; any remainder is kept
 * in ctx->in until more data arrives.
 */
void MD5Update(struct MD5Context *ctx, unsigned char const *buf, size_t len)
{
  u32 t;
  /* Update bitcount: bits[0]/bits[1] form a 64-bit count of message bits */
  t = ctx->bits[0];
  if ((ctx->bits[0] = t + ((u32) len << 3)) < t)
    ctx->bits[1]++;         /* Carry from low to high */
  ctx->bits[1] += len >> 29;  /* The bits of len * 8 above the low 32 */
  t = (t >> 3) & 0x3f;        /* Bytes already buffered in ctx->in */
  /* Handle any leading odd-sized chunks */
  if (t) {
    unsigned char *p = (unsigned char *) ctx->in + t;
    t = 64 - t;               /* Free space left in the buffered block */
    if (len < t) {
      /* Not enough data to complete the block: just buffer it */
      memcpy(p, buf, len);
      return;
    }
    memcpy(p, buf, t);
    byteReverse(ctx->in, 16);
    MD5Transform(ctx->buf, (u32 *) ctx->in);
    buf += t;
    len -= t;
  }
  /* Process data in 64-byte chunks */
  while (len >= 64) {
    memcpy(ctx->in, buf, 64);
    byteReverse(ctx->in, 16);
    MD5Transform(ctx->buf, (u32 *) ctx->in);
    buf += 64;
    len -= 64;
  }
  /* Handle any remaining bytes of data. */
  memcpy(ctx->in, buf, len);
}
/*
 * Final wrapup - pad the buffered data to a 64-byte boundary with the bit
 * pattern 1 0*, append the 64-bit count of bits processed (low word
 * first), run the last transform and write out the digest.
 *
 * digest: receives the 16-byte result.
 * ctx:    context that was fed through MD5Update; zeroed on return.
 */
void MD5Final(unsigned char digest[16], struct MD5Context *ctx)
{
  unsigned count;
  unsigned char *p;
  /* Compute number of bytes mod 64 */
  count = (ctx->bits[0] >> 3) & 0x3F;
  /* Set the first char of padding to 0x80.  This is safe since there is
     always at least one byte free */
  p = ctx->in + count;
  *p++ = 0x80;
  /* Bytes of padding needed to make 64 bytes */
  count = 64 - 1 - count;
  /* Pad out to 56 mod 64 */
  if (count < 8) {
    /* Two lots of padding:  Pad the first block to 64 bytes */
    memset(p, 0, count);
    byteReverse(ctx->in, 16);
    MD5Transform(ctx->buf, (u32 *) ctx->in);
    /* Now fill the next block with 56 bytes */
    memset(ctx->in, 0, 56);
  } else {
    /* Pad block to 56 bytes */
    memset(p, 0, count - 8);
  }
  byteReverse(ctx->in, 14);
  /* Append length in bits and transform.  ctx->in is a byte array, so the
     positions of 32-bit words 14 and 15 must be scaled by sizeof(u32) to
     land on bytes 56..63, the only bytes the padding above leaves unset. */
  memcpy(&ctx->in[14 * sizeof(u32)], &ctx->bits[0], sizeof(u32));
  memcpy(&ctx->in[15 * sizeof(u32)], &ctx->bits[1], sizeof(u32));
  MD5Transform(ctx->buf, (u32 *) ctx->in);
  byteReverse((unsigned char *) ctx->buf, 4);
  memcpy(digest, ctx->buf, 16);
  memset(ctx, 0, sizeof(*ctx));        /* In case it's sensitive */
}
/* The four core functions - F1 is optimized somewhat */
/* Each combines three 32-bit words bitwise; one is used per group of 16
   steps in MD5Transform. */
/* #define F1(x, y, z) (x & y | ~x & z) */
#define F1(x, y, z) (z ^ (x & (y ^ z)))
/* F2 is F1 with its arguments rotated. */
#define F2(x, y, z) F1(z, x, y)
#define F3(x, y, z) (x ^ y ^ z)
#define F4(x, y, z) (y ^ (x | ~z))
/* This is the central step in the MD5 algorithm. */
/* Adds f(x, y, z) + data to w, rotates w left by s bits, then adds x.
   w is modified in place and evaluated several times, so it must be a
   plain variable without side effects. */
#define MD5STEP(f, w, x, y, z, data, s) \
  ( w += f(x, y, z) + data,  w = w<<s | w>>(32-s),  w += x )
/*
 * The core of the MD5 algorithm, this alters an existing MD5 hash to
 * reflect the addition of 16 longwords of new data.  MD5Update blocks
 * the data and converts bytes into longwords for this routine.
 *
 * buf: the four state words, updated in place.
 * in:  the 16 input words of one 64-byte block (read only).
 *
 * The u32 additions below are meant to wrap around, which is why the
 * unsigned-integer-overflow sanitizer check is disabled for this function.
 */
__attribute__((no_sanitize("unsigned-integer-overflow"))) static void MD5Transform(u32 buf[4], u32 const in[16])
{
  u32 a, b, c, d;
  /* Work on local copies of the state; they are folded back at the end. */
  a = buf[0];
  b = buf[1];
  c = buf[2];
  d = buf[3];
  /* Steps 1-16, using F1 */
  MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
  MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
  MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
  MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
  MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
  MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
  MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
  MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
  MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
  MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
  MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
  MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
  MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
  MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
  MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
  MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
  /* Steps 17-32, using F2 */
  MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
  MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
  MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
  MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
  MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
  MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
  MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
  MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
  MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
  MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
  MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
  MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
  MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
  MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
  MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
  MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
  /* Steps 33-48, using F3 */
  MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
  MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
  MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
  MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
  MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
  MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
  MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
  MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
  MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
  MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
  MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
  MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
  MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
  MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
  MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
  MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
  /* Steps 49-64, using F4 */
  MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
  MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
  MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
  MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
  MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
  MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
  MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
  MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
  MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
  MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
  MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
  MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
  MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
  MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
  MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
  MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
  /* Fold the result of this block into the running state. */
  buf[0] += a;
  buf[1] += b;
  buf[2] += c;
  buf[3] += d;
}

解决办法

https://www.cnblogs.com/schips/p/linux_c_read_write_file_size_over_2g.html

cpp 复制代码

   // Query the size of the already opened stream fp: move to the end,
   // capture the position with fgetpos(), then rewind to the start.
   // NOTE(review): fpos_t is an opaque type in ISO C; treating pos as a byte
   // count relies on the target libc defining it as an integer - confirm.
   fpos_t pos;
   fseek(fp,0,SEEK_END);
   fgetpos(fp,&pos);
   fseek( fp, 0, SEEK_SET);

把获取文件大小改成下面的方式就可以了：

cpp 复制代码

    // Scan every model file once to find the largest input file size and the
    // largest size recorded in model_infos, so buffers can be sized for the
    // worst case.
    for (size_t i = 0; i < model_infos.models.size(); i++)
    {
        FILE* input_file = fopen((odm_path + model_infos.models[i].name).c_str(), "rb");
        if (!input_file) {
            ALOGE("ZIP Failed to open input file.\n");
            return NULL;
        }

        // Seek to the end and read the position back through fgetpos().
        // NOTE(review): using pos as a number below relies on fpos_t being
        // an integer type on the target libc - confirm.
        fpos_t pos;
        const bool size_ok = fseek(input_file, 0, SEEK_END) == 0 &&
                             fgetpos(input_file, &pos) == 0 &&
                             fseek(input_file, 0, SEEK_SET) == 0;
        // The handle is only needed for the size query; closing it here
        // guarantees it is released on every path, including the error
        // returns below.
        fclose(input_file);
        if (!size_ok) {
            ALOGE("ZIP Failed to get input file size.\n");
            return NULL;
        }

        auto input_size = pos;
        if(input_size == 0)
        {
            ALOGE("ZIP pos one Failed , pos is 0.\n");
            return NULL;
        }
        input_max_size = std::max<long long>(input_size,input_max_size);
        decompressed__max_size =std::max<long long>(decompressed__max_size,model_infos.models[i].size);
    }