Ubuntu 22.04 一键部署MinerU1.1.0

MinerU

MinerU是一款将PDF转化为机器可读格式的工具(如markdown、json),可以很方便地抽取为任意格式。 MinerU诞生于书生-浦语的预训练过程中,我们将会集中精力解决科技文献中的符号转化问题,希望在大模型时代为科技发展做出贡献。 相比国内外知名商用产品MinerU还很年轻,如果遇到问题或者结果不及预期请到issue提交问题,同时附上相关PDF

创建一键部署mineru 自动化脚本

  • 以下测试是在CPU模式下进行

  • 以下测试在 Ubuntu 22.04 上进行;自动化脚本同时支持其它系统:CentOS/RedHat/RockyLinux/AlmaLinux/OracleLinux 8、9、10,Ubuntu(20、22、24),Debian(11、12)

  • mineru 安装参考官网

  • conda 国内下载 官网下载

  • magic-pdf 命令使用参考

    vim /root/MinerU_install.bash

    #!/bin/bash
    # -*- coding: utf-8 -*-
    # Author: CIASM
    # Updated: 2025/02/20
    #
    # One-click MinerU deployment script.
    #
    # Supported systems: CentOS/RedHat/RockyLinux/AlmaLinux/OracleLinux 8, 9, 10;
    # Ubuntu (20, 22, 24); Debian (11, 12).
    #
    # MinerU installation reference:
    #   https://mineru.readthedocs.io/zh-cn/latest/user_guide/install/install.html#id3
    #
    # magic-pdf usage reference:
    #   https://mineru.readthedocs.io/zh-cn/latest/user_guide/quick_start/command_line.html

    # Anaconda installation prefix.
    conda_prefix=/root/anaconda3

    # Install the OS-level prerequisites on a RHEL-family system.
    # Arguments: $1 - major release number (8, 9 or 10); selects the EPEL package.
    _install_basics_rhel() {
      local major=$1

      echo Installing basics...
      yum install -y "https://dl.fedoraproject.org/pub/epel/epel-release-latest-${major}.noarch.rpm"
      yum install -y net-tools wget curl
      yum install -y mesa-libGL
    }

    # Install the OS-level prerequisites on a Debian/Ubuntu system.
    _install_basics_apt() {
      local frontend_line='export DEBIAN_FRONTEND=noninteractive'

      echo Installing basics...

      echo "Pop-up block"
      # Persist the setting once instead of appending a new line on every run,
      # and export it directly rather than re-sourcing all of /etc/profile.
      grep -qxF "$frontend_line" /etc/profile 2>/dev/null \
        || echo "$frontend_line" >> /etc/profile
      export DEBIAN_FRONTEND=noninteractive

      apt-get update
      # wget is needed later to download the Anaconda installer.
      apt-get install -y wget curl

      echo "install libgl1-mesa-glx"
      # libgl1-mesa-glx is only a transitional package and is missing from
      # newer releases; fall back to libgl1 there.
      apt-get install -y libgl1-mesa-glx || apt-get install -y libgl1
    }

    # Detect the running distribution, install its OS-level prerequisites and
    # then run install_mineru. Does nothing but report when the directory
    # named by conda_prefix already exists.
    # Globals:   conda_prefix (read); OS, VERSION (written, from /etc/os-release)
    # Arguments: none
    # Outputs:   progress messages on stdout, failures on stderr
    # Exits:     1 when not run as root, when /etc/os-release is missing, when
    #            the release is unsupported, or when magic-pdf is not present
    #            after the installation
    install_basics() {
      # Check if the script is being run as root
      if [ "$(id -u)" != "0" ]; then
        echo "This script must be run as root."
        exit 1
      fi

      if [[ -f /etc/os-release ]]; then
        OS=$(grep '^ID=' /etc/os-release | cut -d'=' -f2- | tr -d '"')
        VERSION=$(awk -F= '/^VERSION_ID=/{print $2}' /etc/os-release | tr -d '"')
      else
        echo -e "\033[31mThis script only supports systems that provide /etc/os-release...\033[0m" >&2
        exit 1
      fi

      # Check whether mineru (its Anaconda prefix) is already installed
      if ! [ -d "$conda_prefix" ]; then
        echo -e "\033[32m Installing mineru for $OS $VERSION...\033[0m"

        case "$VERSION" in
          # CentOS/RedHat/Oracle/RockyLinux/AlmaLinux 8
          8|8.*)
            _install_basics_rhel 8
            install_mineru
            ;;

          # CentOS/RedHat/Oracle/RockyLinux/AlmaLinux 9
          9|9.*)
            _install_basics_rhel 9
            install_mineru
            ;;

          # VERSION_ID "10" is used both by Debian 10 and by EL 10 systems,
          # so the distribution ID decides which package manager to use.
          10|10.*)
            if [[ "$OS" == "debian" ]]; then
              _install_basics_apt
            else
              _install_basics_rhel 10
            fi
            install_mineru
            ;;

          # openEuler 20, 22, 23, 24
          20.03|22.03|23.03|24.03)
            echo "add epel repo"
            openEuler_epel_repo

            echo Installing basics...
            yum install -y net-tools wget curl
            yum install -y mesa-libGL
            install_mineru
            ;;

          # Ubuntu 20, 21, 22, 23, 24
          20.04|21.04|22.04|23.04|24.04)
            _install_basics_apt
            install_mineru
            ;;

          # Debian 11, 12
          11|12)
            _install_basics_apt
            install_mineru
            ;;

          *)
            echo -e "\033[31m Unsupported $OS $VERSION...\033[0m" >&2
            exit 1
            ;;
        esac

        # Only claim success when the magic-pdf entry point really exists in
        # the MinerU environment.
        if [[ -x "${conda_prefix}/envs/MinerU/bin/magic-pdf" ]]; then
          echo -e "\033[32m mineru for $OS $VERSION successfully installed...\033[0m"
        else
          echo -e "\033[31m mineru for $OS $VERSION installation failed...\033[0m" >&2
          exit 1
        fi
      else
        echo -e "\033[33m mineru for $OS $VERSION already installed...\033[0m"
      fi
    }

    # Obtain the system version number (VERSION_ID field of /etc/os-release)
    VERSION_ID=$(grep '^VERSION_ID=' /etc/os-release | cut -d'=' -f2 | tr -d '"')

    # Add an EPEL repository definition for openEuler.
    # openEuler 22.03/23.03/24.03 are mapped to EPEL 9, openEuler 20.03 to EPEL 8.
    # Globals:   VERSION_ID (read) - release number of the running system
    # Arguments: $1 - repo file to write (optional; default
    #                 /etc/yum.repos.d/epel.repo)
    # Outputs:   progress message on stdout, error message on stderr
    # Returns:   0 on success (also when an [epel] section is already present),
    #            1 when VERSION_ID is not a supported openEuler release
    openEuler_epel_repo() {
      local repo_file=${1:-/etc/yum.repos.d/epel.repo}
      local repo_version

      if [[ "${VERSION_ID-}" =~ ^2[2-4]\.03 ]]; then
        repo_version=9
      elif [[ "${VERSION_ID-}" == 20.03 ]]; then
        repo_version=8
      else
        # Without this guard the baseurl would be written with an empty version.
        echo "openEuler_epel_repo: unsupported openEuler version '${VERSION_ID-}'" >&2
        return 1
      fi

      echo "openEuler add epel repo"

      # Do not append a duplicate section when the script is run again.
      if [[ -f "$repo_file" ]] && grep -q '^\[epel\]' "$repo_file"; then
        return 0
      fi

      # printf instead of a here-document: the terminator of a here-document
      # must start at column 0, which breaks as soon as the script is indented.
      # NOTE(review): gpgcheck=0 disables package signature verification.
      printf '%s\n' \
        '[epel]' \
        'name=epel' \
        "baseurl=https://dl.fedoraproject.org/pub/epel/${repo_version}/Everything/x86_64/" \
        'enabled=1' \
        'gpgcheck=0' \
        'priority=1' >> "$repo_file"
    }

    # Install Anaconda, create the "MinerU" conda environment (Python 3.10),
    # install magic-pdf into it and download the model weights.
    # Globals:   conda_prefix (read) - Anaconda install prefix; /root/anaconda3
    #            is used when it is unset or empty
    # Arguments: none
    # Returns:   0 on success, 1 as soon as one of the steps fails
    install_mineru() {
      local prefix=${conda_prefix:-/root/anaconda3}
      local installer=Anaconda3-2024.06-1-Linux-x86_64.sh
      local path_line="export PATH=${prefix}/bin:\$PATH"

      echo "install anaconda"
      wget -N -P /root/ -U NoSuchBrowser/1.0 \
        "https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/${installer}" || return 1
      bash "/root/${installer}" -b -p "$prefix" || return 1

      # Make conda available in future login shells; add the line only once.
      grep -qxF "$path_line" ~/.bashrc 2>/dev/null || echo "$path_line" >> ~/.bashrc

      # ~/.bashrc commonly returns early in non-interactive shells, so set up
      # the current shell directly instead of sourcing it. etc/profile.d holds
      # scripts meant to be sourced, so conda.sh is sourced rather than put on PATH.
      export PATH="${prefix}/bin:${PATH}"
      source "${prefix}/etc/profile.d/conda.sh" || return 1

      echo "install python3.10 MinerU"
      conda create -n MinerU python=3.10 -y || return 1
      conda activate MinerU || return 1
      # Quoted so that the brackets are never treated as a glob pattern.
      pip install -U "magic-pdf[full]" \
        --extra-index-url https://wheels.myhloli.com \
        -i https://mirrors.aliyun.com/pypi/simple || return 1

      echo "Download the model weight file"
      pip install modelscope || return 1
      wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models.py \
        -O download_models.py || return 1
      python download_models.py || return 1

      # Alternative: download the model weights from Hugging Face instead.
      #pip install huggingface_hub
      #wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models_hf.py -O download_models_hf.py
      #python download_models_hf.py

      echo "test magic-pdf"
      # The MinerU environment is still active here; make sure the CLI starts.
      magic-pdf --version || return 1

      # After the installation, convert a PDF with:
      #magic-pdf -p '/root/test.pdf' -o '/root/output' -m auto
    }

    # Script entry point: the whole deployment is driven by install_basics.
    main() {
      install_basics
    }

    # Run the deployment.
    main

执行一键部署mineru

  • 整个过程大约需要 70 分钟,实际耗时取决于网络状况

    bash /root/MinerU_install.bash

magic-pdf 转换

  • magic-pdf 命令参考

  • Xmanager Power Suite 8.0.0005连接工具

    source /root/anaconda3/etc/profile.d/conda.sh
    conda activate MinerU
    magic-pdf -p '/root/test.pdf' -o '/root/output' -m auto

正在转换

转换完成,实际耗时取决于自己的硬件配置

相关推荐
码农老起4 小时前
与Aspose.pdf类似的jar库分享
java·pdf·jar
Elastic 中国社区官方博客11 小时前
Elasticsearch:使用 Azure AI 文档智能解析 PDF 文本和表格数据
大数据·人工智能·elasticsearch·搜索引擎·pdf·全文检索·azure
inxunoffice12 小时前
批量删除 PDF 中的所有图片、所有二维码图片以及指定的某张图片
pdf
aiweker14 小时前
Python PDF解析利器:pdfplumber | AI应用开发
python·pdf
inxunoffice17 小时前
批量提取 PDF 文档中指定页为新的 PDF 文档
pdf
inxunoffice17 小时前
批量在多个 PDF 的指定位置插入页,如插入封面、插入尾页
前端·pdf
hu556679819 小时前
Epub转PDF软件Calibre电子书管理软件
pdf
t梧桐树t20 小时前
xdocreport+freemarker导出docx&pdf
java·pdf
开开心心就好1 天前
自定义屏幕显示方向的实用软件
java·服务器·python·eclipse·pdf·word·excel
zhou周大哥1 天前
word,ppt,pdf 转图片
pdf·word·powerpoint