GPU机器-显卡占用

背景

不论是在读研、实习还是已经正式工作的算法同学，经常会遇到想跑训练或推理任务却找不到卡的情况；即使申请到了卡，还需要重新装环境，比较麻烦。所以许多同学在使用公司内部云平台的GPU开发机时，会在跑完任务之后运行一些无用的任务来占用GPU，防止机器被kill。

下面提供一个通用的GPU占用脚本，方便大家占卡；但如果长时间不用，最好还是释放资源，留给其他同学做实验。

代码

python 复制代码

import torch
import torch.nn as nn
import argparse
import time
import threading
import os
from datetime import datetime

class GPUOccupier:
    """Keep a single CUDA device busy.

    An occupier holds one large tensor to pin device memory and runs a
    background thread that repeatedly multiplies two matrices, pausing
    between iterations so the device is busy for roughly
    ``compute_fraction`` of the time.
    """

    def __init__(self, gpu_id, memory_fraction=0.8, compute_fraction=0.7):
        """
        Configure the occupier; no GPU memory is touched until ``start()``.

        Args:
            gpu_id: CUDA device index.
            memory_fraction: Share of the device's total memory to hold (0-1).
            compute_fraction: Share of wall time the compute loop should be
                busy (0-1). A value <= 0 disables the compute loop; a value
                >= 1 runs it without pauses.
        """
        self.gpu_id = gpu_id
        self.device = torch.device(f'cuda:{gpu_id}')
        self.memory_fraction = memory_fraction
        self.compute_fraction = compute_fraction
        self.running = False
        self.memory_holder = None
        self.compute_thread = None

    def get_gpu_memory_info(self):
        """
        Return ``(total, allocated, free)`` in bytes for this device.

        ``allocated`` is what ``torch.cuda.memory_allocated`` reports for the
        current process, and ``free`` is simply ``total - allocated``; memory
        used by other processes on the same GPU is not reflected here.
        """
        torch.cuda.set_device(self.device)
        total_memory = torch.cuda.get_device_properties(self.device).total_memory
        allocated_memory = torch.cuda.memory_allocated(self.device)
        free_memory = total_memory - allocated_memory
        return total_memory, allocated_memory, free_memory

    def _hold_bytes(self, num_bytes):
        """Allocate a float32 tensor of about ``num_bytes`` bytes on the device."""
        # Each float32 element takes 4 bytes.
        num_elements = int(num_bytes) // 4
        return torch.zeros(num_elements, dtype=torch.float32, device=self.device)

    def allocate_memory(self):
        """
        Hold enough memory to reach ``memory_fraction`` of the device total.

        If the full request fails, retry with 95% of the memory the driver
        reports as actually free. If that also fails, nothing is held and the
        occupier continues without a memory reservation.
        """
        torch.cuda.set_device(self.device)
        total_memory, current_allocated, _ = self.get_gpu_memory_info()

        target_memory = int(total_memory * self.memory_fraction)
        need_allocate = target_memory - current_allocated
        if need_allocate <= 0:
            return

        try:
            self.memory_holder = self._hold_bytes(need_allocate)
        except RuntimeError as e:
            print(f"GPU {self.gpu_id}: 显存分配失败: {e}")
        else:
            print(f"GPU {self.gpu_id}: 成功分配 {need_allocate / 1024**3:.2f} GB 显存")
            return

        # Fallback: size the request from the driver's view of free memory.
        # The per-process figure from get_gpu_memory_info() ignores other
        # processes, which is usually why the first attempt failed.
        try:
            torch.cuda.empty_cache()
            device_free, _ = torch.cuda.mem_get_info(self.device)
            available = device_free * 0.95  # keep a 5% safety margin
            self.memory_holder = self._hold_bytes(available)
            print(f"GPU {self.gpu_id}: 实际分配 {available / 1024**3:.2f} GB 显存")
        except RuntimeError as e:
            self.memory_holder = None
            print(f"GPU {self.gpu_id}: 降级分配仍然失败，跳过显存占用: {e}")

    def compute_task(self):
        """
        Loop a 4096x4096 matmul while ``self.running`` is true.

        After each iteration the thread sleeps long enough that the busy
        share of wall time approximates ``compute_fraction``. Returns
        immediately when ``compute_fraction`` is <= 0.
        """
        # A non-positive fraction means "no compute load"; it would also
        # divide by zero in the sleep formula below.
        if self.compute_fraction <= 0:
            return

        torch.cuda.set_device(self.device)

        size = 4096
        a = torch.randn(size, size, device=self.device, dtype=torch.float32)
        b = torch.randn(size, size, device=self.device, dtype=torch.float32)

        while self.running:
            # perf_counter is monotonic, so the measured interval can never be
            # negative (time.sleep raises ValueError on negative input).
            start_time = time.perf_counter()

            torch.matmul(a, b)
            # Wait for the kernel to finish so the timing covers real GPU work.
            torch.cuda.synchronize(self.device)

            compute_time = time.perf_counter() - start_time

            # With compute_fraction=0.7 the loop works 70% of the time and
            # rests 30%: sleep = busy * (1 - f) / f.
            fraction = self.compute_fraction
            if 0 < fraction < 1.0:
                time.sleep(compute_time * (1 - fraction) / fraction)

    def start(self):
        """Allocate the memory reservation and launch the compute thread."""
        self.running = True

        self.allocate_memory()

        # Publish the thread only after it has started, so stop() never tries
        # to join a thread that was created but not started.
        worker = threading.Thread(target=self.compute_task)
        worker.start()
        self.compute_thread = worker

        print(f"GPU {self.gpu_id}: 占用已启动 (显存: {self.memory_fraction*100}%, 计算: {self.compute_fraction*100}%)")

    def stop(self):
        """
        Stop the compute thread and release the held memory.

        Safe to call more than once, and safe to call before ``start()``.
        """
        self.running = False
        worker = getattr(self, 'compute_thread', None)
        if worker is not None:
            worker.join()
            self.compute_thread = None

        if self.memory_holder is not None:
            # Rebind instead of `del` so the attribute still exists for any
            # later stop()/start() call.
            self.memory_holder = None
            torch.cuda.empty_cache()

        print(f"GPU {self.gpu_id}: 占用已停止")

    def get_status(self):
        """
        Return a dict describing this process's memory use on the device.

        Keys: ``gpu_id``, ``total_memory_gb``, ``allocated_memory_gb``,
        ``free_memory_gb``, ``memory_usage_percent``.
        """
        total_memory, allocated_memory, free_memory = self.get_gpu_memory_info()
        memory_usage = allocated_memory / total_memory * 100

        return {
            'gpu_id': self.gpu_id,
            'total_memory_gb': total_memory / 1024**3,
            'allocated_memory_gb': allocated_memory / 1024**3,
            'free_memory_gb': free_memory / 1024**3,
            'memory_usage_percent': memory_usage
        }

def _parse_gpu_ids(spec, num_gpus):
    """Turn a ``--gpus`` value into an ordered, de-duplicated list of ids.

    ``"all"`` expands to every visible device. Otherwise the value is a
    comma-separated list of integers; empty tokens (e.g. a trailing comma)
    are ignored. Raises ``ValueError`` if a token is not an integer.
    """
    if spec.strip().lower() == 'all':
        return list(range(num_gpus))
    ids = [int(token) for token in (part.strip() for part in spec.split(',')) if token]
    # dict.fromkeys keeps first-seen order while dropping repeats, so the same
    # GPU is never occupied twice.
    return list(dict.fromkeys(ids))


def main():
    """
    Command-line entry point: occupy the selected GPUs until stopped.

    Parses ``--gpus``, ``--memory``, ``--compute`` and ``--duration``, starts
    one ``GPUOccupier`` per valid GPU, prints a status report at most every
    10 seconds, and stops every occupier on Ctrl+C, on reaching the requested
    duration, or if startup fails part-way.
    """
    parser = argparse.ArgumentParser(description='GPU占用程序')
    parser.add_argument('--gpus', type=str, default='all', 
                        help='要占用的GPU编号，如 "0,1,2" 或 "all" 占用所有GPU')
    parser.add_argument('--memory', type=float, default=0.8, 
                        help='显存占用比例 (0-1), 默认0.8')
    parser.add_argument('--compute', type=float, default=0.7, 
                        help='计算占用比例 (0-1), 默认0.7')
    parser.add_argument('--duration', type=int, default=0, 
                        help='运行时长(秒), 0表示持续运行直到手动停止')

    args = parser.parse_args()

    # Reject fractions outside the documented range (this also catches NaN).
    for flag, value in (('--memory', args.memory), ('--compute', args.compute)):
        if not 0.0 <= value <= 1.0:
            parser.error(f"{flag} 必须在 0 到 1 之间，当前值: {value}")

    num_gpus = torch.cuda.device_count()
    try:
        gpu_ids = _parse_gpu_ids(args.gpus, num_gpus)
    except ValueError:
        parser.error(f"--gpus 参数无效: {args.gpus!r}")

    if not gpu_ids:
        print("错误: 没有可用的GPU")
        return

    print(f"检测到 {num_gpus} 个GPU")
    print(f"将占用GPU: {gpu_ids}")
    print(f"显存占用目标: {args.memory * 100}%")
    print(f"计算占用目标: {args.compute * 100}%")
    print("-" * 50)

    occupiers = []
    for gpu_id in gpu_ids:
        # Negative ids are as invalid as ids past the last device.
        if not 0 <= gpu_id < num_gpus:
            print(f"警告: GPU {gpu_id} 不存在，跳过")
            continue
        occupiers.append(GPUOccupier(gpu_id, args.memory, args.compute))

    # Every requested id was skipped: exit instead of idling forever.
    if not occupiers:
        print("错误: 没有可用的GPU")
        return

    poll_interval = 10  # seconds between status reports
    attempted = []
    try:
        for occupier in occupiers:
            # Record before starting so a half-started occupier is still
            # cleaned up in the finally block.
            attempted.append(occupier)
            occupier.start()

        print("\n占用已启动，按 Ctrl+C 停止程序")
        print("-" * 50)

        deadline = time.monotonic() + args.duration if args.duration > 0 else None
        while True:
            if deadline is None:
                wait = poll_interval
            else:
                # Never sleep past the deadline, so short durations are honoured.
                wait = min(poll_interval, max(0.0, deadline - time.monotonic()))
            time.sleep(wait)

            print(f"\n[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] GPU状态:")
            for occupier in occupiers:
                status = occupier.get_status()
                print(f"  GPU {status['gpu_id']}: "
                      f"显存使用 {status['allocated_memory_gb']:.2f}/{status['total_memory_gb']:.2f} GB "
                      f"({status['memory_usage_percent']:.1f}%)")

            if deadline is not None and time.monotonic() >= deadline:
                print(f"\n已运行 {args.duration} 秒，正在停止...")
                break

    except KeyboardInterrupt:
        print("\n\n接收到停止信号，正在清理...")
    finally:
        # Always stop the worker threads; they are non-daemon and would
        # otherwise keep the process alive after an error.
        for occupier in attempted:
            occupier.stop()

    print("\n程序已退出")

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()

执行命令

shell 复制代码

vi gpus_occupy.py
python gpus_occupy.py --gpus all --memory 0.7 --compute 0.7