GPU Machines: Keeping GPUs Occupied
Background
Whether you're a grad student, an intern, or a full-time algorithm engineer, you often can't find a free GPU when you want to run a training or inference job, and even when you do get one you may have to set up the environment from scratch, which is a hassle. So many people who regularly work on a company's internal cloud GPU dev machines run a dummy workload after their real job finishes, to keep the GPUs busy and prevent the machine from being reclaimed (killed).
Below is a general-purpose GPU-occupying script to make holding onto cards easy. That said, if you won't be using the cards for a while, please release them so other colleagues can run their experiments.
Code
python
import torch
import argparse
import time
import threading
from datetime import datetime
class GPUOccupier:
    def __init__(self, gpu_id, memory_fraction=0.8, compute_fraction=0.7):
        """
        GPU occupier.

        Args:
            gpu_id: GPU device ID
            memory_fraction: fraction of total GPU memory to hold (0-1)
            compute_fraction: fraction of time to keep the GPU computing (0-1)
        """
        self.gpu_id = gpu_id
        self.device = torch.device(f'cuda:{gpu_id}')
        self.memory_fraction = memory_fraction
        self.compute_fraction = compute_fraction
        self.running = False
        self.memory_holder = None
    def get_gpu_memory_info(self):
        """Return (total, allocated, free) memory on this GPU, in bytes."""
        torch.cuda.set_device(self.device)
        total_memory = torch.cuda.get_device_properties(self.device).total_memory
        allocated_memory = torch.cuda.memory_allocated(self.device)
        free_memory = total_memory - allocated_memory
        return total_memory, allocated_memory, free_memory
    def allocate_memory(self):
        """Allocate the target fraction of this GPU's memory."""
        torch.cuda.set_device(self.device)
        total_memory, _, free_memory = self.get_gpu_memory_info()
        # Work out how much more memory is needed to reach the target
        target_memory = int(total_memory * self.memory_fraction)
        current_allocated = torch.cuda.memory_allocated(self.device)
        need_allocate = target_memory - current_allocated
        if need_allocate > 0:
            # Each float32 element takes 4 bytes
            num_elements = need_allocate // 4
            try:
                self.memory_holder = torch.zeros(num_elements, dtype=torch.float32, device=self.device)
                print(f"GPU {self.gpu_id}: allocated {need_allocate / 1024**3:.2f} GB")
            except RuntimeError as e:
                print(f"GPU {self.gpu_id}: memory allocation failed: {e}")
                # Fall back to a smaller allocation, keeping a 5% safety margin
                available = free_memory * 0.95
                num_elements = int(available // 4)
                self.memory_holder = torch.zeros(num_elements, dtype=torch.float32, device=self.device)
                print(f"GPU {self.gpu_id}: actually allocated {available / 1024**3:.2f} GB")
    def compute_task(self):
        """Busy-loop of matmuls to keep the GPU's compute units occupied."""
        torch.cuda.set_device(self.device)
        # Two fixed random matrices are enough to generate load
        size = 4096
        a = torch.randn(size, size, device=self.device, dtype=torch.float32)
        b = torch.randn(size, size, device=self.device, dtype=torch.float32)
        while self.running:
            start_time = time.time()
            # One matmul per iteration; synchronize so the timing is real
            c = torch.matmul(a, b)
            torch.cuda.synchronize(self.device)
            compute_time = time.time() - start_time
            # Duty-cycle throttling: with compute_fraction=0.7, the loop works
            # 70% of the time and sleeps the remaining 30%
            # (the lower bound also guards against division by zero)
            if 0 < self.compute_fraction < 1.0:
                sleep_time = compute_time * (1 - self.compute_fraction) / self.compute_fraction
                time.sleep(sleep_time)
    def start(self):
        """Start occupying the GPU."""
        self.running = True
        # Grab the memory first, then spin up the compute thread
        self.allocate_memory()
        self.compute_thread = threading.Thread(target=self.compute_task)
        self.compute_thread.start()
        print(f"GPU {self.gpu_id}: occupation started "
              f"(memory: {self.memory_fraction*100}%, compute: {self.compute_fraction*100}%)")
    def stop(self):
        """Stop occupying the GPU and release the held memory."""
        self.running = False
        if hasattr(self, 'compute_thread'):
            self.compute_thread.join()
        # Drop the holder tensor and return its memory to the allocator
        if self.memory_holder is not None:
            self.memory_holder = None
            torch.cuda.empty_cache()
        print(f"GPU {self.gpu_id}: occupation stopped")
    def get_status(self):
        """Return a dict describing current memory usage on this GPU."""
        total_memory, allocated_memory, free_memory = self.get_gpu_memory_info()
        memory_usage = allocated_memory / total_memory * 100
        return {
            'gpu_id': self.gpu_id,
            'total_memory_gb': total_memory / 1024**3,
            'allocated_memory_gb': allocated_memory / 1024**3,
            'free_memory_gb': free_memory / 1024**3,
            'memory_usage_percent': memory_usage
        }
def main():
    parser = argparse.ArgumentParser(description='GPU occupier')
    parser.add_argument('--gpus', type=str, default='all',
                        help='GPU IDs to occupy, e.g. "0,1,2", or "all" for every GPU')
    parser.add_argument('--memory', type=float, default=0.8,
                        help='fraction of GPU memory to hold (0-1), default 0.8')
    parser.add_argument('--compute', type=float, default=0.7,
                        help='fraction of compute time to keep busy (0-1), default 0.7')
    parser.add_argument('--duration', type=int, default=0,
                        help='run time in seconds; 0 means run until stopped manually')
    args = parser.parse_args()
    # Resolve the list of GPUs to occupy
    if args.gpus == 'all':
        num_gpus = torch.cuda.device_count()
        gpu_ids = list(range(num_gpus))
    else:
        gpu_ids = [int(x.strip()) for x in args.gpus.split(',')]
    if not gpu_ids:
        print("Error: no GPUs available")
        return
    print(f"Detected {torch.cuda.device_count()} GPU(s)")
    print(f"Occupying GPUs: {gpu_ids}")
    print(f"Target memory usage: {args.memory * 100}%")
    print(f"Target compute usage: {args.compute * 100}%")
    print("-" * 50)
    # Create one occupier per GPU, skipping IDs that don't exist
    occupiers = []
    for gpu_id in gpu_ids:
        if gpu_id >= torch.cuda.device_count():
            print(f"Warning: GPU {gpu_id} does not exist, skipping")
            continue
        occupier = GPUOccupier(gpu_id, args.memory, args.compute)
        occupiers.append(occupier)
    # Start all occupiers
    for occupier in occupiers:
        occupier.start()
    print("\nOccupation started; press Ctrl+C to stop")
    print("-" * 50)
    try:
        start_time = time.time()
        while True:
            time.sleep(10)  # print status every 10 seconds
            print(f"\n[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] GPU status:")
            for occupier in occupiers:
                status = occupier.get_status()
                print(f"  GPU {status['gpu_id']}: "
                      f"memory {status['allocated_memory_gb']:.2f}/{status['total_memory_gb']:.2f} GB "
                      f"({status['memory_usage_percent']:.1f}%)")
            # Stop once the requested duration has elapsed
            if args.duration > 0 and time.time() - start_time > args.duration:
                print(f"\nRan for {args.duration} seconds, stopping...")
                break
    except KeyboardInterrupt:
        print("\n\nReceived stop signal, cleaning up...")
    # Stop all occupiers (runs after either a timed stop or Ctrl+C)
    for occupier in occupiers:
        occupier.stop()
    print("\nProgram exited")


if __name__ == "__main__":
    main()
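The throttling in compute_task is a simple duty cycle: each iteration times one matmul, then sleeps just long enough that the busy time makes up compute_fraction of the whole iteration. A quick sanity check of that formula in plain Python (no GPU needed; the 30 ms matmul time below is an assumed example, not a measurement):
python
# Sanity check of the duty-cycle formula from compute_task (assumed timings)
compute_fraction = 0.7
compute_time = 0.030  # suppose one 4096x4096 matmul took 30 ms
sleep_time = compute_time * (1 - compute_fraction) / compute_fraction
duty_cycle = compute_time / (compute_time + sleep_time)
print(f"sleep {sleep_time * 1000:.1f} ms per iteration, duty cycle {duty_cycle:.0%}")
# -> sleep 12.9 ms per iteration, duty cycle 70%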
Usage
shell
vi gpus_occupy.py
python gpus_occupy.py --gpus all --memory 0.7 --compute 0.7
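You can also drive the class directly from your own code instead of the CLI. A minimal sketch, assuming the script above is saved as gpus_occupy.py in the working directory (the 60-second hold is just for illustration):
python
import time
from gpus_occupy import GPUOccupier  # the script shown above

# Hold half the memory and roughly half the compute of GPU 0 for one minute
occupier = GPUOccupier(gpu_id=0, memory_fraction=0.5, compute_fraction=0.5)
occupier.start()
time.sleep(60)
occupier.stop()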