报错大致如下:直接运行(Run)没有问题,但以调试(Debug)模式启动时,程序会卡住不动。
Traceback (most recent call last):
File "/home/mapengsen/.pycharm_helpers/pydev/_pydevd_bundle/pydevd_comm.py", line 467, in start_client
s.connect((host, port))
TimeoutError: timed out
Traceback (most recent call last):
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
[14:30:48.928250] [14:30:48.928492] [14:30:48.928599] [14:30:48.950877] [14:30:48.951222] [14:30:48.951351]
File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 883, in exec_module
File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
Could not connect to 127.0.0.1: 56945
Traceback (most recent call last):
File "/home/mapengsen/.pycharm_helpers/pydev/_pydevd_bundle/pydevd_comm.py", line 467, in start_client
s.connect((host, port))
TimeoutError: timed out
File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 26, in <module>
Traceback (most recent call last):
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 883, in exec_module
from torch._inductor.codecache import code_hash, CompiledFxGraph
File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 1424, in <module>
File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 26, in <module>
from torch._inductor.codecache import code_hash, CompiledFxGraph
File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 1424, in <module>
AsyncCompile.warm_pool()
AsyncCompile.warm_pool()
File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 1363, in warm_pool
File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 1363, in warm_pool
pool._adjust_process_count()
File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/concurrent/futures/process.py", line 697, in _adjust_process_count
pool._adjust_process_count()
File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/concurrent/futures/process.py", line 697, in _adjust_process_count
Could not connect to 127.0.0.1: 56945
后来才发现,问题出在 import 自己定义的 datasets 模块的时候:我在这个 datasets 模块里直接写了测试代码,而这段测试代码本身有错误;主程序 import 这个模块时,模块顶层的测试代码也会被执行,所以调试时一直停住不动。把 datasets 模块中报错的测试代码删除(或者移到 `if __name__ == "__main__":` 下面)就行,只留方法部分:
def collate_fn_paired_skip_invalid(batch):
    """Collate a batch of paired samples, dropping samples with missing data.

    Each sample is a tuple whose last element is a task id. The layout is
    chosen from the length of the first sample in the batch:

    * 5 elements (single task): a sample is kept when elements 0 and 2 are
      not ``None``.
    * anything else (multi-task, expected to be 7 elements): a sample is
      kept when elements 0, 2 and 4 are not ``None``.

    Args:
        batch: Non-empty sequence of sample tuples. An empty sequence raises
            ``IndexError`` because the first sample is inspected.

    Returns:
        The result of ``torch.utils.data.dataloader.default_collate`` on the
        surviving samples. If no sample survives, a tuple of empty
        placeholder tensors with the same number of fields (5 or 7), the
        last one with dtype ``torch.long``. NOTE(review): callers presumably
        need to detect and skip such an empty batch.
    """
    # Indices that must be populated for a sample to count as valid.
    required = (0, 2) if len(batch[0]) == 5 else (0, 2, 4)

    valid_batch_items = [
        item for item in batch
        if all(item[idx] is not None for idx in required)
    ]
    if valid_batch_items:
        return torch.utils.data.dataloader.default_collate(valid_batch_items)

    # Nothing usable in this batch: emit one (empty(0), empty(0, 0)) pair per
    # required field, followed by an empty long tensor for the task ids.
    placeholders = []
    for _ in required:
        placeholders.append(torch.empty(0))
        placeholders.append(torch.empty(0, 0))
    placeholders.append(torch.empty(0, dtype=torch.long))
    return tuple(placeholders)
下面这类直接写在模块顶层的测试/训练代码要删除(或者移到 `if __name__ == "__main__":` 下面),以免 import 时被执行而出错:
# --- Main training loop ---
# Starts empty; presumably filled with one entry per task combination by the
# loop that follows (the loop body is not fully visible here — TODO confirm).
trained_models_per_task = {}
# Assumes all_task_names is defined here.
# Each inner list is one task combination: two single-task entries and one
# entry that combines both tasks.
all_task_names = [['A_bioavailability_ma'], ['A_hia_hou'], ['A_bioavailability_ma', 'A_hia_hou']]
for current_task_names in all_task_names:
task_key = '+'.join(current_task_names) # 创建任务组合的键名
print(f"\n--- 开始为任务组合: {task_key} 准备数据和模型 (Paired Data) ---")