# Script 1 (Python): convert Pascal VOC XML annotations into YOLO-format .txt label files.
import os
import xml.etree.ElementTree as ET
def convert(size, box):
    """Turn a pixel-space box into a normalized center/size box.

    Args:
        size: Image dimensions as ``(width, height)``.
        box: Box edges in the order ``(xmin, xmax, ymin, ymax)``.

    Returns:
        A tuple ``(x_center, y_center, width, height)`` where the x values
        are scaled by ``1 / size[0]`` and the y values by ``1 / size[1]``.
    """
    # Reciprocals are computed first and multiplied in afterwards, which
    # keeps the floating-point results identical to dividing once per axis
    # scale rather than once per value.
    scale_x = 1. / size[0]
    scale_y = 1. / size[1]

    center_x = (box[0] + box[1]) / 2.0
    center_y = (box[2] + box[3]) / 2.0
    span_x = box[1] - box[0]
    span_y = box[3] - box[2]

    return (
        center_x * scale_x,
        center_y * scale_y,
        span_x * scale_x,
        span_y * scale_y,
    )
def convert_annotation(xml_file, output_dir, labels):
    """Convert one Pascal VOC XML annotation into a YOLO label file.

    Writes ``<output_dir>/<xml basename>.txt`` containing one line per kept
    object in the form ``class_id x_center y_center width height``. The four
    box values are the normalized numbers returned by ``convert``.

    Objects marked ``<difficult>1</difficult>`` and objects whose ``<name>``
    is not in ``labels`` are skipped. The output file is written even when
    no object is kept; it is then empty.

    Args:
        xml_file: Path to the VOC XML annotation file.
        output_dir: Directory that receives the ``.txt`` file. It is created
            if it does not exist yet.
        labels: Sequence of class names; an object's class id is the index
            of its name in this sequence.
    """
    # Load the XML file.
    root = ET.parse(xml_file).getroot()

    # Read the image dimensions used for normalization.
    size = root.find('size')
    img_w = int(size.find('width').text)
    img_h = int(size.find('height').text)

    # One YOLO row per kept object.
    rows = []
    for obj in root.iter('object'):
        difficult = obj.find('difficult')
        # An empty <difficult/> element has text None; treat that (and a
        # blank value) as "not difficult" instead of crashing in int().
        flag = (difficult.text or "").strip() if difficult is not None else ""
        if flag and int(flag) == 1:
            continue

        cls = obj.find('name').text
        if cls not in labels:
            continue
        cls_id = labels.index(cls)

        xmlbox = obj.find('bndbox')
        # convert() expects the edges ordered (xmin, xmax, ymin, ymax).
        box = (
            float(xmlbox.find('xmin').text),
            float(xmlbox.find('xmax').text),
            float(xmlbox.find('ymin').text),
            float(xmlbox.find('ymax').text),
        )
        bb = convert((img_w, img_h), box)
        # YOLO rows start with the class id, followed by the box values.
        rows.append(" ".join([str(cls_id)] + [str(value) for value in bb]))

    # Write the YOLO label file, creating the target directory on demand.
    # An empty output_dir means "current directory", which needs no creation.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    file_name = os.path.splitext(os.path.basename(xml_file))[0]
    with open(os.path.join(output_dir, file_name + ".txt"), "w", encoding="utf-8") as f:
        f.write("\n".join(rows))
def main(voc_dir, output_dir, labels):
    """Convert every ``.xml`` file under ``<voc_dir>/Annotations``.

    Args:
        voc_dir: Dataset root; its ``Annotations`` subfolder is scanned.
        output_dir: Passed through to ``convert_annotation`` for each file.
        labels: Passed through to ``convert_annotation`` for each file.
    """
    annotations_dir = os.path.join(voc_dir, "Annotations")
    # Only entries whose name ends in ".xml" are treated as annotations.
    xml_names = (
        entry for entry in os.listdir(annotations_dir) if entry.endswith(".xml")
    )
    for entry in xml_names:
        convert_annotation(os.path.join(annotations_dir, entry), output_dir, labels)
if __name__ == "__main__":
    # Class names contained in the dataset; an object's class id is the
    # position of its name in this list.
    labels = ['nodule']
    # VOC dataset root (the folder holding Annotations, JPEGImages, ...).
    voc_dir = "path_to_your_voc_dataset"
    # Destination folder for the converted YOLO-format label files.
    output_dir = "path_to_your_yolo_annotations"
    main(voc_dir=voc_dir, output_dir=output_dir, labels=labels)
# Script 2 (Python): split the images and their YOLO label files into train/val/test sets.
import os
import random
from shutil import copyfile
def split_dataset(image_folder, txt_folder, output_folder, split_ratio=(0.8, 0.1, 0.1), seed=None):
    """Randomly split images and their label files into train/val/test sets.

    Creates ``<output_folder>/{train,val,test}/{images,labels}`` and copies
    each image into one subset. When ``<txt_folder>/<image stem>.txt``
    exists it is copied into the matching ``labels`` folder; images without
    a label file are still copied.

    Args:
        image_folder: Folder containing the source images. Files ending in
            ``.jpg``, ``.jpeg`` or ``.png`` (any letter case) are used.
        txt_folder: Folder containing the label ``.txt`` files.
        output_folder: Root folder of the split dataset.
        split_ratio: ``(train, val, test)`` fractions. Only the first two
            are read; the test set receives every remaining image.
        seed: Optional seed for a reproducible split. With ``None`` the
            module-level ``random`` generator is used.
    """
    subsets = ('train', 'val', 'test')

    # Ensure output folders exist.
    for subset in subsets:
        for kind in ('images', 'labels'):
            os.makedirs(os.path.join(output_folder, subset, kind), exist_ok=True)

    # Collect image files. The extension check is case-insensitive so that
    # files such as "IMG_001.JPG" are not silently left out, and the list is
    # sorted because os.listdir() order is arbitrary, which would otherwise
    # defeat a fixed seed.
    image_files = sorted(
        name for name in os.listdir(image_folder)
        if name.lower().endswith(('.jpg', '.jpeg', '.png'))
    )
    if seed is None:
        random.shuffle(image_files)
    else:
        # A private generator keeps the global random state untouched.
        random.Random(seed).shuffle(image_files)

    num_images = len(image_files)
    num_train = int(split_ratio[0] * num_images)
    num_val = int(split_ratio[1] * num_images)
    partitions = (
        image_files[:num_train],
        image_files[num_train:num_train + num_val],
        image_files[num_train + num_val:],
    )

    # Copy images, and their labels where present, to the subset folders.
    for subset, images_list in zip(subsets, partitions):
        for image_file in images_list:
            image_path = os.path.join(image_folder, image_file)
            copyfile(image_path, os.path.join(output_folder, subset, 'images', image_file))

            txt_file = os.path.splitext(image_file)[0] + '.txt'
            txt_path = os.path.join(txt_folder, txt_file)
            # Copy the corresponding label file only if it exists.
            if os.path.exists(txt_path):
                copyfile(txt_path, os.path.join(output_folder, subset, 'labels', txt_file))
if __name__ == "__main__":
    # Destination root of the split dataset.
    output_dataset_path = "./dataset"
    # Folder containing the label files.
    txt_folder_path = "./Labels"
    # Folder containing the images.
    image_folder_path = "./JPEGImages"
    split_dataset(
        image_folder=image_folder_path,
        txt_folder=txt_folder_path,
        output_folder=output_dataset_path,
    )