目录
[formalgeo7k_v1 数据集解题数据可视化](#formalgeo7k_v1-数据集解题数据可视化)
Daniel21Ding/sft-geo170k-qa-qwen2-5-vl-3b-epoch20
mradermacher/Qwen3-VL-4B-Geo170k-GGUF
formalgeo7k_v1 数据集解题数据可视化
python
"""
FormalGeo7K 数据集 OpenCV 可视化脚本
图片路径: formalgeo7k_v1/diagrams/
JSON路径: formalgeo7k_v1/problems/
"""
import os
import cv2
import numpy as np
import json
import random
def load_all_problems(problems_dir):
    """Load every problem JSON file found directly inside ``problems_dir``.

    Files are expected to be named ``<problem_id>.json`` where the part
    before the first dot is an integer. JSON files that do not follow this
    naming are reported and skipped instead of aborting the whole load.

    Args:
        problems_dir: Path of the ``problems`` folder.

    Returns:
        dict: ``{problem_id: problem_data}`` with keys inserted in ascending
        id order. Empty when ``problems_dir`` is not an existing directory.
    """
    problems = {}
    # isdir (not exists): a regular file at this path cannot be listed.
    if not os.path.isdir(problems_dir):
        print(f"problems 目录不存在: {problems_dir}")
        return problems

    numbered_files = []
    for filename in os.listdir(problems_dir):
        if not filename.endswith('.json'):
            continue
        try:
            problem_id = int(filename.split('.')[0])
        except ValueError:
            # A stray file such as "notes.json" must not kill the load.
            print(f"跳过非数字命名的文件: {filename}")
            continue
        numbered_files.append((problem_id, filename))

    # os.listdir() order is arbitrary; sort so callers iterate by id.
    for problem_id, filename in sorted(numbered_files):
        filepath = os.path.join(problems_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            problems[problem_id] = json.load(f)

    print(f"加载了 {len(problems)} 个问题")
    return problems
def load_problem_by_id(problems_dir, problem_id):
    """Read and parse the JSON file of a single problem.

    The file is looked up as ``<problems_dir>/<problem_id>.json`` and decoded
    as UTF-8. Errors from opening or parsing the file propagate to the caller.

    Args:
        problems_dir: Path of the ``problems`` folder.
        problem_id: Problem id used as the file stem.

    Returns:
        dict: The parsed problem data.
    """
    json_name = f"{problem_id}.json"
    json_path = os.path.join(problems_dir, json_name)
    with open(json_path, 'r', encoding='utf-8') as handle:
        problem_data = json.load(handle)
    return problem_data
def get_image_for_problem(diagrams_dir, problem_img_name):
    """Load the diagram image that belongs to a problem.

    Args:
        diagrams_dir: Path of the ``diagrams`` folder.
        problem_img_name: Image file name (e.g. ``"1.png"``).

    Returns:
        tuple: ``(image, image_path)`` where ``image`` is the array returned
        by ``cv2.imread``. ``(None, None)`` when the name is empty, the path
        is not an existing regular file, or the file cannot be decoded.
    """
    # An empty name would make the joined path point at the directory itself.
    if not problem_img_name:
        return None, None

    img_path = os.path.join(diagrams_dir, problem_img_name)
    # isfile (not exists): a directory is never a loadable image.
    if not os.path.isfile(img_path):
        return None, None

    img = cv2.imread(img_path)
    # cv2.imread signals a failed decode by returning None, not by raising.
    if img is None:
        return None, None
    return img, img_path
def visualize_problem(problem_data, diagrams_dir, save_dir=None, wait_key=0):
    """Print one problem and show its diagram with a caption panel below it.

    The panel holds a title line, the answer and the question text. The
    composed image is shown in an OpenCV window and, when ``save_dir`` is
    given, also written to ``<save_dir>/problem_<problem_id>.png``.

    Args:
        problem_data: Problem JSON data (a dict).
        diagrams_dir: Path of the ``diagrams`` folder.
        save_dir: Directory to save the composed image to; nothing is saved
            when it is ``None`` or empty.
        wait_key: Delay passed to ``cv2.waitKey`` (0 waits for a key press).
    """
    problem_id = problem_data.get('problem_id', 'N/A')
    problem_text_cn = problem_data.get('problem_text_cn', '')
    problem_text_en = problem_data.get('problem_text_en', '')
    problem_answer = problem_data.get('problem_answer', '')
    problem_img_name = problem_data.get('problem_img', '')
    source = problem_data.get('source', '')
    problem_level = problem_data.get('problem_level', '')

    print(f"\n{'='*70}")
    print(f"问题 ID: {problem_id} | 来源: {source} | 难度: {problem_level}")
    print(f"中文: {problem_text_cn}")
    print(f"英文: {problem_text_en}")
    print(f"答案: {problem_answer}")
    print(f"{'='*70}\n")

    # Load the diagram
    img, img_path = get_image_for_problem(diagrams_dir, problem_img_name)
    if img is None:
        print(f"未找到图片: {problem_img_name}")
        return
    print(f"图片: {img_path}")
    h, w = img.shape[:2]

    # cv2.putText's Hershey fonts replace characters they cannot draw with
    # '?', so Chinese text is unreadable on the canvas. Draw the Chinese
    # text only when it is pure ASCII; otherwise use the English text.
    if all(ord(ch) < 128 for ch in problem_text_cn):
        question_label, question_text = "Question (CN):", problem_text_cn
    else:
        question_label, question_text = "Question (EN):", problem_text_en

    # Break the question into fixed-width chunks, one per drawn line
    max_chars = 65
    line_step = 22
    question_lines = [question_text[i:i + max_chars]
                      for i in range(0, len(question_text), max_chars)]

    # Keep at least 180 px, but grow the panel so no question line is
    # drawn below the bottom edge of the canvas.
    text_height = max(180, 95 + len(question_lines) * line_step)
    canvas = np.full((h + text_height, w, 3), 240, dtype=np.uint8)
    canvas[:h, :w] = img

    # Title
    cv2.putText(canvas, f"Problem {problem_id} (Level {problem_level}) - {source}",
                (10, h + 25), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 1)
    # Answer
    cv2.putText(canvas, f"Answer: {problem_answer}", (10, h + 50),
                cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 100, 0), 1)
    # Question heading and text
    cv2.putText(canvas, question_label, (10, h + 75),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)
    for i, line in enumerate(question_lines):
        y_pos = h + 95 + i * line_step
        cv2.putText(canvas, line, (15, y_pos),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.45, (80, 80, 80), 1)

    # Honour save_dir the same way visualize_multiple_problems does.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
        save_path = os.path.join(save_dir, f"problem_{problem_id}.png")
        cv2.imwrite(save_path, canvas)
        print(f"已保存: {save_path}")

    cv2.imshow(f"FormalGeo7K - Problem {problem_id}", canvas)
    cv2.waitKey(wait_key)
    cv2.destroyAllWindows()
def visualize_multiple_problems(problems, diagrams_dir, num_samples=9, save_dir=None):
    """Show the diagrams of several problems tiled into one grid image.

    The first ``num_samples`` problems (in dict order) are considered, and
    those without a loadable image are skipped. Each diagram is resized to a
    350x350 tile labelled with its problem id. The grid is shown in an
    OpenCV window and, when ``save_dir`` is given, written to
    ``<save_dir>/formalgeo7k_grid.png``.

    Args:
        problems: ``{problem_id: problem_data}`` mapping.
        diagrams_dir: Path of the ``diagrams`` folder.
        num_samples: Maximum number of problems to consider; values below 1
            select nothing.
        save_dir: Directory to save the grid image to; nothing is saved when
            it is ``None`` or empty.
    """
    tile_size = 350
    header_h = 50  # band reserved for the title so it never covers a tile

    # Clamp at 0: a negative slice bound would count from the end instead.
    problem_ids = list(problems)[:max(num_samples, 0)]

    tiles = []
    for pid in problem_ids:
        problem_img_name = problems[pid].get('problem_img', '')
        img, _ = get_image_for_problem(diagrams_dir, problem_img_name)
        if img is None:
            continue
        img = cv2.resize(img, (tile_size, tile_size))
        # Problem id label
        cv2.putText(img, f"ID:{pid}", (5, 25),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
        tiles.append(img)

    if not tiles:
        print("没有可显示的图片")
        return

    # Near-square layout; allocate only the rows that are actually used.
    cols = int(np.ceil(np.sqrt(len(tiles))))
    rows = int(np.ceil(len(tiles) / cols))
    grid = np.full((header_h + rows * tile_size, cols * tile_size, 3),
                   255, dtype=np.uint8)
    for i, tile in enumerate(tiles):
        r, c = divmod(i, cols)
        top = header_h + r * tile_size
        left = c * tile_size
        grid[top:top + tile_size, left:left + tile_size] = tile

    cv2.putText(grid, f"FormalGeo7K Samples ({len(tiles)} problems)",
                (10, 35), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)

    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
        save_path = os.path.join(save_dir, "formalgeo7k_grid.png")
        cv2.imwrite(save_path, grid)
        print(f"已保存: {save_path}")

    cv2.imshow("FormalGeo7K Samples", grid)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
def find_problems_without_images(problems, diagrams_dir):
    """Find the problems that have no diagram file on disk.

    A problem counts as missing its image when its ``problem_img`` entry is
    absent or empty, or when ``<diagrams_dir>/<problem_img>`` is not an
    existing regular file.

    Args:
        problems: ``{problem_id: problem_data}`` mapping.
        diagrams_dir: Path of the ``diagrams`` folder.

    Returns:
        list: ``(problem_id, problem_img_name)`` tuples, in ``problems`` order.
    """
    missing = []
    for pid, problem in problems.items():
        problem_img_name = problem.get('problem_img', '')
        # Check for an empty name first: joining '' yields the directory
        # itself, which exists and would hide the missing image.
        if not problem_img_name or not os.path.isfile(
                os.path.join(diagrams_dir, problem_img_name)):
            missing.append((pid, problem_img_name))
    return missing
def explore_random_problems(problems, diagrams_dir, num_samples=5, save_dir=None):
    """Visualise a random selection of problems, one window at a time.

    At most ``num_samples`` distinct problems are drawn with
    ``random.sample``, and each is passed to ``visualize_problem`` with
    ``wait_key=0``.

    Args:
        problems: ``{problem_id: problem_data}`` mapping.
        diagrams_dir: Path of the ``diagrams`` folder.
        num_samples: Upper bound on how many problems to show.
        save_dir: Forwarded unchanged to ``visualize_problem``.
    """
    candidate_ids = list(problems.keys())
    # Never request more ids than exist.
    sample_count = min(num_samples, len(candidate_ids))
    chosen_ids = random.sample(candidate_ids, sample_count)
    for chosen_id in chosen_ids:
        visualize_problem(
            problems[chosen_id],
            diagrams_dir,
            save_dir=save_dir,
            wait_key=0,
        )
if __name__ == "__main__":
    # ========== Configuration ==========
    base_dir = r"E:\project\ChatTutor-main\my_datasets\formalgeo7k_v1"
    problems_dir = os.path.join(base_dir, "problems")
    diagrams_dir = os.path.join(base_dir, "diagrams")
    save_dir = "./formalgeo_viz"

    # 1. Load every problem
    problems = load_all_problems(problems_dir)
    if not problems:
        print("没有找到 problems 数据")
        # exit() is a helper injected by the `site` module and is not always
        # available. Raising SystemExit always works, and the non-zero
        # status tells the calling shell that nothing was loaded.
        raise SystemExit(1)
    print(f"数据集统计: 共 {len(problems)} 个问题")

    # 2. Report problems whose diagram is missing (first five only)
    missing = find_problems_without_images(problems, diagrams_dir)
    if missing:
        print(f"有 {len(missing)} 个问题缺少图片")
        for pid, img_name in missing[:5]:
            print(f"  Problem {pid}: 缺少图片 {img_name}")

    # 3. Show every problem one by one; each window blocks until a key press
    for pid in problems:
        visualize_problem(problems[pid], diagrams_dir, save_dir=save_dir, wait_key=0)

    # 4. Grid overview (disabled; uncomment the call to enable)
    print("\n正在显示批量网格...")
    # visualize_multiple_problems(problems, diagrams_dir, num_samples=9, save_dir=save_dir)

    # 5. Random exploration (disabled; uncomment the call to enable)
    print("\n正在随机探索...")
    # explore_random_problems(problems, diagrams_dir, num_samples=3, save_dir=save_dir)

    print("\n完成!")
图像生成数据集GeoGPT4V
caishihao/GeoGPT4V-1.0
caishihao/GeoGPT4V-1.1
如果你需要的是已经包含图片的几何数据集,GeoGPT4V 更直接:
- 包含 4.9K 个由 GPT-4/GPT-4V 生成的几何问题
- 文本和图片完全对齐
- 结合了 19K 开源数据,总共约 24K 个样本
下载方式
bash
git clone https://github.com/Lanyu0303/GeoGPT4V_Project.git
Geo170K
Hugging Face 可以直接下载
4928.png
Daniel21Ding/sft-geo170k-qa-qwen2-5-vl-3b-epoch20
Kate-03/Qwen3-VL-4B-Geo170k
其训练目标是让它能同时理解一道几何题的文本描述和配套的几何图形,然后输出正确的解题步骤和最终答案。
这个模型是一个专门为解决几何问题而微调的多模态大模型,你可以把它理解成一个"几何题的AI解题老师"。
它的核心能力和背景是这样的:
核心能力:看懂几何题,给出解题过程
它是一个多模态模型,能同时"看懂"几何题的文字描述和几何图形。你可以像向老师提问一样,把一道带图的几何题输入给它,它就能输出详细的解题步骤和答案。它的前身和基础是通义实验室的视觉语言模型 Qwen2.5-VL-3B-Instruct。
mradermacher/Qwen3-VL-4B-Geo170k-GGUF
这个模型的核心特点是:它是一个在特定数学数据集上微调过的视觉语言模型,并且被转换成了专门针对 CPU 或显存不足的 GPU 优化的 GGUF 格式。
GeoX
GitHub 地址:
https://github.com/InternScience/GeoX?tab=readme-ov-file
数据集:
InternScience/GeoX-data
https://huggingface.co/datasets/InternScience/GeoX-data/viewer/default/train?row=2