在三维计算机视觉领域,点云分类的算法研究需要高质量、多样化的数据集作为基础支撑。ModelNet40作为该领域最经典的基准数据集之一,自2015年发布以来已成为PointNet、PointCNN等顶级论文的必选数据集。本文将深入解析ModelNet40的三个主要数据版本,为研究者提供选型建议,帮助读者避开数据预处理的常见陷阱。
一、 ModelNet40 数据集全景
1.1 数据集起源
ModelNet40 源自普林斯顿大学的 ModelNet 项目(官网),其原始设计目标是为三维视觉算法提供:
40 类常见物体(如飞机、汽车、椅子等)
10 类对齐版本(ModelNet10)
多格式存储的三维 CAD 模型
该数据集首次在 CVPR 2015 论文《3D ShapeNets: A Deep Representation for Volumetric Shapes》中登场,开创了深度学习处理三维形状数据的新范式,此后也成为点云分类方法的标准基准。
1.2 核心价值
跨领域兼容性:支持计算机视觉、机器人学、图形学等多领域研究
标注规范:提供统一的物体类别标注和空间坐标系统
扩展性:包含原始 CAD 模型,便于生成不同点云密度的实验数据
二、ModelNet40 版本深度解析
数据集 | modelnet40_normal_resampled.zip | modelnet40_ply_hdf5_2048.zip | ModelNet40.zip |
---|---|---|---|
文件大小 | 1.71G | 435M | 2.04G |
内容 | point: x, y, z, normal_x, normal_y, normal_z; shape: 10k points | point: x, y, z, normal_x, normal_y, normal_z; shape: 2048 points | OFF 格式, 具体参考这里 |
训练集 / 测试集 | 9843 / 2468 | 9840 / 2468 | 9844 / 2468 |
下载地址 | modelnet40_normal_resampled.zip | modelnet40_ply_hdf5_2048.zip | ModelNet40.zip |
* 参考文献:https://github.com/zhulf0804/3D-PointCloud/blob/master/Datasets.md
三、ModelNet40 转 modelnet40_normal_resampled 数据集
以下为原创转换脚本:用它生成的数据训练 PointNet++ 模型,得到的分类精度与使用原始 modelnet40_normal_resampled 数据集时一致:
import os
import glob
import numpy as np
import trimesh
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
def normalize_point_cloud(pc):
    """Center a point cloud on its centroid and scale it into the unit sphere.

    Args:
        pc: Array-like of shape (N, D) with one point per row.

    Returns:
        A new floating-point ndarray of the same shape: the centroid is moved
        to the origin and the point farthest from it ends up at distance 1.
        The input array is never modified. An empty cloud is returned as an
        empty array, and a cloud whose points all coincide is returned
        centered but unscaled (its radius is zero, so there is nothing to
        scale by).
    """
    # Work on a private copy: the in-place arithmetic below would otherwise
    # overwrite the caller's data.
    pc = np.array(pc)
    if not np.issubdtype(pc.dtype, np.floating):
        # In-place float arithmetic on an integer array raises a casting error.
        pc = pc.astype(float)
    if pc.size == 0:
        return pc

    pc -= np.mean(pc, axis=0)
    furthest_distance = np.max(np.linalg.norm(pc, axis=1))
    # A zero radius means every point sits on the centroid; dividing would
    # turn the whole result into NaN.
    if furthest_distance > 0:
        pc /= furthest_distance
    return pc
def sample_point_cloud_with_normals(mesh, num_points=10000):
    """Sample points on a mesh surface and pair each with its face normal.

    Args:
        mesh: Mesh object passed to ``trimesh.sample.sample_surface``; its
            ``face_normals`` attribute is indexed with the sampled face ids.
        num_points: Number of samples requested from the surface sampler.

    Returns:
        A 2-D array with one row per sample: the normalized point coordinates
        followed by the normal of the face the sample was drawn from.
    """
    surface_xyz, tri_ids = trimesh.sample.sample_surface(mesh, num_points)
    unit_xyz = normalize_point_cloud(surface_xyz)
    # Normals are taken per face, looked up through the sampled face indices.
    tri_normals = mesh.face_normals[tri_ids]
    return np.concatenate((unit_xyz, tri_normals), axis=1)
def process_off_file_single_task(args):
    """Convert one mesh file into a comma-separated text file of sampled points.

    Args:
        args: A single tuple ``(input_path, output_path, num_points)``, packed
            so the function can be submitted to an executor with one argument.

    Returns:
        ``(True, input_path)`` when the file was written, otherwise
        ``(False, message)`` where ``message`` is the input path followed by
        the error text. Errors raised during loading, sampling or saving are
        returned this way instead of being propagated.
    """
    src_off, dst_txt, n_samples = args
    try:
        loaded = trimesh.load_mesh(src_off)
        if isinstance(loaded, trimesh.Trimesh):
            rows = sample_point_cloud_with_normals(loaded, n_samples)
            np.savetxt(dst_txt, rows, fmt="%.6f", delimiter=',')
            return True, src_off
        # Anything other than a single Trimesh cannot be sampled here.
        raise ValueError("不是合法的 Trimesh 对象")
    except Exception as err:
        return False, f"{src_off}: {err}"
def convert_modelnet10_dataset(
        root_path="/workspace/pointcloud_classification/ModelNet40/ModelNet40/",
        out_path="/workspace/pointcloud_classification/ModelNet40/modelnet40_normal_resampled_me/",
        num_points=10000,
        num_workers=96,
        dataset_name="modelnet40"):
    """Convert a directory tree of OFF meshes into per-class point-cloud text files.

    The input is expected to be laid out as ``root_path/<class>/<split>/*.off``
    with ``<split>`` being ``train`` or ``test``. Every mesh is converted in a
    worker process by ``process_off_file_single_task`` and written to
    ``out_path/<class>/<name>.txt``. Four index files are then written to
    ``out_path``:

    * ``<dataset_name>_shape_names.txt`` - one class name per line, sorted;
    * ``<dataset_name>_train.txt`` / ``<dataset_name>_test.txt`` - sample names
      without directory or extension, sorted;
    * ``filelist.txt`` - ``<class>/<name>.txt`` for every sample, sorted.

    Only meshes that were converted successfully are listed, so the index
    files never point at a file that does not exist.

    Args:
        root_path: Directory containing one sub-directory per class.
        out_path: Destination directory; created if missing.
        num_points: Number of points sampled from each mesh.
        num_workers: ``max_workers`` passed to the process pool.
        dataset_name: Prefix of the shape-name and split index files. All
            three files share it, so they always agree with each other.

    Raises:
        OSError: If ``root_path`` cannot be listed or an output file or
            directory cannot be created.
    """
    os.makedirs(out_path, exist_ok=True)
    shape_names = sorted(
        d for d in os.listdir(root_path)
        if os.path.isdir(os.path.join(root_path, d))
    )

    # One entry per mesh: (split, path relative to out_path, worker task).
    entries = []
    for cls in shape_names:
        cls_output_path = os.path.join(out_path, cls)
        os.makedirs(cls_output_path, exist_ok=True)
        for split in ('train', 'test'):
            pattern = os.path.join(root_path, cls, split, '*.off')
            # glob order is filesystem dependent; sort for reproducible runs.
            for off_file in sorted(glob.glob(pattern)):
                base_name = os.path.splitext(os.path.basename(off_file))[0]
                txt_name = base_name + ".txt"
                task = (off_file, os.path.join(cls_output_path, txt_name), num_points)
                entries.append((split, os.path.join(cls, txt_name), task))

    converted = []
    if entries:  # do not start a process pool when there is nothing to convert
        print(f"🚀 启动多进程({num_workers} workers)处理 {len(entries)} 个 OFF 文件")
        with ProcessPoolExecutor(max_workers=num_workers) as executor:
            # Remember which sample each future belongs to so that failed
            # conversions can be left out of the index files.
            pending = {
                executor.submit(process_off_file_single_task, task): (split, rel_path)
                for split, rel_path, task in entries
            }
            for future in tqdm(as_completed(pending), total=len(pending),
                               desc="Processing OFF files"):
                success, msg = future.result()
                if success:
                    converted.append(pending[future])
                else:
                    print(f"❌ 失败: {msg}")

    train_files = [rel_path for split, rel_path in converted if split == 'train']
    test_files = [rel_path for split, rel_path in converted if split == 'test']
    all_files = [rel_path for _, rel_path in converted]

    # Class names, one per line.
    shape_names_path = os.path.join(out_path, f"{dataset_name}_shape_names.txt")
    with open(shape_names_path, 'w', encoding="utf-8") as f:
        f.writelines(name + "\n" for name in shape_names)

    def save_list(filelist, filename, strip_path=False, strip_ext=False):
        """Write the sorted entries of ``filelist`` to ``out_path/filename``."""
        with open(os.path.join(out_path, filename), 'w', encoding="utf-8") as f:
            for item in sorted(filelist):
                name = item
                if strip_path:
                    name = os.path.basename(name)
                if strip_ext:
                    name = os.path.splitext(name)[0]
                f.write(name + "\n")

    save_list(all_files, "filelist.txt")
    save_list(train_files, f"{dataset_name}_train.txt", strip_path=True, strip_ext=True)
    save_list(test_files, f"{dataset_name}_test.txt", strip_path=True, strip_ext=True)
    print(f"\n✅ 全部转换完成,结果保存在:{out_path}")
if __name__ == "__main__":
    # Executed as a script: run the conversion with its default arguments.
    convert_modelnet10_dataset()
View Comments