3D点云分类数据集：ModelNet40转modelnet40_normal_resampled

在三维计算机视觉领域，点云分类的算法研究需要高质量、多样化的数据集作为基础支撑。ModelNet40作为该领域最经典的基准数据集之一，自2015年发布以来已成为PointNet、PointCNN等顶级论文的必选数据集。本文将深入解析ModelNet40的三个主要数据版本，为研究者提供选型建议，帮助读者避开数据预处理的常见陷阱。

一、 ModelNet40 数据集全景

1.1 数据集起源
ModelNet40 源自普林斯顿大学的 ModelNet 项目（官网：https://modelnet.cs.princeton.edu/），其原始设计目标是为三维视觉算法提供：

40 类常见物体（如飞机、汽车、椅子等）
10 类对齐版本（ModelNet10）
多格式存储的三维 CAD 模型
该数据集首次在 CVPR 2015 论文《3D ShapeNets: A Deep Representation for Volumetric Shapes》中登场，开创了深度学习处理三维形状数据的新范式。

1.2 核心价值
跨领域兼容性：支持计算机视觉、机器人学、图形学等多领域研究
标注规范：提供统一的物体类别标注和空间坐标系统
扩展性：包含原始 CAD 模型，便于生成不同点云密度的实验数据

二、ModelNet40 版本深度解析

数据集	modelnet40_normal_resampled.zip	modelnet40_ply_hdf5_2048.zip	ModelNet40.zip
文件大小	1.71G	435M	2.04G
内容	point: x, y, z, normal_x, normal_y, normal_z; shape: 10k points	point: x, y, z; normal_x, normal_y, normal_z; shape: 2048 points	off格式, 具体参考这里
训练集 / 测试集	9843 / 2468	9840 / 2468	9844 / 2468
下载地址	modelnet40_normal_resampled.zip	modelnet40_ply_hdf5_2048.zip	ModelNet40.zip

* 参考文献：https://github.com/zhulf0804/3D-PointCloud/blob/master/Datasets.md

三、ModelNet40 转 modelnet40_normal_resampled 数据集

以下为原创转换脚本。经 PointNet++ 模型验证，使用转换后的数据训练得到的分类精度与使用原始 modelnet40_normal_resampled 数据集时一致：

import os
import glob
import numpy as np
import trimesh
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed


def normalize_point_cloud(pc):
    """Center a point cloud on its centroid and scale it into the unit sphere.

    Args:
        pc: Array-like with one point per row (2-D, e.g. ``(N, 3)``).

    Returns:
        A new floating-point ``numpy.ndarray`` of the same shape whose
        centroid is at the origin and whose furthest point lies at distance 1.
        The input is never modified. An empty input is returned as an empty
        copy, and a cloud whose points all coincide is returned centered
        (all zeros) without scaling, so no NaN/inf values are produced.
    """
    # np.array copies by default, so the caller's buffer is left untouched.
    pts = np.array(pc)
    # In-place float arithmetic on an integer array would raise a casting error.
    if not np.issubdtype(pts.dtype, np.floating):
        pts = pts.astype(np.float64)
    if pts.size == 0:
        return pts

    pts -= np.mean(pts, axis=0)
    furthest_distance = np.max(np.linalg.norm(pts, axis=1))
    # A zero radius means every point equals the centroid; skip the division.
    if furthest_distance > 0:
        pts /= furthest_distance
    return pts


def sample_point_cloud_with_normals(mesh, num_points=10000):
    """Sample points on a mesh surface and attach the normal of each hit face.

    Args:
        mesh: Mesh object accepted by ``trimesh.sample.sample_surface`` and
            exposing ``face_normals``.
        num_points: Number of surface samples to draw.

    Returns:
        A 2-D array whose rows hold the normalized point coordinates followed
        by the normal of the face the point was sampled from.
    """
    surface_pts, hit_faces = trimesh.sample.sample_surface(mesh, num_points)
    # Each sample inherits the normal of the face it landed on.
    per_point_normals = mesh.face_normals[hit_faces]
    unit_pts = normalize_point_cloud(surface_pts)
    return np.concatenate((unit_pts, per_point_normals), axis=1)


def process_off_file_single_task(args):
    """Convert one mesh file into a comma-separated point/normal text file.

    Intended to run inside a worker process, so failures are reported through
    the return value instead of being raised.

    Args:
        args: Tuple ``(input_path, output_path, num_points)``.

    Returns:
        ``(True, input_path)`` on success, otherwise ``(False, message)`` where
        the message is the input path followed by the error text.
    """
    input_path, output_path, num_points = args
    try:
        loaded = trimesh.load_mesh(input_path)
        if isinstance(loaded, trimesh.Trimesh):
            rows = sample_point_cloud_with_normals(loaded, num_points)
            np.savetxt(output_path, rows, fmt="%.6f", delimiter=',')
            return True, input_path
        # Anything other than a single Trimesh is rejected.
        raise ValueError("不是合法的 Trimesh 对象")
    except Exception as err:
        return False, f"{input_path}: {err}"


def convert_modelnet10_dataset(
        root_path="/workspace/pointcloud_classification/ModelNet40/ModelNet40/",
        out_path="/workspace/pointcloud_classification/ModelNet40/modelnet40_normal_resampled_me/",
        num_points=10000,
        num_workers=96,
        list_prefix="modelnet40"):
    """Convert a ModelNet OFF tree into the ``*_normal_resampled`` text layout.

    The input is expected as ``root_path/<class>/{train,test}/*.off``. For each
    OFF file a ``<class>/<name>.txt`` file is written under ``out_path``, plus
    the index files ``<list_prefix>_shape_names.txt``,
    ``<list_prefix>_train.txt``, ``<list_prefix>_test.txt`` and
    ``filelist.txt``.

    The function keeps its historical name; its defaults target ModelNet40.

    Args:
        root_path: Directory containing one sub-directory per class.
        out_path: Destination directory (created if missing).
        num_points: Number of surface points sampled per model.
        num_workers: Upper bound on worker processes; the effective count is
            additionally capped at the number of CPUs reported by the OS.
        list_prefix: Prefix of the shape-names/train/test index files. All
            three share it so that a loader looking for one naming scheme
            finds every file.
    """
    os.makedirs(out_path, exist_ok=True)
    all_classes = sorted(
        d for d in os.listdir(root_path)
        if os.path.isdir(os.path.join(root_path, d))
    )

    # One entry per OFF file: (off_file, output_txt, split, rel_path).
    jobs = []
    for cls in all_classes:
        cls_input_path = os.path.join(root_path, cls)
        cls_output_path = os.path.join(out_path, cls)
        os.makedirs(cls_output_path, exist_ok=True)

        for split in ('train', 'test'):
            pattern = os.path.join(cls_input_path, split, '*.off')
            for off_file in sorted(glob.glob(pattern)):
                base_name = os.path.splitext(os.path.basename(off_file))[0]
                output_txt = os.path.join(cls_output_path, base_name + ".txt")
                rel_path = os.path.join(cls, base_name + ".txt")
                jobs.append((off_file, output_txt, split, rel_path))

    # The work is CPU-bound, so more processes than CPUs only adds overhead.
    worker_count = max(1, min(num_workers, os.cpu_count() or 1))

    # Only successfully converted files are indexed, so the lists never point
    # at a .txt file that was not written.
    split_files = {'train': [], 'test': []}
    failed_count = 0

    print(f"🚀 启动多进程（{worker_count} workers）处理 {len(jobs)} 个 OFF 文件")
    with ProcessPoolExecutor(max_workers=worker_count) as executor:
        futures = {
            executor.submit(
                process_off_file_single_task, (off_file, output_txt, num_points)
            ): (split, rel_path)
            for off_file, output_txt, split, rel_path in jobs
        }
        for done in tqdm(as_completed(futures), total=len(futures), desc="Processing OFF files"):
            success, msg = done.result()
            if success:
                split, rel_path = futures[done]
                split_files[split].append(rel_path)
            else:
                failed_count += 1
                print(f"❌ 失败: {msg}")

    # Every class directory is listed, in sorted order, one name per line.
    with open(os.path.join(out_path, f"{list_prefix}_shape_names.txt"), 'w',
              encoding="utf-8") as names_file:
        names_file.writelines(name + "\n" for name in all_classes)

    def save_list(filelist, filename, strip_path=False, strip_ext=False):
        """Write the sorted entries of *filelist* to ``out_path/filename``."""
        with open(os.path.join(out_path, filename), 'w', encoding="utf-8") as list_file:
            for item in sorted(filelist):
                name = item
                if strip_path:
                    name = os.path.basename(name)
                if strip_ext:
                    name = os.path.splitext(name)[0]
                list_file.write(name + "\n")

    train_files = split_files['train']
    test_files = split_files['test']
    save_list(train_files + test_files, "filelist.txt")
    save_list(train_files, f"{list_prefix}_train.txt", strip_path=True, strip_ext=True)
    save_list(test_files, f"{list_prefix}_test.txt", strip_path=True, strip_ext=True)

    if failed_count:
        print(f"⚠️ {failed_count} 个文件转换失败，已从列表文件中剔除")
    print(f"\n✅ 全部转换完成，结果保存在：{out_path}")


# Script entry point: run the conversion with the function's default arguments.
if __name__ == "__main__":
    convert_modelnet10_dataset()