# Make Google Drive available inside the Colab runtime at /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')
# 一、下载数据集 (1. Download the dataset)
from datasets import load_dataset

# Maximum length of the captions in tokens.
max_length = 32
# Percentage of the COCO2014 train and validation splits to load.
coco_dataset_ratio = 50

# Load the COCO2014 dataset for the training, validation, and testing splits.
# Train and validation are sliced to the first `coco_dataset_ratio` percent;
# the test split is requested whole.
train_ds = load_dataset(
    path="HuggingFaceM4/COCO",
    split=f"train[:{coco_dataset_ratio}%]",
)
valid_ds = load_dataset(
    path="HuggingFaceM4/COCO",
    split=f"validation[:{coco_dataset_ratio}%]",
)
test_ds = load_dataset(
    path="HuggingFaceM4/COCO",
    split="test",
)
# 二、保存数据集 (2. Save the dataset)
# Write each split into its own sub-directory of the Drive folder.
dataset_path = '/content/drive/My Drive/COCO_Dataset_all'
train_ds.save_to_disk(f"{dataset_path}/train")
valid_ds.save_to_disk(f"{dataset_path}/validation")
test_ds.save_to_disk(f"{dataset_path}/test")
# 三、取出数据集 (3. Load the dataset back)
from datasets import load_from_disk

# Reload the three splits from their sub-directories on Drive.
# NOTE(review): this folder ('COCO_Dataset') differs from the one the save
# step in this file writes to ('COCO_Dataset_all') — confirm which is intended.
dataset_path = '/content/drive/My Drive/COCO_Dataset'

train_ds = load_from_disk(dataset_path + '/train')
valid_ds = load_from_disk(dataset_path + '/validation')
test_ds = load_from_disk(dataset_path + '/test')