步骤

下载MRPC验证集映射文件 dev_ids.tsv，并把它放到 <data_dir>/MRPC/ 目录下（默认即 glue_data/MRPC/dev_ids.tsv）；脚本不会自动下载该文件，而是直接从这个位置读取它。
运行download_glue_data.py

''' Script for downloading all GLUE data.
Note: for legal reasons, we are unable to host MRPC.
You can either use the version hosted by the SentEval team, which is already tokenized,
or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually.
For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example).
You should then rename and place specific files in a folder (see below for an example).
mkdir MRPC
cabextract MSRParaphraseCorpus.msi -d MRPC
cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt
cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt
rm MRPC/_*
rm MSRParaphraseCorpus.msi
1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now.
2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray!
'''
import io
import os
import sys
import shutil
import argparse
import tempfile
import urllib.request
import zipfile
import urllib.error  # explicit: format_mrpc catches urllib.error.HTTPError

# Task names accepted by --tasks (besides the special value "all").
TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "QNLI", "RTE", "WNLI", "diagnostic"]

# Download URL per task. MRPC is intentionally absent: it is assembled by
# format_mrpc() from the raw paraphrase files plus a local dev_ids.tsv.
TASK2PATH = {
    "CoLA": 'https://dl.fbaipublicfiles.com/glue/data/CoLA.zip',
    "SST": 'https://dl.fbaipublicfiles.com/glue/data/SST-2.zip',
    "QQP": 'https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip',
    "STS": 'https://dl.fbaipublicfiles.com/glue/data/STS-B.zip',
    "MNLI": 'https://dl.fbaipublicfiles.com/glue/data/MNLI.zip',
    "QNLI": 'https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip',
    "RTE": 'https://dl.fbaipublicfiles.com/glue/data/RTE.zip',
    "WNLI": 'https://dl.fbaipublicfiles.com/glue/data/WNLI.zip',
    "diagnostic": 'https://dl.fbaipublicfiles.com/glue/data/AX.tsv',
}

MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt'
MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'


def download_and_extract(task, data_dir):
    """Download the zip archive for *task* and unpack it into *data_dir*.

    Args:
        task: key of TASK2PATH whose value points at a zip archive.
        data_dir: directory the archive is extracted into.

    The archive is saved as "<task>.zip" in the current working directory and
    removed afterwards, even when the download or extraction fails.
    """
    print("Downloading and extracting %s..." % task)
    if task == "MNLI":
        print("\tNote (12/10/20): This script no longer downloads SNLI. You will need to manually download and format the data to use SNLI.")
    data_file = "%s.zip" % task
    try:
        urllib.request.urlretrieve(TASK2PATH[task], data_file)
        with zipfile.ZipFile(data_file) as zip_ref:
            zip_ref.extractall(data_dir)
    finally:
        # Do not leave a partial/complete archive behind on failure.
        if os.path.exists(data_file):
            os.remove(data_file)
    print("\tCompleted!")


def format_mrpc(data_dir, path_to_data):
    """Build MRPC train.tsv / dev.tsv / test.tsv under <data_dir>/MRPC.

    Args:
        data_dir: root output directory; files are written to <data_dir>/MRPC.
        path_to_data: directory holding msr_paraphrase_train.txt and
            msr_paraphrase_test.txt. When empty, both files are downloaded
            from MRPC_TRAIN / MRPC_TEST into <data_dir>/MRPC.

    Raises:
        AssertionError: if either raw MRPC file is missing.
        FileNotFoundError: if <data_dir>/MRPC/dev_ids.tsv is missing.

    On an HTTP error while downloading the raw files, a message is printed and
    the function returns without writing anything.
    """
    print("Processing MRPC...")
    mrpc_dir = os.path.join(data_dir, "MRPC")
    os.makedirs(mrpc_dir, exist_ok=True)
    if path_to_data:
        mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt")
        mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt")
    else:
        try:
            mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
            mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
            urllib.request.urlretrieve(MRPC_TRAIN, mrpc_train_file)
            urllib.request.urlretrieve(MRPC_TEST, mrpc_test_file)
        except urllib.error.HTTPError:
            print("Error downloading MRPC")
            return
    assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
    assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file

    # Test split: drop the label column and prepend a running index.
    with io.open(mrpc_test_file, encoding='utf-8') as data_fh, \
            io.open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding='utf-8') as test_fh:
        data_fh.readline()  # skip the source header; a new one is written below
        test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
        for idx, row in enumerate(data_fh):
            label, id1, id2, s1, s2 = row.strip().split('\t')
            test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))

    # NOTE: dev_ids.tsv is not downloaded here (TASK2PATH has no "MRPC" entry);
    # it must already be present in <data_dir>/MRPC.
    dev_ids_file = os.path.join(mrpc_dir, "dev_ids.tsv")
    if not os.path.isfile(dev_ids_file):
        raise FileNotFoundError(
            "MRPC dev ID file not found at %s; download dev_ids.tsv manually "
            "and place it there before running this script." % dev_ids_file)

    # A set of tuples gives O(1) membership tests for the split below.
    dev_ids = set()
    with io.open(dev_ids_file, encoding='utf-8') as ids_fh:
        for row in ids_fh:
            dev_ids.add(tuple(row.strip().split('\t')))

    # Train/dev split: rows whose (id1, id2) pair is listed go to dev.tsv.
    with io.open(mrpc_train_file, encoding='utf-8') as data_fh, \
            io.open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding='utf-8') as train_fh, \
            io.open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding='utf-8') as dev_fh:
        header = data_fh.readline()
        train_fh.write(header)
        dev_fh.write(header)
        for row in data_fh:
            label, id1, id2, s1, s2 = row.strip().split('\t')
            out_fh = dev_fh if (id1, id2) in dev_ids else train_fh
            out_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
    print("\tCompleted!")


def download_diagnostic(data_dir):
    """Download the diagnostic TSV to <data_dir>/diagnostic/diagnostic.tsv."""
    print("Downloading and extracting diagnostic...")
    diagnostic_dir = os.path.join(data_dir, "diagnostic")
    os.makedirs(diagnostic_dir, exist_ok=True)
    data_file = os.path.join(diagnostic_dir, "diagnostic.tsv")
    urllib.request.urlretrieve(TASK2PATH["diagnostic"], data_file)
    print("\tCompleted!")


def get_tasks(task_names):
    """Turn a comma-separated task string into a list of task names.

    Args:
        task_names: e.g. "CoLA,RTE"; if any item is "all", every task in
            TASKS is returned.

    Returns:
        List of task names, in the order given (or TASKS order for "all").

    Raises:
        AssertionError: if a name is not in TASKS.
    """
    task_names = task_names.split(',')
    if "all" in task_names:
        return list(TASKS)
    tasks = []
    for task_name in task_names:
        assert task_name in TASKS, "Task %s not found!" % task_name
        tasks.append(task_name)
    return tasks


def main(arguments):
    """Parse command-line *arguments* and fetch each requested task.

    Args:
        arguments: list of CLI tokens (typically sys.argv[1:]).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data')
    parser.add_argument('--tasks', help='tasks to download data for as a comma separated string',
                        type=str, default='all')
    parser.add_argument('--path_to_mrpc',
                        help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_test.txt',
                        type=str, default='')
    args = parser.parse_args(arguments)

    # makedirs so that a nested --data_dir (e.g. "out/glue") also works.
    os.makedirs(args.data_dir, exist_ok=True)
    tasks = get_tasks(args.tasks)

    for task in tasks:
        if task == 'MRPC':
            format_mrpc(args.data_dir, args.path_to_mrpc)
        elif task == 'diagnostic':
            download_diagnostic(args.data_dir)
        else:
            download_and_extract(task, args.data_dir)


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))

报错：Error downloading standard development IDs for MRPC. You will need to manually split your data.

原因及解决办法参考：https://blog.csdn.net/qq_43800119/article/details/125352066

下载成功

参考资料

本文下载脚本参考：https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e

【深度学习】glue基准数据集下载相关推荐

动手学深度学习：Fashion-MNIST数据集下载失败解决方法
学习<动手学深度学习>的过程中,下载图片分类数据集时遇到了下载失败的问题: import torch import torchvision # 计算机视觉相关库 from torch.ut ...
深度学习目标检测数据集下载
coco与voc数据集下载与准备 coco与voc数据集下载在远程服务器上测试目标检测算法需要用到测试集,最常用的是coco2014/2017和voc07/12数据集. coco数据集的地址为htt ...
深度学习公开语音识别数据集下载 | 论文下载|音频数据集|corpus ——简记
转载【数据集】计算机视觉，深度学习，数据挖掘数据集整理
金融美国劳工部统计局官方发布数据上证A股日线数据,1999.12.09 至 2016.06.08,前复权,1095支股票深证A股日线数据,1999.12.09 至 2016.06.08,前复权, ...
【神经网络与深度学习】CIFAR10数据集介绍，并使用卷积神经网络训练图像分类模型——[附完整训练代码]
[神经网络与深度学习]CIFAR-10数据集介绍,并使用卷积神经网络训练模型--[附完整代码] 一.CIFAR-10数据集介绍 1.1 CIFAR-10数据集的内容 1.2 CIFAR-10数据集的结 ...
深度学习常用的数据集，包括各种数据跟图像数据。
1. 免费数据集下载(持续更新中...) 2.［导读］ "大数据时代",数据为王!无论是数据挖掘还是目前大热的深度学习领域都离不开"大数据".大公司们一般会 ...
数据集大全：25个深度学习的开放数据集
介绍深度学习的关键是训练.无论是从图像处理到语音识别,每个问题都有其独特的细微差别和方法. 但是,你可以从哪里获得这些数据?现在你看到的很多研究论文都使用专有数据集,而这些数据集通常不会向公众发布. ...
二十五个深度学习相关公开数据集
转 [干货]二十五个深度学习相关公开数据集 2018年04月18日 13:42:53 阅读数:758 (选自Analytics Vidhya:作者:Pranav Dar:磐石编译) 目录介绍图像处 ...
深度学习初学者，如何下载常用公开数据集并使用呢？
深度学习初学者,如何下载常用公开数据集并使用呢? 1.前言 2.官方文档怎样看 3.动手写代码 4.如何可视化遇到问题:ssl.SSLCertVerificationError: [SSL: CER ...

【深度学习】glue基准数据集下载

步骤

参考资料

【深度学习】glue基准数据集下载相关推荐

最新文章

热门文章