步骤

  1. 下载MRPC验证集映射文件 dev_ids.tsv,并放到 <data_dir>/MRPC/ 目录下(默认为 glue_data/MRPC/dev_ids.tsv),脚本会从该位置读取它
  2. 运行download_glue_data.py
''' Script for downloading all GLUE data.
Note: for legal reasons, we are unable to host MRPC.
You can either use the version hosted by the SentEval team, which is already tokenized,
or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually.
For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example).
You should then rename and place specific files in a folder (see below for an example).
mkdir MRPC
cabextract MSRParaphraseCorpus.msi -d MRPC
cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt
cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt
rm MRPC/_*
rm MSRParaphraseCorpus.msi
1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now.
2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray!
'''
import io
import os
import sys
import shutil
import argparse
import tempfile
import urllib.request
import zipfile
import urllib.error


# Task names accepted by --tasks (in addition to the special value "all").
TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "QNLI", "RTE", "WNLI", "diagnostic"]

# Download URL for every task handled by download_and_extract() /
# download_diagnostic(). MRPC has no entry: it is assembled by format_mrpc().
# "STS" points at the STS-B archive and "QQP" at the QQP archive, so that
# requesting a single task fetches the matching dataset.
TASK2PATH = {
    "CoLA": 'https://dl.fbaipublicfiles.com/glue/data/CoLA.zip',
    "SST": 'https://dl.fbaipublicfiles.com/glue/data/SST-2.zip',
    "QQP": 'https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip',
    "STS": 'https://dl.fbaipublicfiles.com/glue/data/STS-B.zip',
    "MNLI": 'https://dl.fbaipublicfiles.com/glue/data/MNLI.zip',
    "QNLI": 'https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip',
    "RTE": 'https://dl.fbaipublicfiles.com/glue/data/RTE.zip',
    "WNLI": 'https://dl.fbaipublicfiles.com/glue/data/WNLI.zip',
    "diagnostic": 'https://dl.fbaipublicfiles.com/glue/data/AX.tsv',
}

MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt'
MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'


def download_and_extract(task, data_dir):
    """Download the zip archive registered for `task` and unpack it.

    Args:
        task: key of TASK2PATH naming the archive to fetch.
        data_dir: directory the archive is extracted into.

    The archive is stored as "<task>.zip" in the current working directory
    and removed again afterwards, even if the download or extraction fails.
    """
    print("Downloading and extracting %s..." % task)
    if task == "MNLI":
        print("\tNote (12/10/20): This script no longer downloads SNLI. You will need to manually download and format the data to use SNLI.")
    data_file = "%s.zip" % task
    try:
        urllib.request.urlretrieve(TASK2PATH[task], data_file)
        with zipfile.ZipFile(data_file) as zip_ref:
            zip_ref.extractall(data_dir)
    finally:
        # Do not leave a partial or unreadable archive behind.
        if os.path.exists(data_file):
            os.remove(data_file)
    print("\tCompleted!")


def format_mrpc(data_dir, path_to_data):
    """Build MRPC train/dev/test TSV files under `<data_dir>/MRPC`.

    Args:
        data_dir: root output directory; files go to `<data_dir>/MRPC`.
        path_to_data: directory holding msr_paraphrase_train.txt and
            msr_paraphrase_test.txt. If empty, both files are downloaded
            from MRPC_TRAIN / MRPC_TEST into `<data_dir>/MRPC`.

    The development split is defined by `<data_dir>/MRPC/dev_ids.tsv`
    (one tab-separated ID pair per line), which this script does not
    download; it must be placed there before running.

    Raises:
        FileNotFoundError: if dev_ids.tsv is missing.
        AssertionError: if the train or test source file is missing.
    """
    print("Processing MRPC...")
    mrpc_dir = os.path.join(data_dir, "MRPC")
    os.makedirs(mrpc_dir, exist_ok=True)

    # Fail early, before any download, with a message that says what to do.
    dev_ids_file = os.path.join(mrpc_dir, "dev_ids.tsv")
    if not os.path.isfile(dev_ids_file):
        raise FileNotFoundError(
            "Standard development IDs for MRPC not found at %s. "
            "Download dev_ids.tsv manually and place it there." % dev_ids_file)

    if path_to_data:
        mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt")
        mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt")
    else:
        mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
        mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
        try:
            urllib.request.urlretrieve(MRPC_TRAIN, mrpc_train_file)
            urllib.request.urlretrieve(MRPC_TEST, mrpc_test_file)
        except urllib.error.URLError:
            # URLError is the parent of HTTPError, so this also covers
            # connection-level failures, not only HTTP status errors.
            print("Error downloading MRPC")
            return
    assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
    assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file

    # Test split: drop the label column and prepend a running index.
    with io.open(mrpc_test_file, encoding='utf-8') as data_fh, \
            io.open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding='utf-8') as test_fh:
        data_fh.readline()  # skip the source header; a new one is written below
        test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
        for idx, row in enumerate(data_fh):
            label, id1, id2, s1, s2 = row.strip().split('\t')
            test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))

    # A set of tuples gives constant-time membership tests in the loop below.
    with io.open(dev_ids_file, encoding='utf-8') as ids_fh:
        dev_ids = set(tuple(row.strip().split('\t')) for row in ids_fh)

    # Train/dev split: rows whose ID pair is listed in dev_ids go to dev.tsv.
    with io.open(mrpc_train_file, encoding='utf-8') as data_fh, \
            io.open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding='utf-8') as train_fh, \
            io.open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding='utf-8') as dev_fh:
        header = data_fh.readline()
        train_fh.write(header)
        dev_fh.write(header)
        for row in data_fh:
            label, id1, id2, s1, s2 = row.strip().split('\t')
            out_fh = dev_fh if (id1, id2) in dev_ids else train_fh
            out_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))

    print("\tCompleted!")


def download_diagnostic(data_dir):
    """Download the diagnostic set to `<data_dir>/diagnostic/diagnostic.tsv`."""
    print("Downloading and extracting diagnostic...")
    diagnostic_dir = os.path.join(data_dir, "diagnostic")
    os.makedirs(diagnostic_dir, exist_ok=True)
    data_file = os.path.join(diagnostic_dir, "diagnostic.tsv")
    urllib.request.urlretrieve(TASK2PATH["diagnostic"], data_file)
    print("\tCompleted!")
    return


def get_tasks(task_names):
    """Parse a comma-separated task string into a list of task names.

    Args:
        task_names: e.g. "CoLA,RTE", or a string containing "all".

    Returns:
        TASKS if "all" is among the names, otherwise the requested names
        in the order given.

    Raises:
        AssertionError: if a requested name is not in TASKS.
    """
    task_names = task_names.split(',')
    if "all" in task_names:
        tasks = TASKS
    else:
        tasks = []
        for task_name in task_names:
            assert task_name in TASKS, "Task %s not found!" % task_name
            tasks.append(task_name)
    return tasks


def main(arguments):
    """Parse command-line `arguments` and fetch every requested task."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', help='directory to save data to',
                        type=str, default='glue_data')
    parser.add_argument('--tasks', help='tasks to download data for as a comma separated string',
                        type=str, default='all')
    parser.add_argument('--path_to_mrpc',
                        help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_test.txt',
                        type=str, default='')
    args = parser.parse_args(arguments)

    # makedirs also handles nested output paths such as "a/b/glue_data".
    os.makedirs(args.data_dir, exist_ok=True)
    tasks = get_tasks(args.tasks)

    for task in tasks:
        if task == 'MRPC':
            format_mrpc(args.data_dir, args.path_to_mrpc)
        elif task == 'diagnostic':
            download_diagnostic(args.data_dir)
        else:
            download_and_extract(task, args.data_dir)


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
  • 报错:Error downloading standard development IDs for MRPC. You will need to manually split your data.
  • 原因及解决办法参考:https://blog.csdn.net/qq_43800119/article/details/125352066
  3. 下载成功

参考资料

本文下载脚本参考:https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e

【深度学习】glue基准数据集下载相关推荐

  1. 动手学深度学习:Fashion-MNIST数据集下载失败解决方法

    学习<动手学深度学习>的过程中,下载图片分类数据集时遇到了下载失败的问题: import torch import torchvision # 计算机视觉相关库 from torch.ut ...

  2. 深度学习目标检测数据集下载

    coco与voc数据集下载与准备 coco与voc数据集下载 在远程服务器上测试目标检测算法需要用到测试集,最常用的是coco2014/2017和voc07/12数据集. coco数据集的地址为htt ...

  3. 深度学习公开语音识别数据集下载 | 论文下载|音频数据集|corpus ——简记

  4. 转载【数据集】计算机视觉,深度学习,数据挖掘数据集整理

    金融 美国劳工部统计局官方发布数据 上证A股日线数据,1999.12.09 至 2016.06.08,前复权,1095支股票 深证A股日线数据,1999.12.09 至 2016.06.08,前复权, ...

  5. 【神经网络与深度学习】CIFAR10数据集介绍,并使用卷积神经网络训练图像分类模型——[附完整训练代码]

    [神经网络与深度学习]CIFAR-10数据集介绍,并使用卷积神经网络训练模型--[附完整代码] 一.CIFAR-10数据集介绍 1.1 CIFAR-10数据集的内容 1.2 CIFAR-10数据集的结 ...

  6. 深度学习常用的数据集,包括各种数据跟图像数据。

    1.   免费数据集下载(持续更新中...) 2.[导读] "大数据时代",数据为王!无论是数据挖掘还是目前大热的深度学习领域都离不开"大数据".大公司们一般会 ...

  7. 数据集大全:25个深度学习的开放数据集

    介绍 深度学习的关键是训练.无论是从图像处理到语音识别,每个问题都有其独特的细微差别和方法. 但是,你可以从哪里获得这些数据?现在你看到的很多研究论文都使用专有数据集,而这些数据集通常不会向公众发布. ...

  8. 二十五个深度学习相关公开数据集

    转 [干货]二十五个深度学习相关公开数据集 2018年04月18日 13:42:53 阅读数:758 (选自Analytics Vidhya:作者:Pranav Dar:磐石编译) 目录 介绍 图像处 ...

  9. 深度学习初学者,如何下载常用公开数据集并使用呢?

    深度学习初学者,如何下载常用公开数据集并使用呢? 1.前言 2.官方文档怎样看 3.动手写代码 4.如何可视化 遇到问题:ssl.SSLCertVerificationError: [SSL: CER ...

最新文章

  1. SAP HUM 因为存在Open TO 单据使得HU不能创建盘点凭证
  2. 公开致铁道部 高效运营从细节入手
  3. linux父子进程字写父读无效,linux父子进程
  4. sklearn中knn的各种用法总结
  5. 企业内网利用devpi搭建pypi私服
  6. vue-resource跨域问题
  7. 重装vcenter后恢复原来制作的模板!
  8. Java ListResourceBundle getKeys()方法与示例
  9. 信息学奥赛一本通 1010:计算分数的浮点数值 | OpenJudge NOI 1.3 05
  10. Ubuntu下一个好用的MySQL客户端tora
  11. 树莓派端口i2s_树莓派制造低成本交互式显微镜
  12. python:坦克大战源代码
  13. 程序员高效率办公软件(推荐)。
  14. lora calculator的使用
  15. matlab光子晶体求反射率,一维光子晶体禁带结构的MATLAB分析计算讲解.PDF
  16. Mixed Content: The page was loaded over HTTPS,blocked the content must be served over HTTPS.
  17. No Spring WebApplicationInitializer types detected on classpath
  18. 张振民北京计算机专修学院,工行甘肃省分行张振民一行来校考察交流
  19. 计算机夏令营英语自我介绍,夏令营英语自我介绍
  20. 浙大版《C语言程序设计》第四版(何钦铭颜晖) 第11章 指针进阶 课后习题答案

热门文章

  1. 【独家发布】网易将招50人,提供数据分析培训,费用全免!
  2. java-php-net-python-动漫产品销售计算机毕业设计程序
  3. shellcode事件
  4. Python pyecharts Bar图
  5. MATLAB练习之图像增强
  6. android otg读写监听,Android USBOTG写入权限错误
  7. web目录扫描工具dirbuster使用详解
  8. 大学生软件课程设计之新生报到管理系统
  9. html引入babel-polyfill,Babel教程-引入polyfill
  10. Python librosa模块介绍