在 NVIDIA T4 上跑 SHOC benchmark 测试时,遇到如下问题:

注意:跑 NeuralNet 之前,要先把 nn_data 解压到当前工作目录(pwd)下;但解压之后依然报错。

用 strace 跟踪程序的执行:

strace -f -o gm.truss /home/shoc-master/bin/Serial/CUDA/NeuralNet -s 4 -d 1
25353 execve("/home/shoc-master/bin/Serial/CUDA/NeuralNet", ["/home/shoc-master/bin/Serial/CUD"..., "-s", "4", "-d", "1"], [/* 52 vars */]) = 0
25353 brk(NULL)                         = 0x20e3000
25353 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f9cd6045000
25353 access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
25353 open("/usr/local/cuda-10.1/lib64/tls/x86_64/librt.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
25353 stat("/usr/local/cuda-10.1/lib64/tls/x86_64", 0x7ffdae94cfc0) = -1 ENOENT (No such file or directory)
25353 open("/usr/local/cuda-10.1/lib64/tls/librt.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
25353 stat("/usr/local/cuda-10.1/lib64/tls", 0x7ffdae94cfc0) = -1 ENOENT (No such file or directory)
25353 open("/usr/local/cuda-10.1/lib64/x86_64/librt.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
25353 stat("/usr/local/cuda-10.1/lib64/x86_64", 0x7ffdae94cfc0) = -1 ENOENT (No such file or directory)
25353 open("/usr/local/cuda-10.1/lib64/librt.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
25353 stat("/usr/local/cuda-10.1/lib64", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
25353 open("/usr/lib64/mpich-3.2/lib/tls/x86_64/librt.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
25353 stat("/usr/lib64/mpich-3.2/lib/tls/x86_64", 0x7ffdae94cfc0) = -1 ENOENT (No such file or directory)
25353 open("/usr/lib64/mpich-3.2/lib/tls/librt.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
25353 stat("/usr/lib64/mpich-3.2/lib/tls", 0x7ffdae94cfc0) = -1 ENOENT (No such file or directory)
25353 open("/usr/lib64/mpich-3.2/lib/x86_64/librt.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
25353 stat("/usr/lib64/mpich-3.2/lib/x86_64", 0x7ffdae94cfc0) = -1 ENOENT (No such file or directory)
25353 open("/usr/lib64/mpich-3.2/lib/librt.so.1", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
25353 stat("/usr/lib64/mpich-3.2/lib", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0

发现一堆 ENOENT(No such file or directory)错误!(事后看,这些只是动态链接器按库搜索路径逐个目录查找 librt.so.1 时的正常失败,并不是真正的问题。)

怀疑是不是 CUDA 没有安装好,于是检查 CUDA 的安装日志:

cat /var/log/cuda_install.log

发现如下问题

[WARNING]: Missing recommended library: libGLU.so
[WARNING]: Missing recommended library: libXi.so
[WARNING]: Missing recommended library: libXmu.so

解决这几个 warning:

yum install mesa-libGLU-devel mesa-libGL-devel
yum install libXmu*
yum install libXi*

再重新安装cuda

——————————————————————————————————————————

这条分割线是要告诉读者:上面这些都不是问题的根本原因,真正的根本原因在于……SHOC 的源代码。

相信写代码的人都看出来了:activations[0] 没有被初始化,free 的时候程序就崩掉了。再回头看看之前 strace 的输出:

25353 writev(19, [{"*** Error in `", 14}, {"/home/shoc-master/bin/Serial/CUD"..., 43}, {"': ", 3}, {"free(): invalid pointer", 23}, {": 0x", 4}, {"00007f9cd123c818", 16}, {" ***\n", 5}], 7) = 108
25353 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f9cd5e56000
25353 mmap(0x7f9cac000000, 67108864, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0) = 0x7f9c9e000000
25353 munmap(0x7f9c9e000000, 67108864)  = 0
25353 mmap(NULL, 134217728, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0) = 0x7f9c9a000000
25353 munmap(0x7f9c9a000000, 33554432)  = 0
25353 munmap(0x7f9ca0000000, 33554432)  = 0
25353 mprotect(0x7f9c9c000000, 135168, PROT_READ|PROT_WRITE) = 0
25353 futex(0x7f9cd1240cc0, FUTEX_WAKE_PRIVATE, 2147483647) = 0
25353 futex(0x7f9cd1458190, FUTEX_WAKE_PRIVATE, 2147483647) = 0
25353 write(19, "======= Backtrace: =========\n", 29) = 29
25353 writev(19, [{"/lib64/libc.so.6", 16}, {"(", 1}, {"+0x", 3}, {"81489", 5}, {")", 1}, {"[0x", 3}, {"7f9cd0ef7489", 12}, {"]\n", 2}], 8) = 43
25353 writev(19, [{"/home/shoc-master/bin/Serial/CUD"..., 43}, {"[0x", 3}, {"40b40c", 6}, {"]\n", 2}], 4) = 54
25353 writev(19, [{"/home/shoc-master/bin/Serial/CUD"..., 43}, {"[0x", 3}, {"405cfb", 6}, {"]\n", 2}], 4) = 54
25353 writev(19, [{"/lib64/libc.so.6", 16}, {"(", 1}, {"__libc_start_main", 17}, {"+0x", 3}, {"f5", 2}, {")", 1}, {"[0x", 3}, {"7f9cd0e983d5", 12}, {"]\n", 2}], 9) = 57
25353 writev(19, [{"/home/shoc-master/bin/Serial/CUD"..., 43}, {"[0x", 3}, {"40651f", 6}, {"]\n", 2}], 4) = 54
25353 write(19, "======= Memory map: ========\n", 29) = 29
25353 open("/proc/self/maps", O_RDONLY) = 20
25353 read(20, "00400000-004a7000 r-xp 00000000 "..., 1024) = 1024
25353 write(19, "00400000-004a7000 r-xp 00000000 "..., 1024) = 1024
25353 read(20, "000 00:05 38726                 "..., 1024) = 1024
25353 write(19, "000 00:05 38726                 "..., 1024) = 1024
25353 read(20, "4 352848                     /de"..., 1024) = 1024
25353 write(19, "4 352848                     /de"..., 1024) = 1024
25353 read(20, "7f9ccdc54000-7f9ccde54000 ---p 0"..., 1024) = 1024
25353 write(19, "7f9ccdc54000-7f9ccde54000 ---p 0"..., 1024) = 1024
25353 read(20, "0000 00:00 0 \n7f9cd0e76000-7f9cd"..., 1024) = 1024
25353 write(19, "0000 00:00 0 \n7f9cd0e76000-7f9cd"..., 1024) = 1024
25353 read(20, "0 ---p 00101000 fd:00 33633924  "..., 1024) = 1024
25353 write(19, "0 ---p 00101000 fd:00 33633924  "..., 1024) = 1024
25353 read(20, "r/lib64/libcublas.so.10.2.1.243\n"..., 1024) = 1024
25353 write(19, "r/lib64/libcublas.so.10.2.1.243\n"..., 1024) = 1024
25353 read(20, "2.17.so\n7f9cd5c24000-7f9cd5e2300"..., 1024) = 1024
25353 write(19, "2.17.so\n7f9cd5c24000-7f9cd5e2300"..., 1024) = 1024
25353 read(20, "f9cd5fed000-7f9cd5ffd000 -w-s 00"..., 1024) = 1024
25353 write(19, "f9cd5fed000-7f9cd5ffd000 -w-s 00"..., 1024) = 1024
25353 read(20, "f9cd602f000 rw-s 00000000 00:05 "..., 1024) = 1024
25353 write(19, "f9cd602f000 rw-s 00000000 00:05 "..., 1024) = 1024
25353 read(20, "960000-7ffdae962000 r-xp 0000000"..., 1024) = 158
25353 write(19, "960000-7ffdae962000 r-xp 0000000"..., 158) = 158
25353 read(20, "", 1024)                = 0
25353 close(20)                         = 0
25353 rt_sigprocmask(SIG_UNBLOCK, [ABRT], NULL, 8) = 0
25353 tgkill(25353, 25353, SIGABRT)     = 0
25353 --- SIGABRT {si_signo=SIGABRT, si_code=SI_TKILL, si_pid=25353, si_uid=0} ---
25401 +++ killed by SIGABRT (core dumped) +++
25400 +++ killed by SIGABRT (core dumped) +++
25353 +++ killed by SIGABRT (core dumped) +++

其实 strace 已经明确告诉了我们问题所在(free(): invalid pointer),只是好久没有搞 C++ 了。

nvidia t4 shoc 测试问题相关推荐

  1. 阿里云服务器vgn5i配NVIDIA CUDA TF2测试成功

    阿里云服务器vgn5i配NVIDIA CUDA 先吐槽会 驱动安装 驱动安装并测试 安装Ubuntu 16.04 64-bit系统 安装Nvidia Cuda驱动 安装Docker nvidia 先吐 ...

  2. 腾讯云GPU服务器NVIDIA P40 GPU、P4、T4和GPU自由卡详解

    腾讯云GPU云服务器,GPU云服务器实例可选GN8机型.GN6S机型.GN7机型等规格,搭载 NVIDIA P40 GPU,最长可3年,云服务器吧来详细说下腾讯云GPU云服务器: 目录 腾讯云GPU云 ...

  3. 使用NVIDIA GRID vPC支持视频会议和算力工具

    使用NVIDIA GRID vPC支持视频会议和算力工具 随着2020年的发展,远程工作解决方案已成为许多人的新常态.企业正在寻找行之有效的解决方案,如虚拟桌面基础设施(VDI),以使他们的团队能够在 ...

  4. MLPerf发布首个AI芯片推理测试排行榜:阿里平头哥含光800获得多项第一

    晓查 发自 凹非寺  量子位 编译 | 公众号 QbitAI 今天MLPerf基准联盟公布了第一批AI芯片的推理测试结果,对来自19个公司机构的594种芯片在各种自然语言和计算机视觉任务中的表现进行了 ...

  5. ubuntu 重装 nvidia_时隔三个月 NVIDIA为何再次打破AI推理性能记录

    近日,NVIDIA宣布其AI计算平台在最新一轮MLPerf基准测试中再次打破性能记录,在这一业内唯一评估硬件.软件和服务的第三方AI性能基准测试中进一步扩大了其领先优势. 如果大家还有印象的是在7月, ...

  6. 我参加 NVIDIA Sky Hackathon---语音识别+前端设计

    一.语音识别 针对项目中要求识别的果皮,瓶子,纸箱这些物品我们选择采集以下文字对应的语音数据: 请检测出果皮 请检测出纸箱 请检测出瓶子 请检测出果皮和纸箱 请检测出纸箱和瓶子 请检测出果皮和瓶子 请 ...

  7. Arm云游戏及虚拟化技术沙龙,Arm中国,NVIDIA,腾讯及百度等为你分享超强技术干货...

    申耀的科技观察 读懂科技,赢取未来! 2021年3月24日,由Arm中国.竞核联合主办的"芯潮澎湃 云启未来"Arm云游戏及虚拟化技术沙龙在上海圆满落幕. Arm中国市场及生态副总 ...

  8. NVIDIA DeepStream 5.0构建智能视频分析应用程序

    NVIDIA DeepStream 5.0构建智能视频分析应用程序 无论是要平衡产品分配和优化流量的仓库,工厂流水线检查还是医院管理,要确保员工和护理人员在照顾病人的同时使用个人保护设备(PPE),就 ...

  9. 利用NVIDIA NGC的TensorRT容器优化和加速人工智能推理

    利用NVIDIA NGC的TensorRT容器优化和加速人工智能推理 Optimizing and Accelerating AI Inference with the TensorRT Contai ...

最新文章

  1. Sentinel v1.4.2 发布,更好用的集群限流功能
  2. Python Module — WSME
  3. 【Python】WindowsError的错误代码详解
  4. linux rmp命令安装包在哪里_rpm命令_Linux rpm 命令用法详解:RPM软件包的管理工具...
  5. P4301-[CQOI2013]新Nim游戏【线性基】
  6. 30.课时30.【Django模板】autoescape标签使用详解(Av61533158,P30)
  7. AlexNet原理和实现
  8. Python 中遇到note: see declaration of '_ts'
  9. pandas.Series
  10. python的基本语法规则
  11. 注册表知识和技巧大全
  12. 工业机器人pallet指令_工业机器人编程指令详解
  13. c++ 统计指定字母开头单词的数量
  14. 直板android智能手机,小巧又精悍 3大系统直板全键盘手机搜罗
  15. 谈谈Python的Flask框架学习与福利分享
  16. 无线路由器wan口和lan口
  17. 如何做好会员管理与维护?
  18. git cheatsheet
  19. 2022百度网盘目录管理系统
  20. 小飞升值记——(15)

热门文章

  1. scale与zoom的差异
  2. 「设计模式(五) - 代理模式」
  3. phpstudy2018修改网站根目录以及本地域名访问配置方法
  4. 关于C语言中的重点转义字符详解
  5. windows下Ardupilot编译环境搭建
  6. h5是html语言吗,H5和HTML5是一样的吗
  7. 什么是类、什么是对象
  8. 消息中间件合集:MQ(ActiveMQ/RabbitMQ/RocketMQ)+Kafka+笔记
  9. JAVA_HOME查看和设置
  10. 每天撸拼多多纸巾(非广告)