# Why a scraper is needed: the site returns at most 150 rows per download, so
# fetching everything by hand would mean repeating the download-and-copy step 5190 times.

# Preparation
library("rvest")
library("xml2")--单一网页尝试
url <- "http://www.stat-nba.com/query.php?page=0&QueryType=game&GameType=season&order=1&crtcol=pts&PageNum=200&Year0=1985&Month0=1&Day0=1&Year1=2018&Month1=11&Day1=30" --网页地址
web <- read_html(url, encoding = "UTF-8") # read the page
# The first node matched by each selector appears to be the table header row,
# so keep elements 2:151 (150 data rows per page). Finding the right CSS
# selectors may take a few tries, or some basic knowledge of HTML.
player_name <- (html_nodes(web, ".player_name_out") %>% html_text())[2:151]
season <- (html_nodes(web, ".season") %>% html_text())[2:151]
is_win <- (html_nodes(web, ".wl") %>% html_text())[2:151]
result <- (html_nodes(web, ".result_out") %>% html_text())[2:151]
is_first <- as.numeric((html_nodes(web, ".gs") %>% html_text())[2:151])
playing_time <- as.numeric((html_nodes(web, ".mp") %>% html_text())[2:151])
# Percentages come through as text like "45.5%", which as.numeric() cannot parse
# directly; strip the "%" first, then divide by 100.
field_goal_percent <- as.numeric(sub("%", "", (html_nodes(web, ".fgper") %>% html_text())[2:151])) / 100
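# For instance (hypothetical value, just a sanity check of the conversion):
# as.numeric(sub("%", "", "45.5%")) / 100   # -> 0.455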
field_goal <- as.numeric((html_nodes(web, ".fg") %>% html_text())[2:151])
field_goal_a <- as.numeric((html_nodes(web, ".fga") %>% html_text())[2:151])
three_percent <- as.numeric(sub("%", "", (html_nodes(web, ".threepper") %>% html_text())[2:151])) / 100
three <- as.numeric((html_nodes(web, ".threep") %>% html_text())[2:151])
three_a <- as.numeric((html_nodes(web, ".threepa") %>% html_text())[2:151])
free_percent <- as.numeric(sub("%", "", (html_nodes(web, ".ftper") %>% html_text())[2:151])) / 100
free <- as.numeric((html_nodes(web, ".ft") %>% html_text())[2:151])
free_a <- as.numeric((html_nodes(web, ".fta") %>% html_text())[2:151])
rebound <- as.numeric((html_nodes(web, ".trb") %>% html_text())[2:151])
rebound_offen <- as.numeric((html_nodes(web, ".orb") %>% html_text())[2:151])
rebound_defen <- as.numeric((html_nodes(web, ".drb") %>% html_text())[2:151])
assist <- as.numeric((html_nodes(web, ".ast") %>% html_text())[2:151])
steal <- as.numeric((html_nodes(web, ".stl") %>% html_text())[2:151])
block <- as.numeric((html_nodes(web, ".blk") %>% html_text())[2:151])
turnover <- as.numeric((html_nodes(web, ".tov") %>% html_text())[2:151])
foul <- as.numeric((html_nodes(web, ".pf") %>% html_text())[2:151])
points <- as.numeric((html_nodes(web, ".pts") %>% html_text())[2:151])
data1 <- data.frame(player_name, season, is_win, result, is_first, playing_time, field_goal_percent, field_goal, field_goal_a, three_percent, three, three_a, free_percent, free, free_a, rebound, rebound_offen, rebound_defen, assist, steal, block, turnover, foul, points) # combine all the columns into one data frame
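# The two dozen extraction lines above all follow one pattern, so before looping
# it helps to factor them into a helper. A minimal sketch (not from the original
# post; it merely restates the selector-to-column mapping already used above):
scrape_page <- function(web) {
  grab <- function(sel) (html_nodes(web, sel) %>% html_text())[2:151]
  num  <- function(sel) as.numeric(grab(sel))
  pct  <- function(sel) as.numeric(sub("%", "", grab(sel))) / 100
  data.frame(
    player_name = grab(".player_name_out"), season = grab(".season"),
    is_win = grab(".wl"), result = grab(".result_out"),
    is_first = num(".gs"), playing_time = num(".mp"),
    field_goal_percent = pct(".fgper"), field_goal = num(".fg"), field_goal_a = num(".fga"),
    three_percent = pct(".threepper"), three = num(".threep"), three_a = num(".threepa"),
    free_percent = pct(".ftper"), free = num(".ft"), free_a = num(".fta"),
    rebound = num(".trb"), rebound_offen = num(".orb"), rebound_defen = num(".drb"),
    assist = num(".ast"), steal = num(".stl"), block = num(".blk"),
    turnover = num(".tov"), foul = num(".pf"), points = num(".pts")
  )
}
# usage: data1 <- scrape_page(read_html(url, encoding = "UTF-8"))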
# Loop over all pages
player_data <- data.frame(player_name = 0, season = 0, is_win = 0, result = 0, is_first = 0, playing_time = 0, field_goal_percent = 0, field_goal = 0, field_goal_a = 0, three_percent = 0, three = 0, three_a = 0, free_percent = 0, free = 0, free_a = 0, rebound = 0, rebound_offen = 0, rebound_defen = 0, assist = 0, steal = 0, block = 0, turnover = 0, foul = 0, points = 0)
player_data <- player_data[-1, ] # the two lines above build an empty 0 x 24 data frame
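# A shorter equivalent, assuming data1 from the single-page trial is still in the
# workspace: indexing with 0 rows keeps all 24 columns but drops every row.
# player_data <- data1[0, ]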
for (i in 0:(ceiling(778447/150) - 1)) { # the page count can be computed: 778447 rows at 150 per page = 5190 pages
  url <- paste0("http://www.stat-nba.com/query.php?page=", i, "&QueryType=game&GameType=season&order=1&crtcol=pts&PageNum=200&Year0=1985&Month0=1&Day0=1&Year1=2018&Month1=11&Day1=30")
  web <- read_html(url, encoding = "UTF-8")
  player_name <- (html_nodes(web, ".player_name_out") %>% html_text())[2:151]
  season <- (html_nodes(web, ".season") %>% html_text())[2:151]
  is_win <- (html_nodes(web, ".wl") %>% html_text())[2:151]
  result <- (html_nodes(web, ".result_out") %>% html_text())[2:151]
  is_first <- as.numeric((html_nodes(web, ".gs") %>% html_text())[2:151])
  playing_time <- as.numeric((html_nodes(web, ".mp") %>% html_text())[2:151])
  field_goal_percent <- as.numeric(sub("%", "", (html_nodes(web, ".fgper") %>% html_text())[2:151])) / 100
  field_goal <- as.numeric((html_nodes(web, ".fg") %>% html_text())[2:151])
  field_goal_a <- as.numeric((html_nodes(web, ".fga") %>% html_text())[2:151])
  three_percent <- as.numeric(sub("%", "", (html_nodes(web, ".threepper") %>% html_text())[2:151])) / 100
  three <- as.numeric((html_nodes(web, ".threep") %>% html_text())[2:151])
  three_a <- as.numeric((html_nodes(web, ".threepa") %>% html_text())[2:151])
  free_percent <- as.numeric(sub("%", "", (html_nodes(web, ".ftper") %>% html_text())[2:151])) / 100
  free <- as.numeric((html_nodes(web, ".ft") %>% html_text())[2:151])
  free_a <- as.numeric((html_nodes(web, ".fta") %>% html_text())[2:151])
  rebound <- as.numeric((html_nodes(web, ".trb") %>% html_text())[2:151])
  rebound_offen <- as.numeric((html_nodes(web, ".orb") %>% html_text())[2:151])
  rebound_defen <- as.numeric((html_nodes(web, ".drb") %>% html_text())[2:151])
  assist <- as.numeric((html_nodes(web, ".ast") %>% html_text())[2:151])
  steal <- as.numeric((html_nodes(web, ".stl") %>% html_text())[2:151])
  block <- as.numeric((html_nodes(web, ".blk") %>% html_text())[2:151])
  turnover <- as.numeric((html_nodes(web, ".tov") %>% html_text())[2:151])
  foul <- as.numeric((html_nodes(web, ".pf") %>% html_text())[2:151])
  points <- as.numeric((html_nodes(web, ".pts") %>% html_text())[2:151])
  data <- data.frame(player_name, season, is_win, result, is_first, playing_time, field_goal_percent, field_goal, field_goal_a, three_percent, three, three_a, free_percent, free, free_a, rebound, rebound_offen, rebound_defen, assist, steal, block, turnover, foul, points)
  player_data <- rbind(player_data, data)
}
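# With 5190 requests in one run, a single network hiccup aborts the loop and loses
# everything fetched so far. One way to harden the fetch step, sketched here as an
# assumption rather than part of the original post (retry count and pause are arbitrary):
read_page_safely <- function(url, retries = 3, pause = 1) {
  for (attempt in seq_len(retries)) {
    web <- tryCatch(read_html(url, encoding = "UTF-8"), error = function(e) NULL)
    if (!is.null(web)) return(web)
    Sys.sleep(pause) # brief pause before retrying, also easier on the server
  }
  stop("failed to fetch: ", url)
}
# Inside the loop, web <- read_page_safely(url) would replace read_html() directly.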
# Performance tweak: scrape pages 0-5189 in segments (this segment covers pages 0-1000)
player_data1 <- data.frame(player_name = 0, season = 0, is_win = 0, result = 0, is_first = 0, playing_time = 0, field_goal_percent = 0, field_goal = 0, field_goal_a = 0, three_percent = 0, three = 0, three_a = 0, free_percent = 0, free = 0, free_a = 0, rebound = 0, rebound_offen = 0, rebound_defen = 0, assist = 0, steal = 0, block = 0, turnover = 0, foul = 0, points = 0)
player_data1 <- player_data1[-1, ]
# To cut per-iteration work, this version skips the [2:151] subsetting and the
# as.numeric conversion: every column is kept as raw text (the header row each
# page returns included), leaving the clean-up for a single pass after the loop.
for (i in 0:1000) {
  url <- paste0("http://www.stat-nba.com/query.php?page=", i, "&QueryType=game&GameType=season&order=1&crtcol=pts&PageNum=200&Year0=1985&Month0=1&Day0=1&Year1=2018&Month1=11&Day1=30")
  web <- read_html(url, encoding = "UTF-8")
  player_name <- html_nodes(web, ".player_name_out") %>% html_text()
  season <- html_nodes(web, ".season") %>% html_text()
  is_win <- html_nodes(web, ".wl") %>% html_text()
  result <- html_nodes(web, ".result_out") %>% html_text()
  is_first <- html_nodes(web, ".gs") %>% html_text()
  playing_time <- html_nodes(web, ".mp") %>% html_text()
  field_goal_percent <- html_nodes(web, ".fgper") %>% html_text()
  field_goal <- html_nodes(web, ".fg") %>% html_text()
  field_goal_a <- html_nodes(web, ".fga") %>% html_text()
  three_percent <- html_nodes(web, ".threepper") %>% html_text()
  three <- html_nodes(web, ".threep") %>% html_text()
  three_a <- html_nodes(web, ".threepa") %>% html_text()
  free_percent <- html_nodes(web, ".ftper") %>% html_text()
  free <- html_nodes(web, ".ft") %>% html_text()
  free_a <- html_nodes(web, ".fta") %>% html_text()
  rebound <- html_nodes(web, ".trb") %>% html_text()
  rebound_offen <- html_nodes(web, ".orb") %>% html_text()
  rebound_defen <- html_nodes(web, ".drb") %>% html_text()
  assist <- html_nodes(web, ".ast") %>% html_text()
  steal <- html_nodes(web, ".stl") %>% html_text()
  block <- html_nodes(web, ".blk") %>% html_text()
  turnover <- html_nodes(web, ".tov") %>% html_text()
  foul <- html_nodes(web, ".pf") %>% html_text()
  points <- html_nodes(web, ".pts") %>% html_text()
  data1 <- data.frame(player_name, season, is_win, result, is_first, playing_time, field_goal_percent, field_goal, field_goal_a, three_percent, three, three_a, free_percent, free, free_a, rebound, rebound_offen, rebound_defen, assist, steal, block, turnover, foul, points)
  player_data1 <- rbind(player_data1, data1)
  print(i)          # progress: current page
  print(Sys.time()) # timestamp, useful for estimating the remaining run time
}
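# Because this segment stores every column as raw text (header rows included), a
# clean-up pass is needed afterwards. A hypothetical sketch (not from the original
# post) that assumes a header row can be recognized by its points cell not parsing
# as a number:
clean <- player_data1
suppressWarnings(pts_num <- as.numeric(as.character(clean$points)))
clean <- clean[!is.na(pts_num), ] # drop the per-page header rows
pct_cols <- c("field_goal_percent", "three_percent", "free_percent")
num_cols <- setdiff(names(clean), c("player_name", "season", "is_win", "result", pct_cols))
for (col in pct_cols) clean[[col]] <- as.numeric(sub("%", "", as.character(clean[[col]]))) / 100
for (col in num_cols) clean[[col]] <- as.numeric(as.character(clean[[col]]))
# Note on performance: rbind() inside a loop re-copies the accumulated frame every
# iteration; collecting each page's frame in a list and calling do.call(rbind, pages)
# once at the end scales much better over 5190 pages.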
