初学爬取豆瓣哪吒之魔童降世短评

一、分析网址网页

首先用浏览器进入豆瓣网站，查看几页评论网址间的联系

https://movie.douban.com/subject/26794435/comments?status=P
https://movie.douban.com/subject/26794435/comments?start=20&limit=20&sort=new_score&status=P
https://movie.douban.com/subject/26794435/comments?start=40&limit=20&sort=new_score&status=P

很容易看到只有中间的start=X发生变化，这里第一个网址看着和后面差别较大，但也可以将X=0代入，即

https://movie.douban.com/subject/26794435/comments?start=0&limit=20&sort=new_score&status=P

这样循环找网页就方便了很多
然后查看网页源代码，发现评论数据直接写在HTML里，说明这是静态网页，数据一次性加载完成。
或者按F12打开开发者工具，在Network面板里查看：如果评论数据不在XHR请求中，同样说明是静态网页，这就好办啦

二、代码分析

1.get参数

# Request headers: only the User-Agent is set, so the request looks like it
# comes from a desktop browser.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}

# Raw value of the browser's "Cookie" request header.
# The stray backslashes that used to sit inside this string were leftovers of
# line continuations; inside a normal string literal sequences such as "\040"
# are octal escapes and silently corrupted the cookie values.
# NOTE(review): this is a logged-in session cookie copied from the web --
# replace it with your own and avoid publishing real session cookies.
_RAW_COOKIE = (
    'bid=iDf0tyAI54I; ps=y; ll="118183"; __utmc=30149280; '
    '_ga=GA1.2.1325106029.1530404146; _gid=GA1.2.1270378106.1530405800; '
    'ue="965454764@qq.com"; dbcl2="180531938:E/xiLFShgbg"; ck=UDKl; '
    '_pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1530409505%2C%22https%3A%2F%2F'
    'accounts.douban.com%2Flogin%3Falias%3D965454764%2540qq.com'
    '%26redir%3Dhttps%253A%252F%252Fwww.douban.com%26source%3DNone'
    '%26error%3D1011%22%5D; '
    '_pk_id.100001.8cb4=cdbf383efde098e6.1530404145.2.1530409505.1530405796.; '
    '_pk_ses.100001.8cb4=*; push_noty_num=0; push_doumail_num=0; '
    '__utma=30149280.1325106029.1530404146.1530404146.1530409505.2; '
    '__utmz=30149280.1530409505.2.2.utmcsr=accounts.douban.com'
    '|utmccn=(referral)|utmcmd=referral|utmcct=/login; '
    '__utmt=1; __utmv=30149280.18053; __utmb=30149280.2.10.1530409505; '
    '__yadk_uid=5eZwp3s8j7joGLL911UWJkWQpVQg6IX4'
)

# requests expects ``cookies`` to map each cookie name to its value, so split
# the header into individual "name=value" pairs (values may contain "=").
cookies = dict(
    pair.strip().split('=', 1)
    for pair in _RAW_COOKIE.split(';')
    if pair.strip()
)

# Proxy pool; one entry is picked at random for every request.
# The dict key must be the lowercase URL scheme ("https"): requests looks
# proxies up by lowercase scheme, so an uppercase "HTTPS" key is never used.
# NOTE(review): these are free proxy addresses found online and are probably
# stale; the "https://" proxy URLs on port 8118 may need to be "http://" --
# confirm against the proxies you actually use.
IPs = [
    {'https': 'https://115.237.16.200:8118'},
    {'https': 'https://42.49.119.10:8118'},
    {'https': 'http://60.174.74.40:8118'},
]

headers有很多参数，这里只用User-Agent说明请求来自浏览器即可
cookies是很多网站用来辨明身份的小段文本。获取方法：打开浏览器，按F12打开开发者工具—>
按F5刷新—>点击第一条请求—>点击Headers，在请求头（Request Headers）里找到cookie—>
复制cookie的值即可。这里用的cookies是在网上找的

ip池是我在网上随便找的，后面会随机使用一个ip

2.获取页面
通过random.choice()函数随机选择一个ip
写入文件是为了查看访问的页面源代码是否正确，后面可以注释掉

def download_page(url):
    """Fetch *url* through a randomly chosen proxy and return its HTML text.

    The request is sent with the module-level ``headers`` and ``cookies`` and
    one proxy picked at random from ``IPs``. The raw response bytes are also
    written to ``./test.txt`` so the downloaded page can be inspected.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body, or ``None`` when the request fails or the
        server answers with an HTTP error status (the caller checks for
        ``None`` to report a failed page).
    """
    proxy = random.choice(IPs)
    try:
        # A timeout keeps a dead proxy from blocking the crawl forever.
        response = requests.get(
            url, headers=headers, cookies=cookies, proxies=proxy, timeout=10
        )
        response.raise_for_status()
    except requests.RequestException as exc:
        print('请求失败: {}'.format(exc))
        return None

    # Debug aid: dump the page so its source can be checked by hand.
    # This can be removed once the crawler is known to work.
    with open('./test.txt', 'wb') as f:
        f.write(response.content)
    return response.text

记事本内容：

3.信息提取

def get_comments(html, page):
    """Extract every short comment on one page and store the results.

    For each ``<div class="comment">`` the vote count, user name, star
    rating, recommendation label and comment text are appended to the
    module-level lists ``agrees``, ``names``, ``stars``, ``recommends`` and
    ``comments``. Afterwards the function pauses for 1-2 seconds, prints a
    progress message and calls ``storge`` to write everything collected so
    far.

    Args:
        html: HTML source of one comment page.
        page: Zero-based page index, used only for the progress message.
    """
    soup = bs(html, 'html.parser')
    for com in soup.find_all('div', class_='comment'):
        # Number of "useful" votes the comment received.
        agree = com.find('span', class_='comment-vote').find('span', class_='votes').get_text()
        agrees.append(agree)

        info = com.find('span', class_='comment-info')
        # User name.
        name = info.find('a').string
        names.append(name)

        # The rating span carries two classes, e.g. ['allstar40', 'rating'],
        # and the recommendation label in its ``title`` attribute. Look it up
        # by class instead of by position so that a comment whose second span
        # is not the rating (presumably one without a star rating -- TODO
        # confirm against the page markup) yields empty values instead of
        # wrong data or an exception.
        rating = info.find('span', class_='rating')
        if rating is not None:
            # 'allstar40'[7:8] == '4': the character right after "allstar".
            star = next(
                (cls[7:8] for cls in rating['class'] if cls.startswith('allstar')),
                '',
            )
            recommend = rating.get('title', '')
        else:
            star = ''
            recommend = ''
        stars.append(star)
        recommends.append(recommend)

        # Comment text; get_text() still returns the text when the span
        # holds more than one child node, where ``.string`` would be None.
        comment = com.find('span', class_='short').get_text()
        comments.append(comment)

    # Pause briefly after each page to avoid hammering the server.
    time.sleep(random.randint(1, 2))
    print('第{}页获取完成'.format(page+1))
    storge(names, stars, agrees, comments, recommends)

其它信息还好，在获取打分星级时出了问题。可以看到信息在class为comment-info的第二个span里，于是我用了info[1]['class'][7:8]获取星级，但总是为空
后来发现这个class的值里有个空格（也就是包含两个类名），所以info[1]['class']的结果是列表['allstar40', 'rating']，加个下标0，写成info[1]['class'][0][7:8]就好啦。这里太坑了

4.存储至excel

def storge(*a):
    """Write the collected comment data to ``./哪吒评论.xlsx``.

    Args:
        *a: Optionally the five columns in the order
            ``names, stars, agrees, comments, recommends``. When exactly five
            positional arguments are given they are written; otherwise the
            module-level lists of the same names are used, which keeps calls
            without arguments working.
    """
    if len(a) == 5:
        col_names, col_stars, col_agrees, col_comments, col_recommends = a
    else:
        col_names, col_stars, col_agrees, col_comments, col_recommends = (
            names, stars, agrees, comments, recommends
        )

    # Build the table; the dict order fixes the column order in the sheet.
    df = pd.DataFrame({
        '名字': col_names,
        '打分': col_stars,
        '支持数': col_agrees,
        '推荐程度': col_recommends,
        '评论': col_comments,
    })
    # Write to Excel (overwrites the file on every call).
    df.to_excel('./哪吒评论.xlsx')

5.主函数

def main(depth=2):
    """Crawl the first *depth* pages of short comments and save them.

    Each page is requested with ``start`` advanced in steps of 20 (the
    ``limit=20`` page size in the URL), downloaded with ``download_page`` and
    parsed with ``get_comments``.

    Args:
        depth: Number of comment pages to crawl. Defaults to 2.
    """
    url_template = ('https://movie.douban.com/subject/26794435/comments'
                    '?start={}&limit=20&sort=new_score&status=P')
    for i in range(depth):
        html = download_page(url_template.format(i * 20))
        if html is not None:
            print('正在访问第{}页'.format(i+1))
            get_comments(html, i)
        else:
            print('第{}页访问失败'.format(i+1))

三、完整代码

# -*- coding: utf-8 -*-
"""
Created on Tue Feb 11 18:21:08 2020

@author: DZY
"""
import requests
from bs4 import BeautifulSoup as bs
import random
import time
import pandas as pd

# Request headers: only the User-Agent is set, so the request looks like it
# comes from a desktop browser.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}

# Raw value of the browser's "Cookie" request header.
# The stray backslashes that used to sit inside this string were leftovers of
# line continuations; inside a normal string literal sequences such as "\040"
# are octal escapes and silently corrupted the cookie values. The literal was
# also missing its closing quote and brace.
# NOTE(review): this is a logged-in session cookie copied from the web --
# replace it with your own and avoid publishing real session cookies.
_RAW_COOKIE = (
    'bid=iDf0tyAI54I; ps=y; ll="118183"; __utmc=30149280; '
    '_ga=GA1.2.1325106029.1530404146; _gid=GA1.2.1270378106.1530405800; '
    'ue="965454764@qq.com"; dbcl2="180531938:E/xiLFShgbg"; ck=UDKl; '
    '_pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1530409505%2C%22https%3A%2F%2F'
    'accounts.douban.com%2Flogin%3Falias%3D965454764%2540qq.com'
    '%26redir%3Dhttps%253A%252F%252Fwww.douban.com%26source%3DNone'
    '%26error%3D1011%22%5D; '
    '_pk_id.100001.8cb4=cdbf383efde098e6.1530404145.2.1530409505.1530405796.; '
    '_pk_ses.100001.8cb4=*; push_noty_num=0; push_doumail_num=0; '
    '__utma=30149280.1325106029.1530404146.1530404146.1530409505.2; '
    '__utmz=30149280.1530409505.2.2.utmcsr=accounts.douban.com'
    '|utmccn=(referral)|utmcmd=referral|utmcct=/login; '
    '__utmt=1; __utmv=30149280.18053; __utmb=30149280.2.10.1530409505; '
    '__yadk_uid=5eZwp3s8j7joGLL911UWJkWQpVQg6IX4'
)

# requests expects ``cookies`` to map each cookie name to its value, so split
# the header into individual "name=value" pairs (values may contain "=").
cookies = dict(
    pair.strip().split('=', 1)
    for pair in _RAW_COOKIE.split(';')
    if pair.strip()
)

# Proxy pool; one entry is picked at random for every request.
# The dict key must be the lowercase URL scheme ("https"): requests looks
# proxies up by lowercase scheme, so an uppercase "HTTPS" key is never used.
# NOTE(review): these are free proxy addresses found online and are probably
# stale; the "https://" proxy URLs on port 8118 may need to be "http://" --
# confirm against the proxies you actually use.
IPs = [
    {'https': 'https://115.237.16.200:8118'},
    {'https': 'https://42.49.119.10:8118'},
    {'https': 'http://60.174.74.40:8118'},
]
def download_page(url):
    """Fetch *url* through a randomly chosen proxy and return its HTML text.

    The request is sent with the module-level ``headers`` and ``cookies`` and
    one proxy picked at random from ``IPs``. The raw response bytes are also
    written to ``./test.txt`` so the downloaded page can be inspected.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body, or ``None`` when the request fails or the
        server answers with an HTTP error status (the caller checks for
        ``None`` to report a failed page).
    """
    proxy = random.choice(IPs)
    try:
        # A timeout keeps a dead proxy from blocking the crawl forever.
        response = requests.get(
            url, headers=headers, cookies=cookies, proxies=proxy, timeout=10
        )
        response.raise_for_status()
    except requests.RequestException as exc:
        print('请求失败: {}'.format(exc))
        return None

    # Debug aid: dump the page so its source can be checked by hand.
    # This can be removed once the crawler is known to work.
    with open('./test.txt', 'wb') as f:
        f.write(response.content)
    return response.text
# Target fields: user name, star rating, vote count, comment text, recommendation label.
# These module-level lists accumulate one entry per comment across all pages.
names=[]
stars=[]
agrees=[]
comments=[]
recommends=[]
def get_comments(html, page):
    """Extract every short comment on one page and store the results.

    For each ``<div class="comment">`` the vote count, user name, star
    rating, recommendation label and comment text are appended to the
    module-level lists ``agrees``, ``names``, ``stars``, ``recommends`` and
    ``comments``. Afterwards the function pauses for 1-2 seconds, prints a
    progress message and calls ``storge`` to write everything collected so
    far.

    Args:
        html: HTML source of one comment page.
        page: Zero-based page index, used only for the progress message.
    """
    soup = bs(html, 'html.parser')
    for com in soup.find_all('div', class_='comment'):
        # Number of "useful" votes the comment received.
        agree = com.find('span', class_='comment-vote').find('span', class_='votes').get_text()
        agrees.append(agree)

        info = com.find('span', class_='comment-info')
        # User name.
        name = info.find('a').string
        names.append(name)

        # The rating span carries two classes, e.g. ['allstar40', 'rating'],
        # and the recommendation label in its ``title`` attribute. Look it up
        # by class instead of by position so that a comment whose second span
        # is not the rating (presumably one without a star rating -- TODO
        # confirm against the page markup) yields empty values instead of
        # wrong data or an exception.
        rating = info.find('span', class_='rating')
        if rating is not None:
            # 'allstar40'[7:8] == '4': the character right after "allstar".
            star = next(
                (cls[7:8] for cls in rating['class'] if cls.startswith('allstar')),
                '',
            )
            recommend = rating.get('title', '')
        else:
            star = ''
            recommend = ''
        stars.append(star)
        recommends.append(recommend)

        # Comment text; get_text() still returns the text when the span
        # holds more than one child node, where ``.string`` would be None.
        comment = com.find('span', class_='short').get_text()
        comments.append(comment)

    # Pause briefly after each page to avoid hammering the server.
    time.sleep(random.randint(1, 2))
    print('第{}页获取完成'.format(page+1))
    storge(names, stars, agrees, comments, recommends)


def storge(*a):
    """Write the collected comment data to ``./哪吒评论.xlsx``.

    Args:
        *a: Optionally the five columns in the order
            ``names, stars, agrees, comments, recommends``. When exactly five
            positional arguments are given they are written; otherwise the
            module-level lists of the same names are used, which keeps calls
            without arguments working.
    """
    if len(a) == 5:
        col_names, col_stars, col_agrees, col_comments, col_recommends = a
    else:
        col_names, col_stars, col_agrees, col_comments, col_recommends = (
            names, stars, agrees, comments, recommends
        )

    # Build the table; the dict order fixes the column order in the sheet.
    df = pd.DataFrame({
        '名字': col_names,
        '打分': col_stars,
        '支持数': col_agrees,
        '推荐程度': col_recommends,
        '评论': col_comments,
    })
    # Write to Excel (overwrites the file on every call).
    df.to_excel('./哪吒评论.xlsx')
def main(depth=2):
    """Crawl the first *depth* pages of short comments and save them.

    Each page is requested with ``start`` advanced in steps of 20 (the
    ``limit=20`` page size in the URL), downloaded with ``download_page`` and
    parsed with ``get_comments``.

    Args:
        depth: Number of comment pages to crawl. Defaults to 2.
    """
    url_template = ('https://movie.douban.com/subject/26794435/comments'
                    '?start={}&limit=20&sort=new_score&status=P')
    for i in range(depth):
        html = download_page(url_template.format(i * 20))
        if html is not None:
            print('正在访问第{}页'.format(i+1))
            get_comments(html, i)
        else:
            print('第{}页访问失败'.format(i+1))


# Script entry point: start crawling as soon as the file is run.
main()

四、执行结果