从0开始学Python爬虫（学习笔记）

思路

四步口诀：获（获取）、解（解析）、提（提取）、存（储存）
第0步：获取数据。爬虫程序会根据我们提供的网址，向服务器发起请求，然后返回数据。
第1步：解析数据。爬虫程序会把服务器返回的数据解析成我们能读懂的格式。
第2步：提取数据。爬虫程序再从中提取出我们需要的数据。
第3步：储存数据。爬虫程序把这些有用的数据保存起来，便于你日后的使用和分析。

import requests

import requests  # bring the requests library into scope

# requests.get() sends a request to the server; its argument is the address
# of the data you want, and the server answers with a response.
target_url = 'URL'
# The returned response is stored in the variable res.
res = requests.get(target_url)

Response 对象

response.status_code—检查请求是否成功，返回200表示请求成功

response.content—它能把Response对象的内容以二进制数据的形式返回，适用于图片、音频、视频的下载

import requests

# Send the request and keep the returned Response object in res.
res = requests.get('https://res.pandateacher.com/2018-12-18-10-43-07.png')
# Response.content returns the body as raw bytes, which is what an image needs.
pic = res.content
# No directory is given, so ppt.jpg is created in the current working directory.
# 'wb' opens the file for binary writing; the with-block closes the file
# automatically, even if write() raises.
with open('ppt.jpg', 'wb') as photo:
    # Write the downloaded bytes to the file.
    photo.write(pic)

response.text—这个属性可以把Response对象的内容以字符串的形式返回，适用于文字、网页源代码的下载。

import requests  # bring the requests library into scope

# Address of chapter one of "Romance of the Three Kingdoms".
sanguo_url = 'https://localprod.pandateacher.com/python-manuscript/crawler-html/sanguo.md'

# Download the chapter; the Response object is named res.
res = requests.get(sanguo_url)
# Response.text returns the body as a string.
novel = res.text
# The whole chapter is long, so slice off only the first 800 characters.
preview = novel[:800]
print(preview)

response.encoding—它能帮我们定义Response对象的编码

import requests  # bring the requests library into scope

# Address of chapter one of "Romance of the Three Kingdoms".
sanguo_url = 'https://localprod.pandateacher.com/python-manuscript/crawler-html/sanguo.md'

# Download the chapter; the Response object is named res.
res = requests.get(sanguo_url)
# Declare the encoding of the Response object as utf-8 before reading .text.
res.encoding = 'utf-8'
# Response.text returns the body as a string.
novel = res.text
# Print only the first 800 characters of the novel.
preview = novel[:800]
print(preview)

如何查看网站的robots协议呢，很简单，在网站的域名后加上/robots.txt就可以了。

file1 = open("2.txt", "a+", encoding="utf-8")：encoding参数用来指定读写文件时使用的字符编码

BeautifulSoup

bs对象 = BeautifulSoup(要解析的文本, '解析器')

import requests
from bs4 import BeautifulSoup

page_url = 'https://localprod.pandateacher.com/python-manuscript/crawler-html/spider-men5.0.html'

# Fetch the page, then hand its text to BeautifulSoup with the built-in
# 'html.parser' parser.
res = requests.get(page_url)
page_text = res.text
soup = BeautifulSoup(page_text, 'html.parser')

# Show the type of soup.
print(type(soup))
# Print soup itself.
print(soup)

虽然response.text和soup打印出的内容表面上看长得一模一样，却有着不同的内心，它们属于不同的类：<class 'str'> 与<class 'bs4.BeautifulSoup'>。前者是字符串，后者是已经被解析过的BeautifulSoup对象。之所以打印出来的是一样的文本，是因为BeautifulSoup对象在直接打印它的时候会调用该对象内的__str__方法，所以直接打印 bs 对象显示字符串是__str__的返回结果

提取数据

find()
find_all()
Tag对象

BeautifulSoup对象和Tag对象都可以使用find()与find_all()方法

find()与find_all()是BeautifulSoup对象的两个方法，它们可以匹配html的标签和属性，把BeautifulSoup对象里符合要求的数据都提取出来

Tag

import requests  # the requests library
from bs4 import BeautifulSoup  # the BeautifulSoup class

# requests.get returns a Response object, stored in res.
res = requests.get('https://localprod.pandateacher.com/python-manuscript/crawler-html/spider-men5.0.html')
# Response.text returns the body as a string.
html = res.text
# Parse the page into a BeautifulSoup object.
soup = BeautifulSoup(html, 'html.parser')
# Collect every element whose class attribute is 'books'.
items = soup.find_all(class_='books')
print(type(soup))
for i in items:
    # Within each matched element, look up the three pieces of interest.
    # NOTE(review): class_="h2" matches a class named "h2", not the <h2> tag;
    # confirm against the page whether find('h2') was intended.
    kind = i.find(class_="h2")
    title = i.find(class_="title")
    brief = i.find(class_="info")
    print(kind, '\n', title, '\n', brief)
    # Show the type of each extracted value.
    print(type(kind), type(title), type(brief))

除了我们拿到的数据之外；运行结果的数据类型，又是三个<class 'bs4.element.Tag'>，用find()提取出来的数据类型和刚才一样，还是Tag对象。接下来要做的，就是把Tag对象中的文本内容提出来
这时，可以用到Tag对象的另外两种属性——Tag.text，和Tag['属性名']

import requests  # the requests library
from bs4 import BeautifulSoup  # the BeautifulSoup class

# Fetch the data.
res_foods = requests.get('http://www.xiachufang.com/explore/')
# Parse the data.
bs_foods = BeautifulSoup(res_foods.text, 'html.parser')
# Find the smallest parent tag that wraps one recipe.
list_foods = bs_foods.find_all('div', class_='info pure-u')
# Empty list that will collect one [name, URL, ingredients] entry per recipe.
list_all = []

for food in list_foods:
    # The <a> tag inside this parent tag.
    tag_a = food.find('a')
    # Recipe name; [17:-13] trims the surrounding padding from the tag text.
    name = tag_a.text[17:-13]
    # Recipe URL, built from the site root and the tag's href attribute.
    URL = 'http://www.xiachufang.com' + tag_a['href']
    # The <p> tag inside this parent tag that holds the ingredients.
    tag_p = food.find('p', class_='ing ellipsis')
    # Ingredients; [1:-1] trims the first and last character of the tag text.
    ingredients = tag_p.text[1:-1]
    # Bundle name, URL and ingredients into a list and add it to list_all.
    list_all.append([name, URL, ingredients])

print(list_all)

请求豆瓣top250(不知道为啥不行，但是代码写出来了，有可能是反爬，还没看过反爬，得缓一缓)

# Scrape rank / title / rating / one-line quote / link for every film in
# Douban's Top 250 and print all of them.
import random
import requests
import bs4

# Presumably the site rejects the default python-requests User-Agent
# (anti-scraping) - send a browser-like one instead. TODO confirm.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}

# The 'start' query parameter is stepped by 25 on each of the 10 requests.
for x in range(10):
    url = 'https://movie.douban.com/top250?start=' + str(x*25) + '&filter='
    res = requests.get(url, headers=headers)
    bs = bs4.BeautifulSoup(res.text, 'html.parser')
    bs = bs.find('ol', class_="grid_view")
    if bs is None:
        # find() returns None when the list is absent (e.g. the request was
        # blocked); report it instead of crashing with AttributeError.
        print('第' + str(x + 1) + '页没有找到电影列表，状态码：' + str(res.status_code))
        break
    for titles in bs.find_all('li'):
        # Rank.
        num = titles.find('em', class_="").text
        # Film title.
        title = titles.find('span', class_="title").text
        # One-line quote; fall back to an empty string when the <span class="inq">
        # is absent so .text is never called on None.
        tag_inq = titles.find('span', class_="inq")
        tes = tag_inq.text if tag_inq is not None else ''
        # Rating.
        comment = titles.find('span', class_="rating_num").text
        # Link to the film's page.
        url_movie = titles.find('a')['href']
        print(num + '.' + title + '--' + comment + '\n' + '推荐语：' + tes + '\n' + url_movie)

爬取qq音乐的周杰伦歌曲

import requests  # the requests library

# Call get() to download the search result, which is JSON.
res_music = requests.get('https://c.y.qq.com/soso/fcgi-bin/client_search_cp?ct=24&qqmusic_ver=1298&new_json=1&remoteplace=txt.yqq.song&searchid=60997426243444153&t=0&aggr=1&cr=1&catZhida=1&lossless=0&flag_qc=0&p=1&n=20&w=%E5%91%A8%E6%9D%B0%E4%BC%A6&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0')
# Response.json() converts the response body into lists/dicts.
json_music = res_music.json()
# Walk down the nested dicts to reach the list of songs.
list_music = json_music['data']['song']['list']
# list_music is a list; music is one of its elements.
for music in list_music:
    # Song name, looked up by the 'name' key.
    print(music['name'])
    # Album name.
    print('所属专辑：' + music['album']['name'])
    # Play time.
    print('播放时长：' + str(music['interval']) + '秒')
    # Play link.
    print('播放链接：https://y.qq.com/n/yqq/song/' + music['mid'] + '.html\n\n')

想要的数据要到XHR请求中去找（在浏览器开发者工具的Network面板里查看）

带上请求头（headers）可以伪装成浏览器发起请求；请求可以放进循环里重复多次

import requests

# URL that serves the song comments.
url = 'https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg'

headers = {
    # Request origin; not actually required in this example, shown for demonstration.
    'origin': 'https://y.qq.com',
    # Request origin with more detail than 'origin'; also not required here,
    # shown for demonstration.
    'referer': 'https://y.qq.com/n/yqq/song/004Z8Ihr0JIu5s.html',
    # Identifies the device and browser the request claims to come from.
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}

# Query-string parameters, passed as a dict instead of being baked into the URL.
params = {
    'g_tk': '5381',
    'loginUin': '0',
    'hostUin': '0',
    'format': 'json',
    'inCharset': 'utf8',
    'outCharset': 'GB2312',
    'notice': '0',
    'platform': 'yqq.json',
    'needNewCode': '0',
    'cid': '205360772',
    'reqtype': '2',
    'biztype': '1',
    'topid': '102065756',
    'cmd': '8',
    'needcommentcrit': '0',
    'pagenum': 0,
    'pagesize': '25',
    'lasthotcommentid': '',
    'domain': 'qq.com',
    'ct': '24',
    # NOTE(review): the trailing spaces are kept from the original value -
    # confirm whether they are intended.
    'cv': '101010  ',
}

# Send the request with the custom headers and parameters.
res_music = requests.get(url, headers=headers, params=params)

用逗号分隔的内容，在csv文件中会被分到不同的单元格

# Open test.csv in append-and-read mode ('a+'), creating it if it does not
# exist; the with-block closes the file automatically, even if write() raises.
with open('test.csv', 'a+') as file:
    # Write one comma-separated line of text into test.csv.
    file.write('美国队长,钢铁侠,蜘蛛侠')

Python自带了csv模块。

import csv

# To create a csv file, call open() first with: the file name "demo.csv",
# write mode 'w', newline='' and encoding='utf-8'.
demo_path = "demo.csv"
csv_file = open(demo_path, mode='w', encoding='utf-8', newline='')

加newline=''参数的原因是，可以避免csv文件出现两倍的行距（就是能避免表格的行与行之间出现空白行）。加encoding='utf-8'，可以避免编码问题导致的报错或乱码。

openpyxl模块（用来读写Excel工作簿）

import openpyxl

# --- Writing ---
# Create a new, empty workbook.
wb = openpyxl.Workbook()
# Get the workbook's active worksheet.
sheet = wb.active
# Rename the worksheet.
sheet.title = 'new title'
# Write a value into cell A1.
sheet['A1'] = '漫威宇宙'
rows = [['美国队长', '钢铁侠', '蜘蛛侠'], ['是', '漫威', '宇宙', '经典', '人物']]
for i in rows:
    # Append one row to the worksheet.
    sheet.append(i)
print(rows)
# Save the workbook to disk.
wb.save('Marvel.xlsx')

# --- Reading ---
# Call openpyxl.load_workbook() to open the file "Marvel.xlsx".
wb = openpyxl.load_workbook('Marvel.xlsx')
# Get the worksheet named "new title" from the workbook.
sheet = wb['new title']
# sheetnames lists the names of all worksheets in the workbook - useful when
# you do not know how many worksheets it has.
sheetname = wb.sheetnames
print(sheetname)
# Take cell A1 of the "new title" worksheet, then read its value attribute
# to print what the cell holds.
A1_cell = sheet['A1']
A1_value = A1_cell.value
print(A1_value)

爬下周杰伦的歌，然后放到xlsx表中

import requests
import openpyxl

# Create a workbook and set up the header row of its active worksheet.
wb = openpyxl.Workbook()
sheet = wb.active
sheet.title = 'song'
sheet['A1'] = "歌曲名"
sheet['B1'] = '所属专辑'
sheet['C1'] = '播放时长'
sheet['D1'] = '播放链接'

url = 'https://c.y.qq.com/soso/fcgi-bin/client_search_cp'
# Five requests; the 'p' parameter takes the values '1' to '5'.
for x in range(5):
    params = {
        'ct': '24',
        'qqmusic_ver': '1298',
        'new_json': '1',
        'remoteplace': 'sizer.yqq.song_next',
        'searchid': '64405487069162918',
        't': '0',
        'aggr': '1',
        'cr': '1',
        'catZhida': '1',
        'lossless': '0',
        'flag_qc': '0',
        'p': str(x + 1),
        'n': '20',
        'w': '周杰伦',
        'g_tk': '5381',
        'loginUin': '0',
        'hostUin': '0',
        'format': 'json',
        'inCharset': 'utf8',
        'outCharset': 'utf-8',
        'notice': '0',
        'platform': 'yqq.json',
        'needNewCode': '0',
    }
    res_music = requests.get(url, params=params)
    json_music = res_music.json()
    list_music = json_music['data']['song']['list']
    for music in list_music:
        name = music['name']
        album = music['album']['name']
        # Named 'duration' rather than 'time' so it cannot shadow the stdlib module name.
        duration = music['interval']
        # Song page link. The trailing blank lines are added only when
        # printing, so they do not end up inside the spreadsheet cell.
        # NOTE(review): this uses music['file']['media_mid'] while the earlier
        # search example in this file uses music['mid'] - confirm which id the
        # song page expects.
        link = 'https://y.qq.com/n/yqq/song/' + str(music['file']['media_mid']) + '.html'
        # Store the song's own link, not the search API url.
        sheet.append([name, album, duration, link])
        print('歌曲名：' + name + '\n' + '所属专辑:' + album + '\n' + '播放时长:' + str(duration) + '\n' + '播放链接:' + link + '\n\n')
wb.save("Jay.xlsx")

继续学习中。。。