Python-urllib、BeautifulSoup爬取豆瓣数据

b站学习地址：urllib获取网页数据

https://www.bilibili.com/video/BV12E411A7ZQ?p=18

1、get请求

import urllib.request  # 指定url，获取网页数据
import urllib.parse  # 解析器
# Fetch the page; the context manager closes the HTTP response afterwards.
with urllib.request.urlopen("http://www.baidu.com") as response:
    # Decode the downloaded page source as UTF-8 before printing it.
    print(response.read().decode('utf-8'))

2、post请求

# POST request: http://httpbin.org/post is a site meant for testing requests;
# a POST can be used to simulate a real user login.
try:
    # Encode the form fields into a bytes payload for the request body.
    data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding='utf-8')
    with urllib.request.urlopen("http://httpbin.org/post", data=data, timeout=1) as response:
        print(response.read().decode('utf-8'))
except Exception as e:
    if hasattr(e, "code"):  # print the code when the error carries one
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)

3、获取响应头信息

# Request the test endpoint and print the headers of the response;
# the context manager closes the response afterwards.
with urllib.request.urlopen("http://httpbin.org/get") as response:
    print(response.getheaders())

4、带headers封装post

# Send a POST request with custom headers, wrapped in a Request object.
url = "https://www.douban.com"
data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding='utf-8')
# The User-Agent header disguises the request as coming from a browser
# when it is sent to the server.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
}
req = urllib.request.Request(url=url, data=data, headers=headers, method="POST")
# The context manager closes the HTTP response once the body has been printed.
with urllib.request.urlopen(req) as response:
    print(response.read().decode('utf-8'))

b站学习地址：BeautifulSoup解析获取到的网页数据

https://www.bilibili.com/video/BV12E411A7ZQ?p=20

1、定义网页对象，选择对应解析器

import re  # regular expressions
from bs4 import BeautifulSoup  # HTML parsing / data extraction

# BeautifulSoup turns a complex HTML document into a tree structure in which
# every node is a Python object. All objects fall into four kinds:
# - Tag
# - NavigableString
# - BeautifulSoup
# - Comment

# "./" is the current folder; "rb" reads the file as bytes.
# The with-block closes the file as soon as its content has been read.
with open("./baidu.html", "rb") as file:
    html = file.read()
bs = BeautifulSoup(html, "html.parser")  # the parser is html.parser

2、获取Tag标签、标签内容、标签属性

# Accessing a tag by name yields only the FIRST tag found with that name.
title_tag = bs.title
for node in (title_tag, bs.head):
    print(node)
# The content inside the tag.
print(title_tag.string)
# The attribute values of the tag.
print(bs.a.attrs)

3、文档遍历

# Traversing the document: take the contents of <head> once, then print
# the whole list followed by its second entry.
head_children = bs.head.contents
print(head_children)
print(head_children[1])

4、文档搜索

# --- Searching the document ---

# String filter: look for content that matches the string exactly.
t_list = bs.find_all("a", limit=3)

# Regular expression: everything whose name contains "a" is returned.
t_list = bs.find_all(re.compile("a"))


# Function filter: search according to what the function requires.
def name_is_exists(tag):
    """Return True when *tag* has a ``name`` attribute."""
    return tag.has_attr("name")


t_list = bs.find_all(name_is_exists)
for item in t_list:
    print(item)

# Search by keyword argument.
t_list = bs.find_all(id="head")
t_list = bs.find_all(class_=True)  # tags that have a class attribute

# Text search.
# NOTE: Beautiful Soup 4.4+ also accepts this argument under the name string=.
t_list = bs.find_all(text=["hao123", "地图", "贴吧"])
t_list = bs.find_all(text=re.compile(r"\d"))  # regex matching a digit

# CSS selectors.
t_list = bs.select('title')  # by tag name
t_list = bs.select(".mnav")  # by class name (a leading "." means class)
t_list = bs.select("#u1")  # by id
t_list = bs.select("a[class='bri']")  # by attribute value
t_list = bs.select("head > title")  # by child tag, one level at a time
t_list = bs.select(".mnav ~ .bri")  # sibling nodes
print(t_list[0].get_text())

基本代码

https://www.bilibili.com/video/BV12E411A7ZQ?p=24

import re
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import xlwt
import sqlite3

url = "http://movie.douban.com/top250?start="
# Compiled regular expressions, one per field extracted from a movie item.
findLink = re.compile(r'<a href="(.*?)">')
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)
findTitle = re.compile(r'<span class="title">(.*?)</span>')
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
findJudge = re.compile(r'<span>(\d*)人评价</span>')
findIng = re.compile(r'<span class="inq">(.*)</span>')
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)


def main():
    """Scrape the list pages and store the records in the SQLite database."""
    datalist = getData(url)
    dbpath = "movie.db"
    # Alternative storage: saveData(datalist, "douban.xls") writes an Excel file.
    saveData2DB(datalist, dbpath)


def getData(url):
    """Fetch the ten list pages and return the parsed movie records.

    Args:
        url: Base URL; the offsets 0, 25, ..., 225 are appended to it.

    Returns:
        A list holding one 8-item list per movie: detail link, image link,
        Chinese title, foreign title, rating, number of ratings, one-line
        summary and the descriptive info text.
    """
    datalist = []
    for i in range(0, 10):
        html = askURL(url + str(i * 25))
        soup = BeautifulSoup(html, "html.parser")
        # Each movie is a <div> whose class is "item"; convert it to str so
        # the regular expressions can be applied to it.
        for item in soup.find_all('div', class_="item"):
            datalist.append(_parse_item(str(item)))
    return datalist


def _parse_item(item):
    """Extract the eight record fields from the HTML text of one movie item."""
    data = []
    # The link pattern matches twice per item; only the first hit is needed.
    data.append(re.findall(findLink, item)[0])
    data.append(re.findall(findImgSrc, item)[0])

    titles = re.findall(findTitle, item)
    if len(titles) == 2:  # both a Chinese and a foreign title are present
        data.append(titles[0])
        data.append(titles[1].replace("/", ""))
    else:
        data.append(titles[0])
        data.append(' ')  # keep the foreign-title column blank

    data.append(re.findall(findRating, item)[0])
    data.append(re.findall(findJudge, item)[0])

    ing = re.findall(findIng, item)
    if len(ing) != 0:
        data.append(ing[0].replace("。", ""))
    else:
        data.append(" ")

    bd = re.findall(findBd, item)[0]
    bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)  # drop <br/> tags
    bd = re.sub('/', " ", bd)  # replace the "/" separators
    data.append(bd.strip())
    return data


def askURL(url):
    """Download *url* and return its body decoded as UTF-8.

    Any error is printed and an empty string is returned instead, so one
    failed page does not abort the whole scrape.
    """
    html = ""
    headers = {
        # Well-formed browser User-Agent (no stray spaces inside the tokens).
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.59"
    }
    # Fetching a page is a plain GET request; no request body is sent.
    req = urllib.request.Request(url=url, headers=headers)
    try:
        # The context manager closes the HTTP response after reading.
        with urllib.request.urlopen(req) as response:
            html = response.read().decode('utf-8')
    except Exception as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        if not (hasattr(e, "code") or hasattr(e, "reason")):
            # Errors without code/reason are reported too, not swallowed.
            print(e)
    return html


def saveData(datalist, savepath):
    """Write the records to an Excel workbook saved at *savepath*.

    Args:
        datalist: Records as returned by getData(), eight fields each.
        savepath: Path of the .xls file to write.
    """
    print("开始！")
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)  # 0: no compression
    # cell_overwrite_ok=True allows a cell to be written more than once.
    sheet = book.add_sheet('douban250', cell_overwrite_ok=True)
    col = ("电影链接", "图片链接", "中文名", "外文名", "评分", "评价数", "概述", "相关信息")
    for j, title in enumerate(col):
        sheet.write(0, j, title)
    # Row 0 holds the titles, so record i goes to row i + 1. Iterating over
    # datalist itself (not a fixed 250) tolerates a scrape with fewer records.
    for i, data in enumerate(datalist):
        for j in range(len(col)):
            sheet.write(i + 1, j, data[j])
    print("结束！")
    book.save(savepath)


def saveData2DB(datalist, dbpath):
    """Insert every record of *datalist* into the movie250 table.

    Args:
        datalist: Records of eight fields each, in table-column order.
        dbpath: Path of the SQLite database file.
    """
    init_db(dbpath)
    # "?" placeholders let sqlite3 bind the values itself, so quotes inside a
    # title cannot break the statement and datalist is left unmodified.
    sql = '''
        insert into movie250 (
            info_link, pic_link, cname, oname, score, rated, instroduction, info)
        values (?, ?, ?, ?, ?, ?, ?, ?)
    '''
    conn = sqlite3.connect(dbpath)
    try:
        conn.executemany(sql, datalist)
        conn.commit()
    finally:
        conn.close()


def init_db(dbpath):
    """Create the movie250 table in *dbpath* unless it already exists.

    Rows stored by earlier runs are kept, so the script can be run again
    without failing on the existing table.
    """
    sql = '''
        create table if not exists movie250 (
            id integer primary key autoincrement,
            info_link text,
            pic_link text,
            cname varchar,
            oname varchar,
            score numeric,
            rated numeric,
            instroduction text,
            info text
        )
    '''
    conn = sqlite3.connect(dbpath)  # opens the database, creating the file if missing
    try:
        conn.execute(sql)
        conn.commit()
    finally:
        conn.close()


if __name__ == "__main__":
    main()

报错相关

object of type 'NoneType' has no len()

原因：被调用的函数漏写了 return，默认返回 None，对这个返回值调用 len() 就会报错

UnboundLocalError: local variable 'a' referenced before assignment

原因：局部变量与全局变量名字重复——函数内对该变量赋值后，它会被当作局部变量，在赋值之前引用就会报错；如需修改全局变量，应在函数内用 global 声明

注意：短时间内频繁请求会被豆瓣封IP，请控制抓取频率

Python-urllib、BeautifulSoup爬取豆瓣数据相关推荐

爬虫beautifulsoup爬取豆瓣读书数据
爬虫beautifulsoup爬取豆瓣读书数据:主要是爬取收集书的名字.类别.简介,用于接下来的聚类学习. 豆瓣链接:https://book.douban.com/tag/?view=type&am ...
[python爬虫] BeautifulSoup爬取+CSV存储贵州农产品数据
在学习使用正则表达式.BeautifulSoup技术或Selenium技术爬取网络数据过程中,通常会将爬取的数据存储至TXT文件中,前面也讲述过海量数据存储至本地MySQL数据库中,这里主要补充Bea ...
Python爬取豆瓣+数据可视化
博客原文和源码下载:Python爬取豆瓣+数据可视化前言前段时间应我姐邀请,看了一下Python爬虫.不得不说Python的语法确实简洁优美,可读性强,比较接近自然语言,非常适合编程的初学者上手. ...
python BeautifulSoup爬取豆瓣电影top250信息并写入Excel表格
豆瓣是一个社区网站,创立于2005年3月6日.该网站以书影音起家,提供关于书籍,电影,音乐等作品信息,其描述和评论都是由用户提供的,是Web2.0网站中具有特色的一个网站. 豆瓣电影top250网址: ...
BeautifulSoup爬取豆瓣电影数据
BeautifulSoup爬取豆瓣TOP250 豆瓣爬取地址 https://movie.douban.com/top250?format=text BeautifulSoup官网地址 https:/ ...
python爬取豆瓣电影top250_用Python爬虫实现爬取豆瓣电影Top250
用Python爬虫实现爬取豆瓣电影Top250 #爬取豆瓣电影Top250 #250个电影 ,分为10个页显示,1页有25个电影 import urllib.request from bs4 imp ...
python用bs4爬取豆瓣电影排行榜 Top 250的电影信息和电影图片，分别保存到csv文件和文件夹中
python用bs4爬取豆瓣电影排行榜 Top 250的电影信息和图片,分别保存到csv文件和文件夹中. 爬取的数据包括每个电影的电影名 , 导演 ,演员 ,评分,推荐语,年份,国家,类型. py如果 ...
Python实战，爬取金融期货数据
大家好,我是毕加锁. 今天给大家带来的是 Python实战,爬取金融期货数据文末送书! 文末送书! 文末送书! 任务简介首先,客户原需求是获取https://hq.smm.cn/copper网 ...
【Python实战】爬取豆瓣排行榜电影数据(含GUI界面版)
项目简介这个项目源于大三某课程设计.平常经常需要搜索一些电影,但是不知道哪些评分高且评价人数多的电影.为了方便使用,就将原来的项目重新改写了.当做是对爬虫技术.可视化技术的实践了.主要是通过从排行榜 ...

Python-urllib、BeautifulSoup爬取豆瓣数据

Python-urllib、BeautifulSoup爬取豆瓣数据相关推荐

最新文章

热门文章