04-正则解析+bs4基础爬虫

正则
匹配字符串
正则表达式必备技能：
元字符：（单个字符）
1，普通字符
2, 特殊字符
. 匹配所有文字符号，除了换行
\w 匹配数字，字母，下划线
\d 匹配数字
\W 匹配除了数字，字母，下划线以外的字符
\D 匹配除了数字以外的字符
^ 匹配一句话的开头
$ 匹配一句话的结尾
[abc] 字符组：匹配 a、b、c 中的任意一个字符（例如 [a-zA-Z0-9_] 等价于 \w）
[^abc] 非字符组：匹配除 a、b、c 以外的任意一个字符（例如 [^a-zA-Z0-9_] 等价于 \W）
3,量词
用来修饰前面的一个单位的出现次数
* 表示出现0次或者更多次
+ 表示出现一次或者更多次
? 表示出现0次或者1次
{n}表示出现n次 # \d{3} 表示出现3个数字
4 ,贪婪匹配
默认的量词（*，+，?）都是尽可能多的匹配内容
5 , 惰性匹配
a.*?x （匹配到a和第一个x就结束）惰性匹配：在量词后面加 ?，尽可能少的匹配，回溯算法

import re  # standard-library module for working with regular expressions

# findall returns every match as a list of strings.
lst = re.findall(r"\d+", "哈哈99呵呵88嘿嘿77")
print(lst)  # \d matches a digit
# ['99', '88', '77']

# finditer yields match objects instead of strings; call .group() for the text.
it = re.finditer(r"\d+", "哈哈99呵呵88嘿嘿77")
for item in it:
    print(item.group())
# 99
# 88
# 77

it = re.finditer(r"\d+", "1哈哈哈sdasd12123ha4562哈哈哈78922")
# Take the first match off the iterator by hand.
obj = next(it)
print(obj.group())
# 1
# The loop resumes after the match consumed above, so it prints the rest.
for item in it:
    print(item.group())
# 12123
# 4562
# 78922

# match only tries at the very start of the string and yields one result.
result = re.match(r"\d+", "123456哈哈99呵呵88嘿嘿77")
print(result.group())
# 123456

# search scans the string and returns as soon as it finds the first match.
result1 = re.search(r"\d+", "哈哈123456哈哈99呵呵88嘿嘿77")
print(result1.group())
# 123456

# compile builds a pattern object whose methods can be called repeatedly.
obj = re.compile(r"www\.baidu\.com")
it = obj.finditer("www.baidu.com,www.jd.com")
for item in it:
    print(item.group())
# www.baidu.com

# Parentheses capture the part you actually want;
# (?P<name>...) gives the captured group a name.
obj = re.compile(r"www\.(?P<gpc>baidu|jd)\.com")
it = obj.finditer("www.baidu.com,www.jd.com")
for item in it:
    print(item.group("gpc"))
# baidu
# jd

# With a single capturing group, findall returns only the group's text.
obj = re.compile(r"www\.(baidu|jd)\.com")
it = obj.findall("www.baidu.com,www.jd.com")
print(it)
# ['baidu', 'jd']

from urllib.request import urlopen  # stdlib helper for opening URLs
import re

# Open the page and read its source. The context manager closes the
# connection even if reading or decoding raises.
with urlopen("https://news.163.com/") as response:
    content = response.read().decode("gbk")
# print (content)

# re.S lets "." also match newlines, so the lazy ".*?" can span lines.
obj = re.compile(r'<div class="mod_top_news2" id="js_top_news">.*?">(?P<title>.*?)</a>',re.S)
# The pattern has exactly one capturing group, so findall returns a list
# holding that group's text for every match.
lst = obj.findall(content)
print(lst)

补：分析浏览器开发者工具中Elements和network这两个选项卡对应的页面源码数据有何不同之处？

Elements中包含的显示的页面源码数据为当前页面所有的数据加载完毕后对应的完整页面数据源码（包含了动态加载的数据）
network中显示的页面源码数据仅仅为某一个单独的请求对应的响应数据（不包含动态加载数据）
结论：如果在进行数据解析的时候，一定需要对页面布局进行数据分析，如果当前网站没有动态加载的数据可以直接使用Elements对页面布局进行分析，否则只可以使用network对页面数据进行分析。
如下图静态网站：

动态加载：

使用正则提取校花网图片链接

# -*- coding: utf-8 -*-
import os
import re
import urllib.request  # import the submodule explicitly; plain "import urllib" does not guarantee it is loaded

import requests

dirName = "校花图片"  # directory the downloaded images are stored in
if not os.path.exists(dirName):  # create the directory when it does not exist yet
    os.mkdir(dirName)

# Browser-like User-Agent header sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
}
url = "http://www.521609.com/daxuemeinv/"  # target listing page
page_text = requests.get(url=url, headers=headers).text  # source of the listing page

# Captures the src attribute of the <img> inside each <li>.
ex = '<li>.*?<img src="(.*?)" width=.*?</li>'
# re.S lets "." match newlines, so one <li> may span several lines.
img_src_list = re.findall(ex, page_text, re.S)

for src in img_src_list:
    print(src)
    # The captured path is not a complete URL, so prefix the site root.
    src = "http://www.521609.com" + src
    # Name the local file after the last path segment of the URL.
    imgpath = dirName + '/' + src.split('/')[-1]
    urllib.request.urlretrieve(src, imgpath)  # download the image to imgpath
    print(imgpath, "下载完成")

bs4

bs4 解析原理
- 实例化一个BeautifulSoup的对象，且将待解析的页面源码数据加载到该对象中
- 调用BeautifulSoup对象中相关方法或者属性进行标签定位和文本数据提取
环境安装
- pip install lxml #解析器
- pip install bs4
BeautifulSoup对象的实例化：
- BeautifulSoup(fp, 'lxml')：用来将本地存储的html文档中的数据进行解析
- BeautifulSoup(page_text, 'lxml')：用来将互联网上请求到的页面源码数据进行解析
标签定位：

- soup.tagName：只可以定位到第一次出现的TagName标签

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent header sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
}
url = "http://www.shicimingju.com/book/sanguoyanyi.html"
page_text = requests.get(url=url, headers=headers).text
# Optional: save the fetched page to disk for offline inspection.
# with open("三国.html","w",encoding="utf-8") as f:
#     f.write(page_text)
soup = BeautifulSoup(page_text, 'lxml')  # build a BeautifulSoup object using the lxml parser
print(soup.li)  # soup.tagName locates only the first occurrence of that tag

- soup.find('tagName', attrName='value')：属性定位（class 是 Python 关键字，需写成 class_）

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent header sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
}
url = "http://www.shicimingju.com/book/sanguoyanyi.html"
page_text = requests.get(url=url, headers=headers).text
# Optional: save the fetched page to disk for offline inspection.
# with open("三国.html","w",encoding="utf-8") as f:
#     f.write(page_text)
soup = BeautifulSoup(page_text, 'lxml')  # build a BeautifulSoup object using the lxml parser
# soup.find('tagName', attrName='value') locates a tag by attribute;
# the class attribute is spelled class_ because "class" is a Python keyword.
print(soup.find('div', class_="book-mulu"))

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent header sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
}
url = "http://www.shicimingju.com/book/sanguoyanyi.html"
page_text = requests.get(url=url, headers=headers).text
# Optional: save the fetched page to disk for offline inspection.
# with open("三国.html","w",encoding="utf-8") as f:
#     f.write(page_text)
soup = BeautifulSoup(page_text, 'lxml')  # build a BeautifulSoup object using the lxml parser
# soup.find('tagName', attrName='value') locates a tag by attribute,
# here first by id and then by class.
print(soup.find('div', id="top_right_nav"))
print(soup.find('p', class_="des"))

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent header sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
}
url = "http://www.shicimingju.com/book/sanguoyanyi.html"
page_text = requests.get(url=url, headers=headers).text
# Optional: save the fetched page to disk for offline inspection.
# with open("三国.html","w",encoding="utf-8") as f:
#     f.write(page_text)
soup = BeautifulSoup(page_text, 'lxml')  # build a BeautifulSoup object using the lxml parser
# find returns the first tag that matches the attribute filter.
print(soup.find('p', class_="des"))
# find_all takes the same filters as find but returns a list of all matches.
# It is the current spelling of the legacy findAll alias.
print(soup.find_all('p', class_="des"))

soup.select("选择器")：使用CSS选择器进行定位，返回值是列表

类选择器
id选择器
层级选择器

> : 表示一个层级

空格 : 表示多个层级

如下：

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent header sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
}
url = "http://www.shicimingju.com/book/sanguoyanyi.html"
page_text = requests.get(url=url, headers=headers).text
# Optional: save the fetched page to disk for offline inspection.
# with open("三国.html","w",encoding="utf-8") as f:
#     f.write(page_text)
soup = BeautifulSoup(page_text, 'lxml')  # build a BeautifulSoup object using the lxml parser
# soup.select takes a CSS selector; ">" steps down exactly one level.
print(soup.select('.book-mulu > ul > li >a'))
# A space in the selector spans any number of levels.
print(soup.select('.book-mulu a'))

文本数据提取：

.text ：返回的是该标签下的所有文本内容
.string: 返回的是该标签直系下的文本内容

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent header sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
}
url = "http://www.shicimingju.com/book/sanguoyanyi.html"
page_text = requests.get(url=url, headers=headers).text
soup = BeautifulSoup(page_text, 'lxml')  # build a BeautifulSoup object using the lxml parser
a_href = soup.select('.book-mulu > ul > li >a')
for a in a_href:
    # Print the text of each selected <a> tag; a.text is the alternative accessor.
    print(a.string)

取url属性

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent header sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
}
url = "http://www.shicimingju.com/book/sanguoyanyi.html"
page_text = requests.get(url=url, headers=headers).text
soup = BeautifulSoup(page_text, 'lxml')  # build a BeautifulSoup object using the lxml parser
a_href = soup.select('.book-mulu > ul > li >a')
for a in a_href:
    # Attribute extraction: index the tag like a dict to read an attribute.
    print(a['href'])

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent header sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
}
url = "http://www.shicimingju.com/book/sanguoyanyi.html"
page_text = requests.get(url=url, headers=headers).text
soup = BeautifulSoup(page_text, 'lxml')  # build a BeautifulSoup object using the lxml parser
a_list = soup.select('.book-mulu > ul > li >a')

# The context manager closes the output file even if a request or a
# parse step raises part-way through the loop.
with open("三国演义.txt", "w", encoding="utf-8") as fp:
    for a in a_list:
        title = a.string  # chapter title shown in the link
        # The href taken from the link is joined onto the site root.
        detail_url = "http://www.shicimingju.com" + a['href']
        # Request the detail page and parse the chapter content out of it.
        page_text_detail = requests.get(url=detail_url, headers=headers).text
        # Use a separate name so the index-page soup is not overwritten.
        detail_soup = BeautifulSoup(page_text_detail, 'lxml')
        div_tag = detail_soup.find('div', class_="chapter_content")
        content = div_tag.text
        fp.write(title + ':' + content + "\n")
        print(title, "保存成功！！！")