python--re模块及爬取红牛分公司实战演练

正则取消转义

正则表达式中取消转义推荐使用\(每个\只能取消一个字符的转义)

而python中取消转义推荐使用原生字符串r'\n\a\t'（也可以使用\）

python内置模块之re

引用方式：import re

import re
# re.findall(pattern, text) returns a list with every non-overlapping match.
res = re.findall('a', 'abd jason  jack')
print(res)  # ['a', 'a', 'a']

# re.search(pattern, text) scans the whole text and returns a Match object
# for the first hit only.
ret = re.search('s', 'hawhe scere  asec')
print(ret)  # <re.Match object; span=(6, 7), match='s'>
print(ret.group())  # s -- the first 's' that was found

# With no hit, search() returns None; calling .group() on None raises
# AttributeError, so check the result before using it.
miss = re.search('j', 'hawhe scere  asec')
print(miss)  # None
if miss is not None:
    print(miss.group())

findall中没有查到相关的字符，则返回值是一个空列表[]

search中没有查到相关的字符时返回None，此时调用group()会直接报错（AttributeError）

re.match:根据正则从头开始匹配，换言之，想要匹配的字符必须在开头

# re.match only tries the pattern at the very start of the text.
_text = 'hawhe scere  asec'
for _pattern in ('a', 'h'):
    # 'a' is not the first character -> None; 'h' is -> a Match object
    ret = re.match(_pattern, _text)
    print(ret)
print(ret.group())  # h

re.split:按照前面的正则表达式切割字符串，例如 res=re.split('[ab]','abcd')，print(res) 的返回值是 ['', '', 'cd']

re.sub:re.sub('正则表达式', '替换字符', '字符串', 替换个数（不写默认替换全部）)#输出结果是将匹配到的内容替换成替换字符之后的字符串

re.subn:在sub的基础上返回一个元组（替换后的字符串, 替换的次数）

re.compile:

# Compile the pattern once and reuse the compiled object. The raw string
# keeps the backslash in \d from being read as an invalid string escape.
regexp_obj = re.compile(r'\d+')
res = regexp_obj.search('absd213j1hjj456jk')  # first run of digits anywhere: '213'
res1 = regexp_obj.match('123hhkj2h1j3123')  # digits at the start of the text: '123'
res2 = regexp_obj.findall('1213k1j2jhj21j3123hh')  # every run of digits, as a list
print(res, res1, res2)

'''常用'''
findall针对分组优先展示无名分组
# When the pattern contains a capturing group, findall returns the text of
# that group rather than the whole match. Raw string: \d must reach the
# regex engine unchanged.
res = re.findall(r"^[1-9]\d{14}(\d{2}[0-9x])?$", '110105199812067023')
print(res)  # ['023']
?:取消分组优先展示
# res1 = re.findall("^[1-9](?:\d{14})(?:\d{2}[0-9x])?$",'110105199812067023')
# print(res1)

# Named groups: (?P<name>...) labels a group so its value can be read back
# by that name. Raw string: \d must reach the regex engine unchanged.
res = re.search(r'^[1-9](?P<xxx>\d{14})(?P<ooo>\d{2}[0-9x])?$', '110105199812067023')
print(res)
print(res.group())  # 110105199812067023
print(res.group(1))  # 10105199812067 -- access by index, as for unnamed groups
print(res.group('xxx'))  # 10105199812067
print(res.group('ooo'))  # 023

re实战之爬取红牛分公司数据

# Read the saved page once; the regexes below run over the whole document.
with open(r'redbull.html', 'r', encoding='utf-8') as f:
    data = f.read()

# Non-greedy groups capture the text between each pair of tags.
title = re.findall('<h2>(.*?)</h2>', data)
email_list = re.findall('<p class="mailIco">(.*?)</p>', data)
phone_list = re.findall('<p class="telIco">(.*?)</p>', data)

# zip pairs the n-th title with the n-th e-mail and the n-th phone entry.
res = zip(title, email_list, phone_list)
for data_tuple in res:
    # Labels match what is actually extracted: company name, e-mail, phone.
    print("""公司名称：%s
公司邮箱：%s
公司电话：%s
""" % (data_tuple[0], data_tuple[1], data_tuple[2]))

collections模块(高阶模块)

1.namedtuple(具名元组)
from collections import namedtuple

2.队列 import queue # 内置队列模块:FIFO

 # queue.Queue is the built-in first-in, first-out queue.
import queue

# Initialise the queue
q = queue.Queue()
# Add elements to the queue
q.put('first')
q.put('second')
q.put('third')
# Take elements out again, in insertion order
print(q.get())  # first
print(q.get())  # second
print(q.get())  # third
# The queue is now empty: one more plain q.get() would wait in place until
# an item is put, so report the state instead of hanging the script.
print(q.empty())  # True

3.双端队列deque

 # Double-ended queue: items can be added and removed at both ends.
from collections import deque

q = deque([11, 22, 33])
q.append(44)  # add on the right
q.appendleft(55)  # add on the left
print(q.pop())  # take from the right -> 44
print(q.popleft())  # take from the left -> 55

4.有序字典OrderedDict

from collections import OrderedDict

# The same key/value pairs feed both containers so their output can be compared.
_pairs = [('id', 2379837394), ('name', 'jason'), ('pwd', '123')]

normal_dict = dict(_pairs)
print(normal_dict)

order_dict = OrderedDict(_pairs)
print(order_dict)

5.默认字典defaultdict

可以当作字典使用，访问不存在的键时会自动生成默认值；下面的例子按字符的ascii码大小，把字符分别归到不同的k（k1、k2）对应的v列表中

from collections import defaultdict

res = 'bnmfasdjfadfhajdkfhjd'
# defaultdict(list) creates an empty list the first time a key is used,
# so append() works without initialising 'k1' / 'k2' beforehand.
my_dict = defaultdict(list)
for value in res:
    # Single characters compare by code point: letters after 'f' go to k1,
    # everything else (including 'f' itself) goes to k2.
    if value > 'f':
        my_dict['k1'].append(value)
    else:
        my_dict['k2'].append(value)
print(my_dict)

6.计数器:统计每个字符出现的次数

# The class is spelled Counter; collections has no lowercase `counter`.
from collections import Counter

res = 'flghafgasdfhasfaksdjfhasldk'
# Counter maps each character to the number of times it occurs in res.
ret = Counter(res)
print(ret)

time与datetime模块

时间的三种表现形式：1.时间戳，2.结构化时间（机器识别），3.格式化时间（人识别）

# Formatted time: strftime renders the current local time with the given format.
import time  # needed by the calls below

print(time.strftime('%Y-%m-%d'))  # e.g. 2021-11-25
print(time.strftime('%Y-%m-%d %H:%M:%S'))  # e.g. 2021-11-25 11:48:34
# %X is the locale's time representation; it printed the same as %H:%M:%S here.
print(time.strftime('%Y-%m-%d %X'))  # e.g. 2021-11-25 11:48:34

import datetime
# Commented-out datetime examples: today's date/time, reading individual
# fields, and shifting a datetime by a timedelta.
# print(datetime.date.today())  # 2021-11-25
# print(datetime.datetime.today())  # 2021-11-25 12:15:11.969769
# The note below reads: date = year/month/day, datetime = year/month/day
# plus hour/minute/second, time = hour/minute/second.
"""date年月日  datetime年月日时分秒  time时分秒(MySQL django后期可以)"""
# res = datetime.datetime.today()
# print(res.year)  # 2021
# print(res.month)  # 11
# print(res.day)  # 25
# print(res.weekday())  # day of the week as 0-6, where 0 is Monday
# print(res.isoweekday())  # day of the week as 1-7, where 1 is Monday
# The heading below reads: time difference (timedelta).
"""时间差(timedelta)"""
# ctime = datetime.datetime.today()
# time_tel = datetime.timedelta(days=3)
# print(ctime)  # 2021-11-25 12:20:48.570489
# print(ctime - time_tel)  # 2021-11-22 12:21:06.712396
# print(ctime + time_tel)  # 2021-11-28 12:21:06.712396
"""
日期对象 = 日期对象 +/- timedelta对象
timedelta对象 = 日期对象 +/- 日期对象