urllib库的用法

urlopen

urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)

import urllib.request

# A GET request: no `data` argument is needed.
# The with-statement closes the connection once the body has been read.
with urllib.request.urlopen('http://www.baidu.com') as response:
    print(response.read().decode('utf-8'))

import urllib.parse
import urllib.request

# Passing `data` makes this a POST request; the form must be URL-encoded bytes.
data = bytes(urllib.parse.urlencode({'world': 'hello'}), encoding='utf8')
# The with-statement closes the connection once the body has been read.
with urllib.request.urlopen('http://httpbin.org/post', data=data) as response:
    print(response.read())

import urllib.request

# `timeout` is the number of seconds to wait for the request;
# an error is raised if it does not complete in time.
with urllib.request.urlopen('http://httpbin.org/get', timeout=1) as response:
    print(response.read())

import socket
import urllib.error
import urllib.request

# A 0.1 s timeout is deliberately too short, so the request is expected to fail.
try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    # Failures while connecting are wrapped in URLError; the cause is e.reason.
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
except socket.timeout:
    # A timeout while waiting for the response can be raised directly,
    # without a URLError wrapper.
    print('TIME OUT')
else:
    # The request unexpectedly succeeded; release the connection.
    response.close()
# Expected output: TIME OUT

response

响应的类型

import urllib.request

# Show the type of the object returned by urlopen.
with urllib.request.urlopen('https://www.python.org') as response:
    print(type(response))
# Output: <class 'http.client.HTTPResponse'>

状态码Status Code，响应头Response Headers

import urllib.request

# Inspect the status code and the headers of a response.
with urllib.request.urlopen('https://www.python.org') as response:
    print(response.status)
    print(response.getheaders())  # all response headers
    print(response.getheader('Server'))  # one specific response header

import urllib.request

# Print the status code followed by the decoded response body.
with urllib.request.urlopen('https://www.python.org') as response:
    print(response.status)
    print(response.read().decode('utf-8'))

request

构造request，便于自定义参数，设置请求方式

from urllib import parse, request

# Build a Request object so that headers and the HTTP method can be customised.
url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)',
    'Host': 'httpbin.org',
}
# Named form_data rather than `dict` so the built-in type is not shadowed.
form_data = {'name': 'Germey'}
# Form data has to be sent as bytes.
data = bytes(parse.urlencode(form_data), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
# The with-statement closes the connection once the body has been read.
with request.urlopen(req) as response:
    print(response.status)
    print(response.read().decode('utf-8'))

或者调用request的add_header方法传入头

from urllib import parse, request

url = 'http://httpbin.org/post'
# Named form_data rather than `dict` so the built-in type is not shadowed.
form_data = {'name': 'Germey'}
# Form data has to be sent as bytes.
data = bytes(parse.urlencode(form_data), encoding='utf8')
req = request.Request(url=url, data=data, method='POST')
# Headers can also be attached after the Request has been constructed.
req.add_header('User-Agent', 'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)')
with request.urlopen(req) as response:
    print(response.read().decode('utf-8'))

handler

代理
通过切换代理 ip 让服务器识别出来自不同的地方，防止封掉爬虫

import urllib.request

# Route requests through a proxy so the server sees a different client IP,
# which helps keep the crawler from being blocked.
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://49.76.14.74:61202',
    'https': 'https://49.76.14.74:61202',
})
opener = urllib.request.build_opener(proxy_handler)
with opener.open('http://www.baidu.com') as response:
    print(response.read())

Cookie，保存用户登陆信息

import http.cookiejar
import urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
# Opening the page lets the handler collect the cookies Baidu sets into `cookie`.
response = opener.open('http://www.baidu.com')
response.close()  # only the cookies are needed, not the body
# Print every collected cookie as name=value.
for item in cookie:
    print(f"{item.name}={item.value}")

把cookie保存为文件，便于每次读取登陆信息。

import http.cookiejar
import urllib.request

filename = 'cookie.txt'
# MozillaCookieJar stores cookies in the Mozilla (Firefox) cookies.txt format.
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
response.close()  # only the cookies are needed, not the body
# Also save cookies that are marked to be discarded or have already expired.
cookie.save(ignore_discard=True, ignore_expires=True)

import http.cookiejar
import urllib.request

filename = 'cookie.txt'
# LWPCookieJar stores cookies in the libwww-perl Set-Cookie3 file format.
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
response.close()  # only the cookies are needed, not the body
# Also save cookies that are marked to be discarded or have already expired.
cookie.save(ignore_discard=True, ignore_expires=True)

用什么方法存cookie，用什么方法读取就行了

import http.cookiejar
import urllib.request

# Read the cookies back with the same jar class that was used to save them.
cookie = http.cookiejar.LWPCookieJar()
# ignore_discard=True also loads cookies that are marked to be discarded;
# ignore_expires=True also loads cookies that have already expired.
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
# The with-statement closes the connection once the body has been read.
with opener.open('http://www.baidu.com') as response:
    print(response.read().decode('utf-8'))

异常处理

from urllib import error, request

# Catch request failures and report why they happened.
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.URLError as e:
    print(e.reason)
else:
    # The request succeeded; release the connection.
    response.close()

from urllib import error, request

# HTTPError is a subclass of URLError, so it must be caught first.
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')
    # The request succeeded; release the connection.
    response.close()

import socket
import urllib.error
import urllib.request

# A 0.01 s timeout is deliberately too short, so the request is expected to fail.
try:
    response = urllib.request.urlopen('http://www.baidu.com', timeout=0.01)
except urllib.error.URLError as e:
    # e.reason holds the underlying exception; show its type before checking it.
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
except socket.timeout:
    # A timeout while waiting for the response can be raised directly,
    # without a URLError wrapper.
    print('TIME OUT')
else:
    # The request unexpectedly succeeded; release the connection.
    response.close()

URL解析

urlparse 将网址分解成六个参数

urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)

from urllib.parse import urlparse

# Split a URL into scheme, netloc, path, params, query and fragment.
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result), result)

如果网址没有指定协议类型，则使用scheme参数给出的默认协议类型；如果网址本身已经指定了协议类型，则scheme参数不会生效。

from urllib.parse import urlparse

# The URL has no scheme of its own, so the `scheme` argument supplies the default.
result = urlparse(
    url='www.baidu.com/index.html;user?id=5#comment',
    scheme='https',
)
print(result)

如果allow_fragments参数为False，则不识别fragment标识符。相反，它们被解析为path、params或query的一部分，并将fragment设置为返回值中的空字符串。

from urllib.parse import urlparse

# With allow_fragments=False the '#comment' part is not split off as a
# fragment; it stays attached to the preceding component (here the query).
result = urlparse('www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
print(result)

urlunparse 将参数拼接成网址

from urllib.parse import urlunparse

# The six components, in order: scheme, netloc, path, params, query, fragment.
data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))

urljoin

from urllib.parse import urljoin

# The second argument is already an absolute URL, so it replaces the base entirely.
print(urljoin('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'))

urlencode 把一个字典对象转化为get参数

from urllib.parse import urlencode

# Turn a dict into a GET query string and append it to the base URL.
params = {
    'name': 'germey',
    'age': 22,
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)

Python3爬虫入门之Urllib库的用法相关推荐

Python3爬虫入门之selenium库的用法
Selenium 基本使用 from selenium import webdriver from selenium.webdriver.common.by import By from seleni ...
爬虫入门之urllib库详解(二)
爬虫入门之urllib库详解(二) 1 urllib模块 urllib模块是一个运用于URL的包 urllib.request用于访问和读取URLS urllib.error包括了所有urllib.r ...
Python爬虫入门之Urllib库的基本使用
那么接下来,小伙伴们就一起和我真正迈向我们的爬虫之路吧. 1.分分钟扒一个网页下来怎样扒网页呢?其实就是根据URL来获取它的网页信息,虽然我们在浏览器中看到的是一幅幅优美的画面,但是其实是由浏览器解 ...
Python爬虫入门三urllib库基本使用
urllib是一个收集了多个涉及了URL的模块的包: URL获取网页 urllibtest.pyimport urllib2 response = urllib2.urlopen('http://ww ...
Python爬虫入门四urllib库的高级用法
1.设置headers 有些网站不会同意程序直接用上面的方式进行访问,如果识别有问题,那么站点根本不会响应,所以为了完全模拟浏览器的工作,我们需要设置一些 Headers 的属性. 首先,打开我们的浏 ...
python3爬虫入门（urllib和requests简单使用）
爬虫介绍知道python有强大的的爬虫库,但是对于我们普通小白来说,写一个完整的爬虫需要知道什么甚至了解什么都是很重要的.掌握了这些基本点,才能够熟悉爬虫的构成和获取有用的信息. 编写一个小爬虫个人 ...
Python3爬虫入门之Request库的使用
requests 什么是Requests Requests 是⽤Python语⾔编写,基于 urllib,采⽤Apache2 Licensed开源协议的HTTP库. 它⽐urllib更加⽅便,可以节约 ...
Python3爬虫入门之beautifulsoup库的使用
强调内容 BeautifulSoup 灵活又方便的网页解析库,处理高效,支持多种解析器.利用它不用编写正则表达式即可方便地实现网页信息的提取. 解析库解析器使用方法优势劣势 Python标准库 ...
Python3爬虫入门之pyquery库的使用
pyquery 初始化字符串初始化 html = ''' <div><ul><li class="item-0">first item< ...

Python3爬虫入门之Urllib库的用法