python爬虫之urllib

#coding=utf-8
#urllib操作类
import time
import urllib.request
import urllib.parse
from urllib.error import HTTPError, URLError
import sys
class myUrllib:
    """Static helpers around ``urllib.request`` for simple GET and POST calls.

    Every request carries a browser-like default header set that callers can
    extend or override per call. Response bodies are decoded as UTF-8.
    """

    @staticmethod
    def get_headers(headers):
        """Return the default request headers merged with ``headers``.

        Args:
            headers: Optional mapping of extra headers. Its entries override
                the defaults; ``None`` or an empty mapping yields the defaults
                unchanged.

        Returns:
            A freshly built dict, so the caller may modify it freely.
        """
        merged = {
            'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
            'Connection': 'keep-alive',
            'Cookie': 'uuid_tt_dd=2845574184150781887; _ga=GA1.2.1608505838; dc_tos=p308',
        }
        if headers:
            merged.update(headers)
        return merged

    @staticmethod
    def _fetch(request):
        """Send ``request`` and return its body decoded as UTF-8.

        HTTP and URL errors are reported on stdout and turned into a ``None``
        result instead of propagating.
        """
        try:
            # The context manager closes the connection even if read/decode fails.
            with urllib.request.urlopen(request) as response:
                return response.read().decode('utf-8')
        except HTTPError as e:
            # HTTPError is a subclass of URLError, so it has to be caught first.
            print(e.code, e.reason)
        except URLError as e:
            print(e.reason)
        return None

    @staticmethod
    def get(url, headers=None):
        """Fetch ``url`` with a GET request.

        Args:
            url: Target URL; any query string must already be part of it.
            headers: Optional extra headers merged over the defaults.

        Returns:
            The response body as ``str``, or ``None`` when the request failed
            with ``HTTPError`` or ``URLError``.
        """
        request = urllib.request.Request(
            str(url),
            headers=myUrllib.get_headers(headers),
            method='GET',
        )
        return myUrllib._fetch(request)

    @staticmethod
    def post(url, data=None, headers=None):
        """Send ``data`` to ``url`` as a form-encoded POST request.

        Args:
            url: Target URL.
            data: Optional mapping of form fields; it is URL-encoded and sent
                as UTF-8 bytes. ``None`` sends an empty body.
            headers: Optional extra headers merged over the defaults.

        Returns:
            The response body as ``str``, or ``None`` when the request failed
            with ``HTTPError`` or ``URLError``.
        """
        form_body = urllib.parse.urlencode(data or {}).encode('utf-8')
        request = urllib.request.Request(
            str(url),
            data=form_body,
            headers=myUrllib.get_headers(headers),
            method='POST',
        )
        return myUrllib._fetch(request)


if __name__ == "__main__":
    getInfo = myUrllib.get(
        'http://localhost:88/test/c.php?act=category',
        {'Referer': r'https://www.baidu.com/'},
    )
    print(getInfo)
    # The demo stops after the GET request; remove this exit to run the POST
    # example below as well.
    sys.exit()
    postInfo = myUrllib.post(
        'http://localhost:88/test/c.php',
        {'id': 1010},
        {'Referer': r'https://www.baidu.com/'},
    )
    print(postInfo)

d:\python\crawler>python urllib01.py
HTTP_HOST:
localhost:88

HTTP_USER_AGENT:
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)
Chrome/63.0.3239.108 Safari/537.36

HTTP_COOKIE:
uuid_tt_dd=2845574184150781887; _ga=GA1.2.1608505838; dc_tos=p308

HTTP_REFERER:
https://www.baidu.com/

REQUEST_METHOD:
GET

GET DATA:
array(1) {
["act"]=>
string(8) "category"
}

#设置代理

#coding=utf-8
import urllib.request
import random
from urllib.error import HTTPError, URLError


def proxy_handler(url, iplist, wfile):
    """Fetch ``url`` through the first usable proxy and save the page to a file.

    The proxies are tried in order. For each one the final URL, status code
    and response headers are printed. The first response with status 200 is
    decoded as UTF-8 and written to ``wfile``, after which no further proxies
    are tried. Proxies that raise ``HTTPError`` or ``URLError`` are reported on
    stdout and skipped.

    Args:
        url: Address to fetch.
        iplist: Iterable of ``"host:port"`` strings used as HTTP proxies.
        wfile: Path of the output file; it is overwritten on success.

    Returns:
        The proxy string that produced the saved page, or ``None`` when no
        proxy returned status 200.
    """
    for ip in iplist:
        print('*'*20, '\n ip:', ip)
        proxy_support = urllib.request.ProxyHandler({'http': ip})
        opener = urllib.request.build_opener(proxy_support)
        opener.addheaders = [('User-Agent', r'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36')]
        try:
            # Use the opener directly instead of install_opener(), so a dead
            # proxy is never left installed as the process-wide default.
            with opener.open(url) as response:
                code = response.getcode()
                # Kept in its own name: overwriting ``url`` would make the
                # remaining proxies fetch the redirected address instead.
                final_url = response.geturl()
                print('*'*20, '\n url:', final_url)
                print('*'*20, '\n code:', code)
                print('*'*20, '\n info:', response.info())
                if code != 200:
                    continue
                page = response.read().decode('utf-8')
        except HTTPError as e:
            # HTTPError is a subclass of URLError, so it has to be caught first.
            print(e.code, e.reason)
            continue
        except URLError as e:
            print(e.reason)
            continue

        with open(wfile, 'w', encoding='UTF-8') as fw:
            fw.write(page)
        print('*'*20, '\n write file:', wfile)
        return ip
    return None


if __name__ == "__main__":
    url = r'http://ip.chinaz.com/'
    iplist = ['182.42.244.169:808', '122.72.18.34:80', '52.44.16.168:3129']
    wfile = 'page.txt'
    proxy_handler(url, iplist, wfile)

d:\python\crawler>python proxy01.py
********************
ip: 182.42.244.169:808
[WinError 10061] 由于目标计算机积极拒绝，无法连接。
********************
ip: 122.72.18.34:80
********************
url: http://ip.chinaz.com/
********************
code: 200
********************
info: Cache-Control: private
Content-Length: 33900
Content-Type: text/html; charset=utf-8
Server: Microsoft-IIS/7.5
X-AspNet-Version: 4.0.30319
Set-Cookie: qHistory=aHR0cDovL2lwLmNoaW5hei5jb20rSVAv5pyN5Yqh5Zmo5Zyw5Z2A5p+l6K
i; domain=.chinaz.com; expires=Tue, 05-Feb-2019 15:03:42 GMT; path=/
X-Powered-By: ASP.NET
Date: Mon, 05 Feb 2018 15:03:42 GMT
X-Cache: MISS from GD-SZ-WEB-01
X-Cache-Lookup: MISS from GD-SZ-WEB-01:80
Connection: close

********************
write file: page.txt

转载于:https://www.cnblogs.com/fonyer/p/8871447.html

python爬虫之urllib相关推荐

Python爬虫之urllib模块2
Python爬虫之urllib模块2 本文来自网友投稿作者:PG-55,一个待毕业待就业的二流大学生. 看了一下上一节的反馈,有些同学认为这个没什么意义,也有的同学觉得太简单,关于Beautiful ...
python爬虫用urllib还是requests,python爬虫中urllib.request和requests有什么区别？
在学习python爬虫,想要检索request相关内容时,往往会出现urllib.request和requests这两个词,urllib.request和requests都是python爬虫的模块,其 ...
python爬虫之urllib库详解
python爬虫之urllib库详解前言一.urllib库是什么? 二.urllib库的使用 urllib.request模块 urllib.parse模块利用try-except,进行超时处理 ...
Python爬虫进阶——urllib模块使用案例【淘宝】
Python爬虫基础--HTML.CSS.JavaScript.JQuery网页前端技术 Python爬虫基础--正则表达式 Python爬虫基础--re模块的提取.匹配和替换 Python爬虫基础- ...
python补充urllib教程,Python爬虫之urllib基础用法教程
综述本系列文档用于对Python爬虫技术进行简单的教程讲解,巩固自己技术知识的同时,万一一不小心又正好对你有用那就更好了. Python 版本是3.7.4 urllib库介绍它是 Python 内 ...
Python爬虫【urllib模块】
通用爬虫爬虫的一般流程 1 初始化一批URL,将这些URL放入队列 2 从队列中取出这些URL,通过DNS解析IP,对IP对应的网站下载HTML页面,保存到本地服务器中,爬取完的URL放到已爬取队列 ...
python urllib.request 爬虫数据处理-python 爬虫之 urllib库
文章更新于:2020-03-02 注:代码来自老师授课用样例. 一.初识 urllib 库在 python2.x 版本,urllib 与urllib2 是两个库,在 python3.x 版本,二者合 ...
python爬虫之urllib,伪装,超时设置,异常处理
Urllib Urllib.request.urlopen().read().decode() 返回一个二进制的对象,对这个对象进行read()操作,可以得到一个包含网页的二进制字符串,然后用deco ...
利用python爬虫(part1)--urllib.request模块
学习笔记文章目录网络爬虫概述定义爬虫分类爬取数据步骤爬虫请求模块常用方法 urllib.request.urlopen()方法响应对象(response)方法关于请求头 urllib ...
python爬虫用urllib还是requests_Python爬虫之urllib.request库
爬虫--urllib.request库的基本使用所谓网页抓取,就是把URL地址中指定的网络资源从网络流中读取出来,保存到本地.在Python中有很多库可以用来抓取网页,我们先学习urllib.req ...

python爬虫之urllib

python爬虫之urllib相关推荐

最新文章

热门文章