python_爬虫

import pymysql
from http import cookiejar
from urllib import request,parse
from urllib.error import HTTPError, URLError


class Session_cookie(object):
    """Crawler session that keeps cookies between requests.

    An in-memory ``CookieJar`` is attached to a private opener, so every
    request issued through :meth:`main` sends back the cookies received
    from earlier responses of the same instance.
    """

    def __init__(self):
        # Build a private opener wired to a fresh in-memory cookie jar.
        cookie = cookiejar.CookieJar()
        handler = request.HTTPCookieProcessor(cookie)
        self.opener = request.build_opener(handler)
        # request.install_opener(self.opener) would make this opener the
        # process-wide default; deliberately not done, because that is
        # risky in larger projects.

    def main(self, url, headers=None, data=None):
        """Fetch ``url`` through this session's cookie-aware opener.

        Delegates to the module-level ``main`` function, passing
        ``self.opener`` as the opener, and returns whatever it returns.
        """
        return main(url, headers, data, self.opener)


# Crawler connection
# ------------------------------------------------------------
def main(url, headers=None, data=None, opener=None):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address to request.
        headers: Optional dict of request headers. When empty,
            ``get_response`` picks a User-Agent via ``get_random``.
        data: Optional mapping of form fields. When truthy it is sent as
            the request body (urllib then issues a POST).
        opener: Optional opener object (anything with ``open(req)``), e.g.
            the cookie-aware opener held by ``Session_cookie``.

    Returns:
        The decoded body, or ``''`` when the request failed.
    """
    # Runtime trace messages, kept exactly as the original printed them.
    print('有data数据' if data else '无data数据')
    return get_response(url, data=data or None, headers=headers, opener=opener)


def _read_text(response, default_charset='utf-8'):
    """Read ``response`` fully and decode it with its declared charset."""
    raw = response.read()
    # Not every response-like object exposes email-style headers, so the
    # charset lookup is best-effort and falls back to UTF-8.
    headers = getattr(response, 'headers', None)
    charset_getter = getattr(headers, 'get_content_charset', None)
    charset = (charset_getter() if charset_getter else None) or default_charset
    try:
        return raw.decode(charset, errors='replace')
    except LookupError:
        # The server declared a codec name Python does not know.
        return raw.decode(default_charset, errors='replace')


def get_response(url, data=None, headers=None, opener=None):
    """Send one request and return the response body as text.

    Args:
        url: Address to request.
        data: Optional mapping of form fields; url-encoded to UTF-8 bytes
            and sent as the request body when truthy.
        headers: Optional dict of request headers. When empty, a
            User-Agent is taken from ``get_random('p_useragent')``
            (second column of the first returned row).
        opener: Optional opener used instead of ``request.urlopen``.

    Returns:
        The decoded body. On ``URLError`` (which includes ``HTTPError``)
        the error is printed and ``''`` is returned, so callers always
        receive a string.
    """
    if not headers:
        headers = {'User-Agent': get_random('p_useragent')[0][1]}

    html = ''  # default result, so a failed request still returns a value
    try:
        body = parse.urlencode(data).encode('utf-8') if data else None
        req = request.Request(url, data=body, headers=headers)
        if opener:
            print('使用opener')
            response = opener.open(req)
        else:
            response = request.urlopen(req)
        try:
            # Decode with the charset the response declares (UTF-8 when it
            # declares none) instead of assuming UTF-8 for every page.
            html = _read_text(response)
        finally:
            # Always release the underlying connection.
            response.close()
    except URLError as e:
        # HTTPError is a subclass of URLError, so this one handler covers
        # both; the printed message is a summary, not a debugging aid.
        print(e)
    return html


# Database handling
# ------------------------------------------------------------
def _checked_identifier(name):
    """Return ``name`` if it is a plain (optionally dotted) SQL identifier.

    Table and column names cannot be sent as query parameters, so they are
    validated before being formatted into SQL text.

    Raises:
        ValueError: if ``name`` is not a string of word characters / ``$``,
            optionally qualified with dots (e.g. ``db.table``).
    """
    import re  # local import: keeps this block free of new file-level imports

    if not isinstance(name, str) or not re.fullmatch(r'[\w$]+(\.[\w$]+)*', name):
        raise ValueError('invalid SQL identifier: {!r}'.format(name))
    return name


def mysql_connect(table, sql, data=None):
    """Run one statement against the crawler database and return its cursor.

    Args:
        table: Unused; kept so existing callers keep working.
        sql: Statement text, with ``%s`` placeholders when ``data`` is given.
        data: Optional sequence of parameter values bound to ``sql``.

    Returns:
        The cursor the statement was executed on. Its connection is left
        open on success because callers read results from the cursor.
    """
    # NOTE(review): connection settings are hard-coded here; consider moving
    # them to configuration. Keyword arguments are used because PyMySQL 1.0+
    # no longer accepts positional connect() arguments.
    conn = pymysql.connect(
        host='127.0.0.1',
        user='root',
        password='123456',
        database='PaChong',
        charset='utf8',
    )
    try:
        cursor = conn.cursor()
        if not data:
            cursor.execute(sql)  # statement without bound parameters
        else:
            cursor.execute(sql, data)  # parameterised statement
        # The collapsed original does not show whether commit was limited
        # to the parameterised branch; committing after a read is harmless.
        conn.commit()
    except Exception:
        # Do not leak the connection when the statement fails.
        conn.close()
        raise
    return cursor


def get_random(table=None):
    """Fetch one pseudo-random row of ``table`` (e.g. a proxy or user agent).

    Args:
        table: Name of a table that has a numeric ``id`` column.

    Returns:
        The result of ``cursor.fetchall()`` for a ``LIMIT 1`` query.

    Raises:
        ValueError: if ``table`` is missing or not a plain identifier.
    """
    table = _checked_identifier(table)
    sql = (
        'SELECT * FROM {0} WHERE id >= ((SELECT MAX(Id) FROM {0})'
        '-(SELECT MIN(Id) FROM {0})) * RAND() + (SELECT MIN(Id) FROM {0})'
        '  LIMIT 1'
    ).format(table)
    cursor = mysql_connect(table, sql=sql)
    return cursor.fetchall()


def proxies_save_mysql(data):
    """Insert one proxy record into the ``p_proxies`` table.

    Args:
        data: Mapping of column name to value. Column names are validated;
            values are bound as query parameters.

    Raises:
        ValueError: if a key of ``data`` is not a plain identifier.
    """
    table = 'p_proxies'
    columns = ','.join(_checked_identifier(key) for key in data)
    placeholders = ','.join(['%s'] * len(data))
    sql = 'insert into {}({}) values({})'.format(table, columns, placeholders)
    mysql_connect(table, sql=sql, data=tuple(data.values()))
# ------------------------------------------------------------
if __name__ == '__main__':
    # Usage examples, left disabled so running the module does nothing.
    #
    # POST request with form data:
    # url = 'http://fanyi.baidu.com/sug'
    # data = {'kw':'中国'}
    # import json
    # res = json.loads(main(url,data=data))
    # print(res)
    #
    # Plain GET request:
    # url = 'http://www.baidu.com'
    # res = main(url)
    # print(res)
    pass

正常情况下，每写一个爬虫，都需要执行“分析 -> 请求 -> 响应 -> 下载（存储）”的流程。但其中的诸多功能其实都是在重复造轮子，比如发送请求、设置请求头、对 POST 请求的 data 值进行转码等。可以将这些功能写到一个 py 文件里，这样再写其他爬虫文件时直接调用，就可以略过输入请求头、POST 传参转码等诸多操作。

转载于:https://www.cnblogs.com/hejianlong/p/9470438.html

python_爬虫_模块相关推荐

Python_爬虫_网页图片下载_その日の紋
Python_爬虫_网页图片下载_その日の紋项目效果项目需求项目分析 URL分析页面分析项目实施项目源码项目效果项目需求目标页面:https://www.hanakomon.jp/c ...
Python_爬虫_案例汇总：
1.豆瓣采集 1 #coding:utf-8 2 #采集豆瓣书信息和图片,写进数据库 3 4 from urllib import request 5 # from bs4 import Beauti ...
Python_爬虫_猫眼电影网电影预告片批量下载
非常简单的一个基础爬虫代码,可以根据不同的url自动下载同一页中的所有预告片 import requests from lxml import etree import re# 1.确定url地址 u ...
python_爬虫_七麦网
本文用于学习交流使用,如有侵权,联系删除 1 爬取需求 1.1 七麦网简介七麦网(https://www.qimai.cn/),该平台支持提供iOS.Android应用市场.微信.小程序等数据查询, ...
python爬虫模块_python之爬虫_模块
asdf The Dormouse's story总共 f Once upon a time there were three little sisters; and their names were ...
python_爬虫_豆瓣TOP250_url
本文仅供学习使用,如有侵权,联系删除. 获得豆瓣top 250书单的url import lxml import requests import re import csv from requests ...
Python_爬虫_中文乱码
今天在用Python2.7爬取百度百科的一个网页时发现输出时中文为乱码. 尝试一: 查看网页页面信息,发现其中文字编码为"GBK",遂准备对其进行解码. content = url ...
python_爬虫_豆瓣TOP250_页面内容
本文仅供学习使用,如有侵权,联系删除豆瓣TOP250书籍页面内容如下,此次将爬取图片中的内容 from bs4 import BeautifulSoup import lxml import req ...
Python_机器学习_常用科学计算库_第6章_ Seaborn+综合案例
Python_机器学习_常用科学计算库_第6章_ Seaborn+综合案例文章目录 Python_机器学习_常用科学计算库_第6章_ Seaborn+综合案例 Seaborn 学习目标 6.1 Se ...
Python_机器学习_算法_第4章_4.决策树算法
Python_机器学习_算法_第4章_4.决策树算法文章目录 Python_机器学习_算法_第4章_4.决策树算法决策树算法学习目标 4.1 决策树算法简介学习目标小结 4.2 决策树分类原 ...

python_爬虫_模块

python_爬虫_模块相关推荐

最新文章

热门文章