麻雀虽小五脏俱全

这篇博客仅有200行代码,却涵盖了很多Python知识点

图形化界面+日志打印文件+代理IP+定时器+数据库连接+异常捕获…

import requests
from bs4 import BeautifulSoup  # 用来解析网页
import uuid
import pymysql
import datetime
from fake_useragent import UserAgent
import time  # 导入时间隔
from pymysql import  OperationalError
import easygui
from tkinter import messagebox
from tkinter import *
from requests.exceptions import ProxyError
import sys  # needed explicitly: Logger and the stdout redirection below use sys.stdout

# Hidden Tk root, so tkinter message boxes can be shown without an empty main window.
top = Tk()
top.withdraw()

# Cookie payload passed as `cookies=` on every request.
# NOTE(review): this embeds what look like session/user tickets captured from a
# browser; consider loading it from configuration instead of source code.
cookk = {'Cookie': 'deviceIdRenew=1; Hm_lvt_91cf34f62b9bedb16460ca36cf192f4c=1606462198,1606544088,1606547040,1606720814; deviceId=a3f148f-05f0-42d7-b8e5-e29d1f0f1; sessionId=S_0KI4880CAX77BO85; lmvid=d10147ae8706720112b5ed4fae4735a2; lmvid.sig=aaLOZZvjSRmxcxpc4je4nRzkSN_olI3w-q2rW9zTHs4; hnUserTicket=fa6e3cd1-65c0-485a-9557-26dc9b7682ac; hnUserId=267459990; Hm_lpvt_91cf34f62b9bedb16460ca36cf192f4c=1606722342'}

# Default request headers; getPrice() overwrites "User-Agent" with a random one.
headers = {
    'Host': 'www.cnhnb.com',
    'Referer': 'https://www.cnhnb.com/hangqing/cdlist-0-0-0-0-0-1/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
}


def getTime():
    """Return the number of seconds from now until 03:00:00 tomorrow (local time).

    Returns:
        float: seconds between ``datetime.datetime.now()`` and 03:00 of the
        next calendar day.
    """
    now_time = datetime.datetime.now()
    tomorrow = now_time.date() + datetime.timedelta(days=1)
    # Build the target directly instead of formatting and re-parsing a string.
    next_time = datetime.datetime.combine(tomorrow, datetime.time(hour=3))
    return (next_time - now_time).total_seconds()


class Logger(object):
    """File-like object that copies every write to a stream and to a log file.

    Args:
        filename: path of the log file, opened in append mode.
        stream: the stream that keeps receiving the output (defaults to the
            ``sys.stdout`` that was current when the class was defined).
    """

    def __init__(self, filename='default.log', stream=sys.stdout):
        self.terminal = stream
        # Explicit UTF-8 so scraped text never fails on the platform default codec.
        # The file intentionally stays open for the lifetime of the process.
        self.log = open(filename, 'a', encoding='utf-8')

    def write(self, message):
        """Write *message* to both the wrapped stream and the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both targets so buffered output is not lost on exit."""
        self.terminal.flush()
        self.log.flush()


# Mirror everything printed to the console into the log file.
sys.stdout = Logger(stream=sys.stdout)
# --- Collect run-time configuration through easygui dialogs -----------------
# Each dialog returns None when the user cancels; stop with a clear message
# instead of failing later on tuple unpacking / attribute access.

# Database connection settings (pre-filled with local defaults).
_db_answers = easygui.multpasswordbox(
    '惠农网价格爬虫脚本', 'Python爬取中心',
    fields=['请输入数据库地址(默认本地地址)', '请输入数据库名称(默认zhang)',
            '请输入数据库账号(默认root)', '请输入数据库密码(默认root)', ],
    values=['127.0.0.1', 'zhang', 'root', 'root'])
if _db_answers is None:
    raise SystemExit('已取消数据库配置,程序退出')
(host, db, user, passwd) = _db_answers

# Proxy list and the page range to crawl.
_crawl_answers = easygui.multenterbox(
    '惠农网价格爬虫脚本', 'Python爬取中心',
    fields=['请输入ip:端口号,并用逗号隔开(ip个数=页码数)', '请输入要爬取的起始页(开始页)',
            '请输入要爬取的起始页(结束页)'],
    values=['', '', ''])
if _crawl_answers is None:
    raise SystemExit('已取消爬取配置,程序退出')
(ipList, start, end) = _crawl_answers

# Categories to crawl (multiple selection allowed).
fenlei = easygui.multchoicebox(
    msg="请选择你要爬取的分类(支持多选)", title="分类",
    choices=("水果", "蔬菜", "禽畜肉蛋", "水产", "农副加工", "粮油米面", "种子种苗", "苗木花草"))
if not fenlei:
    raise SystemExit('未选择任何分类,程序退出')

print(f'这个集合是{ipList}')
# Split the proxy string into a list of "ip:port" entries. Full-width commas are
# accepted too, and blank entries are dropped so they never turn into the
# malformed proxy URL "http://".
bb = [entry.strip() for entry in ipList.replace(',', ',').split(',') if entry.strip()]
if not bb:
    raise SystemExit('未输入任何代理IP,程序退出')
# Proxy pool: one requests-style ``proxies`` mapping per "ip:port" entry in ``bb``.
IPlILI = [{"https": "http://" + i} for i in bb]

# (category name shown in the selection dialog, category id used in the listing URL)
_CATEGORY_IDS = (
    ('水果', 2003191),
    ('蔬菜', 2003192),
    ('禽畜肉蛋', 2003193),
    ('水产', 2003194),
    ('农副加工', 2003195),
    ('粮油米面', 2003196),
    ('种子种苗', 2003197),
    ('苗木花草', 2003198),
    ('中药材', 2003200),
)
_INSERT_SQL = ("insert into p_price_data(id,name,area,price,unit,creatime,province,up_down,type) "
               "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)")
# How many times the user is asked for a replacement proxy for a single page.
_MAX_PROXY_REPLACEMENTS = 2


def _is_from_yesterday(shop_date):
    """Return True when *shop_date* (text of a row's first span) is yesterday."""
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    try:
        # Preferred: compare the complete date, so the same day-of-month from
        # another month is not accepted.
        return datetime.datetime.strptime(shop_date, "%Y-%m-%d").date() == yesterday
    except ValueError:
        # Fallback for other "x-x-DD" layouts: day-of-month comparison only.
        parts = shop_date.split('-')
        return len(parts) >= 3 and parts[2] == str(yesterday)[8:10]


def _split_region(place_text):
    """Split a place string into ``(province, area)``.

    The province is the first two characters (three when the text contains
    '内蒙', '黑龙' or '台湾'). The area is whatever follows the first of
    '自治州', '自治县', '地区', '市' found in the text; it is an empty string when
    none of them occurs, so a row never reuses the previous row's area.
    """
    place = place_text.strip()
    width = 3 if any(key in place for key in ('内蒙', '黑龙', '台湾')) else 2
    province = place[:width]
    for separator in ('自治州', '自治县', '地区', '市'):
        if separator in place:
            return province, place.split(separator)[1]
    print('这么奇葩的地市666666666')
    return province, ''


def _fetch_rows(url, slot):
    """Download *url* through the proxy at ``IPlILI[slot]`` and return its ``<li>`` rows.

    On ``ProxyError`` the user is asked for a replacement proxy (at most
    ``_MAX_PROXY_REPLACEMENTS`` times); the replacement overwrites the dead
    entry in place so the rotation order of the pool is preserved.

    Returns:
        The list of ``<li>`` tags, or None when the page has no
        'quotation-content-list' container, the request failed, or the user
        cancelled the replacement dialog.
    """
    proxy = IPlILI[slot]
    for attempt in range(_MAX_PROXY_REPLACEMENTS + 1):
        try:
            resp = requests.get(url, proxies=proxy, headers=headers, cookies=cookk, timeout=30)
        except ProxyError:
            print(f'代理IP{proxy}无法连接')
            if attempt == _MAX_PROXY_REPLACEMENTS:
                return None
            title = '替换IP' if attempt == 0 else '第二次替换IP'
            answer = easygui.multenterbox(title, 'Python爬取中心',
                                          fields=['请输入要替换的IP(IP+端口号)'], values=[''])
            if not answer or not answer[0].strip():
                # Dialog cancelled or left empty: give up on this page.
                return None
            proxy = {"https": "http://" + answer[0].strip()}
            IPlILI[slot] = proxy
            print('替换IP成功')
            continue
        except requests.RequestException as exc:
            # Timeouts, malformed proxy URLs, connection resets, ...: skip the page.
            print(f'请求失败,跳过该页: {exc}')
            return None
        page = BeautifulSoup(resp.text, "html.parser")
        container = page.find('div', class_='quotation-content-list')
        return None if container is None else container.find_all('li')
    return None


def _store_rows(cur, rows, category):
    """Insert every row dated yesterday into ``p_price_data`` using cursor *cur*."""
    for row in rows:
        spans = row.find_all('span')
        if len(spans) < 5:
            # Not a data row (header/advert); nothing to store.
            continue
        shop_date = spans[0].text.strip()
        if not _is_from_yesterday(shop_date):
            continue
        province, area = _split_region(spans[2].text)
        price_text = spans[3].text
        # The last three characters of the price cell are treated as the unit.
        cur.execute(_INSERT_SQL, (str(uuid.uuid1()), spans[1].text.strip(), area,
                                  price_text[0:-3], price_text[-3:], shop_date,
                                  province, spans[4].text.strip(), category))


def getPrice():
    """Crawl the selected price categories and store yesterday's rows in MySQL.

    Reads the module-level settings collected at start-up (``host``, ``db``,
    ``user``, ``passwd``, ``start``, ``end``, ``fenlei``) and rotates through
    the proxy pool ``IPlILI``, one proxy per successfully fetched page.
    Commits once per page and always closes the cursor and connection.

    Returns:
        None. Shows a warning box and returns early when the database
        connection fails.
    """
    try:
        conn = pymysql.connect(host=host, user=user, passwd=passwd, db=db, charset='utf8')
    except OperationalError:
        messagebox.showwarning("提示", "输入的数据库账号或密码错误")
        return
    cur = conn.cursor()
    try:
        messagebox.showinfo("提示", "数据库已连接")
        print("------爬虫程序开始------")
        print(f'这个IP数组的长度是{len(IPlILI)}')
        if not IPlILI:
            print('没有可用的代理IP')
            return
        headers["User-Agent"] = UserAgent().random
        # Pair every selected category name with its id, so the progress
        # messages and the stored type always match the id being crawled.
        selected = [(label, cid) for label, cid in _CATEGORY_IDS if label in fenlei]
        slot = 0
        for category, category_id in selected:
            for page_no in range(int(start), int(end) + 1):
                slot %= len(IPlILI)  # wrap around the proxy pool
                print(f'使用的ip是{IPlILI[slot]},正在抓取的分类是:{category},所在页数是:{page_no}')
                url = f"https://www.cnhnb.com/hangqing/cdlist-{category_id}-0-0-0-0-{page_no}"
                rows = _fetch_rows(url, slot)
                if rows is None:
                    print("要回去了")
                    continue
                _store_rows(cur, rows, category)
                print(f"分类为:{category}的第{page_no}个页面数据抓取完毕")
                slot += 1
                conn.commit()
                time.sleep(2)  # pause between pages to avoid hammering the server
            time.sleep(2)
    finally:
        cur.close()
        conn.close()

# NOTE(review): the original carried a trailing note about also crawling supply,
# public-opinion and specialty listings; no such code is present here.
def _run_price_crawl():
    """Run one price crawl, framed by start/end banners on stdout."""
    print("----------------开始爬取价格----------------")
    # The site updates slowly: each run picks up the previous day's prices.
    getPrice()
    print("----------------价格爬取结束----------------")


if __name__ == '__main__':
    _run_price_crawl()

代码写得很简单,注释也很详细,每一步都有说明

只看代码有点枯燥,运行一下,看下效果图o(￣︶￣)o

就是这么人性化,解决IP问题

嗯,简单暴力的爬取反爬网站的数据

Python可配置爬虫_自定义IP+数据库+日志+分类+分页(代理IP破解反爬虫)相关推荐

python爬去新浪微博_Python爬虫爬取新浪微博内容示例【基于代理IP】
Python爬虫爬取新浪微博内容示例[基于代理IP] 发布时间:2020-09-07 10:08:14 来源:脚本之家阅读:120 本文实例讲述了Python爬虫爬取新浪微博内容.分享给大家供大家参 ...
python爬取微博文本_Python爬虫爬取新浪微博内容示例【基于代理IP】
本文实例讲述了Python爬虫爬取新浪微博内容.分享给大家供大家参考,具体如下: 用Python编写爬虫,爬取微博大V的微博内容,本文以女神的微博为例(爬新浪m站:https://m.weibo.cn ...
Python3网络爬虫(十一)：爬虫黑科技之让你的爬虫程序更像人类用户的行为(代理IP池等)
转载请注明作者和出处:http://blog.csdn.net/c406495762 运行平台: Windows Python版本: Python3.x IDE: Sublime text3 前言黑 ...
【Python3.6爬虫学习记录】（十一）使用代理IP及用多线程测试IP可用性--刷访问量
前言:本来准备写一个刷空间留言的脚本,然而kb TX,无限循环空间验证码.上午还傻x的学验证码识别,后来才发现根本发不了留言,即使填的是对的,仍然继续弹出.无奈,睡了一觉,开始搞新玩意–代理IP!其实 ...
Python有道翻译爬虫，破解反爬虫机制，解决{errorCode:50}错误
一.引言参考网址:https://tendcode.com/article/youdao-spider/ 当前成功时间:2019-6-28 转自个人开源博客:https://my.oschina.n ...
第11篇- 抓取免费代理IP并搭建自己的代理IP池
提前声明:该专栏涉及的所有案例均为学习使用,如有侵权,请联系本人删帖! 文章目录一.前言二.了解代理IP 三.抓取代理ip网站四.完整代码一.前言使用代理IP原因:对于我们数据抓取来说,由于 ...
代理ip最新识别方法及代理ip有效性检测方法
代理ip最新识别方法以及代理ip有效性检测方法一.代理ip常见的一些功能 1.突破自身ip访问限制,现在有许多的网站都对ip地址访问进行了限制,这时则可以通过代理ip来突破限制,让自己进入网站. 2 ...
国内整C多IP服务器怎么搭建代理IP，又怎么区分代理IP呢
ip是上网需要唯一的身份地址代表,而代理ip就是我们上网过程中的一个中间待运行的平台,是由你的电脑先访问这个代理ip,之后再由这个代理ip访问你点开的页面,所要的效果媒介,所以在这个页面的访问记录里留 ...
python文件路径过滤器_自定义过滤器及标签
代码布局(自定义的代码,放在哪里) 1,某个app特有的 --app目标下,templateags 文件夹 --再到ttemplateags 文件夹下创建python模块(py文件) 2,定义复用 - ...
python爬虫——Scrapy入门（爬取西刺代理ip和port）
一.创建项目创建好的项目二.创建爬虫 1.创建一定要先进入刚才创建的爬虫项目文件中再创建爬虫对比未创建爬虫,发现多了一个xici.py文件 2.查看网站君子协议(robots): 3.解释爬虫 ...

Python可配置爬虫_自定义IP+数据库+日志+分类+分页(代理IP破解反爬虫)

麻雀虽小五脏俱全

这篇博客仅有200行代码,却涵盖了很多Python知识点

图形化界面+日志打印文件+代理IP+定时器+数据库连接+异常捕获…

代码写得很简单,注释也很详细,每一步都有说明

只看代码有点枯燥,运行一下,看下效果图o(￣︶￣)o

就是这么人性化,解决IP问题

嗯,简单暴力的爬取反爬网站的数据

Python可配置爬虫_自定义IP+数据库+日志+分类+分页(代理IP破解反爬虫)相关推荐

最新文章

热门文章