Python scrapes proxy IPs from the major free-proxy sites, validates them, and stores the good ones; the stored IPs are then re-validated continuously, and Flask exposes them as an API.

Repo: https://github.com/zjl1110/ProxyIPGet

Directory structure:

ProxyIPGet
|----app
|    |----flaskrun.py (the Flask app)
|    |----static (unused)
|    |----templates (unused)
|----checkout_script.py (continuously re-validates the IPs in the store)
|----common (shared modules)
|    |----__init__.py
|    |----email_manager.py (email module, unused)
|    |----html_manager.py (HTML module, unused)
|    |----ip_db_manager.py (store and fetch IPs)
|    |----log_manager.py (logging module, unused)
|    |----redis_manager.py (Redis module)
|    |----request_common.py (request retry module)
|    |----request_manager.py (request module)
|    |----setting.py (settings module)
|    |----url_manager.py (URL module, unused)
|----run.py (scrape, validate, and store IPs)
|----runapp.py (main entry point for the three programs)

email_manager.py:

# -*- coding: utf-8 -*-
# __author__ = "ZJL"
import smtplib
import email.mime.multipart
import email.mime.text
from common.setting import toEmail, emailName, emailPassword, smtp_connect

class EmailTloost:
    def __init__(self, toemail, totilte, totext):
        self.toemail = toemail
        self.emailname = emailName
        self.emailpassword = emailPassword
        self.smtp_connect = smtp_connect
        self.msg = email.mime.multipart.MIMEMultipart()
        self.msg['from'] = self.emailname
        self.msg['to'] = self.toemail
        self.msg['subject'] = totilte
        self.content = totext
        self.txt = email.mime.text.MIMEText(self.content)
        self.msg.attach(self.txt)

    def sendEmail(self):
        smtp = smtplib.SMTP()
        smtp.connect(self.smtp_connect, 25)
        smtp.login(self.emailname, self.emailpassword)
        smtp.sendmail(self.emailname, self.toemail, self.msg.as_string())
        smtp.quit()

def batchSendEmail(totilte, totext):
    for toemail in toEmail:
        e = EmailTloost(toemail, totilte, totext)
        e.sendEmail()

# batchSendEmail("xxx", "hahahah")

html_manager.py:

# -*- coding: utf-8 -*-
# __author__ = "ZJL"
from common.redis_manager import RedisManager as rm

# HTML page manager
class HtmlManager(object):
    def __init__(self, new_htmls="new_htmls", **key):
        self.rm = rm()
        self.new_htmls = new_htmls

    # Add one new html to the manager
    def add_new_html(self, html):
        if html is None:
            return
        # Only add the html if it is not already in the queue
        if not self.rm.isExist(self.new_htmls, html):
            self.rm.setSets(self.new_htmls, html)

    # Add several new htmls to the manager
    def add_new_htmls(self, htmls):
        if htmls is None or len(htmls) == 0:
            return
        for html in htmls:
            self.add_new_html(html)

    # Check whether the manager holds any new html
    def has_new_html(self):
        return self.rm.setsLen(self.new_htmls) != 0

    # Fetch one html from the manager
    def get_new_html(self):
        new_html = self.rm.getSetsOneDel(self.new_htmls)
        return new_html

ip_db_manager.py:

# -*- coding: utf-8 -*-
# __author__ = "ZJL"
import random

# Save a validated IP into Redis
def Ip_DBSave(rm, db_name, ip_port, ip_time):
    try:
        ip_times = rm.getKeyAllAttribute(db_name)
        if len(ip_times) < 110:
            rm.setKeyValue(db_name, ip_time, ip_port)
        else:
            # Pool is full: drop the entry with the oldest timestamp first
            ip_times.sort(reverse=True)
            rm.delAttribute(db_name, ip_times[-1])
            rm.setKeyValue(db_name, ip_time, ip_port)
    except Exception as e:
        return e, "Ip_DBSave"

# Fetch a random IP
def Ip_DBGet(rm, db_name):
    try:
        ip_times = rm.getKeyAllAttribute(db_name)
        ip_len = len(ip_times)
        ip_prot = rm.getKeyValue(db_name, ip_times[random.randint(0, ip_len - 1)])
        return ip_prot
    except Exception as e:
        return e, "Ip_DBGet"

# Fetch all IPs
def Ip_DBGetAll(rm, db_name):
    ip_prots = {}
    try:
        ip_times = rm.getKeyAllAttribute(db_name)
        for ip_time in ip_times:
            ip = rm.getKeyValue(db_name, ip_time)
            ip_prots[ip] = ip_time
        return ip_prots
    except Exception as e:
        return e, "Ip_DBGetAll"
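A minimal usage sketch of this module, assuming a local Redis; the sample ip:port value is made up for illustration, and RedisManager comes from the module shown below:

# Hypothetical round trip through ip_db_manager (sketch, not from the repo)
import time
from common.redis_manager import RedisManager
from common.ip_db_manager import Ip_DBSave, Ip_DBGet

rdb = RedisManager(db="4")
Ip_DBSave(rdb, "proxyIP", "1.2.3.4:8080", time.time())  # store one validated proxy
print(Ip_DBGet(rdb, "proxyIP"))                         # fetch a random one back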

log_manager.py:

# -*- coding: utf-8 -*-
# __author__ = "ZJL"
# Log manager
# Errors have two levels; only codes ending in 1 trigger an email alert,
# and an oversized log file also triggers one.
# The last digit carries the level so the leading digits can encode the scraping stage.
import logging, traceback, os
from common.email_manager import batchSendEmail
from common.setting import logfilename, errortitle

def objLogging(errorcode, errortext):
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                        datefmt='%a, %d %b %Y %H:%M:%S',
                        filename=logfilename,
                        filemode='a+')
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)
    if errorcode[-1] == "0":
        text = errortext + "\n" + traceback.format_exc()
        logging.debug(text)
    elif errorcode[-1] == "1":
        text = errortext + "\n" + traceback.format_exc()
        logging.warning(text)
        try:
            batchSendEmail(errortitle, text)
        except Exception:
            logging.warning(traceback.format_exc())
    else:
        text = errortext + "\n" + traceback.format_exc()
        logging.warning(text)
    filesize = os.path.getsize(logfilename)
    if filesize >= 3000000:
        try:
            batchSendEmail("Log file too large", "The log file exceeds 3 MB, please deal with it promptly")
        except Exception:
            logging.warning(traceback.format_exc())

redis_manager.py:

# -*- coding: utf-8 -*-
# __author__ = "ZJL"
import redis
from common.setting import redis_db, redis_host, redis_port

# Redis queue manager
class RedisManager(object):
    def __init__(self, host=redis_host, port=redis_port, db=redis_db):
        self.pool = redis.ConnectionPool(host=host, port=port, db=db)
        self.r = redis.StrictRedis(connection_pool=self.pool)

    # Store data of any format
    def setData(self, keyname, data):
        data = self.r.set(keyname, data)
        return data

    # Fetch data
    def getData(self, keyname, coding="utf-8"):
        data = self.r.get(keyname)
        data = data.decode(coding)
        return data

    # Fetch data, then delete it
    def getDataDel(self, keyname, coding="utf-8"):
        data = self.r.get(keyname)
        data = data.decode(coding)
        self.r.delete(keyname)
        return data

    # Store only a value; one key holds many values (list)
    def setValue(self, keyname, data):
        data = self.r.lpush(keyname, data)
        return data

    # Pop a value and remove it
    def getValue(self, keyname, coding="utf-8"):
        data = self.r.brpop(keyname, 0)[1]
        data = data.decode(coding)
        return data

    # Store field/value pairs; one key holds many fields (hash)
    def setKeyValue(self, keyname, datakey, data):
        state = self.r.hset(keyname, datakey, data)
        return state == 0

    # Fetch a field's value
    def getKeyValue(self, keyname, datakey, coding="utf-8"):
        data = self.r.hget(keyname, datakey)
        data = data.decode(coding)
        return data

    # Fetch a field's value, then delete the field
    def getKeyValueDel(self, keyname, datakey, coding="utf-8"):
        data = self.r.hget(keyname, datakey)
        data = data.decode(coding)
        self.r.hdel(keyname, datakey)
        return data

    # Delete a field by name
    def delAttribute(self, keyname, datakey):
        return self.r.hdel(keyname, datakey) == 1

    # Get all field names under a key
    def getKeyAllAttribute(self, keyname):
        return self.r.hkeys(keyname)

    # Get all key names
    def getKey(self):
        return self.r.keys()

    # Get how many items remain under one key (list length)
    def getLen(self, keyname):
        return self.r.llen(keyname)

    # Check whether a key exists
    def getExists(self, keyname):
        return self.r.exists(keyname)

    # Get the number of keys
    def getDbsize(self):
        return self.r.dbsize()

    # Delete a key
    def deleteKy(self, keyname):
        return self.r.delete(keyname) == 1

    # Flush every key in the current database
    def flushDB(self):
        return self.r.flushdb()

    # ====== sets ==========
    # Add data; a set deduplicates automatically; returns how many were added
    def setSets(self, keyname, *data):
        return self.r.sadd(keyname, *data)

    # Fetch the whole set as a list
    def getSetsList(self, keyname, coding="utf-8"):
        datas = self.r.smembers(keyname)
        datas = [d.decode(coding) for d in datas]
        return datas

    # Fetch the whole set as a list, then delete its members
    def getSetsListDel(self, keyname, coding="utf-8"):
        datas = self.r.smembers(keyname)
        datas = [d.decode(coding) for d in datas]
        [self.r.srem(keyname, d) for d in datas]
        return datas

    # Fetch one element of the set
    def getSetsOne(self, keyname, coding="utf-8"):
        data = self.r.smembers(keyname)
        data = [d.decode(coding) for d in data]
        if len(data) > 0:
            return data.pop()
        else:
            return

    # Fetch one element of the set and delete it
    def getSetsOneDel(self, keyname, coding="utf-8"):
        datas = self.r.smembers(keyname)
        datas = [d.decode(coding) for d in datas]
        if len(datas) > 0:
            data = datas.pop()
            self.r.srem(keyname, data)
            return data
        else:
            return

    # Remove elements from the set; returns how many were removed
    def setsDel(self, keyname, *data):
        return self.r.srem(keyname, *data)

    # Check whether an element is in the set
    def isExist(self, keyname, data):
        return self.r.sismember(keyname, data)

    # Set length
    def setsLen(self, keyname):
        return self.r.scard(keyname)

    # Intersection of several sets, as a list
    def setsIntersection(self, *keyname):
        data = self.r.sinter(keyname)
        data = [d.decode("utf-8") for d in data]
        return data

    # Union of several sets, as a list
    def setsAndSet(self, *keyname):
        data = self.r.sunion(keyname)
        data = [d.decode("utf-8") for d in data]
        return data
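For orientation, a small sketch of the hash helpers that the IP pool relies on; the key and field values here are invented for the example:

# Hypothetical example of the hash helpers (sketch, not from the repo)
from common.redis_manager import RedisManager

rm = RedisManager(db="4")
rm.setKeyValue("proxyIP", "1499999999.0", "1.2.3.4:8080")  # field=timestamp, value=ip:port
print(rm.getKeyAllAttribute("proxyIP"))                    # -> [b'1499999999.0', ...]
print(rm.getKeyValue("proxyIP", "1499999999.0"))           # -> "1.2.3.4:8080"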

request_common.py:

# -*- coding: utf-8 -*-
# __author__ = "ZJL"

# A retry decorator; retries once by default. The sets parameter makes it easy
# to push the failing url and error info into redis (or elsewhere).
def asyncRetry(num_retries=1, sets=None):
    # Receives the function
    def wrapper(func):
        # Receives the function's arguments; implemented as a coroutine
        async def wrapped(*args, **kwargs):
            # Keep a variable for the last exception so it is easy to see what was raised
            last_exception = None
            # Run the wrapped function in a loop
            for _ in range(num_retries):
                try:
                    # On success, return the wrapped call, which exits the loop; await here
                    return await func(*args, **kwargs)
                except Exception as e:
                    # Do not return on error, or the loop would stop; no await here
                    # This is where the failing url could be stored in redis:
                    # sets(args[0])
                    last_exception = e
            # To surface the error, re-raise it:
            # raise last_exception
        return wrapped
    return wrapper
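A quick sketch of how the decorator is meant to be applied; the fetch() coroutine below is a placeholder, not part of the repo:

# Hypothetical use of asyncRetry (sketch)
import asyncio
from common.request_common import asyncRetry

@asyncRetry(num_retries=3)
async def fetch():
    raise RuntimeError("boom")  # always fails, so all 3 attempts are consumed

asyncio.get_event_loop().run_until_complete(fetch())  # returns None after 3 silent retries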

request_manager.py:

# -*- coding: utf-8 -*-
# __author__ = "ZJL"
import aiohttp

# Request manager
class RequestManager(object):
    def __init__(self):
        self.session = aiohttp.ClientSession()

    def get(self, url, *, allow_redirects=True, **kwargs):
        # Pass the caller's allow_redirects through instead of hard-coding True
        return self.session.get(url, allow_redirects=allow_redirects, **kwargs)

    def post(self, url, *, data=None, **kwargs):
        # Pass the caller's data through instead of discarding it
        return self.session.post(url, data=data, **kwargs)
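A minimal sketch of driving RequestManager from a coroutine; the URL is a placeholder:

# Hypothetical example (sketch): fetch one page through RequestManager
import asyncio
from common.request_manager import RequestManager

async def demo():
    mgr = RequestManager()
    async with mgr.session:                           # session is closed on exit
        async with mgr.get("http://example.com") as resp:
            return resp.status

print(asyncio.get_event_loop().run_until_complete(demo()))  # -> 200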

setting.py:

# -*- coding: utf-8 -*-
# __author__ = "ZJL"
# Recipient mailboxes
toEmail = ["xxxxxxxx@qq.com"]
smtp_connect = "smtp.163.com"
# Sender mailbox
emailName = "xxxxxx@163.com"
# Mailbox password
emailPassword = "xxxxx"
# redis IP
redis_host = "127.0.0.1"
# redis port
redis_port = 6379
# redis DB
redis_db = 1
# Log file name
logfilename = "Logging.log"
# Email subject
errortitle = "Program error report"
User_Agent = ["Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
              "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"]

url_manager.py:

# -*- coding: utf-8 -*-
# __author__ = "ZJL"
from common.redis_manager import RedisManager as rm

# Url manager
class UrlManager(object):
    # new_urls is the queue of urls to crawl, old_urls the crawled ones, error_urls the failed ones
    def __init__(self, new_urls="new_urls", old_urls="old_urls", error_urls="error_urls", **key):
        # redis queues
        self.rm = rm()
        # urls to crawl
        self.new_urls = new_urls
        # crawled urls
        self.old_urls = old_urls
        # failed urls
        self.error_urls = error_urls

    # Add one new url to the manager
    def add_new_url(self, url):
        if url is None:
            return
        # Only add the url if it is neither waiting nor already crawled
        if not self.rm.isExist(self.new_urls, url) and not self.rm.isExist(self.old_urls, url):
            self.rm.setSets(self.new_urls, url)

    # Add a failed url to the manager
    def add_error_url(self, url):
        if url is None:
            return
        self.rm.setSets(self.error_urls, url)

    # Add several new urls to the manager
    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    # Check whether the manager has urls waiting to be crawled
    def has_new_url(self):
        return self.rm.setsLen(self.new_urls) != 0

    # Fetch one url to crawl from the manager
    def get_new_url(self):
        new_url = self.rm.getSetsOneDel(self.new_urls)
        self.rm.setSets(self.old_urls, new_url)
        return new_url

    # Fetch all urls waiting to be crawled
    def get_new_urls(self):
        new_urls = self.rm.getSetsListDel(self.new_urls)
        for new_url in new_urls:
            self.rm.setSets(self.old_urls, new_url)
        return new_urls

run.py:

# -*- coding: utf-8 -*-
# __author__ = "ZJL"
from common.redis_manager import RedisManager
from common.request_manager import RequestManager
from common.request_common import asyncRetry
from common.url_manager import UrlManager
from common.ip_db_manager import Ip_DBSave
from bs4 import BeautifulSoup as bs
import asyncio
import time
import random
import requests

# Shared headers
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"}
# IP store
rdb = RedisManager(db="4")
# Unused; kept only as a placeholder
rm = UrlManager()

# Retry mechanism
@asyncRetry(4, rm.add_error_url)
async def getPage(url):
    # asyncio.Semaphore() limits how many coroutines run concurrently
    sem = asyncio.Semaphore(5)
    with (await sem):
        async with RequestManager().session as session:
            async with session.get(url, headers=headers, timeout=360) as resp:
                # Pause a moment; being too blatant makes it easy to get banned
                time.sleep(random.random() * 5)
                # Assert the site responded OK
                assert resp.status == 200
                # Dispatch each url to its own parser
                if "xicidaili" in url:
                    body = await resp.text()
                    xici_grabPage(url, body)
                elif "kuaidaili" in url:
                    body = await resp.text()
                    kuaidaili_grabPage(url, body)
                elif "nianshao" in url:
                    body = await resp.text()
                    nianshao_grabPage(url, body)
                elif "66ip" in url:
                    body = await resp.text()
                    ip66_grabPage(url, body)
                elif "httpsdaili" in url:
                    body = await resp.text()
                    httpsdaili_grabPage(url, body)
                elif "swei360" in url:
                    body = await resp.text()
                    swei360_grabPage(url, body)
                elif "kxdaili" in url:
                    body = await resp.text()
                    kxdaili_grabPage(url, body)
                else:
                    return await resp.text()
            # Close the session (the async with above also closes it on exit)
            session.close()

# A separate parser for each site
def xici_grabPage(url, body):
    try:
        soup = bs(body, "lxml")
        trs = soup.find(id="ip_list").find_all("tr")
        for index, tr in enumerate(trs):
            if index > 0:
                for index, td in enumerate(tr.find_all("td")):
                    if index == 1:
                        ip = td.text
                    elif index == 2:
                        port = td.text
                        checkout_ip(ip + ":" + port, url)
    except Exception as e:
        return e, "xici_grabPage"

def kuaidaili_grabPage(url, body):
    try:
        soup = bs(body, "lxml")
        trs = soup.find(id="list").find_all("tr")
        for index, tr in enumerate(trs):
            if index > 0:
                for index, td in enumerate(tr.find_all("td")):
                    if index == 0:
                        ip = td.text
                    elif index == 1:
                        port = td.text
                        print(ip + ":" + port)
                        checkout_ip(ip + ":" + port, url)
    except Exception as e:
        return e, "kuaidaili_grabPage"

def nianshao_grabPage(url, body):
    try:
        soup = bs(body, "lxml")
        trs = soup.find(class_="table").find_all("tr")
        for index, tr in enumerate(trs):
            if index > 0:
                for index, td in enumerate(tr.find_all("td")):
                    if index == 0:
                        ip = td.text
                    elif index == 1:
                        port = td.text
                        print(ip + ":" + port)
                        checkout_ip(ip + ":" + port, url)
    except Exception as e:
        return e, "nianshao_grabPage"

def ip66_grabPage(url, body):
    try:
        soup = bs(body, "lxml")
        trs = soup.find("table", width='100%').find_all("tr")
        for index, tr in enumerate(trs):
            if index > 0:
                for index, td in enumerate(tr.find_all("td")):
                    if index == 0:
                        ip = td.text
                    elif index == 1:
                        port = td.text
                        print(ip + ":" + port)
                        checkout_ip(ip + ":" + port, url)
    except Exception as e:
        return e, "ip66_grabPage"

def httpsdaili_grabPage(url, body):
    try:
        soup = bs(body, "lxml")
        trs = soup.find("table", class_="table table-bordered table-striped").find_all("tr")
        for index, tr in enumerate(trs):
            if index > 0:
                for index, td in enumerate(tr.find_all("td")):
                    if index == 0:
                        ip = td.text
                    elif index == 1:
                        port = td.text
                        print(ip + ":" + port)
                        checkout_ip(ip + ":" + port, url)
    except Exception as e:
        return e, "httpsdaili_grabPage"

def swei360_grabPage(url, body):
    try:
        soup = bs(body, "lxml")
        trs = soup.find("div", id="list").find_all("tr")
        for index, tr in enumerate(trs):
            if index > 0:
                for index, td in enumerate(tr.find_all("td")):
                    if index == 0:
                        ip = td.text
                    elif index == 1:
                        port = td.text
                        print(ip + ":" + port)
                        checkout_ip(ip + ":" + port, url)
    except Exception as e:
        return e, "swei360_grabPage"

def kxdaili_grabPage(url, body):
    try:
        soup = bs(body, "lxml")
        trs = soup.find("table", class_="ui table segment").find_all("tr")
        for index, tr in enumerate(trs):
            if index > 0:
                for index, td in enumerate(tr.find_all("td")):
                    if index == 0:
                        ip = td.text
                    elif index == 1:
                        port = td.text
                        print(ip + ":" + port)
                        checkout_ip(ip + ":" + port, url)
    except Exception as e:
        return e, "kxdaili_grabPage"

# Validate an IP by visiting Baidu through it
def checkout_ip(ip_port, xurl=""):
    s = requests.session()
    try:
        proxies = {"http": "http://" + ip_port, }
        url = "https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=0&rsv_idx=1&tn=baidu&wd=python&rsv_pq=b3fb9f5200036a4f&rsv_t=04cdhQxxUlftjer%2FovL4Xb6B2ySx%2F%2BMhjXIPfJV24Ezf7GRFVpuhiYmxzmw&rqlang=cn&rsv_enter=1&rsv_sug3=7&rsv_sug1=1&rsv_sug7=100&rsv_sug2=0&inputT=2391&rsv_sug4=3002&rsv_sug=2"
        # url = "http://www.ip181.com/"
        r = s.get(url, headers=headers, proxies=proxies, timeout=360)
        time.sleep(random.random() * 2)
        assert r.status_code == 200
    except Exception as e:
        return e, "checkout_ip"
    else:
        print(xurl + " " + ip_port + " OK")
        ip_time = time.time()
        db_name = "proxyIP"
        Ip_DBSave(rdb, db_name, ip_port, ip_time)
    finally:
        s.close()

def main():
    # Total pages per site
    page_num = 5
    # Base page urls
    page_url_base = ['http://www.xicidaili.com/nn/',
                     'http://www.kuaidaili.com/free/inha/',
                     'http://www.nianshao.me/?page=',
                     'http://www.66ip.cn/',
                     # 'http://www.goubanjia.com/free/anoy/%E9%AB%98%E5%8C%BF/',
                     'http://www.httpsdaili.com/?page=',
                     'http://www.swei360.com/free/?stype=1&page=',
                     'http://www.kxdaili.com/dailiip/1/']
    # List of every URL to crawl
    page_urls = []
    for url in page_url_base:
        if "66ip" in url or "kxdaili" in url:
            for num in range(1, page_num + 1):
                new_url = url + str(num) + ".html"
                page_urls.append(new_url)
        elif "goubanjia" in url:
            for num in range(1, page_num + 1):
                new_url = url + "index" + str(num) + ".shtml"
                page_urls.append(new_url)
        else:
            for num in range(1, page_num + 1):
                new_url = url + str(num)
                page_urls.append(new_url)
    # Create a fresh event loop each pass, since the previous one is closed below
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    # Coroutine tasks
    tasks = [getPage(host) for host in page_urls]
    # Run the coroutines on the event loop
    loop.run_until_complete(asyncio.gather(*tasks))
    # Close it
    loop.close()

if __name__ == '__main__':
    # start = time.time()
    while True:
        main()
        time.sleep(6000 * 2)
    # print("Elapsed Time: %s" % (time.time() - start))

'''
http://www.xicidaili.com/nn/4
http://www.kuaidaili.com/free/inha/8/
http://www.data5u.com/
http://www.66ip.cn/3.html
http://www.nianshao.me/?page=2
http://www.goubanjia.com/free/anoy/%E9%AB%98%E5%8C%BF/index2.shtml
http://www.httpsdaili.com/?page=3
http://www.swei360.com/free/?stype=1&page=2
'''

checkout_script.py:

# -*- coding: utf-8 -*-
# __author__ = "ZJL"
from common.ip_db_manager import Ip_DBGetAll
from common.redis_manager import RedisManager
import requests
import time
import random

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"}

# Keep checking whether the stored IPs are still alive
def checkout_IP():
    rdb = RedisManager(db="4")
    db_name = "proxyIP"
    ip_list = Ip_DBGetAll(rdb, db_name)
    for ip, ip_time in ip_list.items():
        web_checkout_ip(rdb, db_name, ip, ip_time)

# Validate an IP against Baidu
def web_checkout_ip(rm, db_name, ip_port, ip_time):
    s = requests.session()
    try:
        proxies = {"http": "http://" + ip_port, }
        url = "https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=0&rsv_idx=1&tn=baidu&wd=python&rsv_pq=b3fb9f5200036a4f&rsv_t=04cdhQxxUlftjer%2FovL4Xb6B2ySx%2F%2BMhjXIPfJV24Ezf7GRFVpuhiYmxzmw&rqlang=cn&rsv_enter=1&rsv_sug3=7&rsv_sug1=1&rsv_sug7=100&rsv_sug2=0&inputT=2391&rsv_sug4=3002&rsv_sug=2"
        # url = "http://www.ip181.com/"
        r = s.get(url, headers=headers, proxies=proxies, timeout=360)
        time.sleep(random.random() * 2)
        assert r.status_code == 200
    except Exception as e:
        # The request failed, so drop this IP from the pool
        print("DEL", rm.delAttribute(db_name, ip_time))
        return e, "web_checkout_ip"
    else:
        print(r.status_code)
    finally:
        s.close()

def main():
    checkout_IP()

if __name__ == '__main__':
    while True:
        main()

flaskrun.py:

# -*- coding: utf-8 -*-
# __author__ = "ZJL"
from flask import Flask
from common.ip_db_manager import Ip_DBGet
from common.redis_manager import RedisManager

rdb = RedisManager(db="4")
db_name = "proxyIP"

app = Flask(__name__)

@app.route('/getip')
def get_ipport():
    ip_port = Ip_DBGet(rdb, db_name)
    return ip_port

@app.errorhandler(403)
def forbidden(error):
    return "403"

@app.errorhandler(404)
def page_not_found(error):
    return "404"

@app.errorhandler(410)
def gone(error):
    return "410"

@app.errorhandler(500)
def internal_error(error):
    return "500"

if __name__ == '__main__':
    app.run(debug=True)

runapp.py:

# -*- coding: utf-8 -*-
# __author__ = "ZJL"
import os
import multiprocessing

# print(os.system("python app/flaskrun.py"))
# print(os.system("python run.py"))

def worker(cmd):
    os.system(cmd)

if __name__ == "__main__":
    p1 = multiprocessing.Process(target=worker, args=("python3 app/flaskrun.py",))
    p2 = multiprocessing.Process(target=worker, args=("python3 run.py",))
    p3 = multiprocessing.Process(target=worker, args=("python3 checkout_script.py",))
    p1.start()
    p2.start()
    p3.start()

Result:

Enter this in the browser:

http://127.0.0.1:5000/getip

The response body is a single ip:port string drawn at random from the pool.
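A hedged sketch of consuming the endpoint from a client, assuming the Flask app runs locally on its default port; this is not part of the repo:

# Hypothetical client-side use of the API (sketch)
import requests

ip_port = requests.get("http://127.0.0.1:5000/getip", timeout=10).text
proxies = {"http": "http://" + ip_port}
r = requests.get("http://example.com", proxies=proxies, timeout=30)
print(ip_port, r.status_code)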
