python 23 selenium高级和使用代理

1. selenium使用代理

第一步：创建配置对象

# Step 1: build the Chrome configuration object that the later steps fill in.
# The class is named ChromeOptions (plural); importing "ChromeOption" fails.
from selenium.webdriver import Chrome, ChromeOptions

options = ChromeOptions()

第二步：添加配置

# Step 2: register the proxy on the options object. The host and port in the
# string are placeholders to be replaced with a real proxy address.
options.add_argument('--proxy-server=http://代理服务器:端口')

第三步：通过指定配置创建浏览器对象

# Step 3: create the browser from the configured options, open a page and dump its HTML.
b = Chrome(options = options)  # "options" is the configuration object; the browser is created from it
b.get('https://www.baidu.com/')
print(b.page_source)

2. selenium基本配置

取消自动测试

from selenium.webdriver import Chrome,ChromeOptions
options = ChromeOptions()
# Exclude Chrome's 'enable-automation' switch (the article's "disable automated-test" setting).
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# The options must be passed in here to take effect on this browser.
b = Chrome(options=options)
b.get('https://www.baidu.com/')
# Implicit wait: element lookups on this browser keep retrying for up to 5 seconds.
b.implicitly_wait(5)

取消图片加载

# Image preference set to 2: the article's "disable image loading" setting.
# NOTE(review): the browser `b` was already created from `options` earlier in
# this section, so adding the preference at this point presumably does not
# affect that running browser — it likely has to be added before
# Chrome(options=options) is called; confirm.
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
print(b.page_source)
# Close the browser window once the page source has been printed.
b.close()

3. selenium等待

3.1 隐式等待

如果没有设置隐式等待：在通过浏览器获取标签的时候，如果标签不存在会直接报错
如果设置了隐式等待：在通过浏览器获取标签的时候，如果标签不存在不会直接报错，而是在指定时间范围内，不断尝试重新获取标签，直到获取到标签或者超时为止（如果超时会报错）

一个浏览器只需设置一次隐式等待时间，它会作用于这个浏览器每次获取标签的时候

# With sleep() the wait time is fixed: if it is too short the page may not have
# finished loading and the lookup fails. An implicit wait is more flexible.
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By

options = ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# Disable image loading.
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
# Pass the options in; a bare Chrome() would ignore everything configured above.
b = Chrome(options=options)
b.get('https://www.jd.com')

# 1. Set the implicit wait.
#    It is set once per browser and applies to every later element lookup.
b.implicitly_wait(5)
print('============')

# 2. The implicit wait only takes effect when an element is looked up.
#    find_element(By.ID, ...) is used because the find_element_by_* helpers
#    were removed in Selenium 4.3.
input_tag = b.find_element(By.ID, 'key')
input_tag.send_keys('钱包')

3.2 显式等待 - 等待某个条件成立或者不成立为止

1) 创建等待对象：WebDriverWait(浏览器对象，超时时间)

2) 添加等待条件：

等待对象.until(条件) - 等到条件成立为止
等待对象.until_not(条件) - 等到条件不成立为止

条件的写法：

presence_of_element_located(标签) - 指定标签出现
text_to_be_present_in_element_value(标签，值) - 指定标签的value属性值包含指定值
text_to_be_present_in_element(标签，值) - 指定标签的标签内容包括指定值
注意：条件中提供标签的方式为一个元组：
（By.xxx, 具体值），例如 (By.ID, 'key')
【主要用于自动化测试】用于抢东西，抢茅台，可以用这个来控制

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# presence_of_element_located(locator): the condition holds once an element
# matching the locator is present in the page.
# `b` is the browser object created in the earlier snippet; wait at most 10 seconds.
wait = WebDriverWait(b, 10)

# Alternative condition (left disabled): wait until the value of the element
# with id "key" contains the given text.
# wait.until(EC.text_to_be_present_in_element_value((By.ID, 'key'), '电脑'))
# print('继续执行')

# Wait until a div with class gl-i-wrap appears inside the element whose id is
# J_goodsList, then dump the page source.
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#J_goodsList div.gl-i-wrap')))
print(b.page_source)

4.练习

#前程无忧
import requests
from time import sleep
from re import search
from json import loads
def get_ip():
    """Request one proxy address from the proxy-provider URL, retrying until one is returned.

    A response body that starts with '{' is treated as a failed attempt
    (presumably a JSON error object from the provider), as is an empty body.
    After a failure the function reports it, sleeps one second and tries again.

    Returns:
        The response text with surrounding whitespace removed.
    """
    url = 'http://d.jghttp.alicloudecs.com/getip?num=1&type=1&pro=&city=0&yys=0&port=11&time=2&ts=0&ys=0&cs=0&lb=4&sb=0&pb=4&mr=1&regions='
    while True:
        response = requests.get(url)
        # Strip so a trailing newline (if any) does not end up in the proxy URL.
        body = response.text.strip()
        # startswith() instead of text[0]: an empty body must not raise IndexError.
        if not body or body.startswith('{'):
            print('获取ip失败')
            sleep(1)
            continue
        return body
def get_net_data():
    """Download the 51job search page through a proxy and print its job listings.

    Each attempt fetches a fresh proxy address with get_ip(). If the request
    fails, or the page does not contain the embedded search-result JSON, a new
    proxy is fetched and the request is repeated. On success the JSON text is
    handed to analysis_data() and the loop ends.
    """
    # Request URL and headers.
    url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}

    # Send the request; on failure get a new proxy IP and request again.
    while True:
        ip = get_ip()
        proxies = {'http': ip, 'https': ip}
        try:
            # The timeout keeps an unresponsive proxy from blocking forever.
            response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        except requests.RequestException:
            # Unusable proxy or network error: retry with a new address.
            continue
        # search() needs the page text; passing the Response object raises TypeError.
        result = search(r'window.__SEARCH_RESULT__ = (.+?)</script>', response.text)
        if result:
            analysis_data(result.group(1))
            break
def _iter_job_records(data):
    """Yield the job-record dicts contained in a decoded search payload."""
    if not isinstance(data, dict):
        # Already a sequence of records: iterate it unchanged.
        yield from data
        return
    # Iterating a dict yields its string keys, and indexing a string with
    # 'job_name' raises TypeError. Look inside the dict's list values instead
    # and keep the entries that look like job records.
    for value in data.values():
        if isinstance(value, list):
            for item in value:
                if isinstance(item, dict) and 'job_name' in item:
                    yield item


def analysis_data(json_data: str):
    """Decode the search-result JSON text and print each job's name and salary text.

    Args:
        json_data: JSON text. Either a list of job records, or an object whose
            list-valued entries hold the job records.

    Raises:
        KeyError: if a job record has no 'providesalary_text' entry.
    """
    data = loads(json_data)
    for job in _iter_job_records(data):
        print(job['job_name'], job['providesalary_text'])


if __name__ == '__main__':
    get_net_data()

## JD.com product reviews
import requests
from selenium.webdriver import Chrome,ChromeOptions
from bs4 import BeautifulSoup
from time import sleep
from re import findall

# Browser-like User-Agent header sent with the search-page request.
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}

# Chrome settings shared by every detail-page browser: the 'enable-automation'
# switch is excluded and the image preference is set to 2 (images off).
options = ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})


# Part 1: fetch one search-result page and visit every product found on it.
def get_one_data(page: int, key_word='鼠标'):
    """Fetch one JD search-result page and process each product's detail page.

    Args:
        page: value placed in the URL's ``page`` query parameter.
        key_word: search keyword placed in the URL's ``keyword`` parameter.
    """
    # Request the search page.
    url = f'https://search.jd.com/Search?keyword={key_word}&pvid=9a743dbbb1ca49d28bc230c6cb731a72&page={page}&s=56&click=0'
    response = requests.get(url, headers=headers)

    # Extract every product's detail-page link and hand it to get_good_details().
    soup = BeautifulSoup(response.text, 'lxml')
    for link in soup.select('#J_goodsList div.gl-i-wrap .p-img>a'):
        # 'https:' is prepended to each href to form the detail-page URL.
        get_good_details('https:' + link.attrs['href'])


def get_good_details(url: str):
    """Open a product detail page in Chrome, click the review tab and print its data.

    Prints the product name, price, the count shown on the review tab and,
    when the review filter list is present, the positive / neutral / negative
    review counts, followed by a separator line.

    Args:
        url: address of the product detail page.
    """
    # Imported here so the block does not depend on a file-level import of By.
    from selenium.webdriver.common.by import By

    b = Chrome(options=options)
    try:
        b.get(url)
        b.implicitly_wait(5)
        # (Scrolling the page down in steps was tried here and left disabled.)
        # Locate and click the review tab. find_element(By.CSS_SELECTOR, ...) is
        # used because find_element_by_css_selector was removed in Selenium 4.3.
        button = b.find_element(By.CSS_SELECTOR, '#detail>div.tab-main.large>ul>li:nth-child(5)')
        button.click()
        sleep(1)  # give the review section a moment to render
        result = b.page_source
    finally:
        # Always shut the browser down, even when a lookup above fails.
        b.quit()

    # Parse the captured page.
    soup = BeautifulSoup(result, 'lxml')
    name = soup.select_one('.sku-name').text.strip()
    print(name)
    price = soup.select_one('.p-price>.price ').text.strip()
    print(price)
    comment_count = soup.select_one('#detail > div.tab-main.large > ul > li:nth-child(5)').text.strip()
    count_num = findall(r'\((.+?)\)', comment_count)[0]
    print(count_num)

    comment_info = soup.select('.J-comments-list ul.filter-list>li')
    # Items 4, 5 and 6 of the filter list are read below, so at least seven
    # entries are required; a shorter list is skipped instead of raising.
    if len(comment_info) > 6:
        positive_comment = findall(r'好评\((.+?)\)', comment_info[4].text)[0]
        neutral_comment = findall(r'中评\((.+?)\)', comment_info[5].text)[0]
        negative_comment = findall(r'差评\((.+?)\)', comment_info[6].text)[0]
        print(positive_comment, neutral_comment, negative_comment)
    print('----------------------------冷静的分割线-------------------------------')


if __name__ == '__main__':
    get_one_data(1, '鼠标')

python 23 selenium高级和使用代理相关推荐

Python：Selenium + Chrome添加认证代理
添加无认证代理,以参数形式添加 chromeOptions = webdriver.ChromeOptions() chromeOptions.add_argument('--proxy-server ...
[python爬虫] Selenium高级篇之窗口移动、弹出对话框自登录
在我们使用Selenium Python制作自动爬虫和网页自动测试的时候,通常会遇到弹出新的窗体或对话框的时候,此时你捕获的窗体已经被打断,从而干扰你的爬虫. 那怎么解决这个问题呢? 本篇文章主要记录 ...
python win10 捕获弹出窗口_[python爬虫] Selenium高级篇之窗口移动、弹出对话框自登录...
在我们使用Selenium Python制作自动爬虫和网页自动测试的时候,通常会遇到弹出新的窗体或对话框的时候,此时你捕获的窗体已经被打断,从而干扰你的爬虫. 那怎么解决这个问题呢? 本篇文章主要记录 ...
小猿圈python学习-Selenium爬虫之使用代理ip的方法
今天小猿圈给大家分享的是如何在爬取数据的时候防止IP被封,今天给大家分享两种方法,希望大家可以认真学习,再也不用担心被封IP啦. 第一种: 降低访问速度,我们可以使用time模块中的sleep,使程 ...
Python爬虫4.4 — selenium高级用法教程
Python爬虫4.4 - selenium高级用法教程综述 Headless Chrome 设置请求头设置代理IP 常用启动项参数options设置 Cookie操作 selenium设置coo ...
Python爬虫之selenium高级功能
Python爬虫之selenium高级功能原文地址表单操作元素拖拽页面切换弹窗处理表单操作表单里面会有文本框.密码框.下拉框.登陆框等. 这些涉及与页面的交互,比如输入.删除.点击等. ...
python爬虫selenium账号和密码_python3 - selenium 添加有账号密码的代理
from selenium importwebdriverimportstringimportzipfile#打包Google代理插件 def create_proxyauth_extension(p ...
Python中的高级数据结构详解
这篇文章主要介绍了Python中的高级数据结构详解,本文讲解了Collection.Array.Heapq.Bisect.Weakref.Copy以及Pprint这些数据结构的用法,需要的朋友可以参考 ...
小白学vb还是python_小白学 Python 爬虫（30）：代理基础
人生苦短,我用 Python 前文传送门: 小白学 Python 爬虫(1):开篇小白学 Python 爬虫(2):前置准备(一)基本类库的安装小白学 Python 爬虫(3):前置准备(二)Li ...

python 23 selenium高级和使用代理

python 23 selenium高级和使用代理

1. selenium使用代理

2. selenium基本配置

3. selenium等待

3.1 隐式等待

3.2 显式等待 - 等待某个条件成立或者不成立为止

1) 创建等待对象：WebDriverWait(浏览器对象，超时时间)

2) 添加等待条件：

4.练习

python 23 selenium高级和使用代理相关推荐

最新文章

热门文章