python selenium 爬虫模拟浏览网站内容

本文介绍一段使用 python selenium 编写的爬虫代码，用于模拟用户浏览某个网站的内容。废话少说，进入正文。

1、爬虫界面如下：

界面使用说明：

第一步：填写要访问的网站地址

第二步：填写每天访问该网址的次数

第三步：点击“开始刷量”按钮开始访问网站内容

2、爬虫源代码介绍：

1）点击“开始刷量”按钮调用runjob方法，runjob具体代码如下：

# 访问网站操作代码
def runjob():
    """Run one visit pass against the site configured in the UI.

    Invoked when the "开始刷量" button is clicked. Reads the target URL and
    the current visit counter from the module-level ``myframe`` object,
    opens a browser, browses the site via ``viewSite`` and always closes
    the browser afterwards.

    Any exception raised while creating the browser or browsing is logged
    with its traceback, a screenshot is saved when a browser exists, and
    ``myframe.stop_refresh_page_thread`` is set so no further passes run.
    """
    # a. A manual stop was requested: bump the counter, refresh the UI
    #    and end this thread without opening a browser.
    if myframe.stop_refresh_page_thread:
        addrefreshnum()
        myframe.refresh_run_stop_button()
        return

    # b. Normal execution.
    refreshnum = myframe.refreshnum
    siteurl = myframe.siteurlinput.GetValue().strip()
    my_logger_info(logger, "==开始网站%s第%d次刷量<直接访问>==" % (siteurl, refreshnum + 1))

    # Stays None if createWebDriver() itself fails, so the error and
    # cleanup paths below never touch an unbound name.
    driver = None
    try:
        # Create the browser, maximise it and browse the site.
        driver = createWebDriver()
        driver.maximize_window()
        viewSite(driver, siteurl)
    except Exception:
        my_logger_info(logger, traceback.format_exc())
        # Set the stop flag first so a failing screenshot cannot skip it.
        myframe.stop_refresh_page_thread = True
        if driver is not None:
            driver.save_screenshot(".\\refreshpage_directvisit_error.png")
    finally:
        # Close the browser (only if one was actually created).
        if driver is not None:
            driver.quit()
        # Bump the counter, refresh the UI and log completion.
        addrefreshnum()
        myframe.refresh_run_stop_button()
        my_logger_info(logger, "==完成网站%s第%d次刷量<直接访问>==" % (siteurl, refreshnum + 1))

2）runjob=>createWebDriver()代码如下

#创建浏览器驱动
def createWebDriver():
    """Create and return the Chrome WebDriver used for browsing.

    The browser is started headless, with image loading disabled and the
    sandbox turned off. Before returning, a script is registered that
    makes ``navigator.webdriver`` read as ``undefined`` on every new
    document, so that Selenium visits are not recognised as automated
    (and therefore still count as traffic).
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_flags = (
        'lang=zh_CN.UTF-8',                    # page language / encoding
        'blink-settings=imagesEnabled=false',  # do not load images
        '--no-sandbox',                        # disable the sandbox
        'headless',                            # no visible window
    )
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    browser = webdriver.Chrome(options=chrome_options, keep_alive=True)

    # Hide the automation marker from pages loaded later.
    hide_webdriver_js = """Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"""
    browser.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {"source": hide_webdriver_js},
    )
    return browser

3）runjob=>viewSite(driver,siteurl)代码如下

# 模拟浏览网站
def viewSite(driver, url):
    """Simulate a visitor browsing the site at ``url``.

    Opens ``url``, idles for a few seconds, then walks the product columns
    with ``viewProductOfLanmu`` followed by the article columns with
    ``viewArticleOfLanmu``. ``myframe.stop_refresh_page_thread`` is checked
    before each column so a manual stop takes effect between columns.

    Args:
        driver: WebDriver instance used for all navigation.
        url: Address of the site's entry page.
    """
    my_logger_info(logger, "<<开始访问网站:%s" % url)
    driver.get(url)
    runIdleSomeTime(random.randint(3, 5))

    # Link texts of the columns to visit, in visiting order.
    product_columns = ("桥架配件", "桥架规格")
    article_columns = ("桥架国标", "桥架价格", "桥架安装")

    for lanmu in product_columns:
        # Honour a manual stop request.
        if myframe.stop_refresh_page_thread:
            break
        viewProductOfLanmu(driver, lanmu)

    for lanmu in article_columns:
        # Honour a manual stop request.
        if myframe.stop_refresh_page_thread:
            break
        viewArticleOfLanmu(driver, lanmu)

    if myframe.stop_refresh_page_thread:
        my_logger_info(logger, "已经停止刷量")
    my_logger_info(logger, ">>完成访问网站:%s" % url)

4）runjob=>viewSite(driver,siteurl)=>viewProductOfLanmu(driver,lanmu)代码如下

# 查看栏目产品
def viewProductOfLanmu(driver, lanmu):
    """Browse every product listed under one site column, page by page.

    Clicks the link whose text is ``lanmu``, then for each listing page
    opens every product link (the first ``<a>`` inside each
    ``div.mask`` under ``div.list``) in a new window, idles on it, closes
    it and returns to the listing. Continues while the ``div.pageBox``
    pager contains a "下一页" link. ``myframe.stop_refresh_page_thread`` is
    checked after every wait so a manual stop ends the walk promptly.

    Args:
        driver: WebDriver instance positioned on a page that shows the
            column link.
        lanmu: Link text of the column to open.
    """
    # "link text" is the value of By.LINK_TEXT; find_element(by, value)
    # works on Selenium 3 and 4, whereas find_element_by_link_text was
    # removed in Selenium 4.3.
    link_d = driver.find_element("link text", lanmu)
    # Click through JavaScript rather than link.click() so an overlaying
    # chat/consulting popup cannot intercept the click.
    driver.execute_script("arguments[0].click();", link_d)
    # Wait for the first page of the column to load.
    runIdleSomeTime(random.randint(3, 5))
    if myframe.stop_refresh_page_thread:
        return

    # Remember the listing window so we always come back to it.
    parent_window = driver.current_window_handle
    soup = BeautifulSoup(driver.page_source, "html.parser")

    while True:
        # Visit every product on the current listing page. A page without
        # the expected container simply yields no products.
        list_div = soup.find("div", class_="list")
        product_divs = list_div.find_all("div", class_="mask") if list_div is not None else []
        for product_div in product_divs:
            link = product_div.find("a")
            href = link.get("href") if link is not None else None
            if not href:
                continue
            my_logger_info(logger, "访问页面：%s" % href)
            # Open the product in a new window. The URL is passed as a
            # script argument so quotes in it cannot break the script.
            try:
                driver.execute_script("window.open(arguments[0], '_blank');", href)
            except Exception:
                my_logger_info(logger, traceback.format_exc())
                continue
            # Stay on the opened product page for a while.
            runIdleSomeTime(random.randint(5, 7))
            if myframe.stop_refresh_page_thread:
                break
            # Close the newest window other than the listing, if one was
            # really opened, then go back to the listing window.
            opened = [h for h in driver.window_handles if h != parent_window]
            if opened:
                driver.switch_to.window(opened[-1])
                driver.close()
            driver.switch_to.window(parent_window)
            # Linger on the listing page before the next product.
            runIdleSomeTime(random.randint(1, 3))
            if myframe.stop_refresh_page_thread:
                break

        if myframe.stop_refresh_page_thread:
            break

        # Is there a next page?
        pagediv_s = soup.find("div", class_="pageBox")
        if pagediv_s is None or pagediv_s.find("a", text="下一页") is None:
            break

        # Try to turn the page; stop paging if that fails.
        try:
            nextpagelink_d = driver.find_element("link text", "下一页")
            driver.execute_script("arguments[0].click();", nextpagelink_d)
        except Exception:
            my_logger_info(logger, traceback.format_exc())
            break
        # Wait for the new listing page to load, then re-parse it.
        runIdleSomeTime(random.randint(3, 5))
        soup = BeautifulSoup(driver.page_source, "html.parser")

5）runjob=>viewSite(driver,siteurl)=>viewArticleOfLanmu(driver,lanmu)代码如下

# 查看栏目文章
def viewArticleOfLanmu(driver, lanmu):
    """Browse every article listed under one site column, page by page.

    Clicks the link whose text is ``lanmu``, then for each listing page
    opens every ``a.look`` link inside ``div.newsList`` in a new window,
    idles on it, closes it and returns to the listing. Continues while the
    ``div.pageBox`` pager contains a "下一页" link.
    ``myframe.stop_refresh_page_thread`` is checked after every wait so a
    manual stop ends the walk promptly.

    Args:
        driver: WebDriver instance positioned on a page that shows the
            column link.
        lanmu: Link text of the column to open.
    """
    # "link text" is the value of By.LINK_TEXT; find_element(by, value)
    # works on Selenium 3 and 4, whereas find_element_by_link_text was
    # removed in Selenium 4.3.
    link_d = driver.find_element("link text", lanmu)
    # Click through JavaScript rather than link.click() so an overlaying
    # chat/consulting popup cannot intercept the click.
    driver.execute_script("arguments[0].click();", link_d)
    # Wait for the first page of the column to load.
    runIdleSomeTime(random.randint(3, 5))
    if myframe.stop_refresh_page_thread:
        return

    # Remember the listing window so we always come back to it.
    parent_window = driver.current_window_handle
    soup = BeautifulSoup(driver.page_source, "html.parser")

    while True:
        # Visit every article on the current listing page. A page without
        # the expected container simply yields no articles.
        newsdiv_s = soup.find("div", class_="newsList")
        article_links = newsdiv_s.find_all("a", class_="look") if newsdiv_s is not None else []
        for link in article_links:
            href = link.get("href")
            if not href:
                continue
            my_logger_info(logger, "访问页面：%s" % href)
            # Open the article in a new window. The URL is passed as a
            # script argument so quotes in it cannot break the script.
            # A link that fails to open is logged and skipped, matching
            # the handling in viewProductOfLanmu.
            try:
                driver.execute_script("window.open(arguments[0], '_blank');", href)
            except Exception:
                my_logger_info(logger, traceback.format_exc())
                continue
            # Stay on the opened article for a while.
            runIdleSomeTime(random.randint(5, 7))
            if myframe.stop_refresh_page_thread:
                break
            # Close the newest window other than the listing, if one was
            # really opened, then go back to the listing window.
            opened = [h for h in driver.window_handles if h != parent_window]
            if opened:
                driver.switch_to.window(opened[-1])
                driver.close()
            driver.switch_to.window(parent_window)
            # Linger on the listing page before the next article.
            runIdleSomeTime(random.randint(5, 7))
            if myframe.stop_refresh_page_thread:
                break

        if myframe.stop_refresh_page_thread:
            break

        # Is there a next page?
        pagediv_s = soup.find("div", class_="pageBox")
        if pagediv_s is None or pagediv_s.find("a", text="下一页") is None:
            break

        # Try to turn the page; stop paging if that fails.
        try:
            nextpagelink_d = driver.find_element("link text", "下一页")
            driver.execute_script("arguments[0].click();", nextpagelink_d)
        except Exception:
            my_logger_info(logger, traceback.format_exc())
            break
        # Wait for the new listing page to load, then re-parse it.
        runIdleSomeTime(random.randint(3, 5))
        soup = BeautifulSoup(driver.page_source, "html.parser")

3、爬虫访问示例网站：

首页链接：

电缆桥架生产企业

栏目链接：

镀锌桥架

防火桥架

桥架标准

文章链接：

梯式镀锌桥架最新价格新鲜出炉

2022年槽式热镀锌桥架市场价格介绍，一起来看看

欢迎有兴趣的同学留言交流。

python selenium 爬虫模拟浏览网站内容相关推荐

python selenium爬虫实例_python使用selenium爬虫知乎的方法示例
说起爬虫,一般想到的情况是在 python 中通过 requests 库获取网页内容,然后通过 BeautifulSoup 筛选文档中的标签和内容.但是这样有个问题,就是容易被反爬机制所拦住. ...
python selenium爬虫
python selenium爬虫 1 前言博主是一名经管研究生,以自身经历为例.如今大学生写论文大部分都需要数据支撑来论证研究结果,数据除了从数据库直接下载外,有些是需要通过网络爬虫来获得.网络爬 ...
Python Selenium爬虫实战应用
本节讲解 Python Selenium 爬虫实战案例,通过对实战案例的讲解让您进一步认识 Selenium 框架. 实战案例目标:抓取京东商城(https://www.jd.com/)商品名称.商品 ...
Python - Selenium Chrome 模拟手机
Python - Selenium Chrome 模拟手机 Max.Bai 2017-04 Chrome浏览器支持移动端调试,当然ChromeDriver也支持移动端测试了. 使用python调用We ...
【Python】爬虫获取购物网站的商品信息、商品图片
[Python]爬虫.爬取购物网站数据.Selenium安装教程及环境搭建前言一.Selenium安装二.Chrome安装驱动: 三. 四.成果展示: 五.结束语: [Python]爬虫.爬取购 ...
python selenium爬虫_详解基于python +Selenium的爬虫
详解基于python +Selenium的爬虫一.背景 1. Selenium Selenium 是一个用于web应用程序自动化测试的工具,直接运行在浏览器当中,支持chrome.firefox等主 ...
python selenium爬虫代码示例_python3通过selenium爬虫获取到dj商品的实例代码
先给大家介绍下python3 selenium使用其实这个就相当于模拟人的点击事件来连续的访问浏览器.如果你玩过王者荣耀的话在2016年一月份的版本里面就有一个bug. 安卓手机下载一个按键精灵就可 ...
python+selenium爬虫自动化批量下载文件
一.项目需求在一个业务网站有可以一个个打开有相关内容的文本,需要逐个保存为TXT,数据量是以千为单位,人工操作会麻木到崩溃. 二.解决方案目前的基础办法就是使用python+selenium自动化 ...
Python+selenium+firefox模拟登录微博并爬取数据（1）
1:环境python3.5,最新 firefox,selenium-3.14.0. 本来准备用无界面的,但是感觉效果不好看出来所以先用有界面的浏览器来做.分几次来慢慢写.这节先配置好环境. 2:安装: ...

python selenium 爬虫模拟浏览网站内容

python selenium 爬虫模拟浏览网站内容相关推荐

最新文章

热门文章

python selenium 爬虫 模拟浏览网站内容

python selenium 爬虫 模拟浏览网站内容相关推荐

最新文章

热门文章

python selenium 爬虫模拟浏览网站内容

python selenium 爬虫模拟浏览网站内容相关推荐