selenium爬取阿里巴巴国际站

写这个爬虫是为了减轻采购同事的工作量：用 Selenium 自动抓取阿里巴巴国际站的商品信息。

所有的代码我都放到这里了 https://github.com/jevy146/selenium_Alibaba
第一步：获取商品信息，运行 D01_spider_alibaba_com.py。

# -*- coding: utf-8 -*-
# @Time    : 2020/6/17 14:09
# @Author  : 结尾！！
# @FileName: D01_spider_alibaba_com.py
# @Software: PyCharm
import csv
import time

from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Step 1: open the site (alibaba.com) and log in.
class Chrome_drive():
    """Chrome WebDriver wrapper used to crawl alibaba.com search-result pages.

    Attributes:
        browser: The ``webdriver.Chrome`` instance created in ``__init__``.
        wait: A ``WebDriverWait`` bound to ``browser`` with a 12-second timeout.
    """

    def __init__(self, headless=True):
        """Start Chrome with the automation switches removed and images disabled.

        Args:
            headless: When true (the default, which matches the previously
                hard-coded behaviour) Chrome is started with ``--headless``.
                Pass ``False`` when a person needs to see the window.
        """
        option = ChromeOptions()
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        option.add_experimental_option('useAutomationExtension', False)
        # Content-settings preference that turns image loading off.
        no_image = {"profile.managed_default_content_settings.images": 2}
        option.add_experimental_option("prefs", no_image)
        if headless:
            option.add_argument('--headless')
        self.browser = webdriver.Chrome(options=option)
        # Make navigator.webdriver read as undefined on every new document.
        self.browser.execute_cdp_cmd(
            'Page.addScriptToEvaluateOnNewDocument',
            {'source': 'Object.defineProperty(navigator,"webdriver",{get:()=>undefined})'},
        )
        self.browser.set_window_size(1200, 768)
        self.wait = WebDriverWait(self.browser, 12)

    def get_login(self):
        """Open the alibaba.com home page, wait two seconds, then refresh it.

        NOTE(review): the original comments describe a manual login at this
        point, but the code only sleeps briefly and the browser is headless by
        default -- construct with ``headless=False`` if a human must interact.
        """
        url = 'https://www.alibaba.com/'
        self.browser.get(url)
        time.sleep(2)
        self.browser.refresh()

    def index_page(self, page, wd):
        """Open one search-result page in a new tab, scrape it, then tidy up tabs.

        Args:
            page: Page number placed in the ``page`` query parameter.
            wd: Search keyword; spaces are replaced with underscores in the URL.

        Raises:
            Whatever ``self.wait.until`` raises when the pagination element
            does not appear within the wait timeout.
        """
        print('正在爬取第', page, '页')
        words = wd.replace(' ', '_')
        url = f'https://www.alibaba.com/products/{words}.html?IndexArea=product_en&page={page}'
        print(url)
        # Hand the URL over as a script argument rather than splicing it into
        # the JavaScript source, so quotes in the keyword cannot break the call.
        self.browser.execute_script('window.open(arguments[0])', url)
        # Focus the tab that was just opened (always the last handle).
        self.browser.switch_to.window(self.browser.window_handles[-1])
        self.buffer()  # scroll down the page before reading it
        time.sleep(3)
        # Block until the pagination link is present in the DOM.
        self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#root > div > div.seb-pagination > div > div.seb-pagination__pages > a:nth-child(10)')))
        html = self.browser.page_source
        get_products(wd, html)
        self.close_window()

    def buffer(self, steps=20, step_px=380, pause=0.8):
        """Scroll the current page down in small increments.

        Args:
            steps: Number of scroll steps (default 20).
            step_px: Pixels scrolled per step (default 380).
            pause: Seconds slept before each step (default 0.8).
        """
        for _ in range(steps):
            time.sleep(pause)
            self.browser.execute_script(f'window.scrollBy(0,{step_px})')

    def close_window(self):
        """Close the tab at index 1 once more than three are open.

        Afterwards the driver is focused on the last window handle.
        """
        handles = self.browser.window_handles
        print('length', handles)  # show the currently open window handles
        if len(handles) > 3:
            self.browser.switch_to.window(self.browser.window_handles[1])
            self.browser.close()
            time.sleep(1)
        # The closed tab is no longer a valid target, so re-focus the last one.
        self.browser.switch_to.window(self.browser.window_handles[-1])
def save_csv(lise_line, path="./alibaba_com_img.csv"):
    """Append one row to the CSV output file.

    The file is opened in append mode for each call and closed again before
    returning, so no file handle is left open between rows.

    Args:
        lise_line: Sequence of cell values that make up the row.
        path: Destination CSV file; defaults to the location that was
            previously hard-coded.
    """
    with open(path, 'a', newline="", encoding="utf-8") as handle:
        csv.writer(handle).writerow(lise_line)


# Parse the page HTML.
from scrapy.selector import Selector
def _first(values):
    """Return the first extracted value, or '' when the XPath matched nothing."""
    return values[0] if values else ''


def get_products(wd, html_text):
    """Extract the product cards from a result page and append them to the CSV.

    Each card becomes one row in the header's column order. Single-valued
    fields always occupy exactly one column (empty string when missing), so a
    missing value no longer shifts the following columns. The transaction
    text nodes are appended last, as before. Cards for which nothing at all
    was extracted are skipped instead of producing a row that only holds the
    keyword.

    Args:
        wd: Search keyword, stored in the first column.
        html_text: HTML source of the search-result page.
    """
    select = Selector(text=html_text)
    container = '//*[@id="root"]/div/div[3]/div[2]/div/div/div'
    items = select.xpath(f'{container}/*').extract()
    print('产品数 ', len(items))
    # div[i] can never exceed the number of child elements, so the child
    # count is a safe upper bound (the original used a fixed 1..48).
    for i in range(1, len(items) + 1):
        card = f'{container}/div[{i}]/div'
        title = select.xpath(f'{card}/div[2]/h4/a/@title').extract()  # product title
        title_href = select.xpath(f'{card}/div[2]/h4/a/@href').extract()  # product detail page
        start_num = select.xpath(f'{card}/div[2]/div[1]/div/p[2]/span/text()').extract()  # minimum order quantity
        price = select.xpath(f'{card}/div[2]/div[1]/div/p[1]/@title').extract()  # price
        adress_href = select.xpath(f'{card}/div[3]/a/@href').extract()  # supplier link
        adress = select.xpath(f'{card}/div[3]/a/@title').extract()  # title of the supplier link (original comment: supplier address)
        # NOTE(review): the original comment calls this the repurchase rate,
        # while the variable name says response rate -- confirm.
        Response_Rate = select.xpath(f'{card}/div[3]/div[2]/div[1]/span/span/text()').extract()
        Transaction = select.xpath(f'{card}/div[3]/div[2]/div[2]/span//text()').extract()  # transaction volume text nodes
        img = select.xpath(f'{card}/div[1]/a/div[2]/div[1]/img/@src').extract()  # image URL

        single_fields = [title, img, title_href, start_num, price,
                         adress, adress_href, Response_Rate]
        if not any(single_fields) and not Transaction:
            continue  # no product data at this index
        all_down = [wd] + [_first(field) for field in single_fields] + Transaction
        save_csv(all_down)
        print(title, img, title_href, start_num, price, adress, adress_href, Response_Rate, Transaction)


def main():
    """Crawl result pages 1-5 for every search keyword, then quit the browser."""
    run = Chrome_drive()
    try:
        run.get_login()  # open the home page first (original note: log in by scanning a QR code)
        wd = ['turkey fryer', 'towel warmer']
        for w in wd:
            for i in range(1, 6):
                run.index_page(i, w)
    finally:
        # Always shut Chrome down, even when a page fails to load.
        run.browser.quit()


if __name__ == '__main__':
    # NOTE: the CSV is opened in append mode, so the header row is written
    # again on every run.
    csv_title = 'word,title,img,title_href,start_num,price,adress,adress_href,Response_Rate,Transaction,Transactioning'.split(',')
    save_csv(csv_title)
    main()

第二步：下载图片。运行 D02_get_img.py，图片会保存到 ./downloads_picture/ 文件夹中。

# -*- coding: utf-8 -*-
# @Time    : 2020/6/17 14:41
# @Author  : 结尾！！
# @FileName: D02_get_img.py
# @Software: PyCharm
import os

import requests


def open_requests(img, img_name, timeout=30):
    """Download one image and store it under ``./downloads_picture/``.

    Args:
        img: Image URL as stored in the CSV. ``https:`` is prepended unless
            the value already starts with ``http://`` or ``https://``.
        img_name: File name to write inside ``./downloads_picture/``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True when the image was written, False when the request failed or the
        server answered with an error status (nothing is written then).
    """
    img_url = img if img.startswith(('http://', 'https://')) else 'https:' + img
    try:
        res = requests.get(img_url, timeout=timeout)
        # Do not save an HTTP error page as if it were an image.
        res.raise_for_status()
    except requests.RequestException as exc:
        print('download failed', img_url, exc)
        return False
    os.makedirs('./downloads_picture', exist_ok=True)
    with open(f"./downloads_picture/{img_name}", 'wb') as fn:
        fn.write(res.content)
    return True


import pandas as pd
# Read the crawl output and download every image whose URL contains '@sc01'.
df1 = pd.read_csv('./alibaba_com_img.csv')
for img in df1["img"]:
    # Skip rows without an image URL and URLs that lack the '@sc01' marker.
    if pd.isnull(img) or '@sc01' not in img:
        continue
    # Everything after the first 24 characters is used as the file name.
    img_name = img[24:]
    print(img, img_name)
    open_requests(img, img_name)

第三步：将图片插入到对应的 Excel 中，运行 D3将图片插入到excel中.py。

# -*- coding: utf-8 -*-
# @Time    : 2020/1/19 10:17
# @Author  : 结尾！！
# @FileName: 4将图片插入到excel中.py
# @Software: PyCharm
import os

from PIL import Image
import xlwings as xw

# Open the workbook in a visible Excel instance and read the image-URL column.
path = './alibaba_com.xlsx'
app = xw.App(visible=True, add_book=False)
wb = app.books.open(path)
sht = wb.sheets['alibaba_com_img']
img_list = sht.range("D2").expand('down').value
# A range that expands to a single cell yields a scalar (or None when the
# cell is empty) instead of a list; normalise so the code below can iterate.
if not isinstance(img_list, list):
    img_list = [] if img_list is None else [img_list]
print(len(img_list))


def write_pic(cell, img_name):
    """Insert a downloaded picture into the sheet, anchored at ``cell``.

    The picture is placed at the cell's top-left corner with a fixed width of
    70 and a height scaled to keep the aspect ratio.

    Args:
        cell: Cell address such as ``"B2"``.
        img_name: File name inside ``./downloads_picture/``.

    Raises:
        Whatever ``Image.open`` or ``sht.pictures.add`` raise, e.g. when the
        file does not exist.
    """
    rel_path = f'./downloads_picture/{img_name}'
    print(rel_path)
    fileName = os.path.join(os.getcwd(), rel_path)
    # Only the dimensions are needed; the context manager closes the file.
    with Image.open(rel_path) as img:
        print(img.size)
        w, h = img.size
    x_s = 70  # picture width (the original note mentions a 200x200 cell format)
    y_s = h * x_s / w  # height scaled proportionally to the width
    sht.pictures.add(fileName, left=sht.range(cell).left, top=sht.range(cell).top, width=x_s, height=y_s)


if __name__ == '__main__':
    try:
        for index, img in enumerate(img_list):
            cell = "B" + str(index + 2)
            # Skip empty / non-text cells and URLs without the '@sc01' marker.
            if not isinstance(img, str) or '@sc01' not in img:
                continue
            img_name = img[24:]
            try:
                write_pic(cell, img_name)
                print(cell, img_name)
            except Exception as exc:
                # Keep going with the remaining rows, but show what went wrong.
                print("没有找到这个img_name的图片", img_name, exc)
        wb.save()
    finally:
        # Close the workbook and Excel even if the loop or the save fails.
        wb.close()
        app.quit()

selenium爬取阿里巴巴国际站相关推荐

爬虫之selenium爬取斗鱼网站
爬虫之selenium爬取斗鱼网站示例代码: from selenium import webdriver import timeclass Douyu(object):def __init__(s ...
Php使用selenium爬虫,selenium,python爬虫_使用selenium爬取网站时输出结果不正确，selenium,python爬虫 - phpStudy...
使用selenium爬取网站时输出结果不正确网站链接:http://www.ncbi.nlm.nih.gov/pubmed?term=(%222013%22%5BDate%20-%20Publica ...
[Python3网络爬虫开发实战] 7-动态渲染页面爬取-4-使用Selenium爬取淘宝商品
在前一章中,我们已经成功尝试分析Ajax来抓取相关数据,但是并不是所有页面都可以通过分析Ajax来完成抓取.比如,淘宝,它的整个页面数据确实也是通过Ajax获取的,但是这些Ajax接口参数比较复杂,可 ...
利用Selenium爬取淘宝商品信息
文章来源:公众号-智能化IT系统. 一. Selenium和PhantomJS介绍 Selenium是一个用于Web应用程序测试的工具,Selenium直接运行在浏览器中,就像真正的用户在操作一样. ...
[python爬虫] selenium爬取局部动态刷新网站（URL始终固定）
在爬取网站过程中,通常会遇到局部动态刷新情况,当你点击"下一页"或某一页时,它的数据就进行刷新,但其顶部的URL始终不变.这种局部动态刷新的网站,怎么爬取数据呢?某网站数据显示如下 ...
[python爬虫] Selenium爬取内容并存储至MySQL数据库
前面我通过一篇文章讲述了如何爬取CSDN的博客摘要等信息.通常,在使用Selenium爬虫爬取数据后,需要存储在TXT文本中,但是这是很难进行数据处理和数据分析的.这篇文章主要讲述通过Selenium ...
python爬取网易云歌单_详解python selenium 爬取网易云音乐歌单名
目标网站: 首先获取第一页的数据,这里关键要切换到iframe里打印一下获取剩下的页数,这里在点击下一页之前需要设置一个延迟,不然会报错. 结果: 一共37页,爬取完毕后关闭浏览器完整代码: u ...
layui获取input信息_python爬虫—用selenium爬取京东商品信息
python爬虫--用selenium爬取京东商品信息 1.先附上效果图(我偷懒只爬了4页) 2.京东的网址https://www.jd.com/ 3.我这里是不加载图片,加快爬取速度,也可以用Hea ...
Python之网络爬虫（selenium爬取动态网页、爬虫案例分析、哈希算法与RSA加密）
文章目录一.selenium爬取动态网页二.爬虫案例分析三.哈希hash算法与RSA加密一.selenium爬取动态网页 1.动态网页认知爬虫其实就是在模仿浏览器的行为应对要多次数据的交互 ...
爬取新笔趣阁排行并保存到mysql_python+selenium爬取微博热搜存入Mysql的实现方法...
最终的效果废话不多少,直接上图这里可以清楚的看到,数据库里包含了日期,内容,和网站link 下面我们来分析怎么实现使用的库 import requests from selenium.webdr ...

selenium爬取阿里巴巴国际站

selenium爬取阿里巴巴国际站相关推荐

最新文章

热门文章