暑假在家帮家里写了点小代码，在这里分享一下，同时也作个备份。
拼多多的反爬机制好像比较聪明，我老是爬不到数据，也可能是我太菜了。

淘宝

# -*- coding: utf-8 -*-
import requests
import re
import pandas as pd
import time
import xlwt
import os# 此处写入登录之后自己的cookies
# Cookie string copied from a logged-in browser session.  The surrounding
# whitespace is stripped from what the user typed (the original stripped the
# prompt text instead, which left the input untouched).
cookie = input('请输入想查询的商品的cookie：').strip()
# 获取页面信息
def getHTMLText(url):
    """Download *url* using the module-level ``cookie`` string.

    The raw cookie string (``name=value`` pairs separated by ``;``) is turned
    into the dict form that ``requests`` expects.  Segments that contain no
    ``=`` -- for example the empty piece left behind by a trailing ``;`` --
    are skipped instead of raising ``ValueError`` before the request is sent.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as text, or ``''`` if the request failed.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'}

    # requests wants cookies as a dict, so split the header-style string.
    cookies = {}
    for pair in cookie.split(';'):
        name, sep, value = pair.strip().partition('=')
        if not sep:
            continue  # not a name=value pair; nothing to send
        cookies[name] = value

    try:
        r = requests.get(url, cookies=cookies, headers=headers, timeout=60)
    except requests.RequestException:
        # Only network/HTTP-layer failures are treated as "page unavailable".
        print('获取页面信息失败')
        return ''

    print(r.status_code)
    print(r.cookies)
    return r.text


# Format the page and look up the data
def parsePage(html):
    """Extract one row per product from a search-result page.

    Six fields are pulled out of the page text with regular expressions and
    combined position by position.  The number of prices decides how many
    rows are produced; if any other field has fewer matches than that, the
    fields cannot be aligned and the whole page is rejected.

    Args:
        html: Text of the search-result page.

    Returns:
        A list of ``[title, price, location, comment_count, sales, shop]``
        rows.  An empty list is returned when the page has no products or
        when the fields cannot be aligned, so callers always get a list.
    """
    views_title = re.findall('"raw_title":"(.*?)","pic_url"', html)
    # The counts are printed so a mismatch between fields is visible.
    print(len(views_title))
    print(views_title)
    views_price = re.findall('"view_price":"(.*?)","view_fee"', html)
    print(len(views_price))
    print(views_price)
    item_loc = re.findall('"item_loc":"(.*?)","view_sales"', html)
    print(len(item_loc))
    print(item_loc)
    views_sales = re.findall('"view_sales":"(.*?)","comment_count"', html)
    print(len(views_sales))
    print(views_sales)
    comment_count = re.findall('"comment_count":"(.*?)","user_id"', html)
    print(len(comment_count))
    print(comment_count)
    shop_name = re.findall('"nick":"(.*?)","shopcard"', html)
    print(len(shop_name))

    columns = (views_title, views_price, item_loc, comment_count, views_sales, shop_name)
    if any(len(column) < len(views_price) for column in columns):
        # Checked up front instead of relying on an IndexError mid-loop.
        print('有数据信息不全，如某一页面中某一商品缺少地区信息')
        return []

    # zip stops at the shortest column, which is the price column here.
    rows = [[title, price, loc, comments, sales, shop]
            for title, price, loc, comments, sales, shop in zip(*columns)]
    print('爬取数据成功')
    return rows


# Save to a CSV file in preparation for the later data analysis
def save_to_file(list, path='C:\\Users\\Administrator\\Desktop\\商品数据.csv'):
    """Append scraped rows to a CSV file.

    Args:
        list: Rows to write, one inner sequence per product.  ``None`` or an
            empty sequence writes nothing.
        path: Destination CSV file.  Defaults to the desktop file the script
            has always used, so existing one-argument calls are unchanged.
    """
    if list is None or len(list) == 0:
        return  # nothing scraped for this page
    data = pd.DataFrame(list)
    # Append so that every page ends up in the same file; no header row is
    # written, but pandas still emits the row index as the first column.
    data.to_csv(path, header=False, mode='a', encoding='utf-8')
# csv转存为excel
def txt_xls(filename, xlsname, delimiter=';'):
    """Copy a delimited UTF-8 text file into sheet ``sheet1`` of an .xls file.

    Each input line becomes one spreadsheet row and each delimited field one
    cell.  The line terminator is removed first so it does not end up inside
    the last cell of every row.

    Args:
        filename: Path of the text/CSV file to read (decoded as UTF-8).
        xlsname: Path of the .xls workbook to create.
        delimiter: Field separator.  Defaults to ``';'`` as before.
            NOTE(review): pandas ``to_csv`` separates fields with ``','`` by
            default, so files produced that way need ``delimiter=','``.

    Raises:
        OSError: If *filename* cannot be opened (propagated unchanged).
    """
    xls = xlwt.Workbook()
    sheet = xls.add_sheet('sheet1', cell_overwrite_ok=True)
    # The context manager closes the file even if a write fails.
    with open(filename, 'r', encoding='utf-8') as f:
        for row_idx, line in enumerate(f):
            fields = line.rstrip('\r\n').split(delimiter)
            for col_idx, item in enumerate(fields):
                sheet.write(row_idx, col_idx, item)
    xls.save(xlsname)
def main():
    """Scrape five Taobao result pages for a keyword and append them to a CSV.

    A row of column names is written first, then each result page is
    downloaded, parsed and appended.  A page that fails is reported and
    skipped so the remaining pages are still attempted.
    """
    # Local import: only this entry point needs it.
    from urllib.parse import quote

    csv_path = 'C:\\Users\\Administrator\\Desktop\\商品数据.csv'

    name = [['views_title', 'views_price', 'item_loc', 'comment_count', 'views_sales', 'shop_name']]
    data_name = pd.DataFrame(name)
    data_name.to_csv(csv_path, header=False, mode='a+')  # save the column names up front

    # Strip the typed keyword (the original stripped the prompt text instead).
    goods = input('请输入想查询的商品名称：').strip()
    print('注意需要输入和上面cookie所对应的商品名称')
    depth = 5  # number of result pages to fetch

    # Percent-encode the keyword so spaces, '&' or '#' cannot break the query.
    start_url = 'http://s.taobao.com/search?q=' + quote(goods)
    for i in range(depth):
        time.sleep(3 + i)  # wait a little longer before each successive page
        page = i + 1
        print('桐:正在爬取第%s页数据' % page)
        # The leading '&' separates the keyword from the next parameter;
        # without it the keyword was fused with 'imgfile='.
        url = start_url + '&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20200408&ie=utf8&sort=sale-desc&bcoffset=0&p4ppushleft=%2C44&s=' + str(44 * i)
        try:
            html = getHTMLText(url)
            rows = parsePage(html)
            if rows:
                save_to_file(rows)
        except Exception:
            # Top-level boundary: report and carry on with the next page.
            print('数据没保存成功')


if __name__ == '__main__':
    main()
    filename = "C:\\Users\\Administrator\\Desktop\\商品数据.csv"
    xlsname = "C:\\Users\\Administrator\\Desktop\\商品数据.xls"
    # NOTE(review): txt_xls splits lines on ';' by default while to_csv writes
    # comma-separated rows, so each CSV line lands in a single cell -- confirm
    # whether that is intended.
    txt_xls(filename, xlsname)
    os.remove(filename)  # the intermediate CSV is dropped after conversion

京东

import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from lxml import etree
from openpyxl import Workbook
# Workbook that collects the scraped rows; row 1 holds the column titles.
wb = Workbook()
sheet = wb.active
sheet['A1'] = 'name'
sheet['B1'] = 'price'
sheet['C1'] = 'commit'
sheet['D1'] = 'shop'
sheet['E1'] = 'sku'
sheet['F1'] = 'icons'
sheet['G1'] = 'detail_url'

driver_path = r"C:\Users\Administrator\Desktop\chromedriver.exe"

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
options.add_experimental_option("excludeSwitches", ['enable-automation', 'enable-logging'])
# Do not load images
options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})

# Start exactly one browser, after every option has been set.  The original
# launched a first Chrome before the image preference was added and then
# replaced it with a second one, leaving the first instance running.
driver = webdriver.Chrome(executable_path=driver_path, options=options)
wait = WebDriverWait(driver, 60)  # explicit-wait timeout in seconds
def search(keyword, retries=3):
    """Submit *keyword* in the site search box and return the page count.

    Args:
        keyword: Text typed into the search box.
        retries: How many attempts to make when a wait times out
            (at least one attempt is always made).

    Returns:
        The total number of result pages shown in the bottom pager, as int.

    Raises:
        selenium.common.exceptions.TimeoutException: If every attempt timed
            out.  The original caught the builtin ``TimeoutError``, which
            ``WebDriverWait`` does not raise, and its recursive retry dropped
            the return value.
    """
    # Local import keeps this block self-contained.
    from selenium.common.exceptions import TimeoutException

    attempts = max(1, retries)
    for attempt in range(attempts):
        try:
            # Wait until the search box is present.
            search_box = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#key")))
            # Wait until the search button can be clicked.
            submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#search > div > div.form > button")))
            # Clear first so a retry does not type the keyword twice.
            search_box[0].clear()
            search_box[0].send_keys(keyword)
            submit.click()
            wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > em:nth-child(1) > b')))
            total_page = driver.find_element(By.XPATH, '//*[@id="J_bottomPage"]/span[2]/em[1]/b').text
            return int(total_page)
        except TimeoutException:
            if attempt == attempts - 1:
                raise


def get_data(html):
    """Parse one result page and append a row per product to ``sheet``.

    A list item missing its title, price or detail link is skipped, so one
    malformed entry (the ``[0]`` lookups raise ``IndexError`` on an empty
    match) no longer aborts the rest of the page.

    Args:
        html: Page source of a search-result page.
    """
    selec_data = etree.HTML(html)
    lis = selec_data.xpath('//ul[@class="gl-warp clearfix"]/li')
    for li in lis:
        try:
            title = li.xpath('.//div[@class="p-name p-name-type-2"]//em/text()')[0].strip()  # name
            price = li.xpath('.//div[@class="p-price"]//i/text()')[0].strip()  # price
            detail_url = li.xpath('.//div[@class="p-name p-name-type-2"]/a/@href')[0]  # detail page link
        except IndexError:
            continue

        comment = li.xpath('.//div[@class="p-commit"]//a/text()')  # comment count
        shop_name = li.xpath('.//div[@class="p-shop"]//a/text()')  # shop name
        sku_matches = li.xpath('.//div[@class="p-focus  "]/a/@data-sku')  # unique product id
        icons = li.xpath('.//div[@class="p-icons"]/i/text()')  # remark badges

        comment = comment[0] if comment else ''
        shop_name = shop_name[0] if shop_name else ''
        data_sku = sku_matches[0] if sku_matches else None
        icons_n = ','.join(icons)
        # Prepend the scheme only to protocol-relative links so that links
        # which are already absolute are left intact.
        if detail_url.startswith('//'):
            detail_url = 'https:' + detail_url

        item = [title, price, comment, shop_name, data_sku, icons_n, detail_url]
        print(item)
        sheet.append(item)
def main():
    """Search JD for a keyword, scrape the result pages and save them.

    The result pages are requested with odd ``page`` values (1, 3, 5, ...),
    starting from the first page; the original loop began at ``page=3`` and
    therefore never scraped the first page of results.  At most 19 pages are
    fetched.  The workbook is saved and the browser closed even when a page
    fails, so rows collected so far are not lost.
    """
    # Local import: only this entry point needs it.
    from urllib.parse import quote

    url_main = 'https://www.jd.com/'
    keyword = input('请输入商品名称:')  # search keyword
    driver.get(url=url_main)
    try:
        page = search(keyword)
        driver.implicitly_wait(20)  # same setting for every page, so set once
        encoded_keyword = quote(keyword)  # keep '&', '#', spaces out of the URL
        j = 1
        for i in range(1, page * 2, 2):
            # 's' is 1 for the first page and (j - 1) * 50 afterwards.
            offset = 1 if j == 1 else (j - 1) * 50
            url = 'https://search.jd.com/Search?keyword={}&page={}&s={}&click=0'.format(encoded_keyword, i, offset)
            driver.get(url)
            time.sleep(1)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")  # scroll to the bottom
            time.sleep(3)
            wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="J_goodsList"]/ul/li[last()]')))
            html = driver.page_source
            get_data(html)
            time.sleep(1)
            print(f'正在爬取第{j}页')
            j += 1
            if j == 20:
                break
    finally:
        try:
            wb.save('京东{}信息.xlsx'.format(keyword))
        finally:
            driver.quit()  # always release the browser
    print('桐：>爬取成功啦<')


if __name__ == '__main__':
    main()

Python输入关键词批量得到电商商品信息相关推荐

玩转 python selenium---抓取某知名电商商品页的图片
练下手,爬一下某电商网站上的商品图片,还真爬到了.代码如下: 4.28 from selenium import webdriver import timeoption = webdriver.Chr ...
8000字讲清楚从0到1搭建电商商品中心（建议收藏）
<电商产品经理从0到1>系列文章面向0~3岁的产品经理或者有多年产品工作经验却刚接触电商领域的产品经理. 本系列文章将详细介绍电商核心系统的产品设计方案,帮助你体系化地认识电商产品. 看完 ...
陈宏申：浅谈京东电商商品文案挖掘难点与优化实践
导读: 在电商推荐中,除了推送商品的图片和价格信息外,文案也是商品非常重要的维度.基于编码器解码器范式的序列文本生成模型是文案挖掘的核心,但该种方法面临着两大技术挑战:一是文案生成结果不可靠和生成质量 ...
python 爬虫抓取某电商页面的商品价格
1. 业务需求最近想通过爬虫抓取某电商商品页的商品详情, 浏览器页面打开如下: 本来以为是一个很简单的爬虫,却没想到一波三折,并没有那么简单. 2. 付诸实践接到任务后,就兴冲冲的写了段代码来爬取 ...
python爬取电商订单_Python探索之爬取电商售卖信息代码示例
网络爬虫(又被称为网页蜘蛛,网络机器人,在FOAF社区中间,更经常的称为网页追逐者),是一种按照一定的规则,自动的抓取万维网信息的程序或者脚本. 下面有一个示例代码,分享给大家: #! /usr/bi ...
Solr应用之电商商品搜索备忘
把以前做电商商品搜索的经验归档一下.电商的搜索功能大体上比较相同,从京东.苏宁.易讯等大型电商都可以观察出来.电商搜索功能大致分为几块: 1. 商品搜索.列表的展示,带排序功能:可能有些产品会要求一个 ...
python爬虫实战-如何批量爬取唯品会商品信息＞＞＞
第一步.打开唯品会网站 https://www.vip.com.然后随意搜索一种商品,比如"键盘",搜索之后下拉发现页面URL没有发生改变,但是商品信息在不断加载,那么这就是动态 ...
巨人java生鲜app下载_Java生鲜电商平台-生鲜电商商品中心系统设计与数据库架构（生鲜小程序/APP）...
Java生鲜电商平台-生鲜电商商品中心系统设计与数据库架构(生鲜小程序/APP) 说明:Java生鲜电商平台-生鲜电商商品中心系统设计与数据库架构(生鲜小程序/APP) 日日鲜-商品中心系统设计项目 ...
python爬取并分析淘宝商品信息
python爬取并分析淘宝商品信息背景介绍一.模拟登陆二.爬取商品信息 1. 定义相关参数 2. 分析并定义正则 3. 数据爬取三.简单数据分析 1.导入库 2.中文显示 3.读取数据 4.分 ...
python爬虫爬取当当网的商品信息
python爬虫爬取当当网的商品信息一.环境搭建二.简介三.当当网网页分析 1.分析网页的url规律 2.解析网页html页面书籍商品html页面解析其他商品html页面解析四.代码实现 ...

Python输入关键词批量得到电商商品信息

淘宝

京东

Python输入关键词批量得到电商商品信息相关推荐

最新文章

热门文章