day03 爬取京东信息，bs4

今日内容:
一 Selenium剩余部分
二 BeautifulSoup4

一 Selenium剩余部分

1.元素交互操作:
    - 点击、清除
        click
        clear

- ActionChains
是一个动作链对象，需要把driver驱动传给它。
动作链对象可以操作一系列设定好的动作行为。

- iframe的切换
driver.switch_to.frame('iframeResult')

- 执行js代码
execute_script()

爬取京东商品信息

# 初级版:#     from tank!# '''# import time# from selenium import webdriver# from selenium.webdriver.common.keys import Keys## driver = webdriver.Chrome()## num = 1## try:#     driver.implicitly_wait(10)#     # 往京东发送请求#     driver.get('https://www.jd.com/')##     # 往京东主页输入框输入墨菲定律，按回车键#     input_tag = driver.find_element_by_id('key')#     input_tag.send_keys('墨菲定律')#     input_tag.send_keys(Keys.ENTER)##     time.sleep(5)###     good_list = driver.find_elements_by_class_name('gl-item')#     for good in good_list:#         # print(good)#         # 商品名称#         good_name = good.find_element_by_css_selector('.p-name em').text#         # print(good_name)##         # 商品链接#         good_url = good.find_element_by_css_selector('.p-name a').get_attribute('href')#         # print(good_url)##         # 商品价格#         good_price = good.find_element_by_class_name('p-price').text#         # print(good_price)##         # 商品评价#         good_commit = good.find_element_by_class_name('p-commit').text##         good_content = f'''#         num: {num}#         商品名称: {good_name}#         商品链接: {good_url}#         商品价格: {good_price}#         商品评价: {good_commit}#         \n#         '''##         print(good_content)##         with open('jd.txt', 'a', encoding='utf-8') as f:#             f.write(good_content)#         num += 1##     print('商品信息写入成功!')### finally:#     driver.close()

'''中级版'''# import time# from selenium import webdriver# from selenium.webdriver.common.keys import Keys## driver = webdriver.Chrome()## num = 1## try:#     driver.implicitly_wait(10)#     # 往京东发送请求#     driver.get('https://www.jd.com/')##     # 往京东主页输入框输入墨菲定律，按回车键#     input_tag = driver.find_element_by_id('key')#     input_tag.send_keys('墨菲定律')#     input_tag.send_keys(Keys.ENTER)##     time.sleep(5)##     # 下拉滑动5000px#     js_code = '''#         window.scrollTo(0, 5000)#     '''##     driver.execute_script(js_code)##     # 等待5秒，待商品数据加载#     time.sleep(5)##     good_list = driver.find_elements_by_class_name('gl-item')#     for good in good_list:#         # print(good)#         # 商品名称#         good_name = good.find_element_by_css_selector('.p-name em').text#         # print(good_name)##         # 商品链接#         good_url = good.find_element_by_css_selector('.p-name a').get_attribute('href')#         # print(good_url)##         # 商品价格#         good_price = good.find_element_by_class_name('p-price').text#         # print(good_price)##         # 商品评价#         good_commit = good.find_element_by_class_name('p-commit').text##         good_content = f'''#         num: {num}#         商品名称: {good_name}#         商品链接: {good_url}#         商品价格: {good_price}#         商品评价: {good_commit}#         \n#         '''##         print(good_content)##         with open('jd.txt', 'a', encoding='utf-8') as f:#             f.write(good_content)#         num += 1##     print('商品信息写入成功!')##     # 找到下一页并点击#     next_tag = driver.find_element_by_class_name('pn-next')#     next_tag.click()##     time.sleep(10)## finally:#     driver.close()

'''狂暴版'''import timefrom selenium import webdriverfrom selenium.webdriver.common.keys import Keys

def _format_good(num, good_name, good_url, good_price, good_commit):
    """Render one product record in the text layout appended to the output file."""
    return f'''
            num: {num}
            商品名称: {good_name}
            商品链接: {good_url}
            商品价格: {good_price}
            商品评价: {good_commit}
            \n
            '''


def get_good(driver, wait_seconds=5, output_path='jd.txt'):
    """Scrape every result page reachable through the "next page" button.

    For each page: wait, scroll down so more items are loaded, wait again,
    then read name / link / price / review text from every ``gl-item``
    element, print the record and append it to ``output_path``.

    The driver is NOT closed here. The caller created it and is responsible
    for releasing it; closing it inside the paging routine shut the browser
    down while outer frames still needed it and made the caller's own
    cleanup run against a dead session.

    Args:
        driver: A Selenium WebDriver already showing a search-result page.
        wait_seconds: Delay used before scrolling, after scrolling and after
            clicking "next page". Defaults to the original 5 seconds.
        output_path: File the records are appended to (UTF-8).

    Returns:
        The total number of product records written across all pages.
    """
    # Numbering runs across pages instead of restarting at 1 on each page.
    num = 1

    # Iterate instead of recursing: one stack frame regardless of page count.
    while True:
        time.sleep(wait_seconds)

        # Scroll down 5000px.
        js_code = '''
            window.scrollTo(0, 5000)
        '''
        driver.execute_script(js_code)

        # Give the product data time to load after scrolling.
        time.sleep(wait_seconds)

        good_list = driver.find_elements_by_class_name('gl-item')

        # Open the file once per page rather than once per product.
        with open(output_path, 'a', encoding='utf-8') as f:
            for good in good_list:
                good_name = good.find_element_by_css_selector('.p-name em').text
                good_url = good.find_element_by_css_selector('.p-name a').get_attribute('href')
                good_price = good.find_element_by_class_name('p-price').text
                good_commit = good.find_element_by_class_name('p-commit').text

                good_content = _format_good(
                    num, good_name, good_url, good_price, good_commit
                )
                print(good_content)
                f.write(good_content)
                num += 1

        print('商品信息写入成功!')

        # The plural finder returns an empty list when nothing matches, so
        # the last page ends the loop instead of raising.
        next_tags = driver.find_elements_by_class_name('pn-next')
        if not next_tags:
            break

        next_tag = next_tags[0]
        # NOTE(review): assumes the site marks an inactive "next" button with
        # a "disabled" class on the last page - confirm against the live page.
        if 'disabled' in (next_tag.get_attribute('class') or '').split():
            break

        next_tag.click()
        time.sleep(wait_seconds)

    return num - 1

def main():
    """Open JD, search for the keyword and scrape the result pages.

    Starts a Chrome WebDriver, loads the JD home page, types the search term
    into the input with id ``key``, submits it with ENTER and hands the
    driver to ``get_good``.
    """
    driver = webdriver.Chrome()
    try:
        driver.implicitly_wait(10)

        # Send the request to JD.
        driver.get('https://www.jd.com/')

        # Type the search term into the home-page search box and press ENTER.
        input_tag = driver.find_element_by_id('key')
        input_tag.send_keys('墨菲定律')
        input_tag.send_keys(Keys.ENTER)

        # Scrape the product information.
        get_good(driver)

    finally:
        # quit() ends the whole browser session and the driver process;
        # close() only closes the current window and can leave both running.
        driver.quit()


if __name__ == '__main__':
    main()

bs4搜索文档树

find: 找第一个
find_all: 找所有

标签查找与属性查找:name 属性匹配

    name 标签名
    attrs 属性查找匹配
    text 文本匹配

    标签:        - 字符串过滤器               字符串全局匹配

        - 正则过滤器            re模块匹配

        - 列表过滤器            列表内的数据匹配

        - bool过滤器            True匹配

        - 方法过滤器            用于一些要的属性以及不需要的属性查找。    属性:        - class_        - id'''html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<a href="http://example.com/elsie" class="sister" >Elsie</a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>and they lived at the bottom of a well.</p><p class="story">...</p>"""from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')

# name 标签名# attrs 属性查找匹配# text 文本匹配# find与find_all搜索文档

'''

字符串过滤器'''p = soup.find(name='p')p_s = soup.find_all(name='p')

print(p)print(p_s)

# name + attrsp = soup.find(name='p', attrs={"id": "p"})print(p)

# name + texttag = soup.find(name='title', text="The Dormouse's story")print(tag)

# name + attrs + texttag = soup.find(name='a', attrs={"class": "sister"}, text="Elsie")print(tag)

'''- 正则过滤器re模块匹配'''import re# name# 根据re模块匹配带有a的节点a = soup.find(name=re.compile('a'))print(a)

a_s = soup.find_all(name=re.compile('a'))print(a_s)

# attrsa = soup.find(attrs={"id": re.compile('link')})print(a)

# - 列表过滤器# 列表内的数据匹配print(soup.find(name=['a', 'p', 'html', re.compile('a')]))print(soup.find_all(name=['a', 'p', 'html', re.compile('a')]))

# - bool过滤器# True匹配print(soup.find(name=True, attrs={"id": True}))

# --- Function filter ---
# Used to look up tags by attributes they must have and attributes they
# must not have.

def have_id_not_class(tag):
    """Select ``<p>`` tags that have an ``id`` attribute but no ``class``.

    Args:
        tag: Object exposing ``name`` and ``has_attr(key)``.

    Returns:
        ``tag`` itself when it matches, otherwise ``None``.
    """
    # Guard clauses, checked in the same order as the original condition.
    if tag.name != 'p':
        return None
    if not tag.has_attr("id"):
        return None
    if tag.has_attr("class"):
        return None
    return tag

# Usage pattern: print(soup.find_all(name=<function object>))
function_matches = soup.find_all(name=have_id_not_class)
print(function_matches)

# Supplementary notes: keyword shortcuts for attribute lookups.
# id
a = soup.find(id='link2')
print(a)

# class
p = soup.find(class_='sister')
print(p)

转载于:https://www.cnblogs.com/x2436876927/p/11129196.html

day03 爬取京东信息，bs4相关推荐

【python爬虫系列】13.实战二爬取京东信息
实战2 爬取京东 1. 2.设计架构: 每个商品只有100页, 用线程池处理,不需要分区对于耗费时间的评论获取使用Celery分布式获取 Celery使用redis中间件和存储结果写入cs 3.写 ...
Scrapy练习——爬取京东商城商品信息
刚刚接触爬虫,花了一段时间研究了一下如何使用scrapy,写了一个比较简单的小程序,主要用于爬取京东商城有关进口牛奶页面的商品信息,包括商品的名称,价格,店铺名称,链接,以及评价的一些信息等.简单记录 ...
python爬取京东评论_Python如何爬取京东的评价信息
Python如何爬取京东的评价信息模块:requests,BeautifulSoup import re import time import csv import requests from bs ...
python爬取京东商品属性_python爬虫小项目：爬取京东商品信息
#爬取京东手机信息 import requests from bs4 import BeautifulSoup from selenium import webdriver import re imp ...
python爬取京东商品信息代码_爬取京东商品信息
利用 BeautifulSoup + Requests 爬取京东商品信息并保存在Excel中一.查看网页信息打开京东商城,随便输入个商品,就选固态硬盘吧先看看 URL 的规律,可以看到我们输入的 ...
爬取京东商品详情页信息
之前写过爬取京东商品导航信息,现在献上爬取京东商品详情页信息. #爬取京东商品详情页信息 #2017/7/30import requests from bs4 import BeautifulSoup ...
利用python爬取京东华为旗舰店手机信息（价格、型号、颜色、链接等）
目录第一章.前言 1.1.效果展示 1.2.需要用到的库 1.3.原理分析第二章.代码分开讲解 2.1.对象的定义及初始化 2.1.1.第一至二行 2.1.2.第三至四行 2.1.3.第五至六行 ...
python爬取京东书籍_Python爬取当当、京东、亚马逊图书信息代码实例
注:1.本程序采用MSSQLserver数据库存储,请运行程序前手动修改程序开头处的数据库链接信息 2.需要bs4.requests.pymssql库支持 3.支持多线程 from bs4 impor ...
用selenium爬取京东平台商品列表,爬取商品名称、价格、店铺信息
#用selenium爬取京东平台商品列表,爬取商品名称.价格.店铺信息from selenium import webdriver from selenium.webdriver.common.by ...
go爬虫和python爬虫哪个好_python 爬虫实战项目--爬取京东商品信息（价格、优惠、排名、好评率等）-Go语言中文社区...
利用splash爬取京东商品信息一.环境 window7 python3.5 pycharm scrapy scrapy-splash MySQL 二.简介为了体验scrapy-splash 的动 ...

day03 爬取京东信息，bs4

day03 爬取京东信息，bs4相关推荐

最新文章

热门文章