文章目录

  • 爬虫篇
  • 绘制图书图片墙
  • 数据分析篇

爬虫篇

'''
Function:当当网图书爬虫
'''
import time
import pickle
import random
import requests
from bs4 import BeautifulSoup

# Request headers: a desktop Chrome User-Agent plus the search host so the
# site serves the normal HTML results page instead of rejecting the bot.
# (The original paste fused the bs4 import and this dict onto one line.)
headers = {
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Host': 'search.dangdang.com',
}
def parseHtml(html):
    """Parse one page of dangdang search results.

    Returns a dict mapping book title -> [cover image url, price,
    description text, star rating (0-5), comment count].
    """
    soup = BeautifulSoup(html, 'lxml')
    shoplist = soup.find_all('div', {'class': 'con shoplist'})[0]
    parsed = {}
    for item in shoplist.find_all('li'):
        first_link = item.find_all('a')[0]
        # Book title lives in the anchor's title attribute.
        title = first_link.get('title').strip(' ')
        # Lazy-loaded covers keep the real url in data-original; fall back to src.
        cover = first_link.img.get('data-original')
        if cover is None:
            cover = first_link.img.get('src')
        cover = cover.strip(' ')
        # Drop the leading currency symbol before converting the price.
        price = float(item.find_all('p', {'class': 'price'})[0].span.text[1:])
        detail = item.find_all('p', {'class': 'detail'})[0].text
        star_line = item.find_all('p', {'class': 'search_star_line'})[0]
        # The rating is encoded as a CSS width percentage (100% == 5 stars).
        rating = float(star_line.span.span.get('style').split(': ')[-1].strip('%;')) / 20
        # Comment anchor text ends in a 3-character suffix ("条评论"); strip it.
        comments = float(star_line.a.text[:-3])
        parsed[title] = [cover, price, detail, rating, comments]
    return parsed
def main(keyword):
    """Crawl every search-result page for *keyword* and pickle the data.

    Stops when dangdang shows its "no matching products" message, dumps
    {title: [img, price, detail, stars, comments]} to
    '<keyword>_<last_page>.pkl', and returns the dict.
    """
    url_template = 'http://search.dangdang.com/?key={}&act=input&page_index={}'
    collected = {}
    page_no = 0
    while True:
        page_no += 1
        print('[INFO]: Start to get the data of page%d...' % page_no)
        response = requests.get(url_template.format(keyword, page_no), headers=headers)
        # The site shows this apology text on the first page past the results.
        if '抱歉,没有找到与“%s”相关的商品,建议适当减少筛选条件' % keyword in response.text:
            break
        collected.update(parseHtml(response.text))
        # Randomized 0.5-1.5s delay to avoid hammering the server.
        time.sleep(random.random() + 0.5)
    with open('%s_%d.pkl' % (keyword, page_no - 1), 'wb') as f:
        pickle.dump(collected, f)
    return collected


if __name__ == '__main__':
    main('python')

绘制图书图片墙

思路:
1)先利用爬虫爬取当当网图书的图片url
2)批量爬取图片
3)绘制图片墙

import os
import time
import math
import pickle
import requests
from PIL import Image

# Directory the downloaded cover images are saved into / read from.
# (The original paste fused the PIL import and this constant onto one line.)
PICDIR = 'pictures'

# Minimal headers: a desktop Chrome User-Agent so image requests are served.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
}
def downloadPics(urls, savedir):
    """Download every image url into *savedir* as 0.jpg, 1.jpg, ..."""
    if not os.path.exists(savedir):
        os.mkdir(savedir)
    for index, pic_url in enumerate(urls):
        response = requests.get(pic_url, headers=headers)
        target = os.path.join(savedir, '%d.jpg' % index)
        with open(target, 'wb') as fp:
            fp.write(response.content)
        # Be polite: half a second between image requests.
        time.sleep(0.5)
def makePicturesWall(picdir):
    """Tile the images in *picdir* into a square collage saved as picwall.png.

    The canvas holds line_numpics**2 thumbnails of 64x64 px; when the image
    count is not a perfect square, the surplus images are pasted past the
    bottom edge and clipped by PIL.
    """
    picslist = os.listdir(picdir)
    num_pics = len(picslist)
    print('照片数量', num_pics)
    size = 64  # edge length of one thumbnail, in pixels
    # Largest square grid the available pictures can fill completely.
    line_numpics = int(math.sqrt(num_pics))
    picwall = Image.new('RGBA', (line_numpics * size, line_numpics * size))
    x = 0
    y = 0
    for pic in picslist:
        img = Image.open(os.path.join(picdir, pic))
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        img = img.resize((size, size), Image.LANCZOS)
        picwall.paste(img, (x * size, y * size))
        x += 1
        if x == line_numpics:  # wrap to the next row
            x = 0
            y += 1
    print('[INFO]: Generate pictures wall successfully...')
    picwall.save("picwall.png")


if __name__ == '__main__':
    with open('python_61.pkl', 'rb') as f:
        data = pickle.load(f)
    # First element of each pickled value is the cover image url.
    urls = [info[0] for info in data.values()]
    # downloadPics(urls, PICDIR)
    makePicturesWall(PICDIR)

图片墙:

数据分析篇

'''
Function: 当当网图书数据分析
'''
import os
import jieba
import pickle
from pyecharts import Bar
from pyecharts import Pie
from pyecharts import Funnel
from wordcloud import WordCloud
def drawBar(title, data, savepath='./results'):
    """Render a bar chart of {label: count} to <savepath>/<title>.html."""
    if not os.path.exists(savepath):
        os.mkdir(savepath)
    chart = Bar(title, title_pos='center')
    # chart.use_theme('vintage')
    labels = list(data.keys())
    counts = list(data.values())
    # Rotate axis labels so long Chinese labels do not overlap.
    chart.add('', labels, counts, xaxis_rotate=15, yaxis_rotate=30)
    chart.render(os.path.join(savepath, '%s.html' % title))
def drawPie(title, data, savepath='./results'):
    """Render a rose-style pie chart of {label: count} to <savepath>/<title>.html."""
    if not os.path.exists(savepath):
        os.mkdir(savepath)
    chart = Pie(title, title_pos='center')
    # chart.use_theme('westeros')
    labels = list(data.keys())
    counts = list(data.values())
    chart.add(
        '', labels, counts,
        is_label_show=True,
        legend_orient="vertical",  # stack legend entries in a column
        legend_pos="left",         # legend on the left-hand side
        radius=[30, 75],
        rosetype="area",           # sector width varies with the value
    )
    chart.render(os.path.join(savepath, '%s.html' % title))
def drawFunnel(title, data, savepath='./results'):
    """Render a funnel chart of {label: count} to <savepath>/<title>.html."""
    if not os.path.exists(savepath):
        os.mkdir(savepath)
    chart = Funnel(title, title_pos='center')
    # chart.use_theme('chalk')
    labels = list(data.keys())
    counts = list(data.values())
    chart.add(
        "", labels, counts,
        is_label_show=True,
        label_pos="inside",        # draw each label inside its segment
        label_text_color="#fff",
        funnel_gap=5,
        legend_pos="left",
        legend_orient="vertical",  # stack legend entries in a column
    )
    chart.render(os.path.join(savepath, '%s.html' % title))
def statistics(texts, stopwords):
    """Count word frequencies over *texts* after jieba segmentation.

    Words found in *stopwords*, and the crawler's 'unknow' placeholder,
    are skipped. Returns {word: occurrence count}.
    """
    # Stopword lists can be long; a set makes each membership test O(1)
    # instead of scanning the list for every token.
    stopwords = set(stopwords)
    words_dict = {}
    for text in texts:
        for word in jieba.cut(text):
            if word in stopwords or word == 'unknow':
                continue
            # dict.get avoids the double lookup of `if t in words_dict.keys()`.
            words_dict[word] = words_dict.get(word, 0) + 1
    return words_dict
def drawWordCloud(words, title, savepath='./results'):
    """Render a word cloud from {word: frequency} to <savepath>/<title>.png."""
    if not os.path.exists(savepath):
        os.mkdir(savepath)
    wc = WordCloud(background_color='white', max_words=2000, width=1920, height=1080, margin=5)
    wc.generate_from_frequencies(words)
    wc.to_file(os.path.join(savepath, title + '.png'))


if __name__ == '__main__':
    with open('python_61.pkl', 'rb') as f:
        data = pickle.load(f)

    # ---- Price distribution ----
    results = {}
    prices = []
    price_max = ['', 0]  # [title, price] of the most expensive book seen
    for key, value in data.items():
        price = value[1]
        if price_max[1] < price:
            price_max = [key, price]
        prices.append(price)
    results['小于50元'] = sum(i < 50 for i in prices)
    results['50-100元'] = sum((i < 100 and i >= 50) for i in prices)
    results['100-200元'] = sum((i < 200 and i >= 100) for i in prices)
    results['200-300元'] = sum((i < 300 and i >= 200) for i in prices)
    results['300-400元'] = sum((i < 400 and i >= 300) for i in prices)
    results['400元以上'] = sum(i >= 400 for i in prices)
    drawPie('python相关图书的价格分布', results)
    print('价格最高的图书为: %s, 目前单价为: %f' % (price_max[0], price_max[1]))

    # ---- Star-rating distribution ----
    results = {}
    stars = []
    for key, value in data.items():
        # A rating of 0 means the book has no rating yet.
        star = value[3] if value[3] > 0 else '暂无评分'
        stars.append(str(star))
    for each in sorted(set(stars)):
        results[each] = stars.count(each)
    drawBar('python相关图书评分分布', results)

    # ---- Comment-count distribution and top 6 ----
    results = {}
    comments_num = []
    top6 = {}
    for key, value in data.items():
        num = int(value[-1])
        comments_num.append(num)
        # Trim series/edition decorations off the title so labels stay short.
        top6[key.split('【')[0].split('(')[0].split('(')[0].split(' ')[0].split(':')[0]] = num
    results['0评论'] = sum(i == 0 for i in comments_num)
    results['0-100评论'] = sum((i > 0 and i <= 100) for i in comments_num)
    results['100-1000评论'] = sum((i > 100 and i <= 1000) for i in comments_num)
    results['1000-5000评论'] = sum((i > 1000 and i <= 5000) for i in comments_num)
    results['5000评论以上'] = sum(i > 5000 for i in comments_num)
    drawFunnel('python相关图书评论数量分布', results)
    top6 = dict(sorted(top6.items(), key=lambda item: item[1])[-6:])
    drawBar('python相关图书评论数量TOP6', top6)

    # ---- Word cloud over book descriptions ----
    # Context manager closes the stopword file (the original leaked the handle).
    with open('./stopwords.txt', 'r', encoding='utf-8') as f:
        stopwords = f.read().split('\n')[:-1]
    texts = [value[2] for value in data.values()]
    words_dict = statistics(texts, stopwords)
    drawWordCloud(words_dict, 'python相关图书简介词云', savepath='./results')

图片展示:




评论词云:

全部代码与数据放在Github上:
https://github.com/why19970628/Python_Crawler/tree/master/DangDang_Books

当当网图书爬虫与数据分析相关推荐

  1. Redis-Scrapy分布式爬虫:当当网图书为例

    Scrapy-Redis分布式策略: Scrapy_redis在scrapy的基础上实现了更多,更强大的功能,具体体现在: reqeust去重,爬虫持久化,和轻松实现分布式 假设有四台电脑:Windo ...

  2. python爬虫之静态网页(以当当网图书畅销榜榜为例)

    在编写爬虫代码时,将网页分为静态网页和动态网页两种,不同类型的网页数据有着不同的处理办法,这篇文章简述爬取静态网页的方法,以当当网的图书销量为例,使用爬虫技术可以获得网页的页面数据并且自动生成exce ...

  3. 请用Python写一个爬虫程序,爬取当当网图书畅销榜数据

    好的,下面是一个简单的 Python 爬虫代码,可以爬取当当网图书畅销榜数据: import requests from bs4 import BeautifulSoupurl = "htt ...

  4. Scrapy爬取当当网图书销售前100

    scrapy爬取当当网图书畅销榜 一.采集任务 爬取当当网图书畅销榜信息,获取热销图书前500相关数据. 二.网页解析 1. 打开当当网,按照图书榜>图书畅销榜 进入当当网图书畅销榜[http: ...

  5. scrapy框架的简单使用——爬取当当网图书信息

    ** Scrapy爬取当当网图书信息实例 --以警察局办案为类比 ** 使用Scrapy进行信息爬取的过程看起来十分的复杂,但是他的操作方式与警局办案十分的相似,那么接下来我们就以故事的形式开始Scr ...

  6. 数据挖掘 (三)——基于python的当当网小爬虫

    导语 本文将实现利用Python爬取并简单地可视化分析当当网的图书数据. 详细的exe.源代码.程序解析等文件请关注公众号 行歌 知否, 回复关键词 当当网小爬虫 获取. 更多精彩内容请关注公众号 转 ...

  7. 使用selenium爬取当当网图书

    1.目标 利用Selenium抓取当当网图书并用正则表达式获取到得到商品的图片.名称.价格.评论人数信息. 2.开发环境 python2.7 +windows10 +pycharm 3.页面分析 如图 ...

  8. 在当当买了python怎么下载源代码-初学Python 之抓取当当网图书页面目录并保存到txt文件...

    这学期新开了门"高大上"的课<机器学习>,也开始入门Python.然后跟我们一样初学Python 的老师布置了个"作业"--用Python 弄个抓取 ...

  9. Python爬虫实战+Scrapy框架 爬取当当网图书信息

    1.环境准备 1.在python虚拟环境终端使用 pip install scrapy下载scrapy依赖库 2.使用scrapy startproject book创建scrapy心目工程 3.使用 ...

  10. python 爬虫 爬取当当网图书信息

    初次系统的学习python,在学习完基本语法后,对爬虫进行学习,现在对当当网进行爬取,爬取了基本图书信息,包括图书名.作者等 import requests from time import slee ...

最新文章

  1. 一、nginx 安装
  2. python tkinter 滚动条_如何在tkinter窗口中添加滚动条?
  3. Django进阶之session
  4. RNA-Seq入门  如何跑通一个Rna-Seq流程
  5. 比Spring简单的IoC容器
  6. 【Paper】2017_The distributed optimal consensus algorithms for general linear multi-agent systems
  7. 栈溢出笔记1.3 准备Shellcode
  8. 地产遇冷之际,行业竞争加剧,房企如何走出营销低谷,先声夺人?
  9. html5健康有机蔬菜果汁店网站模板
  10. 美国智能机市场Android份额逼近苹果iOS
  11. 公众号抢号_公众号调性是什么意思?公众号排版可以塑造公众号调性吗?
  12. 网站SEO优化之Robots.txt文件写法。
  13. linux操作系统环境搭建实验报告,操作系统实验报告 Linux基本环境
  14. Kindle Paperwhite安装PDF阅读器Koreader
  15. 成功在神舟K650c-i7 d2(i7-4700MQ、HM87)上装好了Windows XP
  16. 拼多多引流大法,日引百人太简单
  17. Beyond Homophily in Graph Neural Networks: Current Limitations and Effective Designs 论文理解
  18. Oracle EBS中分类账和法人实体 的关系(有sql语句实例)
  19. 统计学之基础知识(数据分析准备)
  20. 支付宝被攻击引发备付金监管问题

热门文章

  1. spss因子分析结果解读_因子分析巴特利特球形度检验结果解读
  2. 【杨中科解惑】我该怎么选择?选择就是放弃
  3. 用户体验之新浪滚动新闻
  4. sift算法matlab详解,sift算法原理详解及应用
  5. 用 Python 可视化分析全球火山分布,发现了这些有趣的现象
  6. TheFatRat免杀
  7. FFmpeg下载地址
  8. Python | 实现pdf文件分页
  9. (开源)微信小程序实时控制stc89c51,通过esp8266
  10. 缺少比较器,运放来救场!(运放当做比较器电路记录)