python爬虫爬出新高度

一、运行环境

(1)win7

(2)python 2.7

(3)pycharm

二、获取csdn发表的文章

# coding:utf-8
import urllib2, re, time, random, os, datetime
import HTMLParser
from bs4 import BeautifulSoup
import sys

# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# that implicit str <-> unicode conversions use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf-8')


# Custom print helper
def self_log(msg):
    """Print ``msg`` prefixed with the current local time.

    The timestamp is formatted as ``%Y-%m-%d %H:%M:%S``.
    """
    # The single-argument call form prints the same text on Python 2 and 3.
    print(u'%s: %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), msg))


# Fetch page content
def get_html(url):
    """Download ``url`` and return its body with HTML entities unescaped.

    A browser-like User-Agent header is sent with the request.

    Returns:
        The unescaped page body, or None when the request fails (the HTTP
        status code or the connection error reason is printed).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url=url, headers=headers)
    try:
        html = urllib2.urlopen(req).read()
    except urllib2.HTTPError as e:
        print(e.code)
        return None
    except urllib2.URLError as e:
        # DNS failures, refused connections, etc. carry no HTTP status code.
        print(e.reason)
        return None
    return HTMLParser.HTMLParser().unescape(html)


# Get the total number of blog pages
def get_last_page(html, fd):
    """Work out how many list pages the blog has and record it in ``fd``.

    Args:
        html: Markup of the blog's main page; a falsy value aborts.
        fd: Writable file object that receives a one-line summary.

    Returns:
        None when ``html`` is empty, the page count as an int when the
        pager could be parsed, otherwise 1.
    """
    if not html:
        self_log(u'页面错误，停止运行')
        return None
    page = BeautifulSoup(html, 'lxml')
    try:
        links = page.find('div', class_='pagelist').find_all('a')
        # Presumably the last pager link points at the final page -- confirm
        # against the live markup.
        href = links[-1].get('href')
        # Take the whole trailing number, not just the last character, so
        # blogs with ten or more pages are counted correctly.
        last_page = int(re.findall(r'\d+', href)[-1])
    except (AttributeError, IndexError, TypeError, ValueError):
        # No pager, no links, or no number in the href: single page.
        return 1
    self_log('总共有%s 页博客' % last_page)
    fd.write('总共有%s 页博客\n' % last_page)
    return last_page


# Get the list of article entries on one page
def get_items(url):
    """Return the article entry nodes found on the list page at ``url``.

    Returns an empty list when the page could not be downloaded.
    """
    content_html = get_html(url)
    if not content_html:
        return []
    page = BeautifulSoup(content_html, 'lxml')
    return page.find_all('div', class_='list_item list_view')


# Extract the needed fields from each item in the list
def handle_items(items, content_list, read_num_for_sort):
    """Extract title, URL and counters from each list entry.

    Args:
        items: Iterable of parsed list-entry nodes.
        content_list: List that receives one dict per article (mutated).
        read_num_for_sort: List that receives each read count (mutated).
    """
    for item in items:
        title = item.find('a')  # title link
        content_url = 'http://blog.csdn.net' + title.get('href')  # article URL
        read_times = item.find('span', class_='link_view').text.strip()  # read counter text
        comments_time = item.find('span', class_='link_comments')  # comment counter node
        # Keep only the digits of the read counter; fall back to 0 when the
        # text has none instead of crashing on int('').
        digits = ''.join(re.findall(r'\d', read_times))
        read_number = int(digits) if digits else 0
        read_num_for_sort.append(read_number)
        # Pack the fields for this article.
        content_list.append({
            'indexs': read_number,
            'title': title.text.strip(),
            'read_times': read_times,
            'comments_time': comments_time.text.strip(),
            'content_url': content_url,
        })


# Create a folder
def mkdir_folder(path):
    """Create ``path`` (including parents) if it does not exist yet."""
    if not os.path.exists(path):
        os.makedirs(path)


# Save one article page
def getContent(html, dir_path):
    """Save the article contained in ``html`` as ``<dir_path>/<title>.txt``.

    Failures are printed and the article is skipped; nothing is raised for
    a page that lacks the expected title or content nodes.
    """
    if not html:
        self_log(u'skip article: empty page')
        return
    page = BeautifulSoup(html, 'lxml')
    try:
        title = page.find('div', class_='article_title').find('a').text.strip()
    except AttributeError as e:
        # Without a title there is no file name to write to.
        print(e)
        return
    try:
        content = page.find('div', class_='article_content')
        # Replace characters that are illegal in file names (notably '/').
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
        article_path = dir_path + '/' + safe_title + '.txt'
        with open(article_path, 'w') as f:
            f.write(content.text)
        self_log(u'存贮文章：%s 完毕' % title)
    except Exception as e:
        # Best effort per article: report and carry on with the next one.
        print(e)


# Save every article locally
def run_to_get_article(content_total_list, dir_path):
    """Download and store every article listed in ``content_total_list``.

    Each entry is a '|'-separated summary line whose last field is the URL.
    """
    self_log('start save every article  ')
    for article_content in content_total_list:
        # The URL is the last field; rsplit stays correct when the title
        # itself contains '|'.
        article_url = article_content.rsplit('|', 1)[-1]
        self_log('将要存贮的地址是： %s ...' % article_url)
        artitle_html = get_html(article_url)
        getContent(artitle_html, dir_path)


# Derive the blogger name from the URL; it is also used as the output folder
def get_blocker_name(url):
    """Return the first path segment after '.net' in ``url``.

    Any query string is ignored. The path and the name are also printed.
    """
    path = url.split('.net', 1)[1]
    name = path.split('?')[0].split('/')[1]
    print(path)
    print(name)
    return name


# Main routine
def run(url, dir_path):
    """Collect all article entries of a blog and write a sorted summary file.

    Args:
        url: Blog main page or category page URL.
        dir_path: Folder that receives the timestamped summary file.

    Returns:
        List of '|'-separated summary lines sorted by ascending read count;
        empty when the main page could not be loaded.
    """
    read_num_for_sort = []
    content_list = []
    content_total_list = []
    mkdir_folder(dir_path)
    count_file_name = dir_path + '/' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '.txt'
    # The with-block closes the summary file even if a page fails midway.
    with open(count_file_name, 'w') as fd:
        # 1. Load the main page and read the total page count.
        main_html = get_html(url)
        last_page = get_last_page(main_html, fd)
        if last_page is None:
            return content_total_list
        last_page = int(last_page)  # may arrive as a digit string
        if last_page > 1:
            # 2. Build each page URL and extract the entries it lists.
            base_url = url.split('?')[0]
            for i in range(1, last_page + 1):
                if 'category' not in url:
                    main_url = base_url + '/article/list/%d?viewmode=contents' % i
                else:
                    # The page number belongs to the path, so it is appended
                    # before any query string rather than after it.
                    main_url = base_url + '/%d' % i
                self_log('即将获取第%d页的内容，地址是：%s' % (i, main_url))
                handle_items(get_items(main_url), content_list, read_num_for_sort)
        else:
            handle_items(get_items(url), content_list, read_num_for_sort)
        self_log('总共有%d 篇文章' % len(content_list))
        # Sort ascending by read count (stable sort).
        content_list = sorted(content_list, key=lambda entry: entry['indexs'])
        for article_index, a in enumerate(content_list, 1):
            totalcontent = '第' + str(article_index) + '篇|' + a['title'] + '|' + a['read_times'] + '|' + a['comments_time'] + '|' + a['content_url']
            print(totalcontent)
            fd.write(totalcontent)
            fd.write('\n')
            content_total_list.append(totalcontent)
    return content_total_list


if __name__ == '__main__':
    print('''*****************************************  **    Welcome to Spider of Count CSDN  
*******************************************''')
    # Alternative example inputs:
    # url = 'http://blog.csdn.net/qiqiyingse?viewmode=contents'
    # url = 'http://blog.csdn.net/qiqiyingse/article/category/6292432?viewmode=contents'
    # url = 'http://blog.csdn.net/zuoxiaolong8810/article/category/1434962?viewmode=contents'
    url = 'http://blog.csdn.net/zuochao_2013?viewmode=list'
    dir_path = get_blocker_name(url)
    content_total_list = run(url, dir_path)
    run_to_get_article(content_total_list, dir_path)

二、爬取微信公众号内容

# coding: utf-8
'''
Crawl a WeChat official account through the WeChat search entry of Sogou.
'''
# The next three lines avoid encoding errors on Python 2; they are not
# needed on Python 3.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

from urllib import quote
from pyquery import PyQuery as pq
from selenium import webdriver
from pyExcelerator import *  # Excel helpers (Workbook)
import requests
import time
import re
import json
import os


class weixin_spider:
    """Scrape the article list of one WeChat official account via Sogou."""

    def __init__(self, keywords):
        """Store the account name and prepare the HTTP session and workbook."""
        self.keywords = keywords
        # Sogou WeChat search entry URL.
        self.sogou_search_url = 'http://weixin.sogou.com/weixin?type=1&query=%s&ie=utf8&s_from=input&_sug_=n&_sug_type_=' % quote(self.keywords)
        # Browser-like request header.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'}
        # Request timeout in seconds.
        self.timeout = 5
        # All HTTP requests share one requests session.
        self.s = requests.Session()
        # Header row of the Excel sheet.
        self.excel_data = [u'编号', u'时间', u'文章标题', u'文章地址', u'文章简介']
        # Excel workbook handle.
        self.excle_w = Workbook()

    def get_search_result_by_keywords(self):
        """Return the raw body of the Sogou search result page."""
        self.log('搜索地址为：%s' % self.sogou_search_url)
        return self.s.get(self.sogou_search_url, headers=self.headers, timeout=self.timeout).content

    def get_wx_url_by_sougou_search_html(self, sougou_search_html):
        """Return the account home page URL found in the search result page.

        The value is whatever ``.attr('href')`` yields for the first match.
        """
        doc = pq(sougou_search_html)
        return doc('div[class=txt-box]')('p[class=tit]')('a').attr('href')

    def get_selenium_js_html(self, url):
        """Load ``url`` in PhantomJS and return the rendered page HTML."""
        browser = webdriver.PhantomJS()
        try:
            browser.get(url)
            time.sleep(3)  # give the page scripts time to render
            # Run JS to obtain the whole rendered document.
            return browser.execute_script("return document.documentElement.outerHTML")
        finally:
            # Always shut the driver down so PhantomJS processes do not pile
            # up (one is started per article).
            browser.quit()

    def parse_wx_articles_by_html(self, selenium_html):
        """Return the article nodes found in the rendered account page."""
        doc = pq(selenium_html)
        print('开始查找内容msg')
        # Some accounts may use 'div[class="weui_msg_card"]' instead.
        return doc('div[class="weui_media_box appmsg"]')

    def switch_arctiles_to_list(self, articles):
        """Convert the article nodes to a list of dicts.

        Each article is also written to a sheet named after today's date.
        """
        articles_list = []
        excel_sheet_name = time.strftime('%Y-%m-%d')
        excel_content = self.excle_w.add_sheet(excel_sheet_name)
        if articles:
            total = len(articles)
            for i, article in enumerate(articles.items(), 1):
                self.log(u'开始整合(%d/%d)' % (i, total))
                articles_list.append(self.parse_one_article(article, i, excel_content))
        return articles_list

    def parse_one_article(self, article, i, excel_content):
        """Extract one article, save its HTML and its Excel row.

        Args:
            article: PyQuery node of one article entry.
            i: 1-based row index used for the Excel sheet.
            excel_content: Sheet object that receives the row.

        Returns:
            Dict with title, url, summary, date, pic and content.
        """
        title = article('h4[class="weui_media_title"]').text()
        self.log('标题是： %s' % title)
        # 'hrefs' may be missing; fall back to '' so concatenation cannot fail.
        url = 'http://mp.weixin.qq.com' + (article('h4[class="weui_media_title"]').attr('hrefs') or '')
        self.log('地址为： %s' % url)
        summary = article('.weui_media_desc').text()
        self.log('文章简述： %s' % summary)
        date = article('.weui_media_extra_info').text()
        self.log('发表时间为： %s' % date)
        pic = self.parse_cover_pic(article)
        # .html() yields None when the content node is absent.
        content = self.parse_content_by_url(url).html() or ''
        # Replace characters that are illegal in file names (notably '/').
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', title + '_' + date)
        contentfiletitle = self.keywords + '/' + safe_name + '.html'
        self.save_content_file(contentfiletitle, content)
        # Write the header row and this article's row, then save the workbook.
        row = [i, date, title, url, summary]
        for cols, data in enumerate(self.excel_data):
            excel_content.write(0, cols, data)
            excel_content.write(i, cols, row[cols])
        self.excle_w.save(self.keywords + '/' + self.keywords + '.xls')
        return {
            'title': title,
            'url': url,
            'summary': summary,
            'date': date,
            'pic': pic,
            'content': content,
        }

    def parse_cover_pic(self, article):
        """Return the cover image URL from the inline style, or ''."""
        style = article('.weui_media_hd').attr('style') or ''
        rs = re.findall(r'background-image:url\((.*?)\)', style)
        cover = rs[0] if rs else ''
        self.log('封面图片是：%s ' % cover)
        return cover

    def parse_content_by_url(self, url):
        """Return the '#js_content' node of the rendered article page."""
        page_html = self.get_selenium_js_html(url)
        return pq(page_html)('#js_content')

    def save_content_file(self, title, content):
        """Write ``content`` to the file path ``title``."""
        with open(title, 'w') as f:
            f.write(content)

    def save_file(self, content):
        """Write ``content`` to ``<keywords>/<keywords>.txt``."""
        with open(self.keywords + '/' + self.keywords + '.txt', 'w') as f:
            f.write(content)

    def log(self, msg):
        """Print ``msg`` prefixed with the current local time."""
        print(u'%s: %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), msg))

    def need_verify(self, selenium_html):
        """Return True when the page has a non-empty '#verify_change' element.

        The original author treats this as the sign of a temporary block by
        the target site.
        """
        return pq(selenium_html)('#verify_change').text() != ''

    def create_dir(self):
        """Create the folder named after the account if it is missing."""
        if not os.path.exists(self.keywords):
            os.makedirs(self.keywords)

    def run(self):
        """Run the whole crawl: search, render, parse, save."""
        # Step 0: create the folder named after the account.
        self.create_dir()
        # Step 1: query the Sogou WeChat search with the account name.
        self.log(u'开始获取，微信公众号英文名为：%s' % self.keywords)
        self.log(u'开始调用sougou搜索引擎')
        sougou_search_html = self.get_search_result_by_keywords()
        # Step 2: extract the account home page URL from the result page.
        self.log(u'获取sougou_search_html成功，开始抓取公众号对应的主页wx_url')
        wx_url = self.get_wx_url_by_sougou_search_html(sougou_search_html)
        if not wx_url:
            # No matching account: there is nothing to render.
            self.log(u'wx_url not found for %s' % self.keywords)
            return
        self.log(u'获取wx_url成功，%s' % wx_url)
        # Step 3: render the JS-driven page with Selenium + PhantomJS.
        self.log(u'开始调用selenium渲染html')
        selenium_html = self.get_selenium_js_html(wx_url)
        # Step 4: check whether the target site blocked the crawler.
        if self.need_verify(selenium_html):
            self.log(u'爬虫被目标网站封锁，请稍后再试')
            return
        # Step 5: parse the article list out of the rendered HTML.
        self.log(u'调用selenium渲染html完成，开始解析公众号文章')
        articles = self.parse_wx_articles_by_html(selenium_html)
        self.log(u'抓取到微信文章%d篇' % len(articles))
        # Step 6: convert the articles to a list of dicts.
        self.log(u'开始整合微信文章数据为字典')
        articles_list = self.switch_arctiles_to_list(articles)
        # Step 7: serialise the list to JSON.
        self.log(u'整合完成，开始转换为json')
        data_json = json.dumps(articles_list)
        # Step 8: write the JSON to disk.
        self.log(u'转换为json完成，开始保存json数据到文件')
        self.save_file(data_json)
        self.log(u'保存完成，程序结束')


# main
# 几个可供参考的公众号
# DataBureau
# python6359
# ArchNotes
if __name__ == '__main__':
    print('''*******************************************    Welcome to Spider of 公众号       *******************************************''')
    # Ask for the account name; surrounding whitespace is dropped and an
    # empty answer falls back to the sample account.
    gongzhonghao = raw_input(u'输入要爬取的公众号').strip()
    if not gongzhonghao:
        gongzhonghao = 'python6359'
    weixin_spider(gongzhonghao).run()

三、爬取微信公众号内容制作词云

(1)weixin_spider.py

# coding: utf-8
import sys
# Python 2 only: make UTF-8 the default codec for implicit conversions.
reload(sys)
sys.setdefaultencoding('utf-8')

from urllib import quote
from pyquery import PyQuery as pq
from selenium import webdriver
import requests
import time
import re
import json
import os# 搜索入口地址，以公众为关键字搜索该公众号
def get_search_result_by_keywords(sogou_search_url):
    """Fetch ``sogou_search_url`` and return the raw response body.

    The request uses a browser-like User-Agent and a 5 second timeout.
    """
    # Browser-like request header.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'}
    # Request timeout in seconds.
    timeout = 5
    # The request goes through a requests session.
    s = requests.Session()
    log(u'搜索地址为：%s' % sogou_search_url)
    return s.get(sogou_search_url, headers=headers, timeout=timeout).content


# Get the account home page URL
def get_wx_url_by_sougou_search_html(sougou_search_html):
    """Return the account home page URL found in the search result page.

    The value is whatever ``.attr('href')`` yields for the first match.
    """
    doc = pq(sougou_search_html)
    return doc('div[class=txt-box]')('p[class=tit]')('a').attr('href')


# Load the account home page with webdriver (mainly the JS-rendered part)
def get_selenium_js_html(url, executable_path=r'H:\python\phantomjs.exe'):
    """Load ``url`` in PhantomJS and return the rendered page HTML.

    Args:
        url: Page to render.
        executable_path: Location of the PhantomJS binary; defaults to the
            path the script was written with.
    """
    browser = webdriver.PhantomJS(executable_path=executable_path)
    try:
        browser.get(url)
        time.sleep(3)  # give the page scripts time to render
        # Run JS to obtain the whole rendered document.
        return browser.execute_script("return document.documentElement.outerHTML")
    finally:
        # Shut the driver down even when loading fails, so no PhantomJS
        # process is left behind.
        browser.quit()


# Get the account's article nodes
def parse_wx_articles_by_html(selenium_html):
    """Return the article nodes found in the rendered account page."""
    doc = pq(selenium_html)
    return doc('div[class="weui_media_box appmsg"]')


# Convert the collected articles to dicts
def switch_arctiles_to_list(articles):
    """Convert the article nodes to a list of dicts.

    Returns an empty list when ``articles`` is empty or None.
    """
    articles_list = []
    if articles:
        total = len(articles)
        for i, article in enumerate(articles.items(), 1):
            log(u'开始整合(%d/%d)' % (i, total))
            articles_list.append(parse_one_article(article))
    return articles_list


# Parse a single article
def parse_one_article(article):
    """Extract title, URL, summary, date and cover image of one article.

    Args:
        article: PyQuery node of one article entry.

    Returns:
        Dict with the keys title, url, summary, date and pic.
    """
    title = article('h4[class="weui_media_title"]').text().strip()
    # Title logging was disabled in the original: log(u'标题是： %s' % title)
    # 'hrefs' may be missing; fall back to '' so concatenation cannot fail.
    url = 'http://mp.weixin.qq.com' + (article('h4[class="weui_media_title"]').attr('hrefs') or '')
    log(u'地址为： %s' % url)
    summary = article('.weui_media_desc').text()
    log(u'文章简述： %s' % summary)
    date = article('.weui_media_extra_info').text().strip()
    log(u'发表时间为： %s' % date)
    pic = parse_cover_pic(article)
    return {
        'title': title,
        'url': url,
        'summary': summary,
        'date': date,
        'pic': pic,
    }


# Find the cover image and return its URL
def parse_cover_pic(article):
    """Return the cover image URL from the inline style, or ''."""
    # The style attribute may be absent; treat that as "no cover".
    style = article('.weui_media_hd').attr('style') or ''
    rs = re.findall(r'background-image:url\((.*?)\)', style)
    cover = rs[0] if rs else ''
    log(u'封面图片是：%s ' % cover)
    return cover


# Custom log function; mainly adds a timestamp
def log(msg):
    """Print ``msg`` prefixed with the current local time.

    The timestamp is formatted as ``%Y-%m-%d_%H-%M-%S``.
    """
    # The single-argument call form prints the same text on Python 2 and 3.
    print(u'%s: %s' % (time.strftime('%Y-%m-%d_%H-%M-%S'), msg))


# Verification check
def need_verify(selenium_html):
    """Return True when the page has a non-empty '#verify_change' element.

    The original author treats this as the sign that the target site has
    temporarily blocked the crawler and the run should be retried later.
    """
    return pq(selenium_html)('#verify_change').text() != ''


# Create the folder named after the account
def create_dir(keywords):
    """Create the folder ``keywords`` (including parents) if missing."""
    if not os.path.exists(keywords):
        os.makedirs(keywords)


# Crawler main routine
def run(keywords):
    """Crawl the account ``keywords`` and return its article titles.

    Returns:
        List of article titles; empty when no account page was found or the
        target site blocked the crawler.
    """
    # Step 0: create the folder named after the account.
    create_dir(keywords)
    # Sogou WeChat search entry URL.
    sogou_search_url = 'http://weixin.sogou.com/weixin?type=1&query=%s&ie=utf8&s_from=input&_sug_=n&_sug_type_=' % quote(keywords)
    # Step 1: query the Sogou WeChat search with the account name.
    log(u'开始获取，微信公众号英文名为：%s' % keywords)
    log(u'开始调用sougou搜索引擎')
    sougou_search_html = get_search_result_by_keywords(sogou_search_url)
    # Step 2: extract the account home page URL from the result page.
    log(u'获取sougou_search_html成功，开始抓取公众号对应的主页wx_url')
    wx_url = get_wx_url_by_sougou_search_html(sougou_search_html)
    if not wx_url:
        # No matching account: there is nothing to render.
        log(u'wx_url not found for %s' % keywords)
        return []
    log(u'获取wx_url成功，%s' % wx_url)
    # Step 3: render the JS-driven page with Selenium + PhantomJS.
    log(u'开始调用selenium渲染html')
    selenium_html = get_selenium_js_html(wx_url)
    # Step 4: check whether the target site blocked the crawler.
    if need_verify(selenium_html):
        log(u'爬虫被目标网站封锁，请稍后再试')
        # Return an empty list so callers can still join or iterate it.
        return []
    # Step 5: parse the article list out of the rendered HTML.
    log(u'调用selenium渲染html完成，开始解析公众号文章')
    articles = parse_wx_articles_by_html(selenium_html)
    log(u'抓取到微信文章%d篇' % len(articles))
    # Step 6: convert the articles to a list of dicts.
    log(u'开始整合微信文章数据为字典')
    articles_list = switch_arctiles_to_list(articles)
    return [content['title'] for content in articles_list]

(2)weixin_wordcloud.py

# coding: utf8
from wordcloud import WordCloud
import weixin_spider
import matplotlib.pyplot as plt
import time

if __name__ == '__main__':
    print(u'''******************************************************************   Welcome to Spider of weixin gongzhonghao for wordcloud   ******************************************************************''')
    gongzhonghao = raw_input(u'input weixin gongzhonghao:')
    if not gongzhonghao:
        gongzhonghao = 'python'
    # run() may yield nothing when the crawl is blocked or finds no account.
    text = " ".join(weixin_spider.run(gongzhonghao) or [])
    print(text)
    if not text.strip():
        # There are no titles to draw, so skip the word cloud.
        print('no article titles fetched, nothing to draw')
    else:
        # Build the word cloud from the collected titles.
        wordcloud = WordCloud(random_state=1, font_path=r'C:/Users/Windows/fonts/simkai.ttf').generate(text)
        print('start drawing.....')
        plt.figure()
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.show()

转载出处: http://blog.csdn.net/column/details/15501.html

python爬虫爬出新高度相关推荐

python爬虫爬猎聘网获取多条职责描述中有Linux需求的招聘信息
python爬虫爬猎聘网获取多条职责描述中有Linux需求的招聘信息下列是我爬虫的作业摘要随着现代化社会的飞速发展,网络上巨大信息量的获取给用户带来了许多的麻烦.由于工作和生活节奏的需求,人们 ...
在当当买了python怎么下载源代码-python爬虫爬取当当网
[实例简介]python爬虫爬取当当网 [实例截图] [核心代码] ''' Function: 当当网图书爬虫 Author: Charles 微信公众号: Charles的皮卡丘 ''' impor ...
python爬虫代码实例-Python爬虫爬取百度搜索内容代码实例
这篇文章主要介绍了Python爬虫爬取百度搜索内容代码实例,文中通过示例代码介绍的非常详细,对大家的学习或者工作具有一定的参考学习价值,需要的朋友可以参考下搜索引擎用的很频繁,现在利用Python爬 ...
python爬虫数据分析可以做什么-python爬虫爬取的数据可以做什么
在Python中连接到多播服务器问题,怎么解决你把redirect关闭就可以了.在send时,加上参数allow_redirects=False 通常每个浏览器都会设置redirect的次数.如果re ...
python爬虫爬取csdn博客专家所有博客内容
python爬虫爬取csdn博客专家所有博客内容: 全部过程采取自动识别与抓取,抓取结果是将一个博主的所有文章存放在以其名字命名的文件内,代码如下 #coding:utf-8import urlli ...
python网易云_用python爬虫爬取网易云音乐
标签: 使用python爬虫爬取网易云音乐需要使用的模块只需要requests模块和os模块即可开始工作先去网易云音乐网页版找一下你想要听的歌曲点击进去.按键盘F12打开网页调试工具,点击Ne ...
使用Python爬虫爬取网络美女图片
代码地址如下: http://www.demodashi.com/demo/13500.html 准备工作安装python3.6 略安装requests库(用于请求静态页面) pip instal ...
python如何爬取网站所有目录_用python爬虫爬取网站的章节目录及其网址
认识爬虫网络爬虫(又被称为网页蜘蛛,网络机器人,在FOAF社区中间,更经常的称为网页追逐者),是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本.另外一些不常使用的名字还有蚂蚁.自动索引.模拟 ...
运用Python爬虫爬取一个美女网址，爬取美女图
运用Python爬虫爬取一个美女网址,爬取美女图要运用到的python技术: 导入库 1.request 发送请求,从服务器获取数据 2.BeautifulSoup 用来解析整个网页的源代码 imp ...

python爬虫爬出新高度

python爬虫爬出新高度相关推荐

最新文章

热门文章