python爬取js加载的数据_Python爬虫:爬取JS加载数据的网页

比如简书:

Paste_Image.png

我们来写个程序，爬取简书网站随便一个作者的所有文章，再对其所有文章进行分词统计

程序运行统计的结果见文章:

我统计了彭小六简书360篇文章中使用的词语

需要的Python包

| 包名 | 作用 |
| --- | --- |
| selenium | 用于和phantomjs合作模拟浏览器访问网页 |
| lxml | 用于对html页面的解析，提取数据 |
| jieba | 用于对文章正文分词 |
| tld | 解析url，比如提取domain |

还需要下载 phantomjs，selenium配合phantomjs的使用代码中有体现

下载地址: http://phantomjs.org/

下面代码中，由于使用文件保存数据，而没有使用数据库保存数据，所以代码量比较多，其中主要代码并不多

#### 直接上代码

# -*-coding:utf-8-*-

import json

import os, sys

from random import randint

from collections import Counter

import jieba

from lxml import etree

from selenium import webdriver

import time

from tld import get_tld

# Absolute directory that contains this script file.
path = os.path.dirname(os.path.abspath(__file__))

class Spider():
    """Crawl all articles of one Jianshu author, parse them and count words.

    The work is split into four stages that communicate through files on
    disk (no database is used), so each stage can be re-run on its own:

    1. ``post_list_page`` - render the author's list page and save the
       article URLs.
    2. ``posts_html``     - download the rendered HTML of every article.
    3. ``page_parsing``   - extract fields from the HTML into JSON files.
    4. ``statistics``     - segment text with jieba and write word tables.

    The code targets Python 3 (text files are opened with an explicit
    UTF-8 encoding).
    """

    def __init__(self, start_url):
        """Derive every input/output location from ``start_url``.

        Data is stored in plain files below the script directory; swap the
        file handling for a database (a NoSQL store is convenient) if needed.

        :param start_url: the author's article-list page, for example
            http://www.jianshu.com/u/65fd4e5d930d
        """
        self.start_url = start_url
        res = get_tld(self.start_url, as_object=True, fix_protocol=True)
        # tld >= 0.9 exposes the registered domain as ``fld`` while ``tld``
        # is only the suffix; older releases kept the registered domain in
        # ``tld``.  Prefer ``fld`` when it exists so both give "jianshu.com".
        registered = getattr(res, "fld", None) or res.tld
        # Drop an empty subdomain instead of producing a leading dot.
        self.domain = ".".join(
            part for part in (res.subdomain, registered) if part
        )
        # Tolerate a trailing slash in the list-page URL.
        self.user_id = self.start_url.rstrip("/").split("/")[-1]

        # Rendered HTML of the author's article-list page
        post_list_dir = '{}/post-list'.format(path)
        self.post_lists_html = '{}/post_list_{}.html'.format(
            post_list_dir, self.user_id)
        # One article URL per line
        self.post_lists_urls = '{}/urls_{}.dat'.format(
            post_list_dir, self.user_id)
        # Raw HTML of every article
        self.posts_html_dir = '{}/post-html/{}'.format(path, self.user_id)
        # Parsed article content (JSON)
        self.posts_data_dir = '{}/post-data/{}'.format(path, self.user_id)
        # Word-statistics tables
        self.result_dir = '{}/result'.format(path)
        self.executable_path = (
            '{}/phantomjs-2.1.1-linux-x86_64/bin/phantomjs'.format(path))

        for directory in (self.posts_html_dir, self.posts_data_dir,
                          post_list_dir, self.result_dir):
            if not os.path.exists(directory):
                os.makedirs(directory)

        # Free proxy addresses picked casually from the web
        self.ips = ['61.167.222.17:808', '58.212.121.72:8998',
                    '111.1.3.36:8000', '125.117.133.74:9000']

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _scroll_rounds(post_total, per_scroll=9):
        """Return how many scroll-downs are needed for ``post_total`` posts.

        Uses integer ceiling division so the result is always a finite,
        non-negative int (true division gave a float that never hit 0).
        ``per_scroll`` defaults to 9, the divisor the original code used.
        """
        if post_total <= 0:
            return 0
        return (post_total + per_scroll - 1) // per_scroll

    @staticmethod
    def _post_id_from_filename(file_name):
        """Extract the post id from ``<title>_<post_id>.html``.

        ``str.strip(".html")`` removes *characters*, not a suffix, and could
        eat leading/trailing h, t, m, l of the id; ``splitext`` cannot.
        """
        stem = os.path.splitext(os.path.basename(file_name))[0]
        return stem.rsplit("_", 1)[-1]

    @staticmethod
    def _accumulate(counter, totals):
        """Merge one document's word counts into ``totals``.

        Words shorter than two characters are ignored.  ``totals`` maps a
        word to ``{"cnt": occurrences, "post_cnt": documents containing it}``
        and is updated in place.

        :return: number of distinct counted words in this document
        """
        distinct = 0
        # most_common() keeps the original "highest count first" visiting
        # order, which decides the order of ties in the final tables.
        for word, cnt in counter.most_common():
            if len(word) < 2:
                continue
            stat = totals.setdefault(word, {"cnt": 0, "post_cnt": 0})
            stat["cnt"] += cnt
            stat["post_cnt"] += 1
            distinct += 1
        return distinct

    def _random_proxy(self):
        """Pick one entry of ``self.ips`` at random."""
        return self.ips[randint(0, len(self.ips) - 1)]

    def _new_driver(self, page_load_timeout):
        """Start a PhantomJS driver with the given page-load timeout."""
        driver = webdriver.PhantomJS(executable_path=self.executable_path)
        driver.set_page_load_timeout(page_load_timeout)
        driver.maximize_window()
        return driver

    def _write_word_table(self, stem, header, totals, primary, secondary):
        """Write ``totals`` sorted by ``primary`` (descending) to the result dir.

        Each row is the word followed by its ``primary`` and ``secondary``
        statistic; the file is named ``<stem>_<user_id>.dat``.
        """
        ranked = sorted(totals.items(),
                        key=lambda item: item[1][primary], reverse=True)
        target = "{}/{}_{}.dat".format(self.result_dir, stem, self.user_id)
        with open(target, "w", encoding="utf-8") as wf:
            wf.write(header)
            for word, stat in ranked:
                wf.write("| {} | {} | {}|\n".format(
                    word, stat[primary], stat[secondary]))

    # ------------------------------------------------------------------
    # pipeline stages
    # ------------------------------------------------------------------
    def post_list_page(self):
        """Render the author's list page and save its HTML and article URLs.

        Exits the process (status 1) when the article count cannot be read
        from the page.
        """
        driver = self._new_driver(page_load_timeout=30)
        try:
            # NOTE(review): this is a plain attribute assignment on the
            # driver object; it does not look like Selenium reads it, so
            # the proxy is presumably not applied - confirm.
            driver.http_proxy = self._random_proxy()
            driver.get(self.start_url)

            # Total number of articles shown in the author's header
            sel = etree.HTML(driver.page_source)
            r = sel.xpath(
                "//div[@class='main-top']//div[@class='info']//li[3]//p//text()")
            if not r:
                print("[Error] 提取文章总书的xpath不正确")
                sys.exit(1)
            crawl_post_n = int(r[0])

            # Keep scrolling to the bottom so the page's JS loads more posts
            for _ in range(self._scroll_rounds(crawl_post_n)):
                time.sleep(randint(2, 5))
                driver.execute_script("var q=document.body.scrollTop=100000")

            page_source = driver.page_source
        finally:
            # Always terminate the PhantomJS process, even on sys.exit()
            driver.quit()

        # Save the fully expanded list page
        with open(self.post_lists_html, "w", encoding="utf-8") as of:
            of.write(page_source)

        # Save every article link, one per line
        sel = etree.HTML(page_source)
        results = sel.xpath(
            "//div[@id='list-container']//li//a[@class='title']/@href")
        with open(self.post_lists_urls, "w", encoding="utf-8") as of:
            for result in results:
                of.write("http://{}{}".format(self.domain, result.strip()))
                of.write("\n")

    def posts_html(self):
        """Download the rendered HTML of every URL saved by ``post_list_page``.

        Each page is stored as ``<title>_<post_id>.html``.  A URL whose load
        fails is reported and skipped, so the previously loaded page is
        never saved under the wrong post id.
        """
        with open(self.post_lists_urls, encoding="utf-8") as of:
            urls = [line.strip() for line in of if line.strip()]

        driver = self._new_driver(page_load_timeout=10)
        try:
            for url in urls:
                ip = self._random_proxy()
                # NOTE(review): see post_list_page - presumably a no-op.
                driver.http_proxy = ip
                print("代理ip:{}".format(ip))
                print("网页:{}".format(url))
                try:
                    driver.get(url)
                except Exception:
                    print("Error:{}".format(url))
                    loaded = False
                else:
                    loaded = True

                if loaded:
                    post_id = url.split("/")[-1]
                    # A "/" in the title would be read as a directory
                    title = driver.title.replace("/", "-")
                    target = "{}/{}_{}.html".format(
                        self.posts_html_dir, title, post_id)
                    with open(target, "w", encoding="utf-8") as of:
                        of.write(driver.page_source)

                # Random pause between requests
                time.sleep(randint(1, 5))
        finally:
            driver.quit()

    def page_parsing(self):
        """Parse every saved article HTML file into a JSON document.

        The JSON is written to ``<posts_data_dir>/<post_id>.json``.  Fields
        whose XPath matches nothing are stored as ``None``; ``word_num``
        falls back to 0.
        """
        # Rules where only the first match is kept
        xpath_rule_0 = {
            # author name
            "author": "//div[@class='author']//span[@class='name']//text()",
            # author tag
            "author_tag": "//div[@class='author']//span[@class='tag']//text()",
            # publish time
            "postdate": "//div[@class='author']//span[@class='publish-time']//text()",
            # word count
            "word_num": "//div[@class='author']//span[@class='wordage']//text()",
            # notebook the article belongs to
            "notebook": "//div[@class='show-foot']//a[@class='notebook']/span/text()",
            # article title
            "title": "//div[@class='article']/h1[@class='title']//text()",
        }
        # Rules where all matches are joined into one string
        xpath_rule_all_tostr = {
            # article body
            "content": "//div[@class='show-content']//text()",
        }
        # Rules where all matches are kept as a list
        xpath_rule_all = {
            # collections that include the article
            "collection": "//div[@class='include-collection']//a[@class='item']//text()",
        }

        for name in os.listdir(self.posts_html_dir):
            html_file = "{}/{}".format(self.posts_html_dir, name)
            if not os.path.isfile(html_file):
                continue
            with open(html_file, encoding="utf-8") as of:
                sel = etree.HTML(of.read())

            post_id = self._post_id_from_filename(html_file)
            doc = {'url': 'http://{}/p/{}'.format(self.domain, post_id)}

            for k, rule in xpath_rule_0.items():
                results = sel.xpath(rule)
                doc[k] = results[0] if results else None

            for k, rule in xpath_rule_all_tostr.items():
                results = sel.xpath(rule)
                if results:
                    # Whitespace-only text nodes are dropped
                    doc[k] = "".join(r for r in results if r.strip())
                else:
                    doc[k] = None

            for k, rule in xpath_rule_all.items():
                results = sel.xpath(rule)
                doc[k] = results if results else None

            if doc["word_num"]:
                # Remove the leading "字数" label characters, then blanks
                doc["word_num"] = int(doc["word_num"].strip('字数').strip())
            else:
                doc["word_num"] = 0

            target = "{}/{}.json".format(self.posts_data_dir, post_id)
            with open(target, "w", encoding="utf-8") as of:
                of.write(json.dumps(doc))

    def statistics(self):
        """Segment every parsed article and write word-frequency tables.

        Five ``.dat`` files are written to the result directory: the number
        of distinct words per article, and for both body and title the
        words ranked by total occurrences and by number of articles using
        them.  Only words of two or more characters are counted.
        """
        word_sum = {}            # totals over all article bodies
        title_word_sum = {}      # totals over all titles
        post_word_cnt_list = []  # distinct-word count of each article

        list_dir = os.listdir(self.posts_data_dir)
        for name in list_dir:
            data_file = "{}/{}".format(self.posts_data_dir, name)
            if not os.path.isfile(data_file):
                continue
            with open(data_file, encoding="utf-8") as of:
                doc = json.load(of)

            # Accurate mode is jieba's default; cut_all=False is explicit.
            # A missing body/title was stored as None - treat it as empty.
            words = jieba.cut(doc["content"] or "", cut_all=False)
            word_cnt = self._accumulate(Counter(words), word_sum)
            post_word_cnt_list.append(
                (word_cnt, doc["postdate"], doc["title"], doc["url"]))

            title_words = jieba.cut(doc["title"] or "", cut_all=False)
            self._accumulate(Counter(title_words), title_word_sum)

        post_word_cnt_list = sorted(
            post_word_cnt_list, key=lambda d: d[0], reverse=True)
        target = "{}/content_statis_{}.dat".format(
            self.result_dir, self.user_id)
        with open(target, "w", encoding="utf-8") as wf:
            wf.write("| 词语 | 发布日期 | 标题 | 链接 |\n")
            for pw in post_word_cnt_list:
                wf.write("|　{} | {} | {}| {}|\n".format(
                    pw[0], pw[1], pw[2], pw[3]))

        by_use_header = "| 分词 | 使用次数 | 使用的文章数量|\n"
        by_post_header = "| 分词 | 使用的文章数量 | 使用次数 |\n"

        # Body words ranked by occurrences, then by number of articles
        self._write_word_table("content_statis_sum_use-num", by_use_header,
                               word_sum, "cnt", "post_cnt")
        self._write_word_table("content_statis_sum_post-num", by_post_header,
                               word_sum, "post_cnt", "cnt")
        # Title words ranked by occurrences, then by number of articles
        self._write_word_table("title_statis_sum_use-num", by_use_header,
                               title_word_sum, "cnt", "post_cnt")
        self._write_word_table("title_statis_sum_post-num", by_post_header,
                               title_word_sum, "post_cnt", "cnt")

        print("一共统计文章：{}　篇".format(len(list_dir)))
        print("所有正文－使用了２字及以上词语：{}　个".format(len(word_sum)))
        print("所有标题－使用了２字及以上词语：{}　个".format(len(title_word_sum)))

def _run_pipeline():
    """Run the crawl for the sample author; only the first stage is enabled."""
    spider = Spider(start_url="http://www.jianshu.com/u/65fd4e5d930d")

    print("获取作者文章列表页面...")
    spider.post_list_page()

    # The remaining stages only announce themselves; their calls are
    # switched off.  Enable them one at a time once the previous stage
    # has produced its files.
    print("获取作者所有文章页面...")
    # spider.posts_html()

    print("解析作者所有文章页面...")
    # spider.page_parsing()

    print("简单统计分析文章词汇...")
    # spider.statistics()


if __name__ == '__main__':
    _run_pipeline()

python爬取js加载的数据_Python爬虫:爬取JS加载数据的网页相关推荐

python爬表格数据_python爬虫,爬取表格数据
python爬虫,爬取表格数据 python爬虫,爬取表格数据 python爬虫,爬取全国空气质量指数编程环境:Jupyter Notebook 所要爬取的网页数据内容如下图 python爬虫代码及 ...
python爬虫爬取股票软件数据_Python爬虫抓取东方财富网股票数据并实现MySQL数据库存储（转载）...
完整代码实际上,整个事情完成了两个相对独立的过程:1.爬虫获取网页股票数据并保存到本地文件:2.将本地文件数据储存到MySQL数据库.并没有直接的考虑把从网页上抓取到的数据实时(或者通过一个临时文件 ...
python抓取数据库数据_Python爬虫抓取东方财富网股票数据并实现MySQL数据库存储...
Python爬虫可以说是好玩又好用了.现想利用Python爬取网页股票数据保存到本地csv数据文件中,同时想把股票数据保存到MySQL数据库中.需求有了,剩下的就是实现了. 在开始之前,保证已经安装好 ...
python爬取豆瓣电影top250的代码_Python爬虫——爬取豆瓣电影Top250代码实例
利用python爬取豆瓣电影Top250的相关信息,包括电影详情链接,图片链接,影片中文名,影片外国名,评分,评价数,概况,导演,主演,年份,地区,类别这12项内容,然后将爬取的信息写入Excel表中 ...
python爬取电影网站存储于数据库_python爬虫猫眼电影和电影天堂数据csv和mysql存储过程解析...
字符串常用方法 # 去掉左右空格 'hello world'.strip() # 'hello world' # 按指定字符切割 'hello world'.split(' ') # ['hello' ...
python解析网页数据_python爬虫——爬取网页数据和解析数据
1.网络爬虫的基本概念网络爬虫(又称网络蜘蛛,机器人),就是模拟客户端发送网络请求,接收请求响应,一种按照一定的规则,自动地抓取互联网信息的程序. 只要浏览器能够做的事情,原则上,爬虫都能够做到. ...
python爬取flash数据_python爬虫: 爬取flash播放页面的信息
我们通过查看知道flash类型的网页采取文件格式是amf类型的 AMF(Action Message Format) 是Flash与服务端通信的一种常见的二进制编码模式,其传输效率高,可以在HTTP层 ...
python爬取贴吧数据_Python爬虫——抓取贴吧帖子
原博文 2016-11-13 23:13 − 抓取百度贴吧帖子按照这个学习教程,一步一步写出来,中间遇到很多的问题,一一列举首先, 获得标题和贴子总数 # -*- coding:utf-8 ...
python 搜索网页数据_python爬虫爬取网页所有数据
技术文档主体内容:可以认为是页面最想表达的内容总和.对于内容详情页来说,主体内容指从标题开始至正文内容结束,翻页区域也被视为主体内容,文章后的评论.分享.推荐等不视为主体内容. 首屏:用户点击搜索结 ...

python爬取js加载的数据_Python爬虫:爬取JS加载数据的网页

python爬取js加载的数据_Python爬虫:爬取JS加载数据的网页相关推荐

最新文章

热门文章