Sodino

文章目录

  1. 1. app.js
  2. 2. img-spider.js
  3. 3. img.gallery.js
  4. 4. ifengImgs.js
  5. 5. ifengPictures.js
  6. 6. qqImgs.js
  7. 7. imgs.html

先上效果图:

图片上部分为待解析的网页新闻链接,支持一次输入多个.
图片下部分为解析的进度日志打印。

点击“Commit”之后,对比效果图如下:左边为腾讯新闻原网页,右边为抓取后的整合效果。

工程结构:

文件名 描述
app.js 程序启动
img-spider.js 爬虫爬取管理
ifengImgs.js 爬取iFeng下game/fashion的实现
ifengPictures.js 爬取iFeng下game高清图的实现
qqImgs.js 爬取腾讯新闻图的实现
img.gallery.js 爬取图片的汇总
imgs.html 提交爬取链接的html界面

应用到的知识点:

  • express:搭建Web服务
  • cheerio:类似jQuery的快速解析网页工具
  • iconv-lite:解决中文乱码问题
  • 正则表达式:网址匹配、内容匹配/过滤
  • Charles:抓包工具

更多细节请看源码吧……

GitHub源码链接:Sodino#ImgSpider


app.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
var fs = require('fs');
var express = require('express');
var img_spider = require('./img-spider.js');
var app = express();
// GET /imgs.html: answers with a UTF-8 charset <head> fragment and then
// streams the submission form (./imgs.html) from disk into the response.
app.get('/imgs.html', function (req, resp) {
  var headers = {'Content-Type' : 'text/html'};
  resp.writeHead(200, headers);
  resp.write('<head><meta charset="utf-8"/></head>');
  fs.createReadStream('./imgs.html').pipe(resp);
});
/**
 * POST /imgs.html: reads the url-encoded form field `txtUrls` (page URLs
 * separated by whitespace), crawls every URL with img_spider and answers with
 * a single HTML page that lists each gallery's title, picture descriptions and
 * big pictures. Any error is rendered as the page body instead.
 */
app.post('/imgs.html', (req, resp) => {
  // Scraped titles/descriptions and user-supplied URLs are untrusted text, so
  // they are escaped before being placed into the generated markup.
  var escapeHtml = (value) => String(value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');

  // The body may arrive in several chunks; collect them and only start
  // crawling once the request has been received completely.
  var arrChunk = [];
  req.on('data', (chunk) => {
    arrChunk.push(chunk);
  });
  req.on('end', () => {
    // The spider callback can fire more than once (e.g. once per unsupported
    // URL); only the first outcome may be written to the response.
    var responded = false;
    var sendPage = (writeBody) => {
      if (responded) {
        return;
      }
      responded = true;
      resp.writeHead(200, {'Content-Type' : 'text/html'});
      resp.write('<head><meta charset="utf-8"/></head>');
      resp.write('<body>');
      writeBody();
      resp.write('</body>');
      resp.end();
    };
    var sendError = (err) => {
      sendPage(() => {
        resp.write(escapeHtml(err.toString()));
      });
    };

    var content = Buffer.concat(arrChunk).toString().replace('txtUrls=', '');
    try {
      // In url-encoded form data '+' stands for a space; the rest is
      // percent-encoded UTF-8.
      content = decodeURIComponent(content.replace(/\+/g, ' '));
    } catch (e) {
      sendError(e);
      return;
    }
    // Drop the empty entries produced by leading/trailing line breaks.
    var arrUrl = content.split(/\s+/).filter((strUrl) => strUrl.length > 0);

    try {
      var imgSpider = new img_spider();
      imgSpider.spider(arrUrl, (err, arrImgGallery) => {
        if (err) {
          sendError(err);
          return;
        }
        sendPage(() => {
          arrImgGallery.forEach((gallery) => {
            resp.write('<p>============================================</p>');
            resp.write('<p>' + escapeHtml(gallery.title) + '</p>');
            gallery.arrImgs.forEach((ele, idx) => {
              resp.write('<p>idx=' + idx + "</p>");
              resp.write('<p>' + escapeHtml(ele.desc) + '</p>');
              resp.write('<p><img id="bigPic" src="' + escapeHtml(ele.imgBig) + '" style="opacity: 1;"></img></p>');
              resp.write('<p>------------------------</p>');
            });
          });
        });
      });
    } catch (e) {
      // spider() throws synchronously, e.g. when no URL was submitted.
      sendError(e);
    }
  });
});
// Start the HTTP server on port 1024 and print the address of the URL form.
app.listen(1024);
console.log('server running on http://localhost:1024/imgs.html');

img-spider.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
var ifengImgs = require('./ifengImgs.js');
var ifengPictures = require('./ifengPictures.js');
var qqImgs = require('./qqImgs.js');
/**
 * Crawl manager state: the URLs handed to a crawler, the galleries collected
 * so far and the callback that receives the final result.
 */
function ImgSpider() {
  Object.assign(this, {
    arrUrls: [],
    arrImgGallery: [],
    callback: null
  });
}
/**
 * Starts one crawler per supported URL.
 *
 * Each URL is checked against the crawlers' URL patterns in a fixed order
 * (ifengImgs, ifengPictures, qqImgs); the first match handles it. A non-blank
 * URL matching none of them is reported through `callback(err, null)`; blank
 * entries are ignored.
 *
 * @param {string[]} arrUrl - page URLs to crawl
 * @param {function} callback - receives (err, arrImgGallery)
 * @throws {Error} when arrUrl is not an array or is empty
 */
ImgSpider.prototype.spider = function(arrUrl, callback){
  if (!Array.isArray(arrUrl)) {
    throw new Error("arrUrl isn't a array!");
  }
  if (arrUrl.length === 0) {
    throw new Error("arrUrl is empty.");
  }
  this.callback = callback;

  var arrCrawler = [ifengImgs, ifengPictures, qqImgs];
  arrUrl.forEach((strUrl) => {
    var crawler = arrCrawler.find((candidate) => candidate.prototype.RegExp.test(strUrl));
    if (crawler) {
      runSpider(strUrl, crawler, this);
      return;
    }
    var trimmed = strUrl.trim();
    if (trimmed.length === 0) {
      // blank entry, nothing to crawl
      return;
    }
    callback(new Error("Can't support this url:[" + trimmed + ']'), null);
  });
};
// Resets the manager: forgets all URLs, collected galleries and the callback.
ImgSpider.prototype.clean = function () {
  Object.assign(this, {
    arrUrls: [],
    arrImgGallery: [],
    callback: null
  });
};
/**
 * Crawls one URL with the given crawler and records the result on the manager.
 *
 * When every registered URL has produced a gallery, the manager's callback is
 * invoked with (null, arrImgGallery). A crawler error is logged and forwarded
 * to the manager's callback exactly once, so the caller is not left waiting
 * forever.
 *
 * @param {string} url - page URL to crawl
 * @param {function} constructor - crawler constructor; instances expose spider(url, cb)
 * @param {Object} imgSpider - manager holding arrUrls, arrImgGallery and callback
 */
function runSpider(url, constructor, imgSpider) {
  imgSpider.arrUrls.push(url);
  // Declared locally: the former bare assignment created an implicit global
  // (and is a ReferenceError in strict mode).
  var spider = new constructor();
  spider.spider(url, (err, imgGallery) => {
    if (err) {
      console.log('error');
      console.log(err);
      var onError = imgSpider.callback;
      if (typeof onError === 'function') {
        // Clear the callback first so later failures cannot report twice.
        imgSpider.callback = null;
        onError(err, null);
      }
      return;
    }
    console.log('Done:', imgGallery.url, imgGallery.title);
    imgSpider.arrImgGallery.push(imgGallery);
    if (imgSpider.arrImgGallery.length === imgSpider.arrUrls.length) {
      if (typeof imgSpider.callback === 'function') {
        imgSpider.callback(null, imgSpider.arrImgGallery);
      }
    }
  });
}
module.exports = ImgSpider;

img.gallery.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/**
 * One picture of a gallery: big and small image addresses, its description
 * and its position in the gallery.
 */
function Img(idx, imgBig, imgSmall, desc) {
  Object.assign(this, {
    imgBig: imgBig,
    imgSmall: imgSmall,
    desc: desc,
    index: idx
  });
}

/**
 * The pictures collected from one page.
 * @param {string} url - address of the crawled page
 */
function ImgGallery(url) {
  this.url = url;
  this.title = '';
  this.arrImgs = [];
}

/**
 * Appends a picture to the gallery.
 * @returns {number} the number of pictures after the insertion
 */
ImgGallery.prototype.push = function (idx, imgBig, imgSmall, desc) {
  return this.arrImgs.push(new Img(idx, imgBig, imgSmall, desc));
};
module.exports = ImgGallery;

ifengImgs.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
var url = 'http://games.ifeng.com/a/20160504/41603363_0.shtml';
var cheerio = require("cheerio");
var http = require("http");
var iconv = require('iconv-lite');
var img_gallery = require('./img.gallery.js');
// Crawler for iFeng games/fashion picture articles.
var SpiderIfengImgs = function() {
};
// Pages this crawler accepts, for example:
// http://games.ifeng.com/a/20160504/41603363_0.shtml
// http://fashion.ifeng.com/a/20160519/40162307_0.shtml#p=1
// The two sub-domains are grouped so the alternation applies to the host name
// only (ungrouped, every URL containing "http://games" matched), and literal
// dots are escaped.
SpiderIfengImgs.prototype.RegExp = /http:\/\/(games|fashion)\.ifeng\.com\/a\/\d{8}\/\d+_\d+\.shtml/;
/**
 * Downloads a page and extracts its picture list.
 *
 * The list is read from the JavaScript literal that follows
 * `var G_listdata= ` in the page source (up to the next `</script>`); the
 * literal is rewritten into JSON by quoting its known keys and then parsed.
 * Request failures, a missing list and parse failures are reported through
 * the callback instead of crashing the process.
 *
 * @param {string} url - address of the page
 * @param {function} callback - called once with (null, imgGallery) or (err, null)
 */
SpiderIfengImgs.prototype.spider = function(url, callback){
  var finished = false;
  // Reports the outcome at most once, whichever event arrives first.
  var finish = function(err, imgGallery) {
    if (finished) {
      return;
    }
    finished = true;
    if (Object.prototype.toString.call(callback)=== '[object Function]') {
      callback(err, imgGallery);
    }
  };
  http.get(url, function(res){
    var arrBuf = [];
    var bufLength = 0;
    res.on("data", function(chunk){
      arrBuf.push(chunk);
      bufLength += chunk.length;
    })
    .on("error", function(err){
      finish(err, null);
    })
    .on("end", function(){
      var imgGallery = new img_gallery(url);
      try {
        var chunkAll = Buffer.concat(arrBuf, bufLength);
        var html = iconv.decode(chunkAll,'utf-8');
        var $ = cheerio.load(html);
        imgGallery.title = $("title").text();

        var strStart = 'var G_listdata= ';
        var strEnd = '</script>';
        var idxStart = html.indexOf(strStart);
        var idxEnd = idxStart < 0 ? -1 : html.indexOf(strEnd, strStart.length + idxStart);
        if (idxEnd < 0) {
          throw new Error("Can't find G_listdata in page:[" + url + ']');
        }
        var jsListData = html.slice(idxStart + strStart.length, idxEnd);
        // Turn the JavaScript literal into JSON: double quotes and quoted keys.
        jsListData = jsListData.replace(/'/g, "\"")
          .replace(/title/g, '\"title\"')
          .replace(/big_img/g, '\"big_img\"')
          .replace(/originalimg/g, '\"originalimg\"')
          .replace(/picwidth/g, '\"picwidth\"')
          .replace(/picheight/g, '\"picheight\"')
          .replace(/morelink/g, '\"morelink\"')
          .replace(/img:/g, '\"img\":')
          .replace('];', ']')
          ;
        var objJson = JSON.parse(jsListData);
        objJson.forEach((element, index) => {
          imgGallery.push(index, element.big_img, element.img, element.title);
        });
      } catch (err) {
        finish(err, null);
        return;
      }
      finish(null, imgGallery);
    });
  }).on("error", function(err){
    // connection-level failure (DNS, refused connection, ...)
    finish(err, null);
  });
};
module.exports = SpiderIfengImgs;

ifengPictures.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
var url = 'http://games.ifeng.com/picture/gaoqing/detail_2015_09/11/41081883_0.shtml';
var cheerio = require("cheerio");
var http = require("http");
var iconv = require('iconv-lite');
var img_gallery = require('./img.gallery.js');
// Crawler for iFeng games high-definition picture pages.
var SpiderIfengPictures = function(){
};
// Accepted page addresses; dots are escaped so they only match a literal '.'.
SpiderIfengPictures.prototype.RegExp = /http:\/\/games\.ifeng\.com\/picture\/gaoqing\/detail_\d{4}_\d{2}\/\d{2}\/\d+_\d+\.shtml/;
/**
 * Downloads a page and extracts its picture list.
 *
 * The list is read from the `_listdata[n] = {...};` assignments found between
 * `_listdata[0] = ` and `new ifeng.Gallery` in the page source; they are
 * rewritten into one JSON array and parsed. Request failures, a missing list
 * and parse failures are reported through the callback instead of crashing
 * the process.
 *
 * @param {string} strUrl - address of the page
 * @param {function} callback - called once with (null, imgGallery) or (err, null)
 */
SpiderIfengPictures.prototype.spider = function (strUrl, callback) {
  var finished = false;
  // Reports the outcome at most once, whichever event arrives first.
  var finish = function(err, imgGallery) {
    if (finished) {
      return;
    }
    finished = true;
    if (Object.prototype.toString.call(callback)=== '[object Function]') {
      callback(err, imgGallery);
    }
  };
  http.get(strUrl, function(res){
    var arrBuf = [];
    var bufLength = 0;
    res.on("data", function(chunk){
      arrBuf.push(chunk);
      bufLength += chunk.length;
    })
    .on("error", function(err){
      finish(err, null);
    })
    .on("end", function(){
      var imgGallery = new img_gallery(strUrl);
      try {
        var chunkAll = Buffer.concat(arrBuf, bufLength);
        var html = iconv.decode(chunkAll,'utf-8');
        var $ = cheerio.load(html);
        imgGallery.title = $("title").text();

        var strStart = '_listdata[0] = ';
        var strEnd = 'new ifeng.Gallery';
        var idxStart = html.indexOf(strStart);
        var idxEnd = idxStart < 0 ? -1 : html.indexOf(strEnd, strStart.length + idxStart);
        if (idxEnd < 0) {
          throw new Error("Can't find _listdata in page:[" + strUrl + ']');
        }
        var jsListData = html.slice(idxStart + strStart.length, idxEnd);
        // Turn the assignments into JSON: double quotes, quoted keys, and the
        // `};_listdata[n] = ` separators replaced by commas.
        jsListData = jsListData.replace(/'/g, "\"")
          .replace(/title/g, '\"title\"')
          .replace(/morelink/g, '\"morelink\"')
          .replace(/picwidth/g, '\"picwidth\"')
          .replace(/picheight/g, '\"picheight\"')
          .replace(/listimg/g, '\"listimg\"')
          .replace(/timg:/g, '\"timg\":')
          .replace(/img:/g, '\"img\":')
          .replace(/\};_listdata\[\d*\] = /g, '},')
          .replace('\};', '}')
          ;
        jsListData = '[' + jsListData + ']';
        var objJson = JSON.parse(jsListData);
        objJson.forEach((element, index) => {
          imgGallery.push(index, element.timg, element.img, element.title);
        });
      } catch (err) {
        finish(err, null);
        return;
      }
      finish(null, imgGallery);
    });
  }).on("error", function(err){
    // connection-level failure (DNS, refused connection, ...)
    finish(err, null);
  });
};
module.exports = SpiderIfengPictures;

qqImgs.js

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
var img_gallery = require('./img.gallery.js');
var http = require("http");
var iconv = require('iconv-lite');
var cheerio = require("cheerio");
var url = 'http://news.qq.com/a/20160512/009639.htm';
var url = 'http://news.qq.com/a/20160512/009639.hdBigPic.js';
/**
 * Crawler for Tencent (QQ) news picture pages. The title and the picture list
 * are fetched separately; both are kept here until send2callback can deliver
 * them together.
 */
var SpiderQQImgs = function() {
  this.title = null;
  this.imgGallery = null;
  this.callback = null;
};
// Accepted page addresses; dots are escaped so they only match a literal '.'.
SpiderQQImgs.prototype.RegExp = /http:\/\/news\.qq\.com\/a\/\d{8}\/\d+\.htm/;
// Delivers the gallery to the callback once a non-empty title, the gallery
// and a callback function are all available; otherwise it does nothing.
SpiderQQImgs.prototype.send2callback = function() {
  var title = this.title;
  var hasTitle = typeof title == 'string' && title.constructor == String && title.length > 0;
  if (!hasTitle) {
    return;
  }
  if (this.imgGallery == null) {
    return;
  }
  if (Object.prototype.toString.call(this.callback) !== '[object Function]') {
    return;
  }
  this.imgGallery.title = title;
  this.callback(null, this.imgGallery);
};
/**
 * Crawls one news page: stores the callback, fetches the title from the page
 * itself and the pictures from the address obtained by replacing the first
 * '.htm' with '.hdBigPic.js'.
 *
 * @param {string} url - address of the news page
 * @param {function} callback - receives (err, imgGallery)
 */
SpiderQQImgs.prototype.spider = function (url, callback) {
  this.callback = callback;
  this.spiderTitle(url);
  var galleryDataUrl = url.replace('.htm', '.hdBigPic.js');
  this.spiderImgGallery(galleryDataUrl);
};
/**
 * Downloads the page, decodes it as gb2312 and stores the text of its
 * <title> element on the crawler, then tries to deliver the result.
 * Request and parsing failures are reported to the crawler's callback
 * (at most once) instead of crashing the process.
 *
 * @param {string} url - address of the news page
 */
SpiderQQImgs.prototype.spiderTitle = function (url) {
  var spider = this;
  // Hands the error to the stored callback and clears it so that no second
  // outcome can be reported for this crawl.
  var fail = function(err) {
    var callback = spider.callback;
    spider.callback = null;
    if (Object.prototype.toString.call(callback)=== '[object Function]') {
      callback(err, null);
    }
  };
  http.get(url, function(res){
    var arrBuf = [];
    var bufLength = 0;
    res.on("data", function(chunk){
      arrBuf.push(chunk);
      bufLength += chunk.length;
    })
    .on("error", fail)
    .on("end", function(){
      try {
        var chunkAll = Buffer.concat(arrBuf, bufLength);
        var html = iconv.decode(chunkAll,'gb2312');
        var $ = cheerio.load(html);
        spider.title = $("title").text();
      } catch (err) {
        fail(err);
        return;
      }
      spider.send2callback();
    });
  }).on("error", fail);
};
/**
 * Downloads the picture data, converts it to JSON and stores the resulting
 * gallery on the crawler, then tries to deliver the result.
 *
 * The payload is decoded as gb2312, its first block comment is removed and
 * single quotes are turned into double quotes before parsing. In the parsed
 * tree, `Children[0].Children` holds the picture count node followed by the
 * node whose children are the pictures; each picture's children 1, 2 and 3
 * carry the small image address, the big image address and the description.
 * Request, parsing and structure failures are reported to the crawler's
 * callback (at most once) instead of crashing the process.
 *
 * @param {string} url - address of the picture data
 */
SpiderQQImgs.prototype.spiderImgGallery = function (url) {
  var spider = this;
  // Hands the error to the stored callback and clears it so that no second
  // outcome can be reported for this crawl.
  var fail = function(err) {
    var callback = spider.callback;
    spider.callback = null;
    if (Object.prototype.toString.call(callback)=== '[object Function]') {
      callback(err, null);
    }
  };
  http.get(url, function(res){
    var arrBuf = [];
    var bufLength = 0;
    res.on("data", function(chunk){
      arrBuf.push(chunk);
      bufLength += chunk.length;
    })
    .on("error", fail)
    .on("end", function(){
      var imgGallery = new img_gallery(url);
      try {
        var chunkAll = Buffer.concat(arrBuf, bufLength);
        var strJson = iconv.decode(chunkAll,'gb2312') // keeps Chinese text readable
          .replace(/\/\*[\s\S]+?\*\//,'') // drop the leading block comment
          .replace(/\'/g, '"'); // JSON requires double quotes
        var objJson = JSON.parse(strJson);
        deleteEmptyProperty(objJson);
        var arr = objJson.Children[0].Children;
        arr.shift(); // picture count node, not needed
        var arrImgs = arr.shift().Children;
        arrImgs.forEach((element, index) => {
          var arrField = element.Children;
          var smallUrl = arrField[1].Children[0].Content;
          var bigUrl = arrField[2].Children[0].Content;
          var strText = arrField[3].Children[0].Content;
          imgGallery.push(index, bigUrl, smallUrl, strText);
        });
      } catch (err) {
        fail(err);
        return;
      }
      spider.imgGallery = imgGallery;
      spider.send2callback();
    });
  }).on("error", fail);
};
/**
 * Recursively removes "empty" properties from an object tree, in place:
 * empty strings, undefined, null, empty arrays and objects/arrays that have
 * no enumerable properties left after cleaning. Array elements are removed
 * with `delete`, so the remaining elements keep their indexes.
 */
function deleteEmptyProperty(object) {
  for (var key in object) {
    var current = object[key];

    if (typeof current !== 'object') {
      // primitives (and functions): only '' and undefined count as empty
      if (current === '' || current === undefined) {
        delete object[key];
      }
      continue;
    }

    if (Array.isArray(current) && current.length === 0) {
      delete object[key];
      continue;
    }

    // Clean the children first; the container may become empty by doing so.
    deleteEmptyProperty(current);
    if (isEmpty(current)) {
      delete object[key];
    }
  }
}

// True when the value has no enumerable property (own or inherited);
// null and undefined therefore count as empty.
function isEmpty(object) {
  var hasProperty = false;
  for (var name in object) {
    hasProperty = true;
    break;
  }
  return !hasProperty;
}
module.exports = SpiderQQImgs;

imgs.html

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
<!DOCTYPE html><html lang="zh-CN">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type">
<title>Images spider</title>
</head>
<body>
<form id="form1" action="imgs.html" method="POST">
Please input urls:<br/>
<textarea name="txtUrls" style="width:500px;height:120px;">http://news.qq.com/a/20160531/018019.htm#p=1
http://games.ifeng.com/a/20160530/41615842_0.shtml#p=1
</textarea><br/>
<br/>
<input type="submit" value="commit"/><br/>
<br/>
</form>
</body>
</html>

About Sodino

【Node.js】爬虫--抓取新闻标题、图片、文字描述,支持QQ、iFeng相关推荐

  1. java爬虫拉勾网_[Java教程]node.js爬虫爬取拉勾网职位信息

    [Java教程]node.js爬虫爬取拉勾网职位信息 0 2017-03-14 00:00:21 简介 用node.js写了一个简单的小爬虫,用来爬取拉勾网上的招聘信息,共爬取了北京.上海.广州.深圳 ...

  2. python爬取新闻网站内容findall函数_Python正则抓取新闻标题和链接的方法示例

    本文实例讲述了Python正则抓取新闻标题和链接的方法.分享给大家供大家参考,具体如下: #-*-coding:utf-8-*- import re from urllib import urlret ...

  3. Node.js 爬虫爬取电影信息

    Node.js 爬虫爬取电影信息 本文地址:https://blog.csdn.net/weixin_45580251/article/details/107669713 爬取的是1905电影网的信息 ...

  4. java 爬虫 抓取网上的图片报错521解决方案

    最近做爬虫时碰到了521错误,500开头的都是服务器错误:521错误码需要请求多次才能返回正确的结果:查看请求次数需要借助抓包工具,我自己使用Fiddler 4抓取到发送了三次请求才拿到结果,所以这就 ...

  5. node抓取58同城信息_如何使用标准库和Node.js轻松抓取网站以获取信息

    node抓取58同城信息 网络抓取工具是一种工具,可让我们选择网站的非结构化数据并将其转换为结构化数据库. 那么,网络刮板将在哪里派上用场呢? 我列出了我最喜欢的用例,以使您对启动自己的应用感到兴奋! ...

  6. Python 爬虫: 抓取花瓣网图片

    接触Python也好长时间了,一直没什么机会使用,没有机会那就自己创造机会!呐,就先从爬虫开始吧,抓点美女图片下来. 废话不多说了,讲讲我是怎么做的. 1. 分析网站 想要下载图片,只要知道图片的地址 ...

  7. Python爬虫抓取指定网页图片代码实例

    更多编程教程请到:菜鸟教程 https://www.piaodoo.com/ 友情链接:好看站 http://www.nrso.net/ 高州阳光论坛https://www.hnthzk.com/ 想 ...

  8. 利用python从网络上爬取图片_一篇文章教会你利用Python网络爬虫抓取王者荣耀图片...

    点击上方"IT共享之家",进行关注 回复"资料"可获赠Python学习福利 [一.项目背景] 王者荣耀作为当下最火的游戏之一,里面的人物信息更是惟妙惟肖,但受到 ...

  9. node.js爬虫爬取电影天堂,实现电视剧批量下载。

    2019独角兽企业重金招聘Python工程师标准>>> ###一.项目描述   引言:在电影天堂下电视剧的下伙伴有木有发现,它没有提供批量下载功能,美剧英剧还好,10集左右,我就多点 ...

最新文章

  1. 机器学习(MACHINE LEARNING)MATLAB动态规划解决背包问题
  2. 三十六、深入Vue.js组件Component(上篇)
  3. c++关联容器的容器操作(和顺序容器都支持的操作)详细解释,基础于c++primer 5th 表 9.2 (持续更新)
  4. 非关型数据库之Hbase
  5. 一个模型使召回阶段又准确又多样
  6. Java学生实训平台_基于jsp的学生实训平台-JavaEE实现学生实训平台 - java项目源码...
  7. 谈谈Linux下动态库查找路径的问题
  8. 常见的java开源组件_java开源框架有哪些?Java常用开源框架
  9. 周志华《机器学习》中的西瓜数据集
  10. Python123 英文字符的鲁棒输入
  11. 如何评价柏拉图_哲学家如何看待死亡?|读柏拉图《斐多篇》(1)
  12. COUNTIFS函数
  13. zabbix代理服务器配置
  14. ANDROID_APP C++框架
  15. Cadence CIS 器件管理平台解决方案
  16. 宋宝华:让Linux的段错误(segmentation fault)不再是一个错误
  17. 【camera】Mtk相机冷启动的拆解笔记
  18. DaisyDisk for mac(磁盘清理软件)中文版
  19. 全国计算机等级报名12,全国计算机等级考试12月20日开始报名!注意事项请查收!...
  20. C语言第十二课:编写扫雷游戏(综合练习2)

热门文章

  1. iOS仿QQ侧滑菜单、登录按钮动画、仿斗鱼直播APP、城市选择器、自动布局等源码
  2. 腾讯云轻量应用服务器安装宝塔面板流程(图文教程)
  3. 探索计算机的秘密硬件教案,《电脑的秘密》教案教学设计
  4. Python网络爬虫爬取携程网中的游记标题及内容
  5. 俞敏洪的同济大学演讲
  6. 库位分布看板(库位管理)
  7. 服务器通俗讲是什么?服务器有什么作用?
  8. [从头读历史] 第251节 图解易经(下)
  9. 大数据公司Talend纳斯达克上市 股价大涨41.67%
  10. 德国科隆大学计算机排名,德国科隆大学世界排名如何?