Node 实现爬虫？

什么是爬虫 ?

爬虫通过模拟浏览器向服务器发送请求，服务器会根据请求返回我们想要的数据；随后将数据解析出来，并进行保存。

爬虫一般步骤 ?

目标：确定你想要获取的数据

确定你想要获取的数据在哪个页面（一般详细的数据会在详情页）。

确定在哪些页面可以链接到这些页面（一般分类列表页面会有详情页的链接数据）。

寻找页面之间、数据之间的规律。

分析页面：

获取数据的方式（正则，或者 cheerio 库）。

数据是通过 ajax 请求的数据，还是 html 自带的数据。

如果是通过 ajax 请求的数据，需要分析这个 ajax 请求的链接是什么链接，一般请求的数据都为 json 格式的数据，会比较容易解析。

如果数据在 html 里面，就用 cheerio 通过选择器将内容选中。

编写单个数据获取的案例：（多个循环）

解析出分类页的链接地址。

解析出列表页的链接地址。

解析出详情页的链接地址。

解析详情页里面想要获取的数据。

将数据保存起来（本地文件 / 数据库）。

如果遇到阻碍，需要与反爬虫机制进行对抗：

User-Agent 是否是正常浏览器的信息，将请求头设置成跟浏览器一样的内容。

因为爬虫爬取速度过快，会导致封号（IP），可以降低速度进行解决。或者可以使用代理进行解决。

网站设置了验证码：只有人工通过验证码后，服务器才会把凭证交给你。这种情况需要真实的浏览器操作，可以借助无头浏览器来完成。

请求数据的库？

axios 。

request 。

puppeteer 无头浏览器（完全模拟浏览器）。

Node 通过正则爬取电影数据？

// 0,请求数据方法封装
let request = require('request')
/**
 * Promise wrapper around the callback-style `request.get`.
 *
 * @param {string} path - URL to fetch.
 * @returns {Promise<{response: object, body: *}>} Resolves with the response
 *   object and body handed to the callback; rejects with the callback's error.
 */
function req(path) {
  return new Promise((resolve, reject) => {
    request.get(path, (err, response, body) => {
      if (err) {
        reject(err);
        return;
      }
      resolve({ response, body });
    });
  });
}
// 1. Collect every category link from the start page
// Start page of the video listing; the category filter bar is scraped from it.
let httpUrl = 'https://www.1905.com/vod/list/n_1_t_1/o3p1.html';

/**
 * Fetches the start page, extracts every category link from the "类型" filter
 * bar and crawls each category through `getMovies(url, label)`.
 *
 * @returns {Promise<void>} Settles once every category crawl has settled.
 * @throws {Error} If the category bar cannot be located in the page markup.
 */
async function getClassUrl() {
  const { body } = await req(httpUrl);

  // Restrict the link search to the markup between the "类型" label and the result grid.
  const sectionRe = /<span class="search-index-L">类型(.*?)<div class="grid-12x">/is;
  const section = sectionRe.exec(body);
  if (section === null) {
    throw new Error('Category section not found in ' + httpUrl);
  }

  // Group 1 is the category URL, group 2 its visible label.
  const linkRe = /<a href="javascript\:void\(0\);" onclick="location\.href='(.*?)';return false;" >(.*?)<\/a>/gis;
  const crawls = [];
  let match;
  while ((match = linkRe.exec(section[1])) !== null) {
    crawls.push(getMovies(match[1], match[2]));
  }
  await Promise.all(crawls);
}

getClassUrl().catch((err) => {
  console.error('Failed to crawl categories:', err);
});
// 2. Collect the movie links inside each category
/**
 * Fetches one category listing page, extracts every movie detail link on it
 * and crawls each link through `getInfo`.
 *
 * @param {string} url - Category listing page URL.
 * @param {string} movieType - Category label; not used by the crawl itself.
 * @returns {Promise<string[]>} Detail page URLs in the order they appear on the page.
 */
async function getMovies(url, movieType) {
  const { body } = await req(url);

  const linkRe = /<a class="pic-pack-outer" target="_blank" href="(.*?)" .*?><img/gis;
  const arrList = [];
  let match;
  while ((match = linkRe.exec(body)) !== null) {
    arrList.push(match[1]);
  }

  // One failing detail page must not abort the rest of the category.
  await Promise.all(
    arrList.map(async (link) => {
      try {
        await getInfo(link);
      } catch (err) {
        console.error('Failed to crawl ' + link + ':', err);
      }
    })
  );
  return arrList;
}
// 3. Fetch the details of each movie from its link
/**
 * Fetches a movie detail page and extracts the text of its intro box
 * (`#playerBoxIntroCon`), up to the first following `<a` tag.
 *
 * @param {string} url - Movie detail page URL.
 * @returns {Promise<string[]>} Every intro fragment matched on the page;
 *   empty when the page has no intro box.
 */
async function getInfo(url) {
  const { body } = await req(url);

  const introRe = /<span id="playerBoxIntroCon" class="active">(.*?)<a/gis;
  const arrList = [];
  let match;
  while ((match = introRe.exec(body)) !== null) {
    arrList.push(match[1]);
  }

  // Report the extracted data rather than dumping the whole HTML document.
  console.log('--', url, arrList);
  return arrList;
}

Node 通过 cheerio 库爬取表情包？

// Node 爬取表情包
// cheerio 是 jquery 核心功能的一个快速灵活而又简单的实现，主要是为了在服务器端需要对DOM元素进行操作的地方
// cheerio 是 node.js 的提取页面模块，为服务器特别定制
// 1，安装 npm i cheerio
// 2, 导入 require('cheerio')
const axios = require('axios')
const cheerio = require('cheerio')
const fs = require('fs')
const path = require('path');

// First page of the article list; every article linked from it is crawled.
let httpUrl = 'https://www.doutula.com/article/list/?page=1';

axios
  .get(httpUrl)
  .then((res) => {
    // Parse the list page and visit every article it links to.
    const $ = cheerio.load(res.data);
    const jobs = [];
    $('#home .col-sm-9>a').each((index, item) => {
      const pageUrl = $(item).attr('href');
      if (pageUrl) {
        jobs.push(parsePage(pageUrl, index));
      }
    });
    return Promise.all(jobs);
  })
  .catch((err) => {
    console.error('Failed to crawl the article list:', err);
  });

/**
 * Downloads every image found in one article page into ./img.
 *
 * Files are named `pic-<index>-M-<i><ext>`, where `<i>` is the image's
 * position on the page and `<ext>` comes from the image URL.
 *
 * @param {string} url - Article page URL.
 * @param {number} index - Position of the article in the list page; used in file names.
 * @returns {Promise<void>} Settles once every image download has settled.
 */
async function parsePage(url, index) {
  const res = await axios.get(url);
  const $ = cheerio.load(res.data);

  // createWriteStream fails when the target directory does not exist.
  fs.mkdirSync('./img', { recursive: true });

  const downloads = [];
  $('.pic-content img').each((i, item) => {
    const imgUrl = $(item).attr('src');
    if (!imgUrl) {
      return; // <img> without a src: nothing to download
    }
    const extName = path.extname(imgUrl);
    const imgPath = './img/pic-' + index + '-' + 'M' + '-' + i + extName;
    downloads.push(saveImage(imgUrl, imgPath));
  });
  await Promise.all(downloads);
}

/**
 * Streams one image to disk; failures are logged and do not reject.
 *
 * @param {string} imgUrl - Image URL.
 * @param {string} imgPath - Destination file path.
 * @returns {Promise<void>}
 */
async function saveImage(imgUrl, imgPath) {
  try {
    const res = await axios.get(imgUrl, { responseType: 'stream' });
    await new Promise((resolve, reject) => {
      // Open the file only after the request succeeded, so a failed request leaves no empty file.
      const ws = fs.createWriteStream(imgPath);
      res.data.on('error', reject);
      ws.on('error', reject);
      ws.on('finish', resolve);
      res.data.pipe(ws);
    });
  } catch (err) {
    console.error('Failed to download ' + imgUrl + ':', err);
  }
}

Node 通过 cheerio 库爬取音乐？

// cheerio 爬取音乐
// 目标：下载音乐
// 1. Read each entry's metadata from the list page, then use it to download the mp3 files
let axios = require('axios')
let cheerio = require('cheerio')
let fs = require('fs')
// Page to crawl
let httpUrl = 'https://www.xiami.com/list?page=1&query=%7B%22dataType%22%3A%22recommend%22%7D&scene=main&type=collect'
// Fetch the list page and hand every entry to downLoad
axios
  .get(httpUrl)
  .then((res) => {
    const $ = cheerio.load(res.data);
    $('.adaptive-list>.collect-item').each((index, item) => {
      const href = $(item).find('.wrapper>a').attr('href');
      if (!href) {
        return; // entry without a link: nothing to download
      }
      // Build a fresh object per entry: a single shared object would be
      // overwritten by later entries while earlier downloads are still pending.
      const obj = {
        url: 'https://www.xiami.com' + href,
        name: $(item).find('.info>.name>a').text(),
        author: $(item).find('.info>.author>a').text(),
      };
      console.log('数据:', obj);
      // Save the file; Promise.resolve tolerates a downLoad that returns nothing.
      Promise.resolve(downLoad(obj)).catch((err) => {
        console.error('Failed to download ' + obj.url + ':', err);
      });
    });
  })
  .catch((err) => {
    console.error('Failed to crawl the list page:', err);
  });
// 文件下载，保存到指定目录下操作
/**
 * Downloads `obj.url` and stores the response stream as ./mp3/<name>.mp3.
 *
 * @param {{url: string, name: string}} obj - Download descriptor.
 * @returns {Promise<void>} Resolves once the file is fully written; rejects
 *   when the request or the write fails.
 */
function downLoad(obj) {
  // Read the fields up front: callers may reuse and mutate `obj` before the response arrives.
  const fileUrl = obj.url;
  // A path separator in a scraped title would point outside ./mp3 or at a missing directory.
  const safeName = String(obj.name).replace(/[\\/]/g, '_');

  return axios.get(fileUrl, { responseType: 'stream' }).then(function (res) {
    // createWriteStream fails when the target directory does not exist.
    fs.mkdirSync('./mp3', { recursive: true });
    return new Promise(function (resolve, reject) {
      const ws = fs.createWriteStream('./mp3/' + safeName + '.mp3');
      res.data.on('error', reject);
      ws.on('error', reject);
      ws.on('finish', resolve);
      res.data.pipe(ws);
    });
  });
}

Node 对于 puppeteer 库的基础使用？

// puppeteer 库的使用(无头浏览器)
let puppeteer = require('puppeteer')
// 打开浏览器
/**
 * Minimal puppeteer walkthrough: launch a browser, open a page, take a
 * screenshot and print the inner HTML of every `#menu li a` element.
 *
 * @returns {Promise<void>}
 */
async function test() {
  // puppeteer.launch(options) starts a browser instance. Headless mode is
  // faster; a visible window is mainly useful while developing and debugging.
  const options = {
    // false: visible browser window; true: headless
    headless: false,
    // Viewport size
    defaultViewport: {
      width: 1200,
      height: 800,
    },
  };
  const browser = await puppeteer.launch(options);
  try {
    // Open a new tab
    const page = await browser.newPage();
    // Forward the page's console output to Node. Registered before the
    // evaluation below so none of its messages are missed.
    page.on('console', (msg) => {
      console.log(msg.text());
    });
    // Navigate
    await page.goto('https://www.dytt8.net/index.htm');
    // Screenshot
    await page.screenshot({ path: 'screenshot.png' });
    // The callback runs inside the page, so its console.log calls arrive
    // through the 'console' listener above.
    await page.$$eval('#menu li a', (elements) => {
      elements.forEach((item) => {
        console.log(item.innerHTML);
      });
    });
  } finally {
    // Without this the browser process keeps the Node process alive.
    await browser.close();
  }
}
test().catch((err) => {
  console.error('puppeteer demo failed:', err);
});

Node 通过 puppeteer 库爬取电子书？

let puppeteer = require('puppeteer')
let axios = require('axios')
let url = require('url')
let httpUrl = 'https://sobooks.cc/';

// Goal: collect the book titles and e-book links published on https://sobooks.cc/
(async function () {
  // Browser settings for development and debugging
  const debugOptions = {
    // Viewport size
    defaultViewport: {
      width: 1200,
      height: 800,
    },
    // false: visible browser window; true: headless
    headless: false,
    // Slow every puppeteer operation down by this many milliseconds
    slowMo: 250,
  };
  // Headless settings, more efficient; pass these to launch() once development is done
  const options = {
    headless: true,
  };

  // Start the browser
  const browser = await puppeteer.launch(debugOptions);

  /**
   * Opens a new tab that aborts Google ad requests, then navigates to `targetUrl`.
   *
   * @param {string} targetUrl - Address to open.
   * @returns {Promise<object>} The puppeteer page; the caller must close it.
   */
  async function openPage(targetUrl) {
    const page = await browser.newPage();
    // Intercept requests so the ad host can be dropped
    await page.setRequestInterception(true);
    page.on('request', (interceptedRequest) => {
      const { hostname } = new URL(interceptedRequest.url());
      if (hostname === 'googleads.g.doubleclick.net') {
        interceptedRequest.abort();
      } else {
        interceptedRequest.continue();
      }
    });
    await page.goto(targetUrl);
    return page;
  }

  /**
   * Reads the total number of list pages from the home page's pagination bar.
   *
   * @returns {Promise<string>} Text of the last pagination item with its first
   *   character and last two characters stripped, then trimmed.
   */
  async function getAllNum() {
    const page = await openPage(httpUrl);
    try {
      return await page.$eval('.pagination li:last-child span', (element) => {
        const text = element.innerHTML;
        return text.substring(1, text.length - 2).trim();
      });
    } finally {
      await page.close();
    }
  }

  /**
   * Collects the detail-page link and title of every book on one list page.
   *
   * @param {number|string} num - List page number.
   * @returns {Promise<Array<{href: string, title: string}>>}
   */
  async function pageList(num) {
    const page = await openPage('https://sobooks.cc/page/' + num);
    try {
      // $eval matches the first element, $$eval matches all of them
      return await page.$$eval('.card .card-item .thumb-img>a', (elements) =>
        elements.map((element) => ({
          href: element.getAttribute('href'),
          title: element.getAttribute('title'),
        }))
      );
    } finally {
      await page.close();
    }
  }

  /**
   * Opens a book's detail page and reads its download link.
   *
   * @param {{href: string}} pageObj - Entry as produced by `pageList`.
   * @returns {Promise<string|null>} The link's href, or null when the page has no download table.
   */
  async function getPageInfo(pageObj) {
    const page = await openPage(pageObj.href);
    try {
      const eleA = await page.$('.dltable tr:nth-child(3) a:last-child');
      if (eleA === null) {
        console.warn('No download link found on ' + pageObj.href);
        return null;
      }
      // Read the property through the public handle API, not the private _remoteObject field
      const hrefHandle = await eleA.getProperty('href');
      const aHref = await hrefHandle.jsonValue();
      console.log('aHref', aHref);
      return aHref;
    } finally {
      await page.close();
    }
  }

  try {
    const pageNum = await getAllNum();
    console.log('pageNum', pageNum);
    const pageArr = await pageList(1);
    console.log('pageArr', pageArr);
    // Sample detail page
    await getPageInfo({ href: 'https://sobooks.cc/books/14620.html' });
  } finally {
    await browser.close();
  }
})().catch((err) => {
  console.error('E-book crawl failed:', err);
});

总结：使用正则，只要分析好页面结构，基本的数据都能爬取到，但是用它爬取数据的过程比较繁琐。个人比较推荐使用 cheerio 库去做数据的爬取，它用法简单，和 jQuery 用法一致，对于熟悉前端开发的同学而言，基本没有新的学习成本，也更好理解。但是它也存在局限性，比如一些网站有较强的反爬虫机制，这时可以考虑使用 puppeteer 库去操作。它的本质原理，就是模拟浏览器去请求页面，然后再进行数据的爬取，具体使用细节可以去查看对应的官方文档。

Node 实现爬虫？相关推荐

Node.js aitaotu图片批量下载Node.js爬虫1.00版
即使是https网页,解析的方式也不是一致的,需要多试试. 代码: //====================================================== // aitaot ...
Node.js umei图片批量下载Node.js爬虫1.00
这个爬虫在abaike爬虫的基础上改改图片路径和下一页路径就出来了,代码如下: //====================================================== // ...
node.js 爬虫入门总结
node.js爬虫前端同学可能向来对爬虫不是很感冒,觉得爬虫需要用偏后端的语言,诸如 php , python 等.当然这是在 nodejs 前了,nodejs 的出现,使得 Javascript ...
node：爬虫爬取网页图片 1
代码地址如下: http://www.demodashi.com/demo/13845.html 前言周末自己在家闲着没事,刷着微信,玩着手机,发现自己的微信头像该换了,就去网上找了一下头像,看着图 ...
如何用 Node.js 爬虫？
本文来自作者小北在 GitChat 上分享「Node.js 爬虫从 0 到 1」,「阅读原文」查看交流实录「文末高能」编辑 | 家辉写在前面我们经常会听说爬虫这个词语,但是却从来没有见过这 ...
java爬虫拉勾网_[Java教程]node.js爬虫爬取拉勾网职位信息
[Java教程]node.js爬虫爬取拉勾网职位信息 0 2017-03-14 00:00:21 简介用node.js写了一个简单的小爬虫,用来爬取拉勾网上的招聘信息,共爬取了北京.上海.广州.深圳 ...
node.js 爬虫实现爬取网页图片并保存到本地
node.js 爬虫实现爬取网页图片并保存到本地没有废话直接看代码 /*** 请求网站数据* 将数据保存本地文件*/ //不同协议引用不同模块,http https const http = re ...
Node.js 爬虫爬取电影信息
Node.js 爬虫爬取电影信息本文地址:https://blog.csdn.net/weixin_45580251/article/details/107669713 爬取的是1905电影网的信息 ...
Node.js爬虫一站到底系列先导篇
前言: 在web编程课上,老师布置了爬虫任务,而没有任何经验和相关方面知识的小白简直一头雾水,不知道该如何下手.一开始抱着一本厚厚的犀牛书啃了好几天,本以为对Javascipt语法有一定了解后便可以自 ...
node 写爬虫，原来这么简单
作者:CarsonXu 原文地址:https://juejin.im/post/5eca37f951882543345e81df 前言今天给大家带来的是node简单爬虫,对于前端小白也是非常好理解且 ...

Node 实现爬虫？

什么是爬虫 ?

爬虫一般步骤 ?

请求数据的库？

Node 通过正则爬取电影数据？

Node 通过 cheerio 库爬取表情包？

Node 通过 cheerio 库爬取音乐？

Node 对于 puppeteer 库的基础使用？

Node 通过 puppeteer 库爬取电子书？

Node 实现爬虫？相关推荐

最新文章

热门文章

Node 实现爬虫 ？

什么是爬虫 ?

爬虫一般步骤 ?

请求数据的库 ？

Node 通过正则爬取电影数据 ？

Node 通过 cheerio 库爬取表情包 ？

Node 通过 cheerio 库爬取音乐 ？

Node 对于 puppeteer 库的基础使用 ？

Node 通过 puppeteer 库爬取电子书 ？

Node 实现爬虫 ？相关推荐

最新文章

热门文章

Node 实现爬虫？

请求数据的库？

Node 通过正则爬取电影数据？

Node 通过 cheerio 库爬取表情包？

Node 通过 cheerio 库爬取音乐？

Node 对于 puppeteer 库的基础使用？

Node 通过 puppeteer 库爬取电子书？

Node 实现爬虫？相关推荐