本文将演示如何使用 SpiderHttpUtils 爬取某知名*猫平台的评论信息。

以 https://detail.tmall.com/item.htm?id=18539499729 宝贝为例,使用 Fiddler 抓包工具获取它的评论请求地址(即下文代码中传给 MessageFormat.format 的 URL),其中的 currentPage 参数即为被爬取评论的页码。

pom.xml 文件中引入依赖包:

<parent>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-parent</artifactId>
    <version>1.5.7.RELEASE</version>
    <relativePath />
</parent>
<dependencies>
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-text</artifactId>
        <version>1.6</version>
    </dependency>
    <dependency>
        <groupId>org.apdplat</groupId>
        <artifactId>word</artifactId>
        <version>1.3</version>
    </dependency>
    <dependency>
        <groupId>org.json</groupId>
        <artifactId>json</artifactId>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <scope>test</scope>
    </dependency>
</dependencies>

爬取评论的完整代码如下:

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.text.StringEscapeUtils;
import org.apdplat.word.WordSegmenter;
import org.apdplat.word.segmentation.SegmentationAlgorithm;
import org.apdplat.word.segmentation.Word;
import org.json.JSONArray;
import org.json.JSONObject;
import org.junit.Test;import spider.SpiderHttpUtils;public class SpiderTest {public Map<String, String> getHeaders() {Map<String, String> headers = new HashMap<String, String>();headers.put("Host", "rate.tmall.com");headers.put("Referer"," https://detail.tmall.com/item.htm?spm=a220m.1000858.1000725.1.50be3bd8ewlaTd&id=41504319950&user_id=1975415428&cat_id=2");// headers.put("Accept-Language",// "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7");headers.put("User-Agent","Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36");headers.put("Cookie","cna=LuKHFKl4TlECAXQZ4Ux2g/Nd; cookie2=1d8425e75fcbd3cdaa40611db6680374; t=17fe97a643f4e1510f9e2977f9cbdd7d; _tb_token_=5734e153a5d34; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; x=__ll%3D-1%26_ato%3D0; dnk=pengjun%5Cu674E; lid=pengjun%E6%9D%8E; hng=CN%7Czh-CN%7CCNY%7C156; sn=%E5%85%A8%E6%A3%89%E6%97%B6%E4%BB%A3%E5%AE%98%E6%96%B9%E6%97%97%E8%88%B0%E5%BA%97%3Azfx; tk_trace=1; tracknick=pengjun%5Cu674E;lgc=pengjun%5Cu674E; enc=0F%2FkiNyKc%2F1vIUcjp6C7VI6tjD6K9gSaTtAQPlmY8CraZFMzXZMEcgDnr0LKd0SvSeKPrUQAqqEU%2Bq0O3aXG4Q%3D%3D; SHTSID=F8504BDA308C40A1867B84AA984C7914; uc1=cookie16=WqG3DMC9UpAPBHGz5QBErFxlCA%3D%3D&cookie21=U%2BGCWk%2F7pY%2FF&cookie15=UtASsssmOIJ0bQ%3D%3D&existShop=false&pas=0&cookie14=UoTZ5OSpoR6Xcg%3D%3D&tag=8&lng=zh_CN; uc3=vt3=F8dByEze4ekEsQsgc1A%3D&id2=VWeT3jqq6jDz&nk2=E6EQ1CLKS%2FnL&lg2=VT5L2FSpMGV7TQ%3D%3D; _l_g_=Ug%3D%3D; unb=682167773; cookie1=V3oTBcYJDILlbjtF3qOSEAd2Amf77M7oTu0rSZnkuIc%3D; login=true; cookie17=VWeT3jqq6jDz; _nk_=pengjun%5Cu674E;csg=1d4e91d8; skt=af3747a6827ebc42; _m_h5_tk=0ca05f482e46af75317d66b214d43689_1550465686263; _m_h5_tk_enc=2c51b947b84a5ef62f7c6523f04bbce9; x5sec=7b22726174656d616e616765723b32223a223762303932326363393666646437303062663361636430393164343932353530434b4731714f4d46454a335338706254376175634a686f4c4e6a67794d5459334e7a637a4f7a453d227d; whl=-1%260%260%260; 
l=bBN1mgHrvxpFLmphBOCwNQKXnqQTlIRRguSJGpWpi_5LUsvecl7OllzxWUv6Vj5P9zLB42mIJ0JTgFyQ5Ppf.; isg=BMPDJZxTm0CbSVClfvjWFCVzUofBKAVE096Au_WgcyLLtOHWfQtayJoiKgRfFK9y");return headers;}/*** 爬取评论内容*/@Testpublic void testSpider() throws IOException, InterruptedException {// 构建正则表达式对响应内容进行匹配过滤String regEx = "jsonp\\d+\\?\\(([\\s\\S]*)\\)";Pattern pat = Pattern.compile(regEx);// 请求地址String url = null;// 响应内容String retStr = null;Random random = new Random();// 用来保存评论到文件FileOutputStream fos = new FileOutputStream(new File("D:/简柔洁面巾_评论.txt"));OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");BufferedWriter bw = new BufferedWriter(osw);int lineCount = 0;// 逐页对评论进行爬取并写入文件for (int i = 1; i < 100; i++) {System.out.println("开始爬取第 " + i + " 页评论");url = MessageFormat.format(" https://rate.tmall.com/list_detail_rate.htm?itemId=41504319950&spuId=303661613&sellerId=1975415428&order=3&currentPage={0}&append=0&content=1&tagId=&posi=&picture=&groupId=&ua=098%23E1hvnpvRvphvUvCkvvvvvjiPRLspljEhn2qpsjthPmPyljDvRsLhtjrWPLsyAjn8RphvCvvvphmCvpvZ7Dl0eP5w7Di43kS5PbE4Bxi%2Fz1htvpvhvvCvpUwCvvpv9hCv2QhvCvvvMMGEvpCWvXfYMBlre8g7%2B3%2Bilj7Jyb8rwZDl%2BboJ%2BulABzcGeE9fV5EUAWAXeBOqb64B9Cka%2BfvsxI2heB6t%2BFBCAfyp%2Bu0OjomUy4oGULIKogyCvvOCvhE2zWoivpvUvvCC8Nrej68tvpvIvvCvpvvvvvvvvhOVvvvCw9vvB9OvvUHmvvCVC9vv9ogvvhOVvvmCb9hCvvOv9hCvvvvtvpvhvvCvp8wCvvpvvhHh9phv2HiwJSaQzHi475CnzT6Cvvyv9XRbIQvvD7w%3D&needFold=0&_ksTS=1550459693930_943&callback=jsonp944",i);retStr = SpiderHttpUtils.sendGet(true, url, null, getHeaders(), "utf-8");Thread.sleep(random.nextInt(4000) + 2000);Matcher mat = pat.matcher(retStr);if (mat.find()) {String jsonstr = mat.group(1);String finalJson = StringEscapeUtils.unescapeJava(jsonstr);try {JSONObject retJson = new JSONObject(finalJson);JSONObject rateDetail = retJson.getJSONObject("rateDetail");JSONArray rateList = rateDetail.getJSONArray("rateList");for (int index = 0; index < rateList.length(); index++) {JSONObject jsonObject = 
rateList.getJSONObject(index);bw.write(jsonObject.getString("rateContent"));lineCount += 1;bw.newLine();}} catch (Exception e) {e.printStackTrace();System.out.println(jsonstr);// break;}}}// 关闭文件流bw.close();System.out.println("共爬取 " + lineCount + " 行评论");}/*** 对爬取到的评论内容进行分词*/@Testpublic void testWord() throws IOException {FileReader reader = new FileReader("D:/简柔洁面巾_评论.txt");BufferedReader br = new BufferedReader(reader);String str = null;Map<String, IKWord> map = new HashMap<String, IKWord>();while ((str = br.readLine()) != null) {List<Word> words = WordSegmenter.seg(str, SegmentationAlgorithm.BidirectionalMaximumMatching);for (Word word : words) {String text = word.getText();IKWord ikWord = map.get(text);if (map.containsKey(text)) {ikWord.addCount(1);} else {ikWord = new IKWord();ikWord.setWord(text);ikWord.setCount(1);map.put(text, ikWord);}}}br.close();reader.close();FileWriter writer = new FileWriter("D:/简柔洁面巾_分词.txt");BufferedWriter bw = new BufferedWriter(writer);List<IKWord> list = new ArrayList<IKWord>();list.addAll(map.values());Collections.sort(list);Iterator<IKWord> iterator = list.iterator();while (iterator.hasNext()) {IKWord next = iterator.next();bw.write(next.getWord() + " " + next.getCount());bw.newLine();}bw.close();writer.close();}
}
/**
 * A word together with the number of times it has been seen.
 *
 * <p>Natural ordering is by count, highest first. Equality is by word only, so
 * the ordering is not consistent with {@code equals}: two different words
 * with the same count compare as 0 but are not equal.
 */
public class IKWord implements Comparable<IKWord> {

    private Integer count;
    private String word;

    /**
     * Orders instances by descending count.
     *
     * <p>Uses {@link Integer#compare(int, int)} rather than subtraction, which
     * overflows for counts that are far apart and then reports the wrong order.
     *
     * @param that the instance to compare with
     * @return a negative value if this count is larger, a positive value if it
     *         is smaller, zero if the counts are equal
     * @throws NullPointerException if {@code that} is null or either count is unset
     */
    @Override
    public int compareTo(IKWord that) {
        return Integer.compare(that.count, this.count);
    }

    /**
     * Typed overload kept for existing callers; same result as {@link #equals(Object)}.
     *
     * @param that the instance to compare with, may be null
     * @return true if both instances hold the same word
     */
    public boolean equals(IKWord that) {
        return equals((Object) that);
    }

    /**
     * Two instances are equal when they hold the same word; the count is ignored.
     *
     * @param obj the object to compare with, may be null
     * @return true if {@code obj} is an {@code IKWord} with the same word
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof IKWord)) {
            return false;
        }
        IKWord other = (IKWord) obj;
        return word == null ? other.word == null : word.equals(other.word);
    }

    /**
     * Hash code derived from the word only, matching {@link #equals(Object)}.
     * It changes when {@link #setWord(String)} is called.
     */
    @Override
    public int hashCode() {
        return word == null ? 0 : word.hashCode();
    }

    /** @return the current count, or null if it has never been set */
    public Integer getCount() {
        return count;
    }

    /** @param count the new count */
    public void setCount(Integer count) {
        this.count = count;
    }

    /** @return the word, or null if it has never been set */
    public String getWord() {
        return word;
    }

    /** @param word the new word */
    public void setWord(String word) {
        this.word = word;
    }

    /**
     * Adds {@code count} to the current count.
     *
     * @param count the amount to add
     * @throws NullPointerException if {@code count} is null or the current count is unset
     */
    public void addCount(Integer count) {
        this.count += count;
    }
}

使用URLConnection爬取评论相关推荐

  1. Selenium 爬取评论数据,就是这么简单!

    本文来自作者 秦子敬 在 GitChat 上分享「如何利用 Selenium 爬取评论数据?」,「阅读原文」查看交流实录 「文末高能」 编辑 | 飞鸿 一.前言 我们知道,如今的 web 网页数据很多 ...

  2. 京东爬取评论简单分析

    京东爬取评论简单分析 1.定义一个获取所有评论的函数 def get_comment(url): """ 获取评论函数 """ i = 0 ...

  3. python豆瓣爬虫爬取评论做成词云

    前言 前一段时间学校有个project,做一个电影购票系统,当时就用springboot做了系统,用python抓了一些电影的基本信息.后来发现如果把评论做成词云那展示起来不是很酷炫么.于是乎把这个过 ...

  4. python爬取评论_python爬取网易云音乐评论

    本文实例为大家分享了python爬取网易云音乐评论的具体代码,供大家参考,具体内容如下 import requests import bs4 import json def get_hot_comme ...

  5. python爬虫 爬取评论区

    在做课题的时候老师给了一个建议:去找找非结构化数据. 说实话我们连结构化数据都整不明白,还妄想尝试处理非结构化数据,于是尝试爬取了一下微博评论区.    讲太多会被墙审核不通过 先理解一个叫做开发者界 ...

  6. python爬取评论_Python爬取豆瓣《复仇者联盟3》评论并生成乖萌的格鲁特

    ### 1. 需求说明 本项目基于Python爬虫,爬取豆瓣电影上关于复仇者联盟3的所有影评,并保存至本地文件.然后对影评进行分词分析,使用词云生成树人格鲁特的形象照片. ### 2. 代码实现 此部 ...

  7. java 爬取评论,Java基于WebMagic爬取某豆瓣电影评论的实现

    目的 搭建爬虫平台,爬取某豆瓣电影的评论信息. 准备 webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发.webmagic的核心非常简单,但是覆盖 ...

  8. selenium爬取评论

    from selenium import webdriverdriver=webdriver.Chrome() # 自动访问的网站 driver.get("http://www.santos ...

  9. 京东iphone8的异步加载爬取评论

    2019独角兽企业重金招聘Python工程师标准>>> 前言: 最近关注了ID王大伟的博客, 看见他的博文对Python爬虫的爬取觉得很有意思, 于是跟着操作, 以下是操作步骤: 1 ...

最新文章

  1. Gradient Descent梯度下降(透彻分析)
  2. 【ACM】杭电OJ 2063
  3. IBM首席执行官提出人工智能部署三大基本原则
  4. Android客户端内置内存工具进行崩溃定位的实践经验
  5. 一步步在Docker里运行Web应用
  6. [转载] Python中的xrange和range的区别
  7. 统计list里面相同元素个数_Array篇easy难度之求相同元素个数
  8. 你真的会使用Eclipse的debug吗?
  9. zlib安装_.NET Core 架构设计实战04 - Nginx安装配置
  10. 生日python十种日期格式_Python可视化-二十四节气与生日间隔天数统计
  11. 概率矩阵分解模型 PMF
  12. 雷林鹏分享:jQuery Mobile 方向改变事件
  13. Vue中常用的组件传值方式
  14. C语言俄罗斯方块代码
  15. 银行流水、财报、年报、电费分割单等各类文档一键提取,达观表格提取工具再升级!
  16. javascript动态插入html元素
  17. MicroSemi LiberoSoc启动太慢的问题
  18. 以前收入高,但毫无上进心,40岁一身房贷,被裁后找个小公司,天天担心试用期过不了,焦虑地睡不着!...
  19. flash 水墨表现(转)
  20. ax.contour绘制等值线图时报错:The following kwargs were not used by contour: ‘color‘

热门文章

  1. 【Android】玩转命令行工具-apkanalyzer
  2. Lua--棋牌游戏开发(概念性设计一)
  3. 用R语言玩玩股票(二)
  4. 程序员有哪些靠谱的副业赚钱途径
  5. (经典Flash游戏)Zoom Keeper
  6. 【3dCG】——期中创新实训记录(3)
  7. 深入了解基于RTMP数据传输协议的实时流媒体技术(图解过程)一看就会【建议新手收藏】
  8. 计算机原材料管理发展国外,仓库管理系统的国内外发展现状
  9. Sony Xperia L36h 4.0通用一键root(无需解锁)详细图文教程
  10. ScriptManager.RegisterStartupScript方法