java jsoup 网络爬虫 学习例子(八)京东和淘宝商品比价 PhantomJS

/** filename getHtml.js* phantomjs.exe 2.0.0* author InJavaWeTrust*/var system = require('system');
var address = '';if (system.args.length != 2) {console.log('Try to pass two args when invoking this script!');phantom.exit();
} else {address = system.args[1];
}var page = require('webpage').create();
var url  = address;
phantom.outputEncoding = 'GBK';
page.open(url, function (status) {if (status !== 'success') {console.log('Failed to get the page!');} else {console.log(page.content);}phantom.exit();
});package com.iteye.injavawetrust.phantomjs;import java.util.List;/*** * @author InJavaWeTrust**/
public interface ProductList {

    /**
     * Crawls a product listing.
     *
     * @return the products collected by the crawl
     */
    List<ProductInfo> getProductList();
}

package com.iteye.injavawetrust.phantomjs;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;/*** * @author InJavaWeTrust**/
public class TBProductList implements ProductList {

    private static PriceCheckUtil pcu = PriceCheckUtil.getInstance();

    /** URL used for the first result page. */
    private String tbUrl;

    /** Search keyword, used to build the URLs of the following pages. */
    private String productName;

    public TBProductList(String tbUrl, String productName) {
        this.tbUrl = tbUrl;
        this.productName = productName;
    }

    /**
     * Walks ten Taobao result pages, rendering each one through PhantomJS,
     * and collects the name and price of every listed item.
     * A failure on one page is reported on stdout and the crawl moves on to
     * the next page.
     *
     * @return all products gathered from the pages that could be parsed
     */
    @Override
    public List<ProductInfo> getProductList() {
        List<ProductInfo> collected = new ArrayList<ProductInfo>();
        for (int pageIndex = 0; pageIndex < 10; pageIndex++) {
            try {
                System.out.println("TB Product 第[" + (pageIndex + 1) + "]页");

                // The first page uses the URL handed to the constructor; later
                // pages are addressed by an item offset that grows by 44 per page.
                String url;
                if (pageIndex == 0) {
                    url = tbUrl;
                } else {
                    url = Constants.TBURL + pcu.getUrlCode(productName)
                            + Constants.TBPAGE + (pageIndex * 44);
                }
                System.out.println(url);

                Document doc = Jsoup.parse(pcu.getHtmlByPhantomjs(url));
                for (Element listing : doc.select("div[class=m-itemlist]")) {
                    for (Element entry : listing.select("div[data-category=auctions]")) {
                        String priceText = entry
                                .select("div[class=price g_price g_price-highlight]>strong")
                                .text();
                        String titleText = entry
                                .select("div[class=row row-2 title]>a")
                                .text();

                        ProductInfo info = new ProductInfo();
                        info.setProductName(titleText);
                        info.setProductPrice(priceText);
                        collected.add(info);
                    }
                }
            } catch (Exception e) {
                System.out.println("Get TB product has error");
                System.out.println(e.getMessage());
            }
        }
        return collected;
    }

    /** Manual smoke run: crawls Taobao for a fixed keyword and prints every hit. */
    public static void main(String[] args) {
        try {
            String keyword = "铅笔";
            String firstPageUrl = Constants.TBURL + pcu.getUrlCode(keyword);
            List<ProductInfo> products = new TBProductList(firstPageUrl, keyword).getProductList();
            for (ProductInfo product : products) {
                System.out.println("[" + product.getProductName() + "]  [" + product.getProductPrice() + "]");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

package com.iteye.injavawetrust.phantomjs;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;/*** * @author InJavaWeTrust**/
public class JDProductList implements ProductList{private String jdUrl;private String productName;private static PriceCheckUtil pcu = PriceCheckUtil.getInstance();public JDProductList(String jdUrl, String productName){this.jdUrl = jdUrl;this.productName = productName;}@Overridepublic List<ProductInfo> getProductList() {List<ProductInfo> jdProductList = new ArrayList<ProductInfo>();ProductInfo productInfo = null;String url = "";for(int i = 0; i < 10; i++){try {System.out.println("JD Product 第[" + (i + 1) + "]页");if(i == 0) {url = jdUrl;}else{url = Constants.JDURL + pcu.getGbk(productName) + Constants.JDENC + Constants.JDPAGE + (i + 1);}System.out.println(url);Document document = Jsoup.connect(url).timeout(5000).get();Elements uls = document.select("ul[class=gl-warp clearfix]");Iterator<Element> ulIter = uls.iterator();while(ulIter.hasNext()) {Element ul = ulIter.next();Elements lis = ul.select("li[data-sku]");Iterator<Element> liIter = lis.iterator();while(liIter.hasNext()) {Element li = liIter.next();Element div = li.select("div[class=gl-i-wrap]").first();Elements title = div.select("div[class=p-name p-name-type-2]>a");String productName = title.attr("title"); //得到商品名称Elements price = div.select(".p-price>strong");String productPrice =price.attr("data-price"); //得到商品价格productInfo = new ProductInfo();productInfo.setProductName(productName);productInfo.setProductPrice(productPrice);jdProductList.add(productInfo);}}} catch(Exception e) {System.out.println("Get JD product has error [" + url + "]");System.out.println(e.getMessage());}}return jdProductList;}public static void main(String[] args) {try {String productName = "书包";String jdUrl = Constants.JDURL + pcu.getGbk(productName)  + Constants.JDENC;List<ProductInfo> list = new JDProductList(jdUrl, productName).getProductList();System.out.println(list.size());for(ProductInfo pi : list){System.out.println(pi.getProductName() + "  " + pi.getProductPrice());}} catch (Exception e) {e.printStackTrace();}}}package 
com.iteye.injavawetrust.phantomjs;/*** * @author InJavaWeTrust**/
public class Constants {

    /** JD search URL prefix; the search keyword is appended to it. */
    public static final String JDURL = "http://search.jd.com/Search?keyword=";

    /** JD query parameter declaring the keyword encoding. */
    public static final String JDENC = "&enc=utf-8";

    /** JD paging parameter; the page number is appended to it. */
    public static final String JDPAGE = "&page=";

    /** Taobao search URL prefix; the search keyword is appended to it. */
    public static final String TBURL = "https://s.taobao.com/search?q=";

    /** Taobao paging parameter; the item offset is appended to it. */
    public static final String TBPAGE = "&s=";

    /** Timeout value. */
    public static final int TIMEOUT = 50000;

    /**
     * Path of the page-fetching script. Machine-specific; the trailing space
     * is intentional because callers concatenate command-line pieces.
     */
    public static final String SCRIPT = "E:\\InJavaWeTrust\\js\\getHtml.js ";

    /**
     * Path of phantomjs.exe. Machine-specific; the trailing space is
     * intentional because callers concatenate command-line pieces.
     */
    public static final String PHANTOMJSPATH = "D:\\Program Files\\phantomjs\\bin\\phantomjs.exe ";
}

package com.iteye.injavawetrust.phantomjs;

import java.io.Serializable;
import java.util.Date;/*** * @author InJavaWeTrust**/
public class ProductInfo implements Serializable {

    private static final long serialVersionUID = 8179244535272774089L;

    /** Product id. */
    private String productid;

    /** Product name. */
    private String productName;

    /** Product price. */
    private String productPrice;

    /** Number of sales per month. */
    private String tradeNum;

    /** Product URL. */
    private String productUrl;

    /** Name of the shop selling the product. */
    private String shopName;

    /** Name of the e-commerce platform. */
    private String ecName;

    /** Date on which the record was crawled and stored. */
    private Date date;

    /** @return the product id */
    public String getProductid() {
        return this.productid;
    }

    /** @param productid the product id */
    public void setProductid(String productid) {
        this.productid = productid;
    }

    /** @return the product name */
    public String getProductName() {
        return this.productName;
    }

    /** @param productName the product name */
    public void setProductName(String productName) {
        this.productName = productName;
    }

    /** @return the product price */
    public String getProductPrice() {
        return this.productPrice;
    }

    /** @param productPrice the product price */
    public void setProductPrice(String productPrice) {
        this.productPrice = productPrice;
    }

    /** @return the number of sales per month */
    public String getTradeNum() {
        return this.tradeNum;
    }

    /** @param tradeNum the number of sales per month */
    public void setTradeNum(String tradeNum) {
        this.tradeNum = tradeNum;
    }

    /** @return the product URL */
    public String getProductUrl() {
        return this.productUrl;
    }

    /** @param productUrl the product URL */
    public void setProductUrl(String productUrl) {
        this.productUrl = productUrl;
    }

    /** @return the shop name */
    public String getShopName() {
        return this.shopName;
    }

    /** @param shopName the shop name */
    public void setShopName(String shopName) {
        this.shopName = shopName;
    }

    /** @return the e-commerce platform name */
    public String getEcName() {
        return this.ecName;
    }

    /** @param ecName the e-commerce platform name */
    public void setEcName(String ecName) {
        this.ecName = ecName;
    }

    /** @return the crawl date */
    public Date getDate() {
        return this.date;
    }

    /** @param date the crawl date */
    public void setDate(Date date) {
        this.date = date;
    }
}

package com.iteye.injavawetrust.phantomjs;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.TimeZone;import org.apache.commons.logging.LogFactory;import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.HttpMethod;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.html.HtmlPage;/*** * @author InJavaWeTrust**/
public class PriceCheckUtil {

    private static final PriceCheckUtil instance = new PriceCheckUtil();

    private PriceCheckUtil() {
    }

    public static PriceCheckUtil getInstance() {
        return instance;
    }

    /**
     * Converts the product name by decoding its UTF-8 bytes as GBK.
     * <p>
     * NOTE(review): for non-ASCII names this yields garbled text, and it is not
     * URL encoding. Behaviour is kept as-is for existing callers; use
     * {@link #getUrlCode(String)} when building a query string.
     *
     * @param productName product name
     * @return the re-decoded name, or an empty string if a charset is unsupported
     */
    public String getGbk(String productName) {
        String retGbk = "";
        try {
            retGbk = new String(productName.getBytes("UTF-8"), "GBK");
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return retGbk;
    }

    /**
     * URL-encodes the product name as UTF-8 so it can be placed in a query string.
     *
     * @param productName product name
     * @return the encoded name, or an empty string if the charset is unsupported
     */
    public String getUrlCode(String productName) {
        String retUrlCode = "";
        try {
            retUrlCode = URLEncoder.encode(productName, "utf8");
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return retUrlCode;
    }

    /**
     * Finds the entry of {@code list} whose product name is most similar to
     * {@code productName} according to {@link #sim(String, String)}.
     * Every entry is scored, including the last one, which the previous loop
     * bound skipped. When several entries share the best score the last of
     * them is returned, as before.
     *
     * @param productName name to compare against
     * @param list        candidates
     * @return the most similar candidate, or {@code null} when {@code list}
     *         is {@code null} or empty
     */
    public ProductInfo getSimilarity(String productName, List<ProductInfo> list) {
        if (list == null || list.isEmpty()) {
            return null;
        }
        int bestIndex = 0;
        double bestScore = 0.0;
        for (int i = 0; i < list.size(); i++) {
            double score = sim(productName, list.get(i).getProductName());
            // ">=" keeps the historical tie-break: the last best entry wins.
            if (score >= bestScore) {
                bestScore = score;
                bestIndex = i;
            }
        }
        return list.get(bestIndex);
    }

    /**
     * Returns the smallest of three numbers.
     *
     * @param one   first value
     * @param two   second value
     * @param three third value
     * @return the minimum of the three
     */
    public int min(int one, int two, int three) {
        return Math.min(one, Math.min(two, three));
    }

    /**
     * Computes the Levenshtein distance (LD) between two strings.
     *
     * @param str1 first string
     * @param str2 second string
     * @return the edit distance between {@code str1} and {@code str2}
     */
    public int ld(String str1, String str2) {
        int n = str1.length();
        int m = str2.length();
        if (n == 0) {
            return m;
        }
        if (m == 0) {
            return n;
        }

        int[][] d = new int[n + 1][m + 1]; // distance matrix
        for (int i = 0; i <= n; i++) {
            d[i][0] = i; // first column
        }
        for (int j = 0; j <= m; j++) {
            d[0][j] = j; // first row
        }

        for (int i = 1; i <= n; i++) {
            char ch1 = str1.charAt(i - 1);
            for (int j = 1; j <= m; j++) {
                char ch2 = str2.charAt(j - 1);
                // substitution cost: 0 when the characters match, otherwise 1
                int cost = (ch1 == ch2) ? 0 : 1;
                // smallest of: cell above + 1, cell to the left + 1, diagonal + cost
                d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost);
            }
        }
        return d[n][m];
    }

    /**
     * Computes the similarity of two strings as
     * {@code 1 - ld / max(length)}.
     * Two empty strings are identical and score {@code 1.0}; the formula
     * alone would divide by zero and return NaN.
     *
     * @param str1 first string
     * @param str2 second string
     * @return similarity between 0 and 1
     */
    public double sim(String str1, String str2) {
        int longest = Math.max(str1.length(), str2.length());
        if (longest == 0) {
            return 1.0;
        }
        return 1 - (double) ld(str1, str2) / longest;
    }

    /**
     * Formats a number of milliseconds with the pattern HH:mm:ss (GMT).
     *
     * @param ms milliseconds
     * @return hh:mm:ss
     */
    public String msToss(long ms) {
        SimpleDateFormat formatter = new SimpleDateFormat("HH:mm:ss");
        formatter.setTimeZone(TimeZone.getTimeZone("GMT+00:00"));
        String ss = formatter.format(ms);
        return ss;
    }

    /**
     * Disables htmlunit log output.
     */
    public void offLog() {
        LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log",
                "org.apache.commons.logging.impl.NoOpLog");
    }

    /**
     * Loads a page with HtmlUnit and returns it as XML.
     * The web client is now closed even when loading fails.
     *
     * @param url page URL
     * @return the page as XML
     * @throws Exception if the page cannot be loaded
     */
    public String getXmlByHtmlunit(String url) throws Exception {
        offLog();
        WebClient webClient = new WebClient(BrowserVersion.CHROME);
        try {
            // 1 enable JavaScript
            webClient.getOptions().setJavaScriptEnabled(true);
            // 2 disable CSS, which avoids the follow-up requests made for rendering
            webClient.getOptions().setCssEnabled(false);
            // 3 enable client-side redirects
            webClient.getOptions().setRedirectEnabled(true);
            // 4 do not throw when a script fails
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            // 5 AJAX support
            webClient.setAjaxController(new NicelyResynchronizingAjaxController());
            // 6 timeout
            webClient.getOptions().setTimeout(Constants.TIMEOUT);

            WebRequest webRequest = new WebRequest(new URL(url));
            webRequest.setHttpMethod(HttpMethod.GET);
            HtmlPage page = webClient.getPage(webRequest);
            webClient.waitForBackgroundJavaScript(10000);
            return page.asXml();
        } finally {
            webClient.close();
        }
    }

    /**
     * Gets the HTML of a page by running the PhantomJS script on it.
     * <ul>
     * <li>The executable, script and URL are passed as separate arguments, so
     * spaces in the executable path or characters in the URL cannot split
     * the command line.</li>
     * <li>Output is decoded as GBK instead of the platform default charset.
     * NOTE(review): this assumes the script sets its output encoding to GBK;
     * confirm against the deployed getHtml.js.</li>
     * <li>Line breaks are kept, so adjacent lines are no longer fused.</li>
     * <li>The reader and the child process are always released.</li>
     * </ul>
     *
     * @param url page URL
     * @return the text printed by the script; empty or partial if an I/O error
     *         occurs (the error is printed to stderr)
     */
    public String getHtmlByPhantomjs(String url) {
        StringBuilder html = new StringBuilder();
        Process process = null;
        BufferedReader reader = null;
        try {
            // The constants carry a trailing space for string concatenation; strip it.
            ProcessBuilder builder = new ProcessBuilder(
                    Constants.PHANTOMJSPATH.trim(), Constants.SCRIPT.trim(), url);
            process = builder.start();
            InputStream is = process.getInputStream();
            reader = new BufferedReader(new InputStreamReader(is, "GBK"));
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line).append('\n');
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if closing fails
                }
            }
            if (process != null) {
                process.destroy();
            }
        }
        return html.toString();
    }
}

package com.iteye.injavawetrust.phantomjs;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;/*** * @author InJavaWeTrust**/
public class PriceCheckMain {

    private static PriceCheckUtil pcu = PriceCheckUtil.getInstance();

    /**
     * Crawls JD and Taobao for the given keyword and pairs every JD product
     * with the most similar Taobao product.
     * The keyword is URL-encoded before it is placed in the first-page URLs,
     * the same way the crawlers encode it for the following pages.
     *
     * @param productName search keyword
     * @return one map per JD product, see
     *         {@link #getProductFromUrls(String, String, String)}
     */
    public List<Map<String, ProductInfo>> getProductList(String productName) {
        String keyword = pcu.getUrlCode(productName);
        String jdUrl = Constants.JDURL + keyword + Constants.JDENC;
        String tbUrl = Constants.TBURL + keyword;
        return getProductFromUrls(jdUrl, tbUrl, productName);
    }

    /**
     * Crawls both sites and matches the results by name similarity.
     *
     * @param jdUrl       first-page URL for JD
     * @param tbUrl       first-page URL for Taobao
     * @param productName search keyword
     * @return one map per JD product: key "JD" holds the JD product and key
     *         "TB" the most similar Taobao product; "TB" is absent when no
     *         Taobao product was found
     */
    public List<Map<String, ProductInfo>> getProductFromUrls(String jdUrl, String tbUrl, String productName) {
        List<Map<String, ProductInfo>> retListMap = new ArrayList<Map<String, ProductInfo>>();
        List<ProductInfo> jdProductList = new JDProductList(jdUrl, productName).getProductList();
        List<ProductInfo> tbProductList = new TBProductList(tbUrl, productName).getProductList();
        for (ProductInfo jdProduct : jdProductList) {
            Map<String, ProductInfo> map = new HashMap<String, ProductInfo>();
            map.put("JD", jdProduct);
            // Without Taobao results there is nothing to match against; keep the
            // JD product instead of letting the lookup fail the whole comparison.
            if (!tbProductList.isEmpty()) {
                ProductInfo tbProduct = pcu.getSimilarity(jdProduct.getProductName(), tbProductList);
                if (tbProduct != null) {
                    map.put("TB", tbProduct);
                }
            }
            retListMap.add(map);
        }
        return retListMap;
    }

    /**
     * Reads a product name from stdin, runs the comparison and prints each
     * JD/Taobao pair followed by the elapsed time.
     */
    public static void main(String[] args) {
        System.out.println("输入商品名称:");
        Scanner scanner = new Scanner(System.in);
        String productName = scanner.next();
        scanner.close();
        System.out.println("京东和淘宝[" + productName + "]商品比价开始。。。。。。");
        try {
            long startTime = System.currentTimeMillis();
            List<Map<String, ProductInfo>> list = new PriceCheckMain().getProductList(productName);
            for (Map<String, ProductInfo> map : list) {
                ProductInfo jd = map.get("JD");
                ProductInfo tb = map.get("TB");
                String jdName = jd.getProductName();
                String jdPrice = jd.getProductPrice();
                // "N/A" marks a JD product for which no Taobao product was found.
                String tbName = (tb == null) ? "N/A" : tb.getProductName();
                String tbPrice = (tb == null) ? "N/A" : tb.getProductPrice();
                System.out.println("[" + jdName + "]  [" + tbName + "]");
                System.out.println("[" + jdPrice + "]  [" + tbPrice + "]");
                System.out.println("-----------------------------------------------------------");
            }
            long endTime = System.currentTimeMillis();
            System.out.println("用时 [" + pcu.msToss(endTime - startTime) + "]");
        } catch (Exception e) {
            System.out.println("error");
            System.out.println(e.getMessage());
        }
    }
}

运行结果:

输入商品名称:
铅笔
京东和淘宝[铅笔]商品比价开始。。。。。。
JD Product 第[1]页
http://search.jd.com/Search?keyword=铅笔&enc=utf-8
JD Product 第[2]页
http://search.jd.com/Search?keyword=閾呯瑪&enc=utf-8&page=2

。。。。。。。。。。。。

TB Product 第[1]页
https://s.taobao.com/search?q=铅笔
TB Product 第[2]页
https://s.taobao.com/search?q=%E9%93%85%E7%AC%94&s=44

。。。。。。。。。。。。。。。。。

[马可9002铅笔 马克三角铅 笔易握正姿木杆 安全无毒2H HB 2B HB HB]  [马可9001铅笔 三角形杆橡皮头 学生写字铅笔 HB 2B 满28元包邮]
[12.00]  [8.96]
-----------------------------------------------------------
用时 [00:01:35]

“java jsoup 网络爬虫 学习例子(八)京东和淘宝商品比价 PhantomJS”相关推荐

  1. java jsoup 网络爬虫 学习例子(六)京东和当当商品比价

    java jsoup 网络爬虫 学习例子(六)京东和当当商品比价 package com.iteye.injavawetrust.jdvsdd;import java.util.List;/*** * ...

  2. java jsoup 网络爬虫 学习例子(七)京东和淘宝商品比价 htmlunit

    java jsoup 网络爬虫 学习例子(七)京东和淘宝商品比价 htmlunit package com.iteye.injavawetrust.pricecheck;import java.uti ...

  3. python爬虫笔记(六)网络爬虫之实战(1)——淘宝商品比价定向爬虫(解决淘宝爬虫限制:使用cookies)...

    1.  淘宝商品信息定向爬虫 链接: https://www.taobao.com/ 2. 实例编写 2.1 整体框架 # -*- coding: utf-8 -*-import requests i ...

  4. java jsoup 网络爬虫 jsoup解析html Java爬虫 Jsoup爬虫 jsoup例子

    java jsoup 网络爬虫 java jsoup 网络爬虫 学习例子(一)抓取豆瓣电影名称+推荐星级 java jsoup 网络爬虫 学习例子(二)只抓取豆瓣电影5星(力荐)电影名称 java j ...

  5. python爬虫学习 之 定向爬取 淘宝商品价格

    python爬虫学习 之 定向爬取 淘宝商品价格 import requests import redef getHTMLText(url):try:r = requests.get(url, tim ...

  6. Java实现网络爬虫:爬取京东商品案例

    Java实现网络爬虫 爬取京东商品案例 需求分析 代码实现 爬取京东商品案例 需求分析 一.需求 抓取京东商城的数据,把商品数据保存到数据库. 二.功能分析 使用HttpClient发送一个get请求 ...

  7. 爬虫学习笔记:天猫(淘宝)评论数据爬虫

    目录 1.杂语 2.目的和方法 2.1 目的 2.2 爬虫方法 step1:获取cookie信息与评论url地址 step2:获取请求头信息user-agent step3:查看评论数据 step4: ...

  8. 爬虫学习笔记——Selenium爬取淘宝商品信息并保存

    在使用selenium来模拟浏览器操作,抓取淘宝商品信息前,先完成一些准备工作. 准备工作:需要安装selenium,pyquery,以及Chrome浏览器并配置ChromeDriver. 安装sel ...

  9. Python网络爬虫(6)--爬取淘宝模特图片

    经过前面的一些基础学习,我们大致知道了如何爬取并解析一个网页中的信息,这里我们来做一个更有意思的事情,爬取MM图片并保存.网址为https://mm.taobao.com/json/request_t ...

最新文章

  1. 64位php oracle,64位系统无法加载PHP的oracle扩展问题
  2. springmvc中获取request对象,加载biz(service)的方法
  3. MySQL Group Replication 介绍
  4. 如何在 WebAPI 中启用 CORS
  5. html比赛项目,趣味运动会最新个人比赛项目
  6. 中国数学会副理事长田刚委员:建议从四个方面加强教师队伍建设
  7. C#算法设计排序篇之09-基数排序(附带动画演示程序)
  8. 女生做产品经理好吗_谁说女生不适合做产品经理?
  9. r矢量球坐标系旋度_三个常用坐标系的认识及矢量旋度表达式的证明
  10. ZeroMQ的一些配置
  11. [转] 谈谈MIXI的开源SNS架构
  12. SQL Server 2005无日志文件附加数据库[转载]
  13. 从其他项目中复制过来的mapper加载不进bean_手把手带你玩转k8s-一键部署springboot项目...
  14. Dsoframer注册方法
  15. dojo实现省份地市级联报错(一)
  16. (转载)c++builder/delphi中透明panel及透明窗口的实现方法_delphi教程
  17. 计算机二级C语言知识点复习资料,精简版
  18. 多变量微积分笔记10——二重积分的应用
  19. Python 每日一记31相关性矩阵建立
  20. svchost.exe病毒-任务栏怎么也关闭不掉的搜索框

热门文章

  1. 中国快递物流行业发展分析-快递100百递指数
  2. aotm对php加密,在 Atom 中使用 PHP-CS-Fixer
  3. 客户案例|保险行业借助智能外呼充分结合业务需求和实际应用场景快速筛选客户
  4. 我们的游戏世界(背包【仓库】,交易,任务,简单经济系统,装备)实现(基于仙剑demo聊聊游戏世界)第一篇谈谈交易
  5. C语言:字符函数与字符串函数(一)
  6. 赛门铁克新通告再犯严重错误 忽视大陆用户
  7. 【可视化大屏】屏幕多分辨率适配方案
  8. Riak学习(2):java连接Riak服务,使用Protocol Buffers连接
  9. 阿里云总裁胡晓明的重庆“缘”,妙不可言
  10. 应用OpenCV进行OCR字符识别