用Jsoup抓取长颈鹿但丁图片

(官网似乎已改版，此代码没用了)

1、在pom文件中配置依赖，或者手动添加jsoup-1.6.3.jar包：

   <dependencies><dependency><groupId>org.jsoup</groupId><artifactId>jsoup</artifactId><version>1.6.3</version></dependency></dependencies>

2、抓取长颈鹿但丁图片URL:

package com.sxit.jsoup;import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;/*** 功能：抓取长颈鹿但丁图片* 类名:jsoupPic* 作者:smile* 时间:Nov 11, 2012：2:17:57 PM*/
public class jsoupPic {public static List<String> getDocument() {List<String> list = new ArrayList<String>();try {Connection con = null;// 分页后缀String[] a = { "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z" };// 遍历语录漫画下面四个项目（哲理、职场、爱情、恶搞）for (int i = 1; i <= 4; i++) {// 单个项目的第一页urlString url = "http://www.danding.com.cn/pic_fl_" + i + ".html";// 建立连接con = Jsoup.connect(url);// 获取页面documentDocument doc = con.get();// 获取末尾是jpg的img标签元素Elements e = doc.select("img[src$=.jpg]");// 遍历第一页jpg图片的路径for (int j = 0; j < e.size(); j++) {Element ei = e.get(j);// System.out.println("第"+i+"页图片地址为----------->>>>>// http://www.danding.com.cn/"+ei.attr("src"));list.add("http://www.danding.com.cn/" + ei.attr("src"));}int flag = 0;while (flag == 0) {// 当前页是否存在下一页boolean isExist = true;isExist = isExistsNextPage(doc);int k = 0;while (isExist) {// System.out.println("----------------->>>存在下一页");// 下一页的url地址url = "http://www.danding.com.cn/pic_fl_" + i + a[k] + ".html";doc = traverse(url, list);isExist = isExistsNextPage(doc);k++;}flag = 1;}}} catch (IOException e) {e.printStackTrace();}return list;}/*** 判断是否有下一页* * @param doc* @return*/public static boolean isExistsNextPage(Document doc) {// 判断当前页是否还有下一页Elements e = doc.select(":containsOwn(下一页)");if (e.size() > 0) { // 有下一页return true;} elsereturn false;}/*** 遍历document* * @param list* @param doc* @throws IOException*/public static Document traverse(String src, List<String> list) throws IOException {Connection con = Jsoup.connect(src);Document doc = con.get();// 获取末尾是jpg的标签元素Elements e = doc.select("img[src$=.jpg]");for (int j = 0; j < e.size(); j++) {Element ei = e.get(j);list.add("http://www.danding.com.cn/" + ei.attr("src"));}return doc;}
}

3、批量下载到本地：

package com.sxit.jsoup;import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;/*** 功能：批量下载* 类名:ThreadPoolManage* 作者:smile* 时间:Nov 11, 2012：6:02:49 PM*/
public class ThreadPoolManage {final ExecutorService exec = Executors.newFixedThreadPool(20);private String filePath;private List<String> list;// 结束的倒数锁final CountDownLatch stop = new CountDownLatch(20);public static void main(String[] args) {new ThreadPoolManage("D://xxooThread");}public ThreadPoolManage(String filePath) {list = jsoupPic.getDocument();// 启20个线程跑,每个线程只跑总数的1/20,第一个线程跑1-->list.size()/20,// 第二个线程从list.size()/20+1-->2*list.size()/20,最后一个线程则跑(n-1)*list.size()/20+1-->list.size()for (int i = 1; i <= 20; i++) {this.exec.submit(new ImageThread(i, filePath, list, stop));}try {// 等待stop变为0stop.await();} catch (InterruptedException e) {e.printStackTrace();}// 等所有线程跑完最后关闭ExecutorServiceexec.shutdown();}
}class ImageThread implements Runnable {private String filePath;private List<String> list;private int index;private final CountDownLatch stop;public ImageThread(int index, String filePath, List<String> ilistst, CountDownLatch stop) {this.index = index;this.filePath = filePath;this.list = ilistst;this.stop = stop;}public void run() {String picUrl = "";InputStream is = null;OutputStream os = null;URL url = null;HttpURLConnection con = null;// 判断保存路径是否存在 不存在则新建文件夹File f = new File(filePath);File temp = null;if (!f.exists()) {f.mkdir();}if (list != null) {// 每次需要跑的数目int count = list.size() / 20;int start = (index - 1) * count + 1;int end = 0;if (index != 20) {end = index * count;} else {end = list.size() - 1;}for (int i = start; i <= end; i++) {picUrl = list.get(i);try {url = new URL(picUrl);con = (HttpURLConnection) url.openConnection();// 设置连接超时con.setConnectTimeout(100 * 1000);// 设置读取超时con.setReadTimeout(100 * 1000);is = new BufferedInputStream(con.getInputStream());os = new BufferedOutputStream(new FileOutputStream(new File(filePath + "/" + i + ".jpg")));byte[] b = new byte[1024];int length = 0;while ((length = is.read(b)) != -1) {os.write(b, 0, length);}os.flush();System.out.println(index + "号线程----------------->>>>>>>保存完第" + i + "张");} catch (Exception e) {System.out.println(index + "号线程跑到第" + start + "张图片+++++++++++++抛出异常,异常信息为：" + e.getMessage());// 抛出异常捕获，继续执行continue;}}try {if (is != null) {is.close();}if (os != null) {os.close();}} catch (IOException e) {e.printStackTrace();} finally {// 当前线程完成,减1this.stop.countDown();}}}
}

4、源码如下

用Jsoup抓取长颈鹿但丁图片相关推荐

springboot+jsoup抓取新闻网站信息
springboot+jsoup抓取新闻网站信息步骤: 一.导入jar包二.解析凤凰网新闻 jsoup获取动态生成的js内容 service serviceImpl mapper domian 步 ...
使用Jsoup抓取京东图书分类页面图书信息
一.目的: 1.任务使用 Jsoup抓取京东图书分类页面的图书信息. 抓取目标分类网址例如:https://list.jd.com/list.html?cat=1713,3259,3330 给与的某 ...
Jsoup抓取网页数据完成一个简易的Android新闻APP
前言:作为一个篮球迷,每天必刷NBA新闻.用了那么多新闻APP,就想自己能不能也做个简易的新闻APP.于是便使用Jsoup抓取了虎扑NBA新闻的数据,完成了一个简易的新闻APP.虽然没什么技术含量,但 ...
总说手机没有“好壁纸”，Python一次性抓取500张“美女”图片，够不够用！
作者 | 旧时晚风拂晓城编辑 | JackTian 来源 | 杰哥的IT之旅(ID:Jake_Internet) 原文链接:https://blog.csdn.net/fyfugoyfa ...
android 获取手机a标签页,Android关于对Jsoup抓取a标签和br标签之间的解决办法...
Jsoup官方给出的文档,链接:http://www.open-open.com/jsoup/ 描述问题: 学校教务处系统中,我想获取所有科目以及对应的成绩,因此我采用了Jsoup抓取采集成绩:fe ...
python美女源代码_单身程序员，每晚用python抓取百万张美女图片，连女友都不想找了...
每当夜深人静时,这位长期单身的程序员就会起床开电脑,然后用python抓取百万张美女图片,存进U盘,目的目前还不知道,但技术是万能的,这样一来,可能连找女朋友的钱都省了. 其实,还有更好看的! 而且还 ...
看腻了杨幂，热巴，快来抓取上千张美女图片，古装美女看个够
大家好,我是菜鸟哥! 最近的一段时间以来,小编都没有为大家送上一些小姐姐福利,今天小编就带领大家来抓取古装小姐姐的美照,通过程序来抓取上千张古装美女的图片,一起来看看吧. 01.程序讲解对于古装美女 ...
python获取网页图片_python抓取网页中的图片示例
python抓取网页中的图片示例代码如下: #coding:utf8 import re import urllib def getHTML(url): page = urllib.urlopen( ...
抓取网页上的图片(一)
抓取网页上的图片思路:网页(HTML)中的图片通常在img标签中,图片的链接通常在标签的src属性中,通过BeautifulSoup解析HTML,找到所有的img标签,获取每一个标签中的src的属性 ...

用Jsoup抓取长颈鹿但丁图片

用Jsoup抓取长颈鹿但丁图片相关推荐

最新文章

热门文章