java jsoup抓取百度图片保存到本地，并压缩到指定大小（可指定宽高或者流大小）

代码如下：

package cn.xyz.commons.utils;

import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.imageio.ImageIO;

import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.google.common.io.Files;

import net.coobird.thumbnailator.Thumbnails;
import net.coobird.thumbnailator.builders.BufferedImageBuilder;
import net.coobird.thumbnailator.resizers.Resizers;

/**
 * Crawls Baidu image search results, stores the images in a local directory and can
 * shrink them either to a fixed width/height or below a target byte size.
 *
 * @author llad
 */
public class JsoupBaiduImg {

    private static final Log sop = LogFactory.getLog(JsoupBaiduImg.class);

    /** Number of results requested per search page (the {@code rn} query parameter). */
    private static final int PAGE_SIZE = 30;

    /** Timeout, in milliseconds, used for the search request and for each image download. */
    private static final int TIMEOUT_MILLIS = 5000;

    /** Files of at most this many bytes are treated as empty/broken downloads and removed. */
    private static final long MIN_IMAGE_BYTES = 1024;

    /** Matches one {@code objURL":"http://....jpg} entry; group 1 is the image URL itself. */
    private static final Pattern OBJ_URL_PATTERN = Pattern.compile("objURL\":\"(http://.+?\\.jpg)");

    /** Characters that must not end up in a local file name derived from a remote URL. */
    private static final Pattern ILLEGAL_FILE_NAME_CHARS = Pattern.compile("[\\\\/:*?\"<>|]");

    /** Source of the random prefix that keeps equally named remote files apart. */
    private static final Random RANDOM = new Random();

    public static void main(String[] args) throws Exception {
        String downloadPath = "D:/www/resources/robit/o";
        String downloadMinPath = "D:/www/resources/robit/t";
        // Several keywords may be passed in one string, separated by spaces.
        List<String> list = nameList("卡通头像唯美头像");
        // max = 1 downloads a single result page; one page holds 30 images.
        getPictures(list, 1, downloadPath, 640, 640, downloadMinPath, 1, 100, 100);
    }

    /**
     * Downloads the search results for every keyword, removes tiny (broken) files and
     * optionally writes resized copies of all downloaded images.
     *
     * @param keywordList     search keywords
     * @param max             number of result pages to fetch per keyword
     * @param downloadPath    directory the original images are stored in
     * @param bw              image width passed to the search request
     * @param bh              image height passed to the search request
     * @param downloadMinPath directory the resized copies are written to
     * @param type            0 = do not resize, 1 = resize after downloading
     * @param w               width of the resized copies
     * @param h               height of the resized copies
     * @throws Exception if the keyword cannot be encoded or the clean-up step fails
     */
    public static void getPictures(List<String> keywordList, int max, String downloadPath, int bw, int bh,
            String downloadMinPath, int type, int w, int h) throws Exception {
        // download() concatenates directory and file name, so the directory must end with '/'.
        String targetDir = downloadPath.endsWith("/") ? downloadPath : downloadPath + "/";
        File dir = new File(targetDir);
        if (!dir.exists()) {
            dir.mkdirs();
        }

        for (String keyword : keywordList) {
            // Keywords are typically non-ASCII and may contain '&', '#', ...: encode them for the query string.
            String encodedKeyword = URLEncoder.encode(keyword, "UTF-8");
            int picCount = 1;
            // page < max: "max" is the number of pages, so max = 1 fetches exactly one page.
            for (int page = 0; page < max; page++) {
                sop.debug("正在下载第" + page + "页面");
                int offset = page * PAGE_SIZE;
                try {
                    String url = "http://image.baidu.com/search/avatarjson?tn=resultjsonavatarnew&ie=utf-8&word="
                            + encodedKeyword + "&cg=star&pn=" + offset + "&rn=" + PAGE_SIZE
                            + "&itg=0&z=0&fr=&width=" + bw + "&height=" + bh
                            + "&lm=-1&ic=0&s=0&st=-1&gsm=" + Integer.toHexString(offset);
                    sop.debug(url);
                    Document document = Jsoup.connect(url).data("query", "Java")
                            .userAgent("Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)")
                            .timeout(TIMEOUT_MILLIS).get();
                    String xmlSource = StringEscapeUtils.unescapeHtml3(document.toString());
                    sop.debug(xmlSource);
                    Matcher m = OBJ_URL_PATTERN.matcher(xmlSource);
                    while (m.find()) {
                        String imageUrl = m.group(1);
                        sop.debug(keyword + picCount++ + ":" + imageUrl);
                        download(imageUrl, targetDir);
                        sop.debug(" 下载成功");
                    }
                } catch (IOException e) {
                    // One failing page must not stop the remaining pages/keywords.
                    sop.error("Failed to fetch result page " + page + " for keyword \"" + keyword + "\"", e);
                }
            }
        }
        sop.debug("下载完毕");
        delMultyFile(downloadPath);
        sop.debug("已经删除所有空图");
        if (type == 1) {
            compressPicForWH(w, h, downloadPath, downloadMinPath);
            sop.debug("压缩所有图片");
        }
    }

    /**
     * Recursively deletes every regular file of at most 1024 bytes below {@code path}.
     *
     * @param path directory (or file) to clean up
     * @return the value of {@link File#length()} for {@code path} after the clean-up
     * @throws RuntimeException if {@code path} does not exist
     */
    public static long delMultyFile(String path) {
        File file = new File(path);
        if (!file.exists()) {
            throw new RuntimeException("File \"" + path + "\" NotFound when excute the method of delMultyFile()....");
        }
        // listFiles() returns null when path is not a directory or cannot be read.
        File[] fileList = file.listFiles();
        if (fileList != null) {
            for (File f : fileList) {
                if (f.isDirectory()) {
                    delMultyFile(f.getAbsolutePath());
                } else if (f.length() <= MIN_IMAGE_BYTES) {
                    sop.debug(f.delete() + "---" + f.getName());
                }
            }
        }
        return file.length();
    }

    /**
     * Splits a keyword string into its parts. The first separator found in the order
     * {@code ","}, {@code "、"}, {@code " "} is used; without any separator the whole
     * string is the only keyword.
     *
     * @param nameList keywords joined by one kind of separator; {@code null} yields an empty list
     * @return a new mutable list with the individual keywords
     */
    public static List<String> nameList(String nameList) {
        List<String> arr = new ArrayList<>();
        if (nameList == null) {
            return arr;
        }
        String separator = null;
        if (nameList.contains(",")) {
            separator = ",";
        } else if (nameList.contains("、")) {
            separator = "、";
        } else if (nameList.contains(" ")) {
            separator = " ";
        }
        if (separator == null) {
            arr.add(nameList);
            return arr;
        }
        for (String s : nameList.split(separator)) {
            arr.add(s);
        }
        return arr;
    }

    /**
     * Downloads the image at {@code url} into the directory {@code path}. The local name is
     * {@code rt<random>_<remote file name>}. Failures are logged, never thrown, and a
     * partially written file is removed again.
     *
     * @param url  http(s) address of the image
     * @param path target directory including the trailing separator; created when missing
     */
    public static void download(String url, String path) {
        if (url == null || path == null || !url.startsWith("http")) {
            sop.debug("找不到该网络图片....");
            return;
        }

        File dirFile = new File(path);
        if (path.length() > 0 && !dirFile.exists()) {
            if (dirFile.mkdirs()) {
                sop.debug("creat document file \"" + path.substring(0, path.length() - 1) + "\" success...\n");
            } else if (!dirFile.isDirectory()) {
                sop.warn("Cannot create download directory \"" + path + "\"");
                return;
            }
        }

        // The remote name comes from an untrusted page: neutralise characters that are illegal in file names.
        String remoteName = url.substring(url.lastIndexOf("/") + 1);
        String downloadName = "rt" + RANDOM.nextInt(1000) + "_"
                + ILLEGAL_FILE_NAME_CHARS.matcher(remoteName).replaceAll("_");
        File file = new File(path + downloadName);

        HttpURLConnection httpCon = null;
        boolean complete = false;
        try {
            httpCon = (HttpURLConnection) new URL(url).openConnection();
            // Without timeouts a single unresponsive host would block the crawl forever.
            httpCon.setConnectTimeout(TIMEOUT_MILLIS);
            httpCon.setReadTimeout(TIMEOUT_MILLIS);
            // Open the remote stream first so a failing request does not leave an empty file behind.
            try (InputStream in = httpCon.getInputStream();
                    FileOutputStream fos = new FileOutputStream(file)) {
                byte[] buffer = new byte[8192];
                int num;
                while ((num = in.read(buffer)) != -1) {
                    fos.write(buffer, 0, num);
                }
            }
            complete = true;
        } catch (FileNotFoundException notFoundE) {
            sop.debug("找不到该网络图片....");
        } catch (IOException ioE) {
            sop.warn("产生IO异常.....", ioE);
        } finally {
            if (httpCon != null) {
                httpCon.disconnect();
            }
            if (!complete && file.exists() && !file.delete()) {
                sop.warn("Cannot remove incomplete download \"" + file.getPath() + "\"");
            }
        }
    }

    /**
     * Writes a {@code w} x {@code h} JPEG copy of every image found below {@code src}
     * (sub-directories included) into {@code tsrc}, keeping the file names. Files that
     * cannot be read as images are skipped; problems are logged, never thrown.
     *
     * @param w    width of the resized images
     * @param h    height of the resized images
     * @param src  directory containing the original images
     * @param tsrc directory the resized images are written to; created when missing
     */
    public static void compressPicForWH(int w, int h, String src, String tsrc) {
        if (w <= 0 || h <= 0) {
            sop.warn("【图片压缩】msg=图片压缩失败!" + "invalid target size " + w + "x" + h);
            return;
        }
        File file = new File(src);
        if (!file.exists()) {
            sop.warn("【图片压缩】msg=图片压缩失败!" + "File \"" + src
                    + "\" NotFound when excute the method of compressPicForWH()....");
            return;
        }

        File ft = new File(tsrc);
        if (!ft.exists()) {
            ft.mkdirs();
        }

        // listFiles() returns null when src is not a directory or cannot be read.
        File[] fileList = file.listFiles();
        if (fileList == null) {
            sop.warn("【图片压缩】msg=图片压缩失败!" + "\"" + src + "\" is not a readable directory");
            return;
        }
        for (File f : fileList) {
            if (f.isDirectory()) {
                // Never descend into the output directory: its files are already resized copies.
                if (!f.getAbsoluteFile().equals(ft.getAbsoluteFile())) {
                    compressPicForWH(w, h, f.getAbsolutePath(), tsrc);
                }
                continue;
            }
            // Handle failures per file so that one broken image does not abort the whole batch.
            try {
                BufferedImage srcImg = ImageIO.read(f);
                if (srcImg == null) {
                    // ImageIO.read yields null when no registered reader understands the file.
                    sop.warn("【图片压缩】msg=图片压缩失败!" + "\"" + f.getName() + "\" is not a readable image");
                    continue;
                }
                BufferedImage tarImg = new BufferedImageBuilder(w, h, BufferedImage.TYPE_3BYTE_BGR).build();
                Resizers.BILINEAR.resize(srcImg, tarImg);
                if (!ImageIO.write(tarImg, "jpg", new File(tsrc + "/" + f.getName()))) {
                    sop.warn("【图片压缩】msg=图片压缩失败!" + "no jpg writer available for \"" + f.getName() + "\"");
                }
            } catch (IOException | RuntimeException e) {
                sop.warn("【图片压缩】msg=图片压缩失败!" + f.getName(), e);
            }
        }
        sop.debug(fileList.length);
    }

    /**
     * Repeatedly scales an encoded image down and lowers its quality until it is no larger
     * than {@code desFileSize} KB. Stops early when a pass no longer shrinks the data, so
     * an unreachable target cannot loop forever.
     *
     * @param imageBytes  encoded source image
     * @param desFileSize target size in KB
     * @param imageId     identifier used in log messages only
     * @return the (possibly) compressed image bytes; the input itself when it is
     *         {@code null}, empty or already smaller than the target
     */
    public static byte[] compressPicForScale(byte[] imageBytes, long desFileSize, String imageId) {
        if (imageBytes == null || imageBytes.length <= 0 || imageBytes.length < desFileSize * 1024) {
            return imageBytes;
        }
        long srcSize = imageBytes.length;
        long targetBytes = desFileSize * 1024;
        double accuracy = getAccuracy(srcSize / 1024);
        try {
            while (imageBytes.length > targetBytes) {
                ByteArrayInputStream inputStream = new ByteArrayInputStream(imageBytes);
                ByteArrayOutputStream outputStream = new ByteArrayOutputStream(imageBytes.length);
                Thumbnails.of(inputStream).scale(accuracy).outputQuality(accuracy).toOutputStream(outputStream);
                byte[] shrunk = outputStream.toByteArray();
                if (shrunk.length >= imageBytes.length) {
                    // No further progress is possible: keep the best result instead of spinning.
                    sop.warn("【图片压缩】imageId=" + imageId + " | cannot compress below "
                            + imageBytes.length / 1024 + "kb");
                    break;
                }
                imageBytes = shrunk;
            }
            sop.debug("【图片压缩】imageId=" + imageId + " | 图片原大小=" + srcSize / 1024 + "kb | 压缩后大小="
                    + imageBytes.length / 1024 + "kb");
        } catch (IOException | RuntimeException e) {
            sop.warn("【图片压缩】msg=图片压缩失败!", e);
        }
        return imageBytes;
    }

    /**
     * Picks the scale/quality factor for one compression pass (empirical values).
     *
     * @param size size of the source image in KB
     * @return factor used both as scale and as output quality
     */
    private static double getAccuracy(long size) {
        if (size < 900) {
            return 0.85;
        }
        if (size < 2047) {
            return 0.6;
        }
        if (size < 3275) {
            return 0.44;
        }
        return 0.4;
    }

}

java jsoup抓取百度图片保存到本地，并压缩到指定大小（可指定宽高或者流大小）相关推荐

Python3.8抓取百度图片高清原图『原来如此简单』『最新记录贴』
网上爬取百度图片的五花八门,要么有的过时的,有的则是爬取的是缩略图,不是原图等. 鉴于此在学习的过程中,记录此贴.在写本文的时候,代码是可用的. 前言 1.首先要有第一个明白的地方是百度图片目前是动态 ...
python爬虫抓取百度图片_Python爬虫抓取百度的高清摄影图片
成果预览: 源代码: import requests import re url = 'https://image.baidu.com/search/index' headers = { 'User- ...
Python + PySpider 抓取百度图片搜索的图片
说明 1.PySpider 是一个方便并且功能强大的Python爬虫框架 2.PySpider 依赖于PhantomJS 3.windows平台,PySpider 与64位的Python兼容不太好,需 ...
Java实现抓取百度识图结果的实现和思路-2-Json的处理
上一篇文章的断点传输以后再讲_(:зゝ∠)_,先讲讲json的处理什么是json???举个例子似乎短了点,如果用昨天的postURL案例得到的网址去访问的话,也可能会得到类似的结果,不过真实情况是 ...
Python 爬虫多线程爬取美女图片保存到本地
Warning 我们不是生产者,我们只是搬运工资源来自于qiubaichengren ,代码基于Python 3.5.2 友情提醒:血气方刚的骚年.请谨慎阅图 !!! 谨慎阅图 !!! 谨慎 ...
java爬虫写一个百度图片下载器
文章目录 img_download 1.0 看看效果吧 2.0 了解一下 "图片下载器软件" 目录结构 3.0 如何使用? 4.0 源码剖析 5.0 项目地址 6.0 写在最后的话 ...
Python爬虫抓取百度搜索图片
最近玩机器学习,想搞一个关于识别动漫图片的训练集,苦于没有太多的动漫图片,后来忽然想到百度图片可以拿来用,于是乎写了个简单的爬虫,用来抓取百度图片(关于某个关键字的图片) 第一步,找到搜索图片的url ...
Python依据单个关键词爬取百度图片
最近由于工作需要要使用大量的水果蔬菜图片,故萌生使用爬虫抓取百度图片的想法,并未用于商业用途,只是为了测试数据.所以并未使用多线程.框架等技术. 由于百度图片是动态加载的,发现搜索关键词后action ...
java爬取验证码图片_JAVA HttpClient实现页面信息抓取(获取图片验证码并传入cookie实现信息获取)...
JAVA HttpClient实现页面信息抓取(获取图片验证码并传入cookie实现信息获取) 发布时间:2018-05-18 16:41, 浏览次数:632 , 标签: JAVA HttpClien ...

java jsoup抓取百度图片保存到本地，并压缩到指定大小（可指定宽高或者流大小）

java jsoup抓取百度图片保存到本地，并压缩到指定大小（可指定宽高或者流大小）相关推荐

最新文章

热门文章