Scraping Qidian Novels with Java into a Local Folder and a Database

  • Table of Contents
    • Project Structure
    • Required Dependencies
    • Project Code
    • Output

Project Structure

This is my first web crawler; I based it on other people's code and worked through how everything is used myself.

Required Dependencies

The project is built with Maven, so here is the pom.xml directly:

```xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.cdw</groupId>
    <artifactId>webCrawler</artifactId>
    <version>1.0-SNAPSHOT</version>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
    <dependencies>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.9.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.2</version>
        </dependency>
        <dependency>
            <groupId>dom4j</groupId>
            <artifactId>dom4j</artifactId>
            <version>1.6.1</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.3</version>
        </dependency>
    </dependencies>
</project>
```

Project Code

NovelAssist.java — a shared helper passed to the worker threads. It carries the novel-list and chapter queues, the locks and latch, and keeps all the scraping rules in one place.

```java
package graspTheNovel.entity;

import org.jsoup.nodes.Document;

import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.locks.ReentrantReadWriteLock;

/**
 * @program: webCrawler
 * @description: novel helper class
 * @author: cdw
 * @create: 2020-10-01 20:48
 **/
public class NovelAssist {
    String url;                                         // site URL
    Queue<Map<String, List<Document>>> directoryQueue;  // chapter-page queue
    Queue<List<Map<String, String>>> NovelList;         // novel-list queue
    ReentrantReadWriteLock listRwl;                     // list read/write lock
    ReentrantReadWriteLock directoryRwl;                // directory read/write lock
    CountDownLatch latch;                               // completion latch

    // same HTML page
    String listRule;       // novel-list rule
    String nextRule;       // list next-page rule
    // same HTML page
    String directoryRule;  // novel directory (table of contents) rule
    // same HTML page
    String titleRule;      // chapter-title rule
    String contentsRule;   // chapter-content rule

    public String getUrl() { return url; }
    public void setUrl(String url) { this.url = url; }
    public Queue<Map<String, List<Document>>> getDirectoryQueue() { return directoryQueue; }
    public void setDirectoryQueue(Queue<Map<String, List<Document>>> directoryQueue) { this.directoryQueue = directoryQueue; }
    public Queue<List<Map<String, String>>> getNovelList() { return NovelList; }
    public void setNovelList(Queue<List<Map<String, String>>> novelList) { NovelList = novelList; }
    public ReentrantReadWriteLock getListRwl() { return listRwl; }
    public void setListRwl(ReentrantReadWriteLock listRwl) { this.listRwl = listRwl; }
    public ReentrantReadWriteLock getDirectoryRwl() { return directoryRwl; }
    public void setDirectoryRwl(ReentrantReadWriteLock directoryRwl) { this.directoryRwl = directoryRwl; }
    public CountDownLatch getLatch() { return latch; }
    public void setLatch(CountDownLatch latch) { this.latch = latch; }
    public String getListRule() { return listRule; }
    public void setListRule(String listRule) { this.listRule = listRule; }
    public String getNextRule() { return nextRule; }
    public void setNextRule(String nextRule) { this.nextRule = nextRule; }
    public String getDirectoryRule() { return directoryRule; }
    public void setDirectoryRule(String directoryRule) { this.directoryRule = directoryRule; }
    public String getTitleRule() { return titleRule; }
    public void setTitleRule(String titleRule) { this.titleRule = titleRule; }
    public String getContentsRule() { return contentsRule; }
    public void setContentsRule(String contentsRule) { this.contentsRule = contentsRule; }
}
```
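To make the role of these rule fields concrete, here is a small sketch of how they map to Qidian's CSS selectors. The selectors are exactly the ones NovelServiceImpl sets later in this post; the `RuleConfigExample` class name is just for illustration.

```java
import graspTheNovel.entity.NovelAssist;

public class RuleConfigExample {
    public static void main(String[] args) {
        // Selector rules as configured in NovelServiceImpl below.
        NovelAssist novelAssist = new NovelAssist();
        novelAssist.setUrl("https://www.qidian.com/all");
        novelAssist.setListRule(".cf li .book-mid-info h4 a"); // novel links on a list page
        novelAssist.setNextRule("a.lbf-pagination-next");      // next-page link on the list
        novelAssist.setDirectoryRule(".volume-wrap ul li a");  // chapter links on a novel page
        novelAssist.setTitleRule(".j_chapterName");            // chapter title on a chapter page
        novelAssist.setContentsRule(".j_readContent");         // chapter body on a chapter page
        System.out.println("list rule: " + novelAssist.getListRule());
    }
}
```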

HttpUtils.java — an HTTP utility class copied from someone else's code:

```java
package graspTheNovel.utils;

import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.cookie.Cookie;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import javax.net.ssl.SSLContext;
import java.io.IOException;
import java.security.GeneralSecurityException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * @program: webCrawler
 * @description: HTTP utility; sends http/https requests via HttpClient
 * @author: cdw
 * @create: 2020-10-01 15:07
 **/
public class HttpUtils {
    /** request timeout, default 20000 ms */
    private int timeout = 20000;
    /** cookie table */
    private Map<String, String> cookieMap = new HashMap<>();
    /** charset used to decode responses, default UTF-8 */
    private String charset = "UTF-8";

    private static HttpUtils httpUtils;

    private HttpUtils() {}

    /** get the singleton instance */
    public static HttpUtils getInstance() {
        if (httpUtils == null)
            httpUtils = new HttpUtils();
        return httpUtils;
    }

    /** clear the cookie map */
    public void invalidCookieMap() {
        cookieMap.clear();
    }

    public int getTimeout() { return timeout; }

    /** set the request timeout */
    public void setTimeout(int timeout) { this.timeout = timeout; }

    public String getCharset() { return charset; }

    /** set the response charset */
    public void setCharset(String charset) { this.charset = charset; }

    /** parse raw HTML into a jsoup Document */
    public static Document parseHtmlToDoc(String html) throws Exception {
        return removeHtmlSpace(html);
    }

    private static Document removeHtmlSpace(String str) {
        Document doc = Jsoup.parse(str);
        String result = doc.html().replace("&nbsp;", "");
        return Jsoup.parse(result);
    }

    /** execute a GET request and return the parsed Document */
    public Document executeGetAsDocument(String url) throws Exception {
        return parseHtmlToDoc(executeGet(url));
    }

    /** execute a GET request */
    public String executeGet(String url) throws Exception {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("Cookie", convertCookieMapToString(cookieMap));
        httpGet.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
        CloseableHttpClient httpClient = null;
        String str = "";
        try {
            httpClient = HttpClientBuilder.create().build();
            HttpClientContext context = HttpClientContext.create();
            CloseableHttpResponse response = httpClient.execute(httpGet, context);
            getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
            int state = response.getStatusLine().getStatusCode();
            if (state == 404) {
                str = "";
            }
            try {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    str = EntityUtils.toString(entity, charset);
                }
            } finally {
                response.close();
            }
        } catch (IOException e) {
            throw e;
        } finally {
            try {
                if (httpClient != null)
                    httpClient.close();
            } catch (IOException e) {
                throw e;
            }
        }
        return str;
    }

    /** execute an HTTPS GET request and return the parsed Document */
    public Document executeGetWithSSLAsDocument(String url) throws Exception {
        return parseHtmlToDoc(executeGetWithSSL(url));
    }

    public static String httpGetHeader(String url, String cook, String header) throws IOException {
        // open the connection
        Connection con = Jsoup.connect(url);
        // request headers, in particular the cookie
        con.header("Accept", "text/html, application/xhtml+xml, */*");
        con.header("Content-Type", "application/x-www-form-urlencoded");
        con.header("User-Agent", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0))");
        con.header("Cookie", cook);
        // send the request
        Connection.Response resp = con.method(Connection.Method.GET).execute();
        // read the cookie named __bsi
        String cookieValue = resp.cookie("__bsi");
        System.out.println("cookie __bsi value: " + cookieValue);
        // read all returned cookies
        Map<String, String> cookies = resp.cookies();
        System.out.println("all cookie values: " + cookies);
        // read one response header
        String headerValue = resp.header(header);
        System.out.println("header " + header + " value: " + headerValue);
        // read all response headers
        Map<String, String> headersOne = resp.headers();
        System.out.println("all header values: " + headersOne);
        return headerValue;
    }

    /** execute an HTTPS GET request */
    public String executeGetWithSSL(String url) throws Exception {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("Cookie", convertCookieMapToString(cookieMap));
        httpGet.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
        CloseableHttpClient httpClient = null;
        String str = "";
        try {
            httpClient = createSSLInsecureClient();
            HttpClientContext context = HttpClientContext.create();
            CloseableHttpResponse response = httpClient.execute(httpGet, context);
            getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
            int state = response.getStatusLine().getStatusCode();
            if (state == 404) {
                str = "";
            }
            try {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    str = EntityUtils.toString(entity, charset);
                }
            } finally {
                response.close();
            }
        } catch (IOException e) {
            throw e;
        } catch (GeneralSecurityException ex) {
            throw ex;
        } finally {
            try {
                if (httpClient != null)
                    httpClient.close();
            } catch (IOException e) {
                throw e;
            }
        }
        return str;
    }

    /** execute a POST request and return the parsed Document */
    public Document executePostAsDocument(String url, Map<String, String> params) throws Exception {
        return parseHtmlToDoc(executePost(url, params));
    }

    /** execute a POST request */
    public String executePost(String url, Map<String, String> params) throws Exception {
        String reStr = "";
        HttpPost httpPost = new HttpPost(url);
        httpPost.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
        httpPost.setHeader("Cookie", convertCookieMapToString(cookieMap));
        List<NameValuePair> paramsRe = new ArrayList<>();
        for (String key : params.keySet()) {
            paramsRe.add(new BasicNameValuePair(key, params.get(key)));
        }
        CloseableHttpClient httpclient = HttpClientBuilder.create().build();
        CloseableHttpResponse response;
        try {
            httpPost.setEntity(new UrlEncodedFormEntity(paramsRe));
            HttpClientContext context = HttpClientContext.create();
            response = httpclient.execute(httpPost, context);
            getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
            HttpEntity entity = response.getEntity();
            reStr = EntityUtils.toString(entity, charset);
        } catch (IOException e) {
            throw e;
        } finally {
            httpPost.releaseConnection();
        }
        return reStr;
    }

    /** execute an HTTPS POST request and return the parsed Document */
    public Document executePostWithSSLAsDocument(String url, Map<String, String> params) throws Exception {
        return parseHtmlToDoc(executePostWithSSL(url, params));
    }

    /** execute an HTTPS POST request */
    public String executePostWithSSL(String url, Map<String, String> params) throws Exception {
        String re = "";
        HttpPost post = new HttpPost(url);
        List<NameValuePair> paramsRe = new ArrayList<>();
        for (String key : params.keySet()) {
            paramsRe.add(new BasicNameValuePair(key, params.get(key)));
        }
        post.setHeader("Cookie", convertCookieMapToString(cookieMap));
        post.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
        CloseableHttpResponse response;
        try {
            CloseableHttpClient httpClientRe = createSSLInsecureClient();
            HttpClientContext contextRe = HttpClientContext.create();
            post.setEntity(new UrlEncodedFormEntity(paramsRe));
            response = httpClientRe.execute(post, contextRe);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                re = EntityUtils.toString(entity, charset);
            }
            getCookiesFromCookieStore(contextRe.getCookieStore(), cookieMap);
        } catch (Exception e) {
            throw e;
        }
        return re;
    }

    /**
     * POST with a JSON body
     *
     * @param url address
     * @param jsonBody json body
     */
    public String executePostWithJson(String url, String jsonBody) throws Exception {
        String reStr = "";
        HttpPost httpPost = new HttpPost(url);
        httpPost.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
        httpPost.setHeader("Cookie", convertCookieMapToString(cookieMap));
        CloseableHttpClient httpclient = HttpClientBuilder.create().build();
        CloseableHttpResponse response;
        try {
            httpPost.setEntity(new StringEntity(jsonBody, ContentType.APPLICATION_JSON));
            HttpClientContext context = HttpClientContext.create();
            response = httpclient.execute(httpPost, context);
            getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
            HttpEntity entity = response.getEntity();
            reStr = EntityUtils.toString(entity, charset);
        } catch (IOException e) {
            throw e;
        } finally {
            httpPost.releaseConnection();
        }
        return reStr;
    }

    /**
     * SSL POST with a JSON body
     *
     * @param url address
     * @param jsonBody json body
     */
    public String executePostWithJsonAndSSL(String url, String jsonBody) throws Exception {
        String re = "";
        HttpPost post = new HttpPost(url);
        post.setHeader("Cookie", convertCookieMapToString(cookieMap));
        post.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
        CloseableHttpResponse response;
        try {
            CloseableHttpClient httpClientRe = createSSLInsecureClient();
            HttpClientContext contextRe = HttpClientContext.create();
            post.setEntity(new StringEntity(jsonBody, ContentType.APPLICATION_JSON));
            response = httpClientRe.execute(post, contextRe);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                re = EntityUtils.toString(entity, charset);
            }
            getCookiesFromCookieStore(contextRe.getCookieStore(), cookieMap);
        } catch (Exception e) {
            throw e;
        }
        return re;
    }

    private void getCookiesFromCookieStore(CookieStore cookieStore, Map<String, String> cookieMap) {
        List<Cookie> cookies = cookieStore.getCookies();
        for (Cookie cookie : cookies) {
            cookieMap.put(cookie.getName(), cookie.getValue());
        }
    }

    private String convertCookieMapToString(Map<String, String> map) {
        String cookie = "";
        for (String key : map.keySet()) {
            cookie += (key + "=" + map.get(key) + "; ");
        }
        if (map.size() > 0) {
            cookie = cookie.substring(0, cookie.length() - 2);
        }
        return cookie;
    }

    /** create an SSL client that trusts all certificates */
    private static CloseableHttpClient createSSLInsecureClient() throws GeneralSecurityException {
        try {
            SSLContext sslContext = new SSLContextBuilder()
                    .loadTrustMaterial(null, (chain, authType) -> true).build();
            SSLConnectionSocketFactory sslConnectionSocketFactory =
                    new SSLConnectionSocketFactory(sslContext, (s, sslContextL) -> true);
            return HttpClients.custom().setSSLSocketFactory(sslConnectionSocketFactory).build();
        } catch (GeneralSecurityException e) {
            throw e;
        }
    }
}
```
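A short usage sketch for the utility, assuming only the class above (the target URL is just an example):

```java
import graspTheNovel.utils.HttpUtils;
import org.jsoup.nodes.Document;

public class HttpUtilsExample {
    public static void main(String[] args) throws Exception {
        HttpUtils http = HttpUtils.getInstance(); // singleton
        http.setTimeout(20000);                   // 20 s timeout
        // fetch a page over HTTPS and get back a parsed jsoup Document
        Document doc = http.executeGetWithSSLAsDocument("https://www.qidian.com/all");
        System.out.println(doc.title());
    }
}
```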
ReptileUtil.java — crawler utility class that wraps the parsing of site pages:
```java
package graspTheNovel.utils;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.locks.Lock;

/**
 * @program: webCrawler
 * @description: crawler utility class
 * @author: cdw
 * @create: 2020-10-01 20:14
 **/
public class ReptileUtil {
    static HttpUtils httpUtils = HttpUtils.getInstance();

    /* @description: fetch a Document for a URL while holding a lock
     * @param url
     * @param lock
     * @author cdw
     * @date 2020/10/1 21:22 */
    public static Document getDocumentOfHttps(String url, Lock lock) {
        Document document = null;
        lock.lock(); // acquire the lock
        try {
            document = httpUtils.executeGetWithSSLAsDocument(url);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            lock.unlock(); // release the lock
        }
        return document;
    }

    /* @description: fetch a Document over HTTPS
     * @param url
     * @author cdw
     * @date 2020/10/1 21:22 */
    public static Document getDocumentOfHttps(String url) {
        Document document = null;
        try {
            document = httpUtils.executeGetWithSSLAsDocument(url);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return document;
    }

    /* @description: extract the host from a URL
     * @param url
     * @author cdw
     * @date 2020/10/1 21:22 */
    public String getHost(String url) throws MalformedURLException {
        java.net.URL Url = new java.net.URL(url);
        return "https://" + Url.getHost(); // host name
    }

    /* @description: check whether a next-page link exists
     * @param doc
     * @param nextName id or class rule for the next-page link
     * @author cdw
     * @date 2020/10/1 21:23 */
    public static Map<String, Object> isNext(Document doc, String nextName) throws MalformedURLException {
        Map<String, Object> map = new HashMap<String, Object>();
        boolean flag = true;
        Elements next = doc.select(nextName);
        if (next.size() > 0) {
            map.put("url", "https:" + next.get(0).attr("href"));
        } else {
            flag = false;
        }
        map.put("flag", flag);
        return map;
    }

    /* @description: extract the chapter title and content
     * @param doc
     * @param titleRule chapter-title rule
     * @param contentsRule chapter-content rule
     * @author cdw
     * @date 2020/10/1 21:24 */
    public static List<String> getDetails(Document doc, String titleRule, String contentsRule) {
        List<String> list = new ArrayList<>();
        Elements titles = doc.select(titleRule);       // title
        Elements contents = doc.select(contentsRule);  // content
        for (Element title : titles) {
            list.add(title.text() + "\r\n");
        }
        for (Element content : contents) {
            list.add(content.text().replaceAll(" ", "\r\n") + "\r\n");
        }
        return list;
    }

    /* @description: fetch directory titles and URLs starting from a URL
     * @param url site URL
     * @param rule rule
     * @author cdw
     * @date 2020/10/1 21:24 */
    public static List<Map<String, String>> getNovelByUrlToList(String url, String rule) {
        Document doc = ReptileUtil.getDocumentOfHttps(url);
        url = "https:" + doc.select(rule).attr("href");
        doc = ReptileUtil.getDocumentOfHttps(url);
        List<Map<String, String>> listAll = new ArrayList<>();
        Elements titleUrls = doc.select(".volume-wrap ul li a"); // chapter links
        for (Element titleUrl : titleUrls) {
            Document document = ReptileUtil.getDocumentOfHttps("https:" + titleUrl.attr("href"));
            List<String> list = ReptileUtil.getDetails(document, ".j_chapterName", ".j_readContent");
            Map<String, String> map = new HashMap<>();
            map.put(list.get(0), list.get(0));
            System.out.println(list.get(0));
            listAll.add(map);
        }
        return listAll;
    }

    /* @description: extract directory titles and URLs from an already-parsed page
     * @param doc  parsed page
     * @param rule rule
     * @author cdw
     * @date 2020/10/1 21:25 */
    public static List<Map<String, String>> getNovelByUrlToList(Document doc, String rule) {
        List<Map<String, String>> list = new ArrayList<Map<String, String>>();
        if (doc != null && doc.childNodeSize() > 0) {
            Elements titleUrls = doc.select(rule); // titles
            for (Element titleUrl : titleUrls) {
                Map<String, String> map = new HashMap<String, String>();
                map.put("title", titleUrl.text());
                map.put("url", "https:" + titleUrl.attr("href"));
                list.add(map);
            }
        }
        return list;
    }
}
```
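A quick sketch of using ReptileUtil on a single chapter page. The chapter URL below is a placeholder, and the two selectors are the Qidian rules configured later in NovelServiceImpl:

```java
import graspTheNovel.utils.ReptileUtil;
import org.jsoup.nodes.Document;

import java.util.List;

public class ReptileUtilExample {
    public static void main(String[] args) {
        // fetch one chapter page (placeholder URL) and extract title + body
        Document doc = ReptileUtil.getDocumentOfHttps("https://read.qidian.com/chapter/xxx");
        if (doc == null) return; // network failure is logged inside ReptileUtil
        List<String> details = ReptileUtil.getDetails(doc, ".j_chapterName", ".j_readContent");
        System.out.println(details.get(0));                  // chapter title
        System.out.println(details.get(details.size() - 1)); // chapter content
    }
}
```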

Next come the three kinds of worker threads.
GetNovelListThread.java — the thread that fetches the novel list:

```java
package graspTheNovel.service;

import graspTheNovel.entity.NovelAssist;
import graspTheNovel.utils.ReptileUtil;
import org.jsoup.nodes.Document;

import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.locks.Lock;

/**
 * @program: webCrawler
 * @description: thread that fetches the novel list
 * @author: cdw
 * @create: 2020-10-01 20:08
 **/
public class GetNovelListThread implements Runnable {
    Lock w;                // write lock
    CountDownLatch latch;  // completion latch
    String url;            // site URL
    String listRule;
    String nextRule;
    Queue<List<Map<String, String>>> NovelList;

    public GetNovelListThread(NovelAssist na) {
        this.NovelList = na.getNovelList();
        this.w = na.getListRwl().writeLock();
        this.latch = na.getLatch();
        this.url = na.getUrl();
        this.listRule = na.getListRule();
        this.nextRule = na.getNextRule();
    }

    @Override
    public void run() {
        getAllNovelList(url, listRule, nextRule);
    }

    /* @description: fetch the complete novel list
     * @param url
     * @param listRule novel-list rule
     * @param nextRule next-page rule
     * @author cdw
     * @date 2020/10/1 21:14 */
    public void getAllNovelList(String url, String listRule, String nextRule) {
        Boolean flag = true;
        try {
            System.out.println(Thread.currentThread().getName() + " started");
            while (flag) {
                Document doc = ReptileUtil.getDocumentOfHttps(url);
                w.lock(); // acquire the list write lock
                try {
                    NovelList.offer(ReptileUtil.getNovelByUrlToList(doc, listRule));
                } finally {
                    w.unlock(); // release it even if parsing fails
                }
                Map<String, Object> map = ReptileUtil.isNext(doc, nextRule);
                flag = (Boolean) map.get("flag");
                url = (String) map.get("url");
                while (NovelList.size() > 500) {
                    System.out.println("NovelList too large; list fetcher sleeping 30 minutes");
                    Thread.sleep(1000 * 60 * 30);
                }
            }
            System.out.println("finished fetching the list");
        } catch (Exception e) {
            System.out.println("error while fetching the list");
            e.printStackTrace();
        } finally {
            latch.countDown();
            System.out.println("latch decremented");
        }
    }
}
```

GetDirectoryToQueueThread.java — the thread that fetches each novel's chapter directory and the chapter pages:

```java
package graspTheNovel.service;

import graspTheNovel.entity.NovelAssist;
import graspTheNovel.utils.ReptileUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.*;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.locks.Lock;

/**
 * @program: webCrawler
 * @description: thread that fetches chapter directories
 * @author: cdw
 * @create: 2020-10-01 20:10
 **/
public class GetDirectoryToQueueThread implements Runnable {
    String directoryRule;
    Queue<List<Map<String, String>>> NovelList;
    Queue<Map<String, List<Document>>> directoryQueue;
    Lock w;                // write lock
    Lock r;                // read lock
    CountDownLatch latch;  // completion latch

    public GetDirectoryToQueueThread(NovelAssist na) {
        this.r = na.getListRwl().readLock();
        this.w = na.getDirectoryRwl().writeLock();
        this.directoryRule = na.getDirectoryRule();
        this.directoryQueue = na.getDirectoryQueue();
        this.NovelList = na.getNovelList();
        this.latch = na.getLatch();
    }

    @Override
    public void run() {
        getNovelDirectory(directoryRule);
    }

    /* @description: fetch all chapter directories
     * @param directoryRule directory rule
     * @author cdw
     * @date 2020/10/1 21:46 */
    public void getNovelDirectory(String directoryRule) {
        System.out.println(Thread.currentThread().getName() + " directory thread started");
        boolean flag = true;
        try {
            while (flag) {
                if (NovelList.size() > 0) {
                    r.lock(); // acquire the list read lock
                    List<Map<String, String>> novelList;
                    try {
                        novelList = NovelList.poll();
                    } finally {
                        r.unlock();
                    }
                    for (Map<String, String> map : novelList) {
                        String url = map.get("url");
                        String title = map.get("title");
                        Document doc = ReptileUtil.getDocumentOfHttps(url);
                        if (doc != null && doc.childNodeSize() > 0) {
                            Elements titleUrls = doc.select(directoryRule); // chapter links
                            Map<String, List<Document>> directoryMap = new HashMap<String, List<Document>>();
                            List<Document> contentDocumentList = new ArrayList<Document>(); // chapter pages
                            for (Element titleUrl : titleUrls) {
                                Document document = ReptileUtil.getDocumentOfHttps("https:" + titleUrl.attr("href"));
                                contentDocumentList.add(document);
                            }
                            directoryMap.put(title, contentDocumentList);
                            w.lock();
                            try {
                                directoryQueue.offer(directoryMap);
                            } finally {
                                w.unlock();
                            }
                        }
                    }
                } else {
                    synchronized (NovelList) {
                        while (NovelList.size() == 0) {
                            try {
                                System.out.println("list queue empty, waiting for data");
                                Thread.sleep(10000);
                                System.out.println("rechecking the queue");
                            } catch (InterruptedException e) {
                                e.printStackTrace();
                            }
                        }
                    }
                }
            }
            System.out.println("directory reading finished");
        } catch (Exception e) {
            System.out.println("error while reading directories");
            e.printStackTrace();
        } finally {
            latch.countDown();
            System.out.println("latch decremented");
        }
    }
}
```

ProcessOfGrabInfoThread.java — parses the chapter pages and handles output (writes to local files and inserts into the database):

```java
package graspTheNovel.service;

import graspTheNovel.dao.NovelPersist;
import graspTheNovel.entity.NovelAssist;
import graspTheNovel.utils.ReptileUtil;
import org.jsoup.nodes.Document;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.locks.Lock;

/**
 * @program: webCrawler
 * @description: writes fetched directories and chapters to disk and the database
 * @author: cdw
 * @create: 2020-10-01 20:13
 **/
public class ProcessOfGrabInfoThread implements Runnable {
    Lock r;                // read lock
    CountDownLatch latch;  // completion latch
    Queue<Map<String, List<Document>>> directoryQueue; // directory queue
    String titleRule;      // title rule
    String contentsRule;   // content rule

    public ProcessOfGrabInfoThread(NovelAssist na) {
        this.directoryQueue = na.getDirectoryQueue();
        this.r = na.getDirectoryRwl().readLock();
        this.latch = na.getLatch();
        this.titleRule = na.getTitleRule();
        this.contentsRule = na.getContentsRule();
    }

    @Override
    public void run() {
        read(titleRule, contentsRule);
    }

    /* @description: on Windows a file name may not contain \ / : * ? " < > |,
     * so strip \ / : * ? | and replace " < > with '.
     * @param dirPath
     * @author cdw
     * @date 2020/9/29 20:32 */
    public static String replaceSpecialCharacters(String dirPath) {
        dirPath = dirPath.replaceAll("[/\\\\:*?|]", "");
        dirPath = dirPath.replaceAll("[\"<>]", "'");
        return dirPath;
    }

    /* @description: take chapter pages from the queue and persist them
     * @param titleRule chapter-title rule
     * @param contentsRule chapter-content rule
     * @author cdw
     * @date 2020/10/1 22:08 */
    public void read(String titleRule, String contentsRule) {
        System.out.println(Thread.currentThread().getName() + " started");
        boolean flag = true;
        try {
            String sql = "insert into novel (`name`,`title`,`content`) VALUES (?, ?, ?)"; // insert statement
            while (flag) {
                if (directoryQueue.size() > 0) {
                    r.lock();
                    Map<String, List<Document>> map;
                    try {
                        map = directoryQueue.poll();
                    } finally {
                        r.unlock();
                    }
                    for (String fileName : map.keySet()) {
                        String path = "D:\\idea\\work\\webCrawler\\file\\" + fileName;
                        FileWriter fw;
                        File file = new File(path);
                        if (!file.exists()) {
                            file.mkdirs(); // create the novel's folder
                        }
                        for (Document doc : map.get(fileName)) {
                            List<String> list = ReptileUtil.getDetails(doc, titleRule, contentsRule);
                            NovelPersist novelPersist = new NovelPersist();
                            List<String> sqlValues = new ArrayList<String>();
                            sqlValues.add(fileName);
                            for (String s : list) {
                                sqlValues.add(s);
                            }
                            novelPersist.executeUpdate(sql, sqlValues);
                            try {
                                // backslash separator on Windows (the original marked this line "change this")
                                String u = path + "\\" + replaceSpecialCharacters(list.get(0).trim()).trim() + ".txt";
                                fw = new FileWriter(u, true);
                                PrintWriter bw = new PrintWriter(fw);
                                bw.println(list.get(list.size() - 1));
                                bw.flush();
                                bw.close();
                            } catch (IOException e) {
                                System.out.println("write failed");
                                e.printStackTrace();
                            }
                        }
                    }
                } else {
                    synchronized (directoryQueue) {
                        while (directoryQueue.size() == 0) {
                            try {
                                System.out.println("directory queue empty, waiting for data");
                                Thread.sleep(30000);
                                System.out.println("rechecking the queue");
                            } catch (InterruptedException e) {
                                e.printStackTrace();
                            }
                        }
                    }
                }
            }
            System.out.println("writing to disk finished");
        } catch (Exception e) {
            System.out.println("error while writing to disk");
            e.printStackTrace();
        } finally {
            latch.countDown();
            System.out.println("latch decremented");
        }
    }
}
```
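ProcessOfGrabInfoThread relies on a NovelPersist DAO (package graspTheNovel.dao) that the post never shows. Below is a minimal JDBC sketch of what it might look like; the connection URL, credentials, database name, and table definition are my assumptions, not from the original:

```java
package graspTheNovel.dao;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.List;

/**
 * Hypothetical persistence helper; the real one is not shown in the post.
 * Assumed table:
 *   CREATE TABLE novel (
 *     id INT AUTO_INCREMENT PRIMARY KEY,
 *     `name` VARCHAR(255),    -- novel name
 *     `title` VARCHAR(255),   -- chapter title
 *     `content` LONGTEXT      -- chapter text
 *   );
 */
public class NovelPersist {
    // connection settings are assumptions; adjust to your environment
    private static final String URL =
            "jdbc:mysql://localhost:3306/novel_db?useUnicode=true&characterEncoding=UTF-8";
    private static final String USER = "root";
    private static final String PASSWORD = "root";

    static {
        try {
            Class.forName("com.mysql.jdbc.Driver"); // driver class for mysql-connector-java 5.1.x
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }

    /** Runs an insert/update, binding the values in order as strings. */
    public int executeUpdate(String sql, List<String> sqlValues) {
        try (Connection conn = DriverManager.getConnection(URL, USER, PASSWORD);
             PreparedStatement ps = conn.prepareStatement(sql)) {
            for (int i = 0; i < sqlValues.size(); i++) {
                ps.setString(i + 1, sqlValues.get(i));
            }
            return ps.executeUpdate();
        } catch (Exception e) {
            e.printStackTrace();
            return 0;
        }
    }
}
```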

NovelServiceImpl.java — implements the service interface; this is the main workflow:

```java
package graspTheNovel.service;

import graspTheNovel.entity.NovelAssist;
import graspTheNovel.utils.ReptileUtil;
import org.jsoup.nodes.Document;

import java.util.*;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.locks.ReentrantReadWriteLock;

/**
 * @program: webCrawler
 * @description: service implementation
 * @author: cdw
 * @create: 2020-10-01 20:06
 **/
public class NovelServiceImpl implements NovelService {
    ExecutorService es = Executors.newCachedThreadPool(); // thread pool
    protected static final int COUNTDOWN_LATCH = 4;
    volatile Queue<Map<String, List<Document>>> directoryList = new LinkedList<Map<String, List<Document>>>(); // novel directories
    volatile Queue<List<Map<String, String>>> NovelList = new LinkedList<List<Map<String, String>>>();         // novel list

    @Override
    public void getNovelByAll(String url) {
        CountDownLatch latch = new CountDownLatch(COUNTDOWN_LATCH); // one count per worker thread (4 threads)
        ReentrantReadWriteLock listRwl = new ReentrantReadWriteLock();
        ReentrantReadWriteLock directoryRwl = new ReentrantReadWriteLock();
        NovelAssist novelAssist = new NovelAssist();
        novelAssist.setUrl(url);
        novelAssist.setLatch(latch);
        novelAssist.setDirectoryQueue(directoryList);
        novelAssist.setNovelList(NovelList);
        novelAssist.setListRwl(listRwl);
        novelAssist.setDirectoryRwl(directoryRwl);
        novelAssist.setListRule(".cf li .book-mid-info h4 a"); // novel-list rule
        novelAssist.setNextRule("a.lbf-pagination-next");      // next-page rule
        novelAssist.setDirectoryRule(".volume-wrap ul li a");  // directory rule
        novelAssist.setTitleRule(".j_chapterName");            // chapter-title rule
        novelAssist.setContentsRule(".j_readContent");         // chapter-content rule
        // one thread reads the novel list
        es.submit(new GetNovelListThread(novelAssist));
        // two threads read the directories of the listed novels
        es.submit(new GetDirectoryToQueueThread(novelAssist));
        es.submit(new GetDirectoryToQueueThread(novelAssist));
        // one thread processes the fetched chapters
        es.submit(new ProcessOfGrabInfoThread(novelAssist));
        try {
            latch.await();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        System.out.println("done");
    }

    @Override
    public List<Map<String, String>> getNovelByName(String NovelName) {
        String url = "https://www.qidian.com/search?kw=" + NovelName;
        String rule = ".book-mid-info h4 a";
        return ReptileUtil.getNovelByUrlToList(url, rule);
    }
}
```

Finally, the test class:

```java
// imports added for completeness; place the class under your test source root
import graspTheNovel.service.NovelService;
import graspTheNovel.service.NovelServiceImpl;
import org.junit.Test;

import java.util.Date;
import java.util.List;
import java.util.Map;

/**
 * @program: webCrawler
 * @description: test class
 * @author: cdw
 * @create: 2020-09-28 15:01
 **/
public class Test01 {
    @Test
    public void test1() {
        // grab a single novel
        Long startTime, endTime;
        System.out.println("crawler: Qidian grab started..........");
        startTime = new Date().getTime();
        NovelService novelService = new NovelServiceImpl();
        List<Map<String, String>> novelList = novelService.getNovelByName("万族之劫");
        System.out.println(novelList);
        endTime = new Date().getTime();
        System.out.println("crawler: Qidian grab finished, took " + (endTime - startTime) + "ms");
    }

    @Test
    public void test2() {
        // grab everything
//        String url = "https://www.qidian.com/finish?action=hidden&orderId=&page=1&vip=0&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=2";
        String url = "https://www.qidian.com/all";
        NovelService novelService = new NovelServiceImpl();
        novelService.getNovelByAll(url);
    }
}
```

Output

Console output while the crawler runs (screenshot).

The novel list saved locally (screenshot).

Chapter directories written to local folders (screenshot).

Rows written to the database (screenshot).
