Scraping Qidian Novels with Java into a Local Folder and a Database

  • Table of Contents
    • Project Structure
    • Required Dependencies
    • Project Code
    • Output

Project Structure

This is my first web crawler; I based it on other people's code and worked through how everything is used myself.

Required Dependencies

The project is built with Maven, so here is the pom.xml directly:

```xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.cdw</groupId>
    <artifactId>webCrawler</artifactId>
    <version>1.0-SNAPSHOT</version>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
    <dependencies>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.9.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.2</version>
        </dependency>
        <dependency>
            <groupId>dom4j</groupId>
            <artifactId>dom4j</artifactId>
            <version>1.6.1</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.3</version>
        </dependency>
    </dependencies>
</project>
```

Project Code

NovelAssist.java — a shared helper passed to the worker threads. It carries the novel-list and chapter queues, the locks and latch, and keeps all the scraping rules in one place.

```java
package graspTheNovel.entity;

import org.jsoup.nodes.Document;

import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.locks.ReentrantReadWriteLock;

/**
 * @program: webCrawler
 * @description: novel helper class
 * @author: cdw
 * @create: 2020-10-01 20:48
 **/
public class NovelAssist {
    String url;                                         // site URL
    Queue<Map<String, List<Document>>> directoryQueue;  // chapter-page queue
    Queue<List<Map<String, String>>> NovelList;         // novel-list queue
    ReentrantReadWriteLock listRwl;                     // list read/write lock
    ReentrantReadWriteLock directoryRwl;                // directory read/write lock
    CountDownLatch latch;                               // completion latch

    // same HTML page
    String listRule;       // novel-list rule
    String nextRule;       // list next-page rule
    // same HTML page
    String directoryRule;  // novel directory (table of contents) rule
    // same HTML page
    String titleRule;      // chapter-title rule
    String contentsRule;   // chapter-content rule

    public String getUrl() { return url; }
    public void setUrl(String url) { this.url = url; }
    public Queue<Map<String, List<Document>>> getDirectoryQueue() { return directoryQueue; }
    public void setDirectoryQueue(Queue<Map<String, List<Document>>> directoryQueue) { this.directoryQueue = directoryQueue; }
    public Queue<List<Map<String, String>>> getNovelList() { return NovelList; }
    public void setNovelList(Queue<List<Map<String, String>>> novelList) { NovelList = novelList; }
    public ReentrantReadWriteLock getListRwl() { return listRwl; }
    public void setListRwl(ReentrantReadWriteLock listRwl) { this.listRwl = listRwl; }
    public ReentrantReadWriteLock getDirectoryRwl() { return directoryRwl; }
    public void setDirectoryRwl(ReentrantReadWriteLock directoryRwl) { this.directoryRwl = directoryRwl; }
    public CountDownLatch getLatch() { return latch; }
    public void setLatch(CountDownLatch latch) { this.latch = latch; }
    public String getListRule() { return listRule; }
    public void setListRule(String listRule) { this.listRule = listRule; }
    public String getNextRule() { return nextRule; }
    public void setNextRule(String nextRule) { this.nextRule = nextRule; }
    public String getDirectoryRule() { return directoryRule; }
    public void setDirectoryRule(String directoryRule) { this.directoryRule = directoryRule; }
    public String getTitleRule() { return titleRule; }
    public void setTitleRule(String titleRule) { this.titleRule = titleRule; }
    public String getContentsRule() { return contentsRule; }
    public void setContentsRule(String contentsRule) { this.contentsRule = contentsRule; }
}
```
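To make the role of these rule fields concrete, here is a small sketch of how they map to Qidian's CSS selectors. The selectors are exactly the ones NovelServiceImpl sets later in this post; the `RuleConfigExample` class name is just for illustration.

```java
import graspTheNovel.entity.NovelAssist;

public class RuleConfigExample {
    public static void main(String[] args) {
        // Selector rules as configured in NovelServiceImpl below.
        NovelAssist novelAssist = new NovelAssist();
        novelAssist.setUrl("https://www.qidian.com/all");
        novelAssist.setListRule(".cf li .book-mid-info h4 a"); // novel links on a list page
        novelAssist.setNextRule("a.lbf-pagination-next");      // next-page link on the list
        novelAssist.setDirectoryRule(".volume-wrap ul li a");  // chapter links on a novel page
        novelAssist.setTitleRule(".j_chapterName");            // chapter title on a chapter page
        novelAssist.setContentsRule(".j_readContent");         // chapter body on a chapter page
        System.out.println("list rule: " + novelAssist.getListRule());
    }
}
```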

HttpUtils.java — an HTTP utility class copied from someone else's code:

```java
package graspTheNovel.utils;

import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.cookie.Cookie;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import javax.net.ssl.SSLContext;
import java.io.IOException;
import java.security.GeneralSecurityException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * @program: webCrawler
 * @description: HTTP utility; sends http/https requests via HttpClient
 * @author: cdw
 * @create: 2020-10-01 15:07
 **/
public class HttpUtils {
    /** request timeout, default 20000 ms */
    private int timeout = 20000;
    /** cookie table */
    private Map<String, String> cookieMap = new HashMap<>();
    /** charset used to decode responses, default UTF-8 */
    private String charset = "UTF-8";

    private static HttpUtils httpUtils;

    private HttpUtils() {}

    /** get the singleton instance */
    public static HttpUtils getInstance() {
        if (httpUtils == null)
            httpUtils = new HttpUtils();
        return httpUtils;
    }

    /** clear the cookie map */
    public void invalidCookieMap() {
        cookieMap.clear();
    }

    public int getTimeout() { return timeout; }

    /** set the request timeout */
    public void setTimeout(int timeout) { this.timeout = timeout; }

    public String getCharset() { return charset; }

    /** set the response charset */
    public void setCharset(String charset) { this.charset = charset; }

    /** parse raw HTML into a jsoup Document */
    public static Document parseHtmlToDoc(String html) throws Exception {
        return removeHtmlSpace(html);
    }

    private static Document removeHtmlSpace(String str) {
        Document doc = Jsoup.parse(str);
        String result = doc.html().replace("&nbsp;", "");
        return Jsoup.parse(result);
    }

    /** execute a GET request and return the parsed Document */
    public Document executeGetAsDocument(String url) throws Exception {
        return parseHtmlToDoc(executeGet(url));
    }

    /** execute a GET request */
    public String executeGet(String url) throws Exception {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("Cookie", convertCookieMapToString(cookieMap));
        httpGet.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
        CloseableHttpClient httpClient = null;
        String str = "";
        try {
            httpClient = HttpClientBuilder.create().build();
            HttpClientContext context = HttpClientContext.create();
            CloseableHttpResponse response = httpClient.execute(httpGet, context);
            getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
            int state = response.getStatusLine().getStatusCode();
            if (state == 404) {
                str = "";
            }
            try {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    str = EntityUtils.toString(entity, charset);
                }
            } finally {
                response.close();
            }
        } catch (IOException e) {
            throw e;
        } finally {
            try {
                if (httpClient != null)
                    httpClient.close();
            } catch (IOException e) {
                throw e;
            }
        }
        return str;
    }

    /** execute an HTTPS GET request and return the parsed Document */
    public Document executeGetWithSSLAsDocument(String url) throws Exception {
        return parseHtmlToDoc(executeGetWithSSL(url));
    }

    public static String httpGetHeader(String url, String cook, String header) throws IOException {
        // open the connection
        Connection con = Jsoup.connect(url);
        // request headers, in particular the cookie
        con.header("Accept", "text/html, application/xhtml+xml, */*");
        con.header("Content-Type", "application/x-www-form-urlencoded");
        con.header("User-Agent", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0))");
        con.header("Cookie", cook);
        // send the request
        Connection.Response resp = con.method(Connection.Method.GET).execute();
        // read the cookie named __bsi
        String cookieValue = resp.cookie("__bsi");
        System.out.println("cookie __bsi value: " + cookieValue);
        // read all returned cookies
        Map<String, String> cookies = resp.cookies();
        System.out.println("all cookie values: " + cookies);
        // read one response header
        String headerValue = resp.header(header);
        System.out.println("header " + header + " value: " + headerValue);
        // read all response headers
        Map<String, String> headersOne = resp.headers();
        System.out.println("all header values: " + headersOne);
        return headerValue;
    }

    /** execute an HTTPS GET request */
    public String executeGetWithSSL(String url) throws Exception {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("Cookie", convertCookieMapToString(cookieMap));
        httpGet.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
        CloseableHttpClient httpClient = null;
        String str = "";
        try {
            httpClient = createSSLInsecureClient();
            HttpClientContext context = HttpClientContext.create();
            CloseableHttpResponse response = httpClient.execute(httpGet, context);
            getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
            int state = response.getStatusLine().getStatusCode();
            if (state == 404) {
                str = "";
            }
            try {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    str = EntityUtils.toString(entity, charset);
                }
            } finally {
                response.close();
            }
        } catch (IOException e) {
            throw e;
        } catch (GeneralSecurityException ex) {
            throw ex;
        } finally {
            try {
                if (httpClient != null)
                    httpClient.close();
            } catch (IOException e) {
                throw e;
            }
        }
        return str;
    }

    /** execute a POST request and return the parsed Document */
    public Document executePostAsDocument(String url, Map<String, String> params) throws Exception {
        return parseHtmlToDoc(executePost(url, params));
    }

    /** execute a POST request */
    public String executePost(String url, Map<String, String> params) throws Exception {
        String reStr = "";
        HttpPost httpPost = new HttpPost(url);
        httpPost.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
        httpPost.setHeader("Cookie", convertCookieMapToString(cookieMap));
        List<NameValuePair> paramsRe = new ArrayList<>();
        for (String key : params.keySet()) {
            paramsRe.add(new BasicNameValuePair(key, params.get(key)));
        }
        CloseableHttpClient httpclient = HttpClientBuilder.create().build();
        CloseableHttpResponse response;
        try {
            httpPost.setEntity(new UrlEncodedFormEntity(paramsRe));
            HttpClientContext context = HttpClientContext.create();
            response = httpclient.execute(httpPost, context);
            getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
            HttpEntity entity = response.getEntity();
            reStr = EntityUtils.toString(entity, charset);
        } catch (IOException e) {
            throw e;
        } finally {
            httpPost.releaseConnection();
        }
        return reStr;
    }

    /** execute an HTTPS POST request and return the parsed Document */
    public Document executePostWithSSLAsDocument(String url, Map<String, String> params) throws Exception {
        return parseHtmlToDoc(executePostWithSSL(url, params));
    }

    /** execute an HTTPS POST request */
    public String executePostWithSSL(String url, Map<String, String> params) throws Exception {
        String re = "";
        HttpPost post = new HttpPost(url);
        List<NameValuePair> paramsRe = new ArrayList<>();
        for (String key : params.keySet()) {
            paramsRe.add(new BasicNameValuePair(key, params.get(key)));
        }
        post.setHeader("Cookie", convertCookieMapToString(cookieMap));
        post.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
        CloseableHttpResponse response;
        try {
            CloseableHttpClient httpClientRe = createSSLInsecureClient();
            HttpClientContext contextRe = HttpClientContext.create();
            post.setEntity(new UrlEncodedFormEntity(paramsRe));
            response = httpClientRe.execute(post, contextRe);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                re = EntityUtils.toString(entity, charset);
            }
            getCookiesFromCookieStore(contextRe.getCookieStore(), cookieMap);
        } catch (Exception e) {
            throw e;
        }
        return re;
    }

    /**
     * POST with a JSON body
     *
     * @param url address
     * @param jsonBody json body
     */
    public String executePostWithJson(String url, String jsonBody) throws Exception {
        String reStr = "";
        HttpPost httpPost = new HttpPost(url);
        httpPost.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
        httpPost.setHeader("Cookie", convertCookieMapToString(cookieMap));
        CloseableHttpClient httpclient = HttpClientBuilder.create().build();
        CloseableHttpResponse response;
        try {
            httpPost.setEntity(new StringEntity(jsonBody, ContentType.APPLICATION_JSON));
            HttpClientContext context = HttpClientContext.create();
            response = httpclient.execute(httpPost, context);
            getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
            HttpEntity entity = response.getEntity();
            reStr = EntityUtils.toString(entity, charset);
        } catch (IOException e) {
            throw e;
        } finally {
            httpPost.releaseConnection();
        }
        return reStr;
    }

    /**
     * SSL POST with a JSON body
     *
     * @param url address
     * @param jsonBody json body
     */
    public String executePostWithJsonAndSSL(String url, String jsonBody) throws Exception {
        String re = "";
        HttpPost post = new HttpPost(url);
        post.setHeader("Cookie", convertCookieMapToString(cookieMap));
        post.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
        CloseableHttpResponse response;
        try {
            CloseableHttpClient httpClientRe = createSSLInsecureClient();
            HttpClientContext contextRe = HttpClientContext.create();
            post.setEntity(new StringEntity(jsonBody, ContentType.APPLICATION_JSON));
            response = httpClientRe.execute(post, contextRe);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                re = EntityUtils.toString(entity, charset);
            }
            getCookiesFromCookieStore(contextRe.getCookieStore(), cookieMap);
        } catch (Exception e) {
            throw e;
        }
        return re;
    }

    private void getCookiesFromCookieStore(CookieStore cookieStore, Map<String, String> cookieMap) {
        List<Cookie> cookies = cookieStore.getCookies();
        for (Cookie cookie : cookies) {
            cookieMap.put(cookie.getName(), cookie.getValue());
        }
    }

    private String convertCookieMapToString(Map<String, String> map) {
        String cookie = "";
        for (String key : map.keySet()) {
            cookie += (key + "=" + map.get(key) + "; ");
        }
        if (map.size() > 0) {
            cookie = cookie.substring(0, cookie.length() - 2);
        }
        return cookie;
    }

    /** create an SSL client that trusts all certificates */
    private static CloseableHttpClient createSSLInsecureClient() throws GeneralSecurityException {
        try {
            SSLContext sslContext = new SSLContextBuilder()
                    .loadTrustMaterial(null, (chain, authType) -> true).build();
            SSLConnectionSocketFactory sslConnectionSocketFactory =
                    new SSLConnectionSocketFactory(sslContext, (s, sslContextL) -> true);
            return HttpClients.custom().setSSLSocketFactory(sslConnectionSocketFactory).build();
        } catch (GeneralSecurityException e) {
            throw e;
        }
    }
}
```
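A short usage sketch for the utility, assuming only the class above (the target URL is just an example):

```java
import graspTheNovel.utils.HttpUtils;
import org.jsoup.nodes.Document;

public class HttpUtilsExample {
    public static void main(String[] args) throws Exception {
        HttpUtils http = HttpUtils.getInstance(); // singleton
        http.setTimeout(20000);                   // 20 s timeout
        // fetch a page over HTTPS and get back a parsed jsoup Document
        Document doc = http.executeGetWithSSLAsDocument("https://www.qidian.com/all");
        System.out.println(doc.title());
    }
}
```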
ReptileUtil.java — crawler utility class that wraps the parsing of site pages:
```java
package graspTheNovel.utils;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.locks.Lock;

/**
 * @program: webCrawler
 * @description: crawler utility class
 * @author: cdw
 * @create: 2020-10-01 20:14
 **/
public class ReptileUtil {
    static HttpUtils httpUtils = HttpUtils.getInstance();

    /* @description: fetch a Document for a URL while holding a lock
     * @param url
     * @param lock
     * @author cdw
     * @date 2020/10/1 21:22 */
    public static Document getDocumentOfHttps(String url, Lock lock) {
        Document document = null;
        lock.lock(); // acquire the lock
        try {
            document = httpUtils.executeGetWithSSLAsDocument(url);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            lock.unlock(); // release the lock
        }
        return document;
    }

    /* @description: fetch a Document over HTTPS
     * @param url
     * @author cdw
     * @date 2020/10/1 21:22 */
    public static Document getDocumentOfHttps(String url) {
        Document document = null;
        try {
            document = httpUtils.executeGetWithSSLAsDocument(url);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return document;
    }

    /* @description: extract the host from a URL
     * @param url
     * @author cdw
     * @date 2020/10/1 21:22 */
    public String getHost(String url) throws MalformedURLException {
        java.net.URL Url = new java.net.URL(url);
        return "https://" + Url.getHost(); // host name
    }

    /* @description: check whether a next-page link exists
     * @param doc
     * @param nextName id or class rule for the next-page link
     * @author cdw
     * @date 2020/10/1 21:23 */
    public static Map<String, Object> isNext(Document doc, String nextName) throws MalformedURLException {
        Map<String, Object> map = new HashMap<String, Object>();
        boolean flag = true;
        Elements next = doc.select(nextName);
        if (next.size() > 0) {
            map.put("url", "https:" + next.get(0).attr("href"));
        } else {
            flag = false;
        }
        map.put("flag", flag);
        return map;
    }

    /* @description: extract the chapter title and content
     * @param doc
     * @param titleRule chapter-title rule
     * @param contentsRule chapter-content rule
     * @author cdw
     * @date 2020/10/1 21:24 */
    public static List<String> getDetails(Document doc, String titleRule, String contentsRule) {
        List<String> list = new ArrayList<>();
        Elements titles = doc.select(titleRule);       // title
        Elements contents = doc.select(contentsRule);  // content
        for (Element title : titles) {
            list.add(title.text() + "\r\n");
        }
        for (Element content : contents) {
            list.add(content.text().replaceAll(" ", "\r\n") + "\r\n");
        }
        return list;
    }

    /* @description: fetch directory titles and URLs starting from a URL
     * @param url site URL
     * @param rule rule
     * @author cdw
     * @date 2020/10/1 21:24 */
    public static List<Map<String, String>> getNovelByUrlToList(String url, String rule) {
        Document doc = ReptileUtil.getDocumentOfHttps(url);
        url = "https:" + doc.select(rule).attr("href");
        doc = ReptileUtil.getDocumentOfHttps(url);
        List<Map<String, String>> listAll = new ArrayList<>();
        Elements titleUrls = doc.select(".volume-wrap ul li a"); // chapter links
        for (Element titleUrl : titleUrls) {
            Document document = ReptileUtil.getDocumentOfHttps("https:" + titleUrl.attr("href"));
            List<String> list = ReptileUtil.getDetails(document, ".j_chapterName", ".j_readContent");
            Map<String, String> map = new HashMap<>();
            map.put(list.get(0), list.get(0));
            System.out.println(list.get(0));
            listAll.add(map);
        }
        return listAll;
    }

    /* @description: extract directory titles and URLs from an already-parsed page
     * @param doc  parsed page
     * @param rule rule
     * @author cdw
     * @date 2020/10/1 21:25 */
    public static List<Map<String, String>> getNovelByUrlToList(Document doc, String rule) {
        List<Map<String, String>> list = new ArrayList<Map<String, String>>();
        if (doc != null && doc.childNodeSize() > 0) {
            Elements titleUrls = doc.select(rule); // titles
            for (Element titleUrl : titleUrls) {
                Map<String, String> map = new HashMap<String, String>();
                map.put("title", titleUrl.text());
                map.put("url", "https:" + titleUrl.attr("href"));
                list.add(map);
            }
        }
        return list;
    }
}
```
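A quick sketch of using ReptileUtil on a single chapter page. The chapter URL below is a placeholder, and the two selectors are the Qidian rules configured later in NovelServiceImpl:

```java
import graspTheNovel.utils.ReptileUtil;
import org.jsoup.nodes.Document;

import java.util.List;

public class ReptileUtilExample {
    public static void main(String[] args) {
        // fetch one chapter page (placeholder URL) and extract title + body
        Document doc = ReptileUtil.getDocumentOfHttps("https://read.qidian.com/chapter/xxx");
        if (doc == null) return; // network failure is logged inside ReptileUtil
        List<String> details = ReptileUtil.getDetails(doc, ".j_chapterName", ".j_readContent");
        System.out.println(details.get(0));                  // chapter title
        System.out.println(details.get(details.size() - 1)); // chapter content
    }
}
```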

Next come the three kinds of worker threads.
GetNovelListThread.java — the thread that fetches the novel list:

```java
package graspTheNovel.service;

import graspTheNovel.entity.NovelAssist;
import graspTheNovel.utils.ReptileUtil;
import org.jsoup.nodes.Document;

import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.locks.Lock;

/**
 * @program: webCrawler
 * @description: thread that fetches the novel list
 * @author: cdw
 * @create: 2020-10-01 20:08
 **/
public class GetNovelListThread implements Runnable {
    Lock w;                // write lock
    CountDownLatch latch;  // completion latch
    String url;            // site URL
    String listRule;
    String nextRule;
    Queue<List<Map<String, String>>> NovelList;

    public GetNovelListThread(NovelAssist na) {
        this.NovelList = na.getNovelList();
        this.w = na.getListRwl().writeLock();
        this.latch = na.getLatch();
        this.url = na.getUrl();
        this.listRule = na.getListRule();
        this.nextRule = na.getNextRule();
    }

    @Override
    public void run() {
        getAllNovelList(url, listRule, nextRule);
    }

    /* @description: fetch the complete novel list
     * @param url
     * @param listRule novel-list rule
     * @param nextRule next-page rule
     * @author cdw
     * @date 2020/10/1 21:14 */
    public void getAllNovelList(String url, String listRule, String nextRule) {
        Boolean flag = true;
        try {
            System.out.println(Thread.currentThread().getName() + " started");
            while (flag) {
                Document doc = ReptileUtil.getDocumentOfHttps(url);
                w.lock(); // acquire the list write lock
                try {
                    NovelList.offer(ReptileUtil.getNovelByUrlToList(doc, listRule));
                } finally {
                    w.unlock(); // release it even if parsing fails
                }
                Map<String, Object> map = ReptileUtil.isNext(doc, nextRule);
                flag = (Boolean) map.get("flag");
                url = (String) map.get("url");
                while (NovelList.size() > 500) {
                    System.out.println("NovelList too large; list fetcher sleeping 30 minutes");
                    Thread.sleep(1000 * 60 * 30);
                }
            }
            System.out.println("finished fetching the list");
        } catch (Exception e) {
            System.out.println("error while fetching the list");
            e.printStackTrace();
        } finally {
            latch.countDown();
            System.out.println("latch decremented");
        }
    }
}
```

GetDirectoryToQueueThread.java — the thread that fetches each novel's chapter directory and the chapter pages:

```java
package graspTheNovel.service;

import graspTheNovel.entity.NovelAssist;
import graspTheNovel.utils.ReptileUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.*;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.locks.Lock;

/**
 * @program: webCrawler
 * @description: thread that fetches chapter directories
 * @author: cdw
 * @create: 2020-10-01 20:10
 **/
public class GetDirectoryToQueueThread implements Runnable {
    String directoryRule;
    Queue<List<Map<String, String>>> NovelList;
    Queue<Map<String, List<Document>>> directoryQueue;
    Lock w;                // write lock
    Lock r;                // read lock
    CountDownLatch latch;  // completion latch

    public GetDirectoryToQueueThread(NovelAssist na) {
        this.r = na.getListRwl().readLock();
        this.w = na.getDirectoryRwl().writeLock();
        this.directoryRule = na.getDirectoryRule();
        this.directoryQueue = na.getDirectoryQueue();
        this.NovelList = na.getNovelList();
        this.latch = na.getLatch();
    }

    @Override
    public void run() {
        getNovelDirectory(directoryRule);
    }

    /* @description: fetch all chapter directories
     * @param directoryRule directory rule
     * @author cdw
     * @date 2020/10/1 21:46 */
    public void getNovelDirectory(String directoryRule) {
        System.out.println(Thread.currentThread().getName() + " directory thread started");
        boolean flag = true;
        try {
            while (flag) {
                if (NovelList.size() > 0) {
                    r.lock(); // acquire the list read lock
                    List<Map<String, String>> novelList;
                    try {
                        novelList = NovelList.poll();
                    } finally {
                        r.unlock();
                    }
                    for (Map<String, String> map : novelList) {
                        String url = map.get("url");
                        String title = map.get("title");
                        Document doc = ReptileUtil.getDocumentOfHttps(url);
                        if (doc != null && doc.childNodeSize() > 0) {
                            Elements titleUrls = doc.select(directoryRule); // chapter links
                            Map<String, List<Document>> directoryMap = new HashMap<String, List<Document>>();
                            List<Document> contentDocumentList = new ArrayList<Document>(); // chapter pages
                            for (Element titleUrl : titleUrls) {
                                Document document = ReptileUtil.getDocumentOfHttps("https:" + titleUrl.attr("href"));
                                contentDocumentList.add(document);
                            }
                            directoryMap.put(title, contentDocumentList);
                            w.lock();
                            try {
                                directoryQueue.offer(directoryMap);
                            } finally {
                                w.unlock();
                            }
                        }
                    }
                } else {
                    synchronized (NovelList) {
                        while (NovelList.size() == 0) {
                            try {
                                System.out.println("list queue empty, waiting for data");
                                Thread.sleep(10000);
                                System.out.println("rechecking the queue");
                            } catch (InterruptedException e) {
                                e.printStackTrace();
                            }
                        }
                    }
                }
            }
            System.out.println("directory reading finished");
        } catch (Exception e) {
            System.out.println("error while reading directories");
            e.printStackTrace();
        } finally {
            latch.countDown();
            System.out.println("latch decremented");
        }
    }
}
```

ProcessOfGrabInfoThread.java — parses the chapter pages and handles output (writes to local files and inserts into the database):

```java
package graspTheNovel.service;

import graspTheNovel.dao.NovelPersist;
import graspTheNovel.entity.NovelAssist;
import graspTheNovel.utils.ReptileUtil;
import org.jsoup.nodes.Document;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.locks.Lock;

/**
 * @program: webCrawler
 * @description: writes fetched directories and chapters to disk and the database
 * @author: cdw
 * @create: 2020-10-01 20:13
 **/
public class ProcessOfGrabInfoThread implements Runnable {
    Lock r;                // read lock
    CountDownLatch latch;  // completion latch
    Queue<Map<String, List<Document>>> directoryQueue; // directory queue
    String titleRule;      // title rule
    String contentsRule;   // content rule

    public ProcessOfGrabInfoThread(NovelAssist na) {
        this.directoryQueue = na.getDirectoryQueue();
        this.r = na.getDirectoryRwl().readLock();
        this.latch = na.getLatch();
        this.titleRule = na.getTitleRule();
        this.contentsRule = na.getContentsRule();
    }

    @Override
    public void run() {
        read(titleRule, contentsRule);
    }

    /* @description: on Windows a file name may not contain \ / : * ? " < > |,
     * so strip \ / : * ? | and replace " < > with '.
     * @param dirPath
     * @author cdw
     * @date 2020/9/29 20:32 */
    public static String replaceSpecialCharacters(String dirPath) {
        dirPath = dirPath.replaceAll("[/\\\\:*?|]", "");
        dirPath = dirPath.replaceAll("[\"<>]", "'");
        return dirPath;
    }

    /* @description: take chapter pages from the queue and persist them
     * @param titleRule chapter-title rule
     * @param contentsRule chapter-content rule
     * @author cdw
     * @date 2020/10/1 22:08 */
    public void read(String titleRule, String contentsRule) {
        System.out.println(Thread.currentThread().getName() + " started");
        boolean flag = true;
        try {
            String sql = "insert into novel (`name`,`title`,`content`) VALUES (?, ?, ?)"; // insert statement
            while (flag) {
                if (directoryQueue.size() > 0) {
                    r.lock();
                    Map<String, List<Document>> map;
                    try {
                        map = directoryQueue.poll();
                    } finally {
                        r.unlock();
                    }
                    for (String fileName : map.keySet()) {
                        String path = "D:\\idea\\work\\webCrawler\\file\\" + fileName;
                        FileWriter fw;
                        File file = new File(path);
                        if (!file.exists()) {
                            file.mkdirs(); // create the novel's folder
                        }
                        for (Document doc : map.get(fileName)) {
                            List<String> list = ReptileUtil.getDetails(doc, titleRule, contentsRule);
                            NovelPersist novelPersist = new NovelPersist();
                            List<String> sqlValues = new ArrayList<String>();
                            sqlValues.add(fileName);
                            for (String s : list) {
                                sqlValues.add(s);
                            }
                            novelPersist.executeUpdate(sql, sqlValues);
                            try {
                                // backslash separator on Windows (the original marked this line "change this")
                                String u = path + "\\" + replaceSpecialCharacters(list.get(0).trim()).trim() + ".txt";
                                fw = new FileWriter(u, true);
                                PrintWriter bw = new PrintWriter(fw);
                                bw.println(list.get(list.size() - 1));
                                bw.flush();
                                bw.close();
                            } catch (IOException e) {
                                System.out.println("write failed");
                                e.printStackTrace();
                            }
                        }
                    }
                } else {
                    synchronized (directoryQueue) {
                        while (directoryQueue.size() == 0) {
                            try {
                                System.out.println("directory queue empty, waiting for data");
                                Thread.sleep(30000);
                                System.out.println("rechecking the queue");
                            } catch (InterruptedException e) {
                                e.printStackTrace();
                            }
                        }
                    }
                }
            }
            System.out.println("writing to disk finished");
        } catch (Exception e) {
            System.out.println("error while writing to disk");
            e.printStackTrace();
        } finally {
            latch.countDown();
            System.out.println("latch decremented");
        }
    }
}
```
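ProcessOfGrabInfoThread relies on a NovelPersist DAO (package graspTheNovel.dao) that the post never shows. Below is a minimal JDBC sketch of what it might look like; the connection URL, credentials, database name, and table definition are my assumptions, not from the original:

```java
package graspTheNovel.dao;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.List;

/**
 * Hypothetical persistence helper; the real one is not shown in the post.
 * Assumed table:
 *   CREATE TABLE novel (
 *     id INT AUTO_INCREMENT PRIMARY KEY,
 *     `name` VARCHAR(255),    -- novel name
 *     `title` VARCHAR(255),   -- chapter title
 *     `content` LONGTEXT      -- chapter text
 *   );
 */
public class NovelPersist {
    // connection settings are assumptions; adjust to your environment
    private static final String URL =
            "jdbc:mysql://localhost:3306/novel_db?useUnicode=true&characterEncoding=UTF-8";
    private static final String USER = "root";
    private static final String PASSWORD = "root";

    static {
        try {
            Class.forName("com.mysql.jdbc.Driver"); // driver class for mysql-connector-java 5.1.x
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }

    /** Runs an insert/update, binding the values in order as strings. */
    public int executeUpdate(String sql, List<String> sqlValues) {
        try (Connection conn = DriverManager.getConnection(URL, USER, PASSWORD);
             PreparedStatement ps = conn.prepareStatement(sql)) {
            for (int i = 0; i < sqlValues.size(); i++) {
                ps.setString(i + 1, sqlValues.get(i));
            }
            return ps.executeUpdate();
        } catch (Exception e) {
            e.printStackTrace();
            return 0;
        }
    }
}
```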

NovelServiceImpl.java — implements the service interface; this is the main workflow:

```java
package graspTheNovel.service;

import graspTheNovel.entity.NovelAssist;
import graspTheNovel.utils.ReptileUtil;
import org.jsoup.nodes.Document;

import java.util.*;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.locks.ReentrantReadWriteLock;

/**
 * @program: webCrawler
 * @description: service implementation
 * @author: cdw
 * @create: 2020-10-01 20:06
 **/
public class NovelServiceImpl implements NovelService {
    ExecutorService es = Executors.newCachedThreadPool(); // thread pool
    protected static final int COUNTDOWN_LATCH = 4;
    volatile Queue<Map<String, List<Document>>> directoryList = new LinkedList<Map<String, List<Document>>>(); // novel directories
    volatile Queue<List<Map<String, String>>> NovelList = new LinkedList<List<Map<String, String>>>();         // novel list

    @Override
    public void getNovelByAll(String url) {
        CountDownLatch latch = new CountDownLatch(COUNTDOWN_LATCH); // one count per worker thread (4 threads)
        ReentrantReadWriteLock listRwl = new ReentrantReadWriteLock();
        ReentrantReadWriteLock directoryRwl = new ReentrantReadWriteLock();
        NovelAssist novelAssist = new NovelAssist();
        novelAssist.setUrl(url);
        novelAssist.setLatch(latch);
        novelAssist.setDirectoryQueue(directoryList);
        novelAssist.setNovelList(NovelList);
        novelAssist.setListRwl(listRwl);
        novelAssist.setDirectoryRwl(directoryRwl);
        novelAssist.setListRule(".cf li .book-mid-info h4 a"); // novel-list rule
        novelAssist.setNextRule("a.lbf-pagination-next");      // next-page rule
        novelAssist.setDirectoryRule(".volume-wrap ul li a");  // directory rule
        novelAssist.setTitleRule(".j_chapterName");            // chapter-title rule
        novelAssist.setContentsRule(".j_readContent");         // chapter-content rule
        // one thread reads the novel list
        es.submit(new GetNovelListThread(novelAssist));
        // two threads read the directories of the listed novels
        es.submit(new GetDirectoryToQueueThread(novelAssist));
        es.submit(new GetDirectoryToQueueThread(novelAssist));
        // one thread processes the fetched chapters
        es.submit(new ProcessOfGrabInfoThread(novelAssist));
        try {
            latch.await();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        System.out.println("done");
    }

    @Override
    public List<Map<String, String>> getNovelByName(String NovelName) {
        String url = "https://www.qidian.com/search?kw=" + NovelName;
        String rule = ".book-mid-info h4 a";
        return ReptileUtil.getNovelByUrlToList(url, rule);
    }
}
```

Finally, the test class:

```java
// imports added for completeness; place the class under your test source root
import graspTheNovel.service.NovelService;
import graspTheNovel.service.NovelServiceImpl;
import org.junit.Test;

import java.util.Date;
import java.util.List;
import java.util.Map;

/**
 * @program: webCrawler
 * @description: test class
 * @author: cdw
 * @create: 2020-09-28 15:01
 **/
public class Test01 {
    @Test
    public void test1() {
        // grab a single novel
        Long startTime, endTime;
        System.out.println("crawler: Qidian grab started..........");
        startTime = new Date().getTime();
        NovelService novelService = new NovelServiceImpl();
        List<Map<String, String>> novelList = novelService.getNovelByName("万族之劫");
        System.out.println(novelList);
        endTime = new Date().getTime();
        System.out.println("crawler: Qidian grab finished, took " + (endTime - startTime) + "ms");
    }

    @Test
    public void test2() {
        // grab everything
//        String url = "https://www.qidian.com/finish?action=hidden&orderId=&page=1&vip=0&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=2";
        String url = "https://www.qidian.com/all";
        NovelService novelService = new NovelServiceImpl();
        novelService.getNovelByAll(url);
    }
}
```

Output

Console output while the crawler runs (screenshot).

The novel list saved locally (screenshot).

Chapter directories written to local folders (screenshot).

Rows written to the database (screenshot).
