java爬虫爬取动漫之家10000部漫画信息

Java大作业，爬取信息并写入Excel。

项目下载：https://pan.baidu.com/s/1Z1rMWSyaAeGvZCm5CMgexQ（请用eclipse 2018打开，我的jdk版本是1.8）

附我爬取的10000部漫画信息下载地址：https://pan.baidu.com/s/12wCasdunyxGfdRNw84nbHw

数据图例：

主类代码（如果 import 报错，请先下载上面的项目，把其中 lib 文件夹下的所有 .jar 文件复制到自己的项目里，然后在 Eclipse 中右键这些 jar 包，选择 Build Path → Add to Build Path；若仍有问题请自行搜索）：

import java.io.File;
import jxl.Workbook;
import jxl.write.Label;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;
import us.codecraft.webmagic.scheduler.RedisScheduler;
import us.codecraft.webmagic.pipeline.FilePipeline;
import org.openqa.selenium.firefox.*;public class Main implements PageProcessor {private Site site = Site.me().setRetryTimes(3).   //失败重试次数setSleepTime(20);   //爬取时间间隔private int Num=0;private String Title;private String Author[]=new String[2];private String Country;private String State;private String Popularity;private String Tag[]=new String[3];private String Type;private String Update;private String BookNum;private String Talk;private String Details;private static WritableWorkbook book;private static WritableSheet Sheet1;private static WritableSheet Sheet2;public void process(Page page) {int StaticNum = 0;page.addTargetRequests(page.getHtml().links().regex("http://manhua.dmzj.com/[a-z 0-9 -]+/").    //筛选网页规则all());Title=page.getHtml().xpath("div[@class='odd_anim_title_m']//span//a//h1/text()").toString();if(Title!=null) {StaticNum=++Num;}Author[0]=page.getHtml().xpath("div[@class='anim-main_list']//table//tbody//tr[3]//td//a[1]/text()").toString();Author[1]=page.getHtml().xpath("div[@class='anim-main_list']//table//tbody//tr[3]//td//a[2]/text()").toString();Country=page.getHtml().xpath("div[@class='anim-main_list']//table//tbody//tr[4]//td//a/text()").toString();State=page.getHtml().xpath("div[@class='anim-main_list']//table//tbody//tr[5]//td//a/text()").toString();/*Popularity=page.getHtml().xpath("div[@class='anim-main_list']//table//tbody//tr[6]//td[@id='hot_hits']/text()").toString();*//*page.putField("img", page.getHtml().xpath("div[@class='anim-main_list']//table//tbody//tr[6]//td[@id='hot_hits']").toString());Popularity=page.getResultItems().get("img");System.out.println("人气：" + 
Popularity);*/Tag[0]=page.getHtml().xpath("div[@class='anim-main_list']//table//tbody//tr[7]//td//a[1]/text()").toString();Tag[1]=page.getHtml().xpath("div[@class='anim-main_list']//table//tbody//tr[7]//td//a[2]/text()").toString();Tag[2]=page.getHtml().xpath("div[@class='anim-main_list']//table//tbody//tr[7]//td//a[3]/text()").toString();Type=page.getHtml().xpath("div[@class='anim-main_list']//table//tbody//tr[8]//td//a/text()").toString();Update=page.getHtml().xpath("div[@class='anim-main_list']//table//tbody//tr[9]//td//a/text()").toString();/*BookNum=page.getHtml().xpath("span[@id='subscribe_num']/text()").toString();Talk=page.getHtml().xpath("span[@class='comment_num']/text()").toString();Details=page.getHtml().xpath("div[@class='odd_anim_title_m']//a/text()").toString();*/if(Title!=null) {  //读取到有效数据try {Label label=new Label(0,StaticNum,Title);    //标题Sheet1.addCell(label);label=new Label(1,StaticNum,Author[0]);  //作者Sheet1.addCell(label);if(Author[1]!=null) {label=new Label(2,StaticNum,Author[1]);    //第二作者Sheet1.addCell(label);}label=new Label(3,StaticNum,Country); //地区Sheet1.addCell(label);label=new Label(4,StaticNum,State);  //状态Sheet1.addCell(label);label=new Label(5,StaticNum,Tag[0]); //标签1Sheet1.addCell(label);if(Tag[1]!=null) {label=new Label(6,StaticNum,Tag[1]); //标签2Sheet1.addCell(label);if(Tag[2]!=null) {label=new Label(7,StaticNum,Tag[2]); //标签3Sheet1.addCell(label);}}label=new Label(8,StaticNum,Type);    //类型Sheet1.addCell(label);label=new Label(9,StaticNum,Update); //更新Sheet1.addCell(label);book.write(); //写入文件}catch(Exception e) {System.out.println(e); }/*System.out.println("编号：" + Num + "\n" +"作品：" + Title + "\n" +"作者：" + Author[0] + "\n" +"地区：" + Country + "\n" +"状态：" + State + "\n" +//"人气：" + Popularity + "\n" +"标签：" + Tag[0] + Tag[1] + Tag[2] + "\n" +"类型：" + Type + "\n" +"更新：" + Update + "\n" +//"订阅：" + BookNum + "\n" +//"评论：" + Talk + "\n" +//"详情：" + Details + "\n" );*/if(StaticNum==22000) { //抓取数据量try {book.write();   //写入文件book.close(); 
//关闭文件System.exit(0);   //退出爬虫}catch(Exception e) {System.out.println(e); }}System.out.println(StaticNum);}}public Site getSite() {return site;}public static void main(String[] args) {try {   //创建Excelbook= Workbook.createWorkbook(new File("File.xls"));Sheet1=book.createSheet("表1",0); //创建两个表页Sheet2=book.createSheet("表2",1);System.out.println("创建Excel成功\n");Label label=new Label(0,0,"漫画名"); //填表头Sheet1.addCell(label);label=new Label(1,0,"作者1");   //填表头Sheet1.addCell(label);label=new Label(2,0,"作者2");   //填表头Sheet1.addCell(label);label=new Label(3,0,"地区");    //填表头Sheet1.addCell(label);label=new Label(4,0,"状态");    //填表头Sheet1.addCell(label);label=new Label(5,0,"标签1");   //填表头Sheet1.addCell(label);label=new Label(6,0,"标签2");   //填表头Sheet1.addCell(label);label=new Label(7,0,"标签3");   //填表头Sheet1.addCell(label);label=new Label(8,0,"类型");    //填表头Sheet1.addCell(label);label=new Label(9,0,"连载进度");  //填表头Sheet1.addCell(label);}catch(Exception e) {System.out.println(e);}Spider.create(new Main()).addUrl("http://manhua.dmzj.com").    //起始网页thread(1).    //线程数run();}
}

Excel类：

import jxl.write.Label;
import jxl.write.WriteException;
import jxl.write.biff.RowsExceededException;
/**
 * Small factory for JExcelApi text cells that all belong to one spreadsheet
 * row: the row index is fixed at construction time and only the column and
 * the text vary per cell.
 */
public class Excel {

    /** Row index shared by every cell this instance creates. */
    private final int rowIndex;

    /**
     * @param Raw row index used for all cells created through {@link #Add}
     */
    public Excel(int Raw) {
        rowIndex = Raw;
    }

    /**
     * Creates a text cell in this instance's row.
     *
     * @param a column index of the cell
     * @param c text content of the cell
     * @return a new {@link Label} positioned at column {@code a} of the bound row
     * @throws RowsExceededException declared for callers; not raised by this method itself
     * @throws WriteException declared for callers; not raised by this method itself
     */
    public Label Add(int a, String c) throws RowsExceededException, WriteException {
        return new Label(a, rowIndex, c);
    }
}

爬动漫之家手机版网页：

import java.io.File;
import jxl.Workbook;
import jxl.write.Label;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;public class AppMain implements PageProcessor {private Site site = Site.me().setRetryTimes(5).    //失败重试次数setSleepTime(50);   //爬取时间间隔private int CodeNum=2;private int SuccessNum=0;private static WritableWorkbook book;private static WritableSheet Sheet1;private static WritableSheet Sheet2;public void process(Page page) {String Title;String Author[]=new String[2];String Tag[]=new String[3];String Type;String Country;String State;String UpdateTime;String Details;for(int i=0;i<5;i++)page.addTargetRequest("https://m.dmzj.com/info/" + (++CodeNum) + ".html");//CodeNum+=;Title=page.getHtml().xpath("div[@class='BarTit']/text()").toString();if(Title!=null) {UpdateTime=page.getHtml().xpath("span[@class='date']/text()").toString();Author[0]=page.getHtml().xpath("div[@class='sub_r']//p[1]//a[1]/text()").toString();Author[1]=page.getHtml().xpath("div[@class='sub_r']//p[1]//a[2]/text()").toString();Tag[0]=page.getHtml().xpath("div[@class='sub_r']//p[2]//a[1]/text()").toString();Tag[1]=page.getHtml().xpath("div[@class='sub_r']//p[2]//a[2]/text()").toString();Tag[2]=page.getHtml().xpath("div[@class='sub_r']//p[2]//a[3]/text()").toString();Type=page.getHtml().xpath("div[@class='sub_r']//p[3]//a[1]/text()").toString();Country=page.getHtml().xpath("div[@class='sub_r']//p[3]//a[2]/text()").toString();State=page.getHtml().xpath("div[@class='sub_r']//p[3]//a[3]/text()").toString();Details=page.getHtml().xpath("p[@class='txtDesc autoHeight']/text()").toString();try {if(CodeNum<49020) {SuccessNum++;Excel a=new Excel(SuccessNum);Sheet1.addCell(a.Add(0, Title));Sheet1.addCell(a.Add(1, Author[0]));if(Author[1]!=null) Sheet1.addCell(a.Add(2, Author[1]));Sheet1.addCell(a.Add(3, Tag[0]));if(Tag[1]!=null) {Sheet1.addCell(a.Add(4, Tag[1]));if(Tag[2]!=null) Sheet1.addCell(a.Add(5, Tag[2]));}Sheet1.addCell(a.Add(6, Type));Sheet1.addCell(a.Add(7, Country));Sheet1.addCell(a.Add(8, State));Sheet1.addCell(a.Add(9, 
UpdateTime));Sheet1.addCell(a.Add(10, Details));}if(CodeNum>=49020) {    //抓取数据量try {book.write();   //写入文件book.close(); //关闭文件System.exit(0);   //退出爬虫}catch(Exception e) {System.out.println(e); }}System.out.println(SuccessNum);//System.out.println("漫画名：" + Title);/*System.out.println("作者：" + Author[0] + " " + Author[1]);System.out.println("标签：" + Tag[0] + " " + Tag[1] + " " + Tag[2]);System.out.println("类型：" + Type);System.out.println("地区：" + Country);System.out.println("状态：" + State);System.out.println("最近更新时间：" + UpdateTime);System.out.println(Details);*/}catch(Exception e) {System.out.println(e);}}else System.out.println(SuccessNum);}public Site getSite() {return site;}public static void main(String[] args) {try {    //创建Excelbook= Workbook.createWorkbook(new File("File.xls"));Sheet1=book.createSheet("表1",0); //创建两个表页Sheet2=book.createSheet("表2",1);System.out.println("创建Excel成功\n");Label label=new Label(0,0,"漫画名"); //填表头Sheet1.addCell(label);label=new Label(1,0,"作者1");   //填表头Sheet1.addCell(label);label=new Label(2,0,"作者2");   //填表头Sheet1.addCell(label);label=new Label(3,0,"标签1");   //填表头Sheet1.addCell(label);label=new Label(4,0,"标签2");   //填表头Sheet1.addCell(label);label=new Label(5,0,"标签3");   //填表头Sheet1.addCell(label);label=new Label(6,0,"类型");    //填表头Sheet1.addCell(label);label=new Label(7,0,"地区");    //填表头Sheet1.addCell(label);label=new Label(8,0,"状态");    //填表头Sheet1.addCell(label);label=new Label(9,0,"最近更新时间");    //填表头Sheet1.addCell(label);label=new Label(10,0,"详情");   //填表头Sheet1.addCell(label);}catch(Exception e) {System.out.println(e);}Spider.create(new AppMain()).addUrl("https://m.dmzj.com").thread(1).run();}
}

java爬虫爬取动漫之家10000部漫画信息相关推荐

【python】使用爬虫爬取动漫之家漫画更新信息
网站名称为: https://manhua.dmzj.com/update_1.shtml 本篇仅在于交流学习 1.首先将相应的库导入: import re import requests from ...
python 爬取动漫之家，下载漫画
#!/usr/bin/python3 # -*- coding: utf-8 -*-import requests # 发送http请求 from bs4 import BeautifulSoup # ...
java爬虫爬取笔趣阁小说
java爬虫爬取笔趣阁小说 package novelCrawler;import org.jsoup.Connection; import org.jsoup.HttpStatusException ...
python java 爬数据_如何用java爬虫爬取网页上的数据
当我们使用浏览器处理网页的时候,有时候是不需要浏览的,例如使用PhantomJS适用于无头浏览器,进行爬取网页数据操作.最近在进行java爬虫学习的小伙伴们有没有想过如何爬取js生成的网络页面吗?别急 ...
Java爬虫 --- 爬取王者荣耀英雄图片
Java爬虫 - 爬取王者荣耀英雄图片 import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Docu ...
Java爬虫爬取某招聘网站招聘信息
Java爬虫爬取某招聘网站招聘信息一.系统介绍二.功能展示 1.需求爬取的网站内容 2.实现流程 2.1数据采集 2.2页面解析 2.3数据存储三.获取源码一.系统介绍系统主要功能:本项目 ...
Java爬虫爬取wallhaven的图片
Java爬虫爬取wallhaven的图片参考文章:JAVA Jsoup爬取网页图片下载到本地需要的jar包:jsuop wallhaven网站拒绝java程序访问,所以要伪装报头. 发送请求时 C ...
我的第一个开源项目：Java爬虫爬取旧版正方教务系统课程表、成绩表
Java爬虫爬取旧版正方教务系统课程表.成绩表一.项目展示 1.正方教务系统首页 2.爬虫系统首页: 成绩查询: 课表查询: 二.项目实现 1.爬取思路描述无论是成绩查询或课表查询亦或者其它的 ...
Java爬虫爬取天猫淘宝京东搜索页和商品详情
Java爬虫爬取天猫淘宝京东搜索页和商品详情先识别商品url,区分平台提取商品编号,再根据平台带着商品编号爬取数据. 1.导包 <d ...

java爬虫爬取动漫之家10000部漫画信息

java爬虫爬取动漫之家10000部漫画信息相关推荐

最新文章

热门文章

java爬虫 爬取动漫之家10000部漫画信息

java爬虫 爬取动漫之家10000部漫画信息相关推荐

最新文章

热门文章

java爬虫爬取动漫之家10000部漫画信息

java爬虫爬取动漫之家10000部漫画信息相关推荐