基于SpringBoot的哔哩哔哩动态爬取网站

原理：通过b站的动态api获取动态的json，从json中获取动态的作者与图片信息，并持久化到数据库。

b站的动态链接格式为 https://t.bilibili.com/xxx ，其中 xxx 为动态的id。通过b站的api接口 https://api.vc.bilibili.com/dynamic_svr/v1/dynamic_svr/get_dynamic_detail?dynamic_id=xxx 即可获取到此动态的json数据。

关键功能实现：

首先通过工具类从用户提交的完整网址中提取动态id

public class UrlUtil {public static String getApiId(String str){if(str.length()>49) {str = str.substring(0, 50);}if(str.contains("b23.tv")){str = shortLink.getLink(str);}str=str.trim();String str2="";if(!"".equals(str)){for(int i=0;i<str.length();i++){if(str.charAt(i)>=48 && str.charAt(i)<=57){str2+=str.charAt(i);}}}if(str2.length()>=18){return str2.substring(0,18);}elsereturn null;}
}

但是手机端用户复制下来的动态链接为短链接，为了减少用户使用的时间成本，可以通过工具类将短链接转换为长链接

/**
 * Resolves a short link to the URL it finally redirects to.
 */
public class shortLink {

    /** Shared client so that every call does not build a new one. */
    private static final OkHttpClient CLIENT = new OkHttpClient();

    /**
     * Requests {@code shortLink} and returns the URL of the request that produced the
     * final response.
     *
     * @param shortLink the short URL to resolve
     * @return the resolved URL as a string, or {@code null} when the request fails or
     *         {@code shortLink} is {@code null} or not a valid HTTP/HTTPS URL
     */
    public static String getLink(String shortLink) {
        if (shortLink == null) {
            return null;
        }
        Request request;
        try {
            request = new Request.Builder().url(shortLink).build();
        } catch (IllegalArgumentException e) {
            // The builder rejects text that is not a valid URL; report it like any
            // other resolution failure instead of propagating a runtime exception.
            return null;
        }
        try (Response response = CLIENT.newCall(request).execute()) {
            HttpUrl realUrl = response.request().url();
            return String.valueOf(realUrl);
        } catch (IOException e) {
            return null;
        }
    }
}

至此后台就可以通过用户上传的动态链接来获取到动态的id

之后通过fastjson来获取json里的各项数据

/**
 * Fetches the detail JSON of one dynamic and extracts the fields the site stores.
 */
public class GetPicJson {

    private static final String DETAIL_API =
            "https://api.vc.bilibili.com/dynamic_svr/v1/dynamic_svr/get_dynamic_detail?dynamic_id=";

    /** Placeholder tag used when the dynamic carries no topic information. */
    private static final String NO_TAG = "wuguan";

    private GetPicJson() {
    }

    /**
     * Reads the dynamic identified by {@code link} and returns its key fields.
     *
     * @param link a dynamic link (or bare id) accepted by {@link UrlUtil#getApiId(String)}
     * @return a five-element list of author name, URL of the first picture, author uid,
     *         timestamp, and the concatenated topic names (or {@code "wuguan"} when there
     *         is no topic info); {@code null} when no id can be extracted, the response
     *         lacks the expected nodes, or the dynamic has no picture
     * @throws IOException declared for callers; see {@code GetHttpData.getData}
     */
    public static List<String> upload(String link) throws IOException {
        String dynamicId = UrlUtil.getApiId(link);
        if (dynamicId == null) {
            return null;
        }

        JSONObject root = JSON.parseObject(GetHttpData.getData(DETAIL_API + dynamicId));
        JSONObject outerCard = child(child(root, "data"), "card");
        JSONObject innerCard = child(outerCard, "card");
        JSONObject desc = child(outerCard, "desc");
        JSONObject user = child(innerCard, "user");
        JSONObject item = child(innerCard, "item");
        if (desc == null || user == null || item == null) {
            return null;
        }

        // Each element is the JSON text of one picture object.
        List<String> pictures = JSON.parseArray(item.getString("pictures"), String.class);
        if (pictures == null || pictures.isEmpty()) {
            return null;
        }
        JSONObject firstPicture = JSON.parseObject(pictures.get(0));
        String url = firstPicture == null ? null : firstPicture.getString("img_src");
        if (url == null) {
            return null;
        }

        List<String> fin = new ArrayList<>();
        fin.add(user.getString("name"));
        fin.add(url);
        fin.add(desc.getString("uid"));
        fin.add(desc.getString("timestamp"));
        fin.add(readTags(child(child(outerCard, "display"), "topic_info")));
        return fin;
    }

    /** Concatenates all topic names, or returns the placeholder when none are available. */
    private static String readTags(JSONObject topicInfo) {
        if (topicInfo == null) {
            return NO_TAG;
        }
        JSONArray tags = JSON.parseArray(topicInfo.getString("topic_details"));
        if (tags == null) {
            return NO_TAG;
        }
        StringBuilder tag = new StringBuilder();
        for (Object o : tags) {
            tag.append(((JSONObject) o).getString("topic_name"));
        }
        return tag.toString();
    }

    /** Null-safe lookup of a nested JSON object. */
    private static JSONObject child(JSONObject parent, String key) {
        return parent == null ? null : parent.getJSONObject(key);
    }
}

返回为一个list集合，分别为动态的用户，第一张图片的url，用户的uid，动态发布的时间，动态的tag。如果此动态没有tag将会自动添加一个“wuguan”tag，防止后面在比对时出现空指针异常。

根据json中获取的url将图片保存到本地

/**
 * Saves the thumbnail of a picture to the local picture directory.
 */
public class Download {

    private static final int BUFFER_SIZE = 8192;

    private Download() {
    }

    /**
     * Downloads {@code url + "@400w.webp"} and writes it to
     * {@code Path.path() + new FileName().getName(url)}.
     *
     * @param url the picture URL without the thumbnail suffix
     * @throws IOException if the connection cannot be opened or the file cannot be written
     */
    public static void download(String url) throws IOException {
        URL thumbnailUrl = new URL(url + "@400w.webp");
        String fileName = new FileName().getName(url);
        URLConnection connection = thumbnailUrl.openConnection();

        // try-with-resources closes both streams even when the copy fails midway.
        try (InputStream in = connection.getInputStream();
             FileOutputStream out = new FileOutputStream(Path.path() + fileName)) {
            byte[] buffer = new byte[BUFFER_SIZE];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
        }
    }
}

在图片url后添加@400w.webp即可以获取到宽度为400像素的webp格式缩略图，防止原图过大影响访问速度。文件名设置为图片的url

编写tag检测工具，本项目爬取的是asoul二创相关动态

/**
 * Decides whether a tag string is related to the topics this site collects.
 */
public class TagCheck {

    /** Substrings that mark a tag as relevant; matching is case-sensitive. */
    private static final String[] KEYWORDS = {
        "a-soul", "A-SOUL", "ASOUL", "asoul",
        "嘉然", "向晚", "珈乐", "乃琳", "贝拉",
        "乃贝", "嘉晚饭", "贝贝珈", "果丹皮"
    };

    /**
     * Checks {@code tag} against the keyword table.
     *
     * @param tag the concatenated tag text of a dynamic; must not be {@code null}
     * @return {@code true} if {@code tag} contains at least one keyword
     */
    public static boolean check(String tag) {
        for (String keyword : KEYWORDS) {
            if (tag.contains(keyword)) {
                return true;
            }
        }
        return false;
    }
}

最后是控制层与业务层

 @GetMapping("/up")public String up(String link, Model model,RedirectAttributes attributes)throws IOException{String sourceLink = UrlUtil.getApiId(link);if(sourceLink==null){ //判断url合法性model.addAttribute("result","url格式不正确");return "upload";}if(upByLinkService.findPicture(sourceLink)!=null){model.addAttribute("result","图片已存在");return "upload";}List<String> firstPic = GetPicJson.upload(link);//获取动态第一张图片信息if(firstPic==null){model.addAttribute("result","动态没有图片");return"upload";}if(!TagCheck.check(firstPic.get(4))){model.addAttribute("result","未找到a-soul有关tag");return "upload";}if(("0").equals(upByLinkService.allowUp(firstPic.get(2)))){model.addAttribute("result","此用户不允许上传");return "upload";}if(upByLinkService.allowUp(firstPic.get(2))==null){upByLinkService.saveAuthor(firstPic);}upByLinkService.savePicture(firstPic,sourceLink);model.addAttribute("pictures",pictureService.indexPic());attributes.addFlashAttribute("result","添加成功！");return "redirect:/";}

 public void savePicture(List<String> firstPic,String sourceLink) {Picture picture = new Picture();picture.setFileName(fileName.getName(firstPic.get(1)));picture.setUid(firstPic.get(2));picture.setIsShow("1");picture.setSourceLink(sourceLink);picture.setTime(Integer.valueOf(firstPic.get(3)));pictureService.savePicture(picture);//持久化到mysqltry {Download.download(firstPic.get(1));//本地保存照片} catch (IOException e) {e.printStackTrace();}}

自动爬取：

实现自动爬取，可以通过b站的api

"https://api.vc.bilibili.com/topic_svr/v1/topic_svr/topic_history?topic_name=" + tag + "&offset_dynamic_id=" + id
id为动态的id：若为空，则获取此tag下最新的25条动态；若不为空，则获取此id的动态之前发布的25条动态。编写工具类获取最新的25条动态：

public class ListNew {public static List<String> getList(String tag){String str = GetHttpData.getData("https://api.vc.bilibili.com/topic_svr/v1/topic_svr/topic_history?topic_name="+tag+"&offset_dynamic_id=");JSONObject Json = JSON.parseObject(str);JSONArray cards = JSON.parseArray(Json.getJSONObject("data").getString("cards"));List<String> ids = new ArrayList<>();for (int i = 0; i < cards.size(); i++) {if ("2".equals(cards.getJSONObject(i).getJSONObject("desc").getString("type"))){//判断动态类型是否为文字类动态ids.add(cards.getJSONObject(i).getJSONObject("desc").getString("dynamic_id_str"));}}return ids;}}

之后使用springboot的注解@EnableScheduling开启定时任务。因为一共爬取了三个tag下的动态，可能会产生重复的id，可以将其转存为set集合去除重复的动态id。同时设置每条动态爬取间隔一秒，知道了动态id后想要的就都有了。

@Configuration
@EnableScheduling   // 开启定时任务
public class AutoUpload {@Autowiredprivate UpByLinkService upByLinkService;@Scheduled(fixedRate=1800000) //30minpublic void autoLoad() throws IOException {List<String> ids = ListNew.getList("A-SOUL%E4%BA%8C%E5%88%9B%E6%BF%80%E5%8A%B1%E8%AE%A1%E5%88%92");ids.addAll(ListNew.getList("A-SOUL%20FANART"));ids.addAll(ListNew.getList("A-SOUL%E4%BA%8C%E5%88%9B"));Set<String> set = new HashSet<>(ids);List<String> newList = new ArrayList<>(set);for (String sourceLink : newList) {try {Thread.sleep(1000);} catch (InterruptedException e) {e.printStackTrace();}if (upByLinkService.findPicture(sourceLink) != null) {System.out.println("图片已存在");continue;}List<String> firstPic = GetPicJson.upload(sourceLink);//获取动态第一张图片信息if (firstPic == null) {System.out.println("动态没有图片");continue;}if (("0").equals(upByLinkService.allowUp(firstPic.get(2)))) {System.out.println("此用户不允许上传");continue;}if (upByLinkService.allowUp(firstPic.get(2)) == null) {upByLinkService.saveAuthor(firstPic);}upByLinkService.savePicture(firstPic, sourceLink);System.out.println("上传成功");}}
}

25条动态之前的动态获取的方法：将每次获取的25条动态的最后一条动态id作为参数，替换到api后面，不断循环，即可获取所有的动态。

手动上传文件方式

    public String fileUpload(IndexVo indexVo, MultipartFile file) {//创建输入输出流InputStream inputStream = null;OutputStream outputStream = null;try {//指定上传的位置String path = Path.path();//获取文件的输入流inputStream = file.getInputStream();//获取上传时的文件名String fileName = file.getOriginalFilename();assert fileName != null;String suffix = fileName.substring(fileName.lastIndexOf("."));String md5Name = Md5.getFileMd5(file)+suffix;//查重if(pictureService.findFile(md5Name) != null){return "文件已存在";}String name = indexVo.getAuthor();
//if("0".equals(upByLinkService.allowUp(name))){return "此用户不允许上传";}if(upByLinkService.allowUp(name) == null){Author author = new Author();author.setAuthor(name);author.setUid(name);author.setAllow("1");authorService.saveAuthor(author);}indexVo.setUid(name);indexVo.setFileName(md5Name);indexVo.setIsShow("0");indexVo.setTime(Math.toIntExact(new Date().getTime()/1000));pictureService.fileSave(indexVo);//注意是路径+文件名File targetFile = new File(path + md5Name);//判断文件父目录是否存在if (!targetFile.getParentFile().exists()) {//不存在就创建一个targetFile.getParentFile().mkdir();}//获取文件的输出流outputStream = new FileOutputStream(targetFile);//最后使用资源访问器FileCopyUtils的copy方法拷贝文件FileCopyUtils.copy(inputStream, outputStream);//告诉页面上传成功了return "上传成功，审核通过后将会显示";} catch (IOException e) {e.printStackTrace();return "上传失败，请稍后重试";} finally {//无论成功与否，都有关闭输入输出流if (inputStream != null) {try {inputStream.close();} catch (IOException e) {e.printStackTrace();}}if (outputStream != null) {try {outputStream.close();} catch (IOException e) {e.printStackTrace();}}}}

这里在保存时将文件名保存为文件的MD5值，防止有相同的图片被上传。

剩下的就是简单的crud了

源码：Ava_Wan/picwall (gitee.com)

演示：二创图墙demo (jiaran.fun)