1.爬虫开发准备

开发工具：Eclipse/IDEA

浏览器：Google Chrome

浏览器Selenium驱动：Selenium 3.5

Jar包：selenium-java、commons-httpclient、log4j、fastjson、javacsv、commons-lang3（与下方代码中的import对应）

// Selenium驱动版本需要和Chrome浏览器版本对应，

// 下载地址：http://chromedriver.storage.googleapis.com/index.html

//如果Selenium出现报错请看这篇文章：https://blog.csdn.net/qq_33259323/article/details/106445163

2.流程

获取Cookie(终端输入或者使用Selenium打开扫码登录)
请求https://api.bilibili.com/x/relation/followers接口
解析数据
存入csv

3.编码

package com.mm.rep;

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Scanner;
import java.util.Set;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.csvreader.CsvWriter;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.apache.commons.lang3.StringUtils;public class Main {private static final Logger logger = LogManager.getLogger(Main.class);private static WebDriver driver = null;private static GetMethod getMethod = null;private static Set<Cookie> bcookies = null;private final static String BLOGINURL = "https://passport.bilibili.com/login";private final static String BMAINPAGE = "https://www.bilibili.com/";Main(){BasicConfigurator.configure();// 初始化GetMethod,设置不变的RequestHeadergetMethod = new GetMethod();getMethod.getParams().setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET, "UTF-8");getMethod.addRequestHeader(":authority", "api.bilibili.com");getMethod.addRequestHeader(":method", "api.bilibili.com");getMethod.addRequestHeader(":scheme", "GET");getMethod.addRequestHeader(":scheme", "https");getMethod.addRequestHeader("accept", "*/*");getMethod.addRequestHeader(":scheme", "https");getMethod.addRequestHeader("accept-language", "zh-CN,zh;q=0.9");getMethod.addRequestHeader("sec-fetch-dest", "script");getMethod.addRequestHeader("sec-fetch-mode", "no-cors");getMethod.addRequestHeader("sec-fetch-site", "same-site");getMethod.addRequestHeader("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36");}public static String getCookie() throws InterruptedException {String scCookie = null;Scanner ip = new Scanner(System.in);logger.info("请输入Cookie,如果没有请按回车:");scCookie = ip.nextLine();if(scCookie.length() != 0) {return scCookie;}logger.info("开始扫码登录");// 设置驱动地址System.setProperty("webdriver.chrome.driver", "H:/chromedriver/chromedriver.exe");// 启动设置ChromeOptions options = new ChromeOptions();// 创建ChromeDriverdriver = new ChromeDriver(options);// 打开Bilibili登录页面driver.get(BLOGINURL);// 等待扫码登录while(true) {if(driver.getCurrentUrl().equals(BMAINPAGE)) {break;}else {Thread.sleep(100);}}logger.info("扫码登录成功");//获取cookiebcookies = driver.manage().getCookies();String cookie = StringUtils.join(bcookies, "; ");return 
cookie;}public static List<JSONObject> getFanS(String cookie,String vmid,int pn,int ps) throws InterruptedException, HttpException, IOException {HttpClient client = new HttpClient(); // 拼接urlStringBuffer sBuffer = new StringBuffer();sBuffer.append("https://api.bilibili.com/x/relation/followers?vmid=");sBuffer.append(vmid);sBuffer.append("&pn=");sBuffer.append(pn);sBuffer.append("&ps=");sBuffer.append(ps);sBuffer.append("&order=desc&jsonp=jsonp");getMethod.setURI(new URI(sBuffer.toString(), true));getMethod.getParams().setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET, "UTF-8");// 设置请求头getMethod.addRequestHeader("cookie", cookie);// 发送请求client.executeMethod(getMethod);// 获取数据String info = new String(getMethod.getResponseBody(), "UTF-8");JSONObject fans = JSONObject.parseObject(info).getJSONObject("data");JSONArray fArray = JSONArray.parseArray(fans.getString("list"));return JSON.parseArray(fArray.toJSONString(), JSONObject.class);}public static void main(String[] args) throws InterruptedException, HttpException{logger.info("程序开始...");new Main();// 获取CookieString cookie = Main.getCookie();CsvWriter csvWriter = new CsvWriter("C:\\Users\\computer\\Desktop\\aaa.csv", ',', Charset.forName("UTF-8"));String[] csvHeaders = { "mid", "粉丝名字","粉丝签名","粉丝头像"};try {csvWriter.writeRecord(csvHeaders);int pn = 1;boolean end = false;while(true) {for (JSONObject f : Main.getFanS(cookie, "309103931", pn, 20)) {if(f == null) {end = true;break;}String[] csvContent1 = {f.getString("mid"), f.getString("uname"),f.getString("sign"),f.getString("face")};System.out.println(csvContent1);csvWriter.writeRecord(csvContent1);}pn++;Thread.sleep(100);if(end == true) {break;}}} catch (IOException e) {System.out.println(e);e.printStackTrace();}csvWriter.close();driver.close();logger.info("程序结束");}}

4.缺点

容易被拦截，而且最多只能获取到不足1000个粉丝

详细代码：https://github.com/mmnbplus/BiliBiliReptiles

B站地址：https://www.bilibili.com/read/cv7361471

Java爬虫(Selement)-B站粉丝取关人排查(1)相关推荐

java爬虫的2种爬取方式（HTTP||Socket）简单Demo(一)
转载自 java爬虫的2种爬取方式(HTTP||Socket)简单Demo(一) 最近在找java的小项目自己写着玩,但是找不到合适的,于是写开始学一点爬虫,自己也是感觉爬虫比较有趣.这里自己找了一个 ...
java爬虫爬b站_Java + golang 爬取B站up主粉丝数
自从学习了爬虫,就想在B站爬取点什么数据,最近看到一些个up主涨粉很快,于是对up主的粉丝数量产生了好奇,所以就有了标题~ 首先,我天真的以为通过up主个人空间的地址就能爬到 https://spac ...
java爬虫自动识别验证码_简单Java爬虫（一）爬取手机号码
原创野狗菌希望你能喜欢今天关于本文: 本文介绍一个简单Java爬虫,获取网页源码,爬取电话号码. 本篇教程用我的博客一个测试网页演示. --野狗菌[希望你能喜欢] 测试页面: https:// ...
Java爬虫学习一一Jsoup爬取彼岸桌面分类下的图片
最近在找工作,在这个过程中我感到很迷茫,投了很多简历,被查看的却很少,其中也有到现场去面试,结果也很不理想(╥╯^╰╥). 哈哈,跑题了,我在看之前所做的项目时,在我的收藏夹中看到了以前收藏的有关爬虫 ...
java爬虫系列第二讲-爬取最新动作电影《海王》迅雷下载地址
为什么80%的码农都做不了架构师?>>> 1. 目标使用webmagic爬取动作电影列表信息爬取电影**<海王>**详细信息[电影名称.电影迅雷下载地址列表] ...
JAVA爬虫-上海公交线路爬取
最开始的数据获取也有两个思路虽然我的数据不是这么来的一 .http://www.zuobus.com/line-overview.php?c=2 这个网站有所有的上海的公交线路二.百度poi ...
java爬虫,提供链接直接爬取网页代码
其实我只想要爬到整个网页的源代码的就好.通过java的一个包jsoup,就可以直接爬取了,后面有下载源代码(含jsoup包)的链接. 输入:网页链接输出:网页源代码代码比较简单,解析都在代码中: ...
java爬虫之WebMagic实战抓取前程无忧招聘信息
webmagic教程 http://webmagic.io/docs/zh/ 入门案例 package com.hikktn.webmagic;import us.codecraft.webmagic ...
几十行代码实现Java爬虫，结合jsoup爬取网名昵称
原文链接:点击打开链接 crawler4j是一个开源爬虫框架(https://github.com/yasserg/crawler4j), 我们可以使用它进行爬虫.以爬取 http://www.nib ...

Java爬虫(Selement)-B站粉丝取关人排查(1)

1.爬虫开发准备

2.流程

3.编码

4.缺点

Java爬虫(Selement)-B站粉丝取关人排查(1)相关推荐

最新文章

热门文章