WebDriver 登录 Jsoup抓取内容

2019独角兽企业重金招聘Python工程师标准>>>

1. 环境

pom:

<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>mybatis</groupId>
  <artifactId>test</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>test</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <!-- Adds the mybatis-generator plugin. -->
  <!-- To run it, enter "mybatis-generator:generate" in the Goals field. -->
  <build>
    <plugins>
      <plugin>
        <groupId>org.mybatis.generator</groupId>
        <artifactId>mybatis-generator-maven-plugin</artifactId>
        <version>1.3.2</version>
        <configuration>
          <verbose>true</verbose>
          <overwrite>true</overwrite>
        </configuration>
      </plugin>
    </plugins>
  </build>

  <dependencies>
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.38</version>
    </dependency>
    <dependency>
      <groupId>org.mybatis</groupId>
      <artifactId>mybatis</artifactId>
      <version>3.3.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.poi</groupId>
      <artifactId>poi</artifactId>
      <version>3.12</version>
    </dependency>
    <dependency>
      <groupId>commons-logging</groupId>
      <artifactId>commons-logging</artifactId>
      <version>1.2</version>
    </dependency>
    <dependency>
      <groupId>net.sourceforge.jexcelapi</groupId>
      <artifactId>jxl</artifactId>
      <version>2.6.12</version>
    </dependency>
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.2</version>
    </dependency>
    <!--
      junit was previously declared twice (3.8.1 with test scope and 4.12).
      Maven requires each groupId:artifactId to be unique within <dependencies>,
      and the crawler class imports org.junit.Test / BeforeClass / AfterClass,
      which only exist in JUnit 4, so only the 4.12 declaration is kept.
    -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
    </dependency>
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.8.3</version>
    </dependency>
    <dependency>
      <groupId>org.seleniumhq.selenium</groupId>
      <artifactId>selenium-server</artifactId>
      <version>2.53.0</version>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
    </dependency>
  </dependencies>
</project>

2. 初始化WebDriver的类 DriverFactory.java

package test;import java.util.Arrays;import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.DesiredCapabilities;public class DriverFactory {public static ChromeDriver create() {// TODO Auto-generated method stubString chromdriver = "C:\\Users\\admin\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe";System.setProperty("webdriver.chrome.driver", chromdriver);ChromeOptions options = new ChromeOptions();DesiredCapabilities capabilities = DesiredCapabilities.chrome();capabilities.setCapability("chrome.switches", Arrays.asList("--start-maximized"));options.addArguments("--test-type", "--start-maximized");ChromeDriver driver = new ChromeDriver(options);return driver;}}

3. 西祠胡同的登录抓取类

package test;import java.io.File;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import org.openqa.jetty.http.SSORealm;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.Platform;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.ie.InternetExplorerDriver;
import org.openqa.selenium.remote.DesiredCapabilities;
import mx4j.log.Log;public class XiciLogin2 {public Logger log = Logger.getLogger(Main.class);public static Set<Cookie> cookies = new HashSet<Cookie>();public static ChromeDriver driver = DriverFactory.create();/** * 抓取到每一个分页上所有详细页链接 * *@param url */public List<String> crawlSource(String url) {int time = 1;System.out.println("开始抓： " + url);log.info("开始抓： " + url);List<String> sourceUrls = new ArrayList<String>();String baseUrl = "http://www.xici.net";driver.get(url);Document document = Jsoup.parse(driver.getPageSource());WebElement webElement = driver.findElement(By.xpath(".//*[@id='board_t']/tbody/tr/td[2]/a"));Elements elements = document.select("table#board_t tbody tr");System.out.println(elements);if (elements != null) {for (Element element : elements) {if (element.select("td").isEmpty()) {continue;}String targets = element.select("td a[onclick=this.parentNode.className ='visited';]").attr("href");if (targets == "" || targets == null) {continue;}targets = baseUrl + targets; // System.out.println(targets);sourceUrls.add(targets);}} else{System.out.println(url + "中没有详细页链接~~");}System.out.println(sourceUrls.size());if (sourceUrls.size() == 0 && time <= 5) {System.out.println("抓不到啦~ 重新抓一下");crawlSource(url);time++;}return sourceUrls;}/** * 解析详细页 出东西 */public void crawlTarget(String url) {driver.get(url);Document document = Jsoup.parse(driver.getPageSource());System.out.println("抓" + url + "的标题"); // 取标题 Element element =document.select("div#doc_tit h1").first();if (element != null) {System.out.println("标题:" + element.text());} else {System.out.println("");}}public static void main(String[] args) {PropertyConfigurator.configure("log4j.properties");XiciLogin2 xc = new XiciLogin2();String site = "http://www.xici.net/b1513005/";try {xc.xiciLogin(); //xc.crawlTarget("http://www.xici.net/d191739198.htm");xc.getMaxPageNum("http://www.xici.net/b1468535/");int page = 1; //int maxPageNum = xc.getMaxPageNum(site);do {String sourceUrl = site + page;System.out.println("分页: " + 
sourceUrl);List<String> targetsList = xc.crawlSource(sourceUrl);if (targetsList.isEmpty()) {System.out.println("没抓到详细页！！");} else {for (String target : targetsList) {try {xc.crawlTarget(target);Thread.sleep(3000);} catch (Exception e) {e.printStackTrace();}}}page++;Thread.sleep(3500);} while (page <= 15);} catch (Exception e) {e.printStackTrace();}}/** * 获取当前入口site的最大分页数 **/public int getMaxPageNum(String site) {Document document = null;int maxPageNum = 0;try {document = Jsoup.connect(site).get();Element element = document.select("div#page").first();String s = element.text();if (s.contains("共")) {s = s.split("共")[1];s = s.split("页")[0];}System.out.println(s);maxPageNum = Integer.parseInt(s);} catch (IOException e) {e.printStackTrace();}return maxPageNum;}public void xiciLogin() throws Exception {System.setProperty("webdriver.chrome.driver","C:\\Users\\admin\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe");driver.get("http://account.xici.net/login");WebElement user = driver.findElement(By.name("username"));WebElement pwa = driver.findElement(By.name("password")); // 分别将用户名和密码文本框清空// user.clear();pwa.clear(); // 输入用户名和密码 user.sendKeys("*******");pwa.sendKeys("*********"); // 找到登陆按钮点击 //driver.findElement(By.name("TANGRAM__PSP_3__submit")).click();driver.findElement(By.xpath("html/body/div[3]/div[2]/div[2]/form/div[4]/button")).click();// 输出title System.out.println(driver.getTitle()); cookies =driver.manage().getCookies();System.out.println(cookies);for (Cookie cookie2 : cookies) {driver.manage().addCookie(cookie2);System.out.println(cookie2);} // 能打开15页说明登陆成功 //driver.get("http://www.xici.net/b1402132/15");}
}

转载于:https://my.oschina.net/u/2561483/blog/659149

WebDriver 登陆 Jsoup抓取内容相关推荐

python登录新浪微博抓取微博内容_python机器登陆新浪微博抓取数据
使用python机器登陆新浪微博抓取数据 1.[代码][Python]代码 # import 这边需要注意的是只有一个rsa这个模块是需要install的,其他的都是内置 import re , ur ...
Jsoup抓取网页数据完成一个简易的Android新闻APP
前言:作为一个篮球迷,每天必刷NBA新闻.用了那么多新闻APP,就想自己能不能也做个简易的新闻APP.于是便使用Jsoup抓取了虎扑NBA新闻的数据,完成了一个简易的新闻APP.虽然没什么技术含量,但 ...
android 获取手机a标签页,Android关于对Jsoup抓取a标签和br标签之间的解决办法...
Jsoup官方给出的文档,链接:http://www.open-open.com/jsoup/ 描述问题: 学校教务处系统中,我想获取所有科目以及对应的成绩,因此我采用了Jsoup抓取采集成绩:fe ...
springboot+jsoup抓取新闻网站信息
springboot+jsoup抓取新闻网站信息步骤: 一.导入jar包二.解析凤凰网新闻 jsoup获取动态生成的js内容 service serviceImpl mapper domian 步 ...
使用Jsoup抓取京东图书分类页面图书信息
一.目的: 1.任务使用 Jsoup抓取京东图书分类页面的图书信息. 抓取目标分类网址例如:https://list.jd.com/list.html?cat=1713,3259,3330 给与的某 ...
python爬取小说出现乱码_详解Python解决抓取内容乱码问题（decode和encode解码）
一.乱码问题描述经常在爬虫或者一些操作的时候,经常会出现中文乱码等问题,如下原因是源网页编码和爬取下来后的编码格式不一致二.利用encode与decode解决乱码问题字符串在Python内部的 ...
superagent post php,node.js,superagent_superagent抓取内容后如何传值？，node.js,superagent - phpStudy...
superagent抓取内容后如何传值? 'use strict' var superagent = require("superagent"); var cheerio = re ...
java模拟登陆系统_Java模拟登录系统抓取内容【转载】
1 @Component2 public class Login extendsBaseJobs {3 4 SimpleDateFormat sdf = new SimpleDateFormat(&q ...
python爬取数据案例分析_基于Python及webdriver的网页抓取案例
上次有朋友问怎么抓取交易所网站的数据,特别是历史数据,这里特别推荐使用selenium这一自动化测试框架. 原本selenium是用来完成大量基于浏览器的自动化测试的,但由于可以方便地执行JS代码,摸 ...

WebDriver 登陆 Jsoup抓取内容

1. 环境

2. 初始化WebDriver的类 DriverFactory.java

3. 西祠胡同的登陆抓取类

WebDriver 登陆 Jsoup抓取内容相关推荐

最新文章

热门文章