動態頁面爬蟲前的準備:https://www.cnblogs.com/maohuidong/p/18517953
一:java新增maven依賴:
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.4</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.4</version>
</dependency>
<!--selenium依賴-->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency>
二:重寫PageProcessor:
import java.util.List;
import java.util.Set;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.RemoteWebDriver;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * WebMagic {@link PageProcessor} that scrapes the interface list at
 * http://yapi.rongyi.so/project/317/interface/api.
 *
 * <p>The target page is only reachable after logging in at http://yapi.rongyi.so/login,
 * and its pagination does not change the URL. WebMagic therefore only sees the first
 * page; every following page is reached by clicking the "next page" control through
 * Selenium and is read straight from the live browser DOM.
 */
public class YapiPageProcessor implements PageProcessor {

    private static final String CHROME_DRIVER_PROPERTY = "webdriver.chrome.driver";
    private static final String DEFAULT_CHROME_DRIVER_PATH =
            "E:\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe";

    private static final String TABLE_BODY_XPATH = "//tbody[@class='ant-table-tbody']";
    // NOTE(review): the title text must match the language the site actually renders — confirm.
    private static final String NEXT_PAGE_XPATH = "//li[@title='下一頁' and @aria-disabled='false']";

    /** Pause after clicking "next page" so the table can re-render. */
    private static final long PAGE_TURN_WAIT_MILLIS = 2000L;
    /** Upper bound on how long login() waits for the browser to leave the login page. */
    private static final long LOGIN_WAIT_MILLIS = 10000L;
    private static final long LOGIN_POLL_MILLIS = 250L;

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(0).setTimeOut(3000);

    /** Cookies captured by {@link #login}; stays null until login has been called. */
    private Set<Cookie> cookies;

    private final RemoteWebDriver driver;

    /**
     * Starts a Chrome browser through chromedriver.
     *
     * <p>The chromedriver location is taken from the {@code webdriver.chrome.driver} system
     * property when it is already set; otherwise the built-in default path is used.
     */
    public YapiPageProcessor() {
        if (System.getProperty(CHROME_DRIVER_PROPERTY) == null) {
            System.setProperty(CHROME_DRIVER_PROPERTY, DEFAULT_CHROME_DRIVER_PATH);
        }
        ChromeOptions chromeOptions = new ChromeOptions();
        // Works around the 403 error raised when connecting to the browser.
        chromeOptions.addArguments("--remote-allow-origins=*");
        // Enable the next line to run without a visible browser window.
        // chromeOptions.addArguments("--headless");
        chromeOptions.addArguments("--window-size=1440,1080");
        this.driver = new ChromeDriver(chromeOptions);
    }

    /**
     * Extracts key/value pairs from the first page (HTML supplied by the downloader) and
     * then from every following page by clicking "next page" in the browser.
     *
     * @param page the page produced by the downloader; extracted pairs are stored as its fields
     * @throws RuntimeException if the thread is interrupted while waiting for a page turn
     */
    @Override
    public void process(Page page) {
        List<String> keys = page.getHtml().xpath("//tbody[@class='ant-table-tbody']/tr/td/a/span/text()").all();
        List<String> values = page.getHtml().xpath("//tbody[@class='ant-table-tbody']/tr/td/div/span[3]/text()").all();
        // The two lists come from independent queries; only pair up what both provide
        // so a row missing one cell cannot cause an IndexOutOfBoundsException.
        int pairCount = Math.min(keys.size(), values.size());
        for (int i = 0; i < pairCount; i++) {
            page.putField(keys.get(i), values.get(i));
        }

        while (true) {
            // Look the control up once and click that same element.
            List<WebElement> nextButtons = driver.findElements(By.xpath(NEXT_PAGE_XPATH));
            if (nextButtons.isEmpty()) {
                break;
            }
            nextButtons.get(0).click();
            pause(PAGE_TURN_WAIT_MILLIS);
            collectCurrentTable(page);
        }
    }

    /** Reads every complete row of the table currently shown in the browser into the page. */
    private void collectCurrentTable(Page page) {
        int rowCount = driver.findElements(By.xpath(TABLE_BODY_XPATH + "/tr")).size();
        for (int row = 1; row <= rowCount; row++) {
            // Re-query by index on every iteration instead of holding row elements.
            List<WebElement> keyCells =
                    driver.findElements(By.xpath(TABLE_BODY_XPATH + "/tr[" + row + "]/td/a/span"));
            List<WebElement> valueCells =
                    driver.findElements(By.xpath(TABLE_BODY_XPATH + "/tr[" + row + "]/td/div/span[3]"));
            if (keyCells.isEmpty() || valueCells.isEmpty()) {
                // Skip rows without both cells rather than aborting the whole crawl.
                continue;
            }
            page.putField(keyCells.get(0).getText(), valueCells.get(0).getText());
        }
    }

    /**
     * Logs in through the browser and stores the resulting cookies.
     *
     * @param loginUrl URL of the login page
     * @param userName value typed into the element with id {@code email}
     * @param password value typed into the element with id {@code password}
     */
    public void login(String loginUrl, String userName, String password) {
        driver.get(loginUrl);

        WebElement emailInput = driver.findElement(By.id("email"));
        emailInput.clear();
        emailInput.sendKeys(userName);

        WebElement passwordInput = driver.findElement(By.id("password"));
        passwordInput.clear();
        passwordInput.sendKeys(password);

        String urlBeforeSubmit = driver.getCurrentUrl();
        driver.findElement(By.xpath("//button[@type='submit']")).click();

        // Reading cookies right after the click can happen before the login request
        // finishes. Assumes a successful login navigates away from the login URL —
        // TODO confirm; when it does not, the wait simply times out and we proceed.
        waitUntilUrlChanges(urlBeforeSubmit);

        cookies = driver.manage().getCookies();
    }

    /** Polls until the browser URL differs from {@code startUrl} or the login wait elapses. */
    private void waitUntilUrlChanges(String startUrl) {
        long deadline = System.currentTimeMillis() + LOGIN_WAIT_MILLIS;
        while (System.currentTimeMillis() < deadline && startUrl.equals(driver.getCurrentUrl())) {
            pause(LOGIN_POLL_MILLIS);
        }
    }

    /** Sleeps for the given time; restores the interrupt flag before propagating an interrupt. */
    private static void pause(long millis) {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    }

    /**
     * Returns the site configuration, with the cookies captured by {@link #login} (if any)
     * and a fixed User-Agent header applied.
     */
    @Override
    public Site getSite() {
        // cookies is null when login() has not been called; return the site without them.
        if (cookies != null) {
            for (Cookie cookie : cookies) {
                site.addCookie(cookie.getName(), cookie.getValue());
            }
        }
        return site.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1");
    }

    /** Logs in, crawls the interface list with one thread, and writes results as JSON files. */
    public static void main(String[] args) {
        YapiPageProcessor pageProcessor = new YapiPageProcessor();
        try {
            // Log in through Selenium first so the browser session is authenticated.
            pageProcessor.login("http://yapi.rongyi.so/login", "xxx", "xxx");
            Spider.create(pageProcessor)
                    .addUrl("http://yapi.rongyi.so/project/317/interface/api")
                    // Custom downloader that renders pages in the same browser.
                    .setDownloader(new MyDownloader(pageProcessor.driver))
                    // Results are written under D:\webmagic1\.
                    .addPipeline(new JsonFilePipeline("D:\\webmagic1\\"))
                    .thread(1)
                    .run();
            System.out.println("爬取結束");
        } finally {
            // Always shut the browser and chromedriver process down, even on failure.
            pageProcessor.driver.quit();
        }
    }
}
三:重寫Downloader:
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.RemoteWebDriver;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.selector.PlainText;
import org.openqa.selenium.Cookie;
import java.util.Map;
public class MyDownloader implements Downloader {
//宣告驅動
private RemoteWebDriver driver;
/**
 * Creates a downloader that loads pages through the given browser driver.
 *
 * @param driver the browser driver used by {@code download}; must not be null
 * @throws NullPointerException if {@code driver} is null
 */
public MyDownloader(RemoteWebDriver driver) {
    // Fail at construction time instead of on the first download call.
    if (driver == null) {
        throw new NullPointerException("driver must not be null");
    }
    this.driver = driver;
}
/**
 * Loads the requested URL in the browser and returns the rendered HTML as a {@link Page}.
 *
 * <p>A fresh Selenium session starts on a {@code data:} URL, so the target URL is opened
 * once first to put the browser on the right domain; only then can the site's cookies be
 * added, after which the URL is opened a second time.
 *
 * @param request the request whose URL is loaded
 * @param task the task supplying the {@link Site} and its cookies
 * @return the rendered page, or a page marked as not successfully downloaded if the
 *         thread was interrupted while waiting
 */
@Override
public Page download(Request request, Task task) {
    try {
        // First load: puts the browser on the target domain.
        driver.get(request.getUrl());
        Thread.sleep(3000);

        Site site = task.getSite();
        Map<String, String> siteCookies = site.getCookies();
        if (siteCookies != null && !siteCookies.isEmpty()) {
            for (Map.Entry<String, String> cookieEntry : siteCookies.entrySet()) {
                driver.manage().addCookie(new Cookie(cookieEntry.getKey(), cookieEntry.getValue()));
            }
            // Second load: reopen the URL now that the cookies are in place.
            driver.get(request.getUrl());
        }

        Thread.sleep(2000);
        // Scroll down so content further down the page gets rendered.
        driver.executeScript("window.scrollTo(0, document.body.scrollHeight - 1000)");
        Thread.sleep(2000);

        Page page = createPage(request.getUrl(), driver.getPageSource());
        // Attach the original request so its metadata is not replaced by a bare copy.
        page.setRequest(request);
        return page;
    } catch (InterruptedException e) {
        // Restore the interrupt flag and report a failed download rather than
        // returning null, which the caller would dereference.
        Thread.currentThread().interrupt();
        Page failed = new Page();
        failed.setRequest(request);
        failed.setDownloadSuccess(false);
        return failed;
    }
}
/**
 * Intentionally does nothing: the thread count is ignored by this downloader.
 *
 * @param threadNum requested number of threads (unused)
 */
@Override
public void setThread(int threadNum) {
}
/**
 * Wraps downloaded content in a Page marked as successfully downloaded.
 *
 * @param url the URL the content was loaded from
 * @param content the raw page source
 * @return a new Page carrying the URL, a request for that URL, and the raw text
 */
private Page createPage(String url, String content) {
    final Page result = new Page();
    result.setDownloadSuccess(true);
    result.setRequest(new Request(url));
    result.setUrl(new PlainText(url));
    result.setRawText(content);
    return result;
}