哨得哨得
第一次在掘金發部落格, 感覺爽爽的, 簡書和掘金之間我還是選擇掘金了, 因為掘金才是開發者的平臺, 簡書大部分還是作者吧!(個人觀點, 賢者勿噴)
進入正題
簡單介紹:
本次帶來的一個用java寫的爬取吾愛破解網(大家都懂得, 不是什麼不正經的網站哈, 不過也是福利)最新更新的資源, 畢竟此網站一直不定時更新牛×哄哄的資源, 這個就是專門爬取最新分享的資源的 (什麼XX軟體啊, 某馬教程視訊啊....)
意圖 (原因):
- 本人剛剛接觸java(有半年了吧), 工作用到了html解析, 感覺離爬蟲不遠了, 就想涉足一下
- 本人資源收藏愛好者, 吾愛XX給了我海量資源, 但是由於大部分資源都是百度雲連結, 而且深知百毒雲有些敏感資源過時太快了, 所以想弄一個爬蟲, 自動爬取, 自動儲存(這一步下次更新完成吧)
- 畢竟誰也沒事執行下這個java程式, 後期會放入伺服器開通介面, 再用自己的微信小程式呼叫
(PS:有木有懂前端(喜歡開發UI)的來指導指導我啊!)
, 這樣只要在微信就可以直接看到最新的資源了, (不只是資源哦, 還有連結, 回覆, 連結狀態等等)
用到的知識點
- java基礎
- jsoup 解析html第三方jar
- okhttp 瀏覽器請求第三方jar
- 正規表示式(正規表示式 + Excel + NotePad++ + 列編輯模式幾乎解決所有字串批處理問題, 下次演示)
程式碼(兩個類):
GetInfo.java
package test;
import com.mtl.pojo.Item;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.ibatis.io.Resources;
import org.apache.ibatis.session.SqlSession;
import org.apache.ibatis.session.SqlSessionFactory;
import org.apache.ibatis.session.SqlSessionFactoryBuilder;
import org.junit.Test;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
/**
 * Crawls the "newest threads" guide listing of 52pojie.cn page by page, parses every
 * page with {@link ParseHtml} and stores the resulting items through MyBatis.
 *
 * <p>The title of the newest thread of a run is remembered in a static field so that the
 * next run (within the same JVM) can stop as soon as it reaches already-seen threads.
 */
public class GetInfo {
    /** Listing URL of the "newest threads" guide; the 1-based page number is appended. */
    private static final String NEW_THREAD_URL =
            "https://www.52pojie.cn/forum.php?mod=guide&view=newthread&page=";

    /** Number of listing pages scanned per run. */
    private static final int MAX_PAGES = 8;

    /** Title of the newest thread seen by the previous run; parsing stops when it is reached. */
    private static String lastTopic;

    /** Title of the first thread on page 1 of the current run; becomes lastTopic when the run ends. */
    private static String thisTopic;

    /**
     * Fetches up to {@code MAX_PAGES} listing pages, inserts the parsed items and stops early
     * once the parser reports that the previous run's newest thread was reached.
     * Any exception aborts the run and is printed; {@code lastTopic} is then left unchanged.
     */
    @Test
    public void getInfo() {
        try {
            OkHttpClient client = new OkHttpClient();
            for (int page = 1; page <= MAX_PAGES; page++) {
                Request request = new Request.Builder()
                        .url(NEW_THREAD_URL + page)
                        .build();
                // try-with-resources releases the connection even for non-2xx responses.
                try (Response response = client.newCall(request).execute()) {
                    if (!response.isSuccessful()) {
                        continue;
                    }
                    String html = response.body().string();
                    ParseHtml parseHtml = new ParseHtml();
                    List<Item> items = parseHtml.getCurrentPageItems(html, lastTopic);
                    testInsert(items);
                    // Page 1 is empty when nothing new was posted since the last run;
                    // in that case keep the previously remembered title.
                    if (page == 1 && !items.isEmpty()) {
                        thisTopic = items.get(0).getTitle();
                    }
                    if (parseHtml.isFind()) {
                        break;
                    }
                }
            }
            lastTopic = thisTopic;
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Inserts the given items with the {@code com.mtl.mapper.ItemMapper.insertItems} statement
     * and commits. A {@code null} or empty list is ignored.
     *
     * @param items items to persist
     */
    public void testInsert(List<Item> items) {
        if (items == null || items.isEmpty()) {
            // Nothing to store. NOTE(review): presumably the mapper builds one multi-row
            // INSERT, which would be invalid with zero rows; verify against the mapper XML.
            return;
        }
        try (InputStream config = Resources.getResourceAsStream("mybatis.xml")) {
            SqlSessionFactory factory = new SqlSessionFactoryBuilder().build(config);
            // The session is closed even when insert/commit throws.
            try (SqlSession session = factory.openSession()) {
                int insert = session.insert("com.mtl.mapper.ItemMapper.insertItems", items);
                System.out.println("insert = " + insert);
                session.commit();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
複製程式碼
ParseHtml.java
用來解析html字串的工具類吧(不過並沒有設定靜態方法, 為了以後spring管理哈哈)
package test;
import com.mtl.pojo.Item;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Parses one listing page of the forum into {@link Item} objects and, for threads without a
 * read-permission marker, fetches the thread page to extract Baidu pan links and passwords.
 */
public class ParseHtml {
    /** Prefix prepended to the relative hrefs found in the listing. */
    private static final String SITE_ROOT = "https://www.52pojie.cn/";

    /** One client for all requests so connections and threads are reused. */
    private static final OkHttpClient HTTP_CLIENT = new OkHttpClient();

    /**
     * Matches a Baidu pan share link (group 1) followed later by a 4-character password (group 3).
     * NOTE(review): the keyword literal is written in Traditional Chinese; confirm it matches
     * the text the forum actually serves.
     */
    private static final Pattern LINK_AND_PWD = Pattern.compile("[^\"](https://pan.baidu.com/s/[\\w\\-0-9_]+[a-zA-Z_0-9])((?!https).)+密碼: ?([a-zA-Z0-9]{4})[^a-zA-Z0-9]");

    /** Set once the title passed as {@code lastTitle} has been reached; the caller's stop condition. */
    private boolean isFind = false;

    /**
     * Builds an item for every thread row of the listing page, stopping at {@code lastTitle}.
     *
     * <p>Cells are read by position: 0 = thread URL and optional permission marker,
     * 1 = partition, 2 = author and publish time, 3 = reply and view counts,
     * 4 = last reply name, time and URL.
     *
     * @param html      HTML of the listing page
     * @param lastTitle title at which parsing stops (may be {@code null} to parse everything)
     * @return items in page order; empty when the listing table is not present
     * @throws IOException if fetching a thread page fails
     */
    public List<Item> getCurrentPageItems(String html, String lastTitle) throws IOException {
        List<Item> items = new ArrayList<>();
        Element marker = Jsoup.parse(html).body().selectFirst("div#forumnew");
        Element table = marker == null ? null : marker.nextElementSibling();
        if (table == null) {
            // Unexpected page layout (e.g. an error page): nothing to parse.
            return items;
        }
        for (Element row : table.select("tbody")) {
            Element titleLink = row.selectFirst("a.xst");
            if (titleLink == null) {
                // A row without a title link cannot be turned into an item; skip it
                // instead of failing the whole page.
                continue;
            }
            String title = titleLink.html();
            if (title.equals(lastTitle)) {
                isFind = true;
                break;
            }
            Item item = new Item();
            item.setTitle(title);
            Elements cells = row.select("td");
            for (int i = 0; i < cells.size(); i++) {
                fillFromCell(item, i, cells.get(i));
            }
            parseLink(item);
            items.add(item);
        }
        return items;
    }

    /** Copies the values of the listing cell at position {@code index} into {@code item}. */
    private void fillFromCell(Item item, int index, Element td) {
        switch (index) {
            case 0:
                item.setUrl(SITE_ROOT + td.selectFirst("a").attr("href"));
                Element span = td.selectFirst("span");
                if (span != null) {
                    item.setAuthorityLevel(span.html());
                }
                break;
            case 1:
                item.setPartition(td.selectFirst("a").html());
                break;
            case 2:
                item.setAuther(td.selectFirst("a").html());
                item.setPublishTime(td.selectFirst("span").html());
                break;
            case 3:
                item.setReplyNum(td.selectFirst("a").html());
                item.setViewNum(td.selectFirst("em").html());
                break;
            case 4:
                item.setLastReplyName(td.selectFirst("a").html());
                item.setLastReplyTime(td.selectFirst("em").selectFirst("a").html());
                item.setLastReplyUrl(SITE_ROOT + td.selectFirst("a").attr("href"));
                break;
            default:
                break;
        }
    }

    /**
     * Fetches the thread page of an item that carries no permission marker and stores every
     * distinct link/password pair found on it via {@link Item#setLinksAndPwdsStr(String)}.
     *
     * @param item item whose URL is fetched
     * @throws IOException if the request fails
     */
    private void parseLink(Item item) throws IOException {
        if (item.getAuthorityLevel() != null) {
            return;
        }
        Request request = new Request.Builder()
                .url(item.getUrl())
                .build();
        // try-with-resources releases the connection even for non-2xx responses.
        try (Response response = HTTP_CLIENT.newCall(request).execute()) {
            if (!response.isSuccessful()) {
                return;
            }
            Matcher matcher = LINK_AND_PWD.matcher(response.body().string());
            StringBuilder links = new StringBuilder();
            StringBuilder pwds = new StringBuilder();
            while (matcher.find()) {
                // Search for the link including its ';' terminator so that a link which is
                // merely a prefix of an already stored one is not mistaken for a duplicate.
                String entry = matcher.group(1) + ";";
                if (links.indexOf(entry) == -1) {
                    links.append(entry);
                    pwds.append(matcher.group(3)).append(";");
                }
            }
            if (links.length() > 0) {
                item.setLinksAndPwdsStr(links + "#;#" + pwds);
            }
        }
    }

    /**
     * Manual probe: prints the body returned for a thread and either reports that read
     * permission is required or prints the link/password pairs found.
     *
     * @throws IOException if the request fails
     */
    @Test
    public void testLink() throws IOException {
        Request request = new Request.Builder()
                .url("https://www.52pojie.cn/thread-719615-1-1.html")
                .build();
        try (Response response = HTTP_CLIENT.newCall(request).execute()) {
            if (!response.isSuccessful()) {
                return;
            }
            String string = response.body().string();
            Matcher authLevel = Pattern.compile("抱歉,本帖要求閱讀許可權高於 \\d+ 才能瀏覽").matcher(string);
            System.out.println(string);
            if (authLevel.find()) {
                System.out.println("需要許可權");
            } else {
                Matcher matcher = LINK_AND_PWD.matcher(string);
                while (matcher.find()) {
                    System.out.println("match = " + matcher.group(1) + "--" + matcher.group(3));
                }
            }
        }
    }

    /** @return whether the stop title has been reached */
    public boolean isFind() {
        return isFind;
    }

    /** @param find new value of the stop flag */
    public void setFind(boolean find) {
        isFind = find;
    }
}
複製程式碼
Item.java
實體類
package com.mtl.pojo;
/**
 * One forum thread as shown in the listing, plus the download links found inside it.
 *
 * <p>Links and passwords are kept both as arrays and as one combined string
 * ({@code link1;link2;#;#pwd1;pwd2;}); setting the string re-derives the arrays.
 */
public class Item {
    /** Separator between the link part and the password part of the combined string. */
    private static final String PART_SEPARATOR = "#;#";
    /** Separator between individual links or passwords. */
    private static final String ENTRY_SEPARATOR = ";";

    private String title;            // thread title
    private String url;              // thread URL
    private String[] links;          // Baidu pan links, derived from linksAndPwdsStr
    private String[] pwds;           // passwords matching links, derived from linksAndPwdsStr
    private String linksAndPwdsStr;  // combined form: links + "#;#" + passwords
    private String publishTime;      // publish time
    private String authorityLevel;   // read-permission marker
    private String partition;        // forum section of the thread
    private String auther;           // thread author
    private String replyNum;         // number of replies
    private String viewNum;          // number of views
    private String lastReplyName;    // account of the last reply
    private String lastReplyTime;    // time of the last reply
    private String lastReplyUrl;     // URL of the last reply
    private String firstPageReply;   // replies on the first page
    private boolean isNeedReply;     // whether a reply is required to see the download link
    private int searchLinkTimes;     // number of link searches so far, for a later retry threshold

    public String getLinksAndPwdsStr() {
        return linksAndPwdsStr;
    }

    /**
     * Stores the combined string and re-derives {@code links} and {@code pwds} from it.
     * A {@code null} or empty value, or a missing part, yields an empty array for that part
     * instead of an exception.
     *
     * @param linksAndPwdsStr combined string of the form {@code link;link;#;#pwd;pwd;}
     */
    public void setLinksAndPwdsStr(String linksAndPwdsStr) {
        if (linksAndPwdsStr == null || linksAndPwdsStr.equals("")) {
            links = new String[]{};
            pwds = new String[]{};
        } else {
            // split drops trailing empty parts, so a value without passwords (or without
            // the separator at all) produces fewer than two parts.
            String[] parts = linksAndPwdsStr.split(PART_SEPARATOR);
            links = parts.length > 0 ? parts[0].split(ENTRY_SEPARATOR) : new String[]{};
            pwds = parts.length > 1 ? parts[1].split(ENTRY_SEPARATOR) : new String[]{};
        }
        this.linksAndPwdsStr = linksAndPwdsStr;
    }

    /** @return links derived from the combined string, or {@code null} if it was never set */
    public String[] getLinks() {
        return links;
    }

    /** @return passwords derived from the combined string, or {@code null} if it was never set */
    public String[] getPwds() {
        return pwds;
    }

    public String getFirstPageReply() {
        return firstPageReply;
    }

    public void setFirstPageReply(String firstPageReply) {
        this.firstPageReply = firstPageReply;
    }

    public boolean isNeedReply() {
        return isNeedReply;
    }

    public void setNeedReply(boolean needReply) {
        isNeedReply = needReply;
    }

    public int getSearchLinkTimes() {
        return searchLinkTimes;
    }

    public void setSearchLinkTimes(int searchLinkTimes) {
        this.searchLinkTimes = searchLinkTimes;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getPublishTime() {
        return publishTime;
    }

    public void setPublishTime(String publishTime) {
        this.publishTime = publishTime;
    }

    public String getAuthorityLevel() {
        return authorityLevel;
    }

    public void setAuthorityLevel(String authorityLevel) {
        this.authorityLevel = authorityLevel;
    }

    public String getPartition() {
        return partition;
    }

    public void setPartition(String partition) {
        this.partition = partition;
    }

    public String getAuther() {
        return auther;
    }

    public void setAuther(String auther) {
        this.auther = auther;
    }

    public String getReplyNum() {
        return replyNum;
    }

    public void setReplyNum(String replyNum) {
        this.replyNum = replyNum;
    }

    public String getViewNum() {
        return viewNum;
    }

    public void setViewNum(String viewNum) {
        this.viewNum = viewNum;
    }

    public String getLastReplyName() {
        return lastReplyName;
    }

    public void setLastReplyName(String lastReplyName) {
        this.lastReplyName = lastReplyName;
    }

    public String getLastReplyTime() {
        return lastReplyTime;
    }

    public void setLastReplyTime(String lastReplyTime) {
        this.lastReplyTime = lastReplyTime;
    }

    public String getLastReplyUrl() {
        return lastReplyUrl;
    }

    public void setLastReplyUrl(String lastReplyUrl) {
        this.lastReplyUrl = lastReplyUrl;
    }
}
複製程式碼
由於資料庫儲存陣列很麻煩, 所以我想了一個折中的辦法, 在實體類上下了手腳, 有興趣的小夥伴可以看一下
後續打算
- 完成ssm專案,配置好服務, 測試介面
- 完成微信小程式UI, 使用伺服器介面
- 上線微信小程式
- 完成自動登入, 防止閱讀許可權無法獲取問題
- 自動判斷百度盤是否失效, 自動去除該item
- 自動回覆獲取需要回復才可以檢視隱藏連結的帖子