動機
希望可以從各大技術論壇抓取自己感興趣的問題。
技術
- puppeteer
- dotenv
詳細設計
- 核心功能:使用puppeteer模擬使用者在瀏覽器上的行為,操作DOM獲取資料
- 快速啟動:使用dotenv將不同論壇之間的差異項(如元素的選擇器等)寫入環境變數,然後在package.json檔案裡配置啟動命令,快速啟動
1. 目錄結構
|-- env
|   |-- csdn.env
|   |-- segmentfault.env
|-- index.js
複製程式碼
2. 程式碼設計
程式碼邏輯特別簡單,主要流程如下:
- 開啟瀏覽器
- 建立一個新的頁面
- 跳轉到目標網站
- 獲取資料
- 將資料列印到控制檯(或者寫入到資料庫)
- 關閉瀏覽器
因為puppeteer很容易模擬使用者在瀏覽器上的行為,所以DEMO的核心在於如何獲取資料,或者說如何實現get_news方法。
const puppeteer = require("puppeteer");
/**
 * Minimal scraping flow: launch a browser, open one page, run `get_news`
 * inside that page and print whatever it returns.
 */
(async () => {
  // Example target: the CSharp forum, i.e. the LIST_URL pattern from csdn.env
  // with the {tag} placeholder filled in. Replace with the page you want to scrape.
  const url = "https://bbs.csdn.net/forums/CSharp";

  // 1. Open browser
  const browser = await puppeteer.launch({});
  try {
    // 2. Create a new page
    const page = await browser.newPage();
    // 3. Go to the target website
    await page.goto(url, { waitUntil: "networkidle2" });
    // 4. Get data (get_news is serialised and executed in the page context)
    const data = await page.evaluate(get_news);
    // 5. Print out the data in the console
    console.log(data);
  } finally {
    // 6. Close browser, even when navigation or evaluation failed,
    //    so no headless Chromium process is left running.
    await browser.close();
  }
})().catch(err => {
  // Surface failures instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});
/**
 * Placeholder for the page-side scraping routine.
 * It takes no arguments and currently produces no value.
 */
function get_news() {
  // TODO: query the DOM here and build the list of news items
  return undefined;
}
複製程式碼
以CSDN為例,如下圖所示,在網頁裡我們可以很容易地通過原生的DOM操作document.querySelector(selector)或者jQuery的DOM操作$(selector)來找到頁面上的元素,從而獲取頁面資訊。puppeteer的page.evaluate方法會在頁面的上下文中執行傳入的函式,因此可以直接使用原生的DOM操作;如果頁面本身已載入jQuery(CSDN即是如此),也可以使用jQuery的DOM操作,獲取頁面資料就會變得很容易。具體程式碼如下所示。
/**
 * Collects the forum topics shown on the current page.
 *
 * Runs inside the page (via page.evaluate), so it relies on the page's own
 * jQuery (`$`) and must stay self-contained: no references to Node-side names.
 *
 * @returns {{title: string, link: string, date: string}[]} one entry per
 *   `.forums_title` element, paired by index with the `.forums_author em`
 *   elements; `date` is an empty string when no matching date element exists.
 */
function get_news() {
  // toArray() yields plain DOM elements, so native Array methods can be used.
  const titles = $(".forums_title").toArray();
  const dates = $(".forums_author em").toArray();

  return titles.map((title, i) => {
    const dateNode = dates[i];
    return {
      title: title.text,
      link: title.href,
      // Guard against a title without a date node instead of throwing.
      date: dateNode ? dateNode.textContent : ""
    };
  });
}
複製程式碼
現在執行程式碼可以在控制檯中列印出抓取到的網頁資料,如下圖所示。你同樣可以將資料寫入到資料庫。
3. 更多細節
程式碼裡還有很多的細節實現,因為都有詳細的註釋我就不一一展開了,感興趣的小夥伴可以閱讀後面的程式碼。主要包括的技術細節如下:
- 怎麼通過dotenv配置環境變數
- 怎麼在puppeteer的page.evaluate裡用console除錯
- 問題有效時間,關鍵字的檢查
- 怎麼在package.json檔案裡配置快速啟動的命令
參考
後記
我在前端上還是個小白,程式碼質量可能不高,如果有什麼問題希望大家在評論區裡及時指出,幫助小白成長,感激不盡!!!
完整程式碼
1. index.js
const puppeteer = require("puppeteer");
const {
resolve
} = require("path");
/**
 * Entry point. Loads the forum-specific settings from env/<path_name>, then
 * walks every configured tag page by page and prints the questions that
 * `get_news` selects on each page.
 *
 * @param {string} path_name  File name inside ./env (e.g. "csdn.env"), from PATH_NAME.
 * @param {string} start_time Start of the accepted time window, from START_TIME.
 */
(async (path_name, start_time) => {
  if (!path_name) {
    throw new Error("PATH_NAME is not set; expected the name of a file in ./env, e.g. csdn.env");
  }

  // 1. Resolve and load the environment file for the selected forum
  const dotenvPath = resolve(__dirname, "env", path_name);
  require("dotenv").config({
    path: dotenvPath
  });

  // 2. Read the settings up front so a bad configuration fails before a browser is started
  const tags = JSON.parse(process.env.TAGS);
  const titles = process.env.SELECTOR_TITLES;
  const dates = process.env.SELECTOR_DATES;
  const keywords = JSON.parse(process.env.KEYWORDS);
  const time_interval = process.env.TIME_INTERVAL;
  const para = { path_name, start_time, time_interval, titles, dates, keywords };

  // Build the list URL for a tag and a 1-based page index
  const get_news_url = (tag, pageIndex) =>
    process.env.LIST_URL.replace("{tag}", tag).replace("{pageIndex}", pageIndex);

  // 3. Open browser
  const browser = await puppeteer.launch({});
  try {
    // 4. Create a new page
    const page = await browser.newPage();

    // Forward console output from the headless page to the Node console
    page.on("console", msg => {
      msg.args().forEach((arg, i) => {
        console.log(`${i}: ${arg}`);
      });
    });

    // 5. Traverse all tags. This is deliberately sequential: every tag uses the
    //    same `page`, and concurrent page.goto calls on one page would race.
    for (const tag of tags) {
      for (let pageIndex = 1; ; ++pageIndex) {
        const pageUrl = get_news_url(tag, pageIndex);
        // 1) Go to the specified page
        await page.goto(pageUrl, { waitUntil: "networkidle2" });
        // 2) Get data by function get_news (executed inside the page)
        const found = await page.evaluate(get_news, para);
        // 3) Stop paging this tag once a page yields no matching data
        if (found.length === 0) break;
        // 4) Output captured data in console
        console.log(pageIndex, pageUrl);
        console.log(found);
      }
    }
  } finally {
    // 6. Close browser even if navigation or evaluation threw
    await browser.close();
  }
})(process.env.PATH_NAME, process.env.START_TIME).catch(err => {
  // Report failures instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});
/**
 * Selects the questions on the current page that were released inside the
 * accepted time window and whose title matches at least one keyword.
 *
 * Runs inside the page (via page.evaluate): it uses the page's jQuery (`$`)
 * and must stay self-contained, so all helpers are nested in this function.
 *
 * @param {Object} para
 * @param {string} para.path_name     Env file name; selects how the release time is read
 *                                    ("csdn.env" or "segmentfault.env").
 * @param {string} para.start_time    Start of the accepted window (anything `new Date` accepts).
 * @param {string|number} para.time_interval Length of the window in milliseconds.
 * @param {string} para.titles        jQuery selector for the title elements.
 * @param {string} para.dates         jQuery selector for the date elements (paired with titles by index).
 * @param {string[]} para.keywords    Patterns passed to `new RegExp`; note that regex
 *                                    metacharacters apply (the "." in ".net" matches any character).
 * @returns {Promise<{title: string, link: string, date: string}[]>} the matching questions.
 */
async function get_news(para) {
  // `await` keeps accepting a promise for the settings as well as a plain object.
  const { path_name, start_time, time_interval, titles, dates, keywords } = await para;

  // Environment variables arrive as strings; compare as a number.
  const max_age_ms = Number(time_interval);
  // Compile each keyword once instead of once per title.
  const keyword_patterns = keywords.map(keyword => new RegExp(keyword));

  // Get release time of issue; undefined when the node is missing or the forum is unknown
  const get_release_time = dom => {
    if (!dom) return undefined;
    if (path_name === "csdn.env") return dom.textContent;
    if (path_name === "segmentfault.env") return new Date(dom.dataset.created * 1000);
    return undefined;
  };

  // Check whether the issue release time is within the valid time interval
  const validate_time = time => {
    const time_diff = new Date(time) - new Date(start_time);
    return time_diff > 0 && time_diff < max_age_ms;
  };

  // Check to see if the keyword is included
  const validate_keyword = title => keyword_patterns.some(pattern => pattern.test(title));

  // Query the date nodes once; titles and dates are matched by position.
  const date_nodes = $(dates).toArray();
  const result = [];
  $(titles).toArray().forEach((title, i) => {
    // 1) Verify that the data is valid in time
    const release_time = get_release_time(date_nodes[i]);
    if (release_time === undefined || !validate_time(release_time)) return;
    // 2) Verify that the data contains the specified keywords
    if (!validate_keyword(title.text)) return;
    result.push({
      title: title.text,
      link: title.href,
      date: release_time.toString()
    });
  });
  return result;
}
複製程式碼
2. csdn.env
LIST_URL=https://bbs.csdn.net/forums/{tag}?page={pageIndex}
TAGS=["CSharp","DotNET"]
KEYWORDS=[".net","C#","c#"]
SELECTOR_TITLES=.forums_topic .forums_title
SELECTOR_DATES=.forums_author em
複製程式碼
3. segmentfault.env
LIST_URL=https://segmentfault.com/questions/unanswered?page={pageIndex}
TAGS=[""]
KEYWORDS=["js","mysql","vue","html","javascript"]
SELECTOR_TITLES=.title a
SELECTOR_DATES=.askDate
複製程式碼
4. package.json
{
  "name": "fetch-question",
  "version": "1.0.0",
  "description": "fetch questions from internet",
  "main": "index.js",
  "dependencies": {
    "cross-env": "^5.2.0",
    "dotenv": "^7.0.0",
    "puppeteer": "^1.13.0"
  },
  "devDependencies": {},
  "scripts": {
    "csdn:list": "cross-env PATH_NAME=csdn.env START_TIME=2019/3/18 TIME_INTERVAL=172800000 node index.js",
    "segmentfault:list": "cross-env PATH_NAME=segmentfault.env START_TIME=2019/3/18 TIME_INTERVAL=172800000 node index.js"
  },
  "author": "linli",
  "license": "ISC"
}
複製程式碼