內容提取
const puppeteer = require('puppeteer');
/**
 * Content extraction.
 *
 * Usage:
 *   /usr/local/bin/node get_cont.js 'http://www.baidu.com?id=456'
 *
 * Prints one JSON document to stdout:
 *   { "title": <innerHTML of ".detail_xq h2">,
 *     "msg":   { "0": ..., "1": ..., ... } }
 * where "msg" holds the ".detail_xq ul li" entries first, followed by one
 * entry per ".d_ldjj" section, all keyed by a running index.
 *
 * On failure the error is written to stderr and the exit code is set to 1.
 */
(async () => {
  // Read the argument without mutating process.argv.
  const url = process.argv[2];
  if (!url) {
    throw new Error("Usage: node get_cont.js <url>");
  }

  const browser = await puppeteer.launch({
    executablePath: '/usr/bin/google-chrome',
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });

  try {
    const page = await browser.newPage();
    await page.goto(url);

    // Title.
    const name = await page.$eval('.detail_xq h2', (el) => el.innerHTML);

    // Basic info list.
    // NOTE: this callback is evaluated inside the page, so it cannot close
    // over helpers defined in this Node script; `clean` must live inside it.
    const c_list = await page.$$eval('.detail_xq ul li', (items) => {
      const clean = (html) => html.replace(/\n/g, '').replace(/\t/g, '').trim();
      const dd = {};
      items.forEach((li, i) => {
        const kids = li.children;
        if (kids.length === 0) {
          // No element children: fall back to the item's own markup instead
          // of throwing on children[0], keeping the index sequence intact.
          dd[i] = li.innerHTML;
        } else if (kids.length < 2) {
          // Single child is emitted as-is (not whitespace-cleaned).
          dd[i] = kids[0].innerHTML;
        } else {
          dd[i] = clean(kids[0].innerHTML) + clean(kids[1].innerHTML);
        }
      });
      return dd;
    });

    // Introduction sections: "<h2>:" followed by '#XX#'-prefixed paragraphs.
    const c_descript = await page.$$eval('.d_ldjj', (sections) => {
      const clean = (html) => html.replace(/\n/g, '').replace(/\t/g, '').trim();
      const dd = {};
      sections.forEach((section, i) => {
        const heading = section.querySelector("h2");
        // A section without a heading yields an empty heading, not a crash.
        let text = (heading ? heading.innerHTML : '') + ":";
        // Query the paragraphs once instead of on every loop iteration.
        const paragraphs = section.querySelectorAll("p");
        for (let j = 0; j < paragraphs.length; j++) {
          text += '#XX#' + clean(paragraphs[j].innerHTML);
        }
        dd[i] = text;
      });
      return dd;
    });

    // Append the description entries after the list entries, continuing the
    // numeric key sequence.
    let next = Object.keys(c_list).length;
    for (const key of Object.keys(c_descript)) {
      c_list[next] = c_descript[key];
      next += 1;
    }

    const data = {};
    data['title'] = name;
    data['msg'] = c_list;
    console.log(JSON.stringify(data));
  } finally {
    // Always shut Chrome down, even when navigation or extraction fails,
    // and wait for it so the process exits cleanly.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
列表提取
const puppeteer = require('puppeteer');
/**
 * List extraction.
 *
 * Usage:
 *   /usr/local/bin/node test.js 'url'
 *
 * Prints one JSON document to stdout:
 *   { "date": { "0": "YYYY-MM-DD", ... },
 *     "cnv":  { "0": <innerHTML of the link>, ... } }
 * "date" comes from ".list_list ul li .fr" and "cnv" from
 * ".list_list ul li p a"; both are keyed by element index.
 *
 * On failure the error is written to stderr and the exit code is set to 1.
 */
(async () => {
  // Read the argument without mutating process.argv.
  const url = process.argv[2];
  if (!url) {
    throw new Error("Usage: node test.js <url>");
  }

  const browser = await puppeteer.launch({
    executablePath: '/usr/bin/google-chrome',
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });

  try {
    const page = await browser.newPage();
    await page.goto(url);

    // Dates: first YYYY-MM-DD occurrence in each ".fr" element.
    const date_list = await page.$$eval('.list_list ul li .fr', (cells) => {
      const reg = /[0-9]{4}\-[0-9]{2}\-[0-9]{2}/;
      const dd = {};
      cells.forEach((cell, i) => {
        const found = reg.exec(cell.innerHTML);
        // A cell without a date yields '' instead of throwing on null[0],
        // so the indexes stay aligned with the "cnv" entries.
        dd[i] = found ? found[0] : '';
      });
      return dd;
    });

    // Link contents.
    const date_cnv = await page.$$eval('.list_list ul li p a', (links) => {
      const dd = {};
      links.forEach((link, i) => {
        dd[i] = link.innerHTML;
      });
      return dd;
    });

    const data = {};
    data['date'] = date_list;
    data['cnv'] = date_cnv;
    console.log(JSON.stringify(data));
  } finally {
    // Always shut Chrome down, even when navigation or extraction fails,
    // and wait for it so the process exits cleanly.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
本作品採用《CC 協議》,轉載必須註明作者和本文連結