puppeteer 頁面爬取實例(元素遍歷)

Vckin發表於2018-12-07

內容提取

const puppeteer = require('puppeteer');
// Target page URL: the first command-line argument after the script path.
const url = process.argv[2];
/**
 * Extract the detail content of a page.
 *   /usr/local/bin/node get_cont.js 'http://www.baidu.com?id=456'
 *
 * Prints a single JSON object to stdout:
 *   {"title": <innerHTML of '.detail_xq h2'>, "msg": {"0": ..., "1": ...}}
 * `msg` holds the basic-info list items followed by the description
 * sections, keyed by consecutive index.
 *
 * On any failure the error is written to stderr and the process exit code
 * is set to 1; the browser is always closed.
 */
(async () => {
    if (!url) {
        throw new Error("missing URL argument; usage: node get_cont.js '<url>'");
    }
    const browser = await puppeteer.launch({
        executablePath: '/usr/bin/google-chrome',
        headless: true,
        args: ['--no-sandbox', '--disable-setuid-sandbox']
    });
    try {
        const page = await browser.newPage();
        await page.goto(url);
        // Title
        const name = await page.$eval('.detail_xq h2', (heading) => heading.innerHTML);
        // Basic info list. The callback runs inside the page, so it must be
        // self-contained and cannot reference helpers from this file.
        const basicItems = await page.$$eval('.detail_xq ul li', (items) => {
            const clean = (html) => html.replace(/[\n\t]/g, '').trim();
            return Array.from(items, (item) => {
                const children = item.children;
                if (children.length === 0) {
                    // Item with no child elements: use its own markup
                    // instead of crashing on children[0].
                    return item.innerHTML;
                }
                if (children.length < 2) {
                    return children[0].innerHTML;
                }
                // Only the first two child elements are used.
                return clean(children[0].innerHTML) + clean(children[1].innerHTML);
            });
        });
        // Description sections: "<h2 html>:" followed by each paragraph
        // prefixed with the '#XX#' separator.
        const descriptions = await page.$$eval('.d_ldjj', (sections) => {
            const clean = (html) => html.replace(/[\n\t]/g, '').trim();
            return Array.from(sections, (section) => {
                const heading = section.querySelector('h2');
                // A section without a heading yields an empty title.
                const title = heading ? heading.innerHTML : '';
                const paragraphs = Array.from(
                    section.querySelectorAll('p'),
                    (paragraph) => '#XX#' + clean(paragraph.innerHTML)
                );
                return title + ':' + paragraphs.join('');
            });
        });
        // Keep the index-keyed object shape ({"0": ..., "1": ...}) in the output.
        const msg = Object.assign({}, basicItems.concat(descriptions));
        console.log(JSON.stringify({ title: name, msg: msg }));
    } finally {
        // Always shut Chrome down, even when navigation or extraction fails.
        await browser.close();
    }
})().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});

列表提取

const puppeteer = require('puppeteer');
// Target page URL: the first command-line argument after the script path.
const url = process.argv[2];
/**
 * Extract the entries of a list page.
 *   /usr/local/bin/node test.js 'url'
 *
 * Prints a single JSON object to stdout:
 *   {"date": {"0": "YYYY-MM-DD", ...}, "cnv": {"0": <link innerHTML>, ...}}
 * Both objects are keyed by the position of the matched element.
 *
 * On any failure the error is written to stderr and the process exit code
 * is set to 1; the browser is always closed.
 */
(async () => {
    if (!url) {
        throw new Error("missing URL argument; usage: node test.js '<url>'");
    }
    const browser = await puppeteer.launch({
        executablePath: '/usr/bin/google-chrome',
        headless: true,
        args: ['--no-sandbox', '--disable-setuid-sandbox']
    });
    try {
        const page = await browser.newPage();
        await page.goto(url);
        // Dates: first YYYY-MM-DD substring found in each '.fr' element.
        const dates = await page.$$eval('.list_list ul li .fr', (cells) => {
            const datePattern = /[0-9]{4}-[0-9]{2}-[0-9]{2}/;
            return Array.from(cells, (cell) => {
                const match = datePattern.exec(cell.innerHTML);
                // A cell without a date yields '' so the remaining entries
                // keep their positions instead of the whole run crashing.
                return match ? match[0] : '';
            });
        });
        // Link contents of each list entry.
        const links = await page.$$eval('.list_list ul li p a', (anchors) =>
            Array.from(anchors, (anchor) => anchor.innerHTML)
        );
        // Keep the index-keyed object shape ({"0": ..., "1": ...}) in the output.
        const data = {
            date: Object.assign({}, dates),
            cnv: Object.assign({}, links)
        };
        console.log(JSON.stringify(data));
    } finally {
        // Always shut Chrome down, even when navigation or extraction fails.
        await browser.close();
    }
})().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});
本作品採用《CC 協議》,轉載必須註明作者和本文連結

相關文章