基於nodejs網站爬蟲程式開發

yesye發表於2021-09-09
webside_parser_node

基於nodejs網站爬蟲程式

1. 安裝依賴
yarn   # 或使用 npm install
2. 檔案目錄結構
+-- src
|   HtmlDownloader  //網頁下載器
|   HtmlParser      //網頁解析器
|   Outputer        //內容輸出
|   UrlManager      //url管理
|   main            //主入口和排程
3. 主要實現
//main.js
/**
 * Crawl scheduler: pulls URLs from the UrlManager one at a time, downloads
 * and parses each page, feeds newly found links back into the queue and
 * hands the extracted content to the Outputer.
 */
class ParserScheduler {
  constructor() {
    // Number of follow-up pages scheduled after the root page.
    this.parseCount = 0;
    this.urls = new UrlManager();
    this.downloader = new HtmlDownloader();
    this.outputer = new Outputer();
    this.parser = new HtmlParser();
  }

  /**
   * Processes the next queued URL and then recurses until the queue is empty.
   *
   * @returns {Promise<void>} Settles once the WHOLE crawl has finished (the
   *   recursive step is chained), and rejects if any download or parse step
   *   fails, so callers can attach a single `.catch()`.
   */
  parse() {
    const {urls, downloader, parser, outputer} = this;
    const newUrl = urls.getNewUrl();
    console.log('new url:' + newUrl);
    return downloader
      .download(newUrl)
      .then(html => {
        const [newUrls, content] = parser.parse(html);
        urls.addNewUrls(newUrls);
        outputer.collectData(content);
        if (urls.hasNewUrl()) {
          this.parseCount++;
          // Return the nested promise so completion and errors propagate
          // to the caller instead of becoming an unhandled rejection.
          return this.parse();
        }
        console.log('complete!');
        outputer.output();
        return undefined;
      });
  }

  /**
   * Seeds the queue with `rootUrl` and starts crawling.
   *
   * @param {number} [count] Currently unused; kept for interface compatibility.
   * @returns {Promise<void>} The promise returned by `parse()`.
   */
  start(count) {
    this.urls.addNewUrl(rootUrl);
    return this.parse();
  }
}

//parser
/**
 * Extracts follow-up links and course banner images from a downloaded page.
 */
class HtmlParser {
  /**
   * @param {string} html Raw HTML of the page.
   * @returns {[string[], {src: string, alt: string}[]]} A pair of
   *   `[links, images]`: hrefs of `.text-page-tag` elements that contain
   *   `/course/list`, and `{src, alt}` records for `.course-banner` elements
   *   that have a `src` attribute.
   */
  parse(html) {
    const aLinks = [];
    const images = [];
    const $ = cheerio.load(html);

    $('.text-page-tag').each((index, item) => {
      const href = item.attribs.href;
      // Elements without an href attribute would otherwise throw on indexOf.
      if (typeof href === 'string' && href.indexOf('/course/list') >= 0) {
        aLinks.push(href);
      }
    });

    $('.course-banner').each((index, item) => {
      const src = item.attribs.src;
      if (!src) {
        return;
      }
      // The caption is the course name found inside the enclosing card.
      const alt = $(item)
        .closest('.course-card-container')
        .find('.course-card-name')
        .text();
      images.push({src, alt});
    });

    return [aLinks, images];
  }
}

//downloader
/**
 * Thin promise wrapper around the callback-style `request` call.
 */
class HtmlDownloader {
  /**
   * Fetches `url` with a fixed User-Agent header.
   *
   * @param {string} url Address to download.
   * @returns {Promise<*>} Resolves with the `body` passed to the request
   *   callback; rejects with the callback's `error` when one is reported.
   */
  download(url) {
    const options = {
      headers: {
        'User-Agent': 'Mozilla/5.0',
      },
    };
    return new Promise((resolve, reject) => {
      request(url, options, (error, response, body) => {
        if (error) {
          // Stop here: do not fall through and call resolve after reject.
          reject(error);
          return;
        }
        resolve(body);
      });
    });
  }
}

//urlmanager
/**
 * Keeps the crawl frontier: `newUrls` is the FIFO queue of URLs still to
 * visit, `oldUrls` the list of URLs already handed out.
 */
class UrlManager {
  constructor() {
    this.newUrls = [];
    this.oldUrls = [];
  }

  /** @returns {boolean} True while at least one URL is waiting in the queue. */
  hasNewUrl() {
    return this.newUrls.length !== 0;
  }

  /**
   * Removes and returns the oldest queued URL, marking it as visited.
   *
   * @returns {string|undefined} The next URL, or `undefined` when the queue
   *   is empty (in which case nothing is recorded as visited).
   */
  getNewUrl() {
    if (this.newUrls.length === 0) {
      return undefined;
    }
    const url = this.newUrls.shift();
    if (!this.oldUrls.includes(url)) {
      this.oldUrls.push(url);
    }
    return url;
  }

  /**
   * Queues `url` unless it is already queued or already visited.
   * Non-string and empty values are ignored instead of throwing.
   * URLs are stored exactly as given; no prefix is added to relative ones.
   *
   * @param {string} url
   */
  addNewUrl(url) {
    if (typeof url !== 'string' || url === '') {
      return;
    }
    const {newUrls, oldUrls} = this;
    if (!newUrls.includes(url) && !oldUrls.includes(url)) {
      newUrls.push(url);
    }
  }

  /**
   * Queues every entry of `urls`; anything that is not an array is ignored.
   *
   * @param {string[]} urls
   */
  addNewUrls(urls) {
    if (Array.isArray(urls)) {
      urls.forEach(url => {
        this.addNewUrl(url);
      });
    }
  }
}
//outputer
class Outputer {
  // Starts with an empty buffer; collectData() appends records to it.
  constructor() {
    this.data = [];
  }
  _getImage(url, filename) {
    console.log('寫入圖片檔案:'+filename);
    url = url.indexOf('http:')>=0?url:('http:'+url);
    let bufferArray = [];
    const opts = {
      headers: {
        'User-Agent': 'Mozilla/5.0',
      },
    };
    request(url).pipe(fs.createWriteStream('D:parser_pics\'+encodeURIComponent(filename)+'.jpg'))
  }
  collectData(datas) {
    if (datas && Array.isArray(datas)) {
      datas.forEach(data => {
        this
          .data
          .push(data);
      });
    }
  }
  output() {
    // console.log('output');
    const {data} = this;
    for (let i = 0, len = data.length; i 
4. 執行
node main.js
5. 說明

目前支援的nodejs版本為node 10.0.0

來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/2041/viewspace-2809152/,如需轉載,請註明出處,否則將追究法律責任。

相關文章