golang-spider-從單任務版爬蟲到併發爬蟲01

weixin_34185560發表於2018-04-05

學習總結:

  • 設計的過程是簡單到複雜的演化過程
  • 每一次的變化都應該是需求決定的
  • 抽象的過程是清晰的概念和邏輯劃分的過程

單任務版爬蟲架構

  • 第一步 將請求放入到engine的佇列中
  • 第二步 engine從佇列取出請求交給fetcher去下載,fetcher將下載結果交給engine
  • 第三步 engine將下載結果交給parser去解析,解析後結果交給engine進行下一步排程
2119458-77a6a0b722a35843.png
spider01.png

package engine

import (
    "hans/learn/spider/crawler/fetcher"
    "log"
)

// Run drives the single-task crawler. It keeps a first-in, first-out queue of
// pending requests, initialised from seeds, and handles them one at a time
// until the queue is empty.
//
// Each request is downloaded with fetcher.Fetch and the body is handed to the
// request's ParserFunc. Requests returned by the parser are appended to the
// queue and every returned item is logged. A request whose fetch fails is
// logged and skipped.
func Run(seeds ...Request) {
	// Appending to a nil slice copies the seeds into a fresh queue.
	queue := append([]Request(nil), seeds...)

	for len(queue) > 0 {
		// Take the oldest pending request off the front of the queue.
		current := queue[0]
		queue = queue[1:]

		log.Printf("Fetching %s", current.Url)
		body, err := fetcher.Fetch(current.Url)
		if err != nil {
			log.Printf("Fetcher: error fetching url %s: %v", current.Url, err)
			continue
		}

		parsed := current.ParserFunc(body)
		queue = append(queue, parsed.Requests...)

		for _, item := range parsed.Items {
			log.Printf("Got item %v", item)
		}
	}
}

改進一

上圖中的parser和fetcher可以合併成一步,叫做worker


2119458-9de5a9ff05252745.png
spider02.png
// Run drives the single-task crawler, delegating the fetch-and-parse step to
// worker. It keeps a first-in, first-out queue of pending requests,
// initialised from seeds, and handles them one at a time until the queue is
// empty.
//
// Requests returned by worker are appended to the queue and every returned
// item is logged. A request for which worker returns an error is skipped.
func Run(seeds ...Request) {
	// Appending to a nil slice copies the seeds into a fresh queue.
	queue := append([]Request(nil), seeds...)

	for len(queue) > 0 {
		// Take the oldest pending request off the front of the queue.
		current := queue[0]
		queue = queue[1:]

		log.Printf("Fetching %s", current.Url)

		parsed, err := worker(current)
		if err != nil {
			// Skip this request and move on to the next one.
			continue
		}

		queue = append(queue, parsed.Requests...)

		for _, item := range parsed.Items {
			log.Printf("Got item %v", item)
		}
	}
}

// worker downloads the page at request.Url with fetcher.Fetch and runs the
// request's ParserFunc on the downloaded body.
//
// On success it returns the parser's result and a nil error. If the fetch
// fails, the error is logged and returned together with an empty ParseResult.
func worker(request Request) (ParseResult, error) {
	body, err := fetcher.Fetch(request.Url)
	if err != nil {
		log.Printf("Fetcher: error fetching url %s: %v", request.Url, err)
		return ParseResult{}, err
	}

	return request.ParserFunc(body), nil
}

改進二 併發版爬蟲

  • 第一步,將request傳遞給engine
  • 第二步,engine將request交給scheduler排程
  • 第三步,scheduler將request放入channel,然後由worker進行解析
    將解析結果再放入channel,scheduler從channel中獲取
    parseResult繼續排程任務

engine做了哪些事情?

  • 根據使用者配置WorkerCount,開啟多個goroutine實現併發
  • 建立了供所有worker使用的channel
  • 迴圈消費channel中的request,有就交給scheduler進行排程

scheduler做了哪些事情?如何履行排程的職責?

  • 不停的把request放入channel中,供worker消費.......
2119458-18522d8b437670ca.png
spider04.png

engine.go

package engine

import "log"

// ConcurrentEngine is the crawler engine that processes requests with several
// worker goroutines at once.
type ConcurrentEngine struct {
	// Scheduler receives every request the engine wants processed.
	Scheduler Scheduler
	// WorkerCount is the number of worker goroutines Run starts.
	WorkerCount int
}

// Scheduler is what ConcurrentEngine relies on to dispatch requests.
type Scheduler interface {
	// Submit hands a request to the scheduler.
	Submit(Request)
	// ConfigMasterWorkerChan gives the scheduler the channel that the engine's
	// workers receive requests from.
	ConfigMasterWorkerChan(chan Request)
}


// Run starts the concurrent crawler and never returns.
//
// It creates the request channel shared by all workers and the channel on
// which workers report results, registers the request channel with the
// scheduler, starts e.WorkerCount workers, and submits every seed to the
// scheduler. It then loops forever: each ParseResult received from the workers
// has its items logged with a running counter, and its follow-up requests are
// submitted back to the scheduler.
func (e *ConcurrentEngine) Run(seeds ...Request) {
	// in carries requests to the workers; out carries their parse results back.
	in := make(chan Request)
	out := make(chan ParseResult)
	e.Scheduler.ConfigMasterWorkerChan(in)

	for i := 0; i < e.WorkerCount; i++ {
		createWorker(in, out)
	}

	// Hand the seed requests to the scheduler.
	for _, r := range seeds {
		e.Scheduler.Submit(r)
	}

	countItem := 0

	for {
		// Receive a fresh result on every iteration. Receiving only once,
		// before the loop, would make the loop reprocess that single result
		// forever and ignore everything else the workers produce.
		result := <-out

		// Log each item together with its running index.
		for _, item := range result.Items {
			log.Printf("Got item #%d  %v \n", countItem, item)
			countItem++
		}

		// Feed newly discovered requests back to the scheduler.
		for _, request := range result.Requests {
			log.Printf("Get Url %s", request.Url)
			e.Scheduler.Submit(request)
		}
	}
}


// createWorker starts one goroutine that loops forever: it receives a Request
// from in, processes it with worker, and sends the resulting ParseResult to
// out. A request for which worker returns an error produces no output.
func createWorker(in chan Request, out chan ParseResult) {
	loop := func() {
		for {
			req := <-in
			// Only successful results are forwarded; failures are dropped.
			if res, err := worker(req); err == nil {
				out <- res
			}
		}
	}

	go loop()
}

scheduler.go

package scheduler

import "hans/learn/spider/crawler/engine"

// SimpleScheduler forwards every submitted request to a single channel.
type SimpleScheduler struct {
	// WorkerChan is the channel Submit sends requests on; it is set by
	// ConfigMasterWorkerChan.
	WorkerChan chan engine.Request
}


// Submit sends r on s.WorkerChan from a newly started goroutine, so the call
// itself returns without waiting for a receiver.
func (s *SimpleScheduler) Submit(r engine.Request) {
	send := func() { s.WorkerChan <- r }
	go send()
}

// ConfigMasterWorkerChan stores c as the channel that Submit sends requests on.
func (s *SimpleScheduler) ConfigMasterWorkerChan(c chan engine.Request) {
	s.WorkerChan = c
}

注: 程式碼示例地址

相關文章