A solution to the Web Crawler exercise in A Tour of Go

Posted by kooder on 2017-09-23

Following the exercise's hints, I rewrote the Crawl function, but it only ever finds one URL and I can't figure out why. Could anyone help? Thanks.

package main

import (
    "fmt"
    "sync"
)

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    // Attempt: fetch URLs in parallel and don't fetch the same URL twice.
    if depth <= 0 {
        fmt.Printf("depth <= 0 return")
        return
    }
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    crawled.mux.Lock()
    crawled.c[url]++
    crawled.mux.Unlock()
    for _, u := range urls {
        //crawled.mux.Lock()
        if cnt, ok := crawled.c[u]; ok { // reads crawled.c without holding the lock: a data race
            cnt++ // cnt is a copy of the map value; crawled.c itself is never updated
        } else {
            fmt.Println("go ...", u)
            go Crawl(u, depth-1, fetcher)
        }
        //crawled.mux.Unlock()
        //Crawl(u, depth-1, fetcher)
    }
    return
}

func main() {
    Crawl("http://golang.org/", 4, fetcher)
}

// fakeFetcher is a Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "http://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "http://golang.org/pkg/",
            "http://golang.org/cmd/",
        },
    },
    "http://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "http://golang.org/",
            "http://golang.org/cmd/",
            "http://golang.org/pkg/fmt/",
            "http://golang.org/pkg/os/",
        },
    },
    "http://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
    "http://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
}

type crawledUrl struct {
    c   map[string]int
    mux sync.Mutex
}

var crawled = crawledUrl{c: make(map[string]int)}
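
The root cause is that nothing waits for the goroutines. main returns as soon as the outermost Crawl does, and the program exits at that point, killing every `go Crawl(...)` before it gets a chance to fetch anything; that is why only the first URL is printed. Two smaller bugs hide behind that one: `cnt++` increments a local copy of the map value, so crawled.c is never updated, and the loop reads crawled.c[u] without holding the mutex, which is a data race once the goroutines do run.

Below is one way to fix it: a minimal sketch using sync.WaitGroup so main blocks until every goroutine has finished, plus a small helper method (visit, my own addition) that does the duplicate check and the map update while holding the mutex. The fakeFetcher, fakeResult, and populated fetcher definitions are reused from the program above, unchanged.

package main

import (
    "fmt"
    "sync"
)

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

// fakeFetcher, fakeResult, and the populated fetcher variable
// are exactly the same as in the program above.

type crawledUrl struct {
    c   map[string]int
    mux sync.Mutex
}

var crawled = crawledUrl{c: make(map[string]int)}

// visit marks url as seen and reports whether it had already been
// seen; every access to the map happens while the mutex is held.
func (s *crawledUrl) visit(url string) bool {
    s.mux.Lock()
    defer s.mux.Unlock()
    s.c[url]++
    return s.c[url] > 1
}

// Crawl fetches url, then crawls the URLs found on that page in new
// goroutines, down to the given depth. Each call signals wg when done.
func Crawl(url string, depth int, fetcher Fetcher, wg *sync.WaitGroup) {
    defer wg.Done()
    if depth <= 0 {
        return
    }
    if crawled.visit(url) {
        return // already fetched (or being fetched) by another goroutine
    }
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    for _, u := range urls {
        wg.Add(1) // count the goroutine before starting it, so Wait cannot return early
        go Crawl(u, depth-1, fetcher, wg)
    }
}

func main() {
    var wg sync.WaitGroup
    wg.Add(1)
    Crawl("http://golang.org/", 4, fetcher, &wg)
    wg.Wait() // block until every spawned Crawl has called Done
}

Against the fake fetcher this should print the four "found" pages plus one "not found: http://golang.org/cmd/" error, in nondeterministic order. Marking a URL in visit before fetching it (rather than after, as in the original) is what makes the duplicate check safe: two goroutines can no longer both pass the check and fetch the same page.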
