colly 在 Golang 中的地位,好比 scrapy 在 Python 中的地位,都是爬蟲界的大佬。本文用 colly 抓取博文資訊:從收集器例項配置、用 goquery 選擇器抓取 DOM 節點資料、自動分頁訪問,到 CSV 資料持久化與 JSON 控制檯輸出,全程簡單直觀。
Code
抓取資料入口為社群某使用者部落格列表頁,比如 https://learnku.com/blog/pardon
package main
import (
"encoding/csv"
"encoding/json"
"log"
"os"
"regexp"
"strconv"
"strings"
"github.com/gocolly/colly"
)
// Article is one blog post scraped from a user's blog list page.
//
// Every field except ID is kept as the raw text taken from the page. All
// fields are omitted from the JSON output when they hold their zero value.
type Article struct {
	// ID is the first run of digits found in URL.
	ID int `json:"id,omitempty"`
	// Title is the text of the entry's summary element.
	Title string `json:"title,omitempty"`
	// URL is the href of the entry's title link.
	URL string `json:"url,omitempty"`
	// Created is read from the date link's data-tooltip attribute.
	Created string `json:"created,omitempty"`
	// Reads is the second whitespace-separated token of the reads link text.
	Reads string `json:"reads,omitempty"`
	// Comments is the second whitespace-separated token of the comments link text.
	Comments string `json:"comments,omitempty"`
	// Feeds is the text of the first link inside the entry's item-meta element.
	Feeds string `json:"feeds,omitempty"`
}
// csvSave persists data as CSV in the file named fName, creating the file or
// truncating an existing one. The first row is a header; each following row
// is one Article in the same column order.
//
// It returns the first error hit while creating, writing, flushing or closing
// the file, and nil when everything was written. The process is never
// terminated here, so callers must check the returned error.
func csvSave(fName string, data []Article) (err error) {
	file, err := os.Create(fName)
	if err != nil {
		return err
	}
	// A failed Close can mean buffered data never reached the disk, so report
	// it unless an earlier error is already being returned.
	defer func() {
		if cerr := file.Close(); cerr != nil && err == nil {
			err = cerr
		}
	}()

	writer := csv.NewWriter(file)
	header := []string{"ID", "Title", "URL", "Created", "Reads", "Comments", "Feeds"}
	if err := writer.Write(header); err != nil {
		return err
	}
	for _, v := range data {
		row := []string{strconv.Itoa(v.ID), v.Title, v.URL, v.Created, v.Reads, v.Comments, v.Feeds}
		if err := writer.Write(row); err != nil {
			return err
		}
	}

	// csv.Writer buffers its output; Flush itself returns nothing, so the
	// outcome of the underlying writes is only visible through Error.
	writer.Flush()
	return writer.Error()
}
// secondField returns the second whitespace-separated field of s, or "" when
// s holds fewer than two fields.
func secondField(s string) string {
	fields := strings.Fields(s)
	if len(fields) < 2 {
		return ""
	}
	return fields[1]
}

// main scrapes the blog list at https://learnku.com/blog/pardon, following
// the pagination links, then writes the collected articles to pardon.csv and
// prints them to stdout as indented JSON.
func main() {
	articles := make([]Article, 0, 200)
	// Compiled once here rather than inside the per-article callback.
	idPattern := regexp.MustCompile(`\d+`)

	// 1. Prepare the collector instance.
	c := colly.NewCollector(
		// Enable local debugging:
		// colly.Debugger(&debug.LogDebugger{}),
		colly.AllowedDomains("learnku.com"),
		// Avoid downloading the same page twice:
		// colly.CacheDir("./learnku_cache"),
	)

	// 2. Extract one Article from every entry of the blog list.
	c.OnHTML("div.blog-article-list > .event", func(e *colly.HTMLElement) {
		article := Article{
			Title: e.ChildText("div.content > div.summary"),
			URL:   e.ChildAttr("div.content a.title", "href"),
			Feeds: e.ChildText("div.item-meta > a:first-child"),
		}
		// The matched links form one collection; each field is picked by
		// its position in that collection.
		e.ForEach("div.content > div.meta > div.date>a", func(i int, el *colly.HTMLElement) {
			switch i {
			case 1:
				article.Created = el.Attr("data-tooltip")
			case 2:
				// The value is the second whitespace-separated token; it is
				// left empty when the link text has no such token.
				article.Reads = secondField(el.Text)
			case 3:
				article.Comments = secondField(el.Text)
			}
		})
		// The ID is the first run of digits in the URL. An entry without
		// one keeps ID 0 and is still recorded.
		if digits := idPattern.FindString(article.URL); digits == "" {
			log.Printf("no numeric ID in article URL %q", article.URL)
		} else if id, err := strconv.Atoi(digits); err != nil {
			log.Printf("parsing article ID from %q: %s", article.URL, err)
		} else {
			article.ID = id
		}
		articles = append(articles, article)
	})

	// Follow pagination links.
	c.OnHTML("a[href].page-link", func(e *colly.HTMLElement) {
		// Pagination links repeat on every page and colly reports a revisit
		// as an error, so errors here are expected and deliberately dropped.
		_ = e.Request.Visit(e.Attr("href"))
	})

	// Start scraping. Stop on failure so no output is written when the
	// entry page could not be fetched.
	if err := c.Visit("https://learnku.com/blog/pardon"); err != nil {
		log.Fatalf("Cannot visit start page: %s\n", err)
	}

	// Output.
	if err := csvSave("pardon.csv", articles); err != nil {
		log.Fatalf("Cannot save %q: %s\n", "pardon.csv", err)
	}
	enc := json.NewEncoder(os.Stdout)
	enc.SetIndent("", " ")
	if err := enc.Encode(articles); err != nil {
		log.Fatalf("Cannot encode articles as JSON: %s\n", err)
	}

	// Print the collector's own statistics.
	log.Println(c)
}
Output
控制檯輸出
....
{
"id": 30604,
"title": "教程: TodoMVC 與 director 路由",
"url": "https://learnku.com/articles/30604",
"created": "2019-07-01 12:42:01",
"reads": "650",
"comments": "0",
"feeds": "0"
},
{
"id": 30579,
"title": "flaskr 進階筆記",
"url": "https://learnku.com/articles/30579",
"created": "2019-06-30 19:01:04",
"reads": "895",
"comments": "0",
"feeds": "0"
},
{
"id": 30542,
"title": "教程 Redis+ flask+vue 線上聊天",
"url": "https://learnku.com/articles/30542",
"created": "2019-06-29 12:19:45",
"reads": "2760",
"comments": "1",
"feeds": "2"
}
]
2019/12/20 15:50:14 Requests made: 5 (5 responses) | Callbacks: OnRequest: 0, OnHTML: 2, OnResponse: 0, OnError: 0
csv 文字輸出
ID,Title,URL,Created,Reads,Comments,Feeds
37991,ferret 爬取動態網頁,https://learnku.com/articles/37991,2019-12-15 10:43:03,219,0,3
37803,匿名類 與 索引重建,https://learnku.com/articles/37803,2019-12-09 19:35:09,323,1,0
37476,大話併發,https://learnku.com/articles/37476,2019-12-08 21:17:55,612,0,4
37738,三元運算子,https://learnku.com/articles/37738,2019-12-08 09:44:36,606,0,0
37719,筆試之 模板變數替換,https://learnku.com/articles/37719,2019-12-07 18:30:42,843,0,0
37707,筆試之 連續數增維,https://learnku.com/articles/37707,2019-12-07 13:50:17,872,0,0
37616,筆試之 一行程式碼求重,https://learnku.com/articles/37616,2019-12-05 12:10:24,792,0,0
....
Colly
- 簡潔API
- 快速(單個核心上> 1k請求/秒)
- 管理請求延遲和每個域的最大併發
- 自動cookie和會話處理
- 同步/非同步/並行抓取
- 分散式爬蟲
- 自動編碼非unicode響應
- 支援 Robots.txt
- 支援 Google App Engine