目錄結構
- controller
- spider.js // 封裝的spider方法
- index.js // 專案入口
index.js
var cheerio = require("cheerio");
var server = require("./controller/spider");
// Alternative target: http://v.163.com/special/opencourse/englishs1.html
var url = 'http://zwbk.com/';

// Fetch the page, then print a marker for every <a> element found in it.
server.fetchData(url, function (html) {
  // A falsy result (including an empty body) is reported as a failure.
  if (!data_is_present(html)) {
    console.log("error");
    return;
  }

  var $ = cheerio.load(html);
  var anchors = $("a");
  anchors.each(function () {
    console.log(1111);
  });
  console.log("done");

  // Local truthiness check, hoisted within this callback.
  function data_is_present(value) {
    return Boolean(value);
  }
});
複製程式碼
spider.js
var http = require("http");
/**
 * Fetch the body of `url` with `http.get` and pass it to `callback` as a string.
 *
 * @param {string} url - Address to request.
 * @param {function(?string)} callback - Invoked exactly once: with the complete
 *   response body when the response ends, or with `null` when either the
 *   request or the response stream emits an error.
 */
var fetchData = function (url, callback) {
  var finished = false;

  // The request and the response can each report a failure; only the first
  // outcome is forwarded so the caller never sees two callbacks.
  var finish = function (result) {
    if (finished) {
      return;
    }
    finished = true;
    callback(result);
  };

  http.get(url, function (res) {
    var data = "";

    // Decode at the stream level: converting each Buffer chunk to a string on
    // its own corrupts a multi-byte UTF-8 character split across two chunks.
    res.setEncoding("utf8");

    res.on("data", function (chunk) {
      data += chunk;
    });
    res.on("end", function () {
      finish(data);
    });
    // A connection dropped mid-body surfaces here, not on the request object.
    res.on("error", function () {
      finish(null);
    });
  }).on("error", function () {
    finish(null);
  });
};
exports.fetchData = fetchData;
複製程式碼
執行 `node index.js`,發現只輸出了一個 `done`,猜測可能是該網站做了反爬蟲。在網上隨便找了另一個域名替換後執行,果然有了輸出。為了解決這個問題,我引入了 superagent 來傳送請求。superagent 可以很方便地模擬瀏覽器的一些屬性,例如 Referer、請求頭等……還有個好處是 superagent 可以直接抓取 https 的頁面。
現在將地址替換成 https://github.com/azoth1991 ,已經可以抓到頁面內容了:
// Print each anchor's position in the document together with its raw href
// attribute (undefined when the anchor has none).
var anchors = $("a");
anchors.each(function (index, anchor) {
  var href = anchor.attribs.href;
  console.log(index, href);
});
0 '#start-of-content'
1 'https://github.com/'
2 '/features'
3 '/features/code-review/'
4 '/features/project-management/'
5 '/features/integrations'
6 '/features/actions'
7 '/features#team-management'
8 '/features#social-coding'
9 '/features#documentation'
10 '/features#code-hosting'
11 '/customer-stories'
12 '/security'
13 '/enterprise'
14 '/explore'
...
複製程式碼
倉庫地址 歡迎star ^_^
待續...