PHP蜘蛛爬蟲開發文件

kakasaber發表於2021-01-12

原文網址 : https://learnku.com/articles/53464

PHP爬蟲

PHP蜘蛛爬蟲開發文件

官方文件

https://doc.phpspider.org/

GitHub 地址

https://github.com/owner888/phpspider

phpspider-master/core 檔案介紹

檔名	描述
init.php	公共入口檔案
constants.php	公共常量定義檔案
phpspider.php	核心類檔案
-	-
configs詳解	-
requests.php	請求類檔案
selector.php	選擇器類檔案
db.php	資料庫類檔案
cache.php	快取類檔案
log.php	日誌類檔案
queue.php	Redis操作類檔案
util.php	實用函式集合類檔案
worker.php	多進程操作類檔案

載入核心檔案

require './vendor/autoload.php';
use phpspider\core\phpspider;

requests.php 請求類詳解

成員	描述
input_encoding	輸入編碼：明確指定輸入的頁面編碼格式（UTF-8、GB2312 等），防止出現亂碼；如果設定為 null 則自動識別
output_encoding	輸出編碼：明確指定輸出的編碼格式（UTF-8、GB2312 等），防止出現亂碼；如果設定為 null 則為 utf-8
encoding	獲取網頁編碼
content	獲取響應內容 - 轉碼前內容
text	獲取響應內容 - 轉碼後內容
status_code	網頁狀態碼
headers	獲取響應頭
request	獲取請求頭

# 載入核心檔案
require './vendor/autoload.php';
use phpspider\core\phpspider;
# 載入請求類檔案
use phpspider\core\requests;
# 設定輸入編碼
requests::$input_encoding = null; // null=自動識別
# 設定輸出編碼
requests::$output_encoding = null; // null=utf-8
# 獲取網頁編碼
requests::$encoding;
# 獲取響應內容 - 轉碼前內容
requests::$content;
# 獲取響應內容 - 轉碼後內容
requests::$text;
# 獲取網頁狀態碼
requests::$status_code;
# 獲取響應頭
requests::$headers;
# 獲取請求頭
requests::$request;

方法	描述
set_timeout( $timeout )	設定請求超時時間
set_proxy( $proxy )	設定請求代理
set_useragent( $useragent )	瀏覽器useragent(UA)
set_referer( $referer )	瀏覽器請求來路URL
set_header( $key, $value )	新增請求的Header
set_cookie( $key, $value, $domain = '' )	新增請求的Cookie
get_cookie( $name, $domain = '' )	獲取請求的Cookie
set_cookies( $cookies, $domain = '' )	設定請求的Cookies（以字串一次設定多個）
get_cookies( $domain = '' )	獲取請求的所有Cookies
set_client_ip( $ip )	設定請求偽IP
set_hosts( $host, $ips )	設定請求的第三方主機和IP
get( $url, $params, $allow_redirects, $cert )	用來獲取某個網頁
post( $url, $params, $files, $allow_redirects, $cert )	用來以 POST 方式向某個網頁提交資料（可附帶檔案上傳）
put( $url, $params, $allow_redirects, $cert )	用來以 PUT 方式向某個網頁提交資料
delete( $url, $params, $allow_redirects, $cert )	用來以 DELETE 方式請求某個網頁

# 設定請求超時時間
# 1. 單一值 (同時設定connect和read)
requests::set_timeout(10);
# 2. 陣列值 (設定connect和read二者的timeout)
requests::set_timeout( array(3, 27) );
# 設定請求代理
// 1. 字串
requests::set_proxy('http://user:pass@host:port');
// 2. 陣列
requests::set_proxy(
array(
'http://user:pass@host:port',
'http://user:pass@host:port'
)
);
# 設定 UA頭
// 1. 字串
requests::set_useragent("Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/");
// 2. 陣列
requests::set_useragent(
array(
"Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/",
"Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/"
)
);
# 設定請求來路URL
requests::set_referer('https://www.baidu.com');
# 設定請求的Header
requests::set_header("Referer", "http://www.baidu.com");
# 新增請求的Cookie
requests::set_cookie("BAIDUID", "FEE96299191CB0F11954F3A0060FB470:FG=1", "http://www.baidu.com");
requests::set_cookies("BAIDUID=FEE96299191CB0F11954F3A0060FB470:FG=1", "http://www.baidu.com");
# 獲取請求的Cookie
requests::get_cookie("BAIDUID", "http://www.baidu.com");
requests::get_cookies("http://www.baidu.com");
# 設定請求偽IP
// 1. 單一值
requests::set_client_ip("192.168.0.2");
// 2. 陣列
requests::set_client_ip(
array(
"192.168.0.1",
"192.168.0.2"
)
);
# 設定請求的第三方主機和IP
requests::set_hosts(
"http://www.baidu.com",
array(
"203.195.143.21",
"203.195.143.22"
)
);
# 發起 get 請求
requests::get("https://github.com/timeline.json");
# 發起 post 請求
// 1. 登入
requests::post(
"http://www.domain.com",
array(
"username" => "test", "password" => "test"
)
);
// 2. 檔案上傳
requests::post(
"http://www.domain.com",
null,
array(
"file1" => "test1.jpg",
"file2" => "test2.jpg"
)
);
# 發起 put 請求
requests::put(
"http://www.domain.com",
"{username:\"test888\",password:\"123456\"}"
);
# 發起 delete 請求
requests::delete(
"http://www.domain.com",
"{username:\"test888\"}"
);

selector.php 選擇器類詳解

方法	描述
select( $html, $selector, $selector_type = 'xpath' )	選擇匹配的內容
remove( $html, $selector, $selector_type = 'xpath' )	刪除匹配的內容

/**
* select( $html, $selector, $selector_type = 'xpath' )
* @param $html 需要篩選的網頁內容
* @param $selector 選擇器規則
* @param $selector_type 選擇器型別: xpath (預設) / regex / css
*/
# 1. xpath
$html = requests::get("https://news.163.com/20/0831/15/FLCBLJOT000189FH.html");
$data = selector::select($html, '//*[@id="endText"]'); // 讀取網易新聞新聞內容
var_dump($data);
# 2. css
$html = requests::get("https://news.163.com/20/0831/15/FLCBLJOT000189FH.html");
$data = selector::select($html, ".post_content_main > h1", "css"); // 讀取網易新聞詳情頁標題
var_dump($data);
# 3. regex
$html = requests::get("https://news.163.com/20/0831/15/FLCBLJOT000189FH.html");
$data = selector::select($html, "@<title>(.*?)</title>@", "regex"); // 讀取網易新聞 title標題內容
var_dump($data);
/**
* remove( $html, $selector, $selector_type = 'xpath' )
* @param $html 需要篩選的網頁內容
* @param $selector 選擇器規則
* @param $selector_type 選擇器型別: xpath (預設) / regex / css
*/
$html = requests::get("https://news.163.com/20/0831/15/FLCBLJOT000189FH.html");
$html = selector::select($html, '//*[@id="endText"]'); // 讀取網易新聞新聞內容
// 在上面獲取的內容基礎上，刪除第一個<p>標籤(原標題)
$data = selector::remove($html, '//*[@id="endText"]/p[1]');
var_dump($data);

db.php 資料庫類詳解

# 資料配置連結
$db_config = array(
'host' => '127.0.0.1',
'port' => 3306,
'user' => 'root',
'pass' => '123456',
'name' => 'demo_db'
);
// 資料庫配置
db::set_connect('default', $db_config);
// 資料庫連線
db::init_mysql();

方法	描述
query($sql)	原生SQL操作
get_one($sql)	單條查詢
get_all($sql)	多條查詢
insert($table, $data)	單條插入
insert_batch($table, $data)	批量插入
update_batch($table, $data, $index)	批量修改
delete($table, $where)	單條刪除

# query 原生操作
// 1. 查詢
$query = db::query("select * from `content`");
while($row = db::fetch($query)) {
echo "id = {$row['id']}; name = {$row['name']}; \n";
}
// 2. 新增
db::query("insert into `content` (`name`) values ('test');");
// 3. 更新
db::query("update `content` set `name`='test' where `id`=1;");
// 4. 刪除
db::query("delete from `content` where `id`=1;");
# get_one
$row = db::get_one("select * from `content` where `id`=1;");
# get_all
$rows = db::get_all("select * from `content` limit 5;");
# insert
$rows = db::insert('content', array('name' => 'test'));
# insert_batch
$rows = db::insert_batch(
'content',
array(
array(
'name' => 'test1'
),
array(
'name' => 'test2'
)
)
);
# update_batch
db::update_batch(
'content',
array(
array(
'id' => 1,
'name' => 'test1'
),
array(
'id' => 2,
'name' => 'test2'
)
),
'id' // 以 id 為條件進行修改
);
# delete
$rows = db::delete('content', "id=1");

使用 configs 來編寫爬蟲

# 載入核心檔案
require './vendor/autoload.php';
use phpspider\core\phpspider;
# 官方文件說不要刪除這段註釋，我並不知道有什麼用，文件說加就加
/* Do NOT delete this comment */
/* 不要刪除這段註釋 */
$configs = array(
'name' => '163新聞', // 當前爬蟲名稱
'log_show' => false, // 是否顯示日誌, 預設false, 可選 true (顯示除錯資訊) | false (顯示爬取面板, tail -f data/phpspider.log 檢視日誌)
'log_file' => 'data/phpspider.log', // 日誌檔案路徑, 預設 data/phpspider.log
'log_type' => '', // 顯示和記錄的日誌型別, 預設空, 可選 info(普通) | warn(警告) | debug(除錯) | error(錯誤 )
'input_encoding' => null, // 輸入編碼, 預設null(自動識別)
'output_encoding' => null, // 輸出編碼, 預設null(null=utf-8)
'tasknum' => 1, // 同時工作的爬蟲任務數, 預設1(單進程任務爬取)
'multiserver' => false, // 多伺服器處理, 預設false, 可選 true | false
'serverid' => 1, // 伺服器ID, 預設1, 啟用第二臺伺服器可設定為2
'save_running_state' => false, // 儲存爬蟲執行狀態, 預設false(不儲存), 可選 true | false
'queue_config' => array( // redis 配置, 儲存爬蟲執行狀態、多工處理和多伺服器處理都需要 redis 來儲存採集任務資料
'host' => '127.0.0.1',
'port' => 6379,
'pass' => '',
'db' => 5,
'prefix' => 'phpspider',
'timeout' => 30
),
'proxy' => array( // 代理伺服器，如果爬取的網站根據ip做了反爬蟲，可以設定此項
'http://host:port',
'http://user:pass@host:port',
),
'interval' => 1000, // 爬取單個網頁的時間間隔, 單位毫秒
'timeout' => 5, // 爬取每個網頁的超時時間, 單位秒
'max_try' => 0, // 爬取每個網頁失敗後嘗試次數, 預設0(不重複爬取)
'max_depth' => 0, // 爬取網頁深度, 超過深度的頁面不再採集, 預設0(不限制)
'max_fields' => 0, // 爬取內容網頁最大條數, 預設0(不限制)
'user_agent' => "", // 爬取網頁所使用的瀏覽器型別
// 1. 列舉型別
// phpspider::AGENT_ANDROID, 表示爬蟲爬取網頁時, 使用安卓手機瀏覽器
// phpspider::AGENT_IOS, 表示爬蟲爬取網頁時, 使用蘋果手機瀏覽器
// phpspider::AGENT_PC, 表示爬蟲爬取網頁時, 使用PC瀏覽器
// phpspider::AGENT_MOBILE, 表示爬蟲爬取網頁時, 使用移動裝置瀏覽器
// 2. 自定義型別
// 'user_agent' => "Mozilla/5.0"
// 3. 隨機瀏覽器型別
// 'user_agent' => array(
// "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
// "Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_3 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13G34 Safari/601.1",
// "Mozilla/5.0 (Linux; U; Android 6.0.1;zh_cn; Le X820 Build/FEXCNFN5801507014S) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/49.0.0.0 Mobile Safari/537.36 EUI Browser/5.8.015S",
// );
'client_ip' => "", // 爬取網頁所使用的偽IP，用於破解防採集
// 1. 字串型別
// 'client_ip' => '192.168.0.2'
// 2. 陣列型別
// 'client_ip' => array(
// '192.160.0.1',
// '192.160.0.2',
// );
'export' => array( // 爬取資料資料匯出
'type' => 'csv', // 匯出型別 csv | sql | db
'file' => './data/163_news.csv', // 匯出檔案路徑
// 'type' => 'sql'
// 'file' => './data/163_news.sql',
// 'table' => 'news_table', // 匯出db、sql資料庫表名
// 'type' => 'db'
// 'table' => 'news_table', // 匯出db、sql資料庫表名
),
'db_config' => array( // 資料庫配置
'host' => '127.0.0.1',
'port' => 3306,
'user' => 'root',
'pass' => 'root',
'name' => 'demo',
),
'domains' => array( // 定義爬蟲爬取哪些域名下的網頁, 非域名下的url會被忽略以提高爬取速度
'163.com',
'news.163.com'
),
'scan_urls' => array( // 定義爬蟲的入口連結, 爬蟲從這些連結開始爬取,同時這些連結也是監控爬蟲所要監控的連結
'https://news.163.com'
),
'content_url_regexes' => array( // 定義內容頁url的規則, 正規表示式最好填寫以提高爬取效率
'https://news\.163\.com/\d+/\d+/\d+/\w+\.html'
),
'list_url_regexes' => array( // 定義列表頁url的規則, 對於有列表頁的網站, 使用此配置可以大幅提高爬蟲的爬取速率
'https://news\.163\.com/gz/page/\d+\.html'
),
'fields' => array( // 定義內容頁的抽取規則, 規則由一個個field組成, 一個field代表一個資料抽取項
array(
'name' => "content", // 名稱, 不能為空
'selector' => '//*[@id="endText"]', // 定義抽取規則, 不能為空, 預設使用xpath
'selector_type' => 'xpath', // 抽取規則型別, 預設xpath, 可選 xpath | jsonpath | regex
'required' => true, // 是否必須的, 預設false, 可選 true | false
'repeated' => false, // 抽取到的內容是否多項, 預設false, 可選 false | true(結果都是陣列型別)
'children' => array( // 為此field定義子項, 子項的定義仍然是一個fields陣列
array(
'name' => 'replay', // # 例如抽取新聞下面的評論
'selector' => "//div[contains(@class,'replay')]"
)
),
'source_type' => 'url_context', // 該field的資料來源, 預設從當前的網頁 (url_context) 中抽取資料, 可選 url_context | attached_url
// 'source_type' => 'attached_url',
// 'attached_url' => 'https://news.163.com/{comment_id}/comments', // 當source_type設定為attached_url時, 定義新請求的url
),
array(
'name' => "title",
'selector' => '//*[@id="epContentLeft"]/h1',
)
)
);
// 載入配置
$spider = new phpspider($configs);
// 啟動爬蟲
$spider->start();

本作品採用《CC 協議》，轉載必須註明作者和本文連結

爬蟲開發技巧
2020-11-14
爬蟲
【閱讀筆記】《Python3網路爬蟲開發實戰》PDF文件
2020-01-14
筆記Python爬蟲
PHP爬蟲初探......先爬Learnku試試看
2020-07-23
PHP爬蟲
python3網路爬蟲開發實戰_Python 3開發網路爬蟲(一)
2020-12-07
Python爬蟲
《Python3網路爬蟲開發實戰》教程||爬蟲教程
2018-11-13
Python爬蟲
Java爬蟲快速開發工具：uncs
2019-01-15
Java爬蟲
網路爬蟲開發常用框架
2019-02-27
爬蟲框架
C#爬蟲開發小結
2023-01-19
C#爬蟲
C#網路爬蟲開發
2023-02-12
C#爬蟲
python爬蟲開發微課版pdf_Python爬蟲開發實戰教程（微課版）
2020-11-21
Python爬蟲
Web網站如何檢視搜尋引擎蜘蛛爬蟲的行為
2019-05-12
Web網站爬蟲
.NET使用分散式網路爬蟲框架DotnetSpider快速開發爬蟲功能
2023-12-08
分散式爬蟲框架IDE
[Python3網路爬蟲開發實戰] 分散式爬蟲原理
2019-12-08
Python爬蟲分散式
GB標準文件爬蟲下載程式
2024-04-11
爬蟲
【Python學習】爬蟲爬蟲爬蟲爬蟲~
2018-05-03
Python爬蟲
不踩坑的Python爬蟲：Python爬蟲開發與專案實戰，從爬蟲入門 Python
2021-12-17
Python爬蟲
python爬蟲實戰教程-Python爬蟲開發實戰教程（微課版）
2020-11-11
Python爬蟲
網路爬蟲專案開發日誌（三）：爬蟲上線準備
2022-02-02
爬蟲
Python開發爬蟲專案+程式碼
2019-04-24
Python爬蟲
爬蟲實戰開發學習（一）
2021-07-06
爬蟲
python3網路爬蟲開發實戰_Python3 爬蟲實戰
2022-01-24
Python爬蟲
用typescript開發爬蟲過程實踐
2019-02-27
TypeScript爬蟲
基於nodejs網站爬蟲程式開發
2021-09-09
NodeJS網站爬蟲
Python 3網路爬蟲開發實戰
2021-04-28
Python爬蟲
三篇文件學會使用casperjs製作爬蟲
2018-08-14
JS爬蟲
我爬取了爬蟲崗位薪資，分析後發現爬蟲真香
2020-12-09
爬蟲
那些年，我爬過的北科(四)——爬蟲進階之極簡併行爬蟲框架開發
2019-03-04
爬蟲框架
爬蟲：多程式爬蟲
2021-05-19
爬蟲
phpspider簡單快速上手的php爬蟲框架
2020-02-17
PHPIDE爬蟲框架
python爬蟲---網頁爬蟲，圖片爬蟲，文章爬蟲，Python爬蟲爬取新聞網站新聞
2019-01-04
Python爬蟲網頁網站
[爬蟲手記] 我是如何在3分鐘內開發完一個爬蟲的
2019-05-27
爬蟲
python爬蟲實操專案_Python爬蟲開發與專案實戰 1.6 小結
2021-02-04
Python爬蟲
Reactjs前端、Python爬蟲、Nodejs後臺開發招聘
2018-11-09
React前端Python爬蟲NodeJS
JavaFX 整合 Sqlite 和 Hibernate 開發爬蟲應用
2019-08-06
JavaSQLite爬蟲
Python爬蟲開發與專案實戰pdf
2020-01-11
Python爬蟲
Python 開發簡單爬蟲 (學習筆記)
2019-08-05
Python爬蟲筆記
爬蟲開發知識入門基礎（1）
2020-06-22
爬蟲
《網路爬蟲開發實戰案例》筆記
2020-08-10
爬蟲筆記

PHP蜘蛛爬蟲開發文件

PHP蜘蛛爬蟲開發文件

官方文件

GitHub 地址

phpspider-master/core 檔案介紹

載入核心檔案

requests.php 請求類 詳解

selector.php 選擇器類 詳解

db.php 資料庫類 詳解

使用 configs 來編寫爬蟲

相關文章

requests.php 請求類詳解

selector.php 選擇器類詳解

db.php 資料庫類詳解