介紹
基於 Laravel + Guzzle 實現的一個簡單爬蟲指令碼(抓取網站所有連結並獲取其 HTTP 狀態)。
路由
$router->get('reptile/index', 'ReptileController@index');
程式碼
<?php
namespace App\Http\Controllers\Admin;
use App\Http\Controllers\Controller;
use GuzzleHttp\Client;
use GuzzleHttp\Pool;
use Illuminate\Http\Request;
use Illuminate\Support\Facades\DB;
/**
 * Crawls the URLs of a site and records their HTTP status.
 * Class ReptileController
 * @package App\Http\Controllers\Admin
 */
class ReptileController extends Controller
{
/**
 * Crawler entry point.
 *
 * Reads the start URL from the `url` request input, collects the links on
 * that page that begin with the start URL, fetches each of those pages to
 * collect further matching links, and passes that second-level list on to
 * be status-checked and stored.
 *
 * Example: http://127.0.0.1:8000/admin/reptile/index?url=https://www.xxxx.com
 *
 * Terminates the script with "error" when the URL is missing or is not an
 * absolute http(s) URL. Returns without output when the start page cannot
 * be fetched.
 *
 * @param Request $request
 */
public function index(Request $request)
{
    $testUrl = $request->input('url');

    // The value ends up in file_get_contents(), which would also read local
    // files and php:// streams, so only absolute http(s) URLs are accepted.
    // Requiring scheme and host also keeps reviseUrl() from hitting
    // undefined array indexes.
    $parts = is_string($testUrl) ? parse_url($testUrl) : false;
    $isHttpUrl = is_array($parts)
        && isset($parts['scheme'], $parts['host'])
        && in_array(strtolower($parts['scheme']), ['http', 'https'], true);
    if (empty($testUrl) || !$isHttpUrl) {
        die('error');
    }

    $siteContent = $this->getUrlContent($testUrl);
    if (empty($siteContent)) {
        return;
    }

    // crawler() returns false when the page contains no links; treat that
    // as an empty list instead of iterating over a boolean.
    $urlList = $this->crawler($testUrl, $siteContent) ?: [];

    // Keep only links that start with the requested URL.
    $prefixLength = strlen($testUrl);
    $list = [];
    foreach ($urlList as $item) {
        if (strncmp($item, $testUrl, $prefixLength) === 0) {
            $list[] = $item;
        }
    }

    // Drop duplicates before fetching each page a second time.
    $list = array_unique($list);

    // Only the links found on the second-level pages are status-checked.
    $list1 = $this->filter($list, $testUrl);
    $this->getAllHeaderResponse($list1);
}
/**
 * Fetches every page in $result and collects the links found on those
 * pages that begin with $testUrl.
 *
 * Pages that cannot be fetched, or that contain no links, are skipped.
 *
 * @param array  $result  Page URLs to fetch and scan.
 * @param string $testUrl Prefix a link must start with to be kept.
 * @return array De-duplicated, sequentially indexed list of matching links,
 *               in order of first appearance.
 */
private function filter($result, $testUrl)
{
    $prefixLength = strlen($testUrl);
    $list = [];
    // Set of links already collected; gives O(1) de-duplication instead of
    // re-running array_unique() over the whole list after every page.
    $seen = [];

    foreach ($result as $item) {
        $siteContent = $this->getUrlContent($item);
        if (empty($siteContent)) {
            continue;
        }

        $pageLinks = $this->crawler($item, $siteContent);
        if (empty($pageLinks)) {
            continue;
        }

        foreach ($pageLinks as $value) {
            if (strncmp($value, $testUrl, $prefixLength) !== 0) {
                continue;
            }
            if (!isset($seen[$value])) {
                $seen[$value] = true;
                $list[] = $value;
            }
        }
    }

    return $list;
}
/**
 * Checks the HTTP status of every URL in $result, stores one row per
 * checked URL in the `site_status` table, then terminates the script with
 * "success".
 *
 * Each row holds the URL, the status reported by multiCheckNetResource()
 * and the total number of URLs that were checked.
 *
 * @param array $result URLs to check.
 */
private function getAllHeaderResponse($result)
{
    // Re-index so that positions line up with the 0-based indexes the
    // request pool reports results under.
    $result = array_values($result);
    $count = count($result);

    $res = self::multiCheckNetResource($result);

    $data = [];
    foreach ($result as $k => $v) {
        // Direct key lookup; a URL without a reported status is skipped.
        if (!array_key_exists($k, $res)) {
            continue;
        }
        $data[] = [
            'url' => $v,
            'status' => $res[$k],
            'count' => $count,
        ];
    }

    DB::table('site_status')->insert($data);
    die('success');
}
/**
 * Downloads the body of a page.
 *
 * Only http and https URLs are fetched. file_get_contents() accepts any
 * stream wrapper or plain filesystem path, so without this check a crafted
 * URL could read local files.
 *
 * @param string $url Absolute http(s) URL.
 * @return bool|false|string Page content, or false when the URL is not
 *                           http(s) or the download fails.
 */
private function getUrlContent($url)
{
    $scheme = is_string($url) ? parse_url($url, PHP_URL_SCHEME) : null;
    if (!is_string($scheme) || !in_array(strtolower($scheme), ['http', 'https'], true)) {
        return false;
    }

    try {
        // On failure file_get_contents() returns false and raises a warning;
        // the catch covers error handlers that turn warnings into exceptions.
        return file_get_contents($url);
    } catch (\Throwable $exception) {
        return false;
    }
}
/**
 * Extracts the links of a page and turns them into absolute URLs.
 *
 * @param string $url     URL of the page; used as the base for relative links.
 * @param string $content HTML of the page.
 * @return array|bool List of URLs, or false when none were found.
 */
private function crawler($url, $content = '')
{
    $hrefs = $this->filterUrl($content);
    $absoluteUrls = $this->reviseUrl($url, $hrefs);

    // An empty list and a failed lookup are both reported as false.
    return $absoluteUrls ?: false;
}
/**
 * Extracts the href values of all anchor tags in an HTML document.
 *
 * The pattern matches `<a` followed by whitespace only, so other tags that
 * start with "a" (article, abbr, area, ...) are ignored. The search for
 * href stays inside the tag instead of running on to a later tag. Tag and
 * attribute names are matched case-insensitively, and a tag may span
 * several lines. Values may be single-quoted, double-quoted or unquoted;
 * a value ends at the first quote, whitespace or `>`.
 *
 * @param string $webContent HTML to scan.
 * @return bool|mixed List of raw href values in document order, or false
 *                    when no anchor with an href was found.
 */
private function filterUrl($webContent)
{
    $pattern = '/<a(?=\s)[^>]*?\shref\s*=\s*[\'"]?([^>\'"\s]*)/i';
    $found = preg_match_all($pattern, (string) $webContent, $matchResult);

    // preg_match_all() yields 0 for "no match" and false on error; both
    // are reported as false.
    return $found ? $matchResult[1] : false;
}
/**
 * Turns the raw href values of a page into absolute URLs.
 *
 * - Absolute http(s) URLs are kept unchanged.
 * - Protocol-relative values (`//host/path`) get the scheme of $baseUrl.
 * - Other relative values are appended to scheme://host[:port] of $baseUrl;
 *   the path of $baseUrl is not taken into account.
 * - Empty values, fragment-only values (`#top`) and values with another
 *   scheme (mailto:, javascript:, tel:, ...) are dropped because they do
 *   not point to a fetchable page.
 * - If $baseUrl has no scheme or host, relative values are dropped.
 *
 * @param string     $baseUrl URL of the page the links were found on.
 * @param array|bool $urlList Raw href values, or false when none were found.
 * @return array|bool List of absolute URLs, or false when $urlList is not
 *                    an array.
 */
private function reviseUrl($baseUrl, $urlList)
{
    if (!is_array($urlList)) {
        return false;
    }

    // Build scheme://host[:port]; dropping the port would send relative
    // links of a site on a non-default port to the wrong server.
    $origin = null;
    $urlInfo = parse_url((string) $baseUrl);
    if (is_array($urlInfo) && isset($urlInfo['scheme'], $urlInfo['host'])) {
        $origin = $urlInfo['scheme'] . '://' . $urlInfo['host'];
        if (isset($urlInfo['port'])) {
            $origin .= ':' . $urlInfo['port'];
        }
    }

    $result = [];
    foreach ($urlList as $urlItem) {
        $urlItem = trim((string) $urlItem);

        // Nothing to fetch for empty or fragment-only links.
        if ($urlItem === '' || $urlItem[0] === '#') {
            continue;
        }

        // Already an absolute http(s) URL.
        if (preg_match('/^https?:\/\//i', $urlItem)) {
            $result[] = $urlItem;
            continue;
        }

        // Any other scheme (mailto:, javascript:, tel:, data:, ...).
        if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $urlItem)) {
            continue;
        }

        // Relative links cannot be resolved without a usable base.
        if ($origin === null) {
            continue;
        }

        if (strncmp($urlItem, '//', 2) === 0) {
            // Protocol-relative: reuse the scheme of the base URL.
            $result[] = $urlInfo['scheme'] . ':' . $urlItem;
        } elseif ($urlItem[0] === '/') {
            $result[] = $origin . $urlItem;
        } else {
            $result[] = $origin . '/' . $urlItem;
        }
    }

    return $result;
}
/**
 * Sends a HEAD request to every URL concurrently and collects one status
 * value per URL.
 *
 * @param array $taskUrls    URLs to check.
 * @param int   $concurrency Maximum number of requests in flight at once.
 * @param array $config      Options passed to the Guzzle client constructor.
 * @return array Map of request index to status: the response status code
 *               for fulfilled requests, the throwable's code for rejected
 *               ones. Indexes follow the iteration order of $taskUrls,
 *               starting at 0.
 */
private static function multiCheckNetResource(
    $taskUrls,
    $concurrency = 20,
    $config = [
        'verify' => false,
        'timeout' => 3,
    ]
)
{
    $statusByIndex = [];
    $httpClient = new Client($config);

    // Requests are generated lazily, one HEAD request per URL.
    $headRequests = (function () use ($taskUrls) {
        foreach ($taskUrls as $taskUrl) {
            yield new \GuzzleHttp\Psr7\Request('HEAD', $taskUrl);
        }
    })();

    // Called for each request that completes with a response.
    $onFulfilled = function (\GuzzleHttp\Psr7\Response $response, $index) use (&$statusByIndex) {
        $statusByIndex[$index] = $response->getStatusCode();
    };

    // Called for each request that fails.
    $onRejected = function (\Throwable $throwable, $index) use (&$statusByIndex) {
        $statusByIndex[$index] = $throwable->getCode();
    };

    $pool = new Pool($httpClient, $headRequests, [
        'concurrency' => $concurrency,
        'fulfilled' => $onFulfilled,
        'rejected' => $onRejected,
    ]);

    // Block until every request has been settled.
    $pool->promise()->wait();

    return $statusByIndex;
}
}
最終效果
本作品採用《CC 協議》,轉載必須註明作者和本文連結