php採集遠端文章簡單類

suboysugar發表於2015-07-16

<?php
/**
 * Scraper class: downloads remote pages and extracts links or delimited
 * fragments from the returned HTML.
 * @author Milkcy 
 * @copyright            (C) 2012-2015 TCCMS.COM
 * @lastmodify             2012-07-10 14:00
 */
class gather {

    /** @var string Not read or written by any method of this class. */
    public $pagestring = '';

    /** @var mixed Copy of the global $db taken at construction; unused by the methods below. */
    private $db;

    /**
     * Capture the global $db handle (null when no such global exists).
     */
    public function __construct() {
        global $db;
        $this->db = $db;
    }

    /**
     * Download the body of a URL.
     *
     * Uses cURL (following redirects, headers excluded) when the extension is
     * loaded, otherwise falls back to file_get_contents().
     *
     * @param string $url Address to fetch; surrounding whitespace is ignored.
     * @return string Trimmed response body, or '' when the transfer failed.
     */
    public function geturlfile($url) {
        $url = trim($url);
        if (extension_loaded('curl')) {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            curl_setopt($ch, CURLOPT_HEADER, 0);
            $content = curl_exec($ch);
            curl_close($ch);
        } else {
            $content = file_get_contents($url);
        }
        // Both transports report failure with false; make the '' result explicit
        // instead of relying on trim() coercing a boolean.
        if (!is_string($content)) {
            return '';
        }
        return trim($content);
    }

    /**
     * Collect every anchor found in an HTML fragment.
     *
     * @param string $code HTML to scan.
     * @return array ['name' => list of link texts, 'url' => list of href values],
     *               both lists sharing the same indexes.
     */
    public function get_all_url($code) {
        // Group 1 is the href value (optionally quoted), group 2 the link text.
        preg_match_all('/<a.+?href=["|\']?([^>"\' ]+)["|\']?\s*[^>]*>([^>]+)<\/a>/is', $code, $arr);
        return array('name' => $arr[2], 'url' => $arr[1]);
    }

    /**
     * Return the text located between two markers.
     *
     * Both markers are trimmed before use. The text is split on every
     * occurrence of $start; the piece following the first occurrence is then
     * cut at the first occurrence of $end.
     *
     * @param string $str   Text to search.
     * @param string $start Opening marker.
     * @param string $end   Closing marker.
     * @return string $str unchanged when either marker is empty, '' when the
     *                opening marker is absent, otherwise the extracted piece.
     */
    public function get_sub_content($str, $start, $end) {
        $start = trim($start);
        $end = trim($end);
        if ($start === '' || $end === '') {
            return $str;
        }
        $pieces = explode($start, $str);
        // Without this guard a missing start marker read an undefined offset.
        if (!isset($pieces[1])) {
            return '';
        }
        $pieces = explode($end, $pieces[1]);
        return $pieces[0];
    }

    /**
     * Debug helper: print a var_dump() of a value inside a styled HTML box.
     *
     * @param mixed $var Value to dump.
     */
    public function vd($var) {
        echo "<div style=\"border:1px solid #ddd;background:#F7F7F7;padding:5px 10px;\">\r\n";
        echo "<pre style=\"font-family:Arial,Vrinda;font-size:14px;\">\r\n";
        var_dump($var);
        echo "\r\n</pre>\r\n";
        echo "</div>";
    }

}

?>

<?php
// Demo: take the first article linked from a NetEase news list page and dump
// its body text.
define('ROOT_PATH', str_replace('\\', '/', dirname(__FILE__)));
include ROOT_PATH."/gather.class.php";
set_time_limit(0);
header("Content-type: text/html; charset=gb2312");
// Target list page.
$url = 'http://news.163.com/special/00013C0O/guojibjtj_03.html';
// Create the scraper.
$gather = new gather();
// Download the list page HTML.
$html = $gather->geturlfile($url);
// Markers delimiting the article list on that page.
$start = '<div class="bd clearfix">';
$end = '<div class="pages-1 mt25">';
// Extract the URL and title of every article inside the list section.
$code = $gather->get_sub_content($html, $start, $end);
$newsAry = $gather->get_all_url($code);
// Uncomment to print the extracted list.
//$gather->vd($newsAry);
// Stop early when the download failed or the page layout changed; reading
// index 0 of an empty list would otherwise raise an undefined-offset warning.
if (empty($newsAry['url'])) {
    exit('No article links found on the list page.');
}
$tarGetUrl = $newsAry['url'][0];
// Download the first article's HTML.
$html = $gather->geturlfile($tarGetUrl);
// Markers delimiting the article body.
$start = '<div id="endText">';
$end = '<span class="cDGray right" style="white-space:nowrap;">';
// Extract the article body.
$code = $gather->get_sub_content($html, $start, $end);
// Embedded advert iframe and trailing site icon to strip from the body.
$killHtml = '<iframe src="http://g.163.com/r?site=netease&affiliate=news&cat=article&type=tvscreen200x300&location=1" width="200" height="300" frameborder="no" border="0" marginwidth="0" marginheight="0" scrolling="no"></iframe>';
$killHtml2 = '<a href="http://news.163.com/"><img src="http://img1.cache.netease.com/cnews/img07/end_i.gif" alt="netease" width="12" height="11" border="0" class="icon" /></a>';
$code = str_replace($killHtml, "", $code);
$code = str_replace($killHtml2, "", $code);
$gather->vd($code);
?>
//該片段來自於http://outofmemory.cn

php 文章採集正則程式碼

/**
 * Fetch a page with cURL and hand back its body.
 *
 * Redirects are followed and the connection phase is capped at ten seconds.
 *
 * @param string $url Address of the page to download.
 * @return string The curl_exec() result passed through trim().
 */
function getwebcontent($url){
    $connectTimeout = 10;
    $handle = curl_init();
    curl_setopt($handle, CURLOPT_URL, $url);
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, $connectTimeout);
    curl_setopt($handle, CURLOPT_FOLLOWLOCATION, 1);
    $raw = curl_exec($handle);
    curl_close($handle);
    return trim($raw);
}


// Download the listing page that carries the article titles and URLs.
$string = getwebcontent('http://www.***.com/learn/zhunbeihuaiyun/jijibeiyun/2');

// Declare every key up front so the loops below are safe when nothing matches.
$article = array('title' => array(), 'link' => array(), 'content' => array());

// Each <li> entry yields the article path (group 1) and its title (group 2).
// '#' is the delimiter so the slashes in the markup need no escaping, and the
// captures are bounded so several links on one line are matched separately.
preg_match_all('#<li><a href="/learn/article/([^"]*)">(.*?)</a>#', $string, $out, PREG_SET_ORDER);
foreach ($out as $match) {
    $article['title'][] = $match[2];
    $article['link'][] = "http://www.***.com/learn/article/" . $match[1];
}

// Download each article and keep the first "pagenum_0" <div> as its content.
foreach ($article['link'] as $key => $link) {
    $content_html = getwebcontent($link);
    if (preg_match('#<div id=pagenum_0[^>]*>[\s\S]*?</div>#', $content_html, $matches)) {
        $article['content'][$key] = $matches[0];
    } else {
        // Layout mismatch or failed download: store an empty body.
        $article['content'][$key] = '';
    }
}

// Titles become file names. They come from a remote page, so replace path
// separators and other characters that are unsafe in file names while the
// text is still UTF-8, then convert to GBK (the original author notes the
// files could not be saved without this conversion).
foreach ($article['title'] as $key => $value) {
    $safe = preg_replace('#[\\\\/:*?"<>|\x00-\x1F]#', '_', trim($value));
    $converted = iconv('utf-8', 'gbk', $safe);
    // iconv() returns false on failure; fall back to the sanitised UTF-8 title.
    $article['title'][$key] = ($converted === false) ? $safe : $converted;
}

// Write each article body to "<title>.txt".
foreach ($article['title'] as $i => $title) {
    file_put_contents("{$title}.txt", $article['content'][$i]);
}
?>

如何聯絡我：【萬里虎】www.bravetiger.cn
【QQ】3396726884 （諮詢問題100元起，幫助解決問題500元起）
【部落格】http://www.cnblogs.com/kenshinobiy/

PHP使用QueryList採集微信文章頁
2019-02-16
PHP
PHP 實現簡單的資料採集併入庫
2019-12-15
PHP
php+nginx實現最簡單的遠端呼叫rpc(微服務)
2023-12-25
PHPNginxRPC微服務
物聯網終端裝置——遠端資料採集儀
2019-02-20
自媒體素材採集平臺，採集影片文章素材
2020-07-10
簡單的php連線mysql類
2024-08-19
PHPMySql
文章採集代理ip怎麼用？
2021-09-11
遠端兼職 PHP
2018-11-16
PHP
原來 Java 遠端除錯如此簡單
2020-09-30
Java除錯
vs聯合halcon——採集影像（實時採集與單次採集）
2021-06-27
php 分頁分頁類簡單實用
2021-09-09
PHP
php簡單操作mysql資料庫的類
2021-01-26
PHPMySql資料庫
英特佩斯遠端資料採集和車隊管理平臺
2020-07-14
數控磨床資料採集遠端監控物聯網系統
2023-04-11
本人想找PHP兼職（遠端）
2021-11-11
PHP
PHP 採集程式中日常的引數
2019-05-11
PHP
PHP 資料採集的一種思路
2019-01-04
PHP
【京東】商品list列表採集+類目下的商品列表資料採集
2023-04-23
WordPress自動採集釋出文章01-使用火車頭採集目標網站
2018-07-08
網站
QueryList 4.0 簡潔、優雅、可擴充套件的PHP採集工具(爬蟲)
2019-02-16
套件PHP爬蟲
消防應急電源資料採集遠端監控系統解決方案
2024-01-31
熱壓機PLC資料採集遠端監控物聯網解決方案
2023-12-19
水質監測儀資料採集遠端監控系統解決方案
2024-04-11
簡單簡易實現伺服器遠端登陸傳送簡訊提示
2018-09-07
伺服器
POCO相簿的照片批量採集下載的簡單方法有嗎？
2021-11-17
三類遠端連線命令
2020-09-24
Rdp遠端桌面簡介，利用遠端桌面連線遠端伺服器的方法
2020-05-05
伺服器
WordPress自動採集釋出文章04-如何批量定時釋出文章
2018-07-08
自動繞線機PLC資料採集遠端維護上下載系統方案
2024-01-23
PHP PDO 簡單教程
2019-05-26
PHP
基於 WebSocket 的 PPT 遠端控制器簡單實現
2021-09-15
Web
win7遠端桌面連線方法，十分簡單！
2020-02-10
Win7
有了它，在家遠端辦公其實可以很簡單
2020-02-20
利用遠端桌面管理winserver叢集
2020-11-11
Server
【PHP資料結構】插入類排序：簡單插入、希爾排序
2021-09-09
PHP資料結構排序
Rd遠端桌面簡介，利用rd遠端桌面連線遠端伺服器的方法
2020-05-05
伺服器
WordPress自動採集釋出文章03-如何批量把文章上傳到wordpress
2018-07-08
PHP multipart/form-data 遠端DOS漏洞
2020-08-19
PHPORM
php利用ssh操作遠端伺服器
2021-09-09
PHP伺服器

php採集遠端文章簡單類

php 文章採集正則程式碼

相關文章