Scraping News from Websites

Posted by 今天NLP了嗎 on 2020-09-24

光明網 (Guangming Online) military news

The first script walks the military channel's list pages on mil.gmw.cn, follows each article link, and writes two CSVs: one row per full article, and one row per 200-500 character paragraph chunk.

import time
import requests
import re
import csv
from bs4 import BeautifulSoup

session = requests.session()

# Output files: one CSV of full articles, one of paragraph chunks.
f = open('0905/0905原文.csv', 'a', encoding='utf-8', newline='')
fp = open('0905/0905段落.csv', 'a', encoding='utf-8', newline='')
csv_article = csv.writer(f)
csv_para = csv.writer(fp)

def getNewsDetail(newsurl):
    # Fetch one article; return its full text plus 200-500 character paragraph chunks.
    news_p = []
    p1 = ''
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # Accumulate paragraphs until the buffer lands in the 200-500 character
    # window, then emit it as one chunk. Note: a trailing buffer under 200
    # characters, or one that jumps past 500, is never emitted.
    for p in soup.select('.u-mainText p'):
        p1 = p1 + p.text.strip().replace('\n', '')
        if 200 <= len(p1) <= 500:
            news_p.append(p1)
            p1 = ''
    # Full article: all paragraphs joined with spaces.
    news_article = ' '.join([p.text.strip().replace('\n', '') for p in soup.select('.u-mainText p')])
    return news_article, news_p


def spider():
    # List pages: node_8979.htm, node_8979_2.htm, ..., node_8979_10.htm
    pages = ['', '_2', '_3', '_4', '_5',
             '_6', '_7', '_8', '_9', '_10']
    for onepage in pages:
        # Build the list-page URL
        url = "http://mil.gmw.cn/node_8979" + onepage + ".htm"
        print(url)
        # Spoof a browser's request headers
        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        }

        result = session.get(url=url, headers=headers).content
        soup = BeautifulSoup(result, 'html.parser')
        # Locate the news list on the page; stop if the layout isn't there
        left_part = soup.find('div', attrs={'class': 'channelLeftPart'})
        if left_part is None:
            break
        result_div = left_part.find_all('div')[1].find_all('ul', attrs={'class': 'channel-newsGroup'})
        # Strip newlines, carriage returns, and tabs
        result_replace = str(result_div).replace('\n', '').replace('\r', '').replace('\t', '')
        # Regex out the individual <li> entries
        result_list = re.findall('<li>(.*?)</li>', result_replace)

        for i in result_list:
            # e.g. http://mil.gmw.cn/2020-09/04/content_34157244.htm
            news_url = 'http://mil.gmw.cn/' + re.findall('<a href="(.*?)" target=', i)[0]
            news_name = re.findall('target="_blank">(.*?)</a>', i)[0]
            print(news_name)
            news_article, news_p = getNewsDetail(news_url)
            # One row per paragraph chunk: [paragraph, title]
            for p1 in news_p:
                if p1 != '':
                    csv_para.writerow([p1.replace("\u00a0", ""), news_name.replace("\u00a0", "")])

            # One row per article: [title, full text]
            if news_article != '':
                csv_article.writerow([news_name.replace("\u00a0", ""), news_article.replace("\u00a0", "")])
        # Be polite to the server between list pages
        time.sleep(3)
spider()
f.close()
fp.close()
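
Both scripts issue bare session.get calls with no timeout or error handling, so one hung connection or non-200 response can stall or crash a whole crawl. Below is a minimal hardening sketch; the fetch helper and its parameters are my own and not part of the original code.

def fetch(url, headers=None, retries=3, timeout=10):
    # Return the response, or None after `retries` failed attempts.
    for attempt in range(retries):
        try:
            res = session.get(url, headers=headers, timeout=timeout)
            res.raise_for_status()  # treat 4xx/5xx as failures too
            return res
        except requests.RequestException:
            time.sleep(2 ** attempt)  # back off before retrying
    return None

Swapping it in is a one-line change at each call site, e.g. result = fetch(url, headers=headers), followed by a None check before parsing.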

國防科技資訊網 (dsti.net)

The second script applies the same pattern to the defense S&T information site dsti.net; the real differences are the list-page selectors and the GB18030 page encoding.

# -*- coding:utf-8 -*-
import time
import requests
import re
import csv
from bs4 import BeautifulSoup

session = requests.session()

# Output files: one CSV of full articles, one of paragraph chunks.
f = open('0906/0906electron原文.csv', 'w+', encoding='utf-8', newline='')
fp = open('0906/0906electron段落.csv', 'w+', encoding='utf-8', newline='')
csv_article = csv.writer(f)
csv_para = csv.writer(fp)
def getNewsDetail(newsurl):
    # Fetch one article; return its full text plus 200-500 character paragraph chunks.
    news_p = []
    p1 = ''
    result = session.get(url=newsurl)
    result.encoding = 'gb18030'  # article pages are GB-encoded, like the list pages
    soup = BeautifulSoup(result.text, 'html.parser')
    # Same chunking as the first script: emit the buffer once it lands in
    # the 200-500 character window.
    for p in soup.select('.newsContent p'):
        p1 = p1 + p.text.replace('\n', '')
        if 200 <= len(p1) <= 500:
            news_p.append(p1)
            p1 = ''
    news_article = ' '.join([p.text.strip().replace('\n', '') for p in soup.select('.newsContent p')])
    return news_article, news_p


def spider():
    # Other channels on the same site (list URL, observed page range):
    #   http://www.dsti.net/Information/HyeList/aviation/    pages 0-487
    #   http://www.dsti.net/Information/HyeList/spaceflight  pages 0-48
    #   http://www.dsti.net/Information/HyeList/electron/    pages 1-30
    for page in range(1, 30):
        # Build the list-page URL (swap in another channel as needed)
        url = "http://www.dsti.net/Information/HyeList/electron/" + str(page)
        print(url)
        # Spoof a browser's request headers
        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        }

        result = session.get(url=url, headers=headers).content
        # dsti.net serves GB-encoded pages; decode the raw bytes as GB18030
        soup = BeautifulSoup(result, 'html.parser', from_encoding="gb18030")
        # Locate the news list on the page; stop if the layout isn't there
        mid_content = soup.find('div', attrs={'class': 'listMidContent'})
        if mid_content is None:
            break
        result_div = mid_content.find('ul')
        # Strip newlines, carriage returns, and tabs
        result_replace = str(result_div).replace('\n', '').replace('\r', '').replace('\t', '')
        # Regex out the individual <li><h1> entries
        result_list = re.findall('<li><h1>.(.*?)</h1>', result_replace)
        for i in result_list:
            # e.g. http://www.dsti.net/Information/News/120652
            news_url = 'http://www.dsti.net/' + re.findall('href="(.*?)" target="_blank">', i)[0]
            news_name = re.findall('target="_blank">(.*?)</a>', i)[0]
            news_article, news_p = getNewsDetail(news_url)
            # One row per paragraph chunk: [paragraph, title]
            for p1 in news_p:
                if p1 != '':
                    csv_para.writerow([p1.replace("\u00a0", ""), news_name.replace("\u00a0", "")])

            # One row per article: [title, full text]
            if news_article != '':
                csv_article.writerow([news_name.replace("\u00a0", ""), news_article.replace("\u00a0", "")])
        # Be polite to the server between list pages
        time.sleep(1)
spider()
f.close()
fp.close()
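
After a run, each paragraph CSV holds one [paragraph, title] row per chunk, and each article CSV one [title, full text] row. A quick sketch for reading the paragraph file back, using the same file name this script writes:

import csv

with open('0906/0906electron段落.csv', encoding='utf-8', newline='') as fin:
    for paragraph, title in csv.reader(fin):
        print(title, len(paragraph))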
