Scraping News from Websites

Posted by 今天NLP了嗎 on 2020-09-24

光明網 (Guangming Online) military news

The first script walks the military channel's list pages on mil.gmw.cn, follows each article link, and writes two CSVs: one row per full article, and one row per 200-500 character paragraph chunk.

import time
import requests
import re
import csv
from bs4 import BeautifulSoup

session = requests.session()

# Output files: one CSV of full articles, one of paragraph chunks.
f = open('0905/0905原文.csv', 'a', encoding='utf-8', newline='')
fp = open('0905/0905段落.csv', 'a', encoding='utf-8', newline='')
csv_article = csv.writer(f)
csv_para = csv.writer(fp)

def getNewsDetail(newsurl):
    # Fetch one article; return its full text plus 200-500 character paragraph chunks.
    news_p = []
    p1 = ''
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # Accumulate paragraphs until the buffer lands in the 200-500 character
    # window, then emit it as one chunk. Note: a trailing buffer under 200
    # characters, or one that jumps past 500, is never emitted.
    for p in soup.select('.u-mainText p'):
        p1 = p1 + p.text.strip().replace('\n', '')
        if 200 <= len(p1) <= 500:
            news_p.append(p1)
            p1 = ''
    # Full article: all paragraphs joined with spaces.
    news_article = ' '.join([p.text.strip().replace('\n', '') for p in soup.select('.u-mainText p')])
    return news_article, news_p


def spider():
    # List pages: node_8979.htm, node_8979_2.htm, ..., node_8979_10.htm
    pages = ['', '_2', '_3', '_4', '_5',
             '_6', '_7', '_8', '_9', '_10']
    for onepage in pages:
        # Build the list-page URL
        url = "http://mil.gmw.cn/node_8979" + onepage + ".htm"
        print(url)
        # Spoof a browser's request headers
        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        }

        result = session.get(url=url, headers=headers).content
        soup = BeautifulSoup(result, 'html.parser')
        # Locate the news list on the page; stop if the layout isn't there
        left_part = soup.find('div', attrs={'class': 'channelLeftPart'})
        if left_part is None:
            break
        result_div = left_part.find_all('div')[1].find_all('ul', attrs={'class': 'channel-newsGroup'})
        # Strip newlines, carriage returns, and tabs
        result_replace = str(result_div).replace('\n', '').replace('\r', '').replace('\t', '')
        # Regex out the individual <li> entries
        result_list = re.findall('<li>(.*?)</li>', result_replace)

        for i in result_list:
            # e.g. http://mil.gmw.cn/2020-09/04/content_34157244.htm
            news_url = 'http://mil.gmw.cn/' + re.findall('<a href="(.*?)" target=', i)[0]
            news_name = re.findall('target="_blank">(.*?)</a>', i)[0]
            print(news_name)
            news_article, news_p = getNewsDetail(news_url)
            # One row per paragraph chunk: [paragraph, title]
            for p1 in news_p:
                if p1 != '':
                    csv_para.writerow([p1.replace("\u00a0", ""), news_name.replace("\u00a0", "")])

            # One row per article: [title, full text]
            if news_article != '':
                csv_article.writerow([news_name.replace("\u00a0", ""), news_article.replace("\u00a0", "")])
        # Be polite to the server between list pages
        time.sleep(3)
spider()
f.close()
fp.close()
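
Both scripts issue bare session.get calls with no timeout or error handling, so one hung connection or non-200 response can stall or crash a whole crawl. Below is a minimal hardening sketch; the fetch helper and its parameters are my own and not part of the original code.

def fetch(url, headers=None, retries=3, timeout=10):
    # Return the response, or None after `retries` failed attempts.
    for attempt in range(retries):
        try:
            res = session.get(url, headers=headers, timeout=timeout)
            res.raise_for_status()  # treat 4xx/5xx as failures too
            return res
        except requests.RequestException:
            time.sleep(2 ** attempt)  # back off before retrying
    return None

Swapping it in is a one-line change at each call site, e.g. result = fetch(url, headers=headers), followed by a None check before parsing.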

國防科技資訊網 (dsti.net)

The second script applies the same pattern to the defense S&T information site dsti.net; the real differences are the list-page selectors and the GB18030 page encoding.

# -*- coding:utf-8 -*-
import time
import requests
import re
import csv
from bs4 import BeautifulSoup

session = requests.session()

# Output files: one CSV of full articles, one of paragraph chunks.
f = open('0906/0906electron原文.csv', 'w+', encoding='utf-8', newline='')
fp = open('0906/0906electron段落.csv', 'w+', encoding='utf-8', newline='')
csv_article = csv.writer(f)
csv_para = csv.writer(fp)
def getNewsDetail(newsurl):
    # Fetch one article; return its full text plus 200-500 character paragraph chunks.
    news_p = []
    p1 = ''
    result = session.get(url=newsurl)
    result.encoding = 'gb18030'  # article pages are GB-encoded, like the list pages
    soup = BeautifulSoup(result.text, 'html.parser')
    # Same chunking as the first script: emit the buffer once it lands in
    # the 200-500 character window.
    for p in soup.select('.newsContent p'):
        p1 = p1 + p.text.replace('\n', '')
        if 200 <= len(p1) <= 500:
            news_p.append(p1)
            p1 = ''
    news_article = ' '.join([p.text.strip().replace('\n', '') for p in soup.select('.newsContent p')])
    return news_article, news_p


def spider():
    # Other channels on the same site (list URL, observed page range):
    #   http://www.dsti.net/Information/HyeList/aviation/    pages 0-487
    #   http://www.dsti.net/Information/HyeList/spaceflight  pages 0-48
    #   http://www.dsti.net/Information/HyeList/electron/    pages 1-30
    for page in range(1, 30):
        # Build the list-page URL (swap in another channel as needed)
        url = "http://www.dsti.net/Information/HyeList/electron/" + str(page)
        print(url)
        # Spoof a browser's request headers
        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        }

        result = session.get(url=url, headers=headers).content
        # dsti.net serves GB-encoded pages; decode the raw bytes as GB18030
        soup = BeautifulSoup(result, 'html.parser', from_encoding="gb18030")
        # Locate the news list on the page; stop if the layout isn't there
        mid_content = soup.find('div', attrs={'class': 'listMidContent'})
        if mid_content is None:
            break
        result_div = mid_content.find('ul')
        # Strip newlines, carriage returns, and tabs
        result_replace = str(result_div).replace('\n', '').replace('\r', '').replace('\t', '')
        # Regex out the individual <li><h1> entries
        result_list = re.findall('<li><h1>.(.*?)</h1>', result_replace)
        for i in result_list:
            # e.g. http://www.dsti.net/Information/News/120652
            news_url = 'http://www.dsti.net/' + re.findall('href="(.*?)" target="_blank">', i)[0]
            news_name = re.findall('target="_blank">(.*?)</a>', i)[0]
            news_article, news_p = getNewsDetail(news_url)
            # One row per paragraph chunk: [paragraph, title]
            for p1 in news_p:
                if p1 != '':
                    csv_para.writerow([p1.replace("\u00a0", ""), news_name.replace("\u00a0", "")])

            # One row per article: [title, full text]
            if news_article != '':
                csv_article.writerow([news_name.replace("\u00a0", ""), news_article.replace("\u00a0", "")])
        # Be polite to the server between list pages
        time.sleep(1)
spider()
f.close()
fp.close()
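
After a run, each paragraph CSV holds one [paragraph, title] row per chunk, and each article CSV one [title, full text] row. A quick sketch for reading the paragraph file back, using the same file name this script writes:

import csv

with open('0906/0906electron段落.csv', encoding='utf-8', newline='') as fin:
    for paragraph, title in csv.reader(fin):
        print(title, len(paragraph))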
