Guangming Online (光明網) military news
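Scraper for the military news channel of 光明網 (mil.gmw.cn). It walks the node_8979 list pages, fetches each linked article, and appends the full text plus 200-500 character paragraph chunks to two CSV files.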
import csv
import re
import string
import time

import requests
import zhon.hanzi
from bs4 import BeautifulSoup

session = requests.session()

punce = string.punctuation        # ASCII punctuation (kept for later text cleaning, unused below)
puncz = zhon.hanzi.punctuation    # Chinese punctuation (kept for later text cleaning, unused below)

# Two outputs: full articles and 200-500 character paragraph chunks.
f = open('0905/0905原文.csv', 'a', encoding='utf-8', newline='')
fp = open('0905/0905段落.csv', 'a', encoding='utf-8', newline='')
csv_article = csv.writer(f)
csv_para = csv.writer(fp)
def getNewsDetail(newsurl):
    """Fetch one article and return (full text, list of 200-500 character chunks)."""
    news_p = []
    p1 = ''
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # Accumulate paragraph text until a chunk of 200-500 characters is collected.
    for p in soup.select('.u-mainText p'):
        p1 = p1 + p.text.strip().replace('\n', '')
        if 200 <= len(p1) <= 500:
            news_p.append(p1)
            p1 = ''
    # Full article text: all paragraphs joined with spaces.
    news_article = ' '.join([p.text.strip().replace('\n', '') for p in soup.select('.u-mainText p')])
    return news_article, news_p
def spider():
    # First list page has no suffix; later pages are node_8979_2.htm ... node_8979_10.htm.
    pages = ['', '_2', '_3', '_4', '_5',
             '_6', '_7', '_8', '_9', '_10']
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    for onepage in pages:
        url = "http://mil.gmw.cn/node_8979" + onepage + ".htm"
        print(url)
        result = session.get(url=url, headers=headers).content
        soup = BeautifulSoup(result, 'html.parser')
        if soup is None:
            break
        # The article list sits in the second <div> under .channelLeftPart.
        result_div = soup.find('div', attrs={'class': 'channelLeftPart'}).find_all('div')[1].find_all('ul', attrs={'class': 'channel-newsGroup'})
        result_replace = str(result_div).replace('\n', '').replace('\r', '').replace('\t', '')
        result_list = re.findall('<li>(.*?)</li>', result_replace)
        for i in result_list:
            news_url = 'http://mil.gmw.cn/' + re.findall('<a href="(.*?)" target=', i)[0]
            news_name = re.findall('target="_blank">(.*?)</a>', i)[0]
            print(news_name)
            news_article, news_p = getNewsDetail(news_url)
            # One row per paragraph chunk: [paragraph, article title].
            for p1 in news_p:
                if p1 != '':
                    csv_para.writerow([p1.replace("\u00a0", ""), news_name.replace("\u00a0", "")])
            # One row per article: [title, full text].
            if news_article != '':
                csv_article.writerow([news_name.replace("\u00a0", ""), news_article.replace("\u00a0", "")])
            time.sleep(3)  # pause between article requests
spider()
f.close()
fp.close()
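The paragraph CSV has no header row, so a quick way to sanity-check the output is to read it back with the standard csv module. A minimal sketch, assuming the '0905/0905段落.csv' path used above:

import csv

# Each row written by the scraper is [paragraph_chunk, article_title],
# where the chunk length should fall in the 200-500 character window.
rows = 0
with open('0905/0905段落.csv', encoding='utf-8', newline='') as checkfile:
    for paragraph, title in csv.reader(checkfile):
        rows += 1
        if not 200 <= len(paragraph) <= 500:
            print(f'unexpected chunk length {len(paragraph)} in "{title}"')
print(f'{rows} paragraph rows checked')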
Defense Science and Technology Information Network (國防科技資訊網, dsti.net)
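Scraper for the electronics list on 國防科技資訊網 (dsti.net). It walks pages 1-29 of /Information/HyeList/electron/ and writes the same two kinds of CSV output as the script above.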
import csv
import re
import string

import requests
import zhon.hanzi
from bs4 import BeautifulSoup

session = requests.session()

punce = string.punctuation        # ASCII punctuation (kept for later text cleaning, unused below)
puncz = zhon.hanzi.punctuation    # Chinese punctuation (kept for later text cleaning, unused below)

# Two outputs: full articles and 200-500 character paragraph chunks.
f = open('0906/0906electron原文.csv', 'w+', encoding='utf-8', newline='')
fp = open('0906/0906electron段落.csv', 'w+', encoding='utf-8', newline='')
csv_article = csv.writer(f)
csv_para = csv.writer(fp)
def getNewsDetail(newsurl):
    """Fetch one article and return (full text, list of 200-500 character chunks)."""
    news_p = []
    p1 = ''
    result = session.get(url=newsurl)
    soup = BeautifulSoup(result.text, 'html.parser')
    # Accumulate paragraph text until a chunk of 200-500 characters is collected.
    for p in soup.select('.newsContent p'):
        p1 = p1 + p.text.replace('\n', '')
        if 200 <= len(p1) <= 500:
            news_p.append(p1)
            p1 = ''
    # Full article text: all paragraphs joined with spaces.
    news_article = ' '.join([p.text.strip().replace('\n', '') for p in soup.select('.newsContent p')])
    return news_article, news_p
def spider():
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    for page in range(1, 30):
        url = "http://www.dsti.net/Information/HyeList/electron/" + str(page)
        print(url)
        result = session.get(url=url, headers=headers).content
        soup = BeautifulSoup(result, 'html.parser', from_encoding="gb18030")
        if soup is None:
            break
        # The article list is the <ul> inside the .listMidContent div.
        result_div = soup.find('div', attrs={'class': 'listMidContent'}).find('ul')
        result_replace = str(result_div).replace('\n', '').replace('\r', '').replace('\t', '')
        result_list = re.findall('<li><h1>.(.*?)</h1>', result_replace)
        for i in result_list:
            news_url = 'http://www.dsti.net/' + re.findall('href="(.*?)" target="_blank">', i)[0]
            news_name = re.findall('target="_blank">(.*?)</a>', i)[0]
            news_article, news_p = getNewsDetail(news_url)
            # One row per paragraph chunk: [paragraph, article title].
            for p1 in news_p:
                if p1 != '':
                    csv_para.writerow([p1.replace("\u00a0", ""), news_name.replace("\u00a0", "")])
            # One row per article: [title, full text].
            if news_article != '':
                csv_article.writerow([news_name.replace("\u00a0", ""), news_article.replace("\u00a0", "")])
spider()
f.close()
fp.close()
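Neither scraper sets a request timeout or retries, so a single hung connection can stall an entire run. A minimal hardening sketch using requests' standard HTTPAdapter/Retry machinery (the retry counts and timeout below are illustrative, not taken from the original scripts):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Shared session that retries transient failures with exponential back-off.
session = requests.session()
retry = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retry))
session.mount('https://', HTTPAdapter(max_retries=retry))

# A timeout still has to be passed per call, e.g. in spider()/getNewsDetail():
# session.get(url=url, headers=headers, timeout=10)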