爬取幣世界標紅快訊內容(移動版)
# 引入依賴
from lxml import etree
import requests
import pymongo
import time
# MongoDB connection. Replace the placeholder host with your own database
# address; MongoDB has to be installed separately.
client = pymongo.MongoClient("寫你自己的資料庫地址", 27017)
mydb = client["mydb"]
information = mydb["information"]  # collection that receives the crawled items

# Hour-resolution stamp (MMDDHH). It is appended to the crawl URL as the `fm`
# query parameter and written as the prefix of the local state file.
currentTime = time.strftime("%m%d%H", time.localtime())
# Timestamp stored in the `date` field of every saved document.
saveTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

# Request headers that present the crawler as a mobile browser (iPhone Safari),
# since the mobile version of the site is the one being crawled.
header = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1",
}
def get_url(url):
    """Fetch one flash-news page and store its highlighted items in MongoDB.

    The page is requested with the mobile ``header``. Every
    ``section[@class="focus"]`` under ``div#kuaixun_list`` is treated as one
    item. The id of the first item on the page is written, together with
    ``currentTime``, to the local state file. Each item whose id is greater
    than that id minus 20 is then inserted into the ``information``
    collection.

    Args:
        url: Address of the page to crawl.
    """
    # A timeout keeps the crawler from hanging forever on a stalled connection.
    html = requests.get(url, headers=header, timeout=10)
    selector = etree.HTML(html.text)
    infos = selector.xpath(
        '//div[@id="kuaixun_list"]/div/article/section[@class="focus"]'
    )
    if not infos:
        # Nothing matched (empty page or changed markup): leave the state
        # file untouched instead of failing on infos[0].
        print("未找到標紅快訊,略過本次爬取!")
        return

    # The id lives on the parent <article> of the first highlighted section.
    saveId = infos[0].xpath("../@id")[0]

    # Persist "<MMDDHH> <id>" so the next run can compare the hour stamp.
    # Replace the path with your own file location.
    with open(
        r"C:/Users/SCZ/PycharmProjects/CommunityCrawl/newest", "w"
    ) as state_file:
        state_file.write(currentTime + " " + saveId)

    # NOTE(review): the threshold is derived from the first item of this very
    # page, so items stored by an earlier run can be inserted again. Confirm
    # whether comparing against the previously saved id was intended.
    threshold = int(saveId) - 20

    for info in infos:
        try:
            title = info.xpath('h3[@class="text_title"]/text()')[0].strip()
            content = info.xpath('p[@class="text_show"]/text()')[0].strip()
            # The value is not stored (saveTime is used for `date`), but the
            # lookup is kept so items without a time node are still skipped.
            info.xpath('../h3[@class="timenode"]/text()')[0]
            infoId = info.xpath("../@id")[0]
        except IndexError:
            # Best effort: skip items that lack any of the expected nodes.
            continue

        data = {
            "title": title,
            "id": infoId,
            "date": saveTime,
            "content": content,
            "source": "bishijie",
        }
        print(data)
        if int(infoId) > threshold:
            print("插入了一條新資料!")
            information.insert_one(data)
        else:
            print("無新資料產生!")
if __name__ == "__main__":
    # Read the state left by the previous run ("<MMDDHH> <id>").
    # Replace the path with your own file location.
    try:
        with open("C:/Users/SCZ/PycharmProjects/CommunityCrawl/newest", "r") as fs:
            line = fs.read()
    except FileNotFoundError:
        # First run: the state file is only created by get_url(), so treat
        # a missing file as empty instead of crashing.
        line = ""

    # The first six characters are the MMDDHH stamp of the previous crawl.
    fileDate = line[0:6]
    if fileDate != currentTime:
        print("時間不一致,當機使用當前系統時間進行爬取!")
    else:
        print("時間一致, 正常執行!")

    # Both cases crawl the page for the current hour; only the message differs.
    urls = ["http://m.bishijie.com/kuaixun?fm=" + currentTime]
    for url in urls:
        get_url(url)
        time.sleep(2)  # pause between requests
主要要求掌握的內容:XPath 語法、Python 檔案操作、Python 基礎語法。
本文內容比較基礎,寫得不好,請多多指教!大家一起進步!
我的其他關於python的文章
Python爬蟲入門
Python爬蟲之使用MongoDB儲存資料