Gitee link:
https://gitee.com/zxbaixuexi/2024scrapy/tree/master/第三次實驗
Assignment ①:
1)
Requirement: Specify a website and crawl all of the images on that website, for example the China Weather Network (http://www.weather.com.cn). Use the Scrapy framework to implement the crawl in both single-threaded and multi-threaded modes.
Code:
weatherspider.py
import scrapy
from ..items import WeatherItem


class WeatherspiderSpider(scrapy.Spider):
    name = "weatherspider"
    allowed_domains = ["weather.com.cn", "pi.weather.com.cn"]
    start_urls = ["http://www.weather.com.cn/"]

    def parse(self, response):
        # Collect the src attribute of every <img> tag on the page
        img_address = response.xpath("//img/@src").getall()
        item = WeatherItem()
        item['image_urls'] = img_address
        yield item
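One optional refinement, sketched here only as a drop-in replacement for the parse method above and not part of the original code: some src attributes may be relative paths, so they can be resolved into absolute URLs with response.urljoin before being handed to the pipeline.
    def parse(self, response):
        # Same as above, but resolve relative src values against the page URL
        srcs = response.xpath("//img/@src").getall()
        item = WeatherItem()
        item['image_urls'] = [response.urljoin(src) for src in srcs]
        yield item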
items.py
import scrapy


class WeatherItem(scrapy.Item):
    image_urls = scrapy.Field()    # list of image URLs to download
    images = scrapy.Field()        # filled by ImagesPipeline with download results
    image_paths = scrapy.Field()   # local paths of successfully downloaded images
pipelines.py
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class WeatherPipeline(ImagesPipeline):
    # Browser-like request headers; the referer is overwritten per image below
    default_headers = {
        'accept': 'image/webp,image/*,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'cookie': 'bid=yQdC/AzTaCw',
        'referer': 'https://www.douban.com/photos/photo/2370443040/',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def get_media_requests(self, item, info):
        # Issue one download request per image URL collected by the spider
        for image_url in item['image_urls']:
            self.default_headers['referer'] = image_url
            yield Request(image_url, headers=self.default_headers)

    def item_completed(self, results, item, info):
        # Keep only the paths of images that were downloaded successfully
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
settings.py
ITEM_PIPELINES = {
    "weather.pipelines.WeatherPipeline": 300,
}
IMAGES_STORE = r'C:\Users\supermejane\Desktop\爬蟲實踐\第三次實驗\pythonProject1\weather\images'
# Expiry in days: images fetched within the last 90 days are not re-downloaded
IMAGES_EXPIRES = 90
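The requirement also asks for single-threaded and multi-threaded variants of the crawl. Scrapy is event-driven rather than literally multi-threaded, so in practice this comes down to how many requests it may have in flight at once, which can be switched in settings.py with Scrapy's standard concurrency settings. A minimal sketch; the values below are illustrative and not taken from the original project:
# "Single-threaded" crawl: one request in flight at a time
# CONCURRENT_REQUESTS = 1
# CONCURRENT_REQUESTS_PER_DOMAIN = 1

# "Multi-threaded" crawl: allow several concurrent downloads
CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 8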
Run results
2) Reflections
This assignment mainly practiced downloading images from a specified domain with Scrapy. The images could also be fetched with a plain request and then written to disk with "with open(...) as f"; here Scrapy's built-in ImagesPipeline is used instead. Note that the get_media_requests and item_completed methods have to be overridden, the custom ImagesPipeline has to be enabled in settings.py, and IMAGES_STORE has to be set as the image save path. In addition, allowed_domains in weatherspider.py must be configured correctly.
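For comparison, a minimal sketch of the manual approach mentioned above (a plain HTTP request plus "with open(...) as f"), using the third-party requests library; the file-naming logic and the URL in the usage comment are illustrative assumptions:
import os
import requests

def download_image(url, save_dir="images"):
    # Fetch the image bytes and write them to disk with a context manager
    os.makedirs(save_dir, exist_ok=True)
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    filename = os.path.join(save_dir, url.split("/")[-1] or "image.jpg")
    with open(filename, "wb") as f:
        f.write(resp.content)
    return filename

# Example usage (illustrative URL):
# download_image("http://www.weather.com.cn/example.jpg")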
Assignment ②:
1)
Requirement: Master the serialized output of Item and Pipeline data in Scrapy; use the Scrapy + XPath + MySQL database storage technical route to crawl stock-related information. The site crawled this time is Eastmoney: https://www.eastmoney.com/
Code
stockspider.py
import scrapy
from scrapy.http import Request
import json
from ..items import StockItem


class StockspiderSpider(scrapy.Spider):
    name = "stockspider"
    allowed_domains = ["eastmoney.com"]
    start_urls = [
        "http://38.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112406848566904145428_1697696179672&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1697696179673"]

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url)

    def parse(self, response):
        '''
        Field mapping of the Eastmoney API:
        serial number; stock code: f12; stock name: f14; latest price: f2;
        change percent: f3; change amount: f4; volume: f5; turnover: f6;
        amplitude: f7; high: f15; low: f16; today's open: f17; previous close: f18
        '''
        # The response is JSONP: strip the jQuery callback wrapper to get the JSON body
        start_index = response.text.find('(') + 1
        end_index = response.text.rfind(')')
        json_data = response.text[start_index:end_index]
        json_obj = json.loads(json_data)
        # The list of stock records sits under data -> diff
        data = json_obj['data']['diff']
        goods_list = []
        name = ['f12', 'f14', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f15', 'f16', 'f17', 'f18']
        count = 0
        for li in data:
            values = [count]
            for n in name:
                values.append(li[n])
            count += 1
            goods_list.append(values)
        for k in goods_list:
            # e.g. [1, '301348', '藍箭電子', 50.53, 20.0, 8.42, 172116, 815409272.27, 22.56, 50.53, 41.03, 41.04, 42.11]
            stock = StockItem()
            stock['id'] = str(k[0])
            stock['number'] = str(k[1])
            stock['name'] = str(k[2])
            stock['new_price'] = str(k[3])
            stock['up_down_precent'] = str(k[4])
            stock['up_down_num'] = str(k[5])
            stock['turnover'] = str(k[6])
            stock['Transaction_volume'] = str(k[7])
            stock['vibration'] = str(k[8])
            stock['maxx'] = str(k[9])
            stock['minn'] = str(k[10])
            stock['today'] = str(k[11])
            stock['yesterday'] = str(k[12])
            yield stock
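The jQuery callback wrapper is stripped above with find('(') and rfind(')'). An equivalent approach, shown here only as a sketch, is a regular expression, which also copes with a trailing semicolon and with callback names that change between requests:
import re

def strip_jsonp(text):
    # Capture everything between the outermost parentheses of a JSONP response
    match = re.search(r'^\s*[\w.$]+\s*\((.*)\)\s*;?\s*$', text, re.S)
    return match.group(1) if match else text

# json_obj = json.loads(strip_jsonp(response.text))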
items.py
import scrapy


class StockItem(scrapy.Item):
    # define the fields for your item here like:
    id = scrapy.Field()                  # serial number
    number = scrapy.Field()              # stock code (f12)
    name = scrapy.Field()                # stock name (f14)
    new_price = scrapy.Field()           # latest price (f2)
    up_down_precent = scrapy.Field()     # change percent (f3)
    up_down_num = scrapy.Field()         # change amount (f4)
    turnover = scrapy.Field()            # volume (f5)
    Transaction_volume = scrapy.Field()  # turnover (f6)
    vibration = scrapy.Field()           # amplitude (f7)
    maxx = scrapy.Field()                # daily high (f15)
    minn = scrapy.Field()                # daily low (f16)
    today = scrapy.Field()               # today's open (f17)
    yesterday = scrapy.Field()           # previous close (f18)
pipelines.py
import pymysql


class StockPipeline:
    def open_spider(self, spider):
        # Open the MySQL connection once when the spider starts
        self.client = pymysql.connect(host="localhost", port=3306, user="root", password="123456",
                                      db="homework1", charset="utf8")
        self.cursor = self.client.cursor()

    def process_item(self, item, spider):
        args = [
            item.get("id"),
            item.get("number"),
            item.get("name"),
            item.get("new_price"),
            item.get("up_down_precent"),
            item.get("up_down_num"),
            item.get("turnover"),
            item.get("Transaction_volume"),
            item.get("vibration"),
            item.get("maxx"),
            item.get("minn"),
            item.get("today"),
            item.get("yesterday"),
        ]
        sql = "insert into stock values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        self.cursor.execute(sql, args)
        self.client.commit()
        return item

    def close_spider(self, spider):
        # Close the cursor before the connection
        self.cursor.close()
        self.client.close()
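The pipeline above inserts into a stock table that must already exist in the homework1 database. The original table definition is not shown, so the following is only a sketch of creating an equivalent table with pymysql; the column names mirror the item fields, and the all-VARCHAR types are an assumption matching the str() conversion done in the spider:
import pymysql

ddl = """
CREATE TABLE IF NOT EXISTS stock (
    id VARCHAR(16), number VARCHAR(16), name VARCHAR(64),
    new_price VARCHAR(32), up_down_precent VARCHAR(32), up_down_num VARCHAR(32),
    turnover VARCHAR(32), Transaction_volume VARCHAR(32), vibration VARCHAR(32),
    maxx VARCHAR(32), minn VARCHAR(32), today VARCHAR(32), yesterday VARCHAR(32)
)
"""

client = pymysql.connect(host="localhost", port=3306, user="root",
                         password="123456", db="homework1", charset="utf8")
with client.cursor() as cursor:
    cursor.execute(ddl)
client.commit()
client.close()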
settings.py
ITEM_PIPELINES = {
    "stock.pipelines.StockPipeline": 300,
}
COOKIES_ENABLED = False
ROBOTSTXT_OBEY = False
Run results
2) Reflections
This assignment mainly practiced using the Scrapy framework to request an API, fetch the raw data (in JSON format) and parse it. Note that both COOKIES_ENABLED = False and ROBOTSTXT_OBEY = False have to be set in settings.py; otherwise the crawl fails with [scrapy.downloadermiddlewares.robotstxt] DEBUG: Forbidden by robots.txt
Assignment ③:
1)
Requirement: Master the serialized output of Item and Pipeline data in Scrapy;
use the Scrapy framework + XPath + MySQL database storage technical route to crawl data from a foreign exchange website.
Code
bankspider.py
import scrapy
from ..items import BankItem


class BankspiderSpider(scrapy.Spider):
    name = "bankspider"
    allowed_domains = ["boc.cn"]
    start_urls = ["https://www.boc.cn/sourcedb/whpj/"]

    def parse(self, response):
        # Select every <tr> element except the header row
        rows = response.xpath("//tr[position()>1]")
        '''
        Column order of the BOC exchange-rate table:
        currency name | spot-exchange buying rate | cash buying rate |
        spot-exchange selling rate | cash selling rate | BOC conversion rate |
        release date | release time
        '''
        for row in rows:
            # Take the text of each <td> in the current row
            currencyname = row.xpath("./td[1]//text()").get()
            hui_in = row.xpath("./td[2]//text()").get()
            chao_in = row.xpath("./td[3]//text()").get()
            hui_out = row.xpath("./td[4]//text()").get()
            chao_out = row.xpath("./td[5]//text()").get()
            zhonghang = row.xpath("./td[6]//text()").get()
            date = row.xpath("./td[7]//text()").get()
            time = row.xpath("./td[8]//text()").get()
            print(currencyname)
            print(hui_in)
            print(chao_in)
            print(hui_out)
            print(chao_out)
            print(zhonghang)
            print(date)
            print(time)

            currency = BankItem()
            currency['currencyname'] = str(currencyname)
            currency['hui_in'] = str(hui_in)
            currency['chao_in'] = str(chao_in)
            currency['hui_out'] = str(hui_out)
            currency['chao_out'] = str(chao_out)
            currency['zhonghang'] = str(zhonghang)
            currency['date'] = str(date)
            currency['time'] = str(time)
            yield currency
items.py
import scrapy


class BankItem(scrapy.Item):
    currencyname = scrapy.Field()  # currency name
    hui_in = scrapy.Field()        # spot-exchange buying rate
    chao_in = scrapy.Field()       # cash buying rate
    hui_out = scrapy.Field()       # spot-exchange selling rate
    chao_out = scrapy.Field()      # cash selling rate
    zhonghang = scrapy.Field()     # BOC conversion rate
    date = scrapy.Field()          # release date
    time = scrapy.Field()          # release time
pipelines.py
import pymysql


class BankPipeline:
    def open_spider(self, spider):
        # Open the MySQL connection once when the spider starts
        self.client = pymysql.connect(host="localhost", port=3306, user="root", password="123456",
                                      db="homework1", charset="utf8")
        self.cursor = self.client.cursor()

    def process_item(self, item, spider):
        args = [
            item.get("currencyname"),
            item.get("hui_in"),
            item.get("chao_in"),
            item.get("hui_out"),
            item.get("chao_out"),
            item.get("zhonghang"),
            item.get("date"),
            item.get("time"),
        ]
        sql = "insert into currency(currencyname,hui_in,chao_in,hui_out,chao_out,zhonghang,date,time) values(%s,%s,%s,%s,%s,%s,%s,%s)"
        self.cursor.execute(sql, args)
        self.client.commit()
        return item

    def close_spider(self, spider):
        # Close the cursor before the connection
        self.cursor.close()
        self.client.close()
settings.py
ITEM_PIPELINES = {
    "bank.pipelines.BankPipeline": 300,
}
Run results
2) Reflections
This assignment practiced crawling and parsing web page data with the Scrapy framework, overriding the pipeline's process_item to process the data and store it in MySQL, and opening and closing the database connection by implementing the built-in open_spider and close_spider hooks. The library used is pymysql.
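As a usage note, each of the three spiders can be launched either with "scrapy crawl <name>" from its project directory or from a small Python script via Scrapy's command-line helper. A sketch for the exchange-rate spider, assuming the script sits next to scrapy.cfg so the project settings, pipelines and MySQL configuration are picked up:
from scrapy.cmdline import execute

# Equivalent to running "scrapy crawl bankspider" in a terminal
execute(["scrapy", "crawl", "bankspider"])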