Assignment 1
Requirement: Pick a website and crawl all of the images on it, for example the China Meteorological Administration site (http://www.weather.com.cn). Implement the crawl with the Scrapy framework in both a single-threaded and a multi-threaded (concurrent) mode.
- Be sure to cap the total number of downloaded images (the last two digits of the student ID) as the measure that limits the crawl.
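The download cap and the switch between the two modes are normally expressed in the project's settings.py. The sketch below is illustrative only; the concrete values (including reading 31 from the student number in the Gitee link) are assumptions rather than the repository's actual configuration.

```python
# demo1/settings.py -- illustrative values, not the repository's actual settings
BOT_NAME = "demo1"

# Single-threaded run: keep one request in flight at a time.
# For the concurrent ("multi-threaded") run, raise these limits (e.g. to 16).
CONCURRENT_REQUESTS = 1
CONCURRENT_REQUESTS_PER_DOMAIN = 1

# Close the spider after this many scraped items, i.e. the required download cap
# (31 stands in for the last two digits of student number 102202131).
CLOSESPIDER_ITEMCOUNT = 31
```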
| Gitee link |
| --- |
| https://gitee.com/xiaoaibit/102202131_LX/tree/master/homework3/demo1 |
Single-threaded spider code

import requests
import scrapy
from demo1.items import ImgItem


class FZuSpider(scrapy.Spider):
    name = "FZu"
    allowed_domains = ["news.fzu.edu.cn"]
    start_urls = ["https://news.fzu.edu.cn/fdyw.htm"]

    def parse(self, response):
        # Collect every image URL on the current page.
        imgs = response.xpath("//img/@src").extract()
        for img in imgs:
            item = ImgItem()
            item['img_url'] = response.urljoin(img).split('?')[0]
            # Fetch the image synchronously with requests; this blocking call is
            # what keeps the spider effectively single-threaded.
            item['img'] = requests.get(response.urljoin(img)).content
            yield item
        # Follow the link right after the current page number to reach the next page.
        part_url = response.xpath("//span[@class = 'p_no_d']/following-sibling::*[position() = 1]/a/@href").extract_first()
        next_url = response.urljoin(part_url)
        yield scrapy.Request(url=next_url, callback=self.parse)
Multi-threaded spider code

import requests
import scrapy
from demo1.items import ImgItem


# Six spiders, each starting from a different page of the news list and stopping
# at a different page number, so the pagination range is split between crawls
# that can be run at the same time.
class FZuSpider1(scrapy.Spider):
    name = "FZu1"
    allowed_domains = ["news.fzu.edu.cn"]
    start_urls = ["https://news.fzu.edu.cn/fdyw.htm"]

    def parse(self, response):
        imgs = response.xpath("//img/@src").extract()
        for img in imgs:
            item = ImgItem()
            item['img_url'] = response.urljoin(img).split('?')[0]
            item['img'] = requests.get(response.urljoin(img)).content
            yield item
        part_url = response.xpath("//span[@class = 'p_no_d']/following-sibling::*[position() = 1]/a/@href").extract_first()
        next_url = response.urljoin(part_url)
        print(response.xpath("//span[@class = 'p_no_d']/text()").extract_first())
        # Stop following the pagination once the current page number reaches 10.
        if int(response.xpath("//span[@class = 'p_no_d']/text()").extract_first()) != 10:
            yield scrapy.Request(url=next_url, callback=self.parse)


class FZuSpider2(scrapy.Spider):
    name = "FZu2"
    allowed_domains = ["news.fzu.edu.cn"]
    start_urls = ["https://news.fzu.edu.cn/fdyw/711.htm"]

    def parse(self, response):
        imgs = response.xpath("//img/@src").extract()
        for img in imgs:
            item = ImgItem()
            item['img_url'] = response.urljoin(img)
            item['img'] = requests.get(response.urljoin(img)).content
            yield item
        part_url = response.xpath("//span[@class = 'p_no_d']/following-sibling::*[position() = 1]/a/@href").extract_first()
        next_url = response.urljoin(part_url)
        print(response.xpath("//span[@class = 'p_no_d']/text()").extract_first())
        if int(response.xpath("//span[@class = 'p_no_d']/text()").extract_first()) != 20:
            yield scrapy.Request(url=next_url, callback=self.parse)


class FZuSpider3(scrapy.Spider):
    name = "FZu3"
    allowed_domains = ["news.fzu.edu.cn"]
    start_urls = ["https://news.fzu.edu.cn/fdyw/701.htm"]

    def parse(self, response):
        imgs = response.xpath("//img/@src").extract()
        for img in imgs:
            item = ImgItem()
            item['img_url'] = response.urljoin(img)
            item['img'] = requests.get(response.urljoin(img)).content
            yield item
        part_url = response.xpath("//span[@class = 'p_no_d']/following-sibling::*[position() = 1]/a/@href").extract_first()
        next_url = response.urljoin(part_url)
        print(response.xpath("//span[@class = 'p_no_d']/text()").extract_first())
        if int(response.xpath("//span[@class = 'p_no_d']/text()").extract_first()) != 30:
            yield scrapy.Request(url=next_url, callback=self.parse)


class FZuSpider4(scrapy.Spider):
    name = "FZu4"
    allowed_domains = ["news.fzu.edu.cn"]
    start_urls = ["https://news.fzu.edu.cn/fdyw/691.htm"]

    def parse(self, response):
        imgs = response.xpath("//img/@src").extract()
        for img in imgs:
            item = ImgItem()
            item['img_url'] = response.urljoin(img)
            item['img'] = requests.get(response.urljoin(img)).content
            yield item
        part_url = response.xpath("//span[@class = 'p_no_d']/following-sibling::*[position() = 1]/a/@href").extract_first()
        next_url = response.urljoin(part_url)
        print(response.xpath("//span[@class = 'p_no_d']/text()").extract_first())
        if int(response.xpath("//span[@class = 'p_no_d']/text()").extract_first()) != 40:
            yield scrapy.Request(url=next_url, callback=self.parse)


class FZuSpider5(scrapy.Spider):
    name = "FZu5"
    allowed_domains = ["news.fzu.edu.cn"]
    start_urls = ["https://news.fzu.edu.cn/fdyw/681.htm"]

    def parse(self, response):
        imgs = response.xpath("//img/@src").extract()
        for img in imgs:
            item = ImgItem()
            item['img_url'] = response.urljoin(img)
            item['img'] = requests.get(response.urljoin(img)).content
            yield item
        part_url = response.xpath("//span[@class = 'p_no_d']/following-sibling::*[position() = 1]/a/@href").extract_first()
        next_url = response.urljoin(part_url)
        print(response.xpath("//span[@class = 'p_no_d']/text()").extract_first())
        if int(response.xpath("//span[@class = 'p_no_d']/text()").extract_first()) != 50:
            yield scrapy.Request(url=next_url, callback=self.parse)


class FZuSpider6(scrapy.Spider):
    name = "FZu6"
    allowed_domains = ["news.fzu.edu.cn"]
    start_urls = ["https://news.fzu.edu.cn/fdyw/671.htm"]

    def parse(self, response):
        imgs = response.xpath("//img/@src").extract()
        for img in imgs:
            item = ImgItem()
            item['img_url'] = response.urljoin(img)
            item['img'] = requests.get(response.urljoin(img)).content
            yield item
        part_url = response.xpath("//span[@class = 'p_no_d']/following-sibling::*[position() = 1]/a/@href").extract_first()
        next_url = response.urljoin(part_url)
        print(response.xpath("//span[@class = 'p_no_d']/text()").extract_first())
        if int(response.xpath("//span[@class = 'p_no_d']/text()").extract_first()) != 60:
            yield scrapy.Request(url=next_url, callback=self.parse)
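The report does not show how the six spiders are launched together. One way to run them concurrently from a single script is Scrapy's CrawlerProcess; the sketch below is an assumption about how that could look (the script name is hypothetical), and running six separate `scrapy crawl` commands would work as well.

```python
# run_all.py -- assumed helper script, run from the demo1 project directory
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
# Schedule all six spiders by name; they are driven concurrently by the same
# reactor once start() is called.
for name in ("FZu1", "FZu2", "FZu3", "FZu4", "FZu5", "FZu6"):
    process.crawl(name)
process.start()
```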
Items code
import scrapy


class ImgItem(scrapy.Item):
    img_url = scrapy.Field()
    img = scrapy.Field()
Pipelines code

import os
from urllib.parse import urlparse, unquote


class Demo1Pipeline:
    def process_item(self, item, spider):
        img_url = item['img_url']
        img = item['img']
        # Parse the URL and derive a clean file name (query parameters removed).
        parsed_url = urlparse(img_url)
        path = unquote(parsed_url.path)  # decode the path in case it contains escaped characters
        filename = os.path.basename(path)  # file name portion of the path
        # Make sure the images directory exists.
        img_dir = 'images'
        os.makedirs(img_dir, exist_ok=True)
        # Build the full file path and write the image bytes to disk.
        file_path = os.path.join(img_dir, filename)
        try:
            with open(file_path, 'wb') as f:
                f.write(img)
        except Exception as e:
            print(f"Could not create file {file_path}: {e}")
        return item
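The pipeline only runs once it is enabled in the project settings; a minimal sketch, assuming the default demo1 module layout:

```python
# demo1/settings.py -- enable the image-saving pipeline (the priority 300 is arbitrary)
ITEM_PIPELINES = {
    "demo1.pipelines.Demo1Pipeline": 300,
}
```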
Results
Reflections
Efficiency
Multi-threaded crawling can significantly improve crawling efficiency, especially when a large number of images has to be fetched: concurrent requests reduce the time spent waiting on the network and speed up the overall crawl.
Resource consumption
Multi-threaded crawling also consumes more system resources such as CPU and memory. When resources are limited, the number of concurrent requests needs to be set sensibly to avoid overloading the system.
Anti-crawling mechanisms
Many websites have anti-crawling mechanisms, and too many requests may trigger them and get the crawler's IP banned. The crawler therefore needs reasonable request intervals and concurrency limits, for example via the settings sketched below.
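A minimal sketch of the kind of throttling settings this refers to; the concrete values are illustrative assumptions, not taken from the repository:

```python
# settings.py -- illustrative politeness settings
ROBOTSTXT_OBEY = True            # respect robots.txt (see the compliance note below)
DOWNLOAD_DELAY = 1.0             # wait roughly a second between requests to the same site
CONCURRENT_REQUESTS_PER_DOMAIN = 4

# Let Scrapy adapt the delay to how quickly the server responds.
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1.0
AUTOTHROTTLE_MAX_DELAY = 10.0
```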
Error handling
Various errors can occur during a crawl, such as network failures and 404 responses. Handling them properly, for example with a request errback as sketched below, improves the crawler's stability and reliability.
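A minimal sketch of what request-level error handling can look like in Scrapy; the spider name, the on_error method, and the log messages are hypothetical and only illustrate the errback pattern:

```python
import scrapy
from scrapy.spidermiddlewares.httperror import HttpError


class RobustSpider(scrapy.Spider):
    """Hypothetical spider that only illustrates the errback pattern."""
    name = "robust_example"
    start_urls = ["https://news.fzu.edu.cn/fdyw.htm"]

    def parse(self, response):
        for href in response.xpath("//a/@href").extract():
            # errback is invoked for download problems (DNS errors, timeouts)
            # and for HTTP error statuses raised by HttpErrorMiddleware,
            # so one bad page does not abort the whole crawl.
            yield scrapy.Request(response.urljoin(href), callback=self.parse, errback=self.on_error)

    def on_error(self, failure):
        if failure.check(HttpError):
            self.logger.warning("Bad HTTP status for %s", failure.value.response.url)
        else:
            self.logger.warning("Request failed: %r", failure)
```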
Legal compliance
When crawling a website, the relevant laws and regulations must be respected: honour the site's robots.txt and avoid putting excessive load on the server.
Assignment 2
Requirement: Become familiar with serializing and outputting data through Scrapy's Item and Pipeline classes; use the Scrapy framework + XPath + MySQL storage to crawl stock information. Candidate site: East Money, https://www.eastmoney.com/. Output: the data is stored in and output from a MySQL database, with English column names defined by the student, e.g. id for the sequence number, bStockNo for the stock code, and so on.
| Gitee link |
| --- |
| https://gitee.com/xiaoaibit/102202131_LX/tree/master/homework3/demo2 |
Spider code

import scrapy
from demo2.items import StockItem


class StockSpider(scrapy.Spider):
    name = "stock"
    allowed_domains = ["www.eastmoney.com"]
    start_urls = ["https://quote.eastmoney.com/center/gridlist.html#hs_a_board"]

    def parse(self, response):
        # Each table row of the grid (rendered by the Selenium middleware below) is one stock.
        stocks = response.xpath("//tbody//tr")
        for stock in stocks:
            item = StockItem()
            item['id'] = stock.xpath('.//td[position() = 1]//text()').extract_first()
            item['code'] = stock.xpath('.//td[position() = 2]//text()').extract_first()
            item['name'] = stock.xpath('.//td[position() = 3]//text()').extract_first()
            item['newPrice'] = stock.xpath('.//td[position() = 5]//text()').extract_first()
            item['price_change_amplitude'] = stock.xpath('.//td[position() = 6]//text()').extract_first()
            item['price_change_Lines'] = stock.xpath('.//td[position() = 7]//text()').extract_first()
            item['volume'] = stock.xpath('.//td[position() = 8]//text()').extract_first()
            item['turnover'] = stock.xpath('.//td[position() = 9]//text()').extract_first()
            item['amplitude'] = stock.xpath('.//td[position() = 10]//text()').extract_first()
            item['highest'] = stock.xpath('.//td[position() = 11]//text()').extract_first()
            item['lowest'] = stock.xpath('.//td[position() = 12]//text()').extract_first()
            item['today'] = stock.xpath('.//td[position() = 13]//text()').extract_first()
            item['yesterday'] = stock.xpath('.//td[position() = 14]//text()').extract_first()
            yield item
Items code
import scrapy


class StockItem(scrapy.Item):
    id = scrapy.Field()
    code = scrapy.Field()
    name = scrapy.Field()
    newPrice = scrapy.Field()
    price_change_amplitude = scrapy.Field()
    price_change_Lines = scrapy.Field()
    volume = scrapy.Field()
    turnover = scrapy.Field()
    amplitude = scrapy.Field()
    highest = scrapy.Field()
    lowest = scrapy.Field()
    today = scrapy.Field()
    yesterday = scrapy.Field()
Using Selenium as a downloader middleware

import time

from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumMiddleware:
    def process_request(self, request, spider):
        url = request.url
        # Render the page in Edge so the JavaScript-populated stock table is present.
        driver = webdriver.Edge()
        driver.get(url)
        time.sleep(3)
        data = driver.page_source
        driver.close()
        # Hand the rendered HTML back to Scrapy instead of downloading the URL again.
        return HtmlResponse(url=url, body=data.encode('utf-8'), encoding='utf-8', request=request)
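The middleware only takes effect after it is registered in the project settings; a minimal sketch, assuming the class lives in demo2/middlewares.py (the module path and the priority value are assumptions):

```python
# demo2/settings.py -- route requests through the Selenium middleware
DOWNLOADER_MIDDLEWARES = {
    "demo2.middlewares.SeleniumMiddleware": 543,
}
```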
Pipelines code

import pymysql

host = '127.0.0.1'
port = 3306
user = 'root'
password = 'yabdylm'
database = 'pycharm'


class Demo2Pipeline:
    def __init__(self):
        self.con = pymysql.connect(host=host, port=port, user=user, password=password,
                                   database=database, charset='utf8mb4')
        self.cursor = self.con.cursor()
        # Create the target table on first use if it does not exist yet.
        self.cursor.execute(
            "CREATE TABLE IF NOT EXISTS stockData ("
            "id INTEGER, code VARCHAR(255), name VARCHAR(255), newPrice VARCHAR(255), "
            "price_change_amplitude VARCHAR(255), price_change_Lines VARCHAR(255), "
            "volume VARCHAR(255), turnover VARCHAR(255), amplitude VARCHAR(255), "
            "highest VARCHAR(255), lowest VARCHAR(255), today VARCHAR(255), yesterday VARCHAR(255));"
        )

    def process_item(self, item, spider):
        try:
            id = item['id']
            code = item['code']
            name = item['name']
            newPrice = item['newPrice']
            price_change_amplitude = item['price_change_amplitude']
            price_change_Lines = item['price_change_Lines']
            volume = item['volume']
            turnover = item['turnover']
            amplitude = item['amplitude']
            highest = item['highest']
            lowest = item['lowest']
            today = item['today']
            yesterday = item['yesterday']
            # Insert the row and commit the transaction.
            self.cursor.execute(
                "INSERT INTO stockData VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                (id, code, name, newPrice, price_change_amplitude, price_change_Lines,
                 volume, turnover, amplitude, highest, lowest, today, yesterday))
            self.con.commit()
        except Exception as e:
            print(f"An error occurred: {e}")
        return item

    def __del__(self):
        self.con.close()
Results
Reflections
Project structure and code organization:
Before writing the spider, it pays to plan the project structure and code organization: how the Item, Pipeline, and Spider components are defined and how their code files are arranged. Good organization improves readability and maintainability.
Item definition:
When defining an Item, the fields must be designed around the data structure of the target site. This requires a clear understanding of that structure and some foresight about fields that may be needed later; for East Money this means fields such as the stock code, name, price, and trading volume.
Assignment 3
Requirement: Become familiar with serializing and outputting data through Scrapy's Item and Pipeline classes; use the Scrapy framework + XPath + MySQL storage to crawl foreign-exchange data. Candidate site: Bank of China exchange rates, https://www.boc.cn/sourcedb/whpj/
| Gitee link |
| --- |
| https://gitee.com/xiaoaibit/102202131_LX/tree/master/homework3/demo3 |
Spider code

import scrapy
from demo3.items import BankItem


class BankSpider(scrapy.Spider):
    name = "bank"
    allowed_domains = ["www.boc.cn"]
    start_urls = ["https://www.boc.cn/sourcedb/whpj/"]

    def parse(self, response):
        banks = response.xpath('//tbody[position() = 1]/tr')
        # Skip the header rows at the top and the footer rows at the bottom of the table.
        for i in range(2, len(banks) - 2):
            bank = banks[i]
            item = BankItem()
            item['Currency'] = bank.xpath(".//td[position() = 1]//text()").extract_first()
            item['TBP'] = bank.xpath(".//td[position() = 2]//text()").extract_first()
            item['CBP'] = bank.xpath(".//td[position() = 3]//text()").extract_first()
            item['TSP'] = bank.xpath(".//td[position() = 4]//text()").extract_first()
            item['CSP'] = bank.xpath(".//td[position() = 5]//text()").extract_first()
            item['Time'] = bank.xpath(".//td[position() = 8]//text()").extract_first()
            yield item
Items code
import scrapy


class BankItem(scrapy.Item):
    Currency = scrapy.Field()
    TBP = scrapy.Field()
    CBP = scrapy.Field()
    TSP = scrapy.Field()
    CSP = scrapy.Field()
    Time = scrapy.Field()
Selenium middleware code

import time

from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumMiddleware:
    def process_request(self, request, spider):
        url = request.url
        driver = webdriver.Edge()
        driver.get(url)
        time.sleep(1)
        data = driver.page_source
        driver.close()  # release the browser once the page source has been captured
        return HtmlResponse(url=url, body=data.encode('utf-8'), encoding='utf-8', request=request)
Pipelines code

import pymysql
from scrapy.exceptions import DropItem


class BankPipeline:
    def __init__(self):
        # Database connection settings.
        self.host = 'localhost'
        self.database = 'pycharm'
        self.user = 'root'
        self.password = 'yabdylm'
        # Open the connection; utf8mb4 covers the full character set.
        self.con = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.database,
            charset='utf8mb4'
        )
        self.cursor = self.con.cursor()

    def process_item(self, item, spider):
        # SQL insert statement for one exchange-rate row.
        insert_sql = """
            INSERT INTO bankData (Currency, TBP, CBP, TSP, CSP, Time)
            VALUES (%s, %s, %s, %s, %s, %s)
        """
        try:
            self.cursor.execute(
                insert_sql,
                (
                    item['Currency'],
                    item['TBP'],
                    item['CBP'],
                    item['TSP'],
                    item['CSP'],
                    item['Time']
                )
            )
            # Commit the transaction.
            self.con.commit()
        except pymysql.Error as e:
            # Roll back on failure and drop the offending item.
            self.con.rollback()
            raise DropItem(f"Error inserting row {item!r} into database: {e}")
        return item

    def close_spider(self, spider):
        # Close the database connection when the spider finishes.
        self.cursor.close()
        self.con.close()
Results
Reflections
XPath selectors:
Extracting data with XPath selectors requires some familiarity with XPath syntax. Because the site's structure may change, the selectors should be written with enough flexibility and robustness to survive such changes; one way to do this is sketched below.
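A small, self-contained sketch of one way to make a selector less position-dependent; the inline HTML here is a stand-in for a rate table, not the real Bank of China markup:

```python
from scrapy.selector import Selector

# Tiny inline table standing in for a rate page, so the example is self-contained.
html = """
<table>
  <tr><th>Currency</th><th>TBP</th><th>CSP</th></tr>
  <tr><td>USD</td><td>719.5</td><td>722.6</td></tr>
</table>
"""
sel = Selector(text=html)

# Position-based: breaks as soon as a column is inserted or reordered.
by_position = sel.xpath("//tr[2]/td[2]/text()").get()

# Header-based: look up the column index from the header text, then reuse it,
# so the selector keeps working if unrelated columns move around.
col = len(sel.xpath("//th[normalize-space()='TBP']/preceding-sibling::th")) + 1
by_header = sel.xpath(f"//tr[2]/td[{col}]/text()").get()

print(by_position, by_header)  # both print 719.5
```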
Data cleaning and processing:
The values extracted from the page are raw strings, so they generally need some cleaning, such as stripping whitespace and handling empty cells, before being stored.
Database design:
When designing the database tables, the relationships between the data and query efficiency need to be considered, for example whether foreign keys or indexes are required. The English column names should follow database naming conventions, be easy to understand, and avoid reserved-word conflicts; a possible definition of the bankData table is sketched below.
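The BankPipeline above inserts into bankData but never creates it, so the table has to exist before the spider runs. Below is a possible one-off creation script matching the fields used in the pipeline; the column types and lengths are assumptions.

```python
# create_bank_table.py -- assumed one-off helper, not part of the repository listing
import pymysql

con = pymysql.connect(host='localhost', user='root', password='yabdylm',
                      database='pycharm', charset='utf8mb4')
with con.cursor() as cur:
    # VARCHAR keeps things simple, mirroring the stockData table in Assignment 2.
    cur.execute("""
        CREATE TABLE IF NOT EXISTS bankData (
            Currency VARCHAR(64),
            TBP VARCHAR(32),
            CBP VARCHAR(32),
            TSP VARCHAR(32),
            CSP VARCHAR(32),
            Time VARCHAR(32)
        );
    """)
con.commit()
con.close()
```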