Assignment 1
Repository link: https://gitee.com/jyppx000/crawl_project
Task ①
Requirement: pick a website and crawl all of the images on it, for example the China Weather Network (http://www.weather.com.cn). Implement the crawl with the Scrapy framework in both single-threaded and multi-threaded modes.
1.1 Code and Screenshots
Only the core code is listed here.
first.py
```python
import scrapy
from bs4 import UnicodeDammit
from firstBlood.items import FirstbloodItem

class FirstSpider(scrapy.Spider):
    name = "first"
    # Page-number control
    start_urls = [f"http://www.weather.com.cn/page_{i}" for i in range(1, 57)]

    def parse(self, response, **kwargs):
        try:
            # Sniff the encoding (the site mixes utf-8 and gbk pages)
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            qixiangs = selector.xpath("//img")
            for qixiang in qixiangs:
                item = FirstbloodItem()
                src = qixiang.xpath("./@src").extract_first()
                # Resolve relative and protocol-relative src values to absolute URLs
                item["url"] = response.urljoin(src) if src else None
                if item["url"]:
                    print(item["url"])
                    yield item
        except Exception as err:
            print(err)
```
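Incidentally, Scrapy already decodes the response body using the page's declared encoding, so the UnicodeDammit step can usually be skipped; a minimal sketch of the same extraction with plain Scrapy selectors (a drop-in replacement for the parse method above):

```python
def parse(self, response, **kwargs):
    # response.xpath works directly on the decoded body
    for src in response.xpath("//img/@src").getall():
        item = FirstbloodItem()
        item["url"] = response.urljoin(src)  # resolve relative src values
        yield item
```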
pipelines.py
```python
import os
import urllib.request

class FirstbloodPipeline:
    count = 0
    # Cap on the number of images to download
    max_images = 156

    def process_item(self, item, spider):
        if FirstbloodPipeline.count >= FirstbloodPipeline.max_images:
            spider.crawler.engine.close_spider(spider, "Reached the image limit, stopping the crawl")
            return item
        Base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        image_path = os.path.join(Base_path, "images")
        if not os.path.exists(image_path):  # Make sure the directory exists
            os.makedirs(image_path)
        img_url = item['url']
        if img_url:
            FirstbloodPipeline.count += 1
            extension = img_url.split('.')[-1]
            filename = os.path.join(image_path, f"{FirstbloodPipeline.count}.{extension}")
            try:
                urllib.request.urlretrieve(img_url, filename=filename)
                print(f"Successfully downloaded: {filename}")
            except Exception as e:
                print(f"Download failed: {e}")
        return item
```
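As an alternative to hand-rolling the download logic, Scrapy ships a built-in ImagesPipeline that handles downloading, deduplication, and file naming. A minimal sketch of swapping it in (assumes Pillow is installed; ImagesPipeline requires the image_urls/images field names shown):

```python
# settings.py -- enable the built-in pipeline instead of FirstbloodPipeline
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = 'images'  # directory where downloaded images land

# items.py -- field names the ImagesPipeline expects
import scrapy

class FirstbloodItem(scrapy.Item):
    image_urls = scrapy.Field()  # the spider fills this with absolute URLs
    images = scrapy.Field()      # populated by the pipeline after download
```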
items.py
```python
import scrapy

class FirstbloodItem(scrapy.Item):
    url = scrapy.Field()
```
settings.py
```python
ROBOTSTXT_OBEY = False
LOG_LEVEL = "ERROR"
CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS_PER_DOMAIN = 100
CONCURRENT_REQUESTS_PER_IP = 100
ITEM_PIPELINES = {
    'firstBlood.pipelines.FirstbloodPipeline': 300,
}
```
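These settings are the multi-threaded (concurrent) variant the assignment asks for. For the single-threaded run, the same project can simply dial concurrency down to one in-flight request; a sketch:

```python
# settings.py -- single-threaded variant: one request at a time
CONCURRENT_REQUESTS = 1
CONCURRENT_REQUESTS_PER_DOMAIN = 1
DOWNLOAD_DELAY = 0.5  # optional pause between requests; the value is an assumption
```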
main.py
```python
import os
import sys
from scrapy import cmdline

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
cmdline.execute(['scrapy', 'crawl', 'first'])
```
1.2 Reflections
- While implementing the required functionality, I came to appreciate the importance of code structure. Separating the crawling logic from the data-processing logic keeps the code clear and maintainable.
- When handling a large amount of data, organizing and storing it sensibly matters. I learned how to process and save images through a Pipeline while avoiding duplicate downloads (see the sketch after this list), which deepened my understanding of data storage and management.
- Handling the exceptions that can occur during downloads improved my error-handling skills and made me more comfortable dealing with problems in practice.
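The pipeline above does not actually deduplicate yet; a minimal sketch of URL-level deduplication, assuming an in-memory seen set (reset on every run) is acceptable:

```python
from scrapy.exceptions import DropItem

class DedupPipeline:
    """Drop items whose image URL has already been seen in this run."""
    def __init__(self):
        self.seen_urls = set()

    def process_item(self, item, spider):
        if item['url'] in self.seen_urls:
            raise DropItem(f"Duplicate image URL: {item['url']}")
        self.seen_urls.add(item['url'])
        return item
```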
Task ②
Requirement: become proficient with the serialized output of Item and Pipeline data in Scrapy; crawl stock information using the Scrapy framework + XPath + MySQL database storage.
2.1 Code and Screenshots
second.py
```python
import re
import scrapy
import requests
from secondBlood.items import SecondbloodItem

class SecondSpider(scrapy.Spider):
    name = "second"
    # allowed_domains = ["www.xxx.com"]
    start_url = "http://quote.eastmoney.com/center/gridlist.html#hs_a_board"

    def start_requests(self):
        url = SecondSpider.start_url
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response, **kwargs):
        try:
            stocks = []
            # The board page loads its data from this JSONP endpoint
            url = "https://68.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124009413428787683675_1696660278138&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f12,f14,f2,f3,f4,f5,f6,f7,f15,f16,f17,f18&_=1696660278155"
            r = requests.get(url=url)
            # Cut the per-stock records out of the JSONP payload
            data = re.compile(r'"f2":.*').findall(r.text)
            data1 = data[0].split("},{")
            data1[-1] = data1[-1].split("}")[0]
            for i in range(len(data1)):
                stock0 = data1[i].replace('"', "").split(",")
                # Reorder the fields: code, name, price, change %, change, volume, ...
                order = [6, 7, 0, 1, 2, 3, 4, 5, 8, 9, 10, 11]
                stock = []
                for j in order:
                    stock.append(stock0[j].split(":")[1])
                stocks.append(stock)
            for i in range(len(stocks)):
                item = SecondbloodItem()
                item["stockname"] = stocks[i][0]
                item["name"] = stocks[i][1]
                item["newprice"] = stocks[i][2]
                item["zhangdiefu"] = stocks[i][3]
                item["zhangdieer"] = stocks[i][4]
                item["chengjiaoliang"] = stocks[i][5]
                item["chengjiaoer"] = stocks[i][6]
                item["zhenfu"] = stocks[i][7]
                item["zuigao"] = stocks[i][8]
                item["zuidi"] = stocks[i][9]
                item["jinkai"] = stocks[i][10]
                item["zuoshou"] = stocks[i][11]
                yield item
        except Exception as err:
            print(err)
```
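The string splitting above is brittle (it breaks if a value ever contains a comma or brace). A more robust sketch that strips the JSONP callback and parses the payload as JSON, assuming the endpoint keeps the data.diff layout this URL currently returns:

```python
import json
import requests

def fetch_stocks(url):
    """Strip the jQuery...(...) JSONP wrapper and parse the payload as JSON."""
    text = requests.get(url).text
    payload = text[text.index("(") + 1 : text.rindex(")")]
    data = json.loads(payload)
    # Records are keyed by the f-codes from the URL's `fields` parameter,
    # e.g. f12 = stock code, f14 = name, f2 = latest price.
    return data["data"]["diff"]
```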
pipelines.py
```python
import pymysql

class SecondbloodPipeline:
    def __init__(self):
        self.mydb = pymysql.connect(
            host="127.0.0.1",
            port=3306,
            user='root',
            password='123456',
            database="stock",
            charset='utf8'
        )
        self.cursor = self.mydb.cursor()
        self.cursor.execute('''CREATE TABLE IF NOT EXISTS stocks(
            stockname VARCHAR(256),
            name VARCHAR(256),
            newprice VARCHAR(256),
            zhangdiefu VARCHAR(256),
            zhangdieer VARCHAR(256),
            chengjiaoliang VARCHAR(256),
            chengjiaoer VARCHAR(256),
            zhenfu VARCHAR(256),
            zuigao VARCHAR(256),
            zuidi VARCHAR(256),
            jinkai VARCHAR(256),
            zuoshou VARCHAR(256)
        )''')
        self.mydb.commit()

    def process_item(self, item, spider):
        try:
            sql = "INSERT INTO stocks VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            self.cursor.execute(sql, (
                item.get("stockname"), item.get("name"), item.get("newprice"),
                item.get("zhangdiefu"), item.get("zhangdieer"),
                item.get("chengjiaoliang"), item.get("chengjiaoer"),
                item.get("zhenfu"), item.get("zuigao"),
                item.get("zuidi"), item.get("jinkai"),
                item.get("zuoshou")))
            self.mydb.commit()
            print("Successfully inserted:", item)
        except Exception as e:
            print("Error inserting item:", e)
            self.mydb.rollback()  # Roll back in case of error
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.mydb.close()
```
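Committing once per item is the simplest correct approach; for larger crawls, a buffered variant that flushes with executemany cuts database round-trips. A sketch (the batch size is an arbitrary choice):

```python
class BatchedStockPipeline(SecondbloodPipeline):
    BATCH_SIZE = 100  # arbitrary; tune to the workload

    def __init__(self):
        super().__init__()
        self.buffer = []

    def process_item(self, item, spider):
        keys = ("stockname", "name", "newprice", "zhangdiefu", "zhangdieer",
                "chengjiaoliang", "chengjiaoer", "zhenfu", "zuigao",
                "zuidi", "jinkai", "zuoshou")
        self.buffer.append(tuple(item.get(k) for k in keys))
        if len(self.buffer) >= self.BATCH_SIZE:
            self._flush()
        return item

    def _flush(self):
        sql = "INSERT INTO stocks VALUES (" + ", ".join(["%s"] * 12) + ")"
        self.cursor.executemany(sql, self.buffer)
        self.mydb.commit()
        self.buffer = []

    def close_spider(self, spider):
        if self.buffer:
            self._flush()
        super().close_spider(spider)
```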
items.py
```python
import scrapy

class SecondbloodItem(scrapy.Item):
    stockname = scrapy.Field()
    name = scrapy.Field()
    newprice = scrapy.Field()
    zhangdiefu = scrapy.Field()
    zhangdieer = scrapy.Field()
    chengjiaoliang = scrapy.Field()
    chengjiaoer = scrapy.Field()
    zhenfu = scrapy.Field()
    zuigao = scrapy.Field()
    zuidi = scrapy.Field()
    jinkai = scrapy.Field()
    zuoshou = scrapy.Field()
```
settings.py
```python
ROBOTSTXT_OBEY = False
LOG_LEVEL = "ERROR"
ITEM_PIPELINES = {
    'secondBlood.pipelines.SecondbloodPipeline': 1,
}
```
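For the "serialized output" part of the requirement, Scrapy's built-in feed exports can also serialize items without any custom pipeline; a minimal sketch:

```python
# settings.py -- additionally export every yielded item to a JSON file
FEEDS = {
    'stocks.json': {
        'format': 'json',
        'encoding': 'utf8',
        'overwrite': True,
    },
}
```

Equivalently, running `scrapy crawl second -o stocks.json` produces the same file from the command line.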
main.py
```python
import os
import sys
from scrapy import cmdline

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
cmdline.execute(['scrapy', 'crawl', 'second'])
```
2.2 Reflections
- I learned how to fetch and parse web data with the Scrapy framework, using the requests library to call the API and regular expressions to extract the fields.
- While saving the crawled data to MySQL, I learned how to handle data persistence inside a Pipeline.
- I gained a deeper understanding of exception-handling mechanisms.
Task ③
Requirement: become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL database storage to crawl foreign-exchange rate data.
3.1 Code and Screenshots
third.py
```python
import time
import scrapy
from lxml import etree
from selenium import webdriver
from thirdBlood.items import ThirdbloodItem
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

class ThirdSpider(scrapy.Spider):
    name = "third"

    def __init__(self, *args, **kwargs):
        super(ThirdSpider, self).__init__(*args, **kwargs)
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # run Chrome without a window
        self.driver = webdriver.Chrome(
            service=Service(r'D:\tools\package\chromedriver-win64\chromedriver.exe'),
            options=chrome_options)

    def start_requests(self):
        url = 'https://www.boc.cn/sourcedb/whpj/'
        # Render the page with Selenium, then hand the parsed DOM to Scrapy via meta
        self.driver.get(url)
        time.sleep(1)
        html = etree.HTML(self.driver.page_source)
        yield scrapy.Request(url, self.parse, meta={'html': html})

    def parse(self, response):
        def cell(row, n):
            # First text node of the n-th <td>, or '' if the cell is empty
            texts = row.xpath(f'./td[{n}]/text()')
            return texts[0] if texts else ''

        html = response.meta['html']
        rows = html.xpath('/html/body/div/div[5]/div[1]/div[2]/table/tbody/tr')
        # The first row is the table header, so skip it
        for row in rows[1:]:
            item = ThirdbloodItem()
            item["currency"] = cell(row, 1)
            item["TBP"] = cell(row, 2)
            item["CBP"] = cell(row, 3)
            item["TSP"] = cell(row, 4)
            item["CSP"] = cell(row, 5)
            item["time"] = cell(row, 8)
            yield item

    def closed(self, reason):
        self.driver.quit()
```
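The fixed time.sleep(1) is a guess: if the table renders slowly, the spider silently scrapes an empty page. A sketch using an explicit Selenium wait instead (the table XPath is taken from the spider above):

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_rate_table(driver, url, timeout=10):
    """Block until the rate table is in the DOM, up to `timeout` seconds."""
    driver.get(url)
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located(
            (By.XPATH, '/html/body/div/div[5]/div[1]/div[2]/table')))
    return driver.page_source
```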
pipelines.py
```python
import pymysql

class ThirdbloodPipeline:
    def process_item(self, item, spider):
        mydb = None
        mycursor = None
        try:
            print(item["currency"], item["TSP"], item["CSP"],
                  item["TBP"], item["CBP"], item["time"])
            # Insert the record into the database table
            mydb = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                   passwd="123456", db="china_bank", charset="utf8")
            mycursor = mydb.cursor()
            sql = "INSERT INTO currency (currency, TSP, CSP, TBP, CBP, time) VALUES (%s, %s, %s, %s, %s, %s)"
            val = (item["currency"], item["TSP"], item["CSP"], item["TBP"], item["CBP"], item["time"])
            # Print the SQL statement and the values to be inserted
            print(f"Inserting into currency: {val}")
            mycursor.execute(sql, val)
            mydb.commit()
        except Exception as err:
            print(f"Error: {err}")
        finally:
            # Guard: the connection may never have been opened if connect() failed
            if mycursor:
                mycursor.close()
            if mydb:
                mydb.close()
        return item
```
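Opening a fresh connection for every item is wasteful. The same one-connection-per-run pattern used in Task ② applies here too; a sketch that also creates the currency table if it is missing (the column types are an assumption, mirroring the INSERT above):

```python
import pymysql

class ThirdbloodPipeline:
    def open_spider(self, spider):
        # One connection for the whole crawl, opened when the spider starts
        self.mydb = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                    passwd="123456", db="china_bank", charset="utf8")
        self.cursor = self.mydb.cursor()
        self.cursor.execute('''CREATE TABLE IF NOT EXISTS currency(
            currency VARCHAR(64), TSP VARCHAR(32), CSP VARCHAR(32),
            TBP VARCHAR(32), CBP VARCHAR(32), time VARCHAR(32))''')
        self.mydb.commit()

    def process_item(self, item, spider):
        sql = ("INSERT INTO currency (currency, TSP, CSP, TBP, CBP, time) "
               "VALUES (%s, %s, %s, %s, %s, %s)")
        self.cursor.execute(sql, (item["currency"], item["TSP"], item["CSP"],
                                  item["TBP"], item["CBP"], item["time"]))
        self.mydb.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.mydb.close()
```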
items.py
```python
import scrapy

class ThirdbloodItem(scrapy.Item):
    currency = scrapy.Field()
    TSP = scrapy.Field()
    CSP = scrapy.Field()
    TBP = scrapy.Field()
    CBP = scrapy.Field()
    time = scrapy.Field()
```
settings.py
```python
ROBOTSTXT_OBEY = False
LOG_LEVEL = "ERROR"
ITEM_PIPELINES = {
    'thirdBlood.pipelines.ThirdbloodPipeline': 1,
}
```
main.py
```python
import os
import sys
from scrapy import cmdline

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
cmdline.execute(['scrapy', 'crawl', 'third'])
```
3.2 Reflections
- I learned how to combine Selenium with Scrapy to crawl data from dynamic pages; Selenium retrieves the content after JavaScript rendering.
- While parsing the fetched HTML, I became familiar with XPath selectors. XPath pinpoints exactly the data I need, improving both crawl efficiency and extraction accuracy.