前言
因為要訓練識別驗證碼的模型,需要爬取源資料。
如果需要其他的驗證碼型別,請自行修改請求參數中的 model(目前為 spatial_select)。
爬取結果
這邊把圖片與對應的文字描述存成相同的檔名(僅副檔名不同:.jpg 與 .txt),方便之後配對。
程式碼
import json
import os
import time

import requests
def save_image_from_url(url, file_path, timeout=10):
    """Download the resource at *url* and write its raw bytes to *file_path*.

    Args:
        url: Absolute URL of the image to download.
        file_path: Destination path. Missing parent directories are created.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the caller indefinitely.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
        OSError: If the destination cannot be created or written.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of saving an HTML error page under an image name.
    response.raise_for_status()

    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'wb') as f:
        f.write(response.content)
def parse_jsonp(text):
    """Return the JSON payload carried by a JSONP response body.

    Accepts ``callback({...})`` with any callback name and optional trailing
    characters such as ``;`` or whitespace, as well as a bare JSON object.

    Raises:
        ValueError: If no parenthesised payload is found or the payload is
            not valid JSON (``json.JSONDecodeError`` subclasses ValueError).
    """
    stripped = text.strip()
    if stripped.startswith('{'):
        return json.loads(stripped)
    start = stripped.find('(')
    end = stripped.rfind(')')
    if start == -1 or end <= start:
        raise ValueError("response is not JSONP: " + stripped[:50])
    return json.loads(stripped[start + 1:end])


def image_name_from_path(bg_path):
    """Return the file name of *bg_path* without directory or extension."""
    # Drop any query string before splitting off the extension, so names are
    # not truncated when the image is not a plain ".jpg".
    filename = bg_path.rsplit('/', 1)[-1].split('?', 1)[0]
    return os.path.splitext(filename)[0]


if __name__ == '__main__':
    # @author: xXinG
    # @desc: Crawl images and text prompts of the Shumei (fengkongcloud)
    #        spatial-reasoning captcha.
    # @time: 2023/4/26 19:50
    execute_count = 100  # number of captchas to fetch
    request_timeout = 10  # seconds
    register_url = "https://captcha1.fengkongcloud.cn/ca/v1/register"
    static_host = 'https://castatic.fengkongcloud.cn'
    img_dir = './result/img/'
    label_dir = './result/label/'
    # Loop-invariant request parameters; "model" selects the captcha type.
    querys = {"callback": "0", "channel": "DEFAULT",
              "captchaUuid": "20240426194937QdanEmbtwJrRaJwbdG", "rversion": "1.0.4", "lang": "zh-cn",
              "data": "{}", "model": "spatial_select", "sdkver": "1.1.3", "organization": "d6tpAY1oV0Kv5jRSgxQr",
              "appId": "default"}

    # The output folders may not exist on a fresh checkout.
    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(label_dir, exist_ok=True)

    start_time = time.time()
    for _ in range(execute_count):
        # One bad response must not abort the remaining downloads.
        try:
            response = requests.get(register_url, params=querys, timeout=request_timeout)
            response.raise_for_status()
            data_dict = parse_jsonp(response.text)
            if data_dict['code'] != 1100:
                print("略過 -> code=" + str(data_dict['code']))
                continue
            detail = data_dict['detail']
            bg_path = detail['bg']  # image path relative to the static host
            # Target description; presumably a list whose first entry is the
            # prompt text -- TODO confirm against the API response.
            des = detail['order']
            name = image_name_from_path(bg_path)  # shared base name for image and label
            save_image_from_url(static_host + bg_path, img_dir + name + '.jpg')
            with open(label_dir + name + '.txt', "w", encoding="utf-8") as f:
                f.write(des[0])
        except (requests.RequestException, OSError, ValueError,
                KeyError, IndexError, TypeError) as exc:
            print("失敗 -> " + str(exc))
            continue
        print("完成 -> " + name)
    end_time = time.time()
    print("總共用時: " + str(end_time - start_time) + "秒")
這邊測試爬取100張