目錄
- 1. 程式碼
- 2. 舉一反三
1. 程式碼
該 Python 指令碼可多執行緒地批次下載新浪圖床圖片,每次下載會檢查哪些圖片已下載並過濾已下載的圖片。
import os
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging
import time
from tqdm import tqdm
def setup_logger():
    """Configure the root logger to log to ``download.log`` and to the console.

    The file handler records INFO and above; the console handler only shows
    ERROR and above so it does not fight with the progress bar.

    The function is idempotent: handlers are tagged with a name and are only
    attached when a handler with that name is not already present, so calling
    it more than once does not duplicate every log record.
    """
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    installed = {handler.get_name() for handler in logger.handlers}

    if 'sina_download_file' not in installed:
        # Explicit UTF-8: the script logs non-ASCII text, which the platform
        # default encoding may be unable to represent.
        file_handler = logging.FileHandler('download.log', encoding='utf-8')
        file_handler.set_name('sina_download_file')
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    if 'sina_download_console' not in installed:
        console_handler = logging.StreamHandler()
        console_handler.set_name('sina_download_console')
        console_handler.setLevel(logging.ERROR)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)
def _fetch_and_save(output_dir, url, request_url, referer):
    """Request ``request_url`` and store the body under the image's own name.

    Args:
        output_dir: Directory the image is written into.
        url: Original image URL. Its last path segment is used as the file
            name, and it is the URL reported in log messages.
        request_url: URL that is actually requested (the image itself or a
            proxy URL wrapping it).
        referer: Value sent in the ``Referer`` header.

    Returns: True if the image was fetched and written, otherwise False.
    """
    base_name = url.split('/')[-1]
    if not base_name:
        # An empty or '/'-terminated URL would resolve to output_dir itself.
        logging.error(f'Error downloading {url}: cannot derive a file name')
        return False

    headers = {
        'Referer': referer,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
    }
    try:
        response = requests.get(request_url, headers=headers, timeout=10)
        content = response.content
    except requests.RequestException as e:
        logging.error(f'Error downloading {url}: {e}')
        return False

    if response.status_code != 200:
        logging.error(f'Failed to download {url}, status code: {response.status_code}')
        return False
    if not content:
        # A zero-byte file would later be mistaken for a finished download.
        logging.error(f'Failed to download {url}, empty response body')
        return False

    file_name = os.path.join(output_dir, base_name)
    try:
        with open(file_name, 'wb') as f:
            f.write(content)
    except OSError as e:
        logging.error(f'Error saving {file_name}: {e}')
        # Best effort: drop a partially written file so the next run retries
        # it instead of treating it as already downloaded.
        try:
            os.remove(file_name)
        except OSError:
            pass
        return False

    logging.info(f'Successfully downloaded {file_name}')
    return True


def download_image_by_change_referer(output_dir, url):
    """Download an image directly, sending a weibo.com ``Referer`` header.

    Args:
        output_dir: Directory the image is saved into.
        url: Image URL.

    Returns: True if the download succeeded, otherwise False.
    """
    return _fetch_and_save(output_dir, url, url, 'https://weibo.com/')


def download_image_by_baidu_cache(output_dir, url):
    """Download an image through Baidu's image proxy to bypass hotlink protection.

    See https://code.newban.cn/466.html

    Args:
        output_dir: Directory the image is saved into.
        url: Image URL.

    Returns: True if the download succeeded, otherwise False.
    """
    # NOTE: ``url`` is interpolated without URL-encoding, exactly as before;
    # an image URL carrying its own query string would not survive intact.
    proxy_url = f'https://image.baidu.com/search/down?url={url}'
    return _fetch_and_save(output_dir, url, proxy_url, 'https://image.baidu.com/')
def batch_download(output_dir, urls, method='baidu_cache', max_workers=10):
    """Download many images concurrently with a thread pool.

    Args:
        output_dir: Directory the images are saved into.
        urls: List of image URLs.
        method: Download strategy, either 'change_referer' or 'baidu_cache'.
        max_workers: Maximum number of worker threads.

    Returns: A ``(success_count, failed_count)`` tuple.

    Raises:
        ValueError: If ``method`` is not one of the supported strategies.
    """
    # Reject unknown strategies before any thread is started.
    if method not in ('change_referer', 'baidu_cache'):
        raise ValueError(f'Invalid method: {method}')

    if method == 'change_referer':
        downloader = download_image_by_change_referer
    else:
        downloader = download_image_by_baidu_cache

    succeeded = 0
    failed = 0
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which URL each future belongs to, for error reporting.
        pending = {}
        for link in urls:
            pending[pool.submit(downloader, output_dir, link)] = link

        for finished in tqdm(as_completed(pending), total=len(urls)):
            link = pending[finished]
            try:
                ok = finished.result()
            except Exception as e:
                logging.error(f'Error processing {link}: {e}')
                failed += 1
            else:
                if ok:
                    succeeded += 1
                else:
                    failed += 1
    return succeeded, failed
def get_all_image_urls(dataset_root_dir, file_names=None):
    """Collect every distinct image URL listed in the dataset's URL files.

    Each file is read as text in which every line holds one or more URLs
    separated by ';'. Blank lines and empty fields (for example after a
    trailing ';') are skipped so that no empty string ends up among the URLs.

    Args:
        dataset_root_dir: Dataset root directory containing the URL files.
        file_names: Optional iterable of file names to read, relative to
            ``dataset_root_dir``. Defaults to img_url_dev.json,
            img_url_test.json and img_url_train.json.

    Returns: A set of image URLs (duplicates removed).
    """
    if file_names is None:
        file_names = ('img_url_dev.json', 'img_url_test.json', 'img_url_train.json')

    image_urls = set()
    for file_name in file_names:
        path = os.path.join(dataset_root_dir, file_name)
        with open(path, 'r', encoding='utf-8') as f:
            # Stream line by line instead of loading the whole file at once.
            for line in f:
                for field in line.split(';'):
                    url = field.strip()
                    if url:
                        image_urls.add(url)
    return image_urls
def get_downloaded_images(output_dir):
    """List the files already present in the download directory.

    The result is used to avoid downloading the same image twice.

    Args:
        output_dir: Directory the images are saved into.

    Returns: A list with the name of every entry in ``output_dir``.
    """
    return os.listdir(output_dir)
def main():
    """Run the interactive batch download.

    Collects all image URLs of the dataset, drops those whose file already
    exists in the output directory, asks for confirmation and then downloads
    the remaining images, reporting the success/failure counts at the end.
    """
    setup_logger()
    config = {
        'dataset_root_dir': r'D:\Library\Datasets\20_MMChat',
        'output_dir': 'downloaded_images',
        'download_image_method': 'baidu_cache',
    }

    # Every image URL known to the dataset.
    all_urls = get_all_image_urls(config['dataset_root_dir'])

    # Make sure the target directory exists before listing it.
    target_dir = config['output_dir']
    os.makedirs(target_dir, exist_ok=True)

    # Skip images whose file name is already present on disk.
    existing_names = set(get_downloaded_images(target_dir))
    pending_urls = []
    for link in all_urls:
        if link.split('/')[-1] not in existing_names:
            pending_urls.append(link)

    total = len(all_urls)
    remaining = len(pending_urls)
    logging.info(f'Total images: {total}, images to download: {remaining}')

    answer = input(
        f'共有 {total} 張圖片,已過濾 {total - remaining} 張已下載的圖片,待下載 {remaining} 張圖片,確認下載?(y/n): ')
    if answer.lower() == 'y':
        # Start the batch download.
        success_count, failed_count = batch_download(
            target_dir,
            pending_urls,
            method=config['download_image_method'],
            max_workers=40,
        )
        summary = f'下載完成,Success: {success_count}, Failed: {failed_count}'
        logging.info(summary)
        print(summary)


if __name__ == '__main__':
    main()
2. 舉一反三
該程式碼針對 MMChat 資料集編寫,但原理上可用於批次下載所有新浪圖床上連結已失效的圖片。
你需要修改 get_all_image_urls 方法,讓它返回你想下載的所有圖片的 URL(不含重複項),返回值的內容形如:
[
"https://wx2.sinaimg.cn/mw2048/bc5ca296ly1fpt9oq74vsj20hs0npdqp.jpg",
"https://wx3.sinaimg.cn/mw2048/8bec28c2ly1fg1i9o0liqj20zk0qo7bj.jpg",
"https://wx4.sinaimg.cn/mw2048/954d55d0ly1fmwvia87e1j20ku11210q.jpg",
...
]
get_downloaded_images 方法返回已下載圖片的檔名列表;如果上面的三張圖片已下載,那麼該方法會返回:
[
"bc5ca296ly1fpt9oq74vsj20hs0npdqp.jpg",
"8bec28c2ly1fg1i9o0liqj20zk0qo7bj.jpg",
"954d55d0ly1fmwvia87e1j20ku11210q.jpg",
]
最後,你可以修改圖片的下載方式,目前支援 download_image_by_change_referer 和 download_image_by_baidu_cache 兩種。