可以輸入汽車品牌名,從而爬取對應汽車的參數配置
點選檢視程式碼
from random import random
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from datetime import time
from colorama import Fore
from openpyxl import load_workbook
from openpyxl.styles import Alignment
from openpyxl.utils import get_column_letter
import re
import requests
import json
import os
import pandas as pd
def get_band_response(brand_id="0"):
    """Request the autohome left-menu listing, retrying on failure.

    Args:
        brand_id: Value sent as the ``brandId`` query parameter.  ``main``
            calls this with the default ``"0"`` to look a brand up by name
            and then again with the brand's own id.

    Returns:
        The ``requests.Response`` of the first attempt answered with HTTP
        200, or ``None`` after five failed attempts.
    """
    # The module-level name ``time`` is bound to ``datetime.time`` (a class
    # with no ``sleep``), so import the real sleep function locally.
    from time import sleep

    url = "https://car.autohome.com.cn/AsLeftMenu/As_LeftListNew.ashx"
    params = {
        "typeId": "1",
        "brandId": brand_id,
        "fctId": "0",
        "seriesId": "0",
    }
    max_attempts = 5

    for attempt in range(1, max_attempts + 1):
        headers = {"user-agent": UserAgent().random}  # fresh random UA per attempt
        try:
            # Without a timeout requests may block forever on a dead socket.
            response = requests.get(url, headers=headers, params=params, timeout=10)
        except requests.RequestException:
            # Network-level failures count as a failed attempt and are retried.
            response = None

        if response is not None and response.status_code == 200:
            return response

        if attempt < max_attempts:
            print("請求失敗,正在重新請求...")
            sleep(1)

    print("請求超過5次,退出程式")
    return None
def parse_series(band, response, series_dict):
    """Extract the on-sale series of a brand from a left-menu response.

    The response body is JavaScript made of ``document.writeln("...")``
    calls; the quoted fragments are concatenated and parsed as HTML.

    Args:
        band: Brand name.  Not used here; kept so existing callers that pass
            it positionally keep working.
        response: Object whose ``.text`` holds the left-menu script.
        series_dict: Mapping that is updated in place with
            ``series_id -> series_name`` for every listed series.

    Returns:
        The same ``series_dict`` object, after the update.
    """
    fragments = re.findall(r'document.writeln\("(.*)"\)', response.text)
    soup = BeautifulSoup("".join(fragments), "html.parser")

    anchors = soup.select(".current > dl > dd > a")
    # Entries whose text contains "停售" (discontinued) are left out.
    on_sale = [a for a in anchors if "停售" not in a.get_text(strip=True)]
    print(f"該品牌共找到{len(anchors)}個車型,其中,在售車型共{len(on_sale)}個。")
    print("----------------------------------------------\n在售車型列表如下:\n----------------------------------------------")

    entries = []
    for anchor in on_sale:
        # An anchor without an href, with an href that carries no series id,
        # or with no child nodes cannot be turned into an entry; skip it
        # instead of raising IndexError/TypeError for the whole brand.
        id_match = re.search(r'/price/series-(\d+).html', anchor.get("href") or "")
        if id_match is None or not anchor.contents:
            continue
        entries.append((id_match.group(1), anchor.contents[0].text.strip()))

    for index, (series_id, series_name) in enumerate(entries, start=1):
        series_dict[series_id] = series_name
        print(f"序號:{index}\t車型:{series_name}\t車型id:{series_id}")
    return series_dict
def get_response(series_id="0"):
    """Request the ``getParamConf`` data of one series, retrying on failure.

    Args:
        series_id: Value sent as the ``seriesid`` query parameter.

    Returns:
        The ``requests.Response`` of the first attempt answered with HTTP
        200 (``main`` decodes its text as JSON), or ``None`` after five
        failed attempts.
    """
    # The module-level name ``time`` is bound to ``datetime.time`` (a class
    # with no ``sleep``), so import the real sleep function locally.
    from time import sleep

    url = "https://car-web-api.autohome.com.cn/car/param/getParamConf"
    params = {
        "mode": "1",
        "site": "1",
        "seriesid": series_id,
    }
    max_attempts = 5

    for attempt in range(1, max_attempts + 1):
        headers = {"user-agent": UserAgent().random}  # fresh random UA per attempt
        try:
            # Without a timeout requests may block forever on a dead socket.
            response = requests.get(url, headers=headers, params=params, timeout=10)
        except requests.RequestException:
            # Network-level failures count as a failed attempt and are retried.
            response = None

        if response is not None and response.status_code == 200:
            return response

        if attempt < max_attempts:
            print("請求失敗,正在重新請求...")
            sleep(1)

    print("請求超過5次,退出程式")
    return None
def get_car_config(config_dic):
    """Flatten a decoded ``getParamConf`` payload into a list of rows.

    Args:
        config_dic: Decoded JSON with ``result.titlelist`` (groups, each with
            an ``items`` list of ``{"itemname": ...}``) and
            ``result.datalist`` (entries, each with a ``paramconflist``).

    Returns:
        A list of lists.  The first row holds every parameter name in
        ``titlelist`` order; each following row holds the cell values of one
        ``datalist`` entry (presumably one car model/trim -- confirm against
        the API).

    Raises:
        KeyError: if ``result``, ``titlelist``, ``datalist``, ``items`` or
            ``paramconflist`` is missing from the payload.
    """
    result = config_dic['result']
    header_row = [
        item['itemname']
        for title in result['titlelist']
        for item in title['items']
    ]
    table = [header_row]
    for data in result['datalist']:
        table.append([_config_cell(valueitem) for valueitem in data['paramconflist']])
    return table


def _config_cell(valueitem):
    """Return the display value of one ``paramconflist`` entry."""
    name = valueitem.get('itemname')
    # A missing or null ``itemname`` is treated like an empty one, so the
    # entry falls back to its sublist (or '-') instead of raising KeyError
    # or putting None into the table.
    if name not in (None, ''):
        return name

    sublist = valueitem.get('sublist')
    if not sublist:
        return '-'
    # Multi-valued cell: one "<value><name>" line per sub-entry.
    return '\n'.join(entry['value'] + entry['name'] for entry in sublist)
def save_to_excel(data, folder, filename):
    """Write ``data`` transposed to ``<folder>/<filename>`` and format the sheet.

    The rows of ``data`` become spreadsheet columns.  After writing, the
    workbook is reopened to enable text wrapping with vertical centering on
    every cell and to set each used column to width 20.

    Args:
        data: Row-oriented table accepted by ``pandas.DataFrame``.
        folder: Output directory; created if it does not exist.
        filename: Name of the ``.xlsx`` file inside ``folder``.
    """
    # makedirs(exist_ok=True) replaces the exists()+mkdir() pair: no race
    # between the check and the creation, and nested folders are supported.
    os.makedirs(folder, exist_ok=True)
    excel_path = f"{folder}/{filename}"

    frame = pd.DataFrame(data)
    frame.T.to_excel(excel_path, index=False, header=False)

    workbook = load_workbook(excel_path)
    sheet = workbook.active

    # One alignment object is enough; it is assigned to every cell.
    wrapped_centered = Alignment(wrap_text=True, vertical='center')
    for row in sheet.iter_rows():
        for cell in row:
            cell.alignment = wrapped_centered

    # Because of the transpose, each input row is one spreadsheet column.
    for col in range(1, frame.shape[0] + 1):
        sheet.column_dimensions[get_column_letter(col)].width = 20

    workbook.save(excel_path)
    print(Fore.GREEN + "配置下載完成,儲存到檔案------> ", excel_path)
def main():
    """Run the interactive brand -> series -> Excel download session.

    Prompts for a brand name until one is found in the brand listing, prints
    its on-sale series, prompts for a series id (``0`` means all series)
    until a valid one is given, downloads the selected configuration(s) and
    finally waits for a key press.

    Raises:
        SystemExit: always, once the session is over, so the process ends
            exactly as it did with the former ``exit()`` call.
    """
    _run_download_session()
    input("請按任意鍵關閉程式...")
    # Same effect as ``exit()``, without relying on the helper that the
    # ``site`` module installs for interactive use.
    raise SystemExit


def _run_download_session():
    """Ask for brand and series, then download the chosen configuration(s)."""
    selection = _prompt_for_brand()
    if selection is None:
        # A listing request failed for good; the request helper has already
        # reported it.
        return
    band, series_dict = selection

    choice = _prompt_for_choice(series_dict)
    if choice == '0':
        for series_id, series_name in series_dict.items():
            print(Fore.CYAN + f"---正在下載{band}-{series_name},車型id為:{series_id}...")
            downloaded = _download_series(
                band, series_id, series_name, f"該系列車 {series_name} 暫無配置資訊"
            )
            if not downloaded:
                # The request failed after all retries; stop the batch rather
                # than retrying every remaining series.
                break
    else:
        series_name = series_dict[choice]
        series_url = f"https://car.autohome.com.cn/config/series/{choice}.html"
        print(Fore.CYAN + f"---正在下載{band}-{series_name},車型id為:{choice},配置連結為:{series_url}...")
        _download_series(band, choice, series_name, "該系列車暫無配置資訊")


def _prompt_for_brand():
    """Prompt until a known brand is entered; return ``(band, series_dict)``.

    Returns ``None`` when a listing request failed after all retries.
    """
    brand_list_html = None  # fetched on first use, reused when the user retypes
    while True:
        band = input("請輸入汽車品牌:").strip()
        if brand_list_html is None:
            response = get_band_response()
            if response is None:
                return None
            brand_list_html = response.text

        # The brand is typed by the user and must match literally, so any
        # regex metacharacters in it are escaped.
        band_pattern = f"<a href=([^>]*?)><i[^>]*?></i>{re.escape(band)}<em>"
        band_info = re.search(band_pattern, brand_list_html)
        id_match = None
        if band_info:
            id_match = re.search(r'/price/brand-(\d+).html', band_info.group(1))
        if id_match is None:
            print("該品牌不存在,請重新輸入")
            continue

        band_id = id_match.group(1)
        print(F"{band} 品牌id為:", band_id)
        resp_brand = get_band_response(brand_id=band_id)
        if resp_brand is None:
            return None
        return band, parse_series(band, resp_brand, {})


def _prompt_for_choice(series_dict):
    """Prompt until the user enters ``0`` or one of the listed series ids."""
    while True:
        choice = input(Fore.RED + "\n請輸入需要下載的車型id,輸入0則下載該品牌全部車型配置:").strip()
        if choice == '0' or choice in series_dict:
            return choice
        # The message asks the user to re-enter, so ask again instead of
        # falling through to program exit.
        print("輸入的車型id不存在,請重新輸入。")


def _download_series(band, series_id, series_name, no_data_message):
    """Download one series' configuration into ``<band>/<band>_<series_name>.xlsx``.

    Returns ``False`` only when the request failed after all retries;
    ``True`` otherwise (including the "no data" case, which prints
    ``no_data_message``).
    """
    response = get_response(series_id)
    if response is None:
        return False
    if "抱歉" in response.text and "暫無相關資料" in response.text:
        print(Fore.RED + no_data_message)
        return True
    all_info = get_car_config(json.loads(response.text))
    save_to_excel(all_info, folder=band, filename=f"{band}_{series_name}.xlsx")
    return True
# Start the interactive downloader only when this file is run as a script.
if __name__ == "__main__":
    main()