爬蟲——爬取貴陽房價(Python實現)

專注的阿熊發表於2022-02-09

#================== 匯入相關庫 ==================================

from bs4 import BeautifulSoup

import numpy as np

import requests

from requests.exceptions import  RequestException

import pandas as pd

#============= 讀取網頁 =========================================

def craw(url,page):
    """Download a page and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to fetch.
        page: Page number; used only in the failure message.

    Returns:
        The response body as a string, or None when the request fails
        (connection error, timeout, or an HTTP error status).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36"}

    try:
        response = requests.request("GET", url, headers=headers, timeout=10)
        # A 4xx/5xx answer still carries a body; without this check the error
        # page would be handed to the parser as if it were a listing page.
        # HTTPError is a RequestException, so it is reported below.
        response.raise_for_status()
    except RequestException:
        print(' {0} 讀取網頁失敗 '.format(page))
        return None

    # Force UTF-8 so .text is decoded with the expected charset instead of
    # whatever requests guesses from the response headers.
    response.encoding = 'utf-8'
    return response.text

#========== 解析網頁並儲存資料到表格 ======================

def _joined_text(node, selector, sep=' '):
    """Return the text of every element under node matching selector, joined by sep."""
    return sep.join(tag.get_text() for tag in node.select(selector))


def pase_page(url,page):
    """Fetch one listing page, extract every house entry and append them to the CSV.

    Args:
        url: Address of the listing page.
        page: Page number; used only in the progress messages.

    Returns:
        None. Rows are appended (without a header line) to the CSV file; a
        message is printed on success, and a failure message is printed when
        the page could not be downloaded.
    """
    html = craw(url, page)

    # craw() returns None on failure. The check must happen on the raw value:
    # converting it with str() first would yield the text "None" and the
    # failure would go unnoticed.
    if html is None:
        print(' 解析失敗 ')
        return

    soup = BeautifulSoup(html, 'lxml')

    # Each <li> under the list wrapper is treated as one house entry.
    houses = soup.select('.resblock-list-wrapper li')

    rows = []
    for house in houses:
        rows.append([
            _joined_text(house, '.resblock-name a.name'),                   # name
            _joined_text(house, '.resblock-name span.resblock-type'),       # type, e.g. office / shop
            _joined_text(house, '.resblock-name span.sale-status'),         # sale status, e.g. on sale / sold out
            _joined_text(house, '.resblock-location span', sep=''),         # district; parts joined without a space
            _joined_text(house, '.resblock-location a'),                    # detailed address
            _joined_text(house, '.resblock-tag span'),                      # selling points
            _joined_text(house, '.resblock-price .main-price .number'),     # average price, or a "to be determined" text
            _joined_text(house, '.resblock-price .second'),                 # total price text
        ])

    if rows:
        table = pd.DataFrame(rows, columns=[' 名稱 ', ' 型別 ', ' 銷售狀態 ',' 大地址 ',' 具體地址 ',' 優勢 ',' 均價 ',' 總價 '])
        # One append per page instead of one per house: fewer file opens and a
        # smaller window for rows from concurrently running pages to interleave.
        table.to_csv(' 貴陽房價 .csv', mode='a+', index=False, header=False)

    print(' {0} 頁儲存資料成功 '.format(page))

#================== 雙執行緒 =====================================

import threading

# NOTE(review): the URL prefix was stripped from the published listing, which
# left the two URL assignments syntactically broken. The value below is a
# reconstruction inferred from the CSS selectors used by the parser (they look
# like a Lianjia new-home listing for Guiyang) — TODO confirm before running.
LIST_URL_PREFIX = "https://gy.fang.lianjia.com/loupan/pg"

for i in range(1, 100, 2):  # pages 1-100, two pages per iteration
    url1 = LIST_URL_PREFIX + str(i) + "/"
    url2 = LIST_URL_PREFIX + str(i + 1) + "/"

    t1 = threading.Thread(target=pase_page, args=(url1, i))      # worker 1: odd page
    t2 = threading.Thread(target=pase_page, args=(url2, i + 1))  # worker 2: even page

    t1.start()
    t2.start()

    # Wait for both workers before starting the next pair; without the joins
    # the loop would launch all ~100 threads at once rather than two at a time.
    t1.join()
    t2.join()


來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/69946337/viewspace-2855035/,如需轉載,請註明出處,否則將追究法律責任。

相關文章