以無錫的某殼為例進行資料爬取。現在房子的價格起伏很快,而買房是人生一件大事,瞭解本地的房價走勢有助於判斷是否應該入手。
(建議是近2年不買,本人在21年高位拋了一套房,基本是透過貝殼資料判斷房價已經到頂,希望此爬蟲能夠幫到各位。)
這裡只爬了「必看好房」的資料;貝殼有反爬蟲機制,無法爬取全部資料,有心的可以拿去擴充一番。
import datetime
import json
import time

import pandas as pd
import requests
from pyquery import PyQuery as pq

# Column order of every row appended to the output CSV.
columns = ['id', 'title', 'place', 'msg', 'price', 'per_meter', 'area', 'city']
# District display names; the `area` argument of get_a_page indexes this list.
areas = ['濱湖區', '梁溪區', '新吳區', '惠山區', '錫山區', '江陰市', '宜興市']

# Seconds to wait for the server; without a timeout a stalled connection
# would block the whole scrape indefinitely.
_REQUEST_TIMEOUT = 10


def _parse_listings(html, area_name):
    """Return one dict per listing found in *html*, keyed like ``columns``."""
    doc = pq(html)
    listings = doc('.sellListContent').children('.clear .info.clear').items()
    rows = []
    for item in listings:
        rows.append({
            'id': item.children('.address .priceInfo .unitPrice').attr('data-hid'),
            'title': item.children('.title a').text(),
            'place': item.children('.address .flood .positionInfo a').text(),
            'msg': item.children('.address .houseInfo').text(),
            'price': item.children('.address .priceInfo .totalPrice span').text(),
            'per_meter': item.children('.address .priceInfo .unitPrice span').text(),
            'area': area_name,
            'city': '無錫',
        })
    return rows


def get_a_page(url, area):
    """Scrape one listing page and append its rows to today's CSV file.

    Each listing is printed as a numbered JSON line and then appended,
    without a header row, to ``wx<YYYY-MM-DD>.csv`` in the working
    directory. A page answered with an HTTP error status is reported and
    skipped instead of being parsed.

    Args:
        url: Address of the listing page to download.
        area: Index into ``areas`` naming the district the page belongs to.
    """
    result = requests.get(url, timeout=_REQUEST_TIMEOUT)
    if not result.ok:
        print(f'skip {url}: HTTP {result.status_code}')
        return

    rows = _parse_listings(result.text, areas[area])
    for count, row in enumerate(rows, start=1):
        print(f'{count}:{json.dumps(row, ensure_ascii=False)}')

    df = pd.DataFrame(rows, columns=columns)
    # Append mode with no header: all pages scraped on one day share a file.
    df.to_csv('wx' + time.strftime('%Y-%m-%d') + '.csv',
              mode='a', index=False, header=False)


if __name__ == '__main__':
    # URL slugs of the districts, in the same order as ``areas``.
    quyu = ['binhu', 'liangxi', 'xinwu', 'huishan', 'xishan', 'jiangyinshi', 'yixingshi']
    for index, qy in enumerate(quyu):
        # Result pages 1 through 19 of each district.
        for i in range(1, 20):
            get_a_page(f'https://wx.ke.com/ershoufang/{qy}/pg{i}tt9/', index)