Scraping Lianjia rental listings with Python

Posted by ALBDXV on 2020-10-29

Scraping Lianjia rental listings with Python (a record of my first Python program).
The main points involved are: accessing the site through a proxy IP, reading the pages, and paginating through the results. Comments and discussion are welcome.

The code is as follows:

import urllib.request  # getting familiar with urllib.request
from bs4 import BeautifulSoup  # getting familiar with BeautifulSoup

##Fetch and parse a page through this helper function
def gethtml(url):
    #access the site through a proxy IP (the address below is just an example and may be offline)
    proxy_support = urllib.request.ProxyHandler({'http':'119.6.144.73:81'})
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363')]
    urllib.request.install_opener(opener)
    #read the page, e.g. 'https://ks.lianjia.com/zufang/kunshan/rt200600000001l0/'
    zf = urllib.request.urlopen(url)
    html = zf.read()
    ht = html.decode('utf8')
    zf.close()  #note the parentheses: zf.close alone would not close the connection
    Soup = BeautifulSoup(ht,'lxml')
    return Soup
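
As an aside, the same page fetch can also be written with the requests library. The sketch below is only an illustration, not part of the original script: gethtml_requests is a made-up name, and the proxy address is the same example one used above, so it may well be offline.

import requests  # only needed for this alternative sketch
from bs4 import BeautifulSoup

def gethtml_requests(url):
    #example proxy and a shortened User-Agent header; both are placeholders
    proxies = {'http': 'http://119.6.144.73:81'}
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    resp = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    resp.encoding = 'utf8'  #Lianjia pages are UTF-8, same as in gethtml above
    return BeautifulSoup(resp.text, 'lxml')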

##Variables used in the loop below
info = []
page = 1
TotalNumber = 0
urlMain = 'https://ks.lianjia.com/zufang/kunshan/'
urlOption = 'rt200600000001l0/'
#total number of matching results, used to avoid picking up recommended listings outside the filter
Number = int(gethtml(urlMain+urlOption).find(class_ = 'content__article').find(class_ = 'content__title').find('span').text)
print('Found {} rental listings'.format(Number))

##Read the listings on every page with a while loop
while TotalNumber <= Number:
    print('Reading page %d'%page)
    if page == 1:
        url = urlMain + urlOption
    else:
        #subsequent pages insert 'pg<n>' between the city path and the filter options
        url = urlMain + 'pg{}'.format(page) + urlOption
    Soup = gethtml(url)
    ###locate the address, price and link of each listing, then pick them out with find
    items = Soup.find_all(class_ = 'content__list--item')
    numberOfThisPage = len(items)
    if numberOfThisPage == 0:
        #an empty page means we have run past the last page; stop to avoid looping forever
        break
    print('This page has %d listings'%numberOfThisPage)
    print('')
    counter = 0
    for item in items:
        counter += 1
        Address = item.find(class_ = 'content__list--item--des').find_all('a')
        if not Address:
            #skip promoted/advertisement cards that carry no address links
            continue
        Address_DistrictName = Address[2].text
        Address_Location = Address[0].text + ',' + Address[1].text
        Price = item.find('em').text
        Website = item.find(class_ = 'content__list--item--title').find('a')['href']
        Website = 'https://ks.lianjia.com' + Website
        info.append([Address_DistrictName,Address_Location,Price,Website])
        if counter == numberOfThisPage:
            break
    ###write everything collected so far into a CSV file (rewritten once per page)
    fo = open("鏈家崑山租房資訊——全部.csv","w")
    for row in info:
        fo.write(",".join(row)+"\n")
    fo.close()
    TotalNumber += counter
    page += 1
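
One more note on the CSV step: the loop above simply joins the fields with commas and rewrites the whole file once per page. If the file comes out garbled when opened in Excel, a common remedy (shown here only as a sketch, using the standard csv module and a UTF-8 BOM, with illustrative header names) is to write everything once after the loop has finished:

import csv

#write all collected rows in one go; 'utf-8-sig' adds a BOM so Excel detects the encoding
with open("鏈家崑山租房資訊——全部.csv", "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.writer(f)
    writer.writerow(["district", "location", "price", "url"])  #illustrative header row
    writer.writerows(info)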
    

The scraping results are as follows (more than 800 listings in total):
[Results table omitted: district, location, price, listing URL]
