Python 爬蟲:獲取 5i5j 房屋資訊並儲存到資料庫

暗香丶發表於2018-08-20
from urllib.parse import urljoin

import pymysql
from lxml import etree
from selenium import webdriver
 4 
 5 def Geturl(fullurl):#獲取每個招聘網頁的連結
 6     browser.get(fullurl)
 7     shouye_html_text = browser.page_source
 8     shouye_ele = etree.HTML(shouye_html_text)
 9     zf_list = shouye_ele.xpath(`/html/body/div[4]/div[1]/div[2]/ul/li/div/h3/a/@href`)#連結url
10     zf_url_list  = []
11     for zf_url_lost in zf_list:
12         zf_url  = `https://bj.5i5j.com`+zf_url_lost
13         zf_url_list.append(zf_url)
14     return zf_url_list
def _first_text(tree, path):
    """Return the first XPath match under ``tree`` as ``str``, or ``None`` if nothing matches."""
    matches = tree.xpath(path)
    return str(matches[0]) if matches else None


def Getinfo(zp_url_list):
    """Scrape each listing page and insert one row per listing into MySQL.

    For every URL the page is loaded in the module-level Selenium
    ``browser`` and six text nodes are read by positional XPath: title,
    price, layout, area, and a two-part "need" field (a ``span`` text plus
    an ``a`` text). The price gets the suffix ``元/月`` and the area the
    suffix ``平米`` before the row is written to the ``5i5j_info`` table and
    echoed to stdout.

    A listing whose page lacks any of the expected nodes is reported and
    skipped instead of aborting the whole run.

    Args:
        zp_url_list: Iterable of listing detail-page URLs.

    Returns:
        None. Results are written to the database and printed.
    """
    if not zp_url_list:
        # Nothing to store, so do not open a database connection at all.
        return

    # Order matters: it matches the unpacking below.
    field_paths = (
        "//html/body/div[3]/div[1]/div[1]/h1/text()",  # title
        "/html/body/div[3]/div[2]/div[2]/div[1]/div[1]/div/p[1]/text()",  # price
        "/html/body/div[3]/div[2]/div[2]/div[1]/div[2]/div/p[1]/text()",  # layout
        "/html/body/div[3]/div[2]/div[2]/div[1]/div[3]/div/p[1]/text()",  # area
        "/html/body/div[3]/div[2]/div[2]/div[2]/ul/li[1]/span/text()",  # need, label part
        "/html/body/div[3]/div[2]/div[2]/div[2]/ul/li[1]/a/text()",  # need, value part
    )
    sql = "INSERT INTO `5i5j_info` (`title`,`num`,`type`, `zone`,`need`) VALUES (%s,%s,%s,%s, %s)"

    # One connection for the whole batch instead of a connect/close cycle per
    # listing. The charset is set explicitly because the values contain
    # Chinese text and should not depend on the driver's default encoding.
    # NOTE(review): credentials are hard-coded; consider moving them to config.
    connection = pymysql.connect(
        host="localhost",
        user="root",
        password="1234",
        db="5i5j",
        charset="utf8mb4",
    )
    try:
        for zp_url in zp_url_list:
            browser.get(zp_url)
            tree = etree.HTML(browser.page_source)
            values = [_first_text(tree, path) for path in field_paths]
            if None in values:
                # The positional XPaths found nothing on this page; skip it
                # rather than raising IndexError and losing the rest.
                print("skipped (page layout not recognised):", zp_url)
                continue

            title, price, layout, area, need_label, need_value = values
            row = (
                title,
                price + "元/月",
                layout,
                area + "平米",
                need_label + need_value,
            )
            with connection.cursor() as cursor:
                cursor.execute(sql, row)
            # Commit per listing so rows already scraped survive a later failure.
            connection.commit()
            print(*row)
    finally:
        connection.close()
if __name__ == "__main__":
    # Geturl and Getinfo read this module-level name, so it must stay global.
    browser = webdriver.Chrome()
    try:
        pags = int(input("需要幾頁?"))
        # Results-page URL template; {} is the 1-based page number.
        url = "https://bj.5i5j.com/zufang/huilongguan/n{}/"
        for i in range(1, pags + 1):
            fullurl = url.format(i)
            zf_url_list = Geturl(fullurl)
            print(fullurl)
            Getinfo(zf_url_list)
    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window and leave the driver running. The finally
        # clause makes sure this happens even when scraping fails midway.
        browser.quit()

 

相關文章