爬取js渲染過的頁面(爬取一個婚慶網站為例)

wanghandou發表於2017-11-22
這個網站是js渲染過的,所以我們可以使用PhantomJS瀏覽器,或者在Network面板中找出需要POST的querystring引數,直接發請求就可以了,得到的是JSON
# !/usr/bin/python # -*- encoding: UTF-8 -*- from lxml import etree import urllib import urllib2 import jsonpath import json from lxml import etree class we(): def __init__(self): self.page=3 self.headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",} def meiyiye(self): item=[] headers=self.headers url="http://search.jiayuan.com/v2/search_v2.php" fromdata={ "sex":"f", "key":"", "stc":"1%3A41%2C2%3A19.27%2C3%3A155.170%2C23%3A1", "sn":"default", "sv":"1", "p":self.page,#表示爬取的第幾頁 "f":"", "listStyle":"bigPhoto", "pri_uid":"170703614", "jsversion":"v5"} data = urllib.urlencode(fromdata) request=urllib2.Request(url,data=data,headers=headers)#post請求的話需要data值,而get請求不需要data有值 response = urllib2.urlopen(request) #得到的是json格式的字串(字典行的) html = response.read() html1=html.replace("##jiayser##","").replace("//","") #把json轉換成python格式的unicode字串(列表形式的) content = json.loads(html1) id_list=jsonpath.jsonpath(content,"$..uid")#在content中匹配出需要的個人id,然後通過這個id拼接出個人主頁的連結 for i in id_list: item.append(i) self.page+=1 self.meiyigeren(item) #處理這一頁每一個人的頁面資訊 def meiyigeren(self,item): for id in item: print "*******************************************" print u"使用者id:"+str(id) url="http://www.jiayuan.com/"+str(id)+"?fxly=search_v2_index"#拼接連線,然後傳送請求,找到個人主頁中需要的有用的內容 print u"主頁連結:"+url headers=self.headers request=urllib2.Request(url,headers=headers) response = urllib2.urlopen(request) html = response.read() content=etree.HTML(html)#解析HTML文件為HTML DOM模型,然後下面就可以使用xpath匹配出想要的內容 username=content.xpath('//div[@class="main_1000 bg_white mt15"]//h4/text()') if len(username)==1: print username[0] else: print u"沒有名字" a=content.xpath('//div[@class="main_1000 bg_white mt15"]//h6[@class="member_name"]/text()') we=" ".join(a) ni=we.replace(","," ").replace(',',' ') ha=ni.split(" ") print u"年齡:"+ha[0] header_url=content.xpath('//div[@class="big_pic fn-clear"]//li[2]//tr//img[@class="img_absolute"]//@_src') if len(header_url)==1: 
header_urll=header_url[0] else: header_urll=u"沒有頭像連結:" print u"頭像連結:"+header_urll image_url=content.xpath('//div[@class="small_pic_box fn-clear"]//div[@class="small_pic fn-clear"]//li//img//@src') print u"相簿連結:", print image_url content1=content.xpath('//div[@class="main_1000 mt15 fn-clear"]//div[@class="bg_white"]//div[@class="js_text"]//text()') content2="" for i in content1: content2+=i content3=content2 print u"內心獨白:"+content3.strip() place=content.xpath('//div[@class="main_1000 bg_white mt15"]//h6[@class="member_name"]/a[2]/text()') if len(place)==1: where=place[0] else: where=u"河南" print u"來自:"+where+u"省" xueli=content.xpath('//div[@class="main_1000 bg_white mt15"]//ul[@class="member_info_list fn-clear"]/li[1]//div[@class="fl pr"]/em/text()') if len(xueli)==1: print u"學歷:"+xueli[0] else: print u"學歷:本科" print "***********************************************" if self.page<=5: self.meiyiye() if __name__=="__main__": ni=we() ni.meiyiye()

相關文章