使用BeautifulSoup爬取安智網的所有應用資訊

xianjie0318發表於2018-01-16

開發工具:

python版本:python2.7

開發工具:Eclipse

開發需求:

1、爬取安智網下的app應用資訊:應用分類、應用名稱、下載次數、上線時間、包大小、支援系統版本、資費、作者、軟體語言

2、從網頁可以看到安智應用標籤頁中右側:有大類、小類

3、可以根據大類找到所有的小類進行分類儲存

4、可以點選小類標籤,進入小類的應用列表

5、可以根據該小類中每頁的url判斷每頁的url組成

...............

開發思路

1、找到app應用分類的url規律

首先找到安智應用右側分類頁面的url:http://www.anzhi.com/widgetcat_1.html

然後通過html找到每一子類的url,如http://www.anzhi.com/sort_49_1_hot.html

最後獲取所有大類下的每子類的url

2、找到每子類中的app應用列表翻頁的url規律

第一頁:http://www.anzhi.com/sort_49_1_hot.html

第二頁:http://www.anzhi.com/sort_49_2_hot.html

第三頁:http://www.anzhi.com/sort_49_3_hot.html

..............

3、找到app應用的超連結的url規律

http://www.anzhi.com+href

其中href是從app應用列表的&lt;a&gt;標籤中取得的相對路徑

然後從應用詳情頁獲取需要的下載次數、上線時間、包大小、支援系統版本、資費、作者、軟體語言,並組裝到一個列表中,然後組裝字典

生成excel檔案,並把字典資料儲存進去

4、原始碼實現

首先,建立一個空白的excel檔案

#encoding:utf-8
#/usr/bin/python2.7
'''
Created on 2018年01月12日
@author: *********
'''
import xlwt
import time,os
class StatisticsReport(object):
    """Create and name the Excel report that stores the scraped app info.

    NOTE: ``t`` is evaluated once, at class-definition time, so every
    instance created in the same run shares the same timestamped name.
    """
    t = time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))

    def set_style(self, name, height, bold=False):
        """Return an xlwt cell style for the given font name/height/weight."""
        style = xlwt.XFStyle()
        font = xlwt.Font()
        font.name = name
        font.bold = bold
        # BUG FIX: xlwt's attribute is the British-spelled ``colour_index``;
        # the original ``color_index`` was silently ignored, so the colour
        # (4 = blue) never applied to the header cells.
        font.colour_index = 4
        font.height = height
        style.font = font
        return style

    def __createStatisticsReport__(self):
        """Create ``<timestamp>.xls`` with the header row and save it in the parent dir."""
        reportname = self.t + '.xls'
        self.__setreportname__(reportname)
        report_file = xlwt.Workbook()
        # A single sheet holds every scraped Android app record.
        report_file.add_sheet(u'Android應用資訊', cell_overwrite_ok=True)
        sheet = report_file.get_sheet(0)
        # Header columns: parent category, sub category, app name, download
        # count, release date, package size, supported OS version, price,
        # author, software language.
        row0 = [u'父分類', u'子分類', u'應用名稱', u'下載次數', u'上線時間',
                u'包大小', u'支援系統版本', u'資費', u'作者', u'軟體語言']
        for col, title in enumerate(row0):
            sheet.write(0, col, title, self.set_style('Times New Roman', 220, True))
        # os.path.join is portable; the original hard-coded a Windows '\\'.
        reportpath = os.path.join(os.path.abspath(".."), reportname)
        print(reportpath)
        report_file.save(reportpath)

    def __setreportname__(self, reportname):
        """Store the generated report file name on the instance."""
        self.reportname = reportname

    def __getreportname__(self):
        """Return the report file name set by __createStatisticsReport__."""
        return self.reportname

然後,迴圈找到app的應用資訊並實時儲存

#/usr/bin/python
#encoding:utf-8
'''
Created on 2018年01月12日

@author: ********
'''
import urllib2,re
from bs4 import BeautifulSoup
import xlrd,os
from xlutils.copy import copy
from StatisticsReport1 import StatisticsReport

def GetAppinfo(urlhead, page, report):
    """Crawl anzhi.com category pages and append each app's details to the report.

    urlhead -- category overview URL, e.g. http://www.anzhi.com/widgetcat_1.html
    page    -- legacy page limit, no longer used (pagination runs until a page
               has no app entries)
    report  -- StatisticsReport instance that knows the .xls file name

    Each app row is written (and the workbook saved) immediately, so partial
    progress survives a crash.  Returns None.
    """
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'}
    # Fetch the category overview page with a browser-like request header.
    try:
        request = urllib2.Request(url=urlhead, headers=head)
        reponse = urllib2.urlopen(request)
    except Exception:
        # BUG FIX: the original printed and fell through, then crashed with a
        # NameError on the unbound response object.  Abort explicitly instead.
        print(u"父類標籤頁面,出現異常,終止")
        return
    appi_html = reponse.read().decode('utf-8')
    appi_htmltables = BeautifulSoup(appi_html, 'lxml')
    # Each <dl> is one parent category; its <h2> holds the category name and
    # its <a> children link to the sub-categories.
    get_linkdl_list = appi_htmltables.find_all('dl')
    i = 0  # number of app rows written so far
    reportpath = os.path.abspath("..") + '\\'
    reportname = report.__getreportname__()
    for dllink in get_linkdl_list:
        Fatherclassname = dllink.h2.get_text()
        for alink in dllink.find_all('a'):
            href = alink.get('href')
            # Only sub-category links look like /sort_<id>_1_hot.html.
            if href.find('/sort_') == -1:
                continue
            hrefstr = re.findall(r"sort_(.+?)_1_hot.html", href)[0]
            subclassname = alink.get_text()
            n = 1
            while True:
                # Page n of this sub-category.
                get_subclassurl = 'http://www.anzhi.com/sort_%s_%d_hot.html' % (hrefstr, n)
                n += 1
                try:
                    sub_request = urllib2.Request(url=get_subclassurl, headers=head)
                    sub_reponse = urllib2.urlopen(sub_request)
                except Exception:
                    # BUG FIX: stop paging on error instead of reusing the
                    # previous (or unbound) response object.
                    print(u"子類頁碼頁面,出現異常,終止")
                    break
                get_app_html = sub_reponse.read().decode('utf-8')
                app_subhtmltables = BeautifulSoup(get_app_html, 'lxml')
                get_subapp_spanlist = app_subhtmltables.find_all('span', {"class": "app_name"})
                if not get_subapp_spanlist:
                    # An empty page marks the end of this sub-category.
                    print(u"當前頁面無app資料,跳出迴圈")
                    break
                for get_subapp_span in get_subapp_spanlist:
                    get_apphref = get_subapp_span.find_all('a')[0].get('href')
                    get_appurl = "http://www.anzhi.com" + get_apphref
                    appname = get_subapp_span.get_text()
                    try:
                        app_request = urllib2.Request(url=get_appurl, headers=head)
                        app_reponse = urllib2.urlopen(app_request)
                    except Exception:
                        # Skip just this app; keep crawling the rest.
                        print(u"App頁面,出現異常,終止,繼續")
                        continue
                    app_html = app_reponse.read().decode('utf-8')
                    app_apphtmltables = BeautifulSoup(app_html, 'lxml')
                    get_app_lilist = app_apphtmltables.find_all('ul', attrs={"id": "detail_line_ul"})
                    if not get_app_lilist:
                        print(u"app頁面,無詳情資訊,跳出迴圈")
                        break
                    # <li> 1..7: downloads, release date, package size,
                    # supported OS version, price, author, language.
                    infolist = get_app_lilist[0].find_all('li')
                    try:
                        details = [infolist[k].get_text() for k in range(1, 8)]
                    except IndexError:
                        details = [''] * 7
                    row = [Fatherclassname, subclassname, appname] + details
                    # Re-open/copy/save the workbook per row so progress is
                    # persisted immediately (slow but crash-safe).
                    bk = xlrd.open_workbook(reportpath + reportname)
                    wb = copy(bk)
                    wa = wb.get_sheet(0)
                    for j, value in enumerate(row):
                        wa.write(i + 1, j, value)
                    i += 1
                    wb.save(reportpath + reportname)
            print(u"爬取到子類名稱:" + subclassname)
    print(u'已經爬取app總數:%d' % i)
def GenerateReport(report, job_dict):
    """Write every row held in job_dict into the first sheet of the report.

    report   -- StatisticsReport instance that knows the .xls file name
    job_dict -- mapping whose values are row lists; rows are written in
                dict iteration order starting at sheet row 1
    """
    target = os.path.abspath("..") + '\\' + report.__getreportname__()
    workbook = copy(xlrd.open_workbook(target))
    sheet = workbook.get_sheet(0)
    for row_idx, row in enumerate(job_dict.values()):
        for col_idx, value in enumerate(row):
            sheet.write(row_idx + 1, col_idx, value)
    workbook.save(target)


if __name__ == '__main__':
    # Create the empty timestamped report, then crawl anzhi.com's category
    # overview page and fill the report row by row.
    report = StatisticsReport()
    report.__createStatisticsReport__()
    start_url = 'http://www.anzhi.com/widgetcat_1.html'
    page = 1
    app_dict = GetAppinfo(start_url, page, report)
#     GenerateReport(report,app_dict)
#     GenerateReport(report,app_dict)


總結:

該方式消耗的cpu和網路資源比較大,穩定,但效率較慢,後續研究效能較快的方式    
   

相關文章