Python爬資料之全國中小學資訊

zenobia119發表於2018-07-08

爬取網站:http://www.xuexiaodaquan.com/ 學校大全

技術路線: requests + BeautifulSoup

貌似這個網站的反爬蟲機制還挺強的,經常會返回自動跳轉的139網站,所以得換著IP多試幾次

需要事先準備一份中國城市名稱與拼音的對照表,存在EXCEL中(程式碼讀取的是 D:\中國省市名稱拼音.xls 裡名為 city 的工作表),格式是第一列:市名;第二列:拼音;到市級就可以。

需要挖掘哪些城市就放哪些,如果挖全國,就要放所有市名。

如:


輸出是一個EXCEL,包括:

城市 型別 學校名稱 地址 電話 網址

如:


直接上程式碼:

from bs4 import BeautifulSoup
import requests
import re
import sys
import xlwt
import xlrd
from xlutils.copy import copy

# Fetch the HTML of a page.
def getHtmlText(url, code="GBK"):
    """Download *url* and return the response body decoded as *code*.

    Args:
        url: Address of the page to fetch.
        code: Encoding forced onto the response before it is decoded
            (defaults to "GBK").

    Returns:
        The decoded page text, or the sentinel string "獲取html異常" when the
        request fails (invalid URL, connection error, timeout, or an HTTP
        error status).
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36'}
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # Best-effort crawl: a failed page yields the sentinel text instead of
        # aborting the run. Only request-level errors are caught so that
        # Ctrl-C (KeyboardInterrupt) and programming errors still surface.
        return "獲取html異常"
# Parse the region links and return them as a {link text: href} dict.
# NOTE: this helper is disabled. The whole definition below sits inside a
# triple-quoted string literal, so getGroundList is never actually defined.
# It looks up the <dl class="nobackground"> element and collects every <a>
# inside it except the first one (the loop starts at index 1).
'''
def getGroundList(htext):
    try:
        grounddict = {}
        soup = BeautifulSoup(htext, "html.parser")
        gdname = soup.find('dl', attrs={'class':'nobackground'})
        keyList = gdname.find_all('a')
        for i in range(1,len(keyList)):
            key = keyList[i].text
            val = keyList[i].get('href')
            grounddict[key] = val
        return grounddict
    except:
        print("getGroundList異常")
'''
# Parse the number of the last result page.
def getPageCode(htext, typeitem):
    """Return the last page number advertised by a listing page.

    The page is searched for an ``<a class="last">`` link whose href has the
    form ``<typeitem>pn<digits>.html``.

    Args:
        htext: HTML text of a listing page.
        typeitem: URL path of the school type, e.g. '/xiaoxue/'.

    Returns:
        The captured digits as a string when the link is found and matches
        (kept as a string for backward compatibility; callers convert with
        ``int()``). In every other case, including a parsing failure, the
        integer 0 is returned, so the result is always safe to pass to
        ``int()``.
    """
    try:
        soup = BeautifulSoup(htext, "html.parser")
        last_link = soup.find('a', attrs={'class': 'last'})
        href = last_link.get('href') if last_link else None
        if href:
            # Escape the path and the dot so both are matched literally.
            match = re.search(re.escape(typeitem) + r'pn([0-9]+)\.html', href)
            if match:
                return match.group(1)
    except Exception:
        # Best-effort: a page that cannot be parsed counts as "no more pages".
        print("getPageCode異常")
    return 0
    

# Parse school records (name, address, phone, website) and save each one.
def getSchoolList(htext, fileAddress, cityitem, typeitem):
    """Extract every school entry from a listing page and append it to Excel.

    Entries are the ``<dl class="left">`` and ``<dl class="right">`` elements.
    For each entry the text of its first ``<p>`` is used as the name and the
    text of its first three ``<li>`` items as address, phone and website.

    Args:
        htext: HTML text of a listing page.
        fileAddress: Path of the workbook passed on to ``savefile``.
        cityitem: City label stored with every record.
        typeitem: School-type label stored with every record.

    Returns:
        None. Records are written through ``savefile``. Entries that lack a
        name or have fewer than three ``<li>`` items are skipped, so that one
        malformed entry no longer discards the rest of the page.
    """
    try:
        soup = BeautifulSoup(htext, "html.parser")
        entries = (soup.find_all('dl', attrs={'class': 'left'})
                   + soup.find_all('dl', attrs={'class': 'right'}))
    except Exception:
        print("getSchoolList異常")
        return

    for item in entries:
        name_tag = item.find('p')
        fields = item.find_all('li')
        if name_tag is None or len(fields) < 3:
            continue  # malformed entry: skip it, keep the remaining ones
        # A fresh dict per record, so no stale field can leak between rows.
        schoolDict = {
            '城市': cityitem,
            '型別': typeitem,
            '學習名稱': name_tag.text,
            '地址': fields[0].text,
            '電話': fields[1].text,
            '網址': fields[2].text,
        }
        try:
            savefile(schoolDict, fileAddress)
        except Exception:
            # A failing save (e.g. the workbook is locked or unreadable) is
            # unlikely to recover within the same page, so stop here.
            print("getSchoolList異常")
            return

# Append one record to the Excel workbook.
def savefile(schoolDict, fileAddress):
    """Append one school record as a new row of the first sheet.

    Args:
        schoolDict: Record with the keys '城市', '型別', '學習名稱', '地址',
            '電話' and '網址'; they are written to columns 0-5 in that order.
        fileAddress: Path of the .xls workbook. It is created with a single
            sheet if it does not exist yet.

    Raises:
        KeyError: If ``schoolDict`` lacks one of the expected keys.

    NOTE: the workbook is re-read and fully rewritten on every call, and the
    .xls format written by xlwt holds at most 65536 rows per sheet.
    """
    columns = ('城市', '型別', '學習名稱', '地址', '電話', '網址')
    try:
        # No mode argument: the second positional parameter of
        # xlrd.open_workbook is ``logfile``, not a file mode.
        workbook = xlrd.open_workbook(fileAddress)
    except FileNotFoundError:
        # First record of a fresh run: start a new workbook.
        wb = xlwt.Workbook()
        ws = wb.add_sheet('Sheet1')
        rowNum = 0
    else:
        rowNum = workbook.sheet_by_index(0).nrows
        # xlrd workbooks are read-only; xlutils.copy yields a writable clone.
        wb = copy(workbook)
        ws = wb.get_sheet(0)
    for col, key in enumerate(columns):
        ws.write(rowNum, col, schoolDict[key])
    wb.save(fileAddress)
        
# Load the city list, which is stored in an Excel file.
def getCityList(cityFileAddress=r'D:\中國省市名稱拼音.xls'):
    """Read the city-name -> pinyin mapping from the 'city' sheet.

    Column 0 of the sheet holds the city name and column 1 its pinyin, which
    is stripped of surrounding whitespace and lower-cased.

    Args:
        cityFileAddress: Path of the workbook to read. Defaults to the
            location that was previously hard-coded.

    Returns:
        A dict mapping city name to lower-case pinyin. Rows whose name is
        empty or whose pinyin is not a non-empty string are skipped. An empty
        dict is returned (after printing "getCityList失敗") when the workbook
        or the sheet cannot be read, so callers can always iterate the result.
    """
    try:
        book = xlrd.open_workbook(cityFileAddress)
        sheet = book.sheet_by_name('city')
        # Fetch each column once instead of once per row.
        names = sheet.col_values(0)
        pinyins = sheet.col_values(1)
    except (OSError, IndexError, xlrd.XLRDError):
        print("getCityList失敗")
        return {}

    cityDic = {}
    for name, pinyin in zip(names, pinyins):
        if not name or not isinstance(pinyin, str) or not pinyin.strip():
            continue  # blank or non-text row: cannot form a host name
        cityDic[name] = pinyin.strip().lower()
    return cityDic
            
def main():
    """Crawl every configured city and school type and save the results.

    For each city returned by ``getCityList`` and each school type (primary,
    junior high, senior high), the first listing page is fetched and saved.
    Pages 2 up to the last page number reported by ``getPageCode`` are then
    fetched the same way. All records are appended to 'D:/school.xls'.
    """
    typeList = {'小學':'/xiaoxue/','初中':'/chuzhong/','高中':'/gaozhong/'}
    fileAddress = 'D:/school.xls'
    # ``or {}`` keeps the loop safe if the city list could not be loaded.
    cityList = getCityList() or {}
    for cityitem, cityPinyin in cityList.items():
        searchUrl = 'http://' + cityPinyin + '.xuexiaodaquan.com'
        for typeitem, typePath in typeList.items():
            listUrl = searchUrl + typePath
            htext = getHtmlText(listUrl)
            getSchoolList(htext, fileAddress, cityitem, typeitem)
            # getPageCode may yield a digit string, 0 or None; normalise to
            # int so that a missing pager means "single page" and not a crash.
            pagecode = int(getPageCode(htext, typePath) or 0)
            for i in range(2, pagecode + 1):
                h1text = getHtmlText(listUrl + 'pn' + str(i) + '.html')
                getSchoolList(h1text, fileAddress, cityitem, typeitem)


# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

相關文章