Scraping Bilibili python data with xpath, bs4 and re

Posted by sixkery on 2018-08-07
import re
import requests
from lxml import etree
from bs4 import BeautifulSoup

def get_page(page):
    try:
        # The Cookie header is required here
        headers = {
            'Cookie': 'LIVE_BUVID=AUTO5015218915110407; sid=4oag5i0u; fts=1521891539; pgv_pvi=3655305216; UM_distinctid=16257cdfffd2e4-032750a28294b2-3b60450b-100200-16257cdfffe2a0; buvid3=7B94813D-1039-4A88-A1EE-9AEFDF54BE05140244infoc; rpdid=kxsliqpkisdosikxllmww; CURRENT_QUALITY=80; finger=edc6ecda',
            'Host': 'search.bilibili.com',
            'Referer': 'https://www.bilibili.com/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        }
        url = 'https://search.bilibili.com/all?keyword=python&from_source=banner_search&spm_id_from=333.334.banner_link.1&page={}'.format(str(page))
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
    except Exception:
        return None

# Parse the page with xpath; use the @ symbol to fetch attribute values
def xpath_parse_page(html):
    data = etree.HTML(html)
    items = data.xpath('//*[@class="video-contain clearfix"]/li')
    for item in items:
        yield {
            'video_image': item.xpath('./a/div/div[1]/img/@src')[0],
            'video_title': item.xpath('./div/div[1]/a/text()')[0],
            'video_view': item.xpath('./div/div[3]/span[1]/text()')[0].strip(),
            'video_updateTime': item.xpath('./div/div[3]/span[3]/text()')[0].strip(),
            'video_up': item.xpath('./div/div[3]/span[4]/a/text()')[0]
        }
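
# An alternative sketch: locate fields by class with contains() instead of positional
# indices, so small layout shifts are less likely to break the parser. It assumes the
# result markup really carries the class names the bs4 and regex parsers below rely on
# ('title', 'so-icon watch-num', 'so-icon time', 'up-name').
def xpath_parse_page_by_class(html):
    data = etree.HTML(html)
    items = data.xpath('//li[contains(@class, "video")]')
    for item in items:
        yield {
            'video_title': item.xpath('.//a[contains(@class, "title")]/text()')[0],
            'video_view': item.xpath('.//span[contains(@class, "watch-num")]/text()')[0].strip(),
            'video_updateTime': item.xpath('.//span[contains(@class, "so-icon time")]/text()')[0].strip(),
            'video_up': item.xpath('.//a[contains(@class, "up-name")]/text()')[0]
        }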

# Parse the page with bs4
def bs4_parse_page(html):
    soup = BeautifulSoup(html, 'lxml')
    items = soup.find_all('li', {'class': 'video matrix'})
    for item in items:
        yield {
            'video_image': item.find('img').get('src'),
            'video_title': item.find('a', {'class': 'title'}).get_text(),
            'video_view': item.find('span', {'class': 'so-icon watch-num'}).get_text().strip(),
            'video_updateTime': item.find('span', {'class': 'so-icon time'}).get_text().strip(),
            'video_up': item.find('span', {'title': 'up主'}).get_text()  # matching on class here returns the view count, so match on the title attribute instead
        }

# Parse the page with a regex; the key is pinning down each field's location, usually via the class attribute just before the field
def re_parse_page(html):
    pattern = re.compile('<li.*?info.*?title="(.*?)".*?icon-playtime"></i>(.*?)</span>.*?icon-date"></i>(.*?)</span>.*?up-name">(.*?)</a>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'video_title': item[0],
            'video_view': item[1].strip(),
            'video_updateTime': item[2].strip(),
            'video_up': item[3].strip()
        }

def main():
    # Handle pagination
    for page in range(1, 2):  # change the number of pages to crawl here
        html = get_page(page)
        for result in bs4_parse_page(html):  # switch the parser here: xpath_parse_page, bs4_parse_page or re_parse_page
            print(result)

if __name__ == '__main__':
    main()
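
# Optional follow-up (a sketch using the standard csv module): persist the rows instead
# of printing them. Collect the dicts from all pages into a list and pass it here once;
# the field names match the keys yielded by the parsers, and DictWriter fills any key a
# parser does not produce (e.g. video_image from re_parse_page) with an empty string.
import csv

def save_results(rows, path='bilibili_python.csv'):
    fieldnames = ['video_image', 'video_title', 'video_view', 'video_updateTime', 'video_up']
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)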

