日誌11月4日

sbqaqsjb發表於2020-11-04

BeautifulSoup常用方式程式碼:

from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup

def getBSObj(url):
    """Fetch ``url`` and parse the response body into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built with the ``lxml`` parser, or
        ``None`` when the URL is malformed, the request fails, or parsing
        raises ``AttributeError``.
    """
    try:
        response = urlopen(url)
    except (HTTPError, URLError, ValueError):
        # ValueError covers strings urlopen cannot interpret as a URL
        # (e.g. no scheme); report it like any other failed fetch.
        return None

    # Ensure the response is closed once its body has been consumed.
    with response:
        try:
            return BeautifulSoup(response.read(), features="lxml")
        except AttributeError:
            return None

# Download and parse the sample page; getBSObj returns None on failure.
bsObj = getBSObj("http://pythonscraping.com/pages/warandpeace.html")

# Identity check: "== None" would go through the object's own __eq__.
if bsObj is None:
    print("BSObj could not be found")
else:
    # Use find_all() to collect only the <span class="green"></span> tags.
    nameList = bsObj.find_all("span", {"class": "green"})
    for name in nameList:
        # get_text() strips all markup and returns only the text content.
        print(name.get_text())