安裝BeautifulSoup

# 安裝bs4
# 官網地址 https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html#name
pip install beautifulsoup4
# 安裝解析器(可選): Python標準庫已自帶 html.parser 解析器, lxml 與 html5lib 是額外的第三方解析器
pip install lxml
pip install html5lib

使用

實驗網頁

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Document</title>
</head>
<body>
    <div style="height: 400px;background-color: whitesmoke;margin: 0 auto;">
        <h2 style="text-align: center;padding-top: 15px;">志研網</h2>
        <div style="margin: 0 auto;background-color: white;height: 200px;width: 500px;border: 1px solid rgb(172, 172, 172);">
            <h3 style="border-bottom: 1px solid  rgb(172, 172, 172);height: 40px;line-height: 40px;margin-top: 0;padding-left: 25px;">註冊驗證</h3>
            <p class="P1" id="S" style="margin-left: 25px;color:steelblue;">你好! yanggeol@qq.com</p>
            <p style="margin-left: 25px;color:steelblue;">歡迎註冊志研網，請將註冊碼填到相應頁面</p>
            <p style="margin-left: 25px;color:steelblue;">您的驗證碼是：965341</p>
            <p style="margin-left: 25px;color:gray;">@ginet.com</p>
        </div>
    </div>
</body>
</html>

基本使用

# 引入bs4
from bs4 import BeautifulSoup
# Parse index.html. The context manager closes the file as soon as parsing is
# done, and naming the parser keeps bs4 from guessing one (and warning about it).
with open("index.html", encoding='UTF-8') as html_file:
    soup = BeautifulSoup(html_file, "html.parser")
# Attribute access by tag name returns the first matching <p> tag
tag = soup.p
# Print the tag
print(tag)
# Type of the tag object
print(type(tag))
# Tag name
print(tag.name)
# Rename the tag; the change shows up in the generated markup
tag.name = "blockquote"
print(tag)
# Print the tag's class attribute
print(tag['class'])
# Print all attributes of the tag
print(tag.attrs)
# Print the tag's id
print(tag['id'])
# Delete attributes
del tag['class']
del tag['id']

<p class="P1" id="S" style="margin-left: 25px;color:steelblue;">你好! yanggeol@qq.com</p>
<class 'bs4.element.Tag'>
p
<blockquote class="P1" id="S" style="margin-left: 25px;color:steelblue;">你好! yanggeol@qq.com</blockquote>
['P1']
{'class': ['P1'], 'id': 'S', 'style': 'margin-left: 25px;color:steelblue;'}
S

多值屬性

# A multi-valued attribute is held as a list and joined back into a single
# space-separated value when the tag is converted to a string.
# The parser is named explicitly so bs4 does not have to guess one.
rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>', 'html.parser')
# print() is needed when this runs as a script; a bare expression shows nothing
print(rel_soup.a['rel'])
# ['index']
rel_soup.a['rel'] = ['index', 'contents']
print(rel_soup.p)
# <p>Back to the <a rel="index contents">homepage</a></p>

# When the document is parsed as XML, attributes are not treated as multi-valued
# (the 'xml' parser is provided by lxml).
xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
print(xml_soup.p['class'])
# body strikeout

列印標籤的字串

from bs4 import BeautifulSoup


# Parse index.html; the file is closed once parsing finishes and the parser
# is named explicitly instead of being guessed by bs4.
with open("index.html", encoding='UTF-8') as html_file:
    soup = BeautifulSoup(html_file, "html.parser")

tag = soup.p

print(tag.string)
# 你好! yanggeol@qq.com

# The string cannot be edited in place, but it can be replaced with another one
tag.string.replace_with("No longer bold")

print(tag.string)
# No longer bold

# 如果tag包含了多個子節點,tag就無法確定 .string 屬性應該取哪個子節點的內容,此時 .string 的結果是 None

遍歷文件樹

tag的屬性

from bs4 import BeautifulSoup


# Parse index.html; the file is closed once parsing finishes and the parser
# is named explicitly instead of being guessed by bs4.
with open("index.html", encoding='UTF-8') as html_file:
    soup = BeautifulSoup(html_file, "html.parser")

# Print <head>
print(soup.head)
# Print <title>
print(soup.title)
# Print the first <p> under <body>
print(soup.body.p)
# Print the first <p> in the document
print(soup.p)
# Get every <p>
print(soup.find_all('p'))
# .contents exposes the children of <head> as a list
print(soup.head.contents)
print(soup.head.contents[0])

tag = soup.head
print(tag.contents[1].name)
# The .children generator iterates over a tag's direct children
for child in tag.children:
    print(child)
# .descendants walks recursively over all descendants of a tag, strings included
for descendant in tag.descendants:
    print(descendant)

# When a tag holds several strings, iterate over them with .strings
for text in soup.strings:
    print(repr(text))
# The strings may carry a lot of spaces or blank lines; .stripped_strings removes the extra whitespace
for text in soup.stripped_strings:
    print(repr(text))
# Whitespace-only strings are skipped, and leading/trailing whitespace is stripped

節點

父節點

from bs4 import BeautifulSoup

# Parse index.html; the file is closed once parsing finishes and the parser
# is named explicitly instead of being guessed by bs4.
with open("index.html", encoding='UTF-8') as html_file:
    soup = BeautifulSoup(html_file, "html.parser")

tag = soup.h3

print(tag)
# .parent gives the parent node of an element
print(tag.parent)

# Output (abbreviated):
# <h3>註冊驗證</h3>
# <div>
#   <h3>註冊驗證</h3>
# </div>


# The .parent of the BeautifulSoup object itself is None.

# .parents iterates over every ancestor of the element. The iteration ends
# after the BeautifulSoup object, so it never yields None and no None check
# is needed inside the loop.
for parent in tag.parents:
    print(parent.name)

兄弟節點

# .next_sibling 和 .previous_sibling 屬性來查詢兄弟節點:
# 實際文件中的tag的 .next_sibling 和 .previous_sibling 屬性通常是字串或空白. 
from bs4 import BeautifulSoup

# Parse index.html; the file is closed once parsing finishes and the parser
# is named explicitly instead of being guessed by bs4.
with open("index.html", encoding='UTF-8') as html_file:
    soup = BeautifulSoup(html_file, "html.parser")

tag = soup.p
# Hop twice in each direction: the immediate sibling of a tag in a real
# document is usually the whitespace string between two tags.
print(tag.previous_sibling.previous_sibling)
print(tag.next_sibling.next_sibling)

# Output
# <h3 style="border-bottom: 1px solid  rgb(172, 172, 172);height: 40px;line-height: 40px;margin-top: 0;padding-left: 25px;">註冊驗證</h3>
# <p style="margin-left: 25px;color:steelblue;">歡迎註冊志研網，請將註冊碼填到相應頁面</p>

# .next_siblings 和 .previous_siblings 屬性可以對當前節點的兄弟節點迭代輸出

回退和前進

# .next_element 屬性指向解析過程中下一個被解析的物件(字串或tag)
# .previous_element 屬性剛好與 .next_element 相反,它指向當前被解析的物件的前一個解析物件
# 通過 .next_elements 和 .previous_elements 的迭代器就可以向前或向後訪問文件的解析內容,就好像文件正在被解析一樣

from bs4 import BeautifulSoup


# Parse index.html; the file is closed once parsing finishes and the parser
# is named explicitly instead of being guessed by bs4.
with open("index.html", encoding='UTF-8') as html_file:
    soup = BeautifulSoup(html_file, "html.parser")

tag = soup.p

# The object parsed immediately before / after this tag
print(tag.previous_element)
print(tag.next_element)

# .next_elements is a generator: printing it directly only shows the
# generator object, so iterate to see the elements that follow the tag.
for element in tag.next_elements:
    print(repr(element))

過濾器

# 用於查詢文件中所有的<b>標籤
soup.find_all('b')

# 找出所有以b開頭的標籤,這表示<body>和<b>標籤都應該被找到
import re
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
    
# 如果傳入列表引數,Beautiful Soup會將與列表中任一元素匹配的內容返回.
soup.find_all(["a", "b"])

# True 可以匹配任何值,下面程式碼查詢到所有的tag,但是不會返回字串節點
for tag in soup.find_all(True):
    print(tag.name)

# 如果沒有合適過濾器,那麼還可以定義一個方法,方法只接受一個元素引數,如果這個方法返回 True 表示當前元素匹配並且被找到,如果不是則返回 False
def has_class_but_no_id(tag):
    """Tell whether *tag* defines a ``class`` attribute but no ``id``.

    Written as a filter function: it takes a single element and its result
    decides whether that element counts as a match.
    """
    class_present = tag.has_attr('class')
    if not class_present:
        # Without a class attribute the id attribute is never consulted.
        return class_present
    return not tag.has_attr('id')
soup.find_all(has_class_but_no_id)

find_all

find_all(name, attrs,recursive,text, **kwargs)
# name 引數可以查詢所有名字為 name 的tag,字串物件會被自動忽略掉.
soup.find_all("title")
# 如果一個指定名字的引數不是搜尋內建的引數名,搜尋時會把該引數當作指定名字tag的屬性來搜尋,如果包含一個名字為 id 的引數,Beautiful Soup會搜尋每個tag的"id"屬性.
# 搜尋指定名字的屬性時可以使用的引數值包括 字串 , 正規表示式 , 列表, True .
# 使用多個指定名字的引數可以同時過濾tag的多個屬性
soup.find_all(id='link2')
# 有些tag屬性不能作為關鍵字引數來搜尋,比如HTML5中的 data-* 屬性(data-foo 不是合法的Python引數名)
# 但是可以通過 find_all() 方法的 attrs 引數定義一個字典引數來搜尋包含特殊屬性的tag
data_soup.find_all(attrs={"data-foo": "value"})

# 通過 class_搜尋CSS類名
# class_ 引數同樣接受不同型別的 過濾器 ,字串,正規表示式,方法或 True
soup.find_all("a", class_="sister")
# 搜尋 class 屬性時可以通過CSS值完全匹配
# 完全匹配 class 的值時,如果CSS類名的順序與實際不符,將搜尋不到結果

# 通過 text 引數可以搜尋文件中的字串內容.與 name 引數的可選值一樣, text 引數接受 字串 , 正規表示式 , 列表, True (Beautiful Soup 4.4.0 起該引數改名為 string, text 為舊名稱)
soup.find_all("a", text="Elsie")

# find_all() 方法返回全部的搜尋結果,如果文件樹很大那麼搜尋會很慢.如果我們不需要全部結果,可以使用 limit 引數限制返回結果的數量.效果與SQL中的limit關鍵字類似,當搜尋到的結果數量達到 limit 的限制時,就停止搜尋返回結果.
soup.find_all("a", limit=2)

# 呼叫tag的 find_all() 方法時,Beautiful Soup會檢索當前tag的所有子孫節點,如果只想搜尋tag的直接子節點,可以使用引數 recursive=False
soup.html.find_all("title", recursive=False)

# find_all() 幾乎是Beautiful Soup中最常用的搜尋方法,所以我們定義了它的簡寫方法. BeautifulSoup 物件和 tag 物件可以被當作一個方法來使用,這個方法的執行結果與呼叫這個物件的 find_all() 方法相同,下面兩行程式碼是等價的
soup.find_all("a")
soup("a")

find

find(name, attrs,recursive,text, **kwargs)
# 只返回第一個滿足條件的結果,找不到時返回 None

find_parents 和 find_parent

find_all() 和 find() 只搜尋當前節點的所有子節點,孫子節點等.

find_parents() 和 find_parent() 用來搜尋當前節點的父輩節點,搜尋方法與普通tag的搜尋方法相同,搜尋文件包含的內容

find_next_siblings 和 find_next_sibling

find_next_siblings() 方法返回所有符合條件的後面的兄弟節點

find_next_sibling() 只返回符合條件的後面的第一個tag節點

find_previous_siblings 和 find_previous_sibling

find_previous_siblings() 方法返回所有符合條件的前面的兄弟節點

find_previous_sibling() 方法返回第一個符合條件的前面的兄弟節點:

find_all_next 和 find_next

find_all_next() 方法返回當前節點之後所有符合條件的節點

find_next() 方法返回當前節點之後第一個符合條件的節點

find_all_previous 和 find_previous