一、安裝BeautifulSoup庫
可以檢視目前python安裝了哪些包(例如執行 pip list)
安裝beautifulsoup
二、beautifulsoup官網
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
三、beautifulsoup的主要解析器
四、beautifulsoup的find函式
查詢html的title
from bs4 import BeautifulSoup
html="""
<html lang="en">
<head>
<meta charset="UTF-8">
<title>bobby基本資訊</title>
<script src="jquery-3.5.1.min.js"></script>
</head>
<body>
<div id="info">
<p style="color: blue">講師資訊</p>
<div class="teacher_info">
Python全棧工程師
<p class="age">年齡:29</p>
<p class="name">姓名:bobby</p>
<p class="work_years">工作年限:7年</p>
<p class="position">職位:python開發工程師</p>
</div>
<p style="color:aquamarine">課程資訊</p>
<table class="courses">
<tbody><tr><th>課程名稱</th>
<th>講師</th>
<th>地址</th>
</tr><tr>
<td>django打造線上教育</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/78.html">訪問</a></td>
</tr><tr>
<td>python高階程式設計</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/200.html">訪問</a></td>
</tr><tr>
<td>scrapy分散式爬蟲</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/92.html">訪問</a></td>
</tr><tr>
<td>diango rest framework打造生鮮電商</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/131.html">訪問</a></td>
</tr><tr>
<td>tornado從入門到精通</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/290.html">訪問</a></td>
</tr></tbody></table>
</div>
</body>
</html>
"""
# Build the parse tree with the "html.parser" backend.
bs = BeautifulSoup(html, "html.parser")

# Dot access on the soup (bs.title) yields only the first matching element.
div_tag1 = bs.title

# .string is the text held by that <title> tag.
title_tag = div_tag1.string
print(title_tag)
print("div_tag1:" + str(div_tag1))
輸出結果:
查詢html中的div元素
from bs4 import BeautifulSoup
html="""
<html lang="en">
<head>
<meta charset="UTF-8">
<title>bobby基本資訊</title>
<script src="jquery-3.5.1.min.js"></script>
</head>
<body>
<div id="info">
<p style="color: blue">講師資訊</p>
<div class="teacher_info">
Python全棧工程師
<p class="age">年齡:29</p>
<p class="name">姓名:bobby</p>
<p class="work_years">工作年限:7年</p>
<p class="position">職位:python開發工程師</p>
</div>
<p style="color:aquamarine">課程資訊</p>
<table class="courses">
<tbody><tr><th>課程名稱</th>
<th>講師</th>
<th>地址</th>
</tr><tr>
<td>django打造線上教育</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/78.html">訪問</a></td>
</tr><tr>
<td>python高階程式設計</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/200.html">訪問</a></td>
</tr><tr>
<td>scrapy分散式爬蟲</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/92.html">訪問</a></td>
</tr><tr>
<td>diango rest framework打造生鮮電商</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/131.html">訪問</a></td>
</tr><tr>
<td>tornado從入門到精通</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/290.html">訪問</a></td>
</tr></tbody></table>
</div>
</body>
</html>
"""
# Parse the document, then ask for the first <div> that find() encounters.
bs = BeautifulSoup(html, "html.parser")
div_tag2 = bs.find(name="div")
print(f"div_tag2:{div_tag2!s}")
輸出結果:
查詢html中的所有P元素
from bs4 import BeautifulSoup
html="""
<html lang="en">
<head>
<meta charset="UTF-8">
<title>bobby基本資訊</title>
<script src="jquery-3.5.1.min.js"></script>
</head>
<body>
<div id="info">
<p style="color: blue">講師資訊</p>
<div class="teacher_info">
Python全棧工程師
<p class="age">年齡:29</p>
<p class="name">姓名:bobby</p>
<p class="work_years">工作年限:7年</p>
<p class="position">職位:python開發工程師</p>
</div>
<p style="color:aquamarine">課程資訊</p>
<table class="courses">
<tbody><tr><th>課程名稱</th>
<th>講師</th>
<th>地址</th>
</tr><tr>
<td>django打造線上教育</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/78.html">訪問</a></td>
</tr><tr>
<td>python高階程式設計</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/200.html">訪問</a></td>
</tr><tr>
<td>scrapy分散式爬蟲</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/92.html">訪問</a></td>
</tr><tr>
<td>diango rest framework打造生鮮電商</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/131.html">訪問</a></td>
</tr><tr>
<td>tornado從入門到精通</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/290.html">訪問</a></td>
</tr></tbody></table>
</div>
</body>
</html>
"""
# Parse the sample document with the "html.parser" backend.
bs = BeautifulSoup(html, "html.parser")

# find_all returns every matching element, not just the first one.
div_tag3 = bs.find_all("p")
print("p:" + str(div_tag3))

# The loop body has to be indented to be valid Python; .string is the text
# inside each <p>.
for p in div_tag3:
    print(p.string)
輸出結果:
指定id進行html查詢
from bs4 import BeautifulSoup
html="""
<html lang="en">
<head>
<meta charset="UTF-8">
<title>bobby基本資訊</title>
<script src="jquery-3.5.1.min.js"></script>
</head>
<body>
<div id="info">
<p style="color: blue">講師資訊</p>
<div class="teacher_info">
Python全棧工程師
<p class="age">年齡:29</p>
<p class="name">姓名:bobby</p>
<p class="work_years">工作年限:7年</p>
<p class="position">職位:python開發工程師</p>
</div>
<p style="color:aquamarine">課程資訊</p>
<table class="courses">
<tbody><tr><th>課程名稱</th>
<th>講師</th>
<th>地址</th>
</tr><tr>
<td>django打造線上教育</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/78.html">訪問</a></td>
</tr><tr>
<td>python高階程式設計</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/200.html">訪問</a></td>
</tr><tr>
<td>scrapy分散式爬蟲</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/92.html">訪問</a></td>
</tr><tr>
<td>diango rest framework打造生鮮電商</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/131.html">訪問</a></td>
</tr><tr>
<td>tornado從入門到精通</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/290.html">訪問</a></td>
</tr></tbody></table>
</div>
</body>
</html>
"""
bs = BeautifulSoup(html, "html.parser")

# Keyword arguments filter on attributes: here, the element whose id is "info".
div_tag4 = bs.find(id="info")
print(f"div_tag4:{div_tag4!s}")

# find_all with a tag name plus the same id filter returns a list of matches.
div_tag5 = bs.find_all("div", id="info")
print(f"div_tag5:{div_tag5!s}")
輸出結果:
正規表示式匹配元素
import re
from bs4 import BeautifulSoup
html="""
<html lang="en">
<head>
<meta charset="UTF-8">
<title>bobby基本資訊</title>
<script src="jquery-3.5.1.min.js"></script>
</head>
<body>
<div id="info-955">
<p style="color: blue">講師資訊</p>
<div class="teacher_info">
Python全棧工程師
<p class="age">年齡:29</p>
<p class="name">姓名:bobby</p>
<p class="work_years">工作年限:7年</p>
<p class="position">職位:python開發工程師</p>
</div>
<p style="color:aquamarine">課程資訊</p>
<table class="courses">
<tbody><tr><th>課程名稱</th>
<th>講師</th>
<th>地址</th>
</tr><tr>
<td>django打造線上教育</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/78.html">訪問</a></td>
</tr><tr>
<td>python高階程式設計</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/200.html">訪問</a></td>
</tr><tr>
<td>scrapy分散式爬蟲</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/92.html">訪問</a></td>
</tr><tr>
<td>diango rest framework打造生鮮電商</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/131.html">訪問</a></td>
</tr><tr>
<td>tornado從入門到精通</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/290.html">訪問</a></td>
</tr></tbody></table>
</div>
</body>
</html>
"""
bs = BeautifulSoup(html, "html.parser")

# Use a raw string so that \d reaches the regex engine untouched; in a normal
# string literal "\d" is an invalid escape sequence and triggers a warning.
# The pattern matches ids such as "info-955".
div_tag = bs.find("div", id=re.compile(r"info-\d+"))
print(div_tag)
輸出結果:
根據網頁字串定位元素
import re
from bs4 import BeautifulSoup
html="""
<html lang="en">
<head>
<meta charset="UTF-8">
<title>bobby基本資訊</title>
<script src="jquery-3.5.1.min.js"></script>
</head>
<body>
<div id="info-955">
<p style="color: blue">講師資訊</p>
<div class="teacher_info">
Python全棧工程師
<p class="age">年齡:29</p>
<p class="name">姓名:bobby</p>
<p class="work_years">工作年限:7年</p>
<p class="position">職位:python開發工程師</p>
</div>
<p style="color:aquamarine">課程資訊</p>
<table class="courses">
<tbody><tr><th>課程名稱</th>
<th>講師</th>
<th>地址</th>
</tr><tr>
<td>django打造線上教育</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/78.html">訪問</a></td>
</tr><tr>
<td>python高階程式設計</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/200.html">訪問</a></td>
</tr><tr>
<td>scrapy分散式爬蟲</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/92.html">訪問</a></td>
</tr><tr>
<td>diango rest framework打造生鮮電商</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/131.html">訪問</a></td>
</tr><tr>
<td>tornado從入門到精通</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/290.html">訪問</a></td>
</tr></tbody></table>
</div>
</body>
</html>
"""
bs = BeautifulSoup(html, "html.parser")

# Search by text content instead of by tag name or attribute.
target_text = "django打造線上教育"
div_tag = bs.find(string=target_text)
print(div_tag)
輸出結果:
輸出dom樹子標籤的標籤名
import re
from bs4 import BeautifulSoup
html="""
<html lang="en">
<head>
<meta charset="UTF-8">
<title>bobby基本資訊</title>
<script src="jquery-3.5.1.min.js"></script>
</head>
<body>
<div id="info-955">
<p style="color: blue">講師資訊</p>
<div class="teacher_info">
Python全棧工程師
<p class="age">年齡:29</p>
<p class="name">姓名:bobby</p>
<p class="work_years">工作年限:7年</p>
<p class="position">職位:python開發工程師</p>
</div>
<p style="color:aquamarine">課程資訊</p>
<table class="courses">
<tbody><tr><th>課程名稱</th>
<th>講師</th>
<th>地址</th>
</tr><tr>
<td>django打造線上教育</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/78.html">訪問</a></td>
</tr><tr>
<td>python高階程式設計</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/200.html">訪問</a></td>
</tr><tr>
<td>scrapy分散式爬蟲</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/92.html">訪問</a></td>
</tr><tr>
<td>diango rest framework打造生鮮電商</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/131.html">訪問</a></td>
</tr><tr>
<td>tornado從入門到精通</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/290.html">訪問</a></td>
</tr></tbody></table>
</div>
</body>
</html>
"""
bs = BeautifulSoup(html, "html.parser")

# Raw string keeps \d as the regex digit class instead of an invalid str escape.
div_tag = bs.find("div", id=re.compile(r"info-\d+"))

# .contents holds the direct children of the matched <div>.
childrens = div_tag.contents
for child in childrens:
    # Text nodes have no tag name, so only real tags are printed.
    if child.name:
        print(child.name)

# .descendants walks every nested node, not only the direct children.
childrens_childrens = div_tag.descendants
for child_child in childrens_childrens:
    if child_child.name:
        print(child_child.name)
輸出如下:輸出子標籤的標籤名,遍歷子元素
輸出dom樹的父標籤的標籤名
import re
from bs4 import BeautifulSoup
html="""
<html lang="en">
<head>
<meta charset="UTF-8">
<title>bobby基本資訊</title>
<script src="jquery-3.5.1.min.js"></script>
</head>
<body>
<div id="info-955">
<p style="color: blue">講師資訊</p>
<div class="teacher_info">
Python全棧工程師
<p class="age">年齡:29</p>
<p class="name">姓名:bobby</p>
<p class="work_years">工作年限:7年</p>
<p class="position">職位:python開發工程師</p>
</div>
<p style="color:aquamarine">課程資訊</p>
<table class="courses">
<tbody><tr><th>課程名稱</th>
<th>講師</th>
<th>地址</th>
</tr><tr>
<td>django打造線上教育</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/78.html">訪問</a></td>
</tr><tr>
<td>python高階程式設計</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/200.html">訪問</a></td>
</tr><tr>
<td>scrapy分散式爬蟲</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/92.html">訪問</a></td>
</tr><tr>
<td>diango rest framework打造生鮮電商</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/131.html">訪問</a></td>
</tr><tr>
<td>tornado從入門到精通</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/290.html">訪問</a></td>
</tr></tbody></table>
</div>
</body>
</html>
"""
bs = BeautifulSoup(html, "html.parser")

# .parents iterates over the ancestors of the matched <p class="name">.
parents = bs.find("p", {"class": "name"}).parents

# The loop body has to be indented to be valid Python.
for parent in parents:
    print(parent.name)
輸出結果:
輸出dom樹的兄弟標籤的標籤名
輸出後面所有兄弟節點的文字內容
import re
from bs4 import BeautifulSoup
html="""
<html lang="en">
<head>
<meta charset="UTF-8">
<title>bobby基本資訊</title>
<script src="jquery-3.5.1.min.js"></script>
</head>
<body>
<div id="info-955">
<p style="color: blue">講師資訊</p>
<div class="teacher_info">
Python全棧工程師
<p class="age">年齡:29</p>
<p class="name">姓名:bobby</p>
<p class="work_years">工作年限:7年</p>
<p class="position">職位:python開發工程師</p>
</div>
<p style="color:aquamarine">課程資訊</p>
<table class="courses">
<tbody><tr><th>課程名稱</th>
<th>講師</th>
<th>地址</th>
</tr><tr>
<td>django打造線上教育</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/78.html">訪問</a></td>
</tr><tr>
<td>python高階程式設計</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/200.html">訪問</a></td>
</tr><tr>
<td>scrapy分散式爬蟲</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/92.html">訪問</a></td>
</tr><tr>
<td>diango rest framework打造生鮮電商</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/131.html">訪問</a></td>
</tr><tr>
<td>tornado從入門到精通</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/290.html">訪問</a></td>
</tr></tbody></table>
</div>
</body>
</html>
"""
bs = BeautifulSoup(html, "html.parser")

# .next_siblings iterates over the nodes that follow <p class="age"> under the
# same parent; this includes the text nodes between tags.
next_siblings = bs.find("p", {"class": "age"}).next_siblings

# The loop body has to be indented to be valid Python.
for sibling in next_siblings:
    print(sibling.string)
輸出結果:
輸出前面所有兄弟節點的文字內容
import re
from bs4 import BeautifulSoup
html="""
<html lang="en">
<head>
<meta charset="UTF-8">
<title>bobby基本資訊</title>
<script src="jquery-3.5.1.min.js"></script>
</head>
<body>
<div id="info-955">
<p style="color: blue">講師資訊</p>
<div class="teacher_info">
Python全棧工程師
<p class="age">年齡:29</p>
<p class="name">姓名:bobby</p>
<p class="work_years">工作年限:7年</p>
<p class="position">職位:python開發工程師</p>
</div>
<p style="color:aquamarine">課程資訊</p>
<table class="courses">
<tbody><tr><th>課程名稱</th>
<th>講師</th>
<th>地址</th>
</tr><tr>
<td>django打造線上教育</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/78.html">訪問</a></td>
</tr><tr>
<td>python高階程式設計</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/200.html">訪問</a></td>
</tr><tr>
<td>scrapy分散式爬蟲</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/92.html">訪問</a></td>
</tr><tr>
<td>diango rest framework打造生鮮電商</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/131.html">訪問</a></td>
</tr><tr>
<td>tornado從入門到精通</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/290.html">訪問</a></td>
</tr></tbody></table>
</div>
</body>
</html>
"""
bs = BeautifulSoup(html, "html.parser")

# .previous_siblings iterates over the nodes that precede <p class="name">
# under the same parent; this includes the text nodes between tags.
previous_siblings = bs.find("p", {"class": "name"}).previous_siblings

# The loop body has to be indented to be valid Python.
for sibling in previous_siblings:
    print(sibling.string)
輸出結果:
如果要輸出前一個兄弟標籤的文字內容,需要去掉兩個標籤之間的回車換行符
import re
from bs4 import BeautifulSoup
html="""
<html lang="en">
<head>
<meta charset="UTF-8">
<title>bobby基本資訊</title>
<script src="jquery-3.5.1.min.js"></script>
</head>
<body>
<div id="info-955">
<p style="color: blue">講師資訊</p>
<div class="teacher_info">
Python全棧工程師
<p class="age">年齡:29</p><p class="name">姓名:bobby</p>
<p class="work_years">工作年限:7年</p>
<p class="position">職位:python開發工程師</p>
</div>
<p style="color:aquamarine">課程資訊</p>
<table class="courses">
<tbody><tr><th>課程名稱</th>
<th>講師</th>
<th>地址</th>
</tr><tr>
<td>django打造線上教育</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/78.html">訪問</a></td>
</tr><tr>
<td>python高階程式設計</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/200.html">訪問</a></td>
</tr><tr>
<td>scrapy分散式爬蟲</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/92.html">訪問</a></td>
</tr><tr>
<td>diango rest framework打造生鮮電商</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/131.html">訪問</a></td>
</tr><tr>
<td>tornado從入門到精通</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/290.html">訪問</a></td>
</tr></tbody></table>
</div>
</body>
</html>
"""
bs = BeautifulSoup(html, "html.parser")
name_tag = bs.find("p", attrs={"class": "name"})

# .previous_sibling is the single node immediately before the matched <p>.
previous_sibling = name_tag.previous_sibling
print(previous_sibling.string)
注意:此處html去掉了兩個p標籤之間的回車換行符,否則previous_sibling取到的是換行文字節點,只會輸出空白行
輸出結果:
獲取html的某些標籤元素的屬性值
import re
from bs4 import BeautifulSoup
html="""
<html lang="en">
<head>
<meta charset="UTF-8">
<title>bobby基本資訊</title>
<script src="jquery-3.5.1.min.js"></script>
</head>
<body>
<div id="info-955">
<p style="color: blue">講師資訊</p>
<div class="teacher_info">
Python全棧工程師
<p class="age">年齡:29</p>
<p class="name">姓名:bobby</p>
<p class="work_years">工作年限:7年</p>
<p class="position">職位:python開發工程師</p>
</div>
<p style="color:aquamarine">課程資訊</p>
<table class="courses">
<tbody><tr><th>課程名稱</th>
<th>講師</th>
<th>地址</th>
</tr><tr>
<td>django打造線上教育</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/78.html">訪問</a></td>
</tr><tr>
<td>python高階程式設計</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/200.html">訪問</a></td>
</tr><tr>
<td>scrapy分散式爬蟲</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/92.html">訪問</a></td>
</tr><tr>
<td>diango rest framework打造生鮮電商</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/131.html">訪問</a></td>
</tr><tr>
<td>tornado從入門到精通</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/290.html">訪問</a></td>
</tr></tbody></table>
</div>
</body>
</html>
"""
bs = BeautifulSoup(html, "html.parser")
name_tag = bs.find("p", attrs={"class": "name"})

# Two ways to read the same attribute: subscript access and .get().
for class_value in (name_tag["class"], name_tag.get("class")):
    print(class_value)
輸出結果:
元素多值屬性問題
import re
from bs4 import BeautifulSoup
html="""
<html lang="en">
<head>
<meta charset="UTF-8">
<title>bobby基本資訊</title>
<script src="jquery-3.5.1.min.js"></script>
</head>
<body>
<div id="info-955">
<p style="color: blue">講師資訊</p>
<div class="teacher_info">
Python全棧工程師
<p class="age">年齡:29</p>
<p class="name bobbyname" data-bind="bobby">姓名:bobby</p>
<p class="work_years">工作年限:7年</p>
<p class="position">職位:python開發工程師</p>
</div>
<p style="color:aquamarine">課程資訊</p>
<table class="courses">
<tbody><tr><th>課程名稱</th>
<th>講師</th>
<th>地址</th>
</tr><tr>
<td>django打造線上教育</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/78.html">訪問</a></td>
</tr><tr>
<td>python高階程式設計</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/200.html">訪問</a></td>
</tr><tr>
<td>scrapy分散式爬蟲</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/92.html">訪問</a></td>
</tr><tr>
<td>diango rest framework打造生鮮電商</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/131.html">訪問</a></td>
</tr><tr>
<td>tornado從入門到精通</td>
<td>bobby</td>
<td><a href="https://coding.imooc.com/class/290.html">訪問</a></td>
</tr></tbody></table>
</div>
</body>
</html>
"""
bs = BeautifulSoup(html, "html.parser")
name_tag = bs.find("p", attrs={"class": "name"})

# Read each attribute both ways (subscript, then .get()). The matched tag has
# class="name bobbyname", i.e. more than one class value, plus a data-bind
# attribute with a single value.
for attr_name in ("class", "data-bind"):
    print(name_tag[attr_name])
    print(name_tag.get(attr_name))
輸出結果: