一。requests庫
import json
from io import BytesIO

import requests

# Uncomment to list every attribute the requests module exposes
# (effectively its API surface).
# print(dir(requests))

# Basic GET request: fetch a page and inspect the response.
url = 'http://www.baidu.com'
r = requests.get(url)
print(r.text)         # response body decoded to text
print(r.status_code)  # HTTP status code
print(r.encoding)     # encoding requests used to decode r.text
結果:
# Passing query parameters, e.g. http://aaa.com?pageId=1&type=content
params = {'k1': 'v1', 'k2': 'v2'}
r = requests.get('http://httpbin.org/get', params=params)
# The final URL shows the parameters encoded into the query string.
print(r.url)
# Result:
# Binary data
# r = requests.get('http://i-2.shouji56.com/2015/2/11/23dab5c5-336d-4686-9713-ec44d21958e3.jpg')
# image = Image.open(BytesIO(r.content))
# image.save('meinv.jpg')

# JSON handling
r = requests.get('https://github.com/timeline.json')
# Response.json is a method: it must be called to get the decoded payload.
# Without the call, type() reports a bound method instead of the data type.
# NOTE(review): r.json() raises if the body is not valid JSON -- confirm this
# endpoint still returns JSON.
print(type(r.json()))
print(r.text)
結果:
# Raw data handling
# Streamed download: with stream=True the body is fetched lazily, so it can be
# written to disk chunk by chunk.
r = requests.get('http://i-2.shouji56.com/2015/2/11/23dab5c5-336d-4686-9713-ec44d21958e3.jpg', stream=True)
with open('meinv2.jpg', 'wb+') as f:
    for chunk in r.iter_content(1024):
        f.write(chunk)

# Submitting a form: a dict passed as ``data`` is form-encoded.
form = {'username': 'user', 'password': 'pass'}
r = requests.post('http://httpbin.org/post', data=form)
print(r.text)
結果:引數以表單形式提交,所以引數出現在回應的form欄位中
# Here ``data`` is a JSON string rather than a dict, so it is sent as the raw
# request body instead of being form-encoded.
r = requests.post('http://httpbin.org/post', data=json.dumps(form))
print(r.text)
結果:引數不是以form表單提交,而是以JSON字串作為請求主體送出,所以出現在回應的data與json欄位中
# Cookies: read the cookies returned by the server.
url = 'http://www.baidu.com'
r = requests.get(url)
cookies = r.cookies
# The cookie jar can be converted to a plain dict and iterated.
for k, v in cookies.get_dict().items():
    print(k, v)
# Result: a cookie is effectively a key-value pair.
# Send custom cookies along with the request.
cookies = {'c1': 'v1', 'c2': 'v2'}
r = requests.get('http://httpbin.org/cookies', cookies=cookies)
print(r.text)
# Result:
# Redirects and redirect history
r = requests.head('http://github.com', allow_redirects=True)
print(r.url)          # final URL after following redirects
print(r.status_code)  # status of the final response
print(r.history)      # intermediate responses that led to the final one
# Result: redirected via a 301 response.
# Proxies (left commented out; fill in real proxy URLs before enabling)
# proxies = {'http': '...', 'https': '...'}
# r = requests.get('...', proxies=proxies)
二。BeautifulSoup庫
html:舉例如下
<html><head><title>The Dormouse's story</title></head> <body> <p class="title" name="dromouse"><b>The Dormouse's story</b></p> <p class="story">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>, <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p>
解析程式碼如下:
from bs4 import BeautifulSoup

# Open the file in a ``with`` block so the handle is closed after parsing, and
# read bytes so BeautifulSoup detects the document encoding itself.
# Naming the parser explicitly keeps the parse tree independent of which
# optional parsers happen to be installed.
with open('test.html', 'rb') as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')
# Pretty-print the HTML so its structure is easier to read.
# print(soup.prettify())

# Tag
print(type(soup.title))
結果:bs4的一個類
# Print the tag's name, then the whole tag (markup included), one per line.
print(soup.title.name, soup.title, sep='\n')
結果如下:
# String
print(type(soup.title.string))
print(soup.title.string)
# Result: only the content inside the tag is shown.
# Comment: the first <a> in the sample contains only an HTML comment, so its
# .string is the comment's text.
print(type(soup.a.string))
print(soup.a.string)
結果:顯示註釋中的內容,所以有時需要判斷獲取到的內容是不是註釋
# Walk the direct children of <body> and print each child's tag name.
for item in soup.body.contents:
    print(item.name)
# Result: <body> has three items beneath it.
# CSS queries
# Class selector: every element carrying the "sister" class.
print(soup.select('.sister'))
結果:樣式選擇器返回帶有某個樣式的所有內容 結果為一個list
# ID selector: look up the element(s) whose id is "link1", then print them.
link1_matches = soup.select('#link1')
print(link1_matches)
結果:ID選擇器,選擇ID等於link1的內容
# Child combinator: <title> elements that are direct children of <head>.
print(soup.select('head > title'))
# Result:
# Tag selector: iterate over every <a> element and print each one.
a_s = soup.select('a')
for a in a_s:
    print(a)
結果:標籤選擇器,選擇所有a標籤的
持續更新中……歡迎大家關注我的公眾號LHWorld。