Python爬蟲知識點二

LHBlog發表於2017-11-16

一。requests庫

import json
import requests

from io import BytesIO
#顯示各種函式相當於api
# print(dir(requests))


# Fetch a page, then print its body text, HTTP status code and encoding.
url = 'http://www.baidu.com'
r = requests.get(url)
for field in (r.text, r.status_code, r.encoding):
    print(field)
結果:

 

# 傳遞引數:比如http://aaa.com?pageId=1&type=content

# Query-string values are supplied as a dict through the params argument.
params = {'k1': 'v1', 'k2': 'v2'}
r = requests.get('http://httpbin.org/get', params=params)
# Print the URL of the response to see how the dict was encoded.
print(r.url)
結果:

 

# 二進位制資料

# r = requests.get('http://i-2.shouji56.com/2015/2/11/23dab5c5-336d-4686-9713-ec44d21958e3.jpg')
# image = Image.open(BytesIO(r.content))
# image.save('meinv.jpg')


# json處理

# Response.json is a method: it must be called to obtain the decoded payload.
# type(r.json) without the call only reports a bound method, not the data.
# Note: json() raises if the response body is not valid JSON.
r = requests.get('https://github.com/timeline.json')
print(type(r.json()))
print(r.text)
結果:

 

# 原始資料處理
# 流式資料寫入
# With stream=True the body is read lazily, so the response is used as a
# context manager to make sure the connection is released once the file has
# been written. The file is write-only, hence 'wb' rather than 'wb+'.
image_url = 'http://i-2.shouji56.com/2015/2/11/23dab5c5-336d-4686-9713-ec44d21958e3.jpg'
with requests.get(image_url, stream=True) as r:
    with open('meinv2.jpg', 'wb') as f:
        # Copy the body to disk in 1024-byte chunks.
        for chunk in r.iter_content(1024):
            f.write(chunk)


# 提交表單

# Post the fields as a form body by handing the dict itself to data=.
post_url = 'http://httpbin.org/post'
form = {'username': 'user', 'password': 'pass'}
r = requests.post(post_url, data=form)
print(r.text)
結果:引數以表單形式提交,所以引數放在form引數中

 


# Same fields, but serialized to a JSON string before being sent as the body.
payload = json.dumps(form)
r = requests.post('http://httpbin.org/post', data=payload)
print(r.text)
結果:引數不是以form表單提交的,所以放在json欄位中

 

# cookie

url = 'http://www.baidu.com'
r = requests.get(url)
cookies = r.cookies
# get_dict() exposes the received cookies as a plain name -> value dict.
cookie_pairs = cookies.get_dict()
for name in cookie_pairs:
    print(name, cookie_pairs[name])
結果:cookie實際上是一個鍵值對

 


# Cookies to send along with the request, supplied as a plain dict.
cookies = dict(c1='v1', c2='v2')
r = requests.get('http://httpbin.org/cookies', cookies=cookies)
print(r.text)
結果:

 

# 重定向和重定向歷史

# HEAD request with redirects followed; print the resulting URL, the status
# code and the list of intermediate responses.
r = requests.head('http://github.com', allow_redirects=True)
for detail in (r.url, r.status_code, r.history):
    print(detail)
結果:通過301重定向

 

# # 代理
#
# proxies = {'http': '...', 'https': '...'}
# r = requests.get('...', proxies = proxies)

    

 二。BeautifulSoup庫

html:舉例如下

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>

解析程式碼如下:

from bs4 import BeautifulSoup

# Parse the sample document. The file is opened in a with-block so the handle
# is closed after parsing, and the parser is named explicitly instead of
# letting BeautifulSoup pick whichever one happens to be installed.
with open('test.html', encoding='utf-8') as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')
# Pretty-print the parsed HTML in a more structured layout:
# print(soup.prettify())

# Tag
print(type(soup.title))
結果:bs4的一個類
# Print the <title> tag's name, then the tag itself.
title_tag = soup.title
print(title_tag.name)
print(title_tag)
結果如下:

 

# String

# The .string of <title>: print its type first, then its value.
title_text = soup.title.string
print(type(title_text))
print(title_text)
結果如下:只顯示標籤裡面內容

 

# Comment

# The .string of the first <a> tag: print its type first, then its value.
first_link_text = soup.a.string
print(type(first_link_text))
print(first_link_text)
結果:顯示註釋中的內容,所以有時需要判斷獲取到的內容是不是註釋

 

#
# '''
# Walk the direct children of <body> and print each one's tag name.
body_children = soup.body.contents
for child in body_children:
    print(child.name)

結果:body下面有三個item

 

# CSS查詢

# Class selector: collect the elements matching ".sister" and print them.
sister_links = soup.select('.sister')
print(sister_links)
結果:樣式選擇器返回帶有某個樣式的所有內容 結果為一個list

 

# ID selector: collect the elements matching "#link1" and print them.
link1_matches = soup.select('#link1')
print(link1_matches)
結果:ID選擇器,選擇ID等於link1的內容

 

# Child combinator: <title> elements that sit directly under <head>.
head_titles = soup.select('head > title')
print(head_titles)
結果:

 



# Tag selector: gather every <a> element and print them one at a time.
a_s = soup.select('a')
for anchor in a_s:
    print(anchor)
結果:標籤選擇器,選擇所有a標籤的

 


 

 持續更新中。。。。,歡迎大家關注我的公眾號LHWorld.

相關文章