阻塞式
import socket
from urllib.parse import urlparse
def get_url(url):
    """Fetch ``url`` over plain HTTP with a blocking socket and print the body.

    A minimal ``GET`` request with ``Connection:close`` is sent, the response
    is read until the server closes the connection, and everything after the
    first blank line (the end of the response headers) is printed.

    Args:
        url: Absolute ``http://`` URL. Port 80 is used unless the URL names
            another port.

    Raises:
        OSError: If connecting, sending or receiving fails.
        UnicodeDecodeError: If the response is not valid UTF-8.
        ValueError: If the URL contains a malformed port.
    """
    parts = urlparse(url)
    netloc = parts.netloc
    # netloc may carry a ":port" suffix, so connect with the parsed host name
    # and port; fall back to the raw netloc when no host name was parsed.
    address = (parts.hostname or netloc, parts.port or 80)
    target = parts.path or "/"
    if parts.query:
        # Keep the query string; without it a different resource is requested.
        target = "{}?{}".format(target, parts.query)
    # Hand-written HTTP request.
    request = (
        "GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n"
        .format(target, netloc)
        .encode('utf8')
    )

    # The context manager closes the socket even when an error is raised.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:
        client.connect(address)
        # sendall() keeps writing until the whole request is out; a single
        # send() is allowed to stop early.
        client.sendall(request)
        # recv() returns b"" once the peer has closed the connection.
        raw = b"".join(iter(lambda: client.recv(1024), b""))

    # Split at the FIRST blank line only, so a body that itself contains
    # "\r\n\r\n" is kept whole. This drops the response headers.
    # NOTE: a chunked transfer-encoding is not decoded here.
    _, _, html_data = raw.decode('utf8').partition("\r\n\r\n")
    print(html_data)
# Script entry point: run the blocking fetch only when executed directly.
if __name__ == "__main__":
    get_url("http://www.baidu.com")
複製程式碼
非阻塞式:因為要詢問連線是否已經建立好,需要用 while 迴圈不停地檢查狀態,白白消耗了 CPU。
import errno
import socket
from urllib.parse import urlparse
def get_url(url):
    """Fetch ``url`` over plain HTTP with a non-blocking socket and print the body.

    The socket is put into non-blocking mode and then polled in busy loops:
    first until the whole request has been written, then until the server
    closes the connection. Everything after the first blank line of the
    response (the end of the response headers) is printed.

    Args:
        url: Absolute ``http://`` URL. Port 80 is used unless the URL names
            another port.

    Raises:
        OSError: If the connection fails or breaks. Only "not ready yet"
            conditions are retried.
        UnicodeDecodeError: If the response is not valid UTF-8.
        ValueError: If the URL contains a malformed port.
    """
    parts = urlparse(url)
    netloc = parts.netloc
    # netloc may carry a ":port" suffix, so connect with the parsed host name
    # and port; fall back to the raw netloc when no host name was parsed.
    address = (parts.hostname or netloc, parts.port or 80)
    target = parts.path or '/'
    if parts.query:
        # Keep the query string; without it a different resource is requested.
        target = '{}?{}'.format(target, parts.query)
    request = (
        'GET {path} HTTP/1.1\r\nHost:{host}\r\nConnection:close\r\n\r\n'
        .format(path=target, host=netloc)
        .encode('utf8')
    )

    # The context manager closes the socket even when an error is raised.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:
        client.setblocking(False)  # switch to non-blocking mode
        try:
            client.connect(address)
        except BlockingIOError:
            pass  # expected: the connection is still being established

        # Busy-poll until the complete request has been written. send() may
        # accept only part of the buffer, so track how much went out.
        sent = 0
        while sent < len(request):
            try:
                sent += client.send(request[sent:])
            except BlockingIOError:
                continue  # not writable yet, try again
            except OSError as exc:
                # A socket that is still connecting may be reported as
                # ENOTCONN instead of "would block", depending on platform.
                # Anything else (e.g. connection refused) is a real failure;
                # retrying it would spin forever.
                if exc.errno != errno.ENOTCONN:
                    raise

        # Busy-poll for the response; recv() returns b'' once the peer has
        # closed the connection.
        chunks = []
        while True:
            try:
                chunk = client.recv(1024)
            except BlockingIOError:
                continue  # nothing to read yet
            if not chunk:
                break
            chunks.append(chunk)

    # Split at the FIRST blank line only, so a body that itself contains
    # '\r\n\r\n' is kept whole. This drops the response headers.
    _, _, html_data = b''.join(chunks).decode('utf8').partition('\r\n\r\n')
    print(html_data)
# Script entry point: run the non-blocking fetch only when executed directly.
if __name__ == '__main__':
    get_url('http://www.baidu.com')
複製程式碼
select(poll/epoll)+ 回撥 + 事件迴圈:看起來比較複雜,為什麼要改成這樣呢?因為這種方式只會處理那些已經準備好的 socket,不會等待網路 I/O;使用單執行緒模式,省去了執行緒間切換的開銷,實現了單執行緒併發,併發性高。但這種回撥的寫法實在是太蛋疼。
import socket
from urllib.parse import urlparse
# selectors is an easier-to-use wrapper around select; it automatically picks select/epoll depending on the platform (Windows/Linux)
from selectors import DefaultSelector, EVENT_READ, EVENT_WRITE
# URLs still being fetched; a fetcher removes its URL once its response is complete.
urls = ['http://www.baidu.com']
# Becomes True when ``urls`` is empty, which ends the event loop.
stop = False
# Shared selector on which fetchers register their sockets and callbacks.
selector = DefaultSelector()
class Fetch:
    """Callback-driven HTTP fetcher for a single URL.

    ``get_url`` starts a non-blocking connect and registers ``connected`` on
    the module-level ``selector`` for write readiness. ``connected`` sends the
    request and hands the socket over to ``readable``, which collects the
    response until the peer closes, prints the body, removes the URL from the
    module-level ``urls`` list and sets ``stop`` once that list is empty.
    """

    def _close(self, key):
        """Unregister the socket from the selector, then close it."""
        selector.unregister(key.fd)
        self.client.close()

    def connected(self, key):
        """Write-ready callback: send the request, then wait for the response.

        Args:
            key: The ``SelectorKey`` of the ready socket.

        Raises:
            OSError: If sending fails (for example the connect was refused).
                The socket is unregistered and closed before re-raising.
        """
        try:
            sent = self.client.send(self.request)
        except BlockingIOError:
            return  # not actually writable; stay registered and retry later
        except OSError:
            self._close(key)
            raise
        # send() may accept only part of the buffer; keep the remainder and
        # stay registered for EVENT_WRITE until everything has gone out.
        self.request = self.request[sent:]
        if not self.request:
            selector.modify(key.fd, EVENT_READ, self.readable)

    def readable(self, key):
        """Read-ready callback: collect data; on end of stream print the body.

        Args:
            key: The ``SelectorKey`` of the ready socket.

        Raises:
            OSError: If receiving fails. The socket is unregistered and
                closed before re-raising.
            UnicodeDecodeError: If the response is not valid UTF-8.
            ValueError: If this fetcher's URL is not in ``urls``.
        """
        global stop
        try:
            chunk = self.client.recv(1024)
        except BlockingIOError:
            return  # nothing to read after all; wait for the next event
        except OSError:
            self._close(key)
            raise
        if chunk:
            self.data += chunk
            return

        # recv() returned b'': the peer closed the connection. Release the
        # socket first so a decoding error cannot leak it.
        self._close(key)
        # Split at the FIRST blank line only, so a body that itself contains
        # '\r\n\r\n' is kept whole. This drops the response headers.
        _, _, html_data = self.data.decode('utf8').partition('\r\n\r\n')
        print(html_data)
        urls.remove(self.spider_url)
        if not urls:
            stop = True

    def get_url(self, url):
        """Start fetching ``url`` and register for the connect to complete.

        Args:
            url: Absolute ``http://`` URL. Port 80 is used unless the URL
                names another port.

        Raises:
            OSError: If the connect fails immediately.
            ValueError: If the URL contains a malformed port.
        """
        self.spider_url = url
        parts = urlparse(url)
        self.host = parts.netloc
        self.path = parts.path or '/'
        self.data = b''
        target = self.path
        if parts.query:
            # Keep the query string; without it a different resource is requested.
            target = '{}?{}'.format(target, parts.query)
        self.request = (
            'GET {path} HTTP/1.1\r\nHost:{host}\r\nConnection:close\r\n\r\n'
            .format(path=target, host=self.host)
            .encode('utf8')
        )
        # netloc may carry a ":port" suffix, so connect with the parsed host
        # name and port; fall back to the raw netloc when no host was parsed.
        address = (parts.hostname or self.host, parts.port or 80)
        self.client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.client.setblocking(False)
        try:
            self.client.connect(address)
        except BlockingIOError:
            pass  # expected: the connection is still being established
        except OSError:
            self.client.close()
            raise
        # The socket becomes writable once the connect attempt has finished.
        selector.register(self.client.fileno(), EVENT_WRITE, self.connected)
def loop():
    """Run the event loop until the module-level ``stop`` flag becomes true.

    Each pass asks the selector which registered sockets are ready and then
    calls the callback stored as ``data`` on every ready key, passing the key
    itself. The selector only reports readiness; dispatching to the callback
    is done by this loop.
    """
    while not stop:
        for ready_key, _events in selector.select():
            handler = ready_key.data
            handler(ready_key)
# Script entry point: start one fetch, then drive it with the event loop.
if __name__ == '__main__':
    fetcher = Fetch()
    fetcher.get_url('http://www.baidu.com')
    loop()
複製程式碼
可以說:同步模式併發性不高;回撥模式編碼複雜度高;多執行緒需要執行緒間同步,影響併發效能。