Python 使用socket模擬http請求,從阻塞到協程

青穗黃發表於2018-10-30

阻塞式

import socket
from urllib.parse import urlparse


def get_url(url):
    """Fetch ``url`` over plain HTTP with a blocking socket and print the body.

    A minimal ``GET`` request is written by hand, the response is read until
    the server closes the connection, and everything after the first blank
    line (i.e. after the response headers) is printed. If the response
    contains no blank line, an empty string is printed.

    Args:
        url: An ``http://`` URL. The port defaults to 80 when the URL does
            not name one; a query string, if present, is sent with the path.

    Raises:
        OSError: If connecting, sending or receiving fails.
        UnicodeDecodeError: If the response is not valid UTF-8.
    """
    parts = urlparse(url)
    host = parts.netloc
    path = parts.path or "/"
    if parts.query:
        # Keep the query string; urlparse splits it off from the path.
        path = "{}?{}".format(path, parts.query)

    # Hand-written HTTP request (Connection: close makes the server end the
    # stream, which is how the read loop below knows it is done).
    request = "GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format(path, host)

    # The context manager closes the socket even when a call below raises.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:
        # netloc may contain ":port" (or userinfo), which connect() cannot
        # resolve; use the parsed hostname and port instead.
        client.connect((parts.hostname or host, parts.port or 80))
        # sendall() loops until every byte is written; send() may stop early.
        client.sendall(request.encode('utf8'))

        chunks = []
        while True:
            chunk = client.recv(1024)
            if not chunk:  # empty read: the peer closed the connection
                break
            chunks.append(chunk)

    data = b''.join(chunks).decode('utf8')
    # Strip the response headers. Split on the FIRST blank line only, so a
    # body that itself contains blank lines is not truncated.
    _, _, html_data = data.partition("\r\n\r\n")
    print(html_data)

# Script entry point: fetch the Baidu home page as a demonstration.
if __name__ == "__main__":
    get_url("http://www.baidu.com")
複製程式碼

非阻塞式:connect、send、recv 都會立即返回,因此必須用 while 迴圈不停地檢查連線是否已建立好、資料是否已到達,這種輪詢白白消耗了 CPU。

import socket
from urllib.parse import urlparse


def get_url(url):
    """Fetch ``url`` over plain HTTP with a non-blocking socket and print the body.

    Every call that would block returns immediately instead, so each step
    (waiting for the connection, sending, receiving) is retried in a busy
    loop. Everything after the first blank line of the response (i.e. after
    the response headers) is printed; if there is no blank line, an empty
    string is printed.

    Args:
        url: An ``http://`` URL. The port defaults to 80 when the URL does
            not name one; a query string, if present, is sent with the path.

    Raises:
        OSError: If the connection fails or the transfer breaks. Only
            "would block" and "not connected yet" conditions are retried.
        UnicodeDecodeError: If the response is not valid UTF-8.
    """
    import errno  # local import: only the send-retry loop below needs it

    parts = urlparse(url)
    host = parts.netloc
    path = parts.path or '/'
    if parts.query:
        # Keep the query string; urlparse splits it off from the path.
        path = '{}?{}'.format(path, parts.query)

    request = 'GET {path} HTTP/1.1\r\nHost:{host}\r\nConnection:close\r\n\r\n'.format(
        path=path, host=host).encode('utf8')

    # The context manager closes the socket even when a call below raises.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client:
        client.setblocking(False)  # switch the socket to non-blocking mode

        try:
            # netloc may contain ":port"; use the parsed hostname and port.
            client.connect((parts.hostname or host, parts.port or 80))
        except BlockingIOError:
            # Expected: the connect has been started but is not finished yet.
            pass

        # Keep trying to send until the connection is up and the whole
        # request has been written (send() may accept only part of it).
        sent = 0
        while sent < len(request):
            try:
                sent += client.send(request[sent:])
            except BlockingIOError:
                continue  # not ready yet, poll again
            except OSError as exc:
                # Some platforms report "not connected" while the connect is
                # still in progress. Anything else (e.g. connection refused)
                # is a real failure; retrying it would spin forever.
                if exc.errno != errno.ENOTCONN:
                    raise

        chunks = []
        while True:
            try:
                chunk = client.recv(1024)
            except BlockingIOError:
                continue  # no data available yet, poll again
            if not chunk:  # empty read: the peer closed the connection
                break
            chunks.append(chunk)

    data = b''.join(chunks).decode('utf8')
    # Split on the FIRST blank line only, so a body that itself contains
    # blank lines is not truncated.
    _, _, html_data = data.partition('\r\n\r\n')
    print(html_data)


# Script entry point: fetch the Baidu home page as a demonstration.
if __name__ == '__main__':
    get_url('http://www.baidu.com')

複製程式碼

select(poll/epoll) + 回撥 + 事件迴圈:看起來比較複雜,為什麼要改成這樣呢?因為事件迴圈只會處理那些已經準備好的 socket,不必等待網路 I/O;而且使用單執行緒模式,省去了執行緒間切換的開銷,實現了單執行緒併發,併發性高。但這種回撥的寫法實在是太蛋疼。

import socket
from urllib.parse import urlparse
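# selectors is a friendlier wrapper around select; DefaultSelector automatically picks the best mechanism for the platform (e.g. select on Windows, epoll on Linux)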
from selectors import DefaultSelector, EVENT_READ, EVENT_WRITE

# Module-wide state shared by Fetch and loop().

# Set to True by Fetch.readable once `urls` is empty; loop() exits on it.
stop = False

# URLs that have not finished downloading yet.
urls = ['http://www.baidu.com']

# The single I/O multiplexer every socket is registered with.
selector = DefaultSelector()
class Fetch:
    """Callback-driven HTTP fetcher built on the module-level ``selector``.

    ``get_url`` starts a non-blocking connect and registers ``connected`` for
    the socket becoming writable. ``connected`` sends the request and
    registers ``readable``, which accumulates the response until the server
    closes the connection and then prints the body. The callbacks are invoked
    by whoever polls ``selector`` and calls ``key.data(key)``.
    """

    def connected(self, key):
        """Writable callback: the connection is established, send the request.

        Args:
            key: The ``SelectorKey`` of the socket that became writable.
        """
        selector.unregister(key.fd)  # stop watching for writability
        request = 'GET {path} HTTP/1.1\r\nHost:{host}\r\nConnection:close\r\n\r\n'.format(
            path=self.path, host=self.host)
        self.client.send(request.encode('utf8'))
        # From now on we only care about the response arriving.
        selector.register(self.client.fileno(), EVENT_READ, self.readable)

    def readable(self, key):
        """Readable callback: collect one chunk, or finish on end of stream.

        While data keeps arriving it is only appended to ``self.data``. On an
        empty read (the peer closed the connection) the socket is
        unregistered and closed, the body is printed, the URL is removed from
        the module-level ``urls`` list, and the module-level ``stop`` flag is
        set once that list is empty.

        Args:
            key: The ``SelectorKey`` of the socket that became readable.

        Raises:
            UnicodeDecodeError: If the response is not valid UTF-8.
            ValueError: If the fetched URL is not present in ``urls``.
        """
        global stop

        chunk = self.client.recv(1024)
        if chunk:
            # More data may follow; wait for the next readable event rather
            # than processing a partial response.
            self.data += chunk
            return

        # Empty read: the whole response has been received.
        selector.unregister(key.fd)
        self.client.close()

        data = self.data.decode('utf8')
        # Split on the FIRST blank line only, so a body that itself contains
        # blank lines is not truncated.
        _, _, html_data = data.partition('\r\n\r\n')
        print(html_data)

        urls.remove(self.spider_url)
        if not urls:
            stop = True  # nothing left to fetch: let the event loop exit

    def get_url(self, url):
        """Start fetching ``url``; returns immediately without waiting for I/O.

        Args:
            url: An ``http://`` URL. The port defaults to 80 when the URL
                does not name one; a query string, if present, is sent with
                the path.
        """
        self.spider_url = url
        parts = urlparse(url)
        self.host = parts.netloc
        self.path = parts.path or '/'
        if parts.query:
            # Keep the query string; urlparse splits it off from the path.
            self.path = '{}?{}'.format(self.path, parts.query)
        self.data = b''

        self.client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.client.setblocking(False)

        try:
            # netloc may contain ":port"; use the parsed hostname and port.
            self.client.connect((parts.hostname or self.host, parts.port or 80))
        except BlockingIOError:
            # Expected: the connect has been started but is not finished yet.
            pass

        # Be notified once the socket is writable, i.e. the connect finished.
        selector.register(self.client.fileno(), EVENT_WRITE, self.connected)

def loop():
    """Run the event loop until the module-level ``stop`` flag becomes true.

    Each pass asks ``selector`` which registered sockets are ready and calls
    the callback stored as each key's ``data``, passing it the key.

    Notes:
        1. ``select`` itself has no register mode.
        2. Invoking the callback after a socket's state changes is done by
           this code.
    """
    while not stop:
        for selector_key, _events in selector.select():
            handler = selector_key.data
            handler(selector_key)



if __name__ == '__main__':
    # Start the request; get_url only begins the connect and registers the
    # socket with the selector.
    fetcher = Fetch()
    fetcher.get_url('http://www.baidu.com')
    # Run the event loop, which invokes the registered callbacks until the
    # `stop` flag is set.
    loop()
複製程式碼

可以說:同步模式併發性不高;回撥模式編碼複雜度高;多執行緒需要執行緒間同步,影響併發效能。

相關文章