Web伺服器小專案(Linux / C / epoll)

星竹z發表於2024-09-29

原文網址 : https://www.cnblogs.com/xingzhuz/p/18440510

Web伺服器Linux

歡迎訪問我的另一個部落格: https://xingzhu.top/

注意：前置知識:
HTTP: https://xingzhu.top/archives/web-fu-wu-qi
Linux 多執行緒: https://xingzhu.top/archives/duo-xian-cheng

原始碼放github上了，歡迎star: https://github.com/xingzhuz/webServer

思路

實現程式碼

server.h

#pragma once
#include <arpa/inet.h>
#include <assert.h>
#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/epoll.h>
#include <sys/sendfile.h>
#include <sys/stat.h>
#include <unistd.h>

// Argument bundle passed to the worker-thread entry points
// (acceptClient / recvHttpRequest). It is heap-allocated by epollRun,
// and the worker is expected to free it.
struct FdInfo
{
    int fd;        // descriptor the epoll event fired on
    int epfd;      // epoll instance that descriptor is registered with
    pthread_t tid; // thread id written by pthread_create for this worker
};

// Create and initialise the listening socket for the given port
int initListenFd(unsigned short port);

// Start the epoll event loop on the listening socket lfd
int epollRun(int lfd);

// Thread entry point: establish a connection with a client
// (arg points to a struct FdInfo)
// Earlier non-threaded signature: int acceptClient(int lfd, int epfd);
void *acceptClient(void *arg);

// Thread entry point: receive an HTTP request
// (arg points to a struct FdInfo)
// Earlier non-threaded signature: int recvHttpRequest(int cfd, int epfd);
void *recvHttpRequest(void *arg);

// Parse the request line and answer on cfd
int parseRequestLine(const char *line, int cfd);

// Send the contents of a file to cfd
int sendFile(const char *fileName, int cfd);

// Send the response head (status line + response headers)
int sendHeadMsg(int cfd, int status, const char *descr, const char *type, int length);

// Map a file name (by its suffix) to the HTTP content type used in the response
const char *getFileType(const char *name);

// Send a directory listing to cfd
int sendDir(const char *dirName, int cfd);

// Convert one hexadecimal digit character to its integer value
int hexToDec(char c);

// Percent-decode a string
// to:   output parameter, receives the decoded data
// from: input parameter, the data to decode
void decodeMsg(char *to, char *from);

main.c

#include "server.h"

// Program entry point. Usage: ./a.out port path
//   argv[1]  TCP port to listen on (0-65535)
//   argv[2]  directory whose contents are served; becomes the working directory
// Returns -1 on bad usage or when a start-up step fails. Otherwise it only
// returns if the event loop itself terminates.
int main(int argc, char *argv[])
{
    if (argc < 3)
    {
        printf("./a.out port path\n");
        return -1;
    }

    // strtol lets us reject non-numeric or out-of-range ports, which atoi
    // would silently turn into 0 or a truncated value
    char *end = NULL;
    errno = 0;
    long value = strtol(argv[1], &end, 10);
    if (errno != 0 || end == argv[1] || *end != '\0' || value < 0 || value > 65535)
    {
        fprintf(stderr, "invalid port: %s\n", argv[1]);
        return -1;
    }
    unsigned short port = (unsigned short)value;

    // Switch the server's working directory; every request path is resolved
    // relative to it, so serving from the wrong directory must be fatal
    if (chdir(argv[2]) == -1)
    {
        perror("chdir");
        return -1;
    }

    // Create the listening socket
    int lfd = initListenFd(port);
    if (lfd == -1)
    {
        return -1;
    }

    // Run the server loop
    return epollRun(lfd) == -1 ? -1 : 0;
}

initListenFd

// Create a TCP socket listening on every local IPv4 address at `port`.
// Returns the listening descriptor on success. On failure the failing call
// is reported with perror(), the socket is closed, and -1 is returned.
int initListenFd(unsigned short port)
{
    // 1. Create the listening descriptor
    int lfd = socket(AF_INET, SOCK_STREAM, 0);
    if (lfd == -1)
    {
        perror("socket");
        return -1;
    }

    // 2. Allow the port to be re-bound while old connections sit in TIME_WAIT
    int opt = 1;
    if (setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof opt) == -1)
    {
        perror("setsockopt");
        goto fail;
    }

    // 3. Bind; zero the whole structure first so the padding bytes are defined
    struct sockaddr_in addr;
    memset(&addr, 0, sizeof addr);
    addr.sin_family = AF_INET;
    addr.sin_port = htons(port);
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    if (bind(lfd, (struct sockaddr *)&addr, sizeof addr) == -1)
    {
        perror("bind");
        goto fail;
    }

    // 4. Start listening
    if (listen(lfd, 128) == -1)
    {
        perror("listen");
        goto fail;
    }

    return lfd;

fail:
    // Do not leak the descriptor when set-up stops half way
    close(lfd);
    return -1;
}

這些步驟都是基礎的 Socket 網路通訊部分，不再贅述
解釋埠複用：因為存在伺服器端主動斷開連線的情況，如果是伺服器端主動斷開連線，主動斷開的一方存在一個等待時長，也就是在這個等待時長內，埠還是沒有被釋放，時長結束後才會釋放
如果不想等待這個時長或者由於這個時長而換埠，就需要設定這個埠複用，設定後即使是在等待時長時間段內，仍可使用該埠

int setsockopt(int sockfd, int level, int optname, const void *optval, socklen_t optlen);

sockfd: 套接字的檔案描述符，通常是透過 socket() 函式建立的
level: 選項所在的協議層，通常為 SOL_SOCKET，表示通用的套接字選項。也可以是特定協議的層，例如 IPPROTO_TCP
optname: 需要設定的選項的名稱。可以是多種選項，如：
- SO_REUSEADDR: 允許重用本地地址
- SO_KEEPALIVE: 啟用 TCP 的保活機制
- SO_BROADCAST: 允許傳送廣播訊息
optval: 指向要設定的選項值的指標。這個值的型別取決於選項的型別
optlen: optval 所指向的值的大小，通常使用 sizeof() 來獲取

epollRun

// Run the server's event loop on the listening socket lfd.
// Registers lfd with a new epoll instance, then waits for events and hands
// every ready descriptor to a detached worker thread: acceptClient when the
// descriptor is lfd, recvHttpRequest otherwise. Each worker receives a
// heap-allocated struct FdInfo and is responsible for freeing it.
// Returns -1 when the epoll instance cannot be set up or epoll_wait fails
// with anything other than EINTR. Otherwise it loops forever.
//
// NOTE(review): a second event for the same descriptor can arrive while the
// first worker is still running (lfd is level-triggered), so two workers may
// operate on one descriptor at once. Confirm whether EPOLLONESHOT or
// accepting inline is wanted.
int epollRun(int lfd)
{
    // 1. Create the epoll instance (the argument is ignored, it only has to be > 0)
    int epfd = epoll_create(1);
    if (epfd == -1)
    {
        perror("epoll_create");
        return -1;
    }

    // 2. Register lfd; an incoming connection shows up as a read event
    struct epoll_event ev;
    ev.data.fd = lfd; // data is a union; only the fd member is used here
    ev.events = EPOLLIN;
    if (epoll_ctl(epfd, EPOLL_CTL_ADD, lfd, &ev) == -1)
    {
        perror("epoll_ctl");
        close(epfd);
        return -1;
    }

    // Workers are never joined, so create them detached; otherwise every
    // finished thread would keep its resources until process exit
    pthread_attr_t attr;
    int rc = pthread_attr_init(&attr);
    if (rc == 0)
    {
        rc = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
        if (rc != 0)
        {
            pthread_attr_destroy(&attr);
        }
    }
    if (rc != 0)
    {
        fprintf(stderr, "pthread_attr: %s\n", strerror(rc));
        close(epfd);
        return -1;
    }

    // 3. Wait for events
    struct epoll_event evs[1024];
    int size = sizeof(evs) / sizeof(evs[0]);
    while (1)
    {
        // A timeout of -1 blocks until at least one descriptor is ready
        int num = epoll_wait(epfd, evs, size, -1);
        if (num == -1)
        {
            if (errno == EINTR)
            {
                continue; // interrupted by a signal, simply wait again
            }
            perror("epoll_wait");
            break; // a persistent failure would otherwise spin forever
        }

        for (int i = 0; i < num; i++)
        {
            struct FdInfo *info = malloc(sizeof *info);
            if (info == NULL)
            {
                perror("malloc");
                continue;
            }
            info->fd = evs[i].data.fd;
            info->epfd = epfd;

            // Listening descriptor -> accept a new connection,
            // anything else      -> read the client's request
            void *(*worker)(void *) = (info->fd == lfd) ? acceptClient : recvHttpRequest;

            // pthread_create reports errors through its return value, not errno
            rc = pthread_create(&info->tid, &attr, worker, info);
            if (rc != 0)
            {
                fprintf(stderr, "pthread_create: %s\n", strerror(rc));
                free(info); // no worker exists to release it
            }
        }
    }

    pthread_attr_destroy(&attr);
    close(epfd);
    return -1;
}

epoll 是 IO 多路轉接 / 複用中的一個實現，可以大大提高效率，IO 多路轉接/複用可以實現一個執行緒就監視多個檔案描述符，其實現機制是在核心區中監視的，也就是可以大大減小開銷，不用手動建立執行緒阻塞等待連線了，由核心區監視是否有連線請求
而 epoll 是實現方式中效率較高的，是基於紅黑樹實現的，搜尋起來快速

acceptClient

// Thread entry point: accept one pending connection on the listening socket.
// arg points to a heap-allocated struct FdInfo (fd = listening socket,
// epfd = epoll instance). This function takes ownership of it and frees it on
// every path. The accepted socket is switched to non-blocking mode and
// registered with the epoll instance for edge-triggered read events.
// Always returns NULL.
void *acceptClient(void *arg)
{
    struct FdInfo *info = (struct FdInfo *)arg;

    // 1. Accept the connection; the peer address is not needed
    int cfd = accept(info->fd, NULL, NULL);
    if (cfd == -1)
    {
        perror("accept");
        goto done;
    }

    // 2. Switch the client socket to non-blocking mode (needed for EPOLLET)
    int flag = fcntl(cfd, F_GETFL);
    if (flag == -1 || fcntl(cfd, F_SETFL, flag | O_NONBLOCK) == -1)
    {
        perror("fcntl");
        close(cfd);
        goto done;
    }

    // 3. Register cfd with epoll, edge-triggered
    struct epoll_event ev;
    ev.data.fd = cfd;
    ev.events = EPOLLIN | EPOLLET;
    if (epoll_ctl(info->epfd, EPOLL_CTL_ADD, cfd, &ev) == -1)
    {
        perror("epoll_ctl");
        close(cfd); // nothing would ever service this socket
        goto done;
    }

    printf("acceptClient threadId: %ld\n", (long)info->tid);

done:
    free(info);
    return NULL;
}

epoll 工作模式中，邊緣非阻塞模式效率最高，因此採用這個，所以設定了檔案描述符為非阻塞模式（預設為阻塞）
這裡的連線和接收資料用多執行緒處理效率更高，即使之前已經實現了多個客戶端和多個伺服器端通訊

recvHttpRequest

// Thread entry point: read one HTTP request from a client socket and answer it.
// arg points to a heap-allocated struct FdInfo (fd = non-blocking client
// socket, epfd = epoll instance); it is freed before returning.
// The socket is drained until recv() reports that no more data is available.
// Only the first 4095 bytes are kept, which is enough for the request line,
// the only part that is parsed. Always returns NULL.
//
// NOTE(review): the connection is left open after the response is sent.
// Confirm that this is intended for responses sent without a content length.
void *recvHttpRequest(void *arg)
{
    struct FdInfo *info = (struct FdInfo *)arg;

    char tmp[1024];
    char buf[4096] = {0};
    size_t total = 0; // bytes stored in buf so far (also the write offset)
    ssize_t len = 0;
    int err = 0;

    for (;;)
    {
        len = recv(info->fd, tmp, sizeof tmp, 0);
        if (len > 0)
        {
            // Keep what still fits (one byte is reserved for the terminator)
            // and silently drop the rest
            size_t room = sizeof buf - 1 - total;
            size_t take = (size_t)len < room ? (size_t)len : room;
            memcpy(buf + total, tmp, take);
            total += take;
            continue;
        }
        err = errno; // capture before any other call can overwrite it
        if (len == -1 && err == EINTR)
        {
            continue; // interrupted by a signal, retry
        }
        break;
    }

    if (len == 0)
    {
        // The client closed the connection: stop monitoring the descriptor
        epoll_ctl(info->epfd, EPOLL_CTL_DEL, info->fd, NULL);
        close(info->fd);
    }
    else if (err == EAGAIN || err == EWOULDBLOCK)
    {
        // Everything currently available has been read. Parse the request
        // line, but only if a complete line (terminated by CRLF) arrived;
        // strstr returns NULL for an empty or partial request.
        char *lineEnd = strstr(buf, "\r\n");
        if (lineEnd != NULL)
        {
            *lineEnd = '\0';
            parseRequestLine(buf, info->fd);
        }
    }
    else
    {
        errno = err;
        perror("recv");
    }

    printf("resvMsg threadId: %ld\n", (long)info->tid);
    free(info);
    return NULL;
}

上述 total 是偏移量，因為 memcpy 是從起始位置開始複製
雖然 buf 只有 4096 位元組，存在讀不完所有的請求資料，但是這也是沒問題的，有用的資料 4096 已經夠了，因為請求行最重要，只需要知道客戶端向伺服器請求的靜態資源是什麼，即便後面沒讀完，也不影響
由於這個套接字是非阻塞，所以當資料讀完後，不阻塞，但是返回 -1，但是讀取資料失敗也是返回 -1，這就無法判斷是否是讀取完資料了，此時再用到 errno == EAGAIN 就能判斷成功
如果套接字是阻塞的，當讀取完資料後，會一直阻塞，所以書寫邏輯需要更改，內部判斷是否讀取完畢，然後 break 迴圈

parseRequestLine

// Return 1 when `path` contains a ".." path component, 0 otherwise.
static int hasParentComponent(const char *path)
{
    const char *p = path;
    while (*p != '\0')
    {
        while (*p == '/')
        {
            p++;
        }
        const char *seg = p;
        while (*p != '\0' && *p != '/')
        {
            p++;
        }
        if (p - seg == 2 && seg[0] == '.' && seg[1] == '.')
        {
            return 1;
        }
    }
    return 0;
}

// Parse an HTTP request line ("GET /xxx/1.jpg HTTP/1.1") and send the
// matching response on cfd: a directory listing, the file contents, or
// 404 plus 404.html when the target does not exist.
//   line  request line without the trailing CRLF
//   cfd   connected client socket
// Returns 0 when a response was sent. Returns -1 when the line is malformed
// or the method is not GET (nothing is sent in that case).
int parseRequestLine(const char *line, int cfd)
{
    // The field widths keep an over-long method or path from overflowing
    char method[12] = {0};
    char path[1024] = {0};
    if (sscanf(line, "%11[^ ] %1023[^ ]", method, path) != 2)
    {
        return -1;
    }
    if (strcasecmp(method, "get") != 0) // case-insensitive comparison
    {
        // Only GET requests are handled
        return -1;
    }

    // Drop the query string before decoding, so an encoded "%3F" in a file
    // name is not mistaken for the separator
    char *query = strchr(path, '?');
    if (query != NULL)
    {
        *query = '\0';
    }

    // Percent-decode in place (needed for non-ASCII names). Decoding never
    // lengthens the string.
    decodeMsg(path, path);

    if (path[0] != '/')
    {
        return -1;
    }

    // Refuse to leave the served directory through ".." components
    if (hasParentComponent(path))
    {
        sendHeadMsg(cfd, 403, "Forbidden", getFileType(".html"), 0);
        return 0;
    }

    // Turn the request path into a path relative to the working directory.
    // Every leading '/' is removed, so "//etc/passwd" cannot become absolute.
    const char *file = path;
    while (*file == '/')
    {
        file++;
    }
    if (*file == '\0')
    {
        file = "./"; // the resource root itself
    }

    struct stat st;
    if (stat(file, &st) == -1)
    {
        // Not found: reply 404. The length is passed as -1 (unknown).
        sendHeadMsg(cfd, 404, "Not Found", getFileType(".html"), -1);
        sendFile("404.html", cfd); // must be deployed in the resource root
        return 0;
    }

    if (S_ISDIR(st.st_mode))
    {
        // Send a listing of the directory
        sendHeadMsg(cfd, 200, "OK", getFileType(".html"), -1);
        sendDir(file, cfd);
    }
    else
    {
        // Send the file contents
        sendHeadMsg(cfd, 200, "OK", getFileType(file), st.st_size);
        sendFile(file, cfd);
    }
    return 0;
}

sendFile

// Send the contents of fileName (the response body) to the socket cfd.
// Returns 0 when the whole file was handed to the socket. Returns -1 when
// the file cannot be opened or sized, or when sending fails. The file
// descriptor is closed on every path.
int sendFile(const char *fileName, int cfd)
{
    // 1. Open the file. A missing file is an ordinary runtime condition and
    //    must not abort the whole server.
    int fd = open(fileName, O_RDONLY);
    if (fd == -1)
    {
        perror("open");
        return -1;
    }

    int rc = 0;
#if 0
    // Alternative implementation: plain read()/send() loop
    while (1)
    {
        char buf[1024];
        int len = read(fd, buf, sizeof buf);
        if (len > 0)
        {
            send(cfd, buf, len, 0);
            usleep(10); // brief pause between chunks (the author found this essential)
        }
        else if (len == 0) // the whole file has been read
            break;
        else
        {
            perror("read");
            rc = -1;
            break;
        }
    }
#else
    // off_t rather than int, so large files are not truncated
    off_t size = lseek(fd, 0, SEEK_END);
    if (size == (off_t)-1 || lseek(fd, 0, SEEK_SET) == (off_t)-1)
    {
        perror("lseek");
        close(fd);
        return -1;
    }

    off_t offset = 0; // in/out: sendfile advances it by the bytes sent
    while (offset < size)
    {
        ssize_t ret = sendfile(cfd, fd, &offset, (size_t)(size - offset));
        int err = errno; // capture before printf can overwrite it
        printf("ret value: %zd\n", ret);

        if (ret > 0)
        {
            continue;
        }
        if (ret == 0)
        {
            break; // nothing more to read (the file shrank meanwhile)
        }
        if (err == EINTR)
        {
            continue;
        }
        if (err == EAGAIN || err == EWOULDBLOCK)
        {
            // Non-blocking socket whose send buffer is full: sleep until it
            // is writable again instead of spinning on sendfile
            struct pollfd pfd = {.fd = cfd, .events = POLLOUT};
            if (poll(&pfd, 1, -1) == -1 && errno != EINTR)
            {
                perror("poll");
                rc = -1;
                break;
            }
            continue;
        }

        // Hard error (for example the peer went away): retrying would loop forever
        errno = err;
        perror("sendfile");
        rc = -1;
        break;
    }

#endif
    close(fd);
    return rc;
}

上述是傳送檔案的兩種方式
第一種方式的 usleep(10) 很重要，傳送資料很快，但是客戶端讀資料不一定這麼快，客戶端需要讀取資料，然後進行解析，然後呈現出，這都需要耗時間的，不休眠一會兒，會存在接收資料不一致的問題（我遭受過...）
第二種方式使用庫函式 sendfile ，透過這個函式傳送，比手寫的傳送檔案程式碼效率高，因為會減少複製次數，第四個引數是傳送的大小，size - offset 的原因是 offset 這個引數是傳入傳出引數，會偏移到傳送的位置，由於多次傳送，前面傳送了資料之後，就不是 size 了，就需要減去傳送的位元組數，也就是傳出的偏移量 offset
注意 lseek 函式計算檔案大小，會移動檔案的指標，且 sendfile 內部也有快取大小的限制，一次呼叫不一定能把整個檔案發完，因此需要迴圈讀取傳送
if 判斷是因為通訊的檔案描述符改為了非阻塞模式：當套接字的傳送緩衝區已滿、暫時無法寫入時，sendfile 不會阻塞等待，而是直接返回 -1（此時 errno 為 EAGAIN），這並不是真正的錯誤，所以就需要再加個判斷

sendHeadMsg

// Send the response head (status line + headers + blank line) to cfd.
//   status  HTTP status code, e.g. 200
//   descr   reason phrase, e.g. "OK"
//   type    value of the content-type header
//   length  body length in bytes. A negative value means "unknown": the
//           content-length header is then left out, because a negative
//           content-length is not valid HTTP
// Returns 0 on success. Returns -1 when the head does not fit into the
// buffer or send() fails.
int sendHeadMsg(int cfd, int status, const char *descr, const char *type, int length)
{
    char buf[4096] = {0};

    // Status line and content type. The protocol name must be upper case.
    int used = snprintf(buf, sizeof buf,
                        "HTTP/1.1 %d %s\r\n"
                        "content-type: %s\r\n",
                        status, descr, type);
    if (used < 0 || (size_t)used >= sizeof buf)
    {
        return -1;
    }

    // Optional content-length, followed by the blank line that ends the head
    size_t room = sizeof buf - (size_t)used;
    int extra;
    if (length >= 0)
    {
        extra = snprintf(buf + used, room, "content-length: %d\r\n\r\n", length);
    }
    else
    {
        extra = snprintf(buf + used, room, "\r\n");
    }
    if (extra < 0 || (size_t)extra >= room)
    {
        return -1;
    }
    used += extra;

    if (send(cfd, buf, (size_t)used, 0) == -1)
    {
        perror("send");
        return -1;
    }
    return 0;
}

getFileType

// Map a file name to the content type used in the HTTP response.
// The decision is based on the text from the last '.' in `name` to its end
// (case-sensitive). Names without a '.' and unknown suffixes are served as
// UTF-8 plain text. The returned string is a constant and must not be freed.
const char *getFileType(const char *name)
{
    static const struct
    {
        const char *ext;
        const char *mime;
    } types[] = {
        {".html", "text/html; charset=utf-8"},
        {".htm", "text/html; charset=utf-8"},
        {".jpg", "image/jpeg"},
        {".jpeg", "image/jpeg"},
        {".gif", "image/gif"},
        {".png", "image/png"},
        {".css", "text/css"},
        {".au", "audio/basic"},
        {".wav", "audio/wav"},
        {".mp3", "audio/mp3"},
        // more types can be added here
    };
    static const char plainText[] = "text/plain; charset=utf-8";

    // Search for '.' from the right; NULL means the name has no suffix
    const char *dot = strrchr(name, '.');
    if (dot == NULL)
    {
        return plainText;
    }

    for (size_t i = 0; i < sizeof types / sizeof types[0]; i++)
    {
        if (strcmp(dot, types[i].ext) == 0)
        {
            return types[i].mime;
        }
    }
    return plainText;
}

sendDir

下述拼接是這樣的

<html>
    <head>
        <title>test</title>
    </head>
    <body>
        <table>    <!-- the opening fragment is assembled up to here -->
            <tr>            <!-- rows appended in the middle, one per directory entry -->
                <td></td>
                <td></td>
            </tr>
            <tr>
                <td></td>
                <td></td>
            </tr>
        </table>   <!-- the closing fragment starts here -->
    </body>
</html>

// Send an HTML listing of directory dirName to the socket cfd.
// The page is sent piece by piece: the opening fragment (head, styles, table
// header), one table row per directory entry, then the closing fragment.
// Directory entries link to "name/", other entries to "name".
// Returns 0 on success, or -1 when the directory cannot be read.
//
// NOTE(review): entry names are inserted into the HTML and the href without
// escaping, and send() results are not checked (as before). Confirm whether
// that is acceptable for the directories being served.
int sendDir(const char *dirName, int cfd)
{
    // scandir allocates the entry array and every entry, so both must be
    // freed. Third argument: filter callback (none). Fourth: sort order.
    struct dirent **namelist = NULL;
    int num = scandir(dirName, &namelist, NULL, alphasort);
    if (num == -1)
    {
        // namelist is not valid here, so it must not be freed
        perror("scandir");
        return -1;
    }

    // Opening fragment; sent straight away so it is not lost for an empty listing
    char buf[8192] = {0};
    snprintf(buf, sizeof buf,
             "<html>"
             "<head>"
             "<title>%s</title>"
             "<style>"
             "body { font-family: Arial, sans-serif; margin: 20px; background-color: #f4f4f4; }"
             "h1 { color: #2c3e50; text-align: center; }"
             "table { width: 100%%; border-collapse: collapse; margin-top: 20px; }"
             "th, td { border: 1px solid #ddd; padding: 12px; text-align: left; }"
             "th { background-color: #3498db; color: white; }"
             "tr:hover { background-color: #e7f3ff; }"
             "a { text-decoration: none; color: #3498db; transition: color 0.3s; }"
             "a:hover { color: #2980b9; text-decoration: underline; }"
             "</style>"
             "</head>"
             "<body><h1>%s</h1><table><tr><th>名稱</th><th>大小 (位元組)</th></tr>",
             dirName, dirName);
    send(cfd, buf, strlen(buf), 0);

    for (int i = 0; i < num; i++)
    {
        const char *name = namelist[i]->d_name;

        // Look up size and type. If the path does not fit or stat fails, the
        // entry is listed as a 0-byte file rather than reading garbage.
        char subPath[1024];
        long size = 0;
        int isDir = 0;
        struct stat st;
        int n = snprintf(subPath, sizeof subPath, "%s/%s", dirName, name);
        if (n >= 0 && (size_t)n < sizeof subPath && stat(subPath, &st) == 0)
        {
            size = (long)st.st_size;
            isDir = S_ISDIR(st.st_mode);
        }

        // A trailing '/' on directory links lets the browser descend into them
        snprintf(buf, sizeof buf,
                 "<tr><td><a href=\"%s%s\">%s</a></td><td>%ld</td></tr>",
                 name, isDir ? "/" : "", name, size);
        send(cfd, buf, strlen(buf), 0);

        free(namelist[i]);
    }

    // Closing fragment
    snprintf(buf, sizeof buf, "</table></body></html>");
    send(cfd, buf, strlen(buf), 0);

    free(namelist);
    return 0;
}

拼接 html 網頁元素，是因為需要一個網頁形式傳送給瀏覽器
可以拼一份，發一份，因為底層使用的 TCP 協議
注意上述 a 標籤那兒 \"%s/\" 需要 \ 轉義，因為前面已經有 " 了，所以需要用 \ 轉義，%s 後面加 / 是因為可能需要點選進入這個子目錄，所以必須要這個 /

注意: 中文亂碼問題

HTTP 請求的 URL 中不能直接包含特殊字元 (如中文)，瀏覽器在傳送請求前會先把這些字元按 UTF-8 編碼，再把每個位元組轉成 %XX 的形式 (百分號編碼)，也就是如果當前檔名為中文，那麼伺服器收到的請求路徑裡是一串 %XX，而不是原本的中文
如 /Linux%E5%86%85%E6%A0%B8.jpg 原本是 /Linux核心.jpg ，這樣之後傳送資訊時就打不開了，報錯 Not Found ，因為本地檔名是帶有中文，但是經過程式碼處理後，程式讀出的檔名沒有中文，就找不到了
因此需要轉換一下

decodeMsg

// Convert one hexadecimal digit character ('0'-'9', 'a'-'f', 'A'-'F')
// to its numeric value 0-15. Any other character yields 0.
int hexToDec(char c)
{
    int value = 0;

    if ('0' <= c && c <= '9')
    {
        value = c - '0';
    }
    else if ('A' <= c && c <= 'F')
    {
        value = 10 + (c - 'A');
    }
    else if ('a' <= c && c <= 'f')
    {
        value = 10 + (c - 'a');
    }

    return value;
}

// Percent-decode a string: every "%XY" with two hexadecimal digits becomes
// the single byte 0xXY, and everything else is copied unchanged.
//   to    output parameter, receives the decoded, NUL-terminated string
//   from  input parameter, the string to decode
// The output is never longer than the input, so `to` may be the same buffer
// as `from` (in-place decoding).
void decodeMsg(char *to, char *from)
{
    while (*from != '\0')
    {
        // The <ctype.h> functions require an unsigned char value; a plain
        // char holding a byte >= 0x80 may be negative, which is undefined
        // behaviour. The && chain stops at the terminator, so from[2] is
        // only read when from[1] is a hex digit.
        if (from[0] == '%' &&
            isxdigit((unsigned char)from[1]) &&
            isxdigit((unsigned char)from[2]))
        {
            // Two hex digits -> one byte
            *to++ = (char)(hexToDec(from[1]) * 16 + hexToDec(from[2]));

            // Skip '%' and the two digits
            from += 3;
        }
        else
        {
            // Ordinary character: copy as is
            *to++ = *from++;
        }
    }
    *to = '\0'; // terminate the decoded string
}

這個沒必要理解，直接網上搜尋即可，這裡讓 GPT 潤色修改成功的

404.html

<!DOCTYPE html>
<!-- Static "page not found" document: a centred card with the error code,
     a short message and a link back to the site root. -->
<html lang="zh">

<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>404 頁面未找到</title>
    <style>
        /* Centre the card horizontally and vertically in the viewport */
        body {
            font-family: Arial, sans-serif;
            background-color: #f4f4f4;
            margin: 0;
            padding: 0;
            display: flex;
            justify-content: center;
            align-items: center;
            height: 100vh;
            text-align: center;
        }

        /* The white card that holds the message */
        .container {
            background-color: #fff;
            border-radius: 8px;
            box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
            padding: 40px;
            max-width: 400px;
            width: 100%;
        }

        /* Large red error code */
        h1 {
            font-size: 72px;
            color: #e74c3c;
            margin: 0;
        }

        h2 {
            color: #333;
            margin: 20px 0;
        }

        p {
            color: #666;
            margin-bottom: 20px;
        }

        /* The "back to home" link, styled as a button */
        a {
            text-decoration: none;
            background-color: #3498db;
            color: #fff;
            padding: 10px 20px;
            border-radius: 5px;
            transition: background-color 0.3s;
        }

        a:hover {
            background-color: #2980b9;
        }
    </style>
</head>

<body>
    <div class="container">
        <h1>404</h1>
        <h2>頁面未找到</h2>
        <p>抱歉，我們找不到您請求的頁面。</p>
        <a href="/">返回首頁</a>
    </div>
</body>

</html>