pybind11 教程

夜sea如烟發表於2024-07-25

pybind11 教程

用途

透過pybind11可以實現以下功能:

  1. 將python中實現慢的邏輯,使用C++程式碼重寫,從而提升程式效率
  2. 將現有的C++程式碼編譯為python模組,減少重複開發,方便在python中整合

本次闡述透過pybind11實現以下幾個功能:

  1. C++中的物件或方法如何在python中使用
  2. pybind11中的gil鎖
  3. python物件如何在C++中使用

安裝方式

可以透過pip的方式安裝:pip install pybind11
安裝完成之後,可以執行pybind11-config --cmakedir確認CMake的配置

以下程式碼都已開源到github上:https://github.com/yeseary/pybind11_example

執行方式

# If conda is available, switch to the target environment first
conda activate py36
cd example_py
cmake .. && make -j
python example.py  # the simplest example
python example_gil.py  # GIL handling
python example_multi.py  # multithreading
python example_object.py  # passing objects

CMakeLists的配置例項

以下是一個CMakeLists.txt的配置示例:

cmake_minimum_required(VERSION 3.4...3.18)

# Toolchain selection must happen before project(): project() is what detects
# the compiler and target architecture, so setting these afterwards is too late.
set(CMAKE_CXX_COMPILER "/usr/bin/clang++")  # C++ compiler to use
set(CMAKE_OSX_ARCHITECTURES "x86_64")  # per the original note: required on Apple M1 machines

project(example)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# Print the full compiler/linker command lines during the build
set(CMAKE_VERBOSE_MAKEFILE ON)


# Add pybind11's CMake directory to the CMake package search path
execute_process(
    COMMAND pybind11-config --cmakedir
    OUTPUT_VARIABLE PYBIND11_CMAKE_DIR
    OUTPUT_STRIP_TRAILING_WHITESPACE
)
list(APPEND CMAKE_PREFIX_PATH ${PYBIND11_CMAKE_DIR})
message(STATUS "CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}")
find_package(pybind11 REQUIRED)

# Production settings
add_compile_options(-g -O2)
set(CMAKE_BUILD_TYPE "Release")

# Debug settings
# set(CMAKE_BUILD_TYPE "Debug")


# Build the pybind11 extension modules (one per example script)
pybind11_add_module(example example.cpp)
pybind11_add_module(example_gil example_gil.cpp)
pybind11_add_module(example_multi example_multi.cpp)
pybind11_add_module(example_object example_object.cpp)

C++中的物件或方法如何在python中使用

example.cpp檔案如下:

#include <pybind11/pybind11.h>

namespace py = pybind11;

// A simple C++ function: returns the sum of its two integer arguments.
int add(int i, int j) {
    const int sum = i + j;
    return sum;
}

// A simple C++ class that stores a single name string.
class Pet {
    std::string name;  // the pet's current name

public:
    // Creates a pet whose name is a copy of the given string.
    Pet(const std::string &name) : name{name} {}

    // Returns a const reference to the stored name.
    const std::string &getName() const { return this->name; }

    // Overwrites the stored name with name_.
    void setName(const std::string &name_) { this->name = name_; }
};

// Binding code: defines the Python module `example`, exposing the free
// function `add` and the class `Pet` with its constructor and two accessors.
PYBIND11_MODULE(example, m) {
    // Optional module docstring
    m.doc() = "pybind11 example plugin";

    // Free function
    m.def("add", &add, "A function which adds two numbers");

    // Class binding, built step by step on a named class_ object
    py::class_<Pet> pet_class(m, "Pet");
    pet_class.def(py::init<const std::string &>());
    pet_class.def("setName", &Pet::setName);
    pet_class.def("getName", &Pet::getName);
}

example.py檔案如下:

import example

# 呼叫 C++ 函式
print(example.add(1, 2))  # 輸出: 3

# 使用 C++ 類
pet = example.Pet("Milo")
print(pet.getName())  # 輸出: Milo
pet.setName("Otis")
print(pet.getName())  # 輸出: Otis

pybind11中的gil鎖

GIL(全域性直譯器鎖)是python直譯器中的一把全域性鎖:在同一時刻,只有一個執行緒可以執行python的程式碼,所以python的多執行緒無法利用多核CPU。
但是透過pybind11可以在C++程式碼中主動釋放GIL,從而靈活地擺脫GIL鎖的限制,真正並行地使用多執行緒。

example_gil.cpp程式碼如下:

#include <pybind11/pybind11.h>
#include <pybind11/functional.h>
#include "Python.h"
#include <chrono>
#include <iostream>
#include <thread>
#include <vector>

namespace py = pybind11;


// Simulates a long-running computation by sleeping for 5 seconds.
//
// no_gil: when true, the GIL is released for the duration of the sleep so
//         other Python threads can run; when false, the sleep happens with
//         the GIL still held by the calling thread.
void long_computation(bool no_gil) {
    constexpr auto kDuration = std::chrono::seconds(5);
    if (no_gil) {
        // RAII guard: releases the GIL here and re-acquires it when the guard
        // goes out of scope, so no explicit gil_scoped_acquire is needed.
        py::gil_scoped_release release;
        std::this_thread::sleep_for(kDuration);
    } else {
        std::this_thread::sleep_for(kDuration);
    }
}

// Simulates a compute-heavy task: runs num_iterations rounds of a fixed-size
// floating-point accumulation, then sleeps for 5 seconds.
// thread_id is accepted but not used, and the accumulated value is discarded.
void compute(int thread_id, int num_iterations) {
    constexpr int kInnerSteps = 10000;

    for (int round = 0; round < num_iterations; round++) {
        // Stand-in for real work
        double acc = 0.0;
        for (int step = 0; step < kInnerSteps; step++) {
            acc += step * 0.001;
        }
    }

    const std::chrono::seconds pause(5);
    std::this_thread::sleep_for(pause);
}

// Runs `compute` on num_threads C++ threads and blocks until all of them
// have finished.
//
// num_threads:    number of worker threads to start; values <= 0 start none.
// num_iterations: forwarded unchanged to every `compute` call.
void parallel_compute(int num_threads, int num_iterations) {
    // Release the GIL for the whole call. The guard re-acquires it in its
    // destructor when the function returns, so no explicit
    // gil_scoped_acquire is needed at the end.
    py::gil_scoped_release release;

    std::vector<std::thread> threads;
    if (num_threads > 0) {
        // One allocation up front instead of repeated growth while spawning
        threads.reserve(static_cast<std::vector<std::thread>::size_type>(num_threads));
    }
    for (int i = 0; i < num_threads; ++i) {
        threads.emplace_back(compute, i, num_iterations);
    }

    // Wait for every worker before returning
    for (auto &t : threads) {
        t.join();
    }
}

// Binding code: defines the Python module `example_gil` and exposes the two
// functions above with named (keyword-capable) arguments.
PYBIND11_MODULE(example_gil, m) {
    // Optional module docstring
    m.doc() = "pybind11 example plugin";

    m.def("long_computation",
          &long_computation,
          "A function that performs a long computation",
          py::arg("no_gil"));

    m.def("parallel_compute",
          &parallel_compute,
          "A function that performs parallel computation",
          py::arg("num_threads"),
          py::arg("num_iterations"));
}

example_gil.py程式碼如下:

import time
import example_gil
import threading

def run_task(no_gil:bool):
    """Run example_gil.long_computation on three Python threads and print the elapsed wall time.

    no_gil is forwarded unchanged to every long_computation call.
    """
    n = 3

    def run_computation(i):
        # i is the worker index; it is accepted but not used
        print("開始長耗時任務計算...")
        example_gil.long_computation(no_gil)
        print("開始長耗時任務計算結束.")

    start_time = time.time()

    # Build every thread object first, then start them all
    threads = []
    for i in range(n):
        threads.append(threading.Thread(target=run_computation, args=(i,)))
    for worker in threads:
        worker.start()

    # Block until every worker has finished
    for worker in threads:
        worker.join()

    end_time = time.time()
    print(f"釋放GIL鎖:{no_gil},時間: {end_time - start_time} seconds")

# Run the threaded benchmark twice: first with no_gil=True, then with no_gil=False
for release_flag in (True, False):
    run_task(no_gil=release_flag)

# Settings for the run where the threads are created on the C++ side
num_threads, num_iterations = 4, 10000

start_time = time.time()
example_gil.parallel_compute(num_threads, num_iterations)
end_time = time.time()

print(f"Parallel computation with {num_threads} threads took {end_time - start_time} seconds")

執行結果如下:

開始長耗時任務計算...
開始長耗時任務計算...
開始長耗時任務計算...
開始長耗時任務計算結束.
開始長耗時任務計算結束.
開始長耗時任務計算結束.
釋放GIL鎖:True,時間: 5.005779027938843 seconds
開始長耗時任務計算...
開始長耗時任務計算...
開始長耗時任務計算結束.
開始長耗時任務計算...
開始長耗時任務計算結束.
開始長耗時任務計算結束.
釋放GIL鎖:False,時間: 15.013175964355469 seconds
Parallel computation with 4 threads took 5.391555070877075 seconds

在C++程式碼中,可以透過pybind11的介面進行GIL鎖的管理,對於以下場景:

  1. python單執行緒,C++多執行緒,可以利用多核;
  2. python多執行緒,C++單執行緒,如果不釋放GIL鎖,序列執行,釋放GIL鎖後可以並行;
  3. python多執行緒,C++多執行緒,同2;

python物件如何在C++中使用

example_object.cpp的程式碼如下(注意:原文此處貼出的內容與下方的example_object.py完全相同,並非C++程式碼;實際的C++原始碼請參見上文GitHub倉庫中的example_object.cpp):

import time
import example_object as example
import timeit

# Benchmark how long it takes to hand a Python list to the extension module
n = int(1e4)
input_list = list(range(n))

# Time n calls of pass_list; the clock starts after the input is built so
# list construction is not counted
start_time = time.time()
for _ in range(n):
    squared_list = example.pass_list(input_list)
time_cost = time.time() - start_time
print("run time:{}s".format(time_cost))
print("avg pass list time cost:{}ms".format(time_cost / n*1000))

# Restart the clock: without this the pass_vec figures would also include
# the whole pass_list loop above
start_time = time.time()
for _ in range(n):
    squared_list = example.pass_vec(input_list)
time_cost = time.time() - start_time
print("run time:{}s".format(time_cost))
print("avg pass vec time cost:{}ms".format(time_cost / n*1000))


# Dictionary with 10 million entries, passed repeatedly to print_dict below
large_dict = {f"key{i}": i for i in range(10000000)}

def run():
    """Call example.print_dict(large_dict) one million times and print the total and per-call time."""
    start_time = time.time()

    for _ in range(1000000):
        example.print_dict(large_dict)
    time_cost = time.time() - start_time
    print("run time:{}s".format(time_cost))
    print("avg time cost:{}ms".format(time_cost / 1000000*1000))

run()

# Exercise call_python_function
def my_python_function():
    # NOTE(review): defined but never used; the builtin id is passed below instead
    return "Hello from Python!"

input_dict = dict(a=1, b=2, c=3)
example.call_python_function(id, input_dict)
print("input_dict:", id(input_dict))

python程式碼example_object.py如下:

import time
import example_object as example
import timeit

# Benchmark how long it takes to hand a Python list to the extension module
n = int(1e4)
input_list = list(range(n))

# Time n calls of pass_list; the clock starts after the input is built so
# list construction is not counted
start_time = time.time()
for _ in range(n):
    squared_list = example.pass_list(input_list)
time_cost = time.time() - start_time
print("run time:{}s".format(time_cost))
print("avg pass list time cost:{}ms".format(time_cost / n*1000))

# Restart the clock: without this the pass_vec figures would also include
# the whole pass_list loop above
start_time = time.time()
for _ in range(n):
    squared_list = example.pass_vec(input_list)
time_cost = time.time() - start_time
print("run time:{}s".format(time_cost))
print("avg pass vec time cost:{}ms".format(time_cost / n*1000))


# Dictionary with 10 million entries, passed repeatedly to print_dict below
large_dict = {f"key{i}": i for i in range(10000000)}

def run():
    """Call example.print_dict(large_dict) one million times and print the total and per-call time."""
    start_time = time.time()

    for _ in range(1000000):
        example.print_dict(large_dict)
    time_cost = time.time() - start_time
    print("run time:{}s".format(time_cost))
    print("avg time cost:{}ms".format(time_cost / 1000000*1000))

run()

# Exercise call_python_function
def my_python_function():
    # NOTE(review): defined but never used; the builtin id is passed below instead
    return "Hello from Python!"

input_dict = dict(a=1, b=2, c=3)
example.call_python_function(id, input_dict)
print("input_dict:", id(input_dict))

執行結果如下:

run time:0.0034627914428710938s
avg pass list time cost:0.00034627914428710934ms
run time:1.315690040588379s
avg pass vec time cost:0.1315690040588379ms
run time:0.30083274841308594s
avg time cost:0.00030083274841308596ms
Result from Python function: 140240346308040
input_dict: 140240346308040

需要注意如果資料型別不一致的情況下,pybind11會進行型別轉換,當資料量非常大的時候,這個轉換耗時將不可忽略。因此,如果涉及到大量的資料轉換,最好避免型別轉換。

參考

  1. 給Python演算法插上效能的翅膀——pybind11落地實踐