pybind11 教程
用途
透過pybind11可以實現以下功能:
- 將python中實現慢的邏輯,使用C++程式碼重寫,從而提升程式效率
- 將現有的C++程式碼編譯為python模組,減少重複開發,方便在python中整合
本次闡述透過pybind11實現以下幾個功能:
- C++中的物件或方法如何在python中使用
- pybind11中的gil鎖
- python物件如何在C++中使用
安裝方式
可以透過pip的方式安裝:pip install pybind11
安裝完成之後,可以執行pybind11-config --cmakedir
確認CMake的配置
以下程式碼都已開源到github上:https://github.com/yeseary/pybind11_example
執行方式
# Build the extension modules and run each demo script.
# If conda is available, switch to the target environment first
conda activate py36
cd example_py
cmake .. && make -j
python example.py # simplest example
python example_gil.py # GIL handling
python example_multi.py # multithreading
python example_object.py # passing objects
CMakeLists的配置例項
以下是一個CMakeLists.txt的配置示例:
cmake_minimum_required(VERSION 3.4...3.18)
project(example)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# Print the full compile/link command lines during the build.
set(CMAKE_VERBOSE_MAKEFILE ON)

# Compiler selection.
# NOTE(review): CMake documents that CMAKE_CXX_COMPILER should be chosen before
# project() (e.g. -DCMAKE_CXX_COMPILER=... or a toolchain file); confirm this
# in-file override behaves as intended on your platform.
set(CMAKE_CXX_COMPILER "/usr/bin/clang++") # C++ compiler to use
set(CMAKE_OSX_ARCHITECTURES "x86_64") # required on Mac M1 chips

# Add pybind11's CMake directory to the CMake package search path.
execute_process(
    COMMAND pybind11-config --cmakedir
    OUTPUT_VARIABLE PYBIND11_CMAKE_DIR
    OUTPUT_STRIP_TRAILING_WHITESPACE
)
list(APPEND CMAKE_PREFIX_PATH ${PYBIND11_CMAKE_DIR})
message(CMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}")
find_package(pybind11 REQUIRED)

# Production build settings.
add_compile_options(-g -O2)
set(CMAKE_BUILD_TYPE "Release")
# Debug build instead:
# set(CMAKE_BUILD_TYPE "Debug")

# Export the pybind11 modules.
# `example` is imported by example.py, so it has to be built as well.
pybind11_add_module(example example.cpp)
pybind11_add_module(example_gil example_gil.cpp)
pybind11_add_module(example_multi example_multi.cpp)
pybind11_add_module(example_object example_object.cpp)
C++中的物件或方法如何在python中使用
example.cpp檔案如下:
#include <pybind11/pybind11.h>
namespace py = pybind11;
/// Returns the sum of the two integer arguments.
int add(int i, int j) {
    const int sum = i + j;
    return sum;
}
/// A simple C++ class that stores a single, replaceable name.
class Pet {
    std::string petName;  // the pet's current name

public:
    /// Creates a pet whose name is a copy of `name`.
    Pet(const std::string &name) : petName(name) {}

    /// Returns a const reference to the stored name.
    const std::string &getName() const {
        return petName;
    }

    /// Overwrites the stored name with a copy of `name_`.
    void setName(const std::string &name_) {
        petName = name_;
    }
};
// Binding code for the Python module `example`.
PYBIND11_MODULE(example, m) {
    // Optional module docstring.
    m.doc() = "pybind11 example plugin";

    // Expose the free function add().
    m.def("add", &add, "A function which adds two numbers");

    // Expose Pet: its constructor plus the name setter and getter.
    py::class_<Pet> pet_binding(m, "Pet");
    pet_binding.def(py::init<const std::string &>());
    pet_binding.def("setName", &Pet::setName);
    pet_binding.def("getName", &Pet::getName);
}
example.py檔案如下:
import example
# Call the bound C++ function.
total = example.add(1, 2)
print(total)  # prints: 3

# Use the bound C++ class.
pet = example.Pet("Milo")
initial_name = pet.getName()
print(initial_name)  # prints: Milo

pet.setName("Otis")
renamed = pet.getName()
print(renamed)  # prints: Otis
pybind11中的gil鎖
GIL(全域性直譯器鎖)是python直譯器的一個底層全域性鎖:在同一時刻,只有一個執行緒可以執行python的程式碼,所以python的多執行緒無法利用多核CPU。
但是透過pybind11可以靈活地擺脫GIL鎖的限制,使用多執行緒。
example_gil.cpp程式碼如下:
#include <pybind11/pybind11.h>
#include <pybind11/functional.h>
#include <chrono>
#include <thread>
#include "Python.h"
#include <iostream>
namespace py = pybind11;
// Simulates a long-running computation by sleeping for five seconds.
//
// no_gil: when true, the GIL is released for the duration of the sleep;
//         when false, the sleep runs without touching the GIL state, so a
//         Python caller keeps holding it for the whole five seconds.
void long_computation(bool no_gil) {
    constexpr std::chrono::seconds kDuration(5);
    if (no_gil) {
        // RAII guard: releases the GIL here and re-acquires it when the guard
        // leaves scope, so no explicit gil_scoped_acquire is needed afterwards.
        py::gil_scoped_release release;
        std::this_thread::sleep_for(kDuration);
    } else {
        std::this_thread::sleep_for(kDuration);
    }
}
// Simulates a compute-heavy task: repeats a throwaway arithmetic loop
// num_iterations times and then sleeps for five seconds.
// thread_id is accepted but not used by the body.
void compute(int thread_id, int num_iterations) {
    int round = 0;
    while (round < num_iterations) {
        // Dummy work; the accumulated value is discarded.
        double result = 0.0;
        int step = 0;
        while (step < 10000) {
            result += step * 0.001;
            ++step;
        }
        ++round;
    }

    const std::chrono::seconds pause(5);
    std::this_thread::sleep_for(pause);
}
// Runs compute() on num_threads C++ threads and waits for all of them.
//
// num_threads:    number of worker threads to start.
// num_iterations: forwarded unchanged to every compute() call.
void parallel_compute(int num_threads, int num_iterations) {
    // The GIL is released for the whole function body; the guard re-acquires
    // it on scope exit.
    py::gil_scoped_release gil_released;

    std::vector<std::thread> workers;
    int next_id = 0;
    while (next_id < num_threads) {
        workers.emplace_back(compute, next_id, num_iterations);
        ++next_id;
    }

    // Block until every worker has finished.
    for (std::thread& worker : workers) {
        worker.join();
    }
}
// Binding code for the Python module `example_gil`.
PYBIND11_MODULE(example_gil, m) {
    m.doc() = "pybind11 example plugin";  // optional module docstring

    // long_computation(no_gil): sleep with or without releasing the GIL.
    m.def("long_computation", &long_computation,
          "A function that performs a long computation", py::arg("no_gil"));

    // parallel_compute(num_threads, num_iterations): run compute() on C++ threads.
    m.def("parallel_compute", &parallel_compute,
          "A function that performs parallel computation",
          py::arg("num_threads"), py::arg("num_iterations"));
}
example_gil.py程式碼如下:
import time
import example_gil
import threading
def run_task(no_gil: bool):
    """Call example_gil.long_computation from three Python threads and print the elapsed wall time.

    Args:
        no_gil: forwarded unchanged to example_gil.long_computation.
    """
    thread_count = 3

    def worker(index):
        # index only identifies the thread; the body does not use it.
        print("開始長耗時任務計算...")
        example_gil.long_computation(no_gil)
        print("開始長耗時任務計算結束.")

    start_time = time.time()

    # Create every thread first, then start them all.
    workers = []
    for index in range(thread_count):
        workers.append(threading.Thread(target=worker, args=(index,)))
    for thread in workers:
        thread.start()

    # Wait for every thread to finish.
    for thread in workers:
        thread.join()

    end_time = time.time()
    print(f"釋放GIL鎖:{no_gil},時間: {end_time - start_time} seconds")
# Run the threaded demo twice: first with the GIL released, then without.
for release_gil in (True, False):
    run_task(no_gil=release_gil)

# Time a single call that fans out to C++ threads inside the extension.
num_threads, num_iterations = 4, 10000
start_time = time.time()
example_gil.parallel_compute(num_threads, num_iterations)
end_time = time.time()
print(f"Parallel computation with {num_threads} threads took {end_time - start_time} seconds")
執行結果如下:
開始長耗時任務計算...
開始長耗時任務計算...
開始長耗時任務計算...
開始長耗時任務計算結束.
開始長耗時任務計算結束.
開始長耗時任務計算結束.
釋放GIL鎖:True,時間: 5.005779027938843 seconds
開始長耗時任務計算...
開始長耗時任務計算...
開始長耗時任務計算結束.
開始長耗時任務計算...
開始長耗時任務計算結束.
開始長耗時任務計算結束.
釋放GIL鎖:False,時間: 15.013175964355469 seconds
Parallel computation with 4 threads took 5.391555070877075 seconds
在C++程式碼中,可以透過pybind11的介面進行GIL鎖的管理,對於以下場景:
- python單執行緒,C++多執行緒,可以利用多核;
- python多執行緒,C++單執行緒,如果不釋放GIL鎖,序列執行,釋放GIL鎖後可以並行;
- python多執行緒,C++多執行緒,同上一條(第2種場景);
python物件如何在C++中使用
example_object.cpp的程式碼如下(注意:以下列出的內容實際上與example_object.py相同,為python程式碼;C++原始碼請參見上文的github倉庫):
import time
import example_object as example
import timeit
# Benchmark the cost of passing a Python list into the extension module.
n = int(1e4)
input_list = list(range(n))

# Time n calls to pass_list. The clock starts after the input is built so that
# setup is not counted.
start_time = time.time()
for _ in range(n):
    squared_list = example.pass_list(input_list)
time_cost = time.time() - start_time
print("run time:{}s".format(time_cost))
print("avg pass list time cost:{}ms".format(time_cost / n*1000))

# Restart the clock so the pass_vec figures do not include the pass_list loop.
start_time = time.time()
for _ in range(n):
    squared_list = example.pass_vec(input_list)
time_cost = time.time() - start_time
print("run time:{}s".format(time_cost))
print("avg pass vec time cost:{}ms".format(time_cost / n*1000))

# Large dictionary used to time passing a dict argument.
large_dict = {f"key{i}": i for i in range(10000000)}


def run():
    """Call example.print_dict(large_dict) one million times and print total and average time."""
    start_time = time.time()
    for _ in range(1000000):
        example.print_dict(large_dict)
    time_cost = time.time() - start_time
    print("run time:{}s".format(time_cost))
    print("avg time cost:{}ms".format(time_cost / 1000000*1000))


run()


# Exercise call_python_function.
def my_python_function():
    # Defined for illustration; the call below passes the built-in `id` instead.
    return "Hello from Python!"


input_dict = dict(a=1, b=2, c=3)
# Pass the built-in `id` as the callable, then print id(input_dict) from Python
# so the two values can be compared.
example.call_python_function(id, input_dict)
print("input_dict:", id(input_dict))
python程式碼example_object.py
如下:
import time
import example_object as example
import timeit
# Benchmark the cost of passing a Python list into the extension module.
n = int(1e4)
input_list = list(range(n))

# Time n calls to pass_list. The clock starts after the input is built so that
# setup is not counted.
start_time = time.time()
for _ in range(n):
    squared_list = example.pass_list(input_list)
time_cost = time.time() - start_time
print("run time:{}s".format(time_cost))
print("avg pass list time cost:{}ms".format(time_cost / n*1000))

# Restart the clock so the pass_vec figures do not include the pass_list loop.
start_time = time.time()
for _ in range(n):
    squared_list = example.pass_vec(input_list)
time_cost = time.time() - start_time
print("run time:{}s".format(time_cost))
print("avg pass vec time cost:{}ms".format(time_cost / n*1000))

# Large dictionary used to time passing a dict argument.
large_dict = {f"key{i}": i for i in range(10000000)}
def run():
    """Call example.print_dict(large_dict) one million times and print total and average time."""
    call_count = 1000000
    began = time.time()
    remaining = call_count
    while remaining > 0:
        example.print_dict(large_dict)
        remaining -= 1
    time_cost = time.time() - began
    print("run time:{}s".format(time_cost))
    print("avg time cost:{}ms".format(time_cost / call_count * 1000))
run()


# Exercise call_python_function.
def my_python_function():
    # Defined for illustration; the call below passes the built-in `id` instead.
    greeting = "Hello from Python!"
    return greeting


input_dict = {"a": 1, "b": 2, "c": 3}
# Pass the built-in `id` as the callable, then print id(input_dict) from Python
# so the two values can be compared.
example.call_python_function(id, input_dict)
print("input_dict:", id(input_dict))
執行結果如下:
run time:0.0034627914428710938s
avg pass list time cost:0.00034627914428710934ms
run time:1.315690040588379s
avg pass vec time cost:0.1315690040588379ms
run time:0.30083274841308594s
avg time cost:0.00030083274841308596ms
Result from Python function: 140240346308040
input_dict: 140240346308040
需要注意:如果資料型別不一致,pybind11會進行型別轉換;當資料量非常大的時候,這個轉換耗時將不可忽略。因此,如果涉及到大量的資料傳遞,最好避免型別轉換。
參考
- 給Python演算法插上效能的翅膀——pybind11落地實踐