0%

使用Pybind11调用OpenMP并行化加速函数计算

Visual Studio 2019 下配置 OpenMP

Visual Studio 2019 已经内置支持 OpenMP

对如下的计算函数进行并行化加速的时候

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#include "pymatrix.h"

#include <omp.h>

#include <iostream>
#include <stdexcept>

// Compute implied volatility, delta and vega for an array of options with the
// Bjerksund-Stensland (2002) model, parallelizing over rows with OpenMP.
//
// All array arguments are element-wise inputs and must have the same length:
//   is_call  - true for a call option, false for a put
//   premium  - observed option price used to back out the IV
//   S, X, T  - spot, strike, time to maturity
//   b, r     - cost of carry, risk-free rate
// Scalar arguments are forwarded unchanged to the underlying solver:
//   iv_lower_limit / iv_upper_limit - IV search bracket
//   ttm_clip, vega_clip             - numerical clips inside the solver
//   num_steps, precision            - iteration budget and tolerance
//   use_newton                      - Newton vs. bracketing root search
//
// Returns an (n, 3) array whose columns are [iv, delta, vega].
// Throws std::runtime_error when the input arrays have mismatched lengths.
py::array_t<double> calc_iv_array(
        const py::array_t<bool> &is_call,
        const py::array_t<double> &premium,
        const py::array_t<double> &S,
        const py::array_t<double> &X,
        const py::array_t<double> &T,
        const py::array_t<double> &b,
        const py::array_t<double> &r,
        const double &iv_lower_limit,
        const double &iv_upper_limit,
        const double &ttm_clip,
        const double &vega_clip,
        const int &num_steps,
        const double &precision,
        const bool &use_newton
) {
    py::buffer_info buf_is_call = is_call.request();
    py::buffer_info buf_premium = premium.request();
    py::buffer_info buf_spot = S.request();
    py::buffer_info buf_strike = X.request();
    py::buffer_info buf_time = T.request();
    py::buffer_info buf_carry_cost = b.request();
    py::buffer_info buf_risk_free = r.request();

    // Every per-option array must match the spot array's length; a silent
    // mismatch would read out of bounds inside the parallel loop below.
    const ssize_t n = buf_spot.size;
    if (buf_is_call.size != n || buf_premium.size != n || buf_strike.size != n ||
        buf_time.size != n || buf_carry_cost.size != n || buf_risk_free.size != n) {
        throw std::runtime_error("calc_iv_array: all input arrays must have the same length");
    }

    std::vector<ssize_t> shape = {n, 3};
    auto result = py::array_t<double>(n * 3);
    result.resize({shape[0], shape[1]});
    py::buffer_info buffer_result = result.request();

    bool *ptr_is_call = static_cast<bool *>(buf_is_call.ptr);
    auto *ptr_premium = static_cast<double *>(buf_premium.ptr);
    auto *ptr_spot = static_cast<double *>(buf_spot.ptr);
    auto *ptr_strike = static_cast<double *>(buf_strike.ptr);
    auto *ptr_time = static_cast<double *>(buf_time.ptr);
    auto *ptr_carry_cost = static_cast<double *>(buf_carry_cost.ptr);
    auto *ptr_risk_free = static_cast<double *>(buf_risk_free.ptr);
    auto *ptr_result = static_cast<double *>(buffer_result.ptr);

    // MSVC only implements OpenMP 2.0, which requires a signed integral loop
    // index, so the bound is hoisted out of the condition and cast to int.
    const int rows = static_cast<int>(n);
#pragma omp parallel for
    for (int idx = 0; idx < rows; idx++) {
        // Solve for IV first; delta and vega are then evaluated at that sigma.
        const double sigma = flow::model::bjerk02::calc_iv(ptr_is_call[idx],
                                                           ptr_premium[idx],
                                                           ptr_spot[idx],
                                                           ptr_strike[idx],
                                                           ptr_time[idx],
                                                           ptr_carry_cost[idx],
                                                           ptr_risk_free[idx],
                                                           iv_lower_limit,
                                                           iv_upper_limit,
                                                           ttm_clip,
                                                           vega_clip,
                                                           num_steps,
                                                           precision,
                                                           use_newton);
        ptr_result[idx * shape[1] + 0] = sigma;
        ptr_result[idx * shape[1] + 1] = flow::model::bjerk02::calc_delta(ptr_is_call[idx],
                                                                          ptr_spot[idx],
                                                                          ptr_strike[idx],
                                                                          ptr_time[idx],
                                                                          ptr_carry_cost[idx],
                                                                          ptr_risk_free[idx],
                                                                          sigma);
        ptr_result[idx * shape[1] + 2] = flow::model::bjerk02::calc_vega(ptr_is_call[idx],
                                                                         ptr_spot[idx],
                                                                         ptr_strike[idx],
                                                                         ptr_time[idx],
                                                                         ptr_carry_cost[idx],
                                                                         ptr_risk_free[idx],
                                                                         sigma);
    }

    return result;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include "bjerk2002.h"
#include "calculate.h"
#include "pymatrix.h"

namespace py = pybind11;


// Python module definition: exposes calc_iv_array as pybjerk.calc_iv_array.
// The seven array arguments are required keyword-capable positionals; the
// solver tuning parameters all carry defaults so Python callers can simply
// pass the option data and accept the standard solver configuration.
PYBIND11_MODULE(pybjerk, m) {
m.def("calc_iv_array", &calc_iv_array, "get bsm iv delta and vega", py::arg("is_call"),
py::arg("premium"), py::arg("S"), py::arg("X"), py::arg("T"), py::arg("b"), py::arg("r"),
py::arg("iv_lower_limit") = 1e-8, py::arg("iv_upper_limit") = 10, py::arg("ttm_clip") = 1e-6,
py::arg("vega_clip") = 1e-6, py::arg("num_steps") = 128, py::arg("precision") = 1e-8,
py::arg("use_newton") = false);
}

只需要在项目配置中开启该选项即可,不需要额外安装 OpenMP 库

image-20210609085353624

在MacOS系统Clion下配置 OpenMP

目前 Clang 已经支持 OpenMP,但是 Apple Clang 没有自带 omp 运行库,需要自行安装

1
brew install libomp

安装libomp之后使用cmake配置一下即可

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
cmake_minimum_required(VERSION 3.16)
project(pybjerk)

# ---------------------------------------------------------------------------------------
# Set default build to release
# ---------------------------------------------------------------------------------------
if (NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose Release or Debug" FORCE)
endif ()

set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# The imported target OpenMP::OpenMP_CXX (linked below) carries the correct
# compile flags and the libomp link line on every platform, so no manual
# splicing of OpenMP_*_FLAGS into CMAKE_*_FLAGS is needed.
find_package(OpenMP REQUIRED)

# pybind11 is vendored under deps/; add_subdirectory exports the pybind11
# targets (which already provide their own include paths).
include_directories(deps/pybind11/include)
add_subdirectory(deps/pybind11)

# NOTE(review): aux_source_directory does not detect newly added files at
# configure time — listing sources explicitly is the recommended practice.
aux_source_directory(. SOURCE)

pybind11_add_module(${PROJECT_NAME} SHARED ${SOURCE})
target_link_libraries(${PROJECT_NAME} PUBLIC OpenMP::OpenMP_CXX)

OpenMP加速效果对比

开启多线程并行化之后,计算函数在多核CPU运行速度提升明显

Windows下加速效果对比 Processor: Intel(R) Core(TM) i9-10900 CPU @ 2.80GHz (20 CPUs), ~2.8GHz test

MacOS下加速效果对比 Processor: 2.6 GHz 6-Core Intel Core i7 test