Skip to content

Commit c543c84

Browse files
committed
Perftest: Dynamic CUDA linking
This commit refactors the CUDA integration in Perftest by dynamically loading the CUDA library (`libcuda.so`) instead of linking it statically. Changes include: - Introduced `cuda_loader.c` to handle dynamic loading of CUDA functions. - Modified `cuda_memory.c` to use dynamically loaded function pointers instead of direct CUDA API calls. - Ensured proper cleanup of resources by introducing `unload_cuda_library()`. - Find the CUDA header path automatically and set the related defines if it exists. This change increases flexibility, allowing Perftest to be compiled on systems with CUDA and run on systems both with and without CUDA. Signed-off-by: Shmuel Shaul <[email protected]>
1 parent 14ae7a0 commit c543c84

File tree

6 files changed

+255
-44
lines changed

6 files changed

+255
-44
lines changed

Makefile.am

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ libperftest_a_SOURCES = src/get_clock.c src/perftest_communication.c src/perftes
3838
noinst_HEADERS = src/get_clock.h src/perftest_communication.h src/perftest_parameters.h src/perftest_resources.h src/perftest_counters.h src/memory.h src/host_memory.h src/mmap_memory.h src/cuda_memory.h src/rocm_memory.h src/neuron_memory.h src/hl_memory.h src/mlu_memory.h
3939

4040
if CUDA
41-
libperftest_a_SOURCES += src/cuda_memory.c
41+
libperftest_a_SOURCES += src/cuda_memory.c src/cuda_loader.h src/cuda_loader.c
4242
endif
4343

4444
if ROCM

README

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -244,8 +244,14 @@ Special feature detailed explanation in tests:
244244
"-M" flag allows you to choose the multicast group address.
245245

246246
4. GPUDirect usage:
247-
To utilize GPUDirect feature, perftest should be compiled as:
248-
./autogen.sh && ./configure CUDA_H_PATH=<path to cuda.h> && make -j, e.g.:
247+
As of perftest release 25.07 the build system automatically
248+
detects the location of cuda.h. Passing CUDA_H_PATH to the configure
249+
script is therefore no longer required. The variable is still accepted
250+
for backward-compatibility but its usage is not recommended.
251+
The variable will be deprecated in the 25.10 release.
252+
253+
For perftest releases earlier than 25.07 you must still provide the path to
254+
cuda.h explicitly during configuration, for example:
249255
./autogen.sh && ./configure CUDA_H_PATH=/usr/local/cuda/include/cuda.h && make -j
250256

251257
Thus --use_cuda=<gpu_index> flag will be available to add to a command line:
@@ -266,6 +272,9 @@ Special feature detailed explanation in tests:
266272
CUDA Runtime API support:
267273
To use the --gpu_touch option in Perftest, you must build Perftest with support for the CUDA Runtime API (libcudart).
268274
Run the configure script with the following flags:
275+
./configure --enable-cudart
276+
277+
For releases earlier than 25.07:
269278
./configure CUDA_H_PATH=/usr/local/cuda/include/cuda.h --enable-cudart
270279

271280
Note: Ensure that your NVIDIA CUDA Compiler (nvcc) version is compatible with your GCC version. Incompatibility between nvcc and gcc can cause build or runtime issues.

configure.ac

Lines changed: 36 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -312,22 +312,46 @@ if [test $HAVE_EX_ODP = yes] && [test $HAVE_EX = yes]; then
312312
AC_DEFINE([HAVE_EX_ODP], [1], [Have Extended ODP support])
313313
fi
314314

315-
if [test "$CUDA_H_PATH" ]; then
315+
AC_CHECK_LIB([dl], [dlclose],
316+
[AC_MSG_RESULT([libdl found])
317+
LIBS="$LIBS -ldl"],
318+
[AC_MSG_ERROR([libdl not found. You need to install libdl for dynamic loading support])])
319+
320+
321+
# Check for CUDA header file in common location
322+
AC_MSG_CHECKING([for CUDA header file])
323+
cuda_found=no
324+
cuda_h_path=""
325+
326+
if test -f "/usr/local/cuda/include/cuda.h"; then
327+
cuda_h_path="/usr/local/cuda/include/cuda.h"
328+
cuda_found=yes
329+
AC_MSG_RESULT([found at $cuda_h_path])
330+
fi
331+
332+
# User defined cuda header path
333+
if test -f "$CUDA_H_PATH"; then
334+
cuda_h_path="$CUDA_H_PATH"
335+
cuda_found=yes
336+
AC_MSG_RESULT([found at $cuda_h_path])
337+
fi
338+
339+
if test "$cuda_found" = "yes"; then
316340
AC_DEFINE([HAVE_CUDA], [1], [Enable CUDA feature])
317-
AC_DEFINE_UNQUOTED([CUDA_PATH], "$CUDA_H_PATH" , [Enable CUDA feature])
318-
LIBS=$LIBS" -lcuda"
341+
AC_DEFINE_UNQUOTED([CUDA_PATH], "$cuda_h_path" , [Enable CUDA feature])
319342
AC_CHECK_LIB([cuda], [cuMemGetHandleForAddressRange], [HAVE_CUDA_CUMEMGETHANDLEFORADDRESSRANGE=yes], [HAVE_CUDA_CUMEMGETHANDLEFORADDRESSRANGE=no])
320-
343+
cuda_toolkit_version=`grep "define CUDA_VERSION" $cuda_h_path | cut -d' ' -f3`
344+
AC_DEFINE_UNQUOTED([CUDA_VER], [$cuda_toolkit_version], [Define CUDA_VER])
321345
AC_TRY_LINK([
322-
#include <$CUDA_H_PATH>],
346+
#include <$cuda_h_path>],
323347
[int x = CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD|CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED;],
324348
[CUDA_DMA_BUF_PARAMETERS_SUPPORT=yes], [CUDA_DMA_BUF_PARAMETERS_SUPPORT=no])
325349
if [test "x$HAVE_REG_DMABUF_MR" = "xyes"] && [test "x$HAVE_CUDA_CUMEMGETHANDLEFORADDRESSRANGE" = "xyes"] && [test "x$CUDA_DMA_BUF_PARAMETERS_SUPPORT" = "xyes"]; then
326350
HAVE_CUDA_DMABUF=yes
327351
AC_DEFINE([HAVE_CUDA_DMABUF], [1], [Enable CUDA DMABUF feature])
328352
fi
329353
AC_TRY_LINK([
330-
#include <$CUDA_H_PATH>],
354+
#include <$cuda_h_path>],
331355
[int x = CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE;],
332356
[CUDA_DMA_BUF_MAPPING_TYPE_PCIE_SUPPORT=yes], [CUDA_DMA_BUF_MAPPING_TYPE_PCIE_SUPPORT=no])
333357
if [test "x$CUDA_DMA_BUF_MAPPING_TYPE_PCIE_SUPPORT" = "xyes"] && [test "x$HAVE_REG_DMABUF_MR" = "xyes"]; then
@@ -337,17 +361,16 @@ if [test "$CUDA_H_PATH" ]; then
337361
if [test "x$enable_cudart" = "xyes"]; then
338362
AC_DEFINE([HAVE_CUDART], [1], [Enable CUDART features])
339363
LIBS=$LIBS" -lcudart -lstdc++"
340-
CPPFLAGS="-I$(dirname $CUDA_H_PATH)/include $CPPFLAGS"
341-
LDFLAGS="-L$(dirname $(dirname $CUDA_H_PATH))/lib -L$(dirname $(dirname $CUDA_H_PATH))/lib64 $LDFLAGS"
364+
CPPFLAGS="-I$(dirname $cuda_h_path)/include $CPPFLAGS"
365+
LDFLAGS="-L$(dirname $(dirname $cuda_h_path))/lib -L$(dirname $(dirname $cuda_h_path))/lib64 $LDFLAGS"
342366

343367
if [test "x${gpu_arch}" != "x"]; then
344368
NVCCFLAGS="${NVCCFLAGS} -arch compute_${gpu_arch} -code compute_${gpu_arch},sm_${gpu_arch}"
345369
AC_MSG_NOTICE([Setting GPU_ARCH = ${gpu_arch}])
346370
fi
347371
NVCCFLAGS="${NVCCFLAGS} -Xcompiler -fpermissive"
348-
AC_CHECK_DECLS([CUDA_VERSION], [HAVE_CUDA_VERSION=yes], [HAVE_CUDA_VERSION=no], [[#include "$CUDA_H_PATH"]])
372+
AC_CHECK_DECLS([CUDA_VERSION], [HAVE_CUDA_VERSION=yes], [HAVE_CUDA_VERSION=no], [[#include "$cuda_h_path"]])
349373
if [test "x$HAVE_CUDA_VERSION" = "xyes"]; then
350-
cuda_toolkit_version=`grep "define CUDA_VERSION" $CUDA_H_PATH | cut -d' ' -f3`
351374
AS_VERSION_COMPARE([$cuda_toolkit_version], [11070], [HAVE_CUDA_DIAGSUPPRESS=no], [HAVE_CUDA_DIAGSUPPRESS=yes], [HAVE_CUDA_DIAGSUPPRESS=yes])
352375
if [test "x$HAVE_CUDA_DIAGSUPPRESS" = "xyes"]; then
353376
NVCCFLAGS="${NVCCFLAGS} -Xcompiler -fpermissive -diag-suppress 2464 -diag-suppress 815"
@@ -356,7 +379,7 @@ if [test "$CUDA_H_PATH" ]; then
356379
fi
357380
fi
358381

359-
CUDA_TOOLKIT_PATH=$(dirname $(dirname $CUDA_H_PATH))
382+
CUDA_TOOLKIT_PATH=$(dirname $(dirname $cuda_h_path))
360383
AC_DEFINE_UNQUOTED([CUDA_TOOLKIT_PATH], "$CUDA_TOOLKIT_PATH", [Path to the CUDA Toolkit])
361384
AC_SUBST([NVCCFLAGS], ["${NVCCFLAGS}"])
362385
AC_SUBST([NVCC], ["$CUDA_TOOLKIT_PATH/bin/nvcc"])
@@ -367,8 +390,8 @@ fi
367390
AM_CONDITIONAL([CUDA_DMA_BUF_PARAMETERS_SUPPORT],[test "x$CUDA_DMA_BUF_PARAMETERS_SUPPORT" = "xyes"])
368391
AM_CONDITIONAL([CUDA_DMA_BUF_MAPPING_TYPE_PCIE_SUPPORT],[test "x$CUDA_DMA_BUF_MAPPING_TYPE_PCIE_SUPPORT" = "xyes"])
369392

370-
AM_CONDITIONAL([CUDA], [test "$CUDA_H_PATH"])
371-
AM_CONDITIONAL([HAVE_CUDA], [test "$CUDA_H_PATH"])
393+
AM_CONDITIONAL([CUDA], [test "$cuda_h_path"])
394+
AM_CONDITIONAL([HAVE_CUDA], [test "$cuda_h_path"])
372395
AM_CONDITIONAL([HAVE_CUDART], [test "x$enable_cudart" = "xyes"])
373396

374397
AC_ARG_ENABLE([neuron],

src/cuda_loader.c

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
#include "cuda_loader.h"
2+
#include <dlfcn.h>
3+
#include <stdio.h>
4+
5+
static void *cuda_handle = NULL;
6+
7+
/*
 * Definitions of the CUDA Driver API function pointers declared in
 * cuda_loader.h. Every pointer starts out NULL and is filled in by
 * load_cuda_library(): p_cuGetProcAddress is looked up with dlsym(), and all
 * the others are resolved through it by load_cuda_function().
 */
8+
CUresult (*p_cuInit)(unsigned int) = NULL;
9+
CUresult (*p_cuDeviceGetCount)(int *) = NULL;
10+
CUresult (*p_cuDeviceGet)(CUdevice *, int) = NULL;
11+
CUresult (*p_cuDeviceGetAttribute)(int *, CUdevice_attribute, CUdevice) = NULL;
12+
CUresult (*p_cuDeviceGetName)(char *, int, CUdevice) = NULL;
13+
CUresult (*p_cuCtxCreate)(CUcontext *, unsigned int, CUdevice) = NULL;
14+
CUresult (*p_cuDevicePrimaryCtxRetain)(CUcontext *, CUdevice) = NULL;
15+
CUresult (*p_cuCtxSetCurrent)(CUcontext) = NULL;
16+
CUresult (*p_cuCtxDestroy)(CUcontext) = NULL;
17+
CUresult (*p_cuDeviceGetByPCIBusId)(int *, const char *) = NULL;
18+
CUresult (*p_cuMemAllocHost)(void **, size_t) = NULL;
19+
CUresult (*p_cuMemAlloc)(CUdeviceptr *, size_t) = NULL;
20+
CUresult (*p_cuMemFreeHost)(void *) = NULL;
21+
CUresult (*p_cuMemFree)(CUdeviceptr) = NULL;
22+
CUresult (*p_cuMemcpy)(CUdeviceptr, CUdeviceptr, size_t) = NULL;
23+
CUresult (*p_cuMemcpyDtoD)(CUdeviceptr, CUdeviceptr, size_t) = NULL;
24+
#ifdef HAVE_CUDA_DMABUF
25+
/*
 * NOTE(review): the CUDA Driver API reference appears to declare this entry
 * point as (void *handle, CUdeviceptr dptr, size_t size,
 * CUmemRangeHandleType handleType, unsigned long long flags). The second and
 * last parameter types here differ from that - confirm against cuda.h and
 * update this definition together with the extern in cuda_loader.h.
 */
CUresult (*p_cuMemGetHandleForAddressRange)(void *, void *, size_t, CUmemRangeHandleType, unsigned int) = NULL;
26+
#endif
27+
CUresult (*p_cuDriverGetVersion)(int* driverVersion) = NULL;
28+
/* The resolver's prototype depends on the toolkit version recorded in CUDA_VER. */
#if CUDA_VER >= 12000
29+
CUresult (*p_cuGetProcAddress)(const char* symbol, void** pfn, int cudaVersion, uint64_t flags, CUdriverProcAddressQueryResult* symbolStatus) = NULL;
30+
#else
31+
CUresult (*p_cuGetProcAddress)(const char* symbol, void** pfn, int cudaVersion, uint64_t flags) = NULL;
32+
#endif
33+
CUresult (*p_cuMemAllocManaged)(CUdeviceptr* dptr, size_t bytesize, unsigned int flags) = NULL;
34+
CUresult (*p_cuCtxSynchronize) (void) = NULL;
35+
36+
37+
int load_cuda_function(void **func_ptr, const char *func_name, int version) {
38+
#if CUDA_VER >= 12000
39+
CUresult res = p_cuGetProcAddress(func_name, func_ptr, version, 0, NULL);
40+
#else
41+
CUresult res = p_cuGetProcAddress(func_name, func_ptr, version, 0);
42+
#endif
43+
if (res != CUDA_SUCCESS)
44+
{
45+
fprintf(stderr, "load_cuda_function: Failed to get driver entry point '%s' (CUDA error %u)\n", func_name, res);
46+
return -1;
47+
}
48+
49+
return 0;
50+
}
51+
52+
int load_cuda_library(void) {
53+
54+
cuda_handle = dlopen("libcuda.so", RTLD_LAZY);
55+
if (!cuda_handle) {
56+
fprintf(stderr, "dlerror: %s\n", dlerror());
57+
return -1;
58+
}
59+
60+
61+
p_cuGetProcAddress = dlsym(cuda_handle, "cuGetProcAddress");
62+
if (!p_cuGetProcAddress) {
63+
fprintf(stderr, "Failed to resolve cuGetProcAddress: %s\n", dlerror());
64+
unload_cuda_library();
65+
return -1;
66+
}
67+
68+
69+
static const CudaSymbol symbols[] = {
70+
{ (void**)&p_cuInit, "cuInit", CUDA_VER_2_0 },
71+
{ (void**)&p_cuDeviceGetCount, "cuDeviceGetCount", CUDA_VER_2_0 },
72+
{ (void**)&p_cuDeviceGet, "cuDeviceGet", CUDA_VER_2_0 },
73+
{ (void**)&p_cuDeviceGetAttribute, "cuDeviceGetAttribute", CUDA_VER_2_0 },
74+
{ (void**)&p_cuDeviceGetName, "cuDeviceGetName", CUDA_VER_2_0 },
75+
{ (void**)&p_cuCtxCreate, "cuCtxCreate", CUDA_VER_3_2 },
76+
{ (void**)&p_cuDevicePrimaryCtxRetain, "cuDevicePrimaryCtxRetain", CUDA_VER_7_0 },
77+
{ (void**)&p_cuCtxSetCurrent, "cuCtxSetCurrent", CUDA_VER_4_0 },
78+
{ (void**)&p_cuCtxDestroy, "cuCtxDestroy", CUDA_VER_4_0 },
79+
{ (void**)&p_cuDeviceGetByPCIBusId, "cuDeviceGetByPCIBusId", CUDA_VER_11_3 },
80+
{ (void**)&p_cuMemAllocHost, "cuMemAllocHost", CUDA_VER_3_2 },
81+
{ (void**)&p_cuMemAlloc, "cuMemAlloc", CUDA_VER_3_2 },
82+
{ (void**)&p_cuMemFreeHost, "cuMemFreeHost", CUDA_VER_3_2 },
83+
{ (void**)&p_cuMemFree, "cuMemFree", CUDA_VER_3_2 },
84+
{ (void**)&p_cuMemcpy, "cuMemcpy", CUDA_VER_11_3 },
85+
{ (void**)&p_cuMemcpyDtoD, "cuMemcpyDtoD", CUDA_VER_3_2 },
86+
#ifdef HAVE_CUDA_DMABUF
87+
{ (void**)&p_cuMemGetHandleForAddressRange, "cuMemGetHandleForAddressRange", CUDA_VER_11_7 },
88+
#endif
89+
{ (void**)&p_cuDriverGetVersion, "cuDriverGetVersion", CUDA_VER_2_2 },
90+
{ (void**)&p_cuCtxSynchronize, "cuCtxSynchronize", CUDA_VER_11_3 },
91+
{ (void**)&p_cuMemAllocManaged, "cuMemAllocManaged", CUDA_VER_11_3 }
92+
};
93+
94+
for (size_t i = 0; i < sizeof(symbols)/sizeof(symbols[0]); ++i) {
95+
if (load_cuda_function(symbols[i].func_ptr, symbols[i].name, symbols[i].min_version) != 0) {
96+
unload_cuda_library();
97+
return -1;
98+
}
99+
}
100+
101+
return 0;
102+
}
103+
104+
void unload_cuda_library(void) {
105+
if (cuda_handle) {
106+
dlclose(cuda_handle);
107+
cuda_handle = NULL;
108+
}
109+
}

src/cuda_loader.h

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#ifndef CUDA_LOADER_H
2+
#define CUDA_LOADER_H
3+
4+
#include "config.h"
5+
#include CUDA_PATH
6+
7+
#ifdef __cplusplus
8+
extern "C" {
9+
#endif
10+
11+
12+
#define CUDA_VER_2_0 2000 /* CUDA 2.0 */
13+
#define CUDA_VER_2_2 2020 /* CUDA 2.2 */
14+
#define CUDA_VER_3_2 3020 /* CUDA 3.2 */
15+
#define CUDA_VER_4_0 4000 /* CUDA 4.0 */
16+
#define CUDA_VER_7_0 7000 /* CUDA 7.0 */
17+
#define CUDA_VER_11_3 11030 /* CUDA 11.3 */
18+
#define CUDA_VER_11_7 11070 /* CUDA 11.7 */
19+
20+
21+
/*
 * Describes one CUDA driver entry point for the loader: where to store the
 * resolved address, which symbol to look up, and which CUDA version to
 * request for it.
 */
typedef struct {
22+
void **func_ptr; /* location that receives the resolved function address */
23+
const char *name; /* driver API symbol name, e.g. "cuInit" */
24+
int min_version; /* one of the CUDA_VER_* values, passed to cuGetProcAddress as cudaVersion */
25+
} CudaSymbol;
26+
27+
// Function pointers for the CUDA Driver API.
// They are defined in cuda_loader.c, start out NULL, and are filled in by
// load_cuda_library(); callers invoke the driver through these pointers
// rather than through the cu* functions directly.
28+
extern CUresult (*p_cuInit)(unsigned int);
29+
extern CUresult (*p_cuDeviceGetCount)(int *);
30+
extern CUresult (*p_cuDeviceGet)(CUdevice *, int);
31+
extern CUresult (*p_cuDeviceGetAttribute)(int *, CUdevice_attribute, CUdevice);
32+
extern CUresult (*p_cuDeviceGetName)(char *, int, CUdevice);
33+
extern CUresult (*p_cuCtxCreate)(CUcontext *, unsigned int, CUdevice);
34+
extern CUresult (*p_cuDevicePrimaryCtxRetain)(CUcontext *, CUdevice);
35+
extern CUresult (*p_cuCtxSetCurrent)(CUcontext);
36+
extern CUresult (*p_cuCtxDestroy)(CUcontext);
37+
extern CUresult (*p_cuDeviceGetByPCIBusId)(int *, const char *);
38+
extern CUresult (*p_cuMemAllocHost)(void **, size_t);
39+
extern CUresult (*p_cuMemAlloc)(CUdeviceptr *, size_t);
40+
extern CUresult (*p_cuMemFreeHost)(void *);
41+
extern CUresult (*p_cuMemFree)(CUdeviceptr);
42+
extern CUresult (*p_cuMemcpy)(CUdeviceptr, CUdeviceptr, size_t);
43+
extern CUresult (*p_cuMemcpyDtoD)(CUdeviceptr, CUdeviceptr, size_t);
44+
// Only available when configure enabled CUDA DMA-BUF support.
// NOTE(review): the driver API reference appears to use CUdeviceptr for the
// second parameter and unsigned long long for the flags - confirm against
// cuda.h and keep this in sync with the definition in cuda_loader.c.
#ifdef HAVE_CUDA_DMABUF
45+
extern CUresult (*p_cuMemGetHandleForAddressRange)(void *, void *, size_t, CUmemRangeHandleType, unsigned int);
46+
#endif
47+
extern CUresult (*p_cuDriverGetVersion)(int* driverVersion);
48+
extern CUresult (*p_cuCtxSynchronize) (void);
49+
extern CUresult (*p_cuMemAllocManaged)(CUdeviceptr* dptr, size_t bytesize, unsigned int flags);
50+
// The resolver's prototype depends on the toolkit version recorded in CUDA_VER.
#if CUDA_VER >= 12000
51+
extern CUresult (*p_cuGetProcAddress)(const char* symbol, void** pfn, int cudaVersion, uint64_t flags, CUdriverProcAddressQueryResult* symbolStatus);
52+
#else
53+
extern CUresult (*p_cuGetProcAddress)(const char* symbol, void** pfn, int cudaVersion, uint64_t flags);
54+
#endif
55+
56+
// Loader functions
57+
// Opens the CUDA driver library with dlopen() and resolves all of the
// pointers above. Returns 0 on success and -1 on failure.
int load_cuda_library(void);
58+
// Closes the library handle opened by load_cuda_library(), if any.
void unload_cuda_library(void);
59+
60+
#ifdef __cplusplus
61+
}
62+
#endif
63+
64+
#endif // CUDA_LOADER_H

0 commit comments

Comments
 (0)