Perftest: Dynamic CUDA linking

sshaulnv · sshaulnv · commit c543c8491294 · 2025-08-05T21:45:06.000+03:00
This commit refactors the CUDA integration in Perftest to load the CUDA
driver library (`libcuda.so`) at run time with dlopen() instead of
linking against it at build time.

Changes include:
- Introduced `cuda_loader.c` to handle dynamic loading of CUDA
functions.
- Modified `cuda_memory.c` to use dynamically loaded function pointers
  instead of direct CUDA API calls.
- Ensured proper cleanup of resources by introducing
`unload_cuda_library()`.
- Detect the CUDA header path automatically and set the related defines if it exists.

This change increases flexibility: Perftest can be compiled on a system
with CUDA and then run on systems both with and without CUDA.

Signed-off-by: Shmuel Shaul <sshaul@nvidia.com>
diff --git a/Makefile.am b/Makefile.am
@@ -38,7 +38,7 @@ libperftest_a_SOURCES = src/get_clock.c src/perftest_communication.c src/perftes
 noinst_HEADERS = src/get_clock.h src/perftest_communication.h src/perftest_parameters.h src/perftest_resources.h src/perftest_counters.h src/memory.h src/host_memory.h src/mmap_memory.h src/cuda_memory.h src/rocm_memory.h src/neuron_memory.h src/hl_memory.h src/mlu_memory.h
 
 if CUDA
-libperftest_a_SOURCES += src/cuda_memory.c
+libperftest_a_SOURCES += src/cuda_memory.c src/cuda_loader.h src/cuda_loader.c
 endif
 
 if ROCM
diff --git a/README b/README
@@ -244,8 +244,14 @@ Special feature detailed explanation in tests:
      "-M" flag allows you to choose the multicast group address.
 
   4. GPUDirect usage:
-     To utilize GPUDirect feature, perftest should be compiled as:
-     ./autogen.sh && ./configure CUDA_H_PATH=<path to cuda.h> && make -j, e.g.:
+     As of perftest release 25.07 the build system automatically
+     detects the location of cuda.h. Passing CUDA_H_PATH to the configure
+     script is therefore no longer required. The variable is still accepted
+     for backward-compatibility but its usage is not recommended.
+     The variable will be deprecated in the 25.10 release.
+
+     For perftest releases earlier than 25.07 you must still provide the path to
+     cuda.h explicitly during configuration, for example:
      ./autogen.sh && ./configure CUDA_H_PATH=/usr/local/cuda/include/cuda.h && make -j
 
      Thus --use_cuda=<gpu_index> flag will be available to add to a command line:
@@ -266,6 +272,9 @@ Special feature detailed explanation in tests:
     CUDA Runtime API support:
       To use the --gpu_touch option in Perftest, you must build Perftest with support for the CUDA Runtime API (libcudart).
       Run the configure script with the following flags:
+      ./configure --enable-cudart
+
+      For releases earlier than 25.07:
       ./configure CUDA_H_PATH=/usr/local/cuda/include/cuda.h --enable-cudart
 
       Note: Ensure that your NVIDIA CUDA Compiler (nvcc) version is compatible with your GCC version. Incompatibility between nvcc and gcc can cause build or runtime issues.
diff --git a/configure.ac b/configure.ac
@@ -312,22 +312,46 @@ if [test $HAVE_EX_ODP = yes] && [test $HAVE_EX = yes]; then
         AC_DEFINE([HAVE_EX_ODP], [1], [Have Extended  ODP support])
 fi
 
-if [test "$CUDA_H_PATH" ]; then
+# The CUDA loader needs dlopen()/dlsym()/dlclose(). AC_SEARCH_LIBS first
+# tries to link dlclose() without any extra library (on some platforms the
+# dl* functions are part of libc) and only adds -ldl to LIBS when it is
+# actually required, so configure no longer fails where libdl does not exist
+# as a separate library.
+AC_SEARCH_LIBS([dlclose], [dl], [],
+    [AC_MSG_ERROR([dlclose() not found. You need libdl (or a libc providing it) for dynamic loading support])])
+
+
+# Locate cuda.h. A user supplied CUDA_H_PATH takes precedence over the
+# default toolkit location. Sets:
+#   cuda_h_path - full path of the header that was selected (empty if none)
+#   cuda_found  - "yes" when a header was found, "no" otherwise
+# Exactly one AC_MSG_RESULT is emitted so the "checking for ..." line is
+# always terminated, also when no header is found.
+AC_MSG_CHECKING([for CUDA header file])
+cuda_found=no
+cuda_h_path=""
+
+if test -n "$CUDA_H_PATH" && test -f "$CUDA_H_PATH"; then
+	# User defined cuda header path
+	cuda_h_path="$CUDA_H_PATH"
+elif test -f "/usr/local/cuda/include/cuda.h"; then
+	# Common default location
+	cuda_h_path="/usr/local/cuda/include/cuda.h"
+fi
+
+if test -n "$cuda_h_path"; then
+	cuda_found=yes
+	AC_MSG_RESULT([found at $cuda_h_path])
+else
+	AC_MSG_RESULT([not found])
+fi
+
+# Do not silently ignore a CUDA_H_PATH that does not point at a file.
+if test -n "$CUDA_H_PATH" && test ! -f "$CUDA_H_PATH"; then
+	AC_MSG_WARN([CUDA_H_PATH=$CUDA_H_PATH is not a regular file and was ignored])
+fi
+
+if test "$cuda_found" = "yes"; then
 	AC_DEFINE([HAVE_CUDA], [1], [Enable CUDA feature])
-	AC_DEFINE_UNQUOTED([CUDA_PATH], "$CUDA_H_PATH" , [Enable CUDA feature])
-	LIBS=$LIBS" -lcuda"
+	AC_DEFINE_UNQUOTED([CUDA_PATH], "$cuda_h_path" , [Enable CUDA feature])
 	AC_CHECK_LIB([cuda], [cuMemGetHandleForAddressRange], [HAVE_CUDA_CUMEMGETHANDLEFORADDRESSRANGE=yes], [HAVE_CUDA_CUMEMGETHANDLEFORADDRESSRANGE=no])
-
+	cuda_toolkit_version=`grep "define CUDA_VERSION" $cuda_h_path | cut -d' ' -f3`
+	AC_DEFINE_UNQUOTED([CUDA_VER], [$cuda_toolkit_version], [Define CUDA_VER])
 	AC_TRY_LINK([
-	#include <$CUDA_H_PATH>],
+	#include <$cuda_h_path>],
 	[int x = CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD|CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED;],
 	[CUDA_DMA_BUF_PARAMETERS_SUPPORT=yes], [CUDA_DMA_BUF_PARAMETERS_SUPPORT=no])
 	if [test "x$HAVE_REG_DMABUF_MR" = "xyes"] && [test "x$HAVE_CUDA_CUMEMGETHANDLEFORADDRESSRANGE" = "xyes"] && [test "x$CUDA_DMA_BUF_PARAMETERS_SUPPORT" = "xyes"]; then
 		HAVE_CUDA_DMABUF=yes
 		AC_DEFINE([HAVE_CUDA_DMABUF], [1], [Enable CUDA DMABUF feature])
 	fi
 	AC_TRY_LINK([
-	#include <$CUDA_H_PATH>],
+	#include <$cuda_h_path>],
 	[int x = CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE;],
 	[CUDA_DMA_BUF_MAPPING_TYPE_PCIE_SUPPORT=yes], [CUDA_DMA_BUF_MAPPING_TYPE_PCIE_SUPPORT=no])
 	if [test "x$CUDA_DMA_BUF_MAPPING_TYPE_PCIE_SUPPORT" = "xyes"] && [test "x$HAVE_REG_DMABUF_MR" = "xyes"]; then
@@ -337,17 +361,16 @@ if [test "$CUDA_H_PATH" ]; then
 	if [test "x$enable_cudart" = "xyes"]; then
 		AC_DEFINE([HAVE_CUDART], [1], [Enable CUDART features])
 		LIBS=$LIBS" -lcudart -lstdc++"
-		CPPFLAGS="-I$(dirname $CUDA_H_PATH)/include $CPPFLAGS"
-		LDFLAGS="-L$(dirname $(dirname $CUDA_H_PATH))/lib -L$(dirname $(dirname $CUDA_H_PATH))/lib64 $LDFLAGS"
+		CPPFLAGS="-I$(dirname $cuda_h_path)/include $CPPFLAGS"
+		LDFLAGS="-L$(dirname $(dirname $cuda_h_path))/lib -L$(dirname $(dirname $cuda_h_path))/lib64 $LDFLAGS"
 
 		if [test "x${gpu_arch}" != "x"]; then
 			NVCCFLAGS="${NVCCFLAGS} -arch compute_${gpu_arch} -code compute_${gpu_arch},sm_${gpu_arch}"
 			AC_MSG_NOTICE([Setting GPU_ARCH = ${gpu_arch}])
 		fi
 		NVCCFLAGS="${NVCCFLAGS} -Xcompiler -fpermissive"
-		AC_CHECK_DECLS([CUDA_VERSION], [HAVE_CUDA_VERSION=yes], [HAVE_CUDA_VERSION=no], [[#include "$CUDA_H_PATH"]])
+		AC_CHECK_DECLS([CUDA_VERSION], [HAVE_CUDA_VERSION=yes], [HAVE_CUDA_VERSION=no], [[#include "$cuda_h_path"]])
 		if [test "x$HAVE_CUDA_VERSION" = "xyes"]; then
-			cuda_toolkit_version=`grep "define CUDA_VERSION" $CUDA_H_PATH | cut -d' ' -f3`
 			AS_VERSION_COMPARE([$cuda_toolkit_version], [11070], [HAVE_CUDA_DIAGSUPPRESS=no], [HAVE_CUDA_DIAGSUPPRESS=yes], [HAVE_CUDA_DIAGSUPPRESS=yes])
 			if [test "x$HAVE_CUDA_DIAGSUPPRESS" = "xyes"]; then
 				NVCCFLAGS="${NVCCFLAGS} -Xcompiler -fpermissive -diag-suppress 2464 -diag-suppress 815"
@@ -356,7 +379,7 @@ if [test "$CUDA_H_PATH" ]; then
 			fi
 		fi
 
-		CUDA_TOOLKIT_PATH=$(dirname $(dirname $CUDA_H_PATH))
+		CUDA_TOOLKIT_PATH=$(dirname $(dirname $cuda_h_path))
 		AC_DEFINE_UNQUOTED([CUDA_TOOLKIT_PATH], "$CUDA_TOOLKIT_PATH", [Path to the CUDA Toolkit])
 		AC_SUBST([NVCCFLAGS], ["${NVCCFLAGS}"])
 		AC_SUBST([NVCC], ["$CUDA_TOOLKIT_PATH/bin/nvcc"])
@@ -367,8 +390,8 @@ fi
 AM_CONDITIONAL([CUDA_DMA_BUF_PARAMETERS_SUPPORT],[test "x$CUDA_DMA_BUF_PARAMETERS_SUPPORT" = "xyes"])
 AM_CONDITIONAL([CUDA_DMA_BUF_MAPPING_TYPE_PCIE_SUPPORT],[test "x$CUDA_DMA_BUF_MAPPING_TYPE_PCIE_SUPPORT" = "xyes"])
 
-AM_CONDITIONAL([CUDA], [test "$CUDA_H_PATH"])
-AM_CONDITIONAL([HAVE_CUDA], [test "$CUDA_H_PATH"])
+AM_CONDITIONAL([CUDA], [test "$cuda_h_path"])
+AM_CONDITIONAL([HAVE_CUDA], [test "$cuda_h_path"])
 AM_CONDITIONAL([HAVE_CUDART], [test "x$enable_cudart" = "xyes"])
 
 AC_ARG_ENABLE([neuron],
diff --git a/src/cuda_loader.c b/src/cuda_loader.c
@@ -0,0 +1,109 @@
+#include "cuda_loader.h"
+#include <dlfcn.h>
+#include <stdio.h>
+
+static void *cuda_handle = NULL; /* dlopen() handle of the CUDA library; NULL while it is not loaded */
+
+/* Define the function pointers.
+ * Every p_<name> pointer starts out as NULL and is filled in by
+ * load_cuda_library(). The matching extern declarations live in
+ * cuda_loader.h.
+ */
+CUresult (*p_cuInit)(unsigned int) = NULL;
+CUresult (*p_cuDeviceGetCount)(int *) = NULL;
+CUresult (*p_cuDeviceGet)(CUdevice *, int) = NULL;
+CUresult (*p_cuDeviceGetAttribute)(int *, CUdevice_attribute, CUdevice) = NULL;
+CUresult (*p_cuDeviceGetName)(char *, int, CUdevice) = NULL;
+CUresult (*p_cuCtxCreate)(CUcontext *, unsigned int, CUdevice) = NULL;
+CUresult (*p_cuDevicePrimaryCtxRetain)(CUcontext *, CUdevice) = NULL;
+CUresult (*p_cuCtxSetCurrent)(CUcontext) = NULL;
+CUresult (*p_cuCtxDestroy)(CUcontext) = NULL;
+CUresult (*p_cuDeviceGetByPCIBusId)(int *, const char *) = NULL;
+CUresult (*p_cuMemAllocHost)(void **, size_t) = NULL;
+CUresult (*p_cuMemAlloc)(CUdeviceptr *, size_t) = NULL;
+CUresult (*p_cuMemFreeHost)(void *) = NULL;
+CUresult (*p_cuMemFree)(CUdeviceptr) = NULL;
+CUresult (*p_cuMemcpy)(CUdeviceptr, CUdeviceptr, size_t) = NULL;
+CUresult (*p_cuMemcpyDtoD)(CUdeviceptr, CUdeviceptr, size_t) = NULL;
+#ifdef HAVE_CUDA_DMABUF
+CUresult (*p_cuMemGetHandleForAddressRange)(void *, void *, size_t, CUmemRangeHandleType, unsigned int) = NULL;
+#endif
+CUresult (*p_cuDriverGetVersion)(int* driverVersion) = NULL;
+#if CUDA_VER >= 12000 /* CUDA_VER is set by configure from cuda.h; this variant has an extra symbolStatus out-parameter */
+CUresult (*p_cuGetProcAddress)(const char* symbol, void** pfn, int  cudaVersion, uint64_t flags, CUdriverProcAddressQueryResult* symbolStatus) = NULL;
+#else
+CUresult (*p_cuGetProcAddress)(const char* symbol, void** pfn, int  cudaVersion, uint64_t flags) = NULL;
+#endif
+CUresult (*p_cuMemAllocManaged)(CUdeviceptr* dptr, size_t bytesize, unsigned int  flags) = NULL;
+CUresult (*p_cuCtxSynchronize) (void) = NULL;
+
+
+/**
+ * Resolve a single CUDA driver entry point through cuGetProcAddress.
+ *
+ * @param func_ptr   Location that receives the resolved function address.
+ *                   It is reset to NULL before the lookup.
+ * @param func_name  Driver API symbol name, e.g. "cuInit".
+ * @param version    Value passed as the cudaVersion argument of
+ *                   cuGetProcAddress.
+ *
+ * @return 0 on success, -1 if the arguments are invalid, the lookup failed
+ *         or the driver did not hand back an address.
+ *
+ * p_cuGetProcAddress must have been resolved first (load_cuda_library()
+ * does this before calling this function).
+ */
+int load_cuda_function(void **func_ptr, const char *func_name, int version) {
+    CUresult res;
+
+    /* Calling through a NULL p_cuGetProcAddress would crash; fail cleanly instead. */
+    if (!p_cuGetProcAddress || !func_ptr || !func_name) {
+        fprintf(stderr, "load_cuda_function: CUDA library not loaded or invalid arguments\n");
+        return -1;
+    }
+
+    *func_ptr = NULL;
+#if CUDA_VER >= 12000
+    res = p_cuGetProcAddress(func_name, func_ptr, version, 0, NULL);
+#else
+    res = p_cuGetProcAddress(func_name, func_ptr, version, 0);
+#endif
+    if (res != CUDA_SUCCESS) {
+        fprintf(stderr, "load_cuda_function: Failed to get driver entry point '%s' (CUDA error %u)\n", func_name, (unsigned int)res);
+        return -1;
+    }
+
+    /*
+     * cuGetProcAddress may report CUDA_SUCCESS and still leave the pointer
+     * NULL when the symbol is unknown or not available for the requested
+     * version. Treat that as a failure so callers never invoke a NULL
+     * function pointer.
+     */
+    if (*func_ptr == NULL) {
+        fprintf(stderr, "load_cuda_function: driver entry point '%s' is not available for CUDA version %d\n", func_name, version);
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * Open the CUDA driver library and resolve every driver entry point used by
+ * perftest into the p_cu* function pointers.
+ *
+ * cuGetProcAddress itself is looked up with dlsym(); all other symbols are
+ * then resolved through it by load_cuda_function().
+ *
+ * @return 0 on success. On failure an error is printed to stderr, the
+ *         library is closed again and -1 is returned.
+ */
+int load_cuda_library(void) {
+    /*
+     * Try the unversioned name first (the historical behaviour), then fall
+     * back to the SONAME. The unversioned libcuda.so link may be missing on
+     * driver-only installations where just libcuda.so.1 is present.
+     */
+    static const char *const lib_names[] = { "libcuda.so", "libcuda.so.1" };
+    size_t n;
+
+    /* If the library is already open, reuse the handle instead of opening it again. */
+    for (n = 0; !cuda_handle && n < sizeof(lib_names)/sizeof(lib_names[0]); ++n)
+        cuda_handle = dlopen(lib_names[n], RTLD_LAZY);
+
+    if (!cuda_handle) {
+        fprintf(stderr, "dlerror: %s\n", dlerror());
+        return -1;
+    }
+
+    /* Store through void ** : converting dlsym()'s void * result to a function pointer directly is not ISO C. */
+    *(void **)(&p_cuGetProcAddress) = dlsym(cuda_handle, "cuGetProcAddress");
+    if (!p_cuGetProcAddress) {
+        fprintf(stderr, "Failed to resolve cuGetProcAddress: %s\n", dlerror());
+        unload_cuda_library();
+        return -1;
+    }
+
+    /* Each entry: target pointer, driver symbol name, CUDA version requested for it. */
+    static const CudaSymbol symbols[] = {
+        { (void**)&p_cuInit,                      "cuInit",                       CUDA_VER_2_0  },
+        { (void**)&p_cuDeviceGetCount,            "cuDeviceGetCount",             CUDA_VER_2_0  },
+        { (void**)&p_cuDeviceGet,                 "cuDeviceGet",                  CUDA_VER_2_0  },
+        { (void**)&p_cuDeviceGetAttribute,        "cuDeviceGetAttribute",         CUDA_VER_2_0  },
+        { (void**)&p_cuDeviceGetName,             "cuDeviceGetName",              CUDA_VER_2_0  },
+        { (void**)&p_cuCtxCreate,                 "cuCtxCreate",                  CUDA_VER_3_2  },
+        { (void**)&p_cuDevicePrimaryCtxRetain,    "cuDevicePrimaryCtxRetain",     CUDA_VER_7_0  },
+        { (void**)&p_cuCtxSetCurrent,             "cuCtxSetCurrent",              CUDA_VER_4_0  },
+        { (void**)&p_cuCtxDestroy,                "cuCtxDestroy",                 CUDA_VER_4_0  },
+        { (void**)&p_cuDeviceGetByPCIBusId,       "cuDeviceGetByPCIBusId",        CUDA_VER_11_3 },
+        { (void**)&p_cuMemAllocHost,              "cuMemAllocHost",               CUDA_VER_3_2  },
+        { (void**)&p_cuMemAlloc,                  "cuMemAlloc",                   CUDA_VER_3_2  },
+        { (void**)&p_cuMemFreeHost,               "cuMemFreeHost",                CUDA_VER_3_2  },
+        { (void**)&p_cuMemFree,                   "cuMemFree",                    CUDA_VER_3_2  },
+        { (void**)&p_cuMemcpy,                    "cuMemcpy",                     CUDA_VER_11_3 },
+        { (void**)&p_cuMemcpyDtoD,                "cuMemcpyDtoD",                 CUDA_VER_3_2  },
+#ifdef HAVE_CUDA_DMABUF
+        { (void**)&p_cuMemGetHandleForAddressRange, "cuMemGetHandleForAddressRange", CUDA_VER_11_7 },
+#endif
+        { (void**)&p_cuDriverGetVersion,          "cuDriverGetVersion",           CUDA_VER_2_2  },
+        { (void**)&p_cuCtxSynchronize,            "cuCtxSynchronize",             CUDA_VER_11_3 },
+        { (void**)&p_cuMemAllocManaged,           "cuMemAllocManaged",            CUDA_VER_11_3 }
+    };
+
+    for (n = 0; n < sizeof(symbols)/sizeof(symbols[0]); ++n) {
+        if (load_cuda_function(symbols[n].func_ptr, symbols[n].name, symbols[n].min_version) != 0) {
+            unload_cuda_library();
+            /* The library is closed now; do not keep a pointer into it around. */
+            p_cuGetProcAddress = NULL;
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+/* Close the CUDA library handle, if one is open, and forget it. */
+void unload_cuda_library(void) {
+    if (cuda_handle == NULL)
+        return;
+
+    dlclose(cuda_handle);
+    cuda_handle = NULL;
+}
diff --git a/src/cuda_loader.h b/src/cuda_loader.h
@@ -0,0 +1,64 @@
+#ifndef CUDA_LOADER_H
+#define CUDA_LOADER_H
+
+#include "config.h"
+#include CUDA_PATH
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define CUDA_VER_2_0   2000 /* CUDA 2.0 (values are encoded as 1000 * major + 10 * minor) */
+#define CUDA_VER_2_2   2020 /* CUDA 2.2  */
+#define CUDA_VER_3_2   3020 /* CUDA 3.2  */
+#define CUDA_VER_4_0   4000 /* CUDA 4.0  */
+#define CUDA_VER_7_0   7000 /* CUDA 7.0  */
+#define CUDA_VER_11_3  11030 /* CUDA 11.3 */
+#define CUDA_VER_11_7  11070 /* CUDA 11.7 */
+
+
+typedef struct { /* one row of the symbol table walked by load_cuda_library() */
+    void **func_ptr; /* address of the p_cu* pointer that receives the resolved entry point */
+    const char *name; /* driver API symbol name to look up */
+    int  min_version; /* CUDA_VER_* value passed to cuGetProcAddress as the requested version */
+} CudaSymbol;
+
+/* Function pointers for CUDA Driver API.
+ * They are defined in cuda_loader.c, start out as NULL and are resolved by
+ * load_cuda_library().
+ */
+extern CUresult (*p_cuInit)(unsigned int);
+extern CUresult (*p_cuDeviceGetCount)(int *);
+extern CUresult (*p_cuDeviceGet)(CUdevice *, int);
+extern CUresult (*p_cuDeviceGetAttribute)(int *, CUdevice_attribute, CUdevice);
+extern CUresult (*p_cuDeviceGetName)(char *, int, CUdevice);
+extern CUresult (*p_cuCtxCreate)(CUcontext *, unsigned int, CUdevice);
+extern CUresult (*p_cuDevicePrimaryCtxRetain)(CUcontext *, CUdevice);
+extern CUresult (*p_cuCtxSetCurrent)(CUcontext);
+extern CUresult (*p_cuCtxDestroy)(CUcontext);
+extern CUresult (*p_cuDeviceGetByPCIBusId)(int *, const char *);
+extern CUresult (*p_cuMemAllocHost)(void **, size_t);
+extern CUresult (*p_cuMemAlloc)(CUdeviceptr *, size_t);
+extern CUresult (*p_cuMemFreeHost)(void *);
+extern CUresult (*p_cuMemFree)(CUdeviceptr);
+extern CUresult (*p_cuMemcpy)(CUdeviceptr, CUdeviceptr, size_t);
+extern CUresult (*p_cuMemcpyDtoD)(CUdeviceptr, CUdeviceptr, size_t);
+#ifdef HAVE_CUDA_DMABUF
+extern CUresult (*p_cuMemGetHandleForAddressRange)(void *, void *, size_t, CUmemRangeHandleType, unsigned int);
+#endif
+extern CUresult (*p_cuDriverGetVersion)(int* driverVersion);
+extern CUresult (*p_cuCtxSynchronize) (void);
+extern CUresult (*p_cuMemAllocManaged)(CUdeviceptr* dptr, size_t bytesize, unsigned int  flags);
+#if CUDA_VER >= 12000 /* this variant carries an extra symbolStatus out-parameter */
+extern CUresult (*p_cuGetProcAddress)(const char* symbol, void** pfn, int  cudaVersion, uint64_t flags, CUdriverProcAddressQueryResult* symbolStatus);
+#else
+extern CUresult (*p_cuGetProcAddress)(const char* symbol, void** pfn, int  cudaVersion, uint64_t flags);
+#endif
+
+/* Loader functions.
+ * load_cuda_library() opens the CUDA driver library and resolves the p_cu*
+ * pointers; it returns 0 on success and -1 on failure.
+ * unload_cuda_library() closes the library handle if one is open.
+ */
+int load_cuda_library(void);
+void unload_cuda_library(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // CUDA_LOADER_H
diff --git a/src/cuda_memory.c b/src/cuda_memory.c