1,dlfcn 的一些信息
dl 系列函数的声明出现在文件 /usr/include/dlfcn.h 中,核心内容如下:
cpp
/* Open the shared object FILE and map it in; return a handle that can be
passed to `dlsym' to get symbol values from it. */
extern void *dlopen (const char *__file, int __mode) __THROWNL;
/* Unmap and close a shared object opened by `dlopen'.
The handle cannot be used again after calling `dlclose'. */
extern int dlclose (void *__handle) __THROWNL __nonnull ((1));
/* Find the run-time address in the shared object HANDLE refers to
of the symbol called NAME. */
extern void *dlsym (void *__restrict __handle,
const char *__restrict __name) __THROW __nonnull ((2));
#ifdef __USE_GNU
/* Like `dlopen', but request object to be allocated in a new namespace. */
extern void *dlmopen (Lmid_t __nsid, const char *__file, int __mode) __THROWNL;
/* Find the run-time address in the shared object HANDLE refers to
of the symbol called NAME with VERSION. */
extern void *dlvsym (void *__restrict __handle,
const char *__restrict __name,
const char *__restrict __version)
__THROW __nonnull ((2, 3));
#endif
/* When any of the above functions fails, call this function
to return a string describing the error. Each call resets
the error string so that a following call returns null. */
extern char *dlerror (void) __THROW;
/* Structure containing information about object searched using
`dladdr'. */
typedef struct
{
const char *dli_fname; /* File name of defining object. */
void *dli_fbase; /* Load address of that object. */
const char *dli_sname; /* Name of nearest symbol. */
void *dli_saddr; /* Exact value of nearest symbol. */
} Dl_info;
/* Fill in *INFO with the following information about ADDRESS.
Returns 0 iff no shared object's segments contain that address. */
extern int dladdr (const void *__address, Dl_info *__info)
__THROW __nonnull ((2));
/* Same as `dladdr', but additionally sets *EXTRA_INFO according to FLAGS. */
extern int dladdr1 (const void *__address, Dl_info *__info,
void **__extra_info, int __flags) __THROW __nonnull ((2));
/* Get information about the shared object HANDLE refers to.
REQUEST is from among the values below, and determines the use of ARG.
On success, returns zero. On failure, returns -1 and records an error
message to be fetched with `dlerror'. */
extern int dlinfo (void *__restrict __handle,
int __request, void *__restrict __arg)
__THROW __nonnull ((1, 3));
关于函数 extern void *dlopen (const char *__file, int __mode) __THROWNL;
其中 第二个参数mode的可能取值的宏却出现在另一个同名头文件中:
/usr/include/x86_64-linux-gnu/bits/dlfcn.h
其中核心内容如下:
cpp
/* The MODE argument to `dlopen' contains one of the following: */
#define RTLD_LAZY 0x00001 /* Lazy function call binding. */
#define RTLD_NOW 0x00002 /* Immediate function call binding. */
#define RTLD_BINDING_MASK 0x3 /* Mask of binding time value. */
#define RTLD_NOLOAD 0x00004 /* Do not load the object. */
#define RTLD_DEEPBIND 0x00008 /* Use deep binding. */
/* If the following bit is set in the MODE argument to `dlopen',
the symbols of the loaded object and its dependencies are made
visible as if the object were linked directly into the program. */
#define RTLD_GLOBAL 0x00100
/* Unix98 demands the following flag which is the inverse to RTLD_GLOBAL.
The implementation does this by default and so we can define the
value to zero. */
#define RTLD_LOCAL 0
/* Do not delete object when closed. */
#define RTLD_NODELETE 0x01000
下面我们做个实验,取两种mode的值:
mode = RTLD_LAZY
mode = RTLD_LAZY | RTLD_NODELETE
2,实验代码
main_drv.cpp
cpp
#include <stdio.h>
#include <stdlib.h>
#include <dlfcn.h>
#include <cuda.h>
#include <builtin_types.h>
#include "matSumKernel.h"
/* Sentinel held in func_info::inited until init_drv() has processed the slot. */
#define UNINIT (0x321cba00)

/* One slot of the driver entry-point table. */
typedef struct func_info {
    int   inited;   /* UNINIT, or ~UNINIT once init_drv() has written the slot */
    void *func;     /* address obtained from dlsym() for the matching fname[] entry */
} func_info;

/*
 * Entry-point table, indexed by func_order.
 * All 20 slots start out unresolved: sentinel flag and a null address.
 */
func_info fa[20] = {
    {UNINIT, (void*)0}, {UNINIT, (void*)0}, {UNINIT, (void*)0}, {UNINIT, (void*)0},
    {UNINIT, (void*)0}, {UNINIT, (void*)0}, {UNINIT, (void*)0}, {UNINIT, (void*)0},
    {UNINIT, (void*)0}, {UNINIT, (void*)0}, {UNINIT, (void*)0}, {UNINIT, (void*)0},
    {UNINIT, (void*)0}, {UNINIT, (void*)0}, {UNINIT, (void*)0}, {UNINIT, (void*)0},
    {UNINIT, (void*)0}, {UNINIT, (void*)0}, {UNINIT, (void*)0}, {UNINIT, (void*)0},
};
/*
 * Slot numbers shared by fa[] and fname[]: enumerator k names the entry whose
 * symbol string is fname[k]. The two lists must be kept in the same order.
 */
enum func_order {
    EcuCtxDetach               = 0,
    EcuMemFree                 = 1,
    EcuMemcpyDtoH              = 2,
    EcuLaunchKernel            = 3,
    EcuMemcpyHtoD              = 4,
    EcuMemAlloc                = 5,
    EcuModuleGetFunction       = 6,
    EcuModuleLoad              = 7,
    EcuCtxCreate               = 8,
    EcuDeviceTotalMem          = 9,
    EcuDeviceComputeCapability = 10,
    EcuDeviceGet               = 11,
    EcuDeviceGetCount          = 12,
    EcuInit                    = 13,
    EcuDeviceGetName           = 14,
    Emax_cu                    = 15   /* count of real entries; loop bound in init_drv() */
};
/*
 * Symbol names handed to dlsym(), one per func_order slot (same order as the
 * enum). Only the first Emax_cu rows are populated; the rest stay empty strings.
 */
const char fname[20][64] = {
    /*  0 EcuCtxDetach               */ "cuCtxDetach",
    /*  1 EcuMemFree                 */ "cuMemFree_v2",
    /*  2 EcuMemcpyDtoH              */ "cuMemcpyDtoH_v2",
    /*  3 EcuLaunchKernel            */ "cuLaunchKernel",
    /*  4 EcuMemcpyHtoD              */ "cuMemcpyHtoD_v2",
    /*  5 EcuMemAlloc                */ "cuMemAlloc_v2",
    /*  6 EcuModuleGetFunction       */ "cuModuleGetFunction",
    /*  7 EcuModuleLoad              */ "cuModuleLoad",
    /*  8 EcuCtxCreate               */ "cuCtxCreate_v2",
    /*  9 EcuDeviceTotalMem          */ "cuDeviceTotalMem_v2",
    /* 10 EcuDeviceComputeCapability */ "cuDeviceComputeCapability",
    /* 11 EcuDeviceGet               */ "cuDeviceGet",
    /* 12 EcuDeviceGetCount          */ "cuDeviceGetCount",
    /* 13 EcuInit                    */ "cuInit",
    /* 14 EcuDeviceGetName           */ "cuDeviceGetName"
};
/*
 * Open libcuda.so and fill fa[] with the addresses of the symbols listed in
 * fname[] (slots 0 .. Emax_cu-1).
 *
 * mode  Flags passed to dlopen(). Defaults to RTLD_LAZY, which is what the
 *       function used before the parameter existed; pass
 *       RTLD_LAZY | RTLD_NODELETE to run the second variant of the experiment
 *       without editing this body.
 *
 * Returns 0 on success, 2 when dlopen() fails and 1 when a dlsym() lookup
 * reports an error. A slot is marked as initialised only after its lookup
 * succeeded, so a failed slot is never mistaken for a resolved one.
 *
 * NOTE: the handle is closed before returning. Unless `mode` contains
 * RTLD_NODELETE (or something else keeps the library loaded), the addresses
 * stored in fa[] may no longer be valid afterwards -- that is exactly the
 * effect the surrounding experiment examines.
 */
int init_drv(int mode = RTLD_LAZY)
{
    void *handle = dlopen("libcuda.so", mode);
    if (!handle) {
        fprintf(stderr, "%s\n", dlerror());
        return 2;
    }

    dlerror(); /* Clear any existing error */

    for (int i = 0; i < Emax_cu; i++) {
        void *sym = dlsym(handle, fname[i]);
        /* Fetch the status of this lookup right away; dlerror() resets on read. */
        const char *error = dlerror();

        printf("i = %d, %s : %p\n", i, fname[i], sym); fflush(stdout);

        if (error != NULL) {
            fprintf(stderr, "%s\n", error);
            /* Release the handle on the failure path too, mirroring success. */
            dlclose(handle);
            return 1;
        }

        fa[i].func = sym;
        fa[i].inited = ~UNINIT;
    }

    dlclose(handle);
    return 0;
}
CUresult cuDDLaunchKernel(CUfunction f,
unsigned int gridDimX,
unsigned int gridDimY,
unsigned int gridDimZ,
unsigned int blockDimX,
unsigned int blockDimY,
unsigned int blockDimZ,
unsigned int sharedMemBytes,
CUstream hStream,
void **kernelParams,
void **extra)
{
typedef CUresult (*TcuLaunchKernel)(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **);
if(fa[func_order::EcuLaunchKernel].inited == UNINIT)
init_drv();
TcuLaunchKernel fp = TcuLaunchKernel(fa[func_order::EcuLaunchKernel].func);
//printf("In %s() fp = %p, cuLaunchKernel = %p\n",__func__, fp, cuLaunchKernel ); fflush(stdout);
return fp(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra);
}
CUresult cuDDCtxDetach (CUcontext ctx)
{
typedef CUresult (*TcuCtxDetach) (CUcontext);
if(fa[func_order::EcuCtxDetach].inited == UNINIT)
init_drv();
//CUresult (*fp) (CUcontext)
TcuCtxDetach fp = TcuCtxDetach(fa[func_order::EcuCtxDetach].func);
//printf("In %s() fp = %p, cuCtxDetach = %p\n",__func__, fp, cuCtxDetach ); fflush(stdout);
return fp(ctx);
}
CUresult cuDDInit (unsigned int Flags)
{
typedef CUresult (*TcuInit) (unsigned int);
if(fa[func_order::EcuInit].inited == UNINIT)
init_drv();
TcuInit fp = TcuInit(fa[func_order::EcuInit].func);
//printf("In %s() fp = %p, cuInit = %p\n",__func__, fp, cuInit); fflush(stdout);
return fp(Flags);
}
CUresult cuDDDeviceGetCount (int *count)
{
typedef CUresult (*TcuDeviceGetCount)(int *);
if(fa[func_order::EcuDeviceGetCount].inited == UNINIT)
init_drv();
TcuDeviceGetCount fp = TcuDeviceGetCount(fa[func_order::EcuDeviceGetCount].func);
//printf("In %s() fp = %p, cuDeviceGetCount = %p\n",__func__, fp, cuDeviceGetCount); fflush(stdout);
return fp(count);
}
CUresult cuDDDeviceGet (CUdevice *device, int ordinal)
{
typedef CUresult (*TcuDeviceGet)(CUdevice *, int);
if(fa[func_order::EcuDeviceGet].inited == UNINIT)
init_drv();
TcuDeviceGet fp = TcuDeviceGet(fa[func_order::EcuDeviceGet].func);
//printf("In %s() fp = %p, cuDeviceGet = %p\n",__func__, fp, cuDeviceGet); fflush(stdout);
return fp(device, ordinal);
}
CUresult cuDDDeviceComputeCapability (int *major, int *minor, CUdevice dev)
{
typedef CUresult (*TcuDeviceComputeCapability)(int *, int *, CUdevice);
if(fa[func_order::EcuDeviceComputeCapability].inited == UNINIT)
init_drv();
TcuDeviceComputeCapability fp = TcuDeviceComputeCapability(fa[func_order::EcuDeviceComputeCapability].func);
//printf("In %s() fp = %p, cuDeviceComputeCapability = %p\n",__func__, fp, cuDeviceComputeCapability); fflush(stdout);
return fp(major, minor, dev);
}
CUresult cuDDDeviceTotalMem (size_t *bytes, CUdevice dev)
{
typedef CUresult (*TcuDeviceTotalMem) (size_t *bytes, CUdevice dev);
if(fa[func_order::EcuDeviceTotalMem].inited == UNINIT)
init_drv();
TcuDeviceTotalMem fp = TcuDeviceTotalMem(fa[func_order::EcuDeviceTotalMem].func);
//printf("In %s() fp = %p, cuDeviceTotalMem = %p\n", __func__, fp, cuDeviceTotalMem); fflush(stdout);
return fp(bytes, dev);
}
CUresult cuDDCtxCreate (CUcontext *pctx, unsigned int flags, CUdevice dev)
{
typedef CUresult (*TcuCtxCreate)(CUcontext *, unsigned int, CUdevice);
if(fa[func_order::EcuCtxCreate].inited == UNINIT)
init_drv();
TcuCtxCreate fp = TcuCtxCreate(fa[func_order::EcuCtxCreate].func);
//printf("In %s() fp = %p, cuCtxCreate = %p\n",__func__, fp, cuCtxCreate); fflush(stdout);
return fp(pctx, flags, dev);
}
CUresult cuDDModuleLoad (CUmodule *module, const char *fname)
{
typedef CUresult (*TcuModuleLoad)(CUmodule *, const char *);
if(fa[func_order::EcuModuleLoad].inited == UNINIT)
init_drv();
TcuModuleLoad fp = TcuModuleLoad(fa[func_order::EcuModuleLoad].func);
//printf("In %s() fp = %p, cuModuleLoad = %p\n",__func__, fp, cuModuleLoad); fflush(stdout);
return fp(module, fname);
}
CUresult cuDDModuleGetFunction (CUfunction *hfunc, CUmodule hmod, const char *name)
{
typedef CUresult (*TcuModuleGetFunction)(CUfunction *, CUmodule, const char *);
if(fa[func_order::EcuModuleGetFunction].inited == UNINIT)
init_drv();
TcuModuleGetFunction fp = TcuModuleGetFunction(fa[func_order::EcuModuleGetFunction].func);
//printf("In %s() fp = %p, cuModuleGetFunction = %p\n",__func__, fp, cuModuleGetFunction); fflush(stdout);
return fp(hfunc, hmod, name);
}
CUresult cuDDMemAlloc (CUdeviceptr *dptr, size_t bytesize)
{
typedef CUresult (*TcuMemAlloc)(CUdeviceptr *, size_t);
if(fa[func_order::EcuMemAlloc].inited == UNINIT)
init_drv();
TcuMemAlloc fp = TcuMemAlloc(fa[func_order::EcuMemAlloc].func);
//printf("In %s() fp = %p, cuMemAlloc = %p\n",__func__, fp, cuMemAlloc); fflush(stdout);
return fp(dptr, bytesize);
}
CUresult cuDDMemcpyHtoD (CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount)
{
typedef CUresult (*TcuMemcpyHtoD) (CUdeviceptr, const void *, size_t);
if(fa[func_order::EcuMemcpyHtoD].inited == UNINIT)
init_drv();
TcuMemcpyHtoD fp = TcuMemcpyHtoD(fa[func_order::EcuMemcpyHtoD].func);
//printf("In %s() fp = %p, cuMemcpyHtoD = %p\n",__func__, fp, cuMemcpyHtoD); fflush(stdout);
return fp(dstDevice, srcHost, ByteCount);
}
CUresult cuDDMemcpyDtoH (void *dstHost, CUdeviceptr srcDevice, size_t ByteCount)
{
typedef CUresult (*TcuMemcpyDtoH) (void *, CUdeviceptr, size_t);
if(fa[func_order::EcuMemcpyDtoH].inited == UNINIT)
init_drv();
TcuMemcpyDtoH fp = TcuMemcpyDtoH(fa[func_order::EcuMemcpyDtoH].func);
//printf("In %s() fp = %p, cuMemcpyDtoH = %p\n",__func__, fp, cuMemcpyDtoH); fflush(stdout);
return fp(dstHost, srcDevice, ByteCount);
}
CUresult cuDDMemFree (CUdeviceptr dptr)
{
typedef CUresult (*TcuMemFree) (CUdeviceptr);
if(fa[func_order::EcuMemFree].inited == UNINIT)
init_drv();
TcuMemFree fp = TcuMemFree(fa[func_order::EcuMemFree].func);
//printf("In %s() fp = %p, cuMemFree = %p\n",__func__, fp, cuMemFree); fflush(stdout);
printf("____");
return fp(dptr);
}
CUresult cuDDDeviceGetName (char *name, int len, CUdevice dev)
{
typedef CUresult (*TcuDeviceGetName) (char *, int, CUdevice);
if(fa[func_order::EcuDeviceGetName].inited == UNINIT)
init_drv();
TcuDeviceGetName fp = TcuDeviceGetName(fa[func_order::EcuDeviceGetName].func);
//printf("In %s() fp = %p, cuDeviceGetName = %p\n",__func__, fp, cuDeviceGetName); fflush(stdout);
return fp(name, len, dev);
}
/*
 * Stand-alone driver for the symbol-resolution step: runs init_drv() once and
 * always reports success, whatever init_drv() returned.
 */
int main2()
{
    const int status = init_drv();
    (void)status;   /* result intentionally not propagated */
    return 0;
}
#if 1
/* Wraps a driver-API call so a failure is reported with the call site's file and line. */
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)

/*
 * Terminate the process with exit(-1) when `err` is not CUDA_SUCCESS, after
 * printing the numeric error code together with `file` and `line` to stderr.
 * Returns normally on success.
 */
inline void __checkCudaErrors( CUresult err, const char *file, const int line )
{
    if (err == CUDA_SUCCESS)
        return;

    fprintf(stderr,
            "CUDA Driver API error = %04d from file <%s>, line %i.\n",
            err, file, line);
    exit(-1);
}
/* Device handle; filled by cuDDDeviceGet() in main(). */
CUdevice device;
/* Driver context; filled by cuDDCtxCreate() and released with cuDDCtxDetach() in main(). */
CUcontext context;
/* Module loaded from module_file by cuDDModuleLoad() in main(). */
CUmodule module;
/* Kernel handle looked up by kernel_name via cuDDModuleGetFunction() in main(). */
CUfunction function;
/* Byte count written by cuDDDeviceTotalMem() in main(). */
size_t totalGlobalMem;
/* Path of the PTX file passed to cuDDModuleLoad(). */
char *module_file = (char*) "matSumKernel.ptx";
/* Name of the kernel requested from the loaded module. */
char *kernel_name = (char*) "matSum";
int main()
{
int a[N], b[N], c[N];
CUdeviceptr d_a, d_b, d_c;
// initialize host arrays
for (int i = 0; i < N; ++i) {
a[i] = N - i;
b[i] = i * i;
}
// initialize
printf("- Initializing...\n");
int deviceCount = 0;
CUresult err = cuDDInit(0);
int major = 0, minor = 0;
if (err == CUDA_SUCCESS)
checkCudaErrors(cuDDDeviceGetCount(&deviceCount));
if (deviceCount == 0) {
fprintf(stderr, "Error: no devices supporting CUDA\n");
exit(-1);
}
// get first CUDA device
checkCudaErrors(cuDDDeviceGet(&device, 0));
char name[100];
cuDDDeviceGetName(name, 100, device);
printf("> Using device 0: %s\n", name);
// get compute capabilities and the devicename
checkCudaErrors( cuDDDeviceComputeCapability(&major, &minor, device) );
printf("> GPU Device has SM %d.%d compute capability\n", major, minor);
checkCudaErrors( cuDDDeviceTotalMem(&totalGlobalMem, device) );
printf(" Total amount of global memory: %llu bytes\n",
(unsigned long long)totalGlobalMem);
printf(" 64-bit Memory Address: %s\n",
(totalGlobalMem > (unsigned long long)4*1024*1024*1024L)?
"YES" : "NO");
err = cuDDCtxCreate(&context, 0, device);
if (err != CUDA_SUCCESS) {
fprintf(stderr, "* Error initializing the CUDA context.\n");
cuDDCtxDetach(context);
exit(-1);
}
err = cuDDModuleLoad(&module, module_file);
if (err != CUDA_SUCCESS) {
fprintf(stderr, "* Error loading the module %s\n", module_file);
cuDDCtxDetach(context);
exit(-1);
}
err = cuDDModuleGetFunction(&function, module, kernel_name);
if (err != CUDA_SUCCESS) {
fprintf(stderr, "* Error getting kernel function %s\n", kernel_name);
cuDDCtxDetach(context);
exit(-1);
}
checkCudaErrors( cuDDMemAlloc(&d_a, sizeof(int) * N) );
checkCudaErrors( cuDDMemAlloc(&d_b, sizeof(int) * N) );
checkCudaErrors( cuDDMemAlloc(&d_c, sizeof(int) * N) );
// copy arrays to device
checkCudaErrors( cuDDMemcpyHtoD(d_a, a, sizeof(int) * N) );
checkCudaErrors( cuDDMemcpyHtoD(d_b, b, sizeof(int) * N) );
// run
printf("# Running the kernel...\n");
void *args[3] = { &d_a, &d_b, &d_c };
// grid for kernel: <<<N, 1>>>
checkCudaErrors( cuDDLaunchKernel(function, N, 1, 1, // Nx1x1 blocks
1, 1, 1, // 1x1x1 threads
0, 0, args, 0) );
printf("# Kernel complete.\n");
// copy results to host and report
checkCudaErrors( cuDDMemcpyDtoH(c, d_c, sizeof(int) * N) );
for (int i = 0; i < N; ++i) {
if (c[i] != a[i] + b[i])
printf("* Error at array position %d: Expected %d, Got %d\n",
i, a[i]+b[i], c[i]);
}
printf("*** All checks complete.\n");fflush(stdout);
// finish
printf("- Finalizing...\n");
checkCudaErrors( cuDDMemFree(d_a) );
checkCudaErrors( cuDDMemFree(d_b) );
checkCudaErrors( cuDDMemFree(d_c) );
checkCudaErrors(cuDDCtxDetach(context));
return 0;
}
#endif
/*
CUresult cuLaunchKernel (CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra)
CUresult cuInit (unsigned int Flags)
CUresult cuDeviceGetCount (int *count)
CUresult cuDeviceGet (CUdevice *device, int ordinal)
CUresult cuDeviceComputeCapability (int *major, int *minor, CUdevice dev)
CUresult cuDeviceTotalMem (size_t *bytes, CUdevice dev)
CUresult cuCtxCreate (CUcontext *pctx, unsigned int flags, CUdevice dev)
CUresult cuModuleLoad (CUmodule *module, const char *fname)
CUresult cuModuleGetFunction (CUfunction *hfunc, CUmodule hmod, const char *name)
CUresult cuMemAlloc (CUdeviceptr *dptr, size_t bytesize)
CUresult cuMemcpyHtoD (CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount)
CUresult cuMemcpyDtoH (void *dstHost, CUdeviceptr srcDevice, size_t ByteCount)
CUresult cuMemFree (CUdeviceptr dptr)
CUresult cuCtxDetach (CUcontext ctx)
*/
matSumKernel.h
cpp
/* Shared between the host program and the kernel: problem size only. */
#ifndef __MATSUMKERNEL_H
#define __MATSUMKERNEL_H
// size of the vectors to sum
#define N 100
#endif /* __MATSUMKERNEL_H */
cpp
#include "matSumKernel.h"
/*
 * Element-wise sum of two N-element integer vectors: c[i] = a[i] + b[i].
 *
 * Launch layout: one-dimensional grid; each thread handles the element at its
 * flat global index and threads at or beyond N do nothing. The host's
 * <<<N, 1>>> launch gives the same index as before (blockDim.x == 1,
 * threadIdx.x == 0); other one-dimensional configurations with at least N
 * threads in total also cover every element. No shared memory is used.
 *
 * extern "C" keeps the symbol name unmangled so it can be looked up as
 * "matSum" in the generated PTX.
 */
extern "C"
__global__ void matSum(const int *a, const int *b, int *c)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}
Makefile
cpp
# Builds the host program `hello` and the PTX module it loads at run time.
all: hello matSumKernel.ptx

# Device code -> PTX, loaded by the host through cuModuleLoad.
%.ptx: %.cu
	nvcc $< -ptx -arch=sm_75 -o $@ -I./

%.cu.o: %.cu
	nvcc $< -c -arch=sm_75 -o $@ -I./

%.cpp.o: %.cpp
	g++ -g $< -c -o $@ -I /usr/local/cuda/include

# -ldl: dlopen/dlsym/dlclose live in libdl on glibc releases before 2.34;
# the flag is harmless on newer ones.
hello: main_drv.cpp.o
	g++ -g $^ -o hello -lcuda -L /usr/local/cuda/lib64 -lcudart -ldl

.PHONY: all clean
clean:
	-rm -rf hello *.o *.ptx
3,实验现象
mode = RTLD_LAZY
虽然可以看到每个函数都获得了地址,但是当执行 dlclose 之后,libcuda.so 的内容又被释放了。
下面试试 mode = RTLD_LAZY | RTLD_NODELETE
可以发现程序正常运行,函数的内容依然存在于内存空间中。