I was trying to run a simple CUDA program that performs matrix addition on a specific size.
Here is my code:
main.cpp
/* sample CUDA programming to prove that (AB)transpose=(B)transpose*(A)transpose */
#include "common.h"
#include "utils.h"
#include <iostream>
#include <stdlib.h>
#include <time.h>
using namespace std;
void preprocess(int *A, int *B, int *C, int **da, int **db, int **dc,int M, int N, int P,int blksize);
void checktransposeppt(int *da, int *db, int *dc);
void display(int a[], int b[])
{
//display the matrices
}
int main()
{
int A[M*P],B[P*N];
int C[M*N];
int *da;
int *db;
int *dc;
//initializing values for A and B
display(A,B);
preprocess(A,B,C,&da,&db,&dc,M,N,P,blksize);
checktransposeppt(da,db,dc);
checkCudaErrors(cudaFree(da));
checkCudaErrors(cudaFree(db));
checkCudaErrors(cudaFree(dc));
}
and here is preprocess.cpp :- basically doing cudamalloc, cudamemcpy hosttodevice of arrays and devicetohost of resultant
#include "utils.h"
void preprocess(int *h_a, int *h_b, int *h_c,int **d_a,int **d_b,int **d_c,int M, int N, int P, int blksize)
{
checkCudaErrors(cudaFree(0));
checkCudaErrors(cudaMalloc(d_a,(size_t)sizeof(int)*(M*P)));
checkCudaErrors(cudaMalloc(d_b,(size_t)sizeof(int)*(P*N)));
checkCudaErrors(cudaMalloc(d_c,(size_t)sizeof(int)*(M*N)));
checkCudaErrors(cudaMemset(d_c,0,(size_t)sizeof(int)*(M*N)));
checkCudaErrors(cudaMemcpy(*d_a,h_a,(size_t)sizeof(int)*(M*P),cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(*d_b,h_b,(size_t)sizeof(int)*(P*N),cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(h_c,*d_c,(size_t)sizeof(int)*(M*N),cudaMemcpyDeviceToHost));
}
and this is common.h, a central place to define most of the external headers and global variables
#ifndef COMMON_H
#include <cuda.h>
#include <cuda_runtime.h>
#define COMMON_H
extern int M=256;
extern int P=128;
extern int N=64;
extern int blksize=16;
extern dim3 gridsize(M/blksize,N/blksize,1);
extern dim3 blocksize(blksize,blksize,1);
#endif
the kernel.cu
#include "utils.h"
#include "common.h"
__global__ void abkerneltranspose(int *d_a,int *d_b,int *d_c,int N);
__global__
void abkerneltranspose(int *d_a,int *d_b,int *d_c,int N)
{
int blkx=blockIdx.x;
int blky=blockIdx.y;
int thdx=threadIdx.x;
int thdy=threadIdx.y;
int row=blkx*blockDim.x+threadIdx.x;
int col=blky*blockDim.y+threadIdx.y;
d_c[row*N+col]=d_a[row*N+col]+d_b[row*N+col];
}
void checktransposeppt(int *d_a,int *d_b,int *d_c)
{
dim3 gridsize(M/blksize,N/blksize,1);
dim3 blocksize(blksize,blksize,1);
abkerneltranspose<<<gridsize,blocksize>>>(d_a,d_b,d_c,N);
}
and here is where I suspect the culprit to be: makefile
NVCC=nvcc
NVCC_OPTS=-O3 -arch=sm_20 -Xcompiler -Wall -Xcompiler -Wextra -m64
all: app
app: gpucompile.o cpucompile.o Makefile
nvcc -o app gpucompile.o cpucompile.o -L $(NVCC_OPTS) $(GCC_OPTS)
gpucompile.o: kernel.cu
nvcc -c kernel.cu $(NVCC_OPTS)
cpucompile.o: main.cpp preprocess.cpp
nvcc -x cu main.cpp preprocess.cpp -I. -I $(GCC_OPTS) -I $(CUDA_INCLUDEPATH)
clean:
rm -f *.o hw *.bin
ok here is the problem
on make command, It compiles correctly but throws an error
/tmp/tmpxft_00002074_00000000-21_main.o: In function
main': tmpxft_00002074_00000000-3_main.cudafe1.cpp:(.text+0x543): undefined reference tocheckTransposeppt(int*, int*, int*)'
I am really not sure why this occurs. I compile and create the cpp code separately (just ignore -x cu, it does not cause error) and do the same kernel.cu which I later link.
But this error is thrown by the intermediate main.o which leads me to believe that It failed in creating the cpucompile.o. But couldn't the linker wait till it gets gpucompile.o and then link the two.
Also I tried creating separate object files main.o, preprocess.o and kernel.o and link them all in one step
then I get the following additional error:
/tmp/tmpxft_00002f88_00000000-16_main.o: In function
main': tmpxft_00002f88_00000000-3_main.cudafe1.cpp:(.text+0x532): undefined reference topreprocess(int*, int*, int*, int**, int**, int**, int, int, int, int)'
I missed something basic, can someone please explain what is going wrong here?
Also what is the best practice for doing a project like this: I mean I separate compile device code and cpu code and then link them. I also have a common header where I define the external headers and global variables/classes/function definitions. Any suggestions?