can pycuda parse float as unsigned char array as C++/CUDA does?

Question

I'm trying to do base64 using pycuda for data transfer on network. I need to convert float to byte or unsigned char and I did it just by cudamemcpy after I found memcpy works well on CPU. I mean, I just do cuda mem copy some float values and take those values in the kernel by "unsigend char* " to treat it as byte array.

Also I saw my c++/cuda code works good too but the same code doesn't work in pycuda.

Partial code snap is as below; C++/CUDA

#include <cuda_runtime.h>
#include <stdio.h>
#include <iostream>
 
using namespace std;

#define CHECK(call)                                                            \
{                                                                              \
    const cudaError_t error = call;                                            \
    if (error != cudaSuccess)                                                  \
    {                                                                          \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
                cudaGetErrorString(error));                                    \
        exit(1);                                                               \
    }                                                                          \
}  
// grid 2D block 2D
__global__ void base64_encode(int N, unsigned char* in, unsigned char* out) //////////////// not float type, but uchar to treat it as byte array!!

{
    unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; 
    if (idx < N){
        out[idx] = in[idx];
        printf("cuda thread %d : %02x \n",idx, in[idx]);
    }
}

int main(int argc, char **argv)
{
    printf("%s Starting...\n", argv[0]);
 
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));
 
    int nx = 1 << 2;
    int ny = 1 << 2;

    int nxy = nx * ny;
    int nBytes = nxy * sizeof(float);
    printf("Matrix size: nx %d ny %d\n", nx, ny);
 
    float *h_A, *hostRef;
    unsigned char * gpuRef;
    h_A = (float *)malloc(nBytes); 
    hostRef = (float *)malloc(nBytes);
    gpuRef = (unsigned char *)malloc(nBytes);
    int size= (int)(nxy/ sizeof(float));

    unsigned char b[nxy];//sizeof(float)
    for (int i = 0; i < size; i++)
    {
        h_A[i] = (float)(i & 0xFF);
        cout << h_A[i]  << ", " << endl;
    }
    memset(hostRef, 0, nBytes);   
    memcpy(b, &h_A, nxy);  
    memset(gpuRef, 0, nBytes);
    
    unsigned char *d_input, *d_output;
    CHECK(cudaMalloc((void **)&d_input, nBytes)); 
    CHECK(cudaMalloc((void **)&d_output, nBytes));
 
    CHECK(cudaMemcpy(d_input, h_A, nBytes, cudaMemcpyHostToDevice)); 
 
    int dimx = 4*4; 
    dim3 block(dimx, 1);
    dim3 grid((nxy + block.x - 1) / block.x );
 
    base64_encode<<<grid, block>>>(nxy, d_input, d_output);
    CHECK(cudaDeviceSynchronize());  
    CHECK(cudaGetLastError()); 
    CHECK(cudaMemcpy(gpuRef, d_output, nBytes, cudaMemcpyDeviceToHost));
  
    for (int i = 0; i < nxy; i++) 
        printf("%02x, ",gpuRef[i]); 
 
    CHECK(cudaFree(d_input)); 
    CHECK(cudaFree(d_output));
 
    free(h_A); 
    free(hostRef);
    free(gpuRef);
 
    CHECK(cudaDeviceReset());

    return (0);
}

and the result looks good

0, 
1, 
2, 
3, 
cuda thread 0 : 00 
cuda thread 1 : 00 
cuda thread 2 : 00 
cuda thread 3 : 00 
cuda thread 4 : 00 
cuda thread 5 : 00 
cuda thread 6 : 80 
cuda thread 7 : 3f 
cuda thread 8 : 00 
cuda thread 9 : 00 
cuda thread 10 : 00 
cuda thread 11 : 40 
cuda thread 12 : 00 
cuda thread 13 : 00 
cuda thread 14 : 40 
cuda thread 15 : 40 
00, 00, 00, 00, 00, 00, 80, 3f, 00, 00, 00, 40, 00, 00, 40, 40

PyCUDA code

import numpy as np
import matplotlib.pyplot as plt
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray

kernel = SourceModule("""
#include <stdio.h>  
using namespace std;

__global__ void base64_encode(int N, unsigned char* in, unsigned char* out){
    int idx = threadIdx.x + blockIdx.x * blockDim.x; 
    if (idx < N){
        out[idx] = in[idx];
        printf("cuda thread %d : %02x \\n",idx, in[idx]);
    }
}
""")
def gpu_rgb2gray(): 
    floatValue = np.asarray(1.0).astype(np.float32)
    floatValue_gpu = cuda.mem_alloc(floatValue.nbytes) 
    cuda.memcpy_htod(floatValue_gpu, floatValue)
    
    h_output = np.asarray(0.0).astype(np.float32)      
    d_output = cuda.mem_alloc(h_output.nbytes)
    cuda.memcpy_htod(d_output, h_output) 
    base64_encoder = kernel.get_function("base64_encode") 
    blockDim = (4, 1, 1)  
    gridDim = (1, 1, 1)  
    base64_encoder(4, floatValue_gpu, d_output, block=blockDim, grid=gridDim)
    
    h_output2 = np.array(d_output.get(), dtype=np.ubyte) 
    return 0#h_output

this code shows and error : TypeError: invalid type on parameter #0 (0-based) Could I ask anybody help me please?

Robert Crovella Robert Crovella · Accepted Answer · 2021-08-14T15:05:17

First of all this import:

import pycuda.driver as drv

doesn't match the rest of your code. To match the rest of your code, that should be:

import pycuda.driver as cuda

On to your question. The parameter that pycuda is complaining about is parameter 0 (ie. the first parameter) in this line:

base64_encoder(4, floatValue_gpu, d_output, block=blockDim, grid=gridDim)
               ^

That has nothing to do with the usage of float or any of the subjects you bring up in your question. In your kernel definition, you are expecting a 32-bit integer:

__global__ void base64_encode(int N, ...
                              ^^^

but a bare constant like that in python is evidently something else. You can fix this by modifying the call like this:

base64_encoder(np.int32(4), floatValue_gpu, d_output, block=blockDim, grid=gridDim)
               ^^^^^^^^^^^

When I make those two changes, and run your gpu_rgb2gray() function, I get reasonable-looking printf output from the kernel.

After making those changes, although you're not really using it in the code you have posted, the next problem you will run into is here:

h_output2 = np.array(d_output.get(), dtype=np.ubyte)

Your d_output is a DeviceAllocation object, not a GPUArray object, and so it doesn't have a get attribute/method. To fix that with minimum changes, I would just reverse the methodology you used to populate that object:

h_output2 = np.empty(floatValue.nbytes, dtype=np.ubyte)
cuda.memcpy_dtoh(h_output2, d_output)

Here is a complete example:

$ cat t29.py
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from   pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray

kernel = SourceModule("""
#include <stdio.h>
using namespace std;

__global__ void base64_encode(int N, unsigned char* in, unsigned char* out){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N){
        out[idx] = in[idx];
        printf("cuda thread %d : %02x \\n",idx, in[idx]);
    }
}
""")
def gpu_rgb2gray():
    floatValue = np.asarray(1.0).astype(np.float32)
    floatValue_gpu = cuda.mem_alloc(floatValue.nbytes)
    cuda.memcpy_htod(floatValue_gpu, floatValue)

    h_output = np.asarray(0.0).astype(np.float32)
    d_output = cuda.mem_alloc(h_output.nbytes)
    cuda.memcpy_htod(d_output, h_output)
    base64_encoder = kernel.get_function("base64_encode")
    blockDim = (4, 1, 1)
    gridDim = (1, 1, 1)
    base64_encoder(np.int32(4), floatValue_gpu, d_output, block=blockDim, grid=gridDim)
    h_output2 = np.empty(floatValue.nbytes, dtype=np.ubyte)
    cuda.memcpy_dtoh(h_output2, d_output)
    return h_output2

print(gpu_rgb2gray())

$ cuda-memcheck  python t29.py
========= CUDA-MEMCHECK
cuda thread 0 : 00
cuda thread 1 : 00
cuda thread 2 : 80
cuda thread 3 : 3f
[  0   0 128  63]
========= ERROR SUMMARY: 0 errors
$

can pycuda parse float as unsigned char array as C++/CUDA does?

1 Answers