You've already hinted in your question at one possible approach:
#define DSIZE (100*1048576)
// Running count of ints appended to the buffer; lives in device global memory
// and is advanced atomically by each writing thread.
__device__ unsigned int buffer_index = 0;
// Device-side pointer to the output buffer. The host fills this in with
// cudaMemcpyToSymbol after cudaMalloc, since a __device__ pointer symbol
// cannot be passed to cudaMalloc directly.
__device__ int *buffer_data;
In your host code:
int *buffer_data_temp;
// Allocate the buffer in device memory, then publish the resulting device
// pointer into the __device__ symbol buffer_data so kernels can reach it.
cudaMalloc(&buffer_data_temp, sizeof(int)*DSIZE);
cudaMemcpyToSymbol(buffer_data, &buffer_data_temp, sizeof(int *));
In your thread code:
// Atomically reserve a private region [my_buffer_offset,
// my_buffer_offset + size_of_my_thread_data) of the shared buffer.
unsigned int my_buffer_offset = atomicAdd(&buffer_index, size_of_my_thread_data);
// A reservation ending exactly at DSIZE still fits (the last valid index is
// DSIZE-1), so the overflow check is <=, not <.
assert((my_buffer_offset+size_of_my_thread_data) <= DSIZE);
// Safe without a mutex: each thread writes only inside its own reservation.
memcpy(buffer_data+my_buffer_offset, my_thread_data, size_of_my_thread_data*sizeof(int));
(disclaimer: coded in browser, not tested)
No mutex is needed, for example around the memcpy operation. Once the atomicAdd has reserved the starting and ending points of a thread's allocation, the threads cannot step on each other, even when all of them are writing simultaneously, because each one writes to a disjoint region of buffer_data.
EDIT:
Here's a complete example:
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#define DSIZE (100*1048576)
#define nTPB 32
#define BLKS 2
__device__ unsigned int buffer_index = 0;
// Kernel: each thread atomically reserves a slot range in the global output
// buffer via the buffer_index counter, then writes its data into that range.
// Expects a 1-D grid of 1-D blocks; every thread contributes
// size_of_my_thread_data ints. No mutex is needed: the atomicAdd hands each
// thread a disjoint region, so writes never overlap.
__global__ void update_buffer(int *buffer_data){
  const unsigned int size_of_my_thread_data = 1;
  // Reserve [my_buffer_offset, my_buffer_offset + size_of_my_thread_data).
  unsigned int my_buffer_offset = atomicAdd(&buffer_index, size_of_my_thread_data);
  // A reservation ending exactly at DSIZE still fits (last valid index is
  // DSIZE-1), so the overflow check is <=, not <. Failure traps the kernel
  // and surfaces at the next synchronizing call on the host.
  assert((my_buffer_offset+size_of_my_thread_data) <= DSIZE);
  int my_thread_data[size_of_my_thread_data];
  // Encode the writer's identity so the host output is easy to inspect.
  my_thread_data[0] = (blockIdx.x*10000) + threadIdx.x;
  memcpy(buffer_data+my_buffer_offset, my_thread_data, size_of_my_thread_data*sizeof(int));
}
// Driver: launches the kernel, reads back the final buffer_index (the total
// number of ints the threads wrote), copies that many results to the host,
// and prints them. Returns nonzero on any CUDA or allocation failure.
int main(){
  int *h_buffer_data, *d_buffer_data;
  cudaError_t err = cudaMalloc(&d_buffer_data, sizeof(int)*DSIZE);
  if (err != cudaSuccess){
    fprintf(stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
    return 1;}
  update_buffer<<<BLKS, nTPB>>>(d_buffer_data);
  // A kernel launch returns no status directly: launch-configuration errors
  // appear via cudaGetLastError() ...
  err = cudaGetLastError();
  if (err != cudaSuccess){
    fprintf(stderr, "kernel launch: %s\n", cudaGetErrorString(err));
    return 1;}
  // ... and in-kernel faults (e.g. the device-side assert) only surface at
  // the next synchronizing call.
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess){
    fprintf(stderr, "kernel execution: %s\n", cudaGetErrorString(err));
    return 1;}
  unsigned int result_size;
  err = cudaMemcpyFromSymbol(&result_size, buffer_index, sizeof(unsigned int));
  if (err != cudaSuccess){
    fprintf(stderr, "cudaMemcpyFromSymbol: %s\n", cudaGetErrorString(err));
    return 1;}
  h_buffer_data = (int *)malloc(sizeof(int)*result_size);
  if (h_buffer_data == NULL){
    fprintf(stderr, "host malloc failed\n");
    return 1;}
  err = cudaMemcpy(h_buffer_data, d_buffer_data, result_size*sizeof(int), cudaMemcpyDeviceToHost);
  if (err != cudaSuccess){
    fprintf(stderr, "cudaMemcpy: %s\n", cudaGetErrorString(err));
    return 1;}
  // result_size is unsigned; match the loop index type to avoid a
  // signed/unsigned comparison.
  for (unsigned int i = 0; i < result_size; i++)
    printf("%d\n", h_buffer_data[i]);
  free(h_buffer_data);
  cudaFree(d_buffer_data);
  return 0;
}