
#include <stdio.h>
#include <stdlib.h>
#include <iostream>

#define N 128
#define numThreads 128
#define numBlocks 1


// Element-wise integer addition kernel: writes c[i] = a[i] + b[i].
//
// Each thread computes its flat 1D global index from the block and thread
// coordinates and processes exactly one element. Threads whose index is N or
// greater exit without touching memory, so the launch may supply more threads
// than there are elements.
//
// a, b : input arrays, read at indices [0, N)
// c    : output array, written at indices [0, N)
__global__ void add(int* a, int* b, int* c) {
	const int idx = threadIdx.x + blockIdx.x * blockDim.x;

	// Guard clause: surplus threads past the end of the data do nothing.
	if (idx >= N) {
		return;
	}

	const int lhs = a[idx];
	const int rhs = b[idx];
	c[idx] = lhs + rhs;
}

// Prints a diagnostic naming the failed step and terminates the process if a
// CUDA runtime call did not return cudaSuccess.
//
// status : return value of the CUDA runtime call being checked
// what   : short human-readable description of the step, used in the message
static void checkCuda(cudaError_t status, const char* what) {
	if (status != cudaSuccess) {
		fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

// Host driver for the vector-add sample.
//
// Fills two N-element host arrays (a[i] = i, b[i] = N - i - 1), copies them to
// the device, launches `add` with numBlocks x numThreads threads, copies the
// result back and compares it element by element against a[i] + b[i] computed
// on the host.
//
// Returns 0 when every element matches, EXIT_FAILURE when any element differs
// or a host allocation fails. A failing CUDA call terminates the process via
// checkCuda.
int main(void) {
	const size_t bytes = N * sizeof(int);

	int* a = (int*)malloc(bytes);
	int* b = (int*)malloc(bytes);
	int* c = (int*)malloc(bytes);
	if (a == NULL || b == NULL || c == NULL) {
		fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)bytes);
		// free(NULL) is a no-op, so releasing all three is safe here.
		free(a);
		free(b);
		free(c);
		return EXIT_FAILURE;
	}

	for (int i = 0; i < N; i++) {
		a[i] = i;
		b[i] = N - i - 1;
	}

	int* dev_a = NULL;
	int* dev_b = NULL;
	int* dev_c = NULL;
	checkCuda(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc(dev_a)");
	checkCuda(cudaMalloc((void**)&dev_b, bytes), "cudaMalloc(dev_b)");
	checkCuda(cudaMalloc((void**)&dev_c, bytes), "cudaMalloc(dev_c)");

	checkCuda(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "copy of a to device");
	checkCuda(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "copy of b to device");

	add<<<numBlocks, numThreads>>>(dev_a, dev_b, dev_c);
	// A kernel launch returns no status itself; configuration errors are only
	// visible through cudaGetLastError().
	checkCuda(cudaGetLastError(), "add kernel launch");
	// This copy also reports any error raised while the kernel was executing.
	checkCuda(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "copy of c to host");

	bool flag = true;
	printf("Let us check our results...\n");
	for (int i = 0; i < N; i++) {
		if (a[i] + b[i] != c[i]) {
			flag = false;
			printf("%d + %d != %d\n", a[i], b[i], c[i]);
		}
	}

	if (flag) {
		printf("success!");
	}

	free(a);
	free(b);
	free(c);

	checkCuda(cudaFree(dev_a), "cudaFree(dev_a)");
	checkCuda(cudaFree(dev_b), "cudaFree(dev_b)");
	checkCuda(cudaFree(dev_c), "cudaFree(dev_c)");

	// Report verification failure to the caller instead of always returning 0.
	return flag ? 0 : EXIT_FAILURE;
}
