// CUDA sample: element-wise addition of two float arrays held in Unified Memory.
#include <cstdlib>
#include <iostream>
#include <math.h>
-
// Kernel function to add the elements of two arrays: y[i] = x[i] + y[i].
//
// Parameters:
//   n - number of elements to process; nothing is done when n <= 0
//   x - input array of at least n floats (only read)
//   y - input/output array of at least n floats (updated in place)
//
// Launch layout: any 1D grid of 1D blocks. Each thread starts at its global
// index (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total
// number of launched threads (blockDim.x * gridDim.x), so every element in
// [0, n) is processed exactly once regardless of how many blocks are used.
// A single-block launch such as <<<1, 256>>> behaves exactly as before.
// No shared memory is used.
__global__
void add(int n, float *x, float *y)
{
  // 64-bit arithmetic so neither the global index nor the running index can
  // overflow when n is close to INT_MAX or the grid is very large.
  const long long stride = (long long)blockDim.x * gridDim.x;
  long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
  for (; i < n; i += stride)
    y[i] = x[i] + y[i];
}
-
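// Earlier serial version of the kernel, kept for reference: every launched
// thread loops over all n elements on its own.
/*
__global__
void add(int n, float *x, float *y)
{
  for (int i = 0; i < n; i++)
      y[i] = x[i] + y[i];
}
*/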
-
// Reports a failed CUDA runtime call on stderr.
// `what` names the call for the message. Returns true when `status` is
// cudaSuccess and false otherwise.
static bool reportCudaError(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
    return true;
  std::cerr << "CUDA error (" << what << "): "
            << cudaGetErrorString(status) << std::endl;
  return false;
}

// Fills two 1M-element float arrays with 1.0f and 2.0f, adds them on the GPU
// with the `add` kernel, and prints the maximum deviation of the result from
// the expected value 3.0f.
// Returns EXIT_SUCCESS, or EXIT_FAILURE when any CUDA runtime call fails.
int main(void)
{
  const int N = 1<<20;
  float *x, *y;

  // Allocate Unified Memory - accessible from CPU or GPU
  if (!reportCudaError(cudaMallocManaged(&x, N*sizeof(float)),
                       "cudaMallocManaged(x)"))
    return EXIT_FAILURE;
  if (!reportCudaError(cudaMallocManaged(&y, N*sizeof(float)),
                       "cudaMallocManaged(y)")) {
    cudaFree(x);
    return EXIT_FAILURE;
  }

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU
  add<<<1, 256>>>(N, x, y);

  // A launch does not return a status itself: an invalid configuration is
  // reported by cudaGetLastError(), a fault during execution by the
  // synchronizing call below.
  bool ok = reportCudaError(cudaGetLastError(), "add kernel launch");

  // Wait for GPU to finish before accessing on host
  ok = reportCudaError(cudaDeviceSynchronize(), "cudaDeviceSynchronize") && ok;

  if (!ok) {
    // Do not read the arrays after a failed launch or execution.
    cudaFree(x);
    cudaFree(y);
    return EXIT_FAILURE;
  }

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmaxf(maxError, fabsf(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  cudaFree(x);
  cudaFree(y);

  return EXIT_SUCCESS;
}
|