93
edits
Changes
Savy Cat
,→Assignment 2
==== Profiling With Nsight ====
I edit rotate90.cu, removing the display function calls and looping to rotate the given image 12 times, as done in the CPU version. I copy the result of the rotation back to the host after each operation completes. I re-use the memory allocated on the device for every rotation, allocating the source and destination arrays only once and freeing the memory after all 12 rotations are complete: <nowiki> // Allocate device memory for src and dst std::cout << "Allocating device memory ..." << std::endl; cudaMalloc((void**)&d_src, w * h * sizeof(float) * 3); cudaMalloc((void**)&d_dst, w * h * sizeof(float) * 3); // Copy h_src to d_src std::cout << "Copying source image to device ..." << std::endl; cudaMemcpy(d_src, h_src, w * h * sizeof(float) * 3, cudaMemcpyHostToDevice); // Rotate image 6 x 2 times, copying result back to host each time for (int r = 0; r < 6; r++) { std::cout << "Rotating 2x ..." << std::endl; // Launch grid 3 times (one grid per colour channel) for (int i = 0; i < 3; i++) { rot90 << <dGrid, dBlock >> > (d_src, d_dst, w, h, i); } // Ensure operations completed cudaDeviceSynchronize(); // Copy d_dst to h_dst std::cout << "Copying result to host ..." << std::endl; cudaMemcpy(h_dst, d_dst, w * h * sizeof(float) * 3, cudaMemcpyDeviceToHost); // Rotate again for (int i = 0; i < 3; i++) { rot90 << <dGrid, dBlock >> > (d_dst, d_src, h, w, i); } // Ensure operations completed cudaDeviceSynchronize(); // Copy d_src to h_src cudaMemcpy(h_src, d_src, w * h * sizeof(float) * 3, cudaMemcpyDeviceToHost); std::cout << "Copying result to host ..." << std::endl; } // Dealocate memory std::cout << "Dealocating device memory ..." << std::endl; cudaFree(d_src); cudaFree(d_dst);</nowiki>
=== Assignment 3 ===