1
edit
Changes
→Assignment 3
== Assignment 3 ==
Because of the way the original developer structured this program, optimization was not really needed. The benefits were very small, but here are the optimized kernels for good measure:
'''RC4 OPTIMIZED Cuda Kernel'''
<pre>
/**
 * Description: RC4 Cuda Kernel
 *
 * Applies cycle() in place to each of the first bufferSize elements of
 * buffer, one element per thread, staging the value through shared memory.
 *
 * Launch layout: 1-D grid of 1-D blocks. The staging array has 1024 slots
 * and is indexed by threadIdx.x, so blockDim.x must not exceed 1024.
 * Threads whose global index falls past bufferSize do nothing, so the grid
 * may be rounded up to a whole number of blocks.
 *
 * buffer     - device pointer to the data, transformed in place
 * bufferSize - number of valid elements in buffer
 * ntpb       - not used by the kernel body; kept so existing launches
 *              keep working
 *
 * NOTE(review): this kernel calls cycle(), the same device function that
 * getCycleBuffer uses - confirm that is what the RC4 path is meant to do.
 **/
__global__ void getRC4Buffer(char * buffer, int bufferSize, int ntpb) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;
    __shared__ float sharedMem[1024];

    // Guard the global load and store as well as the transform: the tail
    // block may contain threads with idx >= bufferSize, and letting them
    // touch buffer[idx] reads and writes past the end of the allocation.
    if (idx < bufferSize) {
        // Each thread uses only its own slot sharedMem[tid]; nothing is
        // exchanged between threads, so no __syncthreads() is required.
        sharedMem[tid] = buffer[idx];
        sharedMem[tid] = cycle(sharedMem[tid]);
        buffer[idx] = sharedMem[tid];
    }
}
</pre>
'''Cycle OPTIMIZED Cuda Kernel'''
<pre>
/**
 * Description: Cycle Cuda Kernel
 *
 * Applies cycle() in place to each of the first bufferSize elements of
 * buffer, one element per thread, staging the value through shared memory.
 *
 * Launch layout: 1-D grid of 1-D blocks. The staging array has 1024 slots
 * and is indexed by threadIdx.x, so blockDim.x must not exceed 1024.
 * Threads whose global index falls past bufferSize do nothing, so the grid
 * may be rounded up to a whole number of blocks.
 *
 * buffer     - device pointer to the data, transformed in place
 * bufferSize - number of valid elements in buffer
 **/
__global__ void getCycleBuffer(char * buffer, int bufferSize) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;
    __shared__ float sharedMem[1024];

    // Guard the global load and store as well as the transform: the tail
    // block may contain threads with idx >= bufferSize, and letting them
    // touch buffer[idx] reads and writes past the end of the allocation.
    if (idx < bufferSize) {
        // Each thread uses only its own slot sharedMem[tid]; nothing is
        // exchanged between threads, so no __syncthreads() is required.
        sharedMem[tid] = buffer[idx];
        sharedMem[tid] = cycle(sharedMem[tid]);
        buffer[idx] = sharedMem[tid];
    }
}
</pre>
The device functions were not modified.