__global__ void kdot(const float* d_a, const float* d_b, float* d_p, int ni, int nj, int nk) {
int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; //matrix multiplication if (i < ni && j < nj) { float sum = 0.0f; for (int k = 0; k < nk; k++) sum += d_a[i * nk + k] * d_b[k * nj + j]; d_p[i * nj + j] = sum;