I'm trying to produce a mean filter of sliding window 3 x 3. I'm inputting a 16 x 16 array (arrays16.txt --> int matrix) and trying to allocate 16 x 16 threads for each block (1 block right now). Using cuda-memcheck i'm receiving a number of sync/async errors and I have been going around and around. Is there something obviously wrong? I understand it is some sort of segfault and probably an issue with my pointers but I can't seem to get past it.Using cuda-memcheck with the -lineinfo flag it seems to be happening in the loop in the kernel.

Here is my code, thank you:

```
#include <stdio.h>
#include <time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#define MAXR 16
#define MAXC 16
__global__ void imagefilter(float ** intermediates_d, int ** result_d) {
int idx = threadIdx.x;
int idy = threadIdx.y;
int x,y;
//result_d[2][2]= 5;
//if ((idx < 15) && (idy < 15)) {
result_d[x][y] = result_d[idx][idy];
for(x=1; x < MAXR; x++) {
for(y=1; y < MAXC; y++) {
result_d[y][x] = (int) (((float) (intermediates_d[idx - 1][idy- 1]
+ intermediates_d[idx - 1][idy]
+ intermediates_d[idx - 1][idy + 1]
+ intermediates_d[idx][idy - 1] + intermediates_d[idx][idy]
+ intermediates_d[idx][idy + 1]
+ intermediates_d[idx + 1][idy - 1]
+ intermediates_d[idx + 1][idy]
+ intermediates_d[idx + 1][idy + 1]) / 9.0F));
// result_d[2][2]= 5;
result_d[idy][idx]= result_d[y][x];
}
}
}
__syncthreads();
int main(void)
{
int i, j;
//double cpu_time_used;
float intermediates[MAXR][MAXC]; // taking input matrix and converting it to floating
int matrix[MAXR][MAXC]; // This is the input matrix from file
int result[MAXR][MAXC]={{0}}; //This is where we want to write the mean values. For now set to zeros
float ** intermediates_d;
//int **matrix_d;
int ** result_d;
int datasize_f = MAXR * MAXC * sizeof(float);
int datasize_i = MAXR * MAXC * sizeof(int);
//Allocate memory on the host.
cudaMalloc((void**) &intermediates_d, datasize_f);
//cudaMalloc((void**) &matrix_d, datasize);
cudaMalloc((void**) &result_d, datasize_i);
FILE *fp;
fp = fopen("arrays16.txt", "r"); // reads in matrix
//clock_t start =clock();
for (i = 0; i < MAXR; i++) // this loop takes the information from .txt file and puts it into arr1 matrix
{
for (j = 0; j < MAXC; j++)
{
fscanf(fp, "%d\t", &matrix[i][j]);
}
}
printf("*****INPUT MATRIX*****\n");
for (i = 0; i < MAXR; i++)
{
printf("\n");
for (j = 0; j < MAXC; j++) {
printf("%d ", matrix[i][j]);
}
}
printf("\n\n");
//This is where we convert the input matrix into floating point in intermediate matrix
for (int y = 0; y < MAXR; y++) {
for (int x = 0; x < MAXC; x++) {
intermediates[y][x] = (float) matrix[y][x];
}
}
printf("*******INTERMEDIATE MATRIX*******\n");
for (i = 0; i < 16; i++) {
printf("\n"); // prints out the results array to .txt file
for (j = 0; j < 16; j++) {
printf("%.1f ", intermediates[i][j]);
}
}
printf("\n\n");
// copying the data from the host array to the device array
//cudaMemcpy(matrix_d, matrix, datasize,
//cudaMemcpyHostToDevice);
cudaMemcpy(intermediates_d, intermediates, datasize_f,cudaMemcpyHostToDevice);
cudaMemcpy(result_d, result, datasize_i, cudaMemcpyHostToDevice);
// how many blocks we will allocate
dim3 blocks(1, 1);
//how many threads per block we will allocate
dim3 threadsPerBlock(16, 16);
//Launch Kernel
imagefilter<<<blocks, threadsPerBlock,MAXR*MAXC*sizeof(float)>>>(intermediates_d,result_d);
//Copy back Results Matrix.
cudaMemcpy(result, result_d, datasize_i, cudaMemcpyDeviceToHost);
cudaError_t errSync = cudaGetLastError();
cudaError_t errAsync = cudaDeviceSynchronize();
if (errSync != cudaSuccess)
printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
if (errAsync != cudaSuccess)
printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
FILE *file;
file = fopen("results.txt", "w+"); // writes matrix to file
printf("*******RESULTS MATRIX******\n\n");
for (i = 1; i < MAXR - 1; i++) { // prints out the results array to .txt file
for (j = 1; j < MAXC - 1; j++) {
printf("%d ", result[i][j]);
fprintf(file, "%d ",result[i][j]);
}
printf("\n");
fprintf(file, "\n");
}
fclose(file);
}
```