L2 error norm of FFT solution not converging

Given a function u(x,y) I want to calculate the Laplacian of a function

enter image description here

I am doing this via cuFFT's forward and inverse transforms. Below is the code

#include <cufft.h>
#include <stdio.h>
#include <math.h>

#define BSZ 8


#define gpuErrChk(ans) { checkCUDAError((ans), __FILE__, __LINE__); }
inline void checkCUDAError(cudaError_t code, const char *file, int line, bool abort=true)
{
  //cudaError_t err = cudaGetLastError();
  if( cudaSuccess != code) 
  {
    fprintf(stderr, "CUDA Error in %s(%d): %s.\n", file, line, cudaGetErrorString(code) );
    exit(EXIT_FAILURE);
  }    
}

__global__ void scale_drv(cufftComplex *ft, float *k, int N)
{
  int i = threadIdx.x + blockIdx.x*blockDim.x;
  int j = threadIdx.y + blockIdx.y*blockDim.y;
  int idx = j*(N/2+1)+i;
  if (i<(N/2+1) && j<N) 
  {
    float k2 = float(k[i]*k[i] + k[j]*k[j]);
    ft[idx].x *= -k2;
    ft[idx].y *= -k2;
  }
}

int main(int argc, char **argv)
{
  // Default problem size
  int N = 64;

  // User can change problem size
  for (int i=0; i<argc; i++) {
    if (strcmp("-N",argv[i]) == 0) {
      N = atoi(argv[i+1]);
      printf("  found -N %d\n", N);
    }
  }

  // Allocate everything else
  float xmax = 1.0f, xmin = 0.0f, ymin = 0.0f;
  float h = (xmax-xmin)/((float)N), s = 0.1, s2 = s*s;
  float *x, *y, *f, *f_a, *u_a, *k;
  float r2;
  size_t memSize = N*N*sizeof(float);
  size_t vecSize = N*sizeof(float);
  x = (float *)malloc(memSize);
  y = (float *)malloc(memSize);
  f = (float *)malloc(memSize);
  f_a = (float *)malloc(memSize);
  u_a = (float *)malloc(memSize);
  k = (float *)malloc(vecSize);

  // Set coordinates, RHS vector, and analytical solution
  for (int j = 0; j < N; j++)
  {
    for (int i = 0; i < N; i++)
    {
      x[N*j+i] = xmin + i*h;
      y[N*j+i] = ymin + j*h;
      r2 = (x[N*j+i] - 0.5)*(x[N*j+i]-0.5) + (y[N*j+i] - 0.5)*(y[N*j+i]-0.5);
      f_a[N*j+i] = (r2-2*s2)/(s2*s2)*exp(-r2/(2*s2));
      u_a[N*j+i] = exp(-r2/(2*s2));
      //u_a[N*j+i] = sin(2*M_PI*i/float(N))*sin(2*M_PI*j/float(N));
      //f[N*j+i] = -8*M_PI*M_PI*u_a[N*j+i];
    }
  }

  // Fourier coeff?
  float freq = 1.f/N;
  for (int i = 0; i <= N/2; i++) k[i] = i*2*M_PI*freq;
  for (int i = N/2+1; i<N; i++) k[i] = (i - N)*2*M_PI*freq;

  // Allocate device information
  float *k_d, *u_d;
  gpuErrChk(cudaMalloc((void**)&k_d, vecSize));
  gpuErrChk(cudaMalloc((void**)&u_d, memSize));
  gpuErrChk(cudaMemcpy(u_d, u_a, memSize, cudaMemcpyHostToDevice));
  gpuErrChk(cudaMemcpy(k_d, k, vecSize, cudaMemcpyHostToDevice));

  // Initiate cuFFT
  cufftComplex *u_dk;
  gpuErrChk(cudaMalloc((void**)&u_dk, sizeof(cufftComplex)*N*(N/2+1)));

  // Create cuFFT plans
  cufftHandle planf, planb;
  size_t wrksize;
  int dims[2] = {N,N};
  cufftCreate(&planf);
  cufftCreate(&planb);
  cufftMakePlanMany(planf,2,dims,NULL,1,0,NULL,1,0,CUFFT_R2C,1,&wrksize);
  cufftMakePlanMany(planb,2,dims,NULL,1,0,NULL,1,0,CUFFT_C2R,1,&wrksize);

  // Forward transform
  cufftExecR2C(planf, u_d, u_dk);

  // Kernel: scale derivative
  dim3 dimBlock (BSZ, BSZ);
  dim3 dimGrid((N/2)/BSZ+1,N/BSZ);
  scale_drv<<<dimGrid, dimBlock>>>(u_dk, k_d, N);
  cudaDeviceSynchronize();

  // Inverse transform
  cufftExecC2R(planb, u_dk, u_d);

  // Transfer data back to host
  gpuErrChk(cudaMemcpy(f, u_d, memSize, cudaMemcpyDeviceToHost));

  // Save expected vs compute solutions to file
  FILE *fp1,*fp2;
  fp1 = fopen("expected.dat","w");
  fp2 = fopen("computed.dat","w");
  float error = 0.0f;
  for (int i=0; i<N*N; i++) {
    fprintf(fp1,"%.3e\n",f_a[i]);
    fprintf(fp2,"%.3e\n",f[i]);
    // Calculate L2 error norm
    error += (f_a[i] - f[i])*(f_a[i] - f[i]);
  }
  fclose(fp1);
  fclose(fp2);

  // Print L2 error norm
  printf("error = %0.3e\n", sqrt(error));

  // Clean up - list incomplete
  cufftDestroy(planf);
  cufftDestroy(planb);
  return 0;
}

When i do an L2 error norm comparing the expected f(x,y) vs the computed f(x,y) for N = 4, 8, 16, 32, 64, 128, and 256 I get the following:

error = 2.190e+02
error = 8.118e+00
error = 2.887e-02
error = 6.510e-02
error = 1.715e-01
error = 5.509e-01
error = 2.874e+00

I expect the error norm to drop (as it did for N = 4 through 16) but it actually goes up when N = 32 and beyond. Why is this happening? Here's how I am computing the L2 error norm:

for (int i=0; i<N*N; i++) {

  ...

  // Calculate L2 error norm
  error += (f_a[i] - f[i])*(f_a[i] - f[i]);
}

  ...

// Print L2 error norm
printf("error = %0.3e\n", sqrt(error));

I wonder if the increasing numbers result from pollution errors or actually mean that FFT is not getting any more accurate. Any input appreciated