How to use FFTW in HPS

I have a final project about a data acquisition system that combines a DCC and a Cyclone V SoCKit. I am trying to compute an FFT with the FFTW library on the HPS (ARM Cortex-A9). The result changes every time I execute the program, even when the input frequency stays the same. I have no problem transferring data from the FPGA to the HPS.

I use this code to compute the FFT:
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdint.h>

#include <fftw3.h>
#include <math.h>
#include <float.h>

/**
#include <sys/socket.h>
#include <netinet/in.h>
#include <netdb.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
**/


/* Not referenced by the code visible in this file. */
#define PAGE_SIZE 4096 
/* Physical offset passed to mmap() on /dev/mem; start of the mapped register window. */
#define REG_BASE 0xff200000 //lwh2f
/* Length in bytes of the window mapped at REG_BASE. */
#define REG_SPAN 0x00200000
//#define SIGNAL_BASE 0x00000010
/* Byte offsets, relative to the mapped window, of the registers main() accesses. */
#define CH_A_BASE 0x000000a0
#define CH_B_BASE 0x000000b0
#define READ_EN_BASE 0x00000000
#define FULL_BASE 0x00000020

/* Number of samples per capture and length of the FFT. */
#define NUM_POINTS 512
/* Indices of the real and imaginary parts inside an fftwf_complex element. */
#define REAL 0
#define IMAG 1

/* Not referenced by the code visible in this file. */
#define DEFAULT_IP "192.168.2.12"

/* Address returned by mmap() for the register window. */
void* virtual_base;
/* virtual_base + READ_EN_BASE: register main() writes 1 then 0 around each sample read. */
void* read_en_addr;
//void* signal_addr;
/* virtual_base + CH_A_BASE: register main() reads each sample from. */
void* ch_a_addr;
/* virtual_base + CH_B_BASE: computed in main() but never dereferenced in the visible code. */
void* ch_b_addr;
/* virtual_base + FULL_BASE: register main() polls before starting a capture. */
void* full_addr;
/* File descriptor returned by open("/dev/mem"). */
int fd;
//int signals;
/* Not used by the code visible in this file. */
int read_en;
/* Last value read from full_addr; a capture starts when it equals 1. */
int full;
/* Samples read from ch_a_addr, stored as float; copied into the FFT input. */
float ch_a[512];
/* Not used by the code visible in this file. */
int ch_b;
/* Magnitude of each FFT output bin, converted to int. */
int mag[512];
/* FFT input and output buffers, allocated with fftwf_malloc() in main(). */
fftwf_complex *in , *out;

//fftwf_complex in[NUM_POINTS];
//fftwf_complex out[NUM_POINTS];


int main (void){
in = (fftwf_complex *) fftwf_malloc(sizeof(fftwf_complex) * NUM_POINTS);
out = (fftwf_complex *) fftwf_malloc(sizeof(fftwf_complex) * NUM_POINTS);
while(1){

int delay1;
int delay2;

///////PENGAMBILAN DATA DARI FPGA MASUK KE HPS/////////////////
fd=open("/dev/mem",(O_RDWR|O_SYNC));
virtual_base=mmap(NULL,REG_SPAN,(PROT_READ|PROT_WRITE),MAP_SHARED,fd,REG_BASE);
//signal_addr=virtual_base+SIGNAL_BASE;
ch_a_addr=virtual_base+CH_A_BASE;
ch_b_addr=virtual_base+CH_B_BASE;
read_en_addr=virtual_base+READ_EN_BASE;
full_addr = virtual_base+FULL_BASE;

full = *(uint32_t *) full_addr;

if (full ==1){


FILE *fp;

int i = 0;
fp = fopen("output.txt", "w+");

while(i < 512){

//signals=*(uint32_t *)signal_addr;
*(uint32_t *) read_en_addr = 1;
ch_a[i] = *(uint16_t *) ch_a_addr;
*(uint32_t *) read_en_addr = 0;
//FFT
//TCP-IP

//fprintf(fp, "%d\n", ch_a[i]);

//printf("%i\n",ch_a[i]);
//printf("---batas akhir ch_a----\n");
i++;
}

/////////////////FFT     //////////////////////////////////////////////

   
    
    fp = fopen ("file.txt", "w+");

   
    
    //int i;
    //float sample_rate = 800000;
    for (i = 0; i < NUM_POINTS ; ++i) 
    {
        in[i][REAL] = ch_a[i];

        in[i][IMAG] = 0;
       
    }    

    fftwf_plan p = fftwf_plan_dft_1d(NUM_POINTS, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
    
    fftwf_execute(p); /* repeat as needed */
    
    //float mag[sample];
   for (i = 0; i < NUM_POINTS; ++i) 
    {
        mag[i] = sqrt((out[i][REAL] * out[i][REAL]) + (out[i][IMAG] * out[i][IMAG]));
        //printf ("output real\n");
        //printf ("%d\n",out[i][REAL]);
        //printf ("output Imejiner\n");
        //printf ("%d\n",out[i][IMAG]);
        //printf ("batas akhir untuk output\n");
        printf("%i\n", mag[i]);
        fprintf(fp,"%i\n", mag[i]) ;
    }        
      fclose (fp) ;
     fftwf_destroy_plan(p);
    
    fftwf_free(in); 
    fftwf_free(out);
    
}

}
return 0;
}

I have been thinking hard about what I did wrong: the cross-compilation succeeds, but the result is clearly wrong.