71007

CuFFT Double to Complex

Question:

I want to compute an FFT from double to std::complex with the cuFFT library. My code looks like this:

#include <complex> #include <iostream> #include <cufft.h> #include <cuda_runtime_api.h> typedef std::complex<double> Complex; using namespace std; int main(){ int n = 100; double* in; Complex* out; in = (double*) malloc(sizeof(double) * n); out = (Complex*) malloc(sizeof(Complex) * n/2+1); for(int i=0; i<n; i++){ in[i] = 1; } cufftHandle plan; plan = cufftPlan1d(&plan, n, CUFFT_D2Z, 1); unsigned int mem_size = sizeof(double)*n; cufftDoubleReal *d_in; cufftDoubleComplex *d_out; cudaMalloc((void **)&d_in, mem_size); cudaMalloc((void **)&d_out, mem_size); cudaMemcpy(d_in, in, mem_size, cudaMemcpyHostToDevice); cudaMemcpy(d_out, out, mem_size, cudaMemcpyHostToDevice); int succes = cufftExecD2Z(plan,(cufftDoubleReal *) d_in,(cufftDoubleComplex *) d_out); cout << succes << endl; cudaMemcpy(out, d_out, mem_size, cudaMemcpyDeviceToHost); for(int i=0; i<n/2; i++){ cout << "out: " << i << " " << out[i].real() << " " << out[i].imag() << endl; } return 0; }

but it seems to me this must be wrong, because I think the transformed values should be 1 0 0 0 0 ... or, without the normalization, 100 0 0 0 0 ..., but I just get 0 0 0 0 0 ...

Furthermore, I would prefer it if cufftExecD2Z worked in place, which should be possible, but I haven't figured out how to do so correctly. Can anybody help?

Answer1:

Your code has a variety of errors. You should probably review <a href="http://docs.nvidia.com/cuda/cufft/index.html#plan-execution" rel="nofollow">cufft documentation</a> as well as the sample codes.

<ol><li>You should do proper cuda error checking and proper cufft error checking on all API return values.</li> <li>

The return value of the cufftPlan1d function does not go into the plan:

plan = cufftPlan1d(&plan, n, CUFFT_D2Z, 1);

The function itself sets the plan (that is why you pass &plan to the function), then when you assign the return value into the plan, it ruins the plan set up by the function.

</li> <li>

You correctly identified that the output can be of size ((N/2)+1), but then you didn't allocate space for it properly either on the host side:

out = (Complex*) malloc(sizeof(Complex) * n/2+1);

or on the device side:

unsigned int mem_size = sizeof(double)*n; ... cudaMalloc((void **)&d_out, mem_size); </li> </ol>

The following code has some of the above problems fixed, enough to get your desired result (100, 0, 0, ...)

#include <complex> #include <iostream> #include <cufft.h> #include <cuda_runtime_api.h> #define cudaCheckErrors(msg) \ do { \ cudaError_t __err = cudaGetLastError(); \ if (__err != cudaSuccess) { \ fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \ msg, cudaGetErrorString(__err), \ __FILE__, __LINE__); \ fprintf(stderr, "*** FAILED - ABORTING\n"); \ exit(1); \ } \ } while (0) typedef std::complex<double> Complex; using namespace std; int main(){ int n = 100; double* in; Complex* out; #ifdef IN_PLACE in = (double*) malloc(sizeof(Complex) * (n/2+1)); out = (Complex*)in; #else in = (double*) malloc(sizeof(double) * n); out = (Complex*) malloc(sizeof(Complex) * (n/2+1)); #endif for(int i=0; i<n; i++){ in[i] = 1; } cufftHandle plan; cufftResult res = cufftPlan1d(&plan, n, CUFFT_D2Z, 1); if (res != CUFFT_SUCCESS) {cout << "cufft plan error: " << res << endl; return 1;} cufftDoubleReal *d_in; cufftDoubleComplex *d_out; unsigned int out_mem_size = (n/2 + 1)*sizeof(cufftDoubleComplex); #ifdef IN_PLACE unsigned int in_mem_size = out_mem_size; cudaMalloc((void **)&d_in, in_mem_size); d_out = (cufftDoubleComplex *)d_in; #else unsigned int in_mem_size = sizeof(cufftDoubleReal)*n; cudaMalloc((void **)&d_in, in_mem_size); cudaMalloc((void **)&d_out, out_mem_size); #endif cudaCheckErrors("cuda malloc fail"); cudaMemcpy(d_in, in, in_mem_size, cudaMemcpyHostToDevice); cudaCheckErrors("cuda memcpy H2D fail"); res = cufftExecD2Z(plan,d_in, d_out); if (res != CUFFT_SUCCESS) {cout << "cufft exec error: " << res << endl; return 1;} cudaMemcpy(out, d_out, out_mem_size, cudaMemcpyDeviceToHost); cudaCheckErrors("cuda memcpy D2H fail"); for(int i=0; i<n/2; i++){ cout << "out: " << i << " " << out[i].real() << " " << out[i].imag() << endl; } return 0; }

Review <a href="http://docs.nvidia.com/cuda/cufft/index.html#data-layout" rel="nofollow">the documentation</a> on what is necessary to do an in-place transform in the real to complex case. The above code can be recompiled with -DIN_PLACE to see the behavior for an in-place transform, and the necessary code changes.

Recommend

  • using free for the host results in segmentation fault
  • simple sorting using thrust not working
  • Unknown error when inverting image using cuda
  • How can I add up two 2d (pitched) arrays using nested for loops?
  • CUDA Constant Memory Error
  • how to use the cula device
  • minimize data transfers (use device memory?)
  • Unexpected behavior of numpy.fft.fft with high precision numbers
  • Translating four nested loops into a CUDA kernel
  • OpenCV - Copy GpuMat into cuda device data
  • FFT wrong value?
  • nvvp and nsight's profiler give a different result?
  • Casting an array of C structs to a numpy array
  • Basic CUDA C Program Crashing Under Certain Conditions
  • Mayavi: interpolate face colors in triangular_mesh
  • Iterative image processing in CUDA
  • how to pass array of struct to GPU?
  • Plot a table with R
  • Need help to stop program terminating without users consent
  • Iterating over a container bidirectionally
  • What's the essential difference between these two variadic functions?
  • std::system Exception when instantiating a singleton object
  • OpenMP and File I/O
  • c++ regex_replace not doing intended substitution
  • Breaking out column by groups in Pandas
  • How solve “Qt: Untested Windows version 10.0 detected!”
  • Reduction and collapse clauses in OMP have some confusing points
  • Time complexity of a program which involves multiple variables
  • why overloaded new operator is calling constructor even I am using malloc inside overloading functio
  • Is my CUDA kernel really runs on device or is being mistekenly executed by host in emulation?
  • Django query for large number of relationships
  • Why is Django giving me: 'first_name' is an invalid keyword argument for this function?
  • How can I use `wmic` in a Windows PE script?
  • How to push additional view controllers onto NavigationController but keep the TabBar?