赞
踩
Github上某大佬的文章指出,Intel下的onemkL函数库对于大型矩阵的计算很厉害,如下图,在运算性能以及速度上优于诸多方法,本文旨在通过自己亲身实践,学习不同算法函数库用以求解二维实数到复数的快速傅里叶变换,文章最后会附带上全部源代码。(这是本人的第一篇文章,欢迎大家共同交流学习,共勉。)
FFTW3(Fastest Fourier Transform in the West)是一个开源的软件库,用于计算快速傅里叶变换(FFT)。它提供了高效的算法和实现,用于在数字信号处理、图像处理、模拟和科学计算等领域进行频率域分析。FFTW3被广泛应用于各种编程语言和平台上。它提供了多种不同的FFT变换算法,以适应不同的数据大小和性能需求。
处理器:Intel i5
虚拟机:vwmare17,乌班图18.04(推荐使用,个人觉得比较稳定,出了问题网上可查的解决方法也较多,不过也不必纠结于版本,能跑就行)
首先,调用oneMKL封装的函数,随机生成2048*2048大小的单精度浮点数:
- VSLStreamStatePtr stream;
- vslNewStream(&stream,VSL_BRNG_MT19937,1);
- vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD,stream,Nx*Nz,input,0.0f,1.0f);
- vslDeleteStream(&stream);
使用FFTW3计算real to imag FFT2D:
- fftwf_complex *outputFFTW=(fftwf_complex *)fftwf_malloc(sizeof(fftwf_complex)*Nx*(Nz/2+1));
- fftwf_plan status=fftwf_plan_dft_r2c_2d(Nx,Nz,input,outputFFTW,FFTW_MEASURE);
- t0=wallclock_time();
- fftwf_execute(status);
- t1=wallclock_time();
其中,wallclock_time()是自定义的时间函数,如下:
- double wallclock_time(void)
- {
- struct timeval s_val;
- static struct timeval b_val;
- double time;
- static int base=0;
-
- gettimeofday(&s_val,0);
-
- if (!base) {
- b_val = s_val;
- base = 1;
- return 0.0;
- }
-
- time = (double)(s_val.tv_sec-b_val.tv_sec) +
- (double)(1e-6*((double)s_val.tv_usec-(double)b_val.tv_usec));
-
- return (double)time;
- }
使用oneMKL计算real to imag FFT2D:
- MKL_LONG rs[3]={0,Nx,1};
- MKL_LONG cs[3]={0,Nz/2+1,1};
- MKL_Complex8 *outputMKL=(MKL_Complex8 *)malloc(sizeof(MKL_Complex8)*Nx*(Nz/2+1));
- DFTI_DESCRIPTOR_HANDLE handle;
- MKL_LONG sizeN[2]={Nx,Nz};
- DftiCreateDescriptor(&handle,DFTI_SINGLE,DFTI_REAL,2,sizeN);
- DftiSetValue(handle,DFTI_PLACEMENT,DFTI_NOT_INPLACE);
- DftiSetValue(handle,DFTI_CONJUGATE_EVEN_STORAGE,DFTI_COMPLEX_COMPLEX);
- DftiSetValue(handle,DFTI_INPUT_STRIDES,rs);
- DftiSetValue(handle,DFTI_OUTPUT_STRIDES,cs);
- DftiCommitDescriptor(handle);
- t0=wallclock_time();
- DftiComputeForward(handle,input,outputMKL);
- t1=wallclock_time();
比较残差以及运算性能:
- if(Nx==Nz)
- {
- printf("-------------------------\n");
- diff=0;
- for(i=0;i<Nx*(Nz/2+1);i++)
- {
- diff1=(double)fabs(outputMKL[i].real-outputFFTW[i][0]);
- diff2=(double)fabs(outputMKL[i].imag-outputFFTW[i][1]);
- if(diff1>0.000001 || diff2>0.000001)
- {
- diff=1;
- break;
- }
- }
- if(diff==0) fprintf(stderr," CORRECT (residual < 0.000001) bwetween FFTW3 with oneMKL\n");
- else fprintf(stderr," ERROR (residual >0.000001 ) bwetween FFTW3 with oneMKL\n");
- }
最终结果:
可以发现,oneMKL精度在0.000001数量级下与开源的FFTW3相同,同时性能更加优秀。(不同电脑配置加速率不一样)
按照如下方法编译,这要求你的环境上正确安装编译器。
icx fft_2d_r2c_fftw_and_onemkl.c -o fft_2d_r2c_fftw_and_onemkl -qmkl -std=c99 -lmkl_rt -lm
- #include<stdio.h>
- #include<stdlib.h>
- #include<math.h>
- #include<time.h>
- #include<sys/time.h>
- #include<stdbool.h>
- #include<string.h>
- #include<sys/resource.h>
- #include<fftw3.h>
- #include<mkl.h>
- #include"mkl_vsl.h"
- #include"mkl_service.h"
- #include"mkl_dfti.h"
-
- size_t getCurrentMemoryUsage();
- double wallclock_time(void);
-
- int main(int argc,char *argv[])
- {
- int i,Nx,Nz;
- double t0,t1,t,k,diff1,diff2,diff;
- size_t memoryUsage;
- Nx=2048;
- Nz=2048;
- //N2=16;
- k=log(Nz)/log(2)+1;
-
- while(Nz<=Nx)
- {
- float *input=(float *)malloc(sizeof(float)*Nx*Nz);
- /*Random numbers*/
- VSLStreamStatePtr stream;
- vslNewStream(&stream,VSL_BRNG_MT19937,1);
- vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD,stream,Nx*Nz,input,0.0f,1.0f);
- vslDeleteStream(&stream);
-
- /*Use FFTW3 to do FFT2d*/
- fftwf_complex *outputFFTW=(fftwf_complex *)fftwf_malloc(sizeof(fftwf_complex)*Nx*(Nz/2+1));
- fftwf_plan status=fftwf_plan_dft_r2c_2d(Nx,Nz,input,outputFFTW,FFTW_MEASURE);
- t0=wallclock_time();
- fftwf_execute(status);
- t1=wallclock_time();
- t=t1-t0;
- fprintf(stderr," FFTW Plan N=(%-5d * %-5d) time is %f s\n",Nx,Nz,t);
- if(Nx==Nz)
- {
- t0=wallclock_time();
- for(i=0;i<1000;i++) fftwf_execute(status);
- t1=wallclock_time();
- t=t1-t0;
- fprintf(stderr," 1000 times FFTW Plan N=(%-5d * %-5d) time is %f s\n",Nx,Nz,t);
- }
-
- /*Use oneMKL to do FFT2d*/
- MKL_LONG rs[3]={0,Nx,1};
- MKL_LONG cs[3]={0,Nz/2+1,1};
- MKL_Complex8 *outputMKL=(MKL_Complex8 *)malloc(sizeof(MKL_Complex8)*Nx*(Nz/2+1));
- DFTI_DESCRIPTOR_HANDLE handle;
- MKL_LONG sizeN[2]={Nx,Nz};
- DftiCreateDescriptor(&handle,DFTI_SINGLE,DFTI_REAL,2,sizeN);
- DftiSetValue(handle,DFTI_PLACEMENT,DFTI_NOT_INPLACE);
- DftiSetValue(handle,DFTI_CONJUGATE_EVEN_STORAGE,DFTI_COMPLEX_COMPLEX);
- DftiSetValue(handle,DFTI_INPUT_STRIDES,rs);
- DftiSetValue(handle,DFTI_OUTPUT_STRIDES,cs);
- DftiCommitDescriptor(handle);
- t0=wallclock_time();
- DftiComputeForward(handle,input,outputMKL);
- t1=wallclock_time();
- t=t1-t0;
- fprintf(stderr," oneMKL Plan N=(%-5d * %-5d) time is %f s\n",Nx,Nz,t);
- if(Nx==Nz)
- {
- t0=wallclock_time();
- for(i=0;i<1000;i++) DftiComputeForward(handle,input,outputMKL);
- t1=wallclock_time();
- t=t1-t0;
- fprintf(stderr," 1000 times oneMKL Plan N=(%-5d * %-5d) time is %f s\n",Nx,Nz,t);
- }
-
- /*Compare oneMKL and FFTW3*/
- if(Nx==Nz)
- {
- printf("-------------------------\n");
- diff=0;
- for(i=0;i<Nx*(Nz/2+1);i++)
- {
- diff1=(double)fabs(outputMKL[i].real-outputFFTW[i][0]);
- diff2=(double)fabs(outputMKL[i].imag-outputFFTW[i][1]);
- if(diff1>0.000001 || diff2>0.000001)
- {
- diff=1;
- break;
- }
- }
- if(diff==0) fprintf(stderr," CORRECT (residual < 0.000001) bwetween FFTW3 with oneMKL\n");
- else fprintf(stderr," ERROR (residual >0.000001 ) bwetween FFTW3 with oneMKL\n");
- }
-
- free(input);
- free(outputMKL);
- DftiFreeDescriptor(&handle);
- //fftwf_destory_plan(status);
- fftwf_free(outputFFTW);
-
- Nz=pow(2.0,k);
- k+=1.0;
- }
-
- memoryUsage = getCurrentMemoryUsage();
- fprintf(stderr," CPU usage size : %zu KB \n",memoryUsage);
-
- return 0;
- }
-
- size_t getCurrentMemoryUsage(){
- struct rusage usage;
- getrusage(RUSAGE_SELF, &usage);
- return usage.ru_maxrss;
- }
-
- double wallclock_time(void)
- {
- struct timeval s_val;
- static struct timeval b_val;
- double time;
- static int base=0;
-
- gettimeofday(&s_val,0);
-
- if (!base) {
- b_val = s_val;
- base = 1;
- return 0.0;
- }
-
- time = (double)(s_val.tv_sec-b_val.tv_sec) +
- (double)(1e-6*((double)s_val.tv_usec-(double)b_val.tv_usec));
-
- return (double)time;
- }
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。