![]() |
![]() |
#1 |
Dec 2008
Boycotting the Soapbox
24×32×5 Posts |
![]()
The following program (presumably) repeatedly computes 2 complex convolutions of length 4096 in parallel using the 'conjugate spilt-radix' algorithm, which AFAIK has the lowest known addition count. g++ gets ~0.5 addpd/clock on a T3400, which translates to a throughput of only 1.5 instructions/clock, but replacing the loop-bodies with assembly should give a large boost. Timing is done with RDTSC, which is useless for core i7's, but it would be nice if somebody could run (and maybe fiddle around a little with compiler switches) this a couple of times on a Phenom and post the output, because they have even more instruction decoder bandwidth. If latency is a problem for these processors I'll try unrolling loops once & interleaving instructions by hand.
The next hurdle is to figure out a nice way to prefetch the data for the next 2 FFTs into L2 during the computation (all the twiddles will already be in cache). Anybody, who wants to understand the programming, will find this will helpful: http://www.drdobbs.com/cpp/199500857. Btw, I use CAPS for types and templates which (I think) makes it a lot easier to tell the meta- from the programming (and because I'm a kook). Code:
//copy&paste this into test.cpp //compile with: g++ -O3 test.cpp -o test //run with: ./test #include <iostream> #include <iomanip> #include <cstdlib> #include <cmath> #include <emmintrin.h> inline long long rdtsc() { long long counter; asm volatile( "xor %%eax,%%eax \n\t" "cpuid \n\t" "rdtsc \n\t" "movl %%eax,(%0) \n\t" "movl %%edx,0x04(%0) \n\t" ::"r"(&counter):"eax","edx","ecx","ebx" ); return counter; } using namespace std; typedef __m128d V2DF; const double twopi=8*atan(1.0); struct V2CD { V2DF real; V2DF imag; }; template<long N> struct TWIDDLE { static void init(); static V2CD root[N]; }; template<long N> V2CD TWIDDLE<N>::root[N]; template<long N> void TWIDDLE<N>::init() { for( long n=0;n<N;++n ) { root[n].real=_mm_set1_pd(cos(twopi*n/N)); root[n].imag=_mm_set1_pd(sin(twopi*n/N)); } TWIDDLE<N/2>::init(); } template<> void TWIDDLE<1>::init() { } template<long N> struct CIC { static void transform( V2CD* __restrict data ); static long additions(); }; template<long N> void CIC<N>::transform( V2CD* __restrict data ) { //split radix dif V2DF tr,ti,sr,si; //w^0 tr=data[0].real; ti=data[0].imag; data[0].real=tr+data[N/2].real; data[0].imag=ti+data[N/2].imag; tr-=data[N/2].real; ti-=data[N/2].imag; sr=data[N/4].real; si=data[N/4].imag; data[N/4].real=sr+data[N/2+N/4].real; data[N/4].imag=si+data[N/2+N/4].imag; sr-=data[N/2+N/4].real; si-=data[N/2+N/4].imag; data[N/2].real=tr-si; data[N/2].imag=ti+sr; data[N/2+N/4].real=tr+si; data[N/2+N/4].imag=ti-sr; //w^n for( long n=1;n<N/4;++n ) { tr=data[n+0].real; ti=data[n+0].imag; data[n+0].real=tr+data[n+N/2].real; data[n+0].imag=ti+data[n+N/2].imag; tr-=data[n+N/2].real; ti-=data[n+N/2].imag; sr=data[n+N/4].real; si=data[n+N/4].imag; data[n+N/4].real=sr+data[n+N/2+N/4].real; data[n+N/4].imag=si+data[n+N/2+N/4].imag; sr-=data[n+N/2+N/4].real; si-=data[n+N/2+N/4].imag; V2DF ur=tr,ui=ti; ur-=si; ui+=sr; tr+=si; ti-=sr; data[n+N/2].real=TWIDDLE<N/4>::root[n].real*ur-TWIDDLE<N/4>::root[n].imag*ui; data[n+N/2].imag=TWIDDLE<N/4>::root[n].real*ui+TWIDDLE<N/4>::root[n].imag*ur; data[n+N/2+N/4].real=TWIDDLE<N/4>::root[n].real*tr+TWIDDLE<N/4>::root[n].imag*ti; data[n+N/2+N/4].imag=TWIDDLE<N/4>::root[n].real*ti-TWIDDLE<N/4>::root[n].imag*tr; } CIC<N/2>::transform( data ); CIC<N/4>::transform( data + N/2 ); CIC<N/4>::transform( data + N/2 + N/4 ); //split radix dit //w^-0 tr=data[N/2].real; ti=data[N/2].imag; sr=tr-data[N/2+N/4].imag; si=ti+data[N/2+N/4].real; tr+=data[N/2+N/4].imag; ti-=data[N/2+N/4].real; data[N/2].real=data[0].real-tr; data[N/2].imag=data[0].real-ti; data[N/2+N/4].real=data[N/4].real-sr; data[N/2+N/4].imag=data[N/4].imag-si; data[0].real+=tr; data[0].imag+=ti; data[N/4].real+=sr; data[N/4].imag+=si; //w^-n for( long n=1;n<N/4;++n ) { tr=data[n+N/2].real*TWIDDLE<N/4>::root[n].real+data[n+N/2].imag*TWIDDLE<N/4>::root[n].imag; ti=data[n+N/2].imag*TWIDDLE<N/4>::root[n].real-data[n+N/2].real*TWIDDLE<N/4>::root[n].imag; sr=data[n+N/2+N/4].real*TWIDDLE<N/4>::root[n].real-data[n+N/2+N/4].imag*TWIDDLE<N/4>::root[n].imag; si=data[n+N/2+N/4].imag*TWIDDLE<N/4>::root[n].real+data[n+N/2+N/4].real*TWIDDLE<N/4>::root[n].imag; V2DF ur=tr,ui=ti; tr+=si; ti-=sr; sr-=ur; si+=ui; data[n+N/2].real=data[n+0].real-tr; data[n+N/2].imag=data[n+0].real-ti; data[n+N/2+N/4].real=data[n+N/4].real-sr; data[n+N/2+N/4].imag=data[n+N/4].imag-si; data[n+0].real+=tr; data[n+0].imag+=ti; data[n+N/4].real+=sr; data[n+N/4].imag+=si; } } template<long N> long CIC<N>::additions() { return 2*(16*N-4)/4+CIC<N/2>::additions()+2*CIC<N/4>::additions(); } template<> struct CIC<1> { static void transform( V2CD* __restrict data ) { V2DF tr,ti; const V2DF two=_mm_set1_pd(2.0); tr=data[0].real; ti=data[0].imag; data[0].real=tr*tr-ti*ti; data[0].imag=two*tr*ti; } static long additions() { return 1; } }; template<> struct CIC<2> { static void transform( V2CD* __restrict data ) { V2DF tr,ti; tr=data[0].real; ti=data[0].imag; data[0].real=tr+data[1].real; data[0].imag=ti+data[1].imag; data[1].real=tr-data[1].real; data[1].imag=ti-data[1].imag; CIC<1>::transform( data+0 ); CIC<1>::transform( data+1 ); tr=data[0].real; ti=data[0].imag; data[0].real=tr+data[1].real; data[0].imag=ti+data[1].imag; data[1].real=tr-data[1].real; data[1].imag=ti-data[1].imag; } static long additions() { return 10; } }; int main() { cout << "Eenie, meenie, chili-beanie, the spirits are about to speak: " << flush; const long N=1<<12; const long T=1024; TWIDDLE<N>::init(); V2CD* data=new V2CD[N]; for( long n=0;n<N;++n ) { data[n].real=_mm_set1_pd(0.0); data[n].imag=_mm_set1_pd(0.0); } long t=rdtsc(); for( long i=0;i<T;++i) CIC<N>::transform( data ); t=rdtsc()-t; cout << "addpd/clock: " << T*CIC<N>::additions() / double(t) << endl; cout << "Done!" << endl; return EXIT_SUCCESS; } |
![]() |
![]() |
![]() |
#2 |
Jul 2006
Calgary
52×17 Posts |
![]()
best I got was : addpd/clock: 0.257482
on a AMD Phenom II X4 940 Processor |
![]() |
![]() |
![]() |
#3 | ||
Dec 2008
Boycotting the Soapbox
24·32·5 Posts |
![]() Quote:
Quote:
What compiler switches did you use? Are you running a 64-bit or 32-bit OS? |
||
![]() |
![]() |
![]() |
#4 |
"Ben"
Feb 2007
70438 Posts |
![]() |
![]() |
![]() |
![]() |
#5 |
Dec 2008
Boycotting the Soapbox
24×32×5 Posts |
![]()
Try appending -march=amdfam10, too. Although the T3400 is architecturally core2, I got a 10% boost using that switch. Go figure.
|
![]() |
![]() |
![]() |
#6 | |
"Ben"
Feb 2007
E2316 Posts |
![]() Quote:
using -march=amdfam10 I saw 0.24 up to 0.56. Can you run the test in a loop and do some averaging in the code? |
|
![]() |
![]() |
![]() |
#7 | ||
Jul 2006
Calgary
42510 Posts |
![]() Quote:
Quote:
mostly just -O3. It seems like -march=native had little effect. Last fiddled with by lfm on 2010-03-17 at 02:00 |
||
![]() |
![]() |
![]() |
#8 |
Dec 2008
Boycotting the Soapbox
24·32·5 Posts |
![]()
Thanks to all testers.
Just to be certain: you did use both switches at the same time, i.e. g++ -O3 -march=amdfam10, yes? If so, maybe appending -funroll-all-loops will get 0.6, i.e. on par with core2. I guess one could gain a little by fiddling around with the instruction order, but then we might as well do it properly and use (inline) assembly. |
![]() |
![]() |
![]() |
#9 | |
"Ben"
Feb 2007
7×11×47 Posts |
![]() Quote:
including -funroll-all-loops as well, I see some over 0.6. But it feels like cheating since I have to press up-enter-up-enter-up-A-B-A-B-Start as fast as I can to get this one number to appear ![]() Code:
bbuhrow@bbuhrow-ubuntu-9:~/hrb_test$ g++ -O3 -march=amdfam10 -funroll-all-loops test.cpp -o test bbuhrow@bbuhrow-ubuntu-9:~/hrb_test$ ./test Eenie, meenie, chili-beanie, the spirits are about to speak: addpd/clock: 0.368945 Done! bbuhrow@bbuhrow-ubuntu-9:~/hrb_test$ ./test Eenie, meenie, chili-beanie, the spirits are about to speak: addpd/clock: 0.411441 Done! bbuhrow@bbuhrow-ubuntu-9:~/hrb_test$ ./test Eenie, meenie, chili-beanie, the spirits are about to speak: addpd/clock: 0.379844 Done! bbuhrow@bbuhrow-ubuntu-9:~/hrb_test$ ./test Eenie, meenie, chili-beanie, the spirits are about to speak: addpd/clock: 0.428434 Done! bbuhrow@bbuhrow-ubuntu-9:~/hrb_test$ ./test Eenie, meenie, chili-beanie, the spirits are about to speak: addpd/clock: 0.435905 Done! bbuhrow@bbuhrow-ubuntu-9:~/hrb_test$ ./test Eenie, meenie, chili-beanie, the spirits are about to speak: addpd/clock: 0.426248 Done! bbuhrow@bbuhrow-ubuntu-9:~/hrb_test$ ./test Eenie, meenie, chili-beanie, the spirits are about to speak: addpd/clock: 0.360692 Done! bbuhrow@bbuhrow-ubuntu-9:~/hrb_test$ ./test Eenie, meenie, chili-beanie, the spirits are about to speak: addpd/clock: 0.444142 Done! bbuhrow@bbuhrow-ubuntu-9:~/hrb_test$ ./test Eenie, meenie, chili-beanie, the spirits are about to speak: addpd/clock: 0.636502 Done! bbuhrow@bbuhrow-ubuntu-9:~/hrb_test$ ./test Eenie, meenie, chili-beanie, the spirits are about to speak: addpd/clock: 0.420736 Done! bbuhrow@bbuhrow-ubuntu-9:~/hrb_test$ ./test |
|
![]() |
![]() |
![]() |
#10 | |
Dec 2008
Boycotting the Soapbox
24×32×5 Posts |
![]() Quote:
![]() The exercise is mainly about figuring out how fast the processor can perform the task under optimal conditions. The follow up task is then to guarantee these conditions as often as possible. I think all the inlining makes it more likely that the scheduler gets 'started on the wrong foot' a couple of times. When we use assembly we have much more control and can hopefully get these hiccups to disappear. |
|
![]() |
![]() |
![]() |
#11 | |
Oct 2007
11010102 Posts |
![]() Quote:
Goes up to 0.53 on average using -funroll-all-loops. Didn't change the length from 1<<12. |
|
![]() |
![]() |
![]() |
Thread Tools | |
![]() |
||||
Thread | Thread Starter | Forum | Replies | Last Post |
Maximize chances of finding Mersenne Prime | dennisonprime | Information & Answers | 7 | 2016-11-10 07:52 |
Mission Creep | davieddy | PrimeNet | 14 | 2011-12-10 20:55 |
Mission Accomplished | garo | Soap Box | 13 | 2009-01-22 20:10 |
Looking for a volunteer for a dangerous mission... | ThomRuley | Marin's Mersenne-aries | 6 | 2004-04-26 19:40 |
First mission | GP2 | Completed Missions | 2 | 2003-09-28 23:16 |