20100316, 20:32  #1 
Dec 2008
Boycotting the Soapbox
2^{4}·3^{2}·5 Posts 
Mission: maximize addpd/clock
The following program (presumably) repeatedly computes 2 complex convolutions of length 4096 in parallel using the 'conjugate split-radix' algorithm, which AFAIK has the lowest known addition count. g++ gets ~0.5 addpd/clock on a T3400, which translates to a throughput of only 1.5 instructions/clock, but replacing the loop bodies with assembly should give a large boost. Timing is done with RDTSC, which is useless for Core i7's, but it would be nice if somebody could run (and maybe fiddle around a little with compiler switches) this a couple of times on a Phenom and post the output, because they have even more instruction decoder bandwidth. If latency is a problem for these processors I'll try unrolling loops once & interleaving instructions by hand.
The next hurdle is to figure out a nice way to prefetch the data for the next 2 FFTs into L2 during the computation (all the twiddles will already be in cache). Anybody who wants to understand the programming will find this helpful: http://www.drdobbs.com/cpp/199500857. Btw, I use CAPS for types and templates which (I think) makes it a lot easier to tell the meta from the programming (and because I'm a kook). Code:
//copy&paste this into test.cpp //compile with: g++ O3 test.cpp o test //run with: ./test #include <iostream> #include <iomanip> #include <cstdlib> #include <cmath> #include <emmintrin.h> inline long long rdtsc() { long long counter; asm volatile( "xor %%eax,%%eax \n\t" "cpuid \n\t" "rdtsc \n\t" "movl %%eax,(%0) \n\t" "movl %%edx,0x04(%0) \n\t" ::"r"(&counter):"eax","edx","ecx","ebx" ); return counter; } using namespace std; typedef __m128d V2DF; const double twopi=8*atan(1.0); struct V2CD { V2DF real; V2DF imag; }; template<long N> struct TWIDDLE { static void init(); static V2CD root[N]; }; template<long N> V2CD TWIDDLE<N>::root[N]; template<long N> void TWIDDLE<N>::init() { for( long n=0;n<N;++n ) { root[n].real=_mm_set1_pd(cos(twopi*n/N)); root[n].imag=_mm_set1_pd(sin(twopi*n/N)); } TWIDDLE<N/2>::init(); } template<> void TWIDDLE<1>::init() { } template<long N> struct CIC { static void transform( V2CD* __restrict data ); static long additions(); }; template<long N> void CIC<N>::transform( V2CD* __restrict data ) { //split radix dif V2DF tr,ti,sr,si; //w^0 tr=data[0].real; ti=data[0].imag; data[0].real=tr+data[N/2].real; data[0].imag=ti+data[N/2].imag; tr=data[N/2].real; ti=data[N/2].imag; sr=data[N/4].real; si=data[N/4].imag; data[N/4].real=sr+data[N/2+N/4].real; data[N/4].imag=si+data[N/2+N/4].imag; sr=data[N/2+N/4].real; si=data[N/2+N/4].imag; data[N/2].real=trsi; data[N/2].imag=ti+sr; data[N/2+N/4].real=tr+si; data[N/2+N/4].imag=tisr; //w^n for( long n=1;n<N/4;++n ) { tr=data[n+0].real; ti=data[n+0].imag; data[n+0].real=tr+data[n+N/2].real; data[n+0].imag=ti+data[n+N/2].imag; tr=data[n+N/2].real; ti=data[n+N/2].imag; sr=data[n+N/4].real; si=data[n+N/4].imag; data[n+N/4].real=sr+data[n+N/2+N/4].real; data[n+N/4].imag=si+data[n+N/2+N/4].imag; sr=data[n+N/2+N/4].real; si=data[n+N/2+N/4].imag; V2DF ur=tr,ui=ti; ur=si; ui+=sr; tr+=si; ti=sr; data[n+N/2].real=TWIDDLE<N/4>::root[n].real*urTWIDDLE<N/4>::root[n].imag*ui; 
data[n+N/2].imag=TWIDDLE<N/4>::root[n].real*ui+TWIDDLE<N/4>::root[n].imag*ur; data[n+N/2+N/4].real=TWIDDLE<N/4>::root[n].real*tr+TWIDDLE<N/4>::root[n].imag*ti; data[n+N/2+N/4].imag=TWIDDLE<N/4>::root[n].real*tiTWIDDLE<N/4>::root[n].imag*tr; } CIC<N/2>::transform( data ); CIC<N/4>::transform( data + N/2 ); CIC<N/4>::transform( data + N/2 + N/4 ); //split radix dit //w^0 tr=data[N/2].real; ti=data[N/2].imag; sr=trdata[N/2+N/4].imag; si=ti+data[N/2+N/4].real; tr+=data[N/2+N/4].imag; ti=data[N/2+N/4].real; data[N/2].real=data[0].realtr; data[N/2].imag=data[0].realti; data[N/2+N/4].real=data[N/4].realsr; data[N/2+N/4].imag=data[N/4].imagsi; data[0].real+=tr; data[0].imag+=ti; data[N/4].real+=sr; data[N/4].imag+=si; //w^n for( long n=1;n<N/4;++n ) { tr=data[n+N/2].real*TWIDDLE<N/4>::root[n].real+data[n+N/2].imag*TWIDDLE<N/4>::root[n].imag; ti=data[n+N/2].imag*TWIDDLE<N/4>::root[n].realdata[n+N/2].real*TWIDDLE<N/4>::root[n].imag; sr=data[n+N/2+N/4].real*TWIDDLE<N/4>::root[n].realdata[n+N/2+N/4].imag*TWIDDLE<N/4>::root[n].imag; si=data[n+N/2+N/4].imag*TWIDDLE<N/4>::root[n].real+data[n+N/2+N/4].real*TWIDDLE<N/4>::root[n].imag; V2DF ur=tr,ui=ti; tr+=si; ti=sr; sr=ur; si+=ui; data[n+N/2].real=data[n+0].realtr; data[n+N/2].imag=data[n+0].realti; data[n+N/2+N/4].real=data[n+N/4].realsr; data[n+N/2+N/4].imag=data[n+N/4].imagsi; data[n+0].real+=tr; data[n+0].imag+=ti; data[n+N/4].real+=sr; data[n+N/4].imag+=si; } } template<long N> long CIC<N>::additions() { return 2*(16*N4)/4+CIC<N/2>::additions()+2*CIC<N/4>::additions(); } template<> struct CIC<1> { static void transform( V2CD* __restrict data ) { V2DF tr,ti; const V2DF two=_mm_set1_pd(2.0); tr=data[0].real; ti=data[0].imag; data[0].real=tr*trti*ti; data[0].imag=two*tr*ti; } static long additions() { return 1; } }; template<> struct CIC<2> { static void transform( V2CD* __restrict data ) { V2DF tr,ti; tr=data[0].real; ti=data[0].imag; data[0].real=tr+data[1].real; data[0].imag=ti+data[1].imag; data[1].real=trdata[1].real; 
data[1].imag=tidata[1].imag; CIC<1>::transform( data+0 ); CIC<1>::transform( data+1 ); tr=data[0].real; ti=data[0].imag; data[0].real=tr+data[1].real; data[0].imag=ti+data[1].imag; data[1].real=trdata[1].real; data[1].imag=tidata[1].imag; } static long additions() { return 10; } }; int main() { cout << "Eenie, meenie, chilibeanie, the spirits are about to speak: " << flush; const long N=1<<12; const long T=1024; TWIDDLE<N>::init(); V2CD* data=new V2CD[N]; for( long n=0;n<N;++n ) { data[n].real=_mm_set1_pd(0.0); data[n].imag=_mm_set1_pd(0.0); } long t=rdtsc(); for( long i=0;i<T;++i) CIC<N>::transform( data ); t=rdtsc()t; cout << "addpd/clock: " << T*CIC<N>::additions() / double(t) << endl; cout << "Done!" << endl; return EXIT_SUCCESS; } 
20100316, 22:15  #2 
Jul 2006
Calgary
5^{2}×17 Posts 
best I got was : addpd/clock: 0.257482
on a AMD Phenom II X4 940 Processor 
20100316, 22:24  #3  
Dec 2008
Boycotting the Soapbox
2^{4}·3^{2}·5 Posts 
Quote:
Quote:
What compiler switches did you use? Are you running a 64bit or 32bit OS? 

20100316, 23:26  #4 
"Ben"
Feb 2007
7041_{8} Posts 

20100316, 23:38  #5 
Dec 2008
Boycotting the Soapbox
2^{4}·3^{2}·5 Posts 
Try appending -march=amdfam10, too. Although the T3400 is architecturally Core 2, I got a 10% boost using that switch. Go figure.

20100317, 00:33  #6  
"Ben"
Feb 2007
7041_{8} Posts 
Quote:
using -march=amdfam10 I saw 0.24 up to 0.56. Can you run the test in a loop and do some averaging in the code? 

20100317, 01:58  #7  
Jul 2006
Calgary
651_{8} Posts 
Quote:
Quote:
mostly just -O3. It seems like -march=native had little effect. Last fiddled with by lfm on 2010-03-17 at 02:00 

20100317, 02:04  #8 
Dec 2008
Boycotting the Soapbox
2^{4}·3^{2}·5 Posts 
Thanks to all testers.
Just to be certain: you did use both switches at the same time, i.e. g++ -O3 -march=amdfam10, yes? If so, maybe appending -funroll-all-loops will get 0.6, i.e. on par with Core 2. I guess one could gain a little by fiddling around with the instruction order, but then we might as well do it properly and use (inline) assembly. 
20100317, 02:44  #9  
"Ben"
Feb 2007
7041_{8} Posts 
Quote:
including -funroll-all-loops as well, I see some over 0.6. But it feels like cheating since I have to press up-enter-up-enter-up-A-B-A-B-Start as fast as I can to get this one number to appear Code:
bbuhrow@bbuhrow-ubuntu9:~/hrb_test$ g++ -O3 -march=amdfam10 -funroll-all-loops test.cpp -o test
bbuhrow@bbuhrow-ubuntu9:~/hrb_test$ ./test
Eenie, meenie, chilibeanie, the spirits are about to speak: addpd/clock: 0.368945
Done!
bbuhrow@bbuhrow-ubuntu9:~/hrb_test$ ./test
Eenie, meenie, chilibeanie, the spirits are about to speak: addpd/clock: 0.411441
Done!
bbuhrow@bbuhrow-ubuntu9:~/hrb_test$ ./test
Eenie, meenie, chilibeanie, the spirits are about to speak: addpd/clock: 0.379844
Done!
bbuhrow@bbuhrow-ubuntu9:~/hrb_test$ ./test
Eenie, meenie, chilibeanie, the spirits are about to speak: addpd/clock: 0.428434
Done!
bbuhrow@bbuhrow-ubuntu9:~/hrb_test$ ./test
Eenie, meenie, chilibeanie, the spirits are about to speak: addpd/clock: 0.435905
Done!
bbuhrow@bbuhrow-ubuntu9:~/hrb_test$ ./test
Eenie, meenie, chilibeanie, the spirits are about to speak: addpd/clock: 0.426248
Done!
bbuhrow@bbuhrow-ubuntu9:~/hrb_test$ ./test
Eenie, meenie, chilibeanie, the spirits are about to speak: addpd/clock: 0.360692
Done!
bbuhrow@bbuhrow-ubuntu9:~/hrb_test$ ./test
Eenie, meenie, chilibeanie, the spirits are about to speak: addpd/clock: 0.444142
Done!
bbuhrow@bbuhrow-ubuntu9:~/hrb_test$ ./test
Eenie, meenie, chilibeanie, the spirits are about to speak: addpd/clock: 0.636502
Done!
bbuhrow@bbuhrow-ubuntu9:~/hrb_test$ ./test
Eenie, meenie, chilibeanie, the spirits are about to speak: addpd/clock: 0.420736
Done!
bbuhrow@bbuhrow-ubuntu9:~/hrb_test$ ./test

20100317, 03:19  #10  
Dec 2008
Boycotting the Soapbox
2^{4}·3^{2}·5 Posts 
Quote:
The exercise is mainly about figuring out how fast the processor can perform the task under optimal conditions. The follow up task is then to guarantee these conditions as often as possible. I think all the inlining makes it more likely that the scheduler gets 'started on the wrong foot' a couple of times. When we use assembly we have much more control and can hopefully get these hiccups to disappear. 

20100317, 04:11  #11  
Oct 2007
2·53 Posts 
Quote:
Goes up to 0.53 on average using -funroll-all-loops. Didn't change the length from 1<<12. 

Thread Tools  
Similar Threads  
Thread  Thread Starter  Forum  Replies  Last Post 
Maximize chances of finding Mersenne Prime  dennisonprime  Information & Answers  7  20161110 07:52 
Mission Creep  davieddy  PrimeNet  14  20111210 20:55 
Mission Accomplished  garo  Soap Box  13  20090122 20:10 
Looking for a volunteer for a dangerous mission...  ThomRuley  Marin's Mersennearies  6  20040426 19:40 
First mission  GP2  Completed Missions  2  20030928 23:16 