mersenneforum.org  

Old 2007-06-12, 03:25   #1
rtharper
 
 
Apr 2007

2² Posts
OS X Glucas build

Hi all,

I'm keen to get an OS X build of the most recent version of Glucas out there. I RTFM'd and ran the usual configure and make. The standard build process produced the following error:
Code:
ynormm_5.c: In function 'dit_carry_norm_dif_5_sse2':
ynormm_5.c:729: internal compiler error: output_operand: unterminated assembly dialect alternative
I compiled all the other files to flush out any remaining errors, and found that this one occurs in ynormm_[5|7|9].c.

I further found that the error only appears in conditional compilations using the -DY_USE_SSE2 token, and only in the functions "dit_carry_norm_dif_[5|7|9]_sse2". My experience with assembly has not yet extended to gcc's inline directives. Has anyone had experience with them, with this error, or with compiling this on OS X? I did get a build running fine with SSE2 instructions disabled; if you are interested in that build I will post it, but otherwise I hope some of you can provide a hint on how to solve this error.
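
For reference, the message seems to come from GCC's handling of assembler "dialect alternatives": on i386 targets an asm template may contain constructs like "{att-text|intel-text}" that get selected by -masm=, and an opening '{' with no matching '}' makes the operand printer bail out with exactly this "unterminated assembly dialect alternative" complaint (depending on the GCC version it surfaces as a plain error or as an ICE). Here is a deliberately broken toy example of my own (not from the Glucas sources) that should provoke the same class of failure:
Code:
/* toy.c -- deliberately broken illustration, NOT from Glucas.
 * On i386, GCC treats '{', '|' and '}' inside an asm template as
 * dialect-alternative markers: "{AT&T text|Intel text}".  The stray
 * '{' below is unterminated, so the compiler should abort while
 * printing the operands, complaining about an unterminated
 * assembly dialect alternative. */
#include <emmintrin.h>

__m128d toy (__m128d a, __m128d b)
{
  /* The trailing '{' is the deliberate bug; delete it and this compiles. */
  __asm__ volatile ("addpd %2, %0 {" : "=&x" (a) : "0" (a), "x" (b));
  return a;
}
Whether that is really what is happening here I can't say; it may only indicate where inside the compiler things go wrong.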
Old 2007-06-12, 19:31   #2
ewmayer
2ω=0
 
 
Sep 2002
República de California

3·5²·131 Posts

Tom, could you please post a copy of the code section the compiler is squawking about?
Old 2007-06-12, 22:37   #3
rgiltrap
 
 
Apr 2006
Down Under

89 Posts

Also, would you please post the performance results of the non-SSE2 version?
Old 2007-06-13, 23:28   #4
rtharper
 
 
Apr 2007

2² Posts

Quote:
Originally Posted by ewmayer
Tom, could you please post a copy of the code section the compiler is squawking about?
Here is the function body, as preprocessed code, since that is what actually produces the error. The annoying part of this issue (and why I'm a little paralysed by it) is that the line the compiler reports is just a closing brace, and the call before it doesn't seem to be the problem:

Code:
void dit_carry_norm_dif_5_sse2 (BIG_DOUBLE *x, UL N ,UL err_flag)
{
  Y__M128D ttp02, ttp13, ttp4x, ttmp02, ttmp13, ttmp4x;
  Y__M128D tw1r, tw1i, tw2r, tw2i, tw3r, tw3i, tw4r, tw4i;
  Y__M128D t0r, t0i, t1r, t1i, t2r, t2i, t3r, t3i, t4r, t4i, t5r, t5i;
  Y__M128D bj02, bj13, bj4x;
  Y__M128D carry02, carry13, carry4x, maxerr;





  UL bj0, bj1, bj2, bj3, bj4;
  y_ptr px, pd0, pd1, pd2, pd3, pd4;
  UL pad = Y_LRIGHT[1], pad2, pad3, pad4;
  UL i, j, k, l, ll;



  px = Y_TWDB[Y_NRADICES - 2];

  __builtin_prefetch(px + 0, 0, 3);

  carry02 = _mm_setzero_pd( );
  carry13 = _mm_setzero_pd( );
  carry4x = _mm_setzero_pd( );
  maxerr = _mm_setzero_pd( );





  if(Y_SBIT == 0)
    carry02 = _mm_setr_pd( (BIG_DOUBLE) -2.0, (BIG_DOUBLE) 0.0);

  pad2 = (pad << 1); pad4 = (pad << 2); pad3 = pad + pad2; pd0 = x; __builtin_prefetch(pd0 + 0, 1, 3); pd1 = x + ((pad << 1) + (((pad << 1) & (~(UL)(128 - 1)) )>>(6 )) + (((pad << 1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + (((pad << 1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ (((pad << 1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))); __builtin_prefetch(pd1 + 0, 1, 3); pd2 = x + ((pad2 << 1) + (((pad2 << 1) & (~(UL)(128 - 1)) )>>(6 )) + (((pad2 << 1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + (((pad2 << 1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ (((pad2 << 1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))); __builtin_prefetch(pd2 + 0, 1, 3); pd3 = x + ((pad3 << 1) + (((pad3 << 1) & (~(UL)(128 - 1)) )>>(6 )) + (((pad3 << 1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + (((pad3 << 1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ (((pad3 << 1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))); __builtin_prefetch(pd3 + 0, 1, 3); pd4 = x + ((pad4 << 1) + (((pad4 << 1) & (~(UL)(128 - 1)) )>>(6 )) + (((pad4 << 1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + (((pad4 << 1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ (((pad4 << 1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))); __builtin_prefetch(pd4 + 0, 1, 3);;

  bj0 = N; bj1 = ( b % 5) << (Y_K + 1); bj2 = (( 2 * b) % 5) << (Y_K + 1); bj3 = (( 3 * b) % 5) << (Y_K + 1); bj4 = (( 4 * b) % 5) << (Y_K + 1); bj02 = _mm_setr_pd( (BIG_DOUBLE) bj0, (BIG_DOUBLE) bj2); bj13 = _mm_setr_pd( (BIG_DOUBLE) bj1, (BIG_DOUBLE) bj3); bj4x = _mm_setr_pd( (BIG_DOUBLE) bj4, (BIG_DOUBLE) 0);;

  { Y__M128D __aux0, __aux1; __aux1 = _mm_load_pd(px); __aux0 = _mm_set_sd( (BIG_DOUBLE) 1.0); tw1r = _mm_unpacklo_pd( __aux0, __aux1); tw1i = _mm_unpackhi_pd( __aux0, __aux1); }; { Y__M128D __aux0, __aux1; __aux1 = _mm_load_pd(px + 2); __aux0 = _mm_set_sd( (BIG_DOUBLE) 1.0); tw3r = _mm_unpacklo_pd( __aux0, __aux1); tw3i = _mm_unpackhi_pd( __aux0, __aux1); }; { Y__M128D __x0,__x1; tw2r = tw3r; __x0 = tw3i; tw2i = tw3i; __x1 = tw3r; tw2r = _mm_mul_pd( tw2r, tw1r); __x0 = _mm_mul_pd( __x0, tw1i); tw2i = _mm_mul_pd( tw2i, tw1r); tw4r = tw2r; tw4i = tw2i; __x1 = _mm_mul_pd( __x1, tw1i); tw4r = _mm_sub_pd( tw4r, __x0); tw2r = _mm_add_pd( tw2r, __x0); tw4i = _mm_add_pd( tw4i, __x1); tw2i = _mm_sub_pd( tw2i, __x1); }; __builtin_prefetch(px + 4, 0, 3);;

  px += 4;

  for (i = 0, j = 0; j < pad2; j += UPDATE, i++)
    {
      { size_t j0, j1; j0 = ((i) + (((i) & (~(UL)(128 - 1)) )>>(6 )) + (((i) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + (((i) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ (((i) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))); j1 = ((((pad2 >> (SHIFT_UPDATE - 1)) + i)) + (((((pad2 >> (SHIFT_UPDATE - 1)) + i)) & (~(UL)(128 - 1)) )>>(6 )) + (((((pad2 >> (SHIFT_UPDATE - 1)) + i)) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + (((((pad2 >> (SHIFT_UPDATE - 1)) + i)) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ (((((pad2 >> (SHIFT_UPDATE - 1)) + i)) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))); ttmp02 = _mm_setr_pd( (BIG_DOUBLE) two_to_minusphi[j0], (BIG_DOUBLE) two_to_minusphi[j1]); ttp02 = _mm_setr_pd( (BIG_DOUBLE) two_to_phi[j0], (BIG_DOUBLE) two_to_phi[j1]); __builtin_prefetch(two_to_minusphi + j0 + 16, 0, 3); __builtin_prefetch(two_to_minusphi + j1 + 16, 0, 3); __builtin_prefetch(two_to_phi + j0 + 16, 0, 3); __builtin_prefetch(two_to_phi + j1 + 16, 0, 3); j0 = ((((pad2 >> SHIFT_UPDATE) + i)) + (((((pad2 >> SHIFT_UPDATE) + i)) & (~(UL)(128 - 1)) )>>(6 )) + (((((pad2 >> SHIFT_UPDATE) + i)) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + (((((pad2 >> SHIFT_UPDATE) + i)) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ (((((pad2 >> SHIFT_UPDATE) + i)) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))); j1 = ((((pad3 >> (SHIFT_UPDATE - 1)) + i)) + (((((pad3 >> (SHIFT_UPDATE - 1)) + i)) & (~(UL)(128 - 1)) )>>(6 )) + (((((pad3 >> (SHIFT_UPDATE - 1)) + i)) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + (((((pad3 >> (SHIFT_UPDATE - 1)) + i)) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ (((((pad3 >> (SHIFT_UPDATE - 1)) + i)) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))); ttmp13 = _mm_setr_pd( (BIG_DOUBLE) two_to_minusphi[j0], (BIG_DOUBLE) two_to_minusphi[j1]); ttp13 = _mm_setr_pd( (BIG_DOUBLE) two_to_phi[j0], (BIG_DOUBLE) two_to_phi[j1]); __builtin_prefetch(two_to_minusphi + j0 + 16, 0, 3); __builtin_prefetch(two_to_minusphi + j1 + 16, 0, 3); __builtin_prefetch(two_to_phi + j0 + 16, 0, 3); __builtin_prefetch(two_to_phi + j1 + 16, 0, 3); j0 = ((((pad4 >> (SHIFT_UPDATE - 1)) + i)) + (((((pad4 >> (SHIFT_UPDATE - 1)) + i)) & (~(UL)(128 - 1)) )>>(6 )) + (((((pad4 >> (SHIFT_UPDATE - 1)) + i)) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + (((((pad4 >> (SHIFT_UPDATE - 1)) + i)) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ (((((pad4 >> (SHIFT_UPDATE - 1)) + i)) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))); ttmp4x = _mm_setr_pd( (BIG_DOUBLE) two_to_minusphi[j0], (BIG_DOUBLE) 0.0); ttp4x = _mm_setr_pd( (BIG_DOUBLE) two_to_phi[j0], (BIG_DOUBLE) 0.0); __builtin_prefetch(two_to_minusphi + j0 + 16, 0, 3); __builtin_prefetch(two_to_phi + j0 + 16, 0, 3); };

      for (k = 0; k < UPDATE; k += 4, px += 2 * 4)
        {
          l = (j + k) >> 1;
          ll = ((j+k) + (((j+k) & (~(UL)(128 - 1)) )>>(6 )) + (((j+k) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + (((j+k) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ (((j+k) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4)));

          t0r = _mm_load_pd(pd0 + ll); t0i = _mm_load_pd((pd0 + ll + 2));; __builtin_prefetch(pd0 + ll + (16 + 0), 1, 3); t1r = _mm_load_pd(pd1 + ll); t1i = _mm_load_pd((pd1 + ll + 2));; __builtin_prefetch(pd1 + ll + (16 + 0), 1, 3); { Y__M128D __a,__b; __a = _mm_mul_pd( t1r, tw1i); t1r = _mm_mul_pd( t1r, tw1r); __b = _mm_mul_pd( t1i, tw1i); t1i = _mm_mul_pd( t1i, tw1r); t1r = _mm_sub_pd( t1r, __b); t1i = _mm_add_pd( t1i, __a); }; t2r = _mm_load_pd(pd2 + ll); t2i = _mm_load_pd((pd2 + ll + 2));; __builtin_prefetch(pd2 + ll + (16 + 0), 1, 3); { Y__M128D __a,__b; __a = _mm_mul_pd( t2r, tw2i); t2r = _mm_mul_pd( t2r, tw2r); __b = _mm_mul_pd( t2i, tw2i); t2i = _mm_mul_pd( t2i, tw2r); t2r = _mm_sub_pd( t2r, __b); t2i = _mm_add_pd( t2i, __a); };; t3r = _mm_load_pd(pd3 + ll); t3i = _mm_load_pd((pd3 + ll + 2));; __builtin_prefetch(pd3 + ll + (16 + 0), 1, 3); { Y__M128D __a,__b; __a = _mm_mul_pd( t3r, tw3i); t3r = _mm_mul_pd( t3r, tw3r); __b = _mm_mul_pd( t3i, tw3i); t3i = _mm_mul_pd( t3i, tw3r); t3r = _mm_sub_pd( t3r, __b); t3i = _mm_add_pd( t3i, __a); }; t4r = _mm_load_pd(pd4 + ll); t4i = _mm_load_pd((pd4 + ll + 2));; __builtin_prefetch(pd4 + ll + (16 + 0), 1, 3); { Y__M128D __a,__b; __a = _mm_mul_pd( t4r, tw4i); t4r = _mm_mul_pd( t4r, tw4r); __b = _mm_mul_pd( t4i, tw4i); t4i = _mm_mul_pd( t4i, tw4r); t4r = _mm_sub_pd( t4r, __b); t4i = _mm_add_pd( t4i, __a); };; t5r = _mm_setzero_pd( ); t5i = _mm_setzero_pd( ); { Y__M128D _ar, _ai, _br, _bi; _ar = _mm_add_pd( t1r, t4r); _ai = _mm_add_pd( t1i, t4i);; _br = _mm_add_pd( t2r, t3r); _bi = _mm_add_pd( t2i, t3i);; t4r = _mm_sub_pd( t1r, t4r); t4i = _mm_sub_pd( t1i, t4i);; t3r = _mm_sub_pd( t2r, t3r); t3i = _mm_sub_pd( t2i, t3i);; t2r = _mm_sub_pd( _ar, _br); t2i = _mm_sub_pd( _ai, _bi);; _ar = _mm_add_pd( _ar, _br); _ai = _mm_add_pd( _ai, _bi);; t2r = _mm_mul_pd( t2r, MM_FN2_5r); _br = _mm_sub_pd( t4r, t3r); _bi = _mm_sub_pd( t4i, t3i);; t2i = _mm_mul_pd( t2i, MM_FN2_5r); t0r = _mm_add_pd( t0r, _ar); t0i = _mm_add_pd( t0i, _ai);; _ar = _mm_mul_pd( _ar, MM_FNM125); _ai = _mm_mul_pd( _ai, MM_FNM125); _ar = _mm_add_pd( _ar, t0r); _ai = _mm_add_pd( _ai, t0i); _br = _mm_mul_pd( _br, MM_F_1_5i); _bi = _mm_mul_pd( _bi, MM_F_1_5i); t3r = _mm_mul_pd( t3r, MM_FN1_5i); t3i = _mm_mul_pd( t3i, MM_FN1_5i); t4r = _mm_mul_pd( t4r, MM_FN2_5i); t4i = _mm_mul_pd( t4i, MM_FN2_5i); t3r = _mm_add_pd( t3r, _br); t3i = _mm_add_pd( t3i, _bi); t1r = _mm_add_pd( _ar, t2r); t1i = _mm_add_pd( _ai, t2i); t2r = _mm_sub_pd( _ar, t2r); t2i = _mm_sub_pd( _ai, t2i); _ar = _mm_sub_pd( _br, t4r); _ai = _mm_sub_pd( _bi, t4i); t4r = _mm_sub_pd( t1r, t3i); t4i = _mm_add_pd( t1i, t3r); t1r = _mm_add_pd( t1r, t3i); t1i = _mm_sub_pd( t1i, t3r); t3r = _mm_sub_pd( t2r, _ai); t3i = _mm_add_pd( t2i, _ar); t2r = _mm_add_pd( t2r, _ai); t2i = _mm_sub_pd( t2i, _ar); };; { Y__M128D __aux0,__aux1; __aux0 = t0r; __aux1 = t2r; t0r = _mm_unpacklo_pd( __aux0, __aux1); t2r = _mm_unpackhi_pd( __aux0, __aux1); }; { Y__M128D __aux0,__aux1; __aux0 = t0i; __aux1 = t2i; t0i = _mm_unpacklo_pd( __aux0, __aux1); t2i = _mm_unpackhi_pd( __aux0, __aux1); }; { Y__M128D __aux0,__aux1; __aux0 = t1r; __aux1 = t3r; t1r = _mm_unpacklo_pd( __aux0, __aux1); t3r = _mm_unpackhi_pd( __aux0, __aux1); }; { Y__M128D __aux0,__aux1; __aux0 = t1i; __aux1 = t3i; t1i = _mm_unpacklo_pd( __aux0, __aux1); t3i = _mm_unpackhi_pd( __aux0, __aux1); }; { Y__M128D __aux0,__aux1; __aux0 = t4r; __aux1 = t5r; t4r = _mm_unpacklo_pd( __aux0, __aux1); t5r = _mm_unpackhi_pd( __aux0, __aux1); }; { Y__M128D __aux0,__aux1; __aux0 = t4i; __aux1 = t5i; t4i = 
_mm_unpacklo_pd( __aux0, __aux1); t5i = _mm_unpackhi_pd( __aux0, __aux1); };;;





          if(err_flag)
            {


              { Y__M128D maskj, maskk; int imaskj, imaskk; maskj = _mm_mul_pd( t0r, ttmp02); maskk = _mm_mul_pd( t1r, ttmp13); __asm__ volatile ("addpd %4, %0 \n" "        addpd %4, %1" : "=&x" (t0r) , "=&x" (t1r) : "0" (maskj), "1" (maskk), "X"(MM_bigA)); __asm__ volatile ("subpd %2, %0 \n" "        subpd %2, %1" : "+&x" (t0r) , "+&x" (t1r) : "X"(MM_bigA)); maskj = _mm_sub_pd( maskj, t0r); maskk = _mm_sub_pd( maskk, t1r); maskj = _mm_and_pd( maskj, MM_YABS); maskk = _mm_and_pd( maskk, MM_YABS); maxerr = _mm_max_pd( maxerr, maskj); t0r = _mm_add_pd( t0r, carry02); maskj = _mm_cmpgt_pd( bj02, MM_c); maxerr = _mm_max_pd( maxerr, maskk); t1r = _mm_add_pd( t1r, carry13); maskk = _mm_cmpgt_pd( bj13, MM_c); imaskj = _mm_movemask_pd( maskj ); imaskk = _mm_movemask_pd( maskk ); t0r = _mm_mul_pd( t0r, MM_inv[imaskj]); t1r = _mm_mul_pd( t1r, MM_inv[imaskk]); __asm__ volatile ("addpd %4, %0 \n" "        addpd %4, %1" : "=&x" (carry02) , "=&x" (carry13) : "0" (t0r), "1" (t1r), "X"(MM_bigA)); __asm__ volatile ("subpd %2, %0 \n" "        subpd %2, %1" : "+&x" (carry02) , "+&x" (carry13) : "X"(MM_bigA)); maskj = _mm_and_pd( maskj, ttp02); maskk = _mm_and_pd( maskk, ttp13); t0r = _mm_sub_pd( t0r, carry02); t1r = _mm_sub_pd( t1r, carry13); ttp02 = _mm_add_pd( ttp02, maskj); ttmp02 = _mm_mul_pd( ttmp02, MM_auxt[imaskj]); ttp13 = _mm_add_pd( ttp13, maskk); ttmp13 = _mm_mul_pd( ttmp13, MM_auxt[imaskk]); t0r = _mm_mul_pd( t0r, ttp02); bj02 = _mm_add_pd( bj02, MM_bc[imaskj]); t1r = _mm_mul_pd( t1r, ttp13); bj13 = _mm_add_pd( bj13, MM_bc[imaskk]); ttp02 = _mm_mul_pd( ttp02, MM_Hsmall); ttp13 = _mm_mul_pd( ttp13, MM_Hsmall); };
              { Y__M128D maskj, maskk; int imaskj, imaskk; maskj = _mm_mul_pd( t0i, ttmp02); maskk = _mm_mul_pd( t1i, ttmp13); __asm__ volatile ("addpd %4, %0 \n" "        addpd %4, %1" : "=&x" (t0i) , "=&x" (t1i) : "0" (maskj), "1" (maskk), "X"(MM_bigA)); __asm__ volatile ("subpd %2, %0 \n" "        subpd %2, %1" : "+&x" (t0i) , "+&x" (t1i) : "X"(MM_bigA)); maskj = _mm_sub_pd( maskj, t0i); maskk = _mm_sub_pd( maskk, t1i); maskj = _mm_and_pd( maskj, MM_YABS); maskk = _mm_and_pd( maskk, MM_YABS); maxerr = _mm_max_pd( maxerr, maskj); t0i = _mm_add_pd( t0i, carry02); maskj = _mm_cmpgt_pd( bj02, MM_c); maxerr = _mm_max_pd( maxerr, maskk); t1i = _mm_add_pd( t1i, carry13); maskk = _mm_cmpgt_pd( bj13, MM_c); imaskj = _mm_movemask_pd( maskj ); imaskk = _mm_movemask_pd( maskk ); t0i = _mm_mul_pd( t0i, MM_inv[imaskj]); t1i = _mm_mul_pd( t1i, MM_inv[imaskk]); __asm__ volatile ("addpd %4, %0 \n" "        addpd %4, %1" : "=&x" (carry02) , "=&x" (carry13) : "0" (t0i), "1" (t1i), "X"(MM_bigA)); __asm__ volatile ("subpd %2, %0 \n" "        subpd %2, %1" : "+&x" (carry02) , "+&x" (carry13) : "X"(MM_bigA)); maskj = _mm_and_pd( maskj, ttp02); maskk = _mm_and_pd( maskk, ttp13); t0i = _mm_sub_pd( t0i, carry02); t1i = _mm_sub_pd( t1i, carry13); ttp02 = _mm_add_pd( ttp02, maskj); ttmp02 = _mm_mul_pd( ttmp02, MM_auxt[imaskj]); ttp13 = _mm_add_pd( ttp13, maskk); ttmp13 = _mm_mul_pd( ttmp13, MM_auxt[imaskk]); t0i = _mm_mul_pd( t0i, ttp02); bj02 = _mm_add_pd( bj02, MM_bc[imaskj]); t1i = _mm_mul_pd( t1i, ttp13); bj13 = _mm_add_pd( bj13, MM_bc[imaskk]); ttp02 = _mm_mul_pd( ttp02, MM_Hsmall); ttp13 = _mm_mul_pd( ttp13, MM_Hsmall); };
              { Y__M128D maskj, maskk; int imaskj, imaskk; maskj = _mm_mul_pd( t2r, ttmp02); maskk = _mm_mul_pd( t3r, ttmp13); __asm__ volatile ("addpd %4, %0 \n" "        addpd %4, %1" : "=&x" (t2r) , "=&x" (t3r) : "0" (maskj), "1" (maskk), "X"(MM_bigA)); __asm__ volatile ("subpd %2, %0 \n" "        subpd %2, %1" : "+&x" (t2r) , "+&x" (t3r) : "X"(MM_bigA)); maskj = _mm_sub_pd( maskj, t2r); maskk = _mm_sub_pd( maskk, t3r); maskj = _mm_and_pd( maskj, MM_YABS); maskk = _mm_and_pd( maskk, MM_YABS); maxerr = _mm_max_pd( maxerr, maskj); t2r = _mm_add_pd( t2r, carry02); maskj = _mm_cmpgt_pd( bj02, MM_c); maxerr = _mm_max_pd( maxerr, maskk); t3r = _mm_add_pd( t3r, carry13); maskk = _mm_cmpgt_pd( bj13, MM_c); imaskj = _mm_movemask_pd( maskj ); imaskk = _mm_movemask_pd( maskk ); t2r = _mm_mul_pd( t2r, MM_inv[imaskj]); t3r = _mm_mul_pd( t3r, MM_inv[imaskk]); __asm__ volatile ("addpd %4, %0 \n" "        addpd %4, %1" : "=&x" (carry02) , "=&x" (carry13) : "0" (t2r), "1" (t3r), "X"(MM_bigA)); __asm__ volatile ("subpd %2, %0 \n" "        subpd %2, %1" : "+&x" (carry02) , "+&x" (carry13) : "X"(MM_bigA)); maskj = _mm_and_pd( maskj, ttp02); maskk = _mm_and_pd( maskk, ttp13); t2r = _mm_sub_pd( t2r, carry02); t3r = _mm_sub_pd( t3r, carry13); ttp02 = _mm_add_pd( ttp02, maskj); ttmp02 = _mm_mul_pd( ttmp02, MM_auxt[imaskj]); ttp13 = _mm_add_pd( ttp13, maskk); ttmp13 = _mm_mul_pd( ttmp13, MM_auxt[imaskk]); t2r = _mm_mul_pd( t2r, ttp02); bj02 = _mm_add_pd( bj02, MM_bc[imaskj]); t3r = _mm_mul_pd( t3r, ttp13); bj13 = _mm_add_pd( bj13, MM_bc[imaskk]); ttp02 = _mm_mul_pd( ttp02, MM_Hsmall); ttp13 = _mm_mul_pd( ttp13, MM_Hsmall); };
              { Y__M128D maskj, maskk; int imaskj, imaskk; maskj = _mm_mul_pd( t2i, ttmp02); maskk = _mm_mul_pd( t3i, ttmp13); __asm__ volatile ("addpd %4, %0 \n" "        addpd %4, %1" : "=&x" (t2i) , "=&x" (t3i) : "0" (maskj), "1" (maskk), "X"(MM_bigA)); __asm__ volatile ("subpd %2, %0 \n" "        subpd %2, %1" : "+&x" (t2i) , "+&x" (t3i) : "X"(MM_bigA)); maskj = _mm_sub_pd( maskj, t2i); maskk = _mm_sub_pd( maskk, t3i); maskj = _mm_and_pd( maskj, MM_YABS); maskk = _mm_and_pd( maskk, MM_YABS); maxerr = _mm_max_pd( maxerr, maskj); t2i = _mm_add_pd( t2i, carry02); maskj = _mm_cmpgt_pd( bj02, MM_c); maxerr = _mm_max_pd( maxerr, maskk); t3i = _mm_add_pd( t3i, carry13); maskk = _mm_cmpgt_pd( bj13, MM_c); imaskj = _mm_movemask_pd( maskj ); imaskk = _mm_movemask_pd( maskk ); t2i = _mm_mul_pd( t2i, MM_inv[imaskj]); t3i = _mm_mul_pd( t3i, MM_inv[imaskk]); __asm__ volatile ("addpd %4, %0 \n" "        addpd %4, %1" : "=&x" (carry02) , "=&x" (carry13) : "0" (t2i), "1" (t3i), "X"(MM_bigA)); __asm__ volatile ("subpd %2, %0 \n" "        subpd %2, %1" : "+&x" (carry02) , "+&x" (carry13) : "X"(MM_bigA)); maskj = _mm_and_pd( maskj, ttp02); maskk = _mm_and_pd( maskk, ttp13); t2i = _mm_sub_pd( t2i, carry02); t3i = _mm_sub_pd( t3i, carry13); ttp02 = _mm_add_pd( ttp02, maskj); ttmp02 = _mm_mul_pd( ttmp02, MM_auxt[imaskj]); ttp13 = _mm_add_pd( ttp13, maskk); ttmp13 = _mm_mul_pd( ttmp13, MM_auxt[imaskk]); t2i = _mm_mul_pd( t2i, ttp02); bj02 = _mm_add_pd( bj02, MM_bc[imaskj]); t3i = _mm_mul_pd( t3i, ttp13); bj13 = _mm_add_pd( bj13, MM_bc[imaskk]); ttp02 = _mm_mul_pd( ttp02, MM_Hsmall); ttp13 = _mm_mul_pd( ttp13, MM_Hsmall); };
# 618 "ynormm_5.c"
              { Y__M128D maskj; int imaskj; maskj = _mm_mul_sd( t4r, ttmp4x); __asm__ volatile ("addsd %2, %0 \n" "        subsd %2, %0" : "=&x" (t4r) : "0" (maskj), "X"(MM_bigA)); maskj = _mm_sub_sd( maskj, t4r); maskj = _mm_and_pd( maskj, MM_YABS); maxerr = _mm_max_sd( maxerr, maskj); t4r = _mm_add_sd( t4r, carry4x); maskj = _mm_cmpgt_sd( bj4x, MM_c); imaskj = _mm_movemask_pd( maskj ); t4r = _mm_mul_sd( t4r, MM_inv[imaskj]); maskj = _mm_and_pd( maskj, ttp4x); __asm__ volatile ("addsd %2, %0 \n" "        subsd %2, %0" : "=&x" (carry4x) : "0" (t4r), "X"(MM_bigA)); ttp4x = _mm_add_sd( ttp4x, maskj); ttmp4x = _mm_mul_sd( ttmp4x, MM_auxt[imaskj]); t4r = _mm_sub_sd( t4r, carry4x); t4r = _mm_mul_sd( t4r, ttp4x); bj4x = _mm_add_sd( bj4x, MM_bc[imaskj]); ttp4x = _mm_mul_sd( ttp4x, MM_Hsmall); };
              { Y__M128D maskj; int imaskj; maskj = _mm_mul_sd( t4i, ttmp4x); __asm__ volatile ("addsd %2, %0 \n" "        subsd %2, %0" : "=&x" (t4i) : "0" (maskj), "X"(MM_bigA)); maskj = _mm_sub_sd( maskj, t4i); maskj = _mm_and_pd( maskj, MM_YABS); maxerr = _mm_max_sd( maxerr, maskj); t4i = _mm_add_sd( t4i, carry4x); maskj = _mm_cmpgt_sd( bj4x, MM_c); imaskj = _mm_movemask_pd( maskj ); t4i = _mm_mul_sd( t4i, MM_inv[imaskj]); maskj = _mm_and_pd( maskj, ttp4x); __asm__ volatile ("addsd %2, %0 \n" "        subsd %2, %0" : "=&x" (carry4x) : "0" (t4i), "X"(MM_bigA)); ttp4x = _mm_add_sd( ttp4x, maskj); ttmp4x = _mm_mul_sd( ttmp4x, MM_auxt[imaskj]); t4i = _mm_sub_sd( t4i, carry4x); t4i = _mm_mul_sd( t4i, ttp4x); bj4x = _mm_add_sd( bj4x, MM_bc[imaskj]); ttp4x = _mm_mul_sd( ttp4x, MM_Hsmall); };
              { Y__M128D maskj; int imaskj; maskj = _mm_mul_sd( t5r, ttmp4x); __asm__ volatile ("addsd %2, %0 \n" "        subsd %2, %0" : "=&x" (t5r) : "0" (maskj), "X"(MM_bigA)); maskj = _mm_sub_sd( maskj, t5r); maskj = _mm_and_pd( maskj, MM_YABS); maxerr = _mm_max_sd( maxerr, maskj); t5r = _mm_add_sd( t5r, carry4x); maskj = _mm_cmpgt_sd( bj4x, MM_c); imaskj = _mm_movemask_pd( maskj ); t5r = _mm_mul_sd( t5r, MM_inv[imaskj]); maskj = _mm_and_pd( maskj, ttp4x); __asm__ volatile ("addsd %2, %0 \n" "        subsd %2, %0" : "=&x" (carry4x) : "0" (t5r), "X"(MM_bigA)); ttp4x = _mm_add_sd( ttp4x, maskj); ttmp4x = _mm_mul_sd( ttmp4x, MM_auxt[imaskj]); t5r = _mm_sub_sd( t5r, carry4x); t5r = _mm_mul_sd( t5r, ttp4x); bj4x = _mm_add_sd( bj4x, MM_bc[imaskj]); ttp4x = _mm_mul_sd( ttp4x, MM_Hsmall); };
              { Y__M128D maskj; int imaskj; maskj = _mm_mul_sd( t5i, ttmp4x); __asm__ volatile ("addsd %2, %0 \n" "        subsd %2, %0" : "=&x" (t5i) : "0" (maskj), "X"(MM_bigA)); maskj = _mm_sub_sd( maskj, t5i); maskj = _mm_and_pd( maskj, MM_YABS); maxerr = _mm_max_sd( maxerr, maskj); t5i = _mm_add_sd( t5i, carry4x); if( l == (pad - 2) ) bj4x = _mm_and_pd( bj4x, MM_Y0LO); maskj = _mm_cmpgt_sd( bj4x, MM_c); imaskj = _mm_movemask_pd( maskj ); t5i = _mm_mul_sd( t5i, MM_inv[imaskj]); maskj = _mm_and_pd( maskj, ttp4x); __asm__ volatile ("addsd %2, %0 \n" "        subsd %2, %0" : "=&x" (carry4x) : "0" (t5i), "X"(MM_bigA)); ttp4x = _mm_add_sd( ttp4x, maskj); ttmp4x = _mm_mul_sd( ttmp4x, MM_auxt[imaskj]); t5i = _mm_sub_sd( t5i, carry4x); t5i = _mm_mul_sd( t5i, ttp4x); bj4x = _mm_add_sd( bj4x, MM_bc[imaskj]); ttp4x = _mm_mul_sd( ttp4x, MM_Hsmall); };
            }
          else
            {


              { Y__M128D maskj, maskk; int imaskj, imaskk; maskj = _mm_cmpgt_pd( bj02, MM_c); maskk = _mm_cmpgt_pd( bj13, MM_c); t0r = _mm_mul_pd( t0r, ttmp02); t1r = _mm_mul_pd( t1r, ttmp13); imaskj = _mm_movemask_pd( maskj ); imaskk = _mm_movemask_pd( maskk ); t0r = _mm_add_pd( t0r, carry02); t1r = _mm_add_pd( t1r, carry13); __asm__ volatile ("addpd %2, %0 \n" "        addpd %2, %1 \n" "        subpd %2, %0 \n" "        subpd %2, %1" : "+&x" (t0r) , "+&x" (t1r) : "X"(MM_bigA)); t0r = _mm_mul_pd( t0r, MM_inv[imaskj]); t1r = _mm_mul_pd( t1r, MM_inv[imaskk]); maskj = _mm_and_pd( maskj, ttp02); maskk = _mm_and_pd( maskk, ttp13); __asm__ volatile ("addpd %4, %0 \n" "        addpd %4, %1" : "=&x" (carry02) , "=&x" (carry13) : "0" (t0r), "1" (t1r), "X"(MM_bigA)); __asm__ volatile ("subpd %2, %0 \n" "        subpd %2, %1" : "+&x" (carry02) , "+&x" (carry13) : "X"(MM_bigA)); t0r = _mm_sub_pd( t0r, carry02); t1r = _mm_sub_pd( t1r, carry13); ttp02 = _mm_add_pd( ttp02, maskj); ttp13 = _mm_add_pd( ttp13, maskk); t0r = _mm_mul_pd( t0r, ttp02); t1r = _mm_mul_pd( t1r, ttp13); ttmp02 = _mm_mul_pd( ttmp02, MM_auxt[imaskj]); ttmp13 = _mm_mul_pd( ttmp13, MM_auxt[imaskk]); bj02 = _mm_add_pd( bj02, MM_bc[imaskj]); ttp02 = _mm_mul_pd( ttp02, MM_Hsmall); bj13 = _mm_add_pd( bj13, MM_bc[imaskk]); ttp13 = _mm_mul_pd( ttp13, MM_Hsmall); };
              { Y__M128D maskj, maskk; int imaskj, imaskk; maskj = _mm_cmpgt_pd( bj02, MM_c); maskk = _mm_cmpgt_pd( bj13, MM_c); t0i = _mm_mul_pd( t0i, ttmp02); t1i = _mm_mul_pd( t1i, ttmp13); imaskj = _mm_movemask_pd( maskj ); imaskk = _mm_movemask_pd( maskk ); t0i = _mm_add_pd( t0i, carry02); t1i = _mm_add_pd( t1i, carry13); __asm__ volatile ("addpd %2, %0 \n" "        addpd %2, %1 \n" "        subpd %2, %0 \n" "        subpd %2, %1" : "+&x" (t0i) , "+&x" (t1i) : "X"(MM_bigA)); t0i = _mm_mul_pd( t0i, MM_inv[imaskj]); t1i = _mm_mul_pd( t1i, MM_inv[imaskk]); maskj = _mm_and_pd( maskj, ttp02); maskk = _mm_and_pd( maskk, ttp13); __asm__ volatile ("addpd %4, %0 \n" "        addpd %4, %1" : "=&x" (carry02) , "=&x" (carry13) : "0" (t0i), "1" (t1i), "X"(MM_bigA)); __asm__ volatile ("subpd %2, %0 \n" "        subpd %2, %1" : "+&x" (carry02) , "+&x" (carry13) : "X"(MM_bigA)); t0i = _mm_sub_pd( t0i, carry02); t1i = _mm_sub_pd( t1i, carry13); ttp02 = _mm_add_pd( ttp02, maskj); ttp13 = _mm_add_pd( ttp13, maskk); t0i = _mm_mul_pd( t0i, ttp02); t1i = _mm_mul_pd( t1i, ttp13); ttmp02 = _mm_mul_pd( ttmp02, MM_auxt[imaskj]); ttmp13 = _mm_mul_pd( ttmp13, MM_auxt[imaskk]); bj02 = _mm_add_pd( bj02, MM_bc[imaskj]); ttp02 = _mm_mul_pd( ttp02, MM_Hsmall); bj13 = _mm_add_pd( bj13, MM_bc[imaskk]); ttp13 = _mm_mul_pd( ttp13, MM_Hsmall); };
              { Y__M128D maskj, maskk; int imaskj, imaskk; maskj = _mm_cmpgt_pd( bj02, MM_c); maskk = _mm_cmpgt_pd( bj13, MM_c); t2r = _mm_mul_pd( t2r, ttmp02); t3r = _mm_mul_pd( t3r, ttmp13); imaskj = _mm_movemask_pd( maskj ); imaskk = _mm_movemask_pd( maskk ); t2r = _mm_add_pd( t2r, carry02); t3r = _mm_add_pd( t3r, carry13); __asm__ volatile ("addpd %2, %0 \n" "        addpd %2, %1 \n" "        subpd %2, %0 \n" "        subpd %2, %1" : "+&x" (t2r) , "+&x" (t3r) : "X"(MM_bigA)); t2r = _mm_mul_pd( t2r, MM_inv[imaskj]); t3r = _mm_mul_pd( t3r, MM_inv[imaskk]); maskj = _mm_and_pd( maskj, ttp02); maskk = _mm_and_pd( maskk, ttp13); __asm__ volatile ("addpd %4, %0 \n" "        addpd %4, %1" : "=&x" (carry02) , "=&x" (carry13) : "0" (t2r), "1" (t3r), "X"(MM_bigA)); __asm__ volatile ("subpd %2, %0 \n" "        subpd %2, %1" : "+&x" (carry02) , "+&x" (carry13) : "X"(MM_bigA)); t2r = _mm_sub_pd( t2r, carry02); t3r = _mm_sub_pd( t3r, carry13); ttp02 = _mm_add_pd( ttp02, maskj); ttp13 = _mm_add_pd( ttp13, maskk); t2r = _mm_mul_pd( t2r, ttp02); t3r = _mm_mul_pd( t3r, ttp13); ttmp02 = _mm_mul_pd( ttmp02, MM_auxt[imaskj]); ttmp13 = _mm_mul_pd( ttmp13, MM_auxt[imaskk]); bj02 = _mm_add_pd( bj02, MM_bc[imaskj]); ttp02 = _mm_mul_pd( ttp02, MM_Hsmall); bj13 = _mm_add_pd( bj13, MM_bc[imaskk]); ttp13 = _mm_mul_pd( ttp13, MM_Hsmall); };
              { Y__M128D maskj, maskk; int imaskj, imaskk; maskj = _mm_cmpgt_pd( bj02, MM_c); maskk = _mm_cmpgt_pd( bj13, MM_c); t2i = _mm_mul_pd( t2i, ttmp02); t3i = _mm_mul_pd( t3i, ttmp13); imaskj = _mm_movemask_pd( maskj ); imaskk = _mm_movemask_pd( maskk ); t2i = _mm_add_pd( t2i, carry02); t3i = _mm_add_pd( t3i, carry13); __asm__ volatile ("addpd %2, %0 \n" "        addpd %2, %1 \n" "        subpd %2, %0 \n" "        subpd %2, %1" : "+&x" (t2i) , "+&x" (t3i) : "X"(MM_bigA)); t2i = _mm_mul_pd( t2i, MM_inv[imaskj]); t3i = _mm_mul_pd( t3i, MM_inv[imaskk]); maskj = _mm_and_pd( maskj, ttp02); maskk = _mm_and_pd( maskk, ttp13); __asm__ volatile ("addpd %4, %0 \n" "        addpd %4, %1" : "=&x" (carry02) , "=&x" (carry13) : "0" (t2i), "1" (t3i), "X"(MM_bigA)); __asm__ volatile ("subpd %2, %0 \n" "        subpd %2, %1" : "+&x" (carry02) , "+&x" (carry13) : "X"(MM_bigA)); t2i = _mm_sub_pd( t2i, carry02); t3i = _mm_sub_pd( t3i, carry13); ttp02 = _mm_add_pd( ttp02, maskj); ttp13 = _mm_add_pd( ttp13, maskk); t2i = _mm_mul_pd( t2i, ttp02); t3i = _mm_mul_pd( t3i, ttp13); ttmp02 = _mm_mul_pd( ttmp02, MM_auxt[imaskj]); ttmp13 = _mm_mul_pd( ttmp13, MM_auxt[imaskk]); bj02 = _mm_add_pd( bj02, MM_bc[imaskj]); ttp02 = _mm_mul_pd( ttp02, MM_Hsmall); bj13 = _mm_add_pd( bj13, MM_bc[imaskk]); ttp13 = _mm_mul_pd( ttp13, MM_Hsmall); };
# 643 "ynormm_5.c"
              { Y__M128D maskj; int imaskj; t4r = _mm_mul_sd( t4r, ttmp4x); __asm__ volatile ("addsd %2, %0 \n" "        subsd %2, %0" : "=&x" (t4r) : "0" (t4r), "X"(MM_bigA)); t4r = _mm_add_sd( t4r, carry4x); maskj = _mm_cmpgt_sd( bj4x, MM_c); imaskj = _mm_movemask_pd( maskj ); t4r = _mm_mul_sd( t4r, MM_inv[imaskj]); maskj = _mm_and_pd( maskj, ttp4x); __asm__ volatile ("addsd %2, %0 \n" "        subsd %2, %0" : "=&x" (carry4x) : "0" (t4r), "X"(MM_bigA)); ttp4x = _mm_add_sd( ttp4x, maskj); ttmp4x = _mm_mul_sd( ttmp4x, MM_auxt[imaskj]); t4r = _mm_sub_sd( t4r, carry4x); t4r = _mm_mul_sd( t4r, ttp4x); bj4x = _mm_add_sd( bj4x, MM_bc[imaskj]); ttp4x = _mm_mul_sd( ttp4x, MM_Hsmall); };
              { Y__M128D maskj; int imaskj; t4i = _mm_mul_sd( t4i, ttmp4x); __asm__ volatile ("addsd %2, %0 \n" "        subsd %2, %0" : "=&x" (t4i) : "0" (t4i), "X"(MM_bigA)); t4i = _mm_add_sd( t4i, carry4x); maskj = _mm_cmpgt_sd( bj4x, MM_c); imaskj = _mm_movemask_pd( maskj ); t4i = _mm_mul_sd( t4i, MM_inv[imaskj]); maskj = _mm_and_pd( maskj, ttp4x); __asm__ volatile ("addsd %2, %0 \n" "        subsd %2, %0" : "=&x" (carry4x) : "0" (t4i), "X"(MM_bigA)); ttp4x = _mm_add_sd( ttp4x, maskj); ttmp4x = _mm_mul_sd( ttmp4x, MM_auxt[imaskj]); t4i = _mm_sub_sd( t4i, carry4x); t4i = _mm_mul_sd( t4i, ttp4x); bj4x = _mm_add_sd( bj4x, MM_bc[imaskj]); ttp4x = _mm_mul_sd( ttp4x, MM_Hsmall); };
              { Y__M128D maskj; int imaskj; t5r = _mm_mul_sd( t5r, ttmp4x); __asm__ volatile ("addsd %2, %0 \n" "        subsd %2, %0" : "=&x" (t5r) : "0" (t5r), "X"(MM_bigA)); t5r = _mm_add_sd( t5r, carry4x); maskj = _mm_cmpgt_sd( bj4x, MM_c); imaskj = _mm_movemask_pd( maskj ); t5r = _mm_mul_sd( t5r, MM_inv[imaskj]); maskj = _mm_and_pd( maskj, ttp4x); __asm__ volatile ("addsd %2, %0 \n" "        subsd %2, %0" : "=&x" (carry4x) : "0" (t5r), "X"(MM_bigA)); ttp4x = _mm_add_sd( ttp4x, maskj); ttmp4x = _mm_mul_sd( ttmp4x, MM_auxt[imaskj]); t5r = _mm_sub_sd( t5r, carry4x); t5r = _mm_mul_sd( t5r, ttp4x); bj4x = _mm_add_sd( bj4x, MM_bc[imaskj]); ttp4x = _mm_mul_sd( ttp4x, MM_Hsmall); };
              { Y__M128D maskj; int imaskj; t5i = _mm_mul_sd( t5i, ttmp4x); __asm__ volatile ("addsd %2, %0 \n" "        subsd %2, %0" : "=&x" (t5i) : "0" (t5i), "X"(MM_bigA)); t5i = _mm_add_sd( t5i, carry4x); maskj = _mm_cmpgt_pd( bj4x, MM_c); if( l == (pad - 2) ) maskj = _mm_and_pd( maskj, MM_Y0LO); imaskj = _mm_movemask_pd( maskj ); t5i = _mm_mul_sd( t5i, MM_inv[imaskj]); maskj = _mm_and_pd( maskj, ttp4x); __asm__ volatile ("addsd %2, %0 \n" "        subsd %2, %0" : "=&x" (carry4x) : "0" (t5i), "X"(MM_bigA)); ttp4x = _mm_add_sd( ttp4x, maskj); ttmp4x = _mm_mul_sd( ttmp4x, MM_auxt[imaskj]); t5i = _mm_sub_sd( t5i, carry4x); t5i = _mm_mul_sd( t5i, ttp4x); bj4x = _mm_add_sd( bj4x, MM_bc[imaskj]); ttp4x = _mm_mul_sd( ttp4x, MM_Hsmall); };
            }
          { Y__M128D __aux0,__aux1; __aux0 = t0r; __aux1 = t2r; t0r = _mm_unpacklo_pd( __aux0, __aux1); t2r = _mm_unpackhi_pd( __aux0, __aux1); }; { Y__M128D __aux0,__aux1; __aux0 = t0i; __aux1 = t2i; t0i = _mm_unpacklo_pd( __aux0, __aux1); t2i = _mm_unpackhi_pd( __aux0, __aux1); }; { Y__M128D __aux0,__aux1; __aux0 = t1r; __aux1 = t3r; t1r = _mm_unpacklo_pd( __aux0, __aux1); t3r = _mm_unpackhi_pd( __aux0, __aux1); }; { Y__M128D __aux0,__aux1; __aux0 = t1i; __aux1 = t3i; t1i = _mm_unpacklo_pd( __aux0, __aux1); t3i = _mm_unpackhi_pd( __aux0, __aux1); }; { Y__M128D __aux0,__aux1; __aux0 = t4r; __aux1 = t5r; t4r = _mm_unpacklo_pd( __aux0, __aux1); t5r = _mm_unpackhi_pd( __aux0, __aux1); }; { Y__M128D __aux0,__aux1; __aux0 = t4i; __aux1 = t5i; t4i = _mm_unpacklo_pd( __aux0, __aux1); t5i = _mm_unpackhi_pd( __aux0, __aux1); };; { Y__M128D _ar, _ai, _br, _bi; _ar = _mm_add_pd( t1r, t4r); _ai = _mm_add_pd( t1i, t4i);; _br = _mm_add_pd( t2r, t3r); _bi = _mm_add_pd( t2i, t3i);; t4r = _mm_sub_pd( t1r, t4r); t4i = _mm_sub_pd( t1i, t4i);; t3r = _mm_sub_pd( t2r, t3r); t3i = _mm_sub_pd( t2i, t3i);; t2r = _mm_sub_pd( _ar, _br); t2i = _mm_sub_pd( _ai, _bi);; _ar = _mm_add_pd( _ar, _br); _ai = _mm_add_pd( _ai, _bi);; t2r = _mm_mul_pd( t2r, MM_FN2_5r); _br = _mm_sub_pd( t4r, t3r); _bi = _mm_sub_pd( t4i, t3i);; t2i = _mm_mul_pd( t2i, MM_FN2_5r); t0r = _mm_add_pd( t0r, _ar); t0i = _mm_add_pd( t0i, _ai);; _ar = _mm_mul_pd( _ar, MM_FNM125); _ai = _mm_mul_pd( _ai, MM_FNM125); _mm_store_pd( pd0 + ll, t0r); _mm_store_pd( (pd0 + ll + 2), t0i);; _br = _mm_mul_pd( _br, MM_F_1_5i); _bi = _mm_mul_pd( _bi, MM_F_1_5i); _ar = _mm_add_pd( _ar, t0r); _ai = _mm_add_pd( _ai, t0i); t3r = _mm_mul_pd( t3r, MM_FN1_5i); t3i = _mm_mul_pd( t3i, MM_FN1_5i); t4r = _mm_mul_pd( t4r, MM_FN2_5i); t4i = _mm_mul_pd( t4i, MM_FN2_5i); t1r = _mm_add_pd( _ar, t2r); t1i = _mm_add_pd( _ai, t2i); t2r = _mm_sub_pd( _ar, t2r); t2i = _mm_sub_pd( _ai, t2i); t3r = _mm_add_pd( t3r, _br); t3i = _mm_add_pd( t3i, _bi); t4r = _mm_sub_pd( _br, t4r); t4i = _mm_sub_pd( _bi, t4i); _ar = t4r; _ai = t4i; t4r = _mm_add_pd( t1r, t3i); t4i = _mm_sub_pd( t1i, t3r); t1r = _mm_sub_pd( t1r, t3i); t1i = _mm_add_pd( t1i, t3r); _mm_store_pd( pd4 + ll, t4r); _mm_store_pd( (pd4 + ll + 2), t4i);; _mm_store_pd( pd1 + ll, t1r); _mm_store_pd( (pd1 + ll + 2), t1i);; t3r = _mm_add_pd( t2r, _ai); t3i = _mm_sub_pd( t2i, _ar); t2r = _mm_sub_pd( t2r, _ai); t2i = _mm_add_pd( t2i, _ar); _mm_store_pd( pd3 + ll, t3r); _mm_store_pd( (pd3 + ll + 2), t3i);; _mm_store_pd( pd2 + ll, t2r); _mm_store_pd( (pd2 + ll + 2), t2i);;};;;






          { Y__M128D __aux0,__aux1; __aux0 = _mm_load_pd(px); __aux1 = _mm_load_pd(px + 4); tw1r = _mm_unpacklo_pd( __aux0, __aux1); tw1i = _mm_unpackhi_pd( __aux0, __aux1); }; { Y__M128D __aux0,__aux1; __aux0 = _mm_load_pd(px + 2); __aux1 = _mm_load_pd(px + 4 + 2); tw3r = _mm_unpacklo_pd( __aux0, __aux1); tw3i = _mm_unpackhi_pd( __aux0, __aux1); }; { Y__M128D __x0,__x1; tw2r = tw3r; __x0 = tw3i; tw2i = tw3i; __x1 = tw3r; tw2r = _mm_mul_pd( tw2r, tw1r); __x0 = _mm_mul_pd( __x0, tw1i); tw2i = _mm_mul_pd( tw2i, tw1r); tw4r = tw2r; tw4i = tw2i; __x1 = _mm_mul_pd( __x1, tw1i); tw4r = _mm_sub_pd( tw4r, __x0); tw2r = _mm_add_pd( tw2r, __x0); tw4i = _mm_add_pd( tw4i, __x1); tw2i = _mm_sub_pd( tw2i, __x1); }; __builtin_prefetch(px + 2 * 4, 0, 3); __builtin_prefetch(px + 3 * 4, 0, 3);;
        }
    }


  {
    double __attribute__ ((aligned(16))) aux[4], aux0;
    _mm_store_pd( aux, carry13);
    _mm_store_pd( aux + 2, carry4x);
    aux0 = aux[1];
    aux[1] = 0.0;
    aux[3] = aux[0];
    aux[0] = aux0;
    t4r = _mm_load_pd(aux);
    t0r = _mm_load_pd(aux + 2);
  }
  t1r = carry02;
  carry13 = _mm_setzero_pd( );
  carry02 = _mm_setzero_pd( );
  carry4x = _mm_setzero_pd( );

  t0i = _mm_setzero_pd( );
  t2r = _mm_setzero_pd( );
  t2i = _mm_setzero_pd( );
  t1i = _mm_setzero_pd( );
  t3r = _mm_setzero_pd( );
  t3i = _mm_setzero_pd( );
  t4i = _mm_setzero_pd( );
  t5r = _mm_setzero_pd( );
  t5i = _mm_setzero_pd( );

  bj0 = N; bj1 = ( b % 5) << (Y_K + 1); bj2 = (( 2 * b) % 5) << (Y_K + 1); bj3 = (( 3 * b) % 5) << (Y_K + 1); bj4 = (( 4 * b) % 5) << (Y_K + 1); bj02 = _mm_setr_pd( (BIG_DOUBLE) bj0, (BIG_DOUBLE) bj2); bj13 = _mm_setr_pd( (BIG_DOUBLE) bj1, (BIG_DOUBLE) bj3); bj4x = _mm_setr_pd( (BIG_DOUBLE) bj4, (BIG_DOUBLE) 0);;

  j = 0;
  { size_t j0, j1; j0 = ((j) + (((j) & (~(UL)(128 - 1)) )>>(6 )) + (((j) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + (((j) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ (((j) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))); j1 = ((((pad2 >> (SHIFT_UPDATE - 1)) + j)) + (((((pad2 >> (SHIFT_UPDATE - 1)) + j)) & (~(UL)(128 - 1)) )>>(6 )) + (((((pad2 >> (SHIFT_UPDATE - 1)) + j)) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + (((((pad2 >> (SHIFT_UPDATE - 1)) + j)) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ (((((pad2 >> (SHIFT_UPDATE - 1)) + j)) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))); ttmp02 = _mm_setr_pd( (BIG_DOUBLE) two_to_minusphi[j0], (BIG_DOUBLE) two_to_minusphi[j1]); ttp02 = _mm_setr_pd( (BIG_DOUBLE) two_to_phi[j0], (BIG_DOUBLE) two_to_phi[j1]); __builtin_prefetch(two_to_minusphi + j0 + 16, 0, 3); __builtin_prefetch(two_to_minusphi + j1 + 16, 0, 3); __builtin_prefetch(two_to_phi + j0 + 16, 0, 3); __builtin_prefetch(two_to_phi + j1 + 16, 0, 3); j0 = ((((pad2 >> SHIFT_UPDATE) + j)) + (((((pad2 >> SHIFT_UPDATE) + j)) & (~(UL)(128 - 1)) )>>(6 )) + (((((pad2 >> SHIFT_UPDATE) + j)) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + (((((pad2 >> SHIFT_UPDATE) + j)) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ (((((pad2 >> SHIFT_UPDATE) + j)) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))); j1 = ((((pad3 >> (SHIFT_UPDATE - 1)) + j)) + (((((pad3 >> (SHIFT_UPDATE - 1)) + j)) & (~(UL)(128 - 1)) )>>(6 )) + (((((pad3 >> (SHIFT_UPDATE - 1)) + j)) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + (((((pad3 >> (SHIFT_UPDATE - 1)) + j)) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ (((((pad3 >> (SHIFT_UPDATE - 1)) + j)) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))); ttmp13 = _mm_setr_pd( (BIG_DOUBLE) two_to_minusphi[j0], (BIG_DOUBLE) two_to_minusphi[j1]); ttp13 = _mm_setr_pd( (BIG_DOUBLE) two_to_phi[j0], (BIG_DOUBLE) two_to_phi[j1]); __builtin_prefetch(two_to_minusphi + j0 + 16, 0, 3); __builtin_prefetch(two_to_minusphi + j1 + 16, 0, 3); __builtin_prefetch(two_to_phi + j0 + 16, 0, 3); __builtin_prefetch(two_to_phi + j1 + 16, 0, 3); j0 = ((((pad4 >> (SHIFT_UPDATE - 1)) + j)) + (((((pad4 >> (SHIFT_UPDATE - 1)) + j)) & (~(UL)(128 - 1)) )>>(6 )) + (((((pad4 >> (SHIFT_UPDATE - 1)) + j)) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + (((((pad4 >> (SHIFT_UPDATE - 1)) + j)) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ (((((pad4 >> (SHIFT_UPDATE - 1)) + j)) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))); ttmp4x = _mm_setr_pd( (BIG_DOUBLE) two_to_minusphi[j0], (BIG_DOUBLE) 0.0); ttp4x = _mm_setr_pd( (BIG_DOUBLE) two_to_phi[j0], (BIG_DOUBLE) 0.0); __builtin_prefetch(two_to_minusphi + j0 + 16, 0, 3); __builtin_prefetch(two_to_phi + j0 + 16, 0, 3); };

  { Y__M128D maskj; int imaskj; __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (t0r) : "0" (t0r), "X"(MM_bigA)); t0r = _mm_add_pd( t0r, carry02); maskj = _mm_cmpgt_pd( bj02, MM_c); imaskj = _mm_movemask_pd( maskj ); t0r = _mm_mul_pd( t0r, MM_inv[imaskj]); maskj = _mm_and_pd( maskj, ttp02); __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (carry02) : "0" (t0r), "X"(MM_bigA)); ttp02 = _mm_add_pd( ttp02, maskj); t0r = _mm_sub_pd( t0r, carry02); t0r = _mm_mul_pd( t0r, ttp02); bj02 = _mm_add_pd( bj02, MM_bc[imaskj]); ttp02 = _mm_mul_pd( ttp02, MM_Hsmall); };
  { Y__M128D maskj; int imaskj; __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (t0i) : "0" (t0i), "X"(MM_bigA)); t0i = _mm_add_pd( t0i, carry02); maskj = _mm_cmpgt_pd( bj02, MM_c); imaskj = _mm_movemask_pd( maskj ); t0i = _mm_mul_pd( t0i, MM_inv[imaskj]); maskj = _mm_and_pd( maskj, ttp02); __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (carry02) : "0" (t0i), "X"(MM_bigA)); ttp02 = _mm_add_pd( ttp02, maskj); t0i = _mm_sub_pd( t0i, carry02); t0i = _mm_mul_pd( t0i, ttp02); bj02 = _mm_add_pd( bj02, MM_bc[imaskj]); ttp02 = _mm_mul_pd( ttp02, MM_Hsmall); };
  { Y__M128D maskj; int imaskj; __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (t2r) : "0" (t2r), "X"(MM_bigA)); t2r = _mm_add_pd( t2r, carry02); maskj = _mm_cmpgt_pd( bj02, MM_c); imaskj = _mm_movemask_pd( maskj ); t2r = _mm_mul_pd( t2r, MM_inv[imaskj]); maskj = _mm_and_pd( maskj, ttp02); __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (carry02) : "0" (t2r), "X"(MM_bigA)); ttp02 = _mm_add_pd( ttp02, maskj); t2r = _mm_sub_pd( t2r, carry02); t2r = _mm_mul_pd( t2r, ttp02); bj02 = _mm_add_pd( bj02, MM_bc[imaskj]); ttp02 = _mm_mul_pd( ttp02, MM_Hsmall); };
  { Y__M128D maskj; int imaskj; __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (t2i) : "0" (t2i), "X"(MM_bigA)); t2i = _mm_add_pd( t2i, carry02); maskj = _mm_cmpgt_pd( bj02, MM_c); imaskj = _mm_movemask_pd( maskj ); t2i = _mm_mul_pd( t2i, MM_inv[imaskj]); maskj = _mm_and_pd( maskj, ttp02); __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (carry02) : "0" (t2i), "X"(MM_bigA)); ttp02 = _mm_add_pd( ttp02, maskj); t2i = _mm_sub_pd( t2i, carry02); t2i = _mm_mul_pd( t2i, ttp02); bj02 = _mm_add_pd( bj02, MM_bc[imaskj]); ttp02 = _mm_mul_pd( ttp02, MM_Hsmall); };
  __builtin_prefetch(pd0 + 0, 1, 3);
  __builtin_prefetch(pd2 + 0, 1, 3);
  { Y__M128D maskj; int imaskj; __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (t1r) : "0" (t1r), "X"(MM_bigA)); t1r = _mm_add_pd( t1r, carry13); maskj = _mm_cmpgt_pd( bj13, MM_c); imaskj = _mm_movemask_pd( maskj ); t1r = _mm_mul_pd( t1r, MM_inv[imaskj]); maskj = _mm_and_pd( maskj, ttp13); __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (carry13) : "0" (t1r), "X"(MM_bigA)); ttp13 = _mm_add_pd( ttp13, maskj); t1r = _mm_sub_pd( t1r, carry13); t1r = _mm_mul_pd( t1r, ttp13); bj13 = _mm_add_pd( bj13, MM_bc[imaskj]); ttp13 = _mm_mul_pd( ttp13, MM_Hsmall); };
  { Y__M128D maskj; int imaskj; __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (t1i) : "0" (t1i), "X"(MM_bigA)); t1i = _mm_add_pd( t1i, carry13); maskj = _mm_cmpgt_pd( bj13, MM_c); imaskj = _mm_movemask_pd( maskj ); t1i = _mm_mul_pd( t1i, MM_inv[imaskj]); maskj = _mm_and_pd( maskj, ttp13); __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (carry13) : "0" (t1i), "X"(MM_bigA)); ttp13 = _mm_add_pd( ttp13, maskj); t1i = _mm_sub_pd( t1i, carry13); t1i = _mm_mul_pd( t1i, ttp13); bj13 = _mm_add_pd( bj13, MM_bc[imaskj]); ttp13 = _mm_mul_pd( ttp13, MM_Hsmall); };
  { Y__M128D maskj; int imaskj; __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (t3r) : "0" (t3r), "X"(MM_bigA)); t3r = _mm_add_pd( t3r, carry13); maskj = _mm_cmpgt_pd( bj13, MM_c); imaskj = _mm_movemask_pd( maskj ); t3r = _mm_mul_pd( t3r, MM_inv[imaskj]); maskj = _mm_and_pd( maskj, ttp13); __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (carry13) : "0" (t3r), "X"(MM_bigA)); ttp13 = _mm_add_pd( ttp13, maskj); t3r = _mm_sub_pd( t3r, carry13); t3r = _mm_mul_pd( t3r, ttp13); bj13 = _mm_add_pd( bj13, MM_bc[imaskj]); ttp13 = _mm_mul_pd( ttp13, MM_Hsmall); };
  { Y__M128D maskj; int imaskj; __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (t3i) : "0" (t3i), "X"(MM_bigA)); t3i = _mm_add_pd( t3i, carry13); maskj = _mm_cmpgt_pd( bj13, MM_c); imaskj = _mm_movemask_pd( maskj ); t3i = _mm_mul_pd( t3i, MM_inv[imaskj]); maskj = _mm_and_pd( maskj, ttp13); __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (carry13) : "0" (t3i), "X"(MM_bigA)); ttp13 = _mm_add_pd( ttp13, maskj); t3i = _mm_sub_pd( t3i, carry13); t3i = _mm_mul_pd( t3i, ttp13); bj13 = _mm_add_pd( bj13, MM_bc[imaskj]); ttp13 = _mm_mul_pd( ttp13, MM_Hsmall); };
  __builtin_prefetch(pd1 + 0, 1, 3);
  __builtin_prefetch(pd3 + 0, 1, 3);
  { Y__M128D maskj; int imaskj; __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (t4r) : "0" (t4r), "X"(MM_bigA)); t4r = _mm_add_pd( t4r, carry4x); maskj = _mm_cmpgt_pd( bj4x, MM_c); imaskj = _mm_movemask_pd( maskj ); t4r = _mm_mul_pd( t4r, MM_inv[imaskj]); maskj = _mm_and_pd( maskj, ttp4x); __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (carry4x) : "0" (t4r), "X"(MM_bigA)); ttp4x = _mm_add_pd( ttp4x, maskj); t4r = _mm_sub_pd( t4r, carry4x); t4r = _mm_mul_pd( t4r, ttp4x); bj4x = _mm_add_pd( bj4x, MM_bc[imaskj]); ttp4x = _mm_mul_pd( ttp4x, MM_Hsmall); };
  { Y__M128D maskj; int imaskj; __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (t4i) : "0" (t4i), "X"(MM_bigA)); t4i = _mm_add_pd( t4i, carry4x); maskj = _mm_cmpgt_pd( bj4x, MM_c); imaskj = _mm_movemask_pd( maskj ); t4i = _mm_mul_pd( t4i, MM_inv[imaskj]); maskj = _mm_and_pd( maskj, ttp4x); __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (carry4x) : "0" (t4i), "X"(MM_bigA)); ttp4x = _mm_add_pd( ttp4x, maskj); t4i = _mm_sub_pd( t4i, carry4x); t4i = _mm_mul_pd( t4i, ttp4x); bj4x = _mm_add_pd( bj4x, MM_bc[imaskj]); ttp4x = _mm_mul_pd( ttp4x, MM_Hsmall); };
  { Y__M128D maskj; int imaskj; __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (t5r) : "0" (t5r), "X"(MM_bigA)); t5r = _mm_add_pd( t5r, carry4x); maskj = _mm_cmpgt_pd( bj4x, MM_c); imaskj = _mm_movemask_pd( maskj ); t5r = _mm_mul_pd( t5r, MM_inv[imaskj]); maskj = _mm_and_pd( maskj, ttp4x); __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (carry4x) : "0" (t5r), "X"(MM_bigA)); ttp4x = _mm_add_pd( ttp4x, maskj); t5r = _mm_sub_pd( t5r, carry4x); t5r = _mm_mul_pd( t5r, ttp4x); bj4x = _mm_add_pd( bj4x, MM_bc[imaskj]); ttp4x = _mm_mul_pd( ttp4x, MM_Hsmall); };
  { Y__M128D maskj; int imaskj; __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (t5i) : "0" (t5i), "X"(MM_bigA)); t5i = _mm_add_pd( t5i, carry4x); maskj = _mm_cmpgt_pd( bj4x, MM_c); imaskj = _mm_movemask_pd( maskj ); t5i = _mm_mul_pd( t5i, MM_inv[imaskj]); maskj = _mm_and_pd( maskj, ttp4x); __asm__ volatile ("addpd %2, %0\n" "        subpd %2, %0\n" : "=&x" (carry4x) : "0" (t5i), "X"(MM_bigA)); ttp4x = _mm_add_pd( ttp4x, maskj); t5i = _mm_sub_pd( t5i, carry4x); t5i = _mm_mul_pd( t5i, ttp4x); bj4x = _mm_add_pd( bj4x, MM_bc[imaskj]); ttp4x = _mm_mul_pd( ttp4x, MM_Hsmall); };
  __builtin_prefetch(pd4 + 0, 1, 3);


  { Y__M128D __aux0,__aux1; __aux0 = t0r; __aux1 = t2r; t0r = _mm_unpacklo_pd( __aux0, __aux1); t2r = _mm_unpackhi_pd( __aux0, __aux1); }; { Y__M128D __aux0,__aux1; __aux0 = t0i; __aux1 = t2i; t0i = _mm_unpacklo_pd( __aux0, __aux1); t2i = _mm_unpackhi_pd( __aux0, __aux1); }; { Y__M128D __aux0,__aux1; __aux0 = t1r; __aux1 = t3r; t1r = _mm_unpacklo_pd( __aux0, __aux1); t3r = _mm_unpackhi_pd( __aux0, __aux1); }; { Y__M128D __aux0,__aux1; __aux0 = t1i; __aux1 = t3i; t1i = _mm_unpacklo_pd( __aux0, __aux1); t3i = _mm_unpackhi_pd( __aux0, __aux1); }; { Y__M128D __aux0,__aux1; __aux0 = t4r; __aux1 = t5r; t4r = _mm_unpacklo_pd( __aux0, __aux1); t5r = _mm_unpackhi_pd( __aux0, __aux1); }; { Y__M128D __aux0,__aux1; __aux0 = t4i; __aux1 = t5i; t4i = _mm_unpacklo_pd( __aux0, __aux1); t5i = _mm_unpackhi_pd( __aux0, __aux1); };; { Y__M128D _ar, _ai, _br, _bi; _ar = _mm_add_pd( t1r, t4r); _ai = _mm_add_pd( t1i, t4i);; _br = _mm_add_pd( t2r, t3r); _bi = _mm_add_pd( t2i, t3i);; t4r = _mm_sub_pd( t1r, t4r); t4i = _mm_sub_pd( t1i, t4i);; t3r = _mm_sub_pd( t2r, t3r); t3i = _mm_sub_pd( t2i, t3i);; t2r = _mm_sub_pd( _ar, _br); t2i = _mm_sub_pd( _ai, _bi);; _ar = _mm_add_pd( _ar, _br); _ai = _mm_add_pd( _ai, _bi);; t2r = _mm_mul_pd( t2r, MM_FN2_5r); _br = _mm_sub_pd( t4r, t3r); _bi = _mm_sub_pd( t4i, t3i);; t2i = _mm_mul_pd( t2i, MM_FN2_5r); t0r = _mm_add_pd( t0r, _ar); t0i = _mm_add_pd( t0i, _ai);; _ar = _mm_mul_pd( _ar, MM_FNM125); _ai = _mm_mul_pd( _ai, MM_FNM125); _br = _mm_mul_pd( _br, MM_F_1_5i); _bi = _mm_mul_pd( _bi, MM_F_1_5i); _ar = _mm_add_pd( _ar, t0r); _ai = _mm_add_pd( _ai, t0i); t3r = _mm_mul_pd( t3r, MM_FN1_5i); t3i = _mm_mul_pd( t3i, MM_FN1_5i); t4r = _mm_mul_pd( t4r, MM_FN2_5i); t4i = _mm_mul_pd( t4i, MM_FN2_5i); t1r = _mm_add_pd( _ar, t2r); t1i = _mm_add_pd( _ai, t2i); t2r = _mm_sub_pd( _ar, t2r); t2i = _mm_sub_pd( _ai, t2i); t3r = _mm_add_pd( t3r, _br); t3i = _mm_add_pd( t3i, _bi); t4r = _mm_sub_pd( _br, t4r); t4i = _mm_sub_pd( _bi, t4i); _ar = t4r; _ai = t4i; t4r = _mm_add_pd( t1r, t3i); t4i = _mm_sub_pd( t1i, t3r); t1r = _mm_sub_pd( t1r, t3i); t1i = _mm_add_pd( t1i, t3r); t3r = _mm_add_pd( t2r, _ai); t3i = _mm_sub_pd( t2i, _ar); t2r = _mm_sub_pd( t2r, _ai); t2i = _mm_add_pd( t2i, _ar); };; { Y__M128D _auxr, _auxi; _auxr = _mm_load_pd(x + (((0)<<1) + ((((0)<<1) & (~(UL)(128 - 1)) )>>(6 )) + ((((0)<<1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + ((((0)<<1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ ((((0)<<1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4)))); _auxi = _mm_load_pd((x + (((0)<<1) + ((((0)<<1) & (~(UL)(128 - 1)) )>>(6 )) + ((((0)<<1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + ((((0)<<1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ ((((0)<<1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))) + 2));; _auxr = _mm_add_pd( _auxr, t0r); _auxi = _mm_add_pd( _auxi, t0i); _mm_store_pd( x + (((0)<<1) + ((((0)<<1) & (~(UL)(128 - 1)) )>>(6 )) + ((((0)<<1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + ((((0)<<1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ ((((0)<<1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))), _auxr); _mm_store_pd( (x + (((0)<<1) + ((((0)<<1) & (~(UL)(128 - 1)) )>>(6 )) + ((((0)<<1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + ((((0)<<1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ ((((0)<<1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))) + 2), _auxi);; }; { Y__M128D _auxr, _auxi; _auxr = _mm_load_pd(x + (((pad)<<1) + ((((pad)<<1) & (~(UL)(128 - 1)) )>>(6 )) + ((((pad)<<1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + ((((pad)<<1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ ((((pad)<<1) & (((~(UL)(128 - 1))) << 
3*4))>>(6 + 3*4)))); _auxi = _mm_load_pd((x + (((pad)<<1) + ((((pad)<<1) & (~(UL)(128 - 1)) )>>(6 )) + ((((pad)<<1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + ((((pad)<<1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ ((((pad)<<1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))) + 2));; _auxr = _mm_add_pd( _auxr, t1r); _auxi = _mm_add_pd( _auxi, t1i); _mm_store_pd( x + (((pad)<<1) + ((((pad)<<1) & (~(UL)(128 - 1)) )>>(6 )) + ((((pad)<<1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + ((((pad)<<1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ ((((pad)<<1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))), _auxr); _mm_store_pd( (x + (((pad)<<1) + ((((pad)<<1) & (~(UL)(128 - 1)) )>>(6 )) + ((((pad)<<1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + ((((pad)<<1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ ((((pad)<<1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))) + 2), _auxi);; }; { Y__M128D _auxr, _auxi; _auxr = _mm_load_pd(x + (((pad2)<<1) + ((((pad2)<<1) & (~(UL)(128 - 1)) )>>(6 )) + ((((pad2)<<1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + ((((pad2)<<1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ ((((pad2)<<1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4)))); _auxi = _mm_load_pd((x + (((pad2)<<1) + ((((pad2)<<1) & (~(UL)(128 - 1)) )>>(6 )) + ((((pad2)<<1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + ((((pad2)<<1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ ((((pad2)<<1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))) + 2));; _auxr = _mm_add_pd( _auxr, t2r); _auxi = _mm_add_pd( _auxi, t2i); _mm_store_pd( x + (((pad2)<<1) + ((((pad2)<<1) & (~(UL)(128 - 1)) )>>(6 )) + ((((pad2)<<1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + ((((pad2)<<1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ ((((pad2)<<1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))), _auxr); _mm_store_pd( (x + (((pad2)<<1) + ((((pad2)<<1) & (~(UL)(128 - 1)) )>>(6 )) + ((((pad2)<<1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + ((((pad2)<<1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ ((((pad2)<<1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))) + 2), _auxi);; }; { Y__M128D _auxr, _auxi; _auxr = _mm_load_pd(x + (((pad3)<<1) + ((((pad3)<<1) & (~(UL)(128 - 1)) )>>(6 )) + ((((pad3)<<1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + ((((pad3)<<1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ ((((pad3)<<1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4)))); _auxi = _mm_load_pd((x + (((pad3)<<1) + ((((pad3)<<1) & (~(UL)(128 - 1)) )>>(6 )) + ((((pad3)<<1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + ((((pad3)<<1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ ((((pad3)<<1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))) + 2));; _auxr = _mm_add_pd( _auxr, t3r); _auxi = _mm_add_pd( _auxi, t3i); _mm_store_pd( x + (((pad3)<<1) + ((((pad3)<<1) & (~(UL)(128 - 1)) )>>(6 )) + ((((pad3)<<1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + ((((pad3)<<1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ ((((pad3)<<1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))), _auxr); _mm_store_pd( (x + (((pad3)<<1) + ((((pad3)<<1) & (~(UL)(128 - 1)) )>>(6 )) + ((((pad3)<<1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + ((((pad3)<<1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ ((((pad3)<<1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))) + 2), _auxi);; }; { Y__M128D _auxr, _auxi; _auxr = _mm_load_pd(x + (((pad4)<<1) + ((((pad4)<<1) & (~(UL)(128 - 1)) )>>(6 )) + ((((pad4)<<1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + ((((pad4)<<1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ ((((pad4)<<1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4)))); _auxi = _mm_load_pd((x + (((pad4)<<1) + ((((pad4)<<1) & (~(UL)(128 - 1)) )>>(6 )) + ((((pad4)<<1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + 
((((pad4)<<1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ ((((pad4)<<1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))) + 2));; _auxr = _mm_add_pd( _auxr, t4r); _auxi = _mm_add_pd( _auxi, t4i); _mm_store_pd( x + (((pad4)<<1) + ((((pad4)<<1) & (~(UL)(128 - 1)) )>>(6 )) + ((((pad4)<<1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + ((((pad4)<<1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ ((((pad4)<<1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))), _auxr); _mm_store_pd( (x + (((pad4)<<1) + ((((pad4)<<1) & (~(UL)(128 - 1)) )>>(6 )) + ((((pad4)<<1) & (((~(UL)(128 - 1))) << 4))>>(6 + 4)) + ((((pad4)<<1) & (((~(UL)(128 - 1))) << 2*4))>>(6 + 2*4))+ ((((pad4)<<1) & (((~(UL)(128 - 1))) << 3*4))>>(6 + 3*4))) + 2), _auxi);; };;







  if(err_flag)
    {
      double __attribute__ ((aligned(16))) xmax[2];
      _mm_store_pd( xmax, maxerr);
      Err = (xmax[0] > xmax[1]) ? xmax[0] : xmax[1];
    }
  else
    Err = 0.0;

  if(Y_SBIT)
    substract_two_5_sse2 (x, N);
}
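
Two observations, for what they are worth. First, none of the asm templates above contain a '{', '|' or '}', which is what the "dialect alternative" message is nominally about, so this smells more like a bug in Apple's gcc build than in the Glucas source. Second, the asm pairs implement the classic round-to-integer trick t = (t + MM_bigA) - MM_bigA on packed doubles, written in asm presumably so the compiler cannot reorder or cancel the add/sub. If the ICE cannot be fixed compiler-side, one workaround I am considering is expressing the same sequence with plain intrinsics. This is an untested sketch of my own (the name round_via_bigA is made up; Glucas wires the real thing through its macros):
Code:
#include <emmintrin.h>

/* Untested sketch: an intrinsics-only stand-in for the asm pair
 *     addpd  bigA, t
 *     subpd  bigA, t
 * i.e. t = (t + bigA) - bigA, the usual trick for rounding packed
 * doubles to integers.  The volatile temporary forces the sum
 * through memory so the compiler cannot fold the add/sub pair
 * away; without it, unsafe FP optimizations could legally cancel
 * the two operations. */
static inline __m128d round_via_bigA (__m128d t, __m128d bigA)
{
  volatile __m128d tmp = _mm_add_pd (t, bigA);  /* forced store      */
  __m128d rounded = tmp;                        /* forced reload     */
  return _mm_sub_pd (rounded, bigA);
}
A call site would then change from the two asm statements to, e.g., t0r = round_via_bigA (maskj, MM_bigA); whether the extra store/reload costs measurable speed I have not checked.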
