![]() |
![]() |
#1 |
(loop (#_fork))
Feb 2006
Cambridge, England
6,379 Posts |
![]()
I'm sure there are people here who like useless exercises in SIMD assembly language.
The exercise: how fast can you perform atoi? Specifically, let char* A point to a list of a million 0x0a-separated positive integers written out in decimal, each of 15 or fewer digits. Output the result of XORing the numbers together. libc is about 650 cycles per number on K8; this must be beatable! For an easier case, let XMM0 contain something of the form '3' '1' '4' '1' '5' '9' ':' '2' '7' '1' '8' '2' '8' '1' '8' ',' '8' and return 314159. For an even easier case, let XMM0 contain something of the form above, and do the right-justification to return '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '3' '1' '4' '1' '5' '9' in as few cycles as you can. |
![]() |
![]() |
![]() |
#2 |
(loop (#_fork))
Feb 2006
Cambridge, England
6,379 Posts |
![]()
My solution is below; about 70 cycles per number on core2. I'm not sure how to do the variable-width shift on anything prior to core2.
Code:
// g++ -O3 -mssse3 -o b b.cpp #include <stdio.h> #include <fstream> using std::ifstream; #include <emmintrin.h> #include <tmmintrin.h> typedef unsigned long long u64; void test_simd(void); long rdtsc(void) { long x,y; asm volatile("rdtsc" : "=a"(x), "=d"(y)); return (y<<32)+x; } int main(void) { ifstream F("bibble"); unsigned long long J,K=0; u64 t0= rdtsc(); for (int u=0; u<1000000; u++) { F>>J; K^=J; } u64 t1 = rdtsc(); printf("%lld %lld\n",K,t1-t0); test_simd(); } // right-justifier void aus(__m128i A) { unsigned char B[16]; _mm_storeu_si128((__m128i*)B,A); for (int u=0; u<16; u++) printf("%02x:",B[u]); printf("\n"); } inline __m128i right_justify(__m128i input, int& ct) { __m128i zeros, subzeros, tens, seq; zeros = _mm_set_epi32(0x30303030,0x30303030,0x30303030,0x30303030); subzeros = _mm_set_epi32(0x2f2f2f2f,0x2f2f2f2f,0x2f2f2f2f,0x2f2f2f2f); tens = _mm_set_epi32(0x3a3a3a3a,0x3a3a3a3a,0x3a3a3a3a,0x3a3a3a3a); __m128i sixteens = _mm_set_epi32(0x10101010,0x10101010,0x10101010,0x10101010); seq = _mm_set_epi32(0x0f0e0d0c,0x0b0a0908,0x07060504,0x03020100); __m128i rev = _mm_set_epi32(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); __m128i mask = _mm_cmpeq_epi8(rev,rev); __m128i m70 = _mm_set_epi32(0x70707070,0x70707070,0x70707070,0x70707070); __m128i below_0 = _mm_cmpgt_epi8(input, subzeros); __m128i above_9 = _mm_cmplt_epi8(input, tens); __m128i valid = _mm_and_si128(below_0, above_9); int plink = _mm_movemask_epi8(valid); int shift = __builtin_ffs(plink^0xffff)-1; ct=shift; __m128i sklunk = _mm_set_epi8(shift,shift,shift,shift,shift,shift,shift,shift,shift,shift,shift,shift,shift,shift,shift,shift); // I don't know how you'd do the 128-bit-wide shift without pshufb // but it's quite pretty with pshufb: __m128i ss2 = _mm_add_epi8(sklunk, seq); __m128i ss3 = _mm_add_epi8(m70, ss2); __m128i shift2 = _mm_shuffle_epi8(input, ss2); __m128i shift3 = _mm_shuffle_epi8(zeros, ss3); __m128i masks = _mm_shuffle_epi8(mask, ss3); __m128i yibble = _mm_or_si128(shift3,_mm_andnot_si128(masks, shift2)); return yibble; } inline long long u64_from_rj(__m128i input) { __m128i zeros = _mm_set_epi32(0x30303030,0x30303030,0x30303030,0x30303030); __m128i mask1 = _mm_set_epi32(0xff00ff00,0xff00ff00,0xff00ff00,0xff00ff00); __m128i mask2 = _mm_set_epi32(0xffff0000,0xffff0000,0xffff0000,0xffff0000); __m128i hundreds = _mm_set_epi32(0x00000064,0x00000064,0x00000064,0x00000064); // aus(input); __m128i corr = _mm_sub_epi32(input, zeros); // multiply alternate digits by 10 // can use 16-bit-wide shifts since everything is less than 10 __m128i scaled = _mm_add_epi8(_mm_slli_epi16(corr,11),_mm_slli_epi16(corr,9)); __m128i step1 = _mm_and_si128(mask1,_mm_add_epi8(corr,scaled)); // multiply alternate digit-pairs by 100 and add __m128i step2 = _mm_add_epi16(_mm_slli_epi32(_mm_mullo_epi16(_mm_srli_epi16(step1,8), hundreds),16), _mm_srli_epi16(step1,8)); step2 = _mm_srli_epi32(_mm_and_si128(mask2, step2),16); // aus(step2); // do final step in integer registers unsigned int Q[4]; _mm_storeu_si128((__m128i*)Q,step2); long long a12,a8,a4,a0; a0=1; a4=10000; a8=a4*a4; a12=a8*a4; // printf("%d %d %d %d\n",Q[0],Q[1],Q[2],Q[3]); return a0*Q[3] + a4*Q[2] + a8*Q[1] + a12*Q[0]; } void test_simd(void) { FILE* f = fopen("bibble","rb"); char* world = new char[15889012]; fread(world,1,15889012,f); long long K = 0; long long t0 = rdtsc(); char* ptr = world; int ct; for (int q=0; q<1000000; q++) { __m128i input = _mm_loadu_si128 ((__m128i*)ptr); __m128i justified = right_justify(input, ct); K^=u64_from_rj(justified); ptr += (1+ct); } long long t1 = rdtsc(); printf("%lld %lld\n",K,t1-t0); } |
![]() |
![]() |
![]() |
#3 |
(loop (#_fork))
Feb 2006
Cambridge, England
6,379 Posts |
![]()
Hmm. sscanf behaves really, really badly on very long strings - it looks as if it does something strlen-like first, so if I use code like
Code:
FILE* f = fopen("bibble","rb"); char* world = new char[15889012]; fread(world,1,15889012,f); fclose(f); unsigned long long J,K=0; u64 t0= rdtsc(); char* ptr = world; for (int u=0; u<1000; u++) { sscanf(ptr,"%lld",&J); while (*ptr>=48 && *ptr<=57) ptr++; ptr++; K^=J; } u64 t1 = rdtsc(); printf("%lld %lld\n",K,t1-t0); Code:
FILE* f = fopen("bibble","rb"); char* world = new char[15889012]; fread(world,1,15889012,f); fclose(f); unsigned long long J,K=0; u64 t0= rdtsc(); for (int u=0; u<15889012;u++) if (world[u]==10) world[u]=0; char* ptr = world; for (int u=0; u<1000000; u++) { sscanf(ptr,"%lld",&J); while (*ptr>=48 && *ptr<=57) ptr++; ptr++; K^=J; } u64 t1 = rdtsc(); printf("%lld %lld\n",K,t1-t0); |
![]() |
![]() |
![]() |
#4 |
(loop (#_fork))
Feb 2006
Cambridge, England
143538 Posts |
![]()
I hadn't realised that Phenom (and apparently even Shanghai) don't support the SSSE3 instructions introduced on core2; in particular, they don't support PSHUFB so my shifting trick doesn't work.
|
![]() |
![]() |
![]() |
#5 |
Dec 2008
Boycotting the Soapbox
10110100002 Posts |
![]()
This is how:
Get rid of strlen(), scanf(), etc. (assuming that the string ends in 0x0a00) 1. PCMPEQB 0x00, PCMPEQD 0x00 + loop while MOVMSKPS!=0 2. PCMPEQB 0x0a, PCMPEQD 0x00 + loop while MOVMSKPS!=0 Do the entire conversion in SSE registers: 1. Load 16 bytes using MOVUPS, save to 'Y', then PSUBUSB 0x30 from all bytes ('0'-'9' will be #0-#9 and 0x0a will be also be #0). Call this value 'X'. 2. Use MOVAPSs and ANDPSs on 'X' until you end up with one digit in each dword of 4 different SSE registers and convert to float using CVTDQ2PS. 3. Multiply these 4 registers by 4 different vectors: Code:
[10^0, 10^1/256, 10^2/256^2, 10^3/256^3] [10^4, 10^5/256, 10^6/256^2, 10^7/256^3] [10^8, 10^9/256, 10^10/256^2,10^11/256^3] [10^12,10^13/256,10^14/256^2,10^15/256^3] 5. Convert the resulting 4 floats to doubles (CVTPS2PD) and accumulate using HADDPD twice. 6. Independently (so this doesn't hurt at all!) from doing lots of work on 'X', we do magic on 'Y' to locate the index of the highest '0x0a', e.g. with PCMPBEQ '0x0a', followed by e.g. 2-4 bitscans in GPRs. Call the result 'Z'. 7. Use 'Z' to index a 15-entry table containing 10^-1, 10^-2, 10^-3...10^-16, and do a corrective multiply followed by a conversion to int (there is no instructions to do this, so you have to do this with PSUBD, 2xPSLLRQ, ANDPS, ORPS). Since the maximum result only has 48-bits the 53-bit mantissa is sufficient. 8. Add 'Z' to pointer...done. Next value please! The Dependency chain sub-and-convert-mul-add-add-convert-add-add-mul-convert is ~35 cycles. Depending on how fast the address of the next value is determined (loop dependency) and therefore how much code can therefore possibly be done OOO, the optimal case is probably ~10 cycles and will be really tough to achieve, but doing all this in 20 cycles shouldn't require any additional tricks. I think I'd need about 16 hours of programming to do it. Have fun Last fiddled with by __HRB__ on 2009-03-19 at 01:36 |
![]() |
![]() |
![]() |
#6 |
Dec 2008
Boycotting the Soapbox
24·32·5 Posts |
![]()
[This would have to be:
1. PCMPEQB 0x00, PCMPEQD 0x00 + loop while MOVMSKPS==0x0f otherwise this won't work. What was I thinking? This is going to fail if there are any 0x0a after the terminating 0x00... But of course I've overlooked the obvious solution: Since the algorithm requires computing the index to the start of the next integer anyway, it'd be really stupid not to loop while "first byte!=0x00". If the stream isn't 0x00 terminated, we'll do a cmp with a pointer to end of file. I mean 'Z'+1, ('Z' >> 3)+1 or whatever... Last fiddled with by __HRB__ on 2009-03-19 at 03:51 Reason: what does scanf do? |
![]() |
![]() |
![]() |
#7 |
(loop (#_fork))
Feb 2006
Cambridge, England
6,379 Posts |
![]()
A problem is that PSLLDQ requires the shift amount to be hard-coded at compile time, so you have to use something like the PSHUFB code in my example if you want to do a variable shift (see 'daft SSE restrictions' thread elsewhere)
Decoupling the shift and the conversion seems like a good idea, and the extra parallelism from doing four conversions at once in floats seems useful; I'm just a little concerned that any power of 10 above 10^10 can't be stored exactly in a float (10^22 is the largest that fits exactly in a double), so I'd want to do the multiplication by the larger powers of ten once I'm working in doubles. It's not relevant in this case because the conversion probably takes longer than determining the index, but I'm reminded of some nice Nehalem string-instruction demo code from Intel which features the absurd line if (A==16) u+=16; else u+=A; Not too absurd if your strings are very long: the branch is predicted taken and that lets the OOO machinery run several iterations in parallel. Last fiddled with by fivemack on 2009-03-23 at 15:44 |
![]() |
![]() |
![]() |
#8 | |||
Dec 2008
Boycotting the Soapbox
24×32×5 Posts |
![]() Quote:
They screwed up everything else, but the variable shift you are looking for is one of the few things they actually got right (they screwed up the "xmm, imm8" one: they could have combined it with a move at zero cost). Every SSE-shift (that I know of) has a "xmm, imm8" and a "xmm1, xmm2" form. See, e.g. http://www.ews.uiuc.edu/~cjiang/reference/vc256.htm Maybe your compiler doesn't have the intrinsics or you were trying to specify the variable shifts using an 'int'? You could also do two conversions 'double to int32' (erasing the lower 32 bits of the mantissa, subtract from original will give you the lower 32 bits), to avoid messing around too much with IEEE754. Quote:
We should be able to do everything up to 10^7/256 in floats, since log(10^7)=23.2534966642. I think therefore only one double precision multiply with [10^8,10^0] will be needed and it should be possible merge this multiply with the final correction, since I don't see anything that isn't distributive. Quote:
Of course, if you're willing to allow data-dependent speed-ups, we could also branch to code that handles all 2^8 distributions of LFs (at the cost of one mispredicted branch). A word composed of the first bit of all "pcmpeqb #LF" bytes could be used to compute the address of the jump-table & next chunk. |
|||
![]() |
![]() |
![]() |
Thread Tools | |
![]() |
||||
Thread | Thread Starter | Forum | Replies | Last Post |
ARM builds and SIMD-assembler prospects | ewmayer | Mlucas | 183 | 2019-02-25 08:17 |
Which SIMD flag to use for Raspberry Pi | BrainStone | Mlucas | 14 | 2017-11-19 00:59 |
Official "String copy Statement Considered Harmful" thread | Dubslow | Programming | 19 | 2012-05-31 17:49 |
Waves on a taut string | davieddy | Science & Technology | 8 | 2009-02-05 05:10 |