 mersenneforum.org SIMD string->int
 Register FAQ Search Today's Posts Mark Forums Read 2009-03-13, 12:19 #1 fivemack (loop (#_fork))   Feb 2006 Cambridge, England 6,379 Posts SIMD string->int I'm sure there are people here who like useless exercises in SIMD assembly language. The exercise: how fast can you perform atoi? Specifically, let char* A point to a list of a million 0x0a-separated positive integers written out in decimal, each of 15 or fewer digits. Output the result of XORing the numbers together. libc is about 650 cycles per number on K8; this must be beatable! For an easier case, let XMM0 contain something of the form '3' '1' '4' '1' '5' '9' ':' '2' '7' '1' '8' '2' '8' '1' '8' ',' '8' and return 314159. For an even easier case, let XMM0 contain something of the form above, and do the right-justification to return '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '3' '1' '4' '1' '5' '9' in as few cycles as you can.   2009-03-13, 13:58 #2 fivemack (loop (#_fork))   Feb 2006 Cambridge, England 6,379 Posts My solution is below; about 70 cycles per number on core2. I'm not sure how to do the variable-width shift on anything prior to core2. Code: // g++ -O3 -mssse3 -o b b.cpp #include #include using std::ifstream; #include #include typedef unsigned long long u64; void test_simd(void); long rdtsc(void) { long x,y; asm volatile("rdtsc" : "=a"(x), "=d"(y)); return (y<<32)+x; } int main(void) { ifstream F("bibble"); unsigned long long J,K=0; u64 t0= rdtsc(); for (int u=0; u<1000000; u++) { F>>J; K^=J; } u64 t1 = rdtsc(); printf("%lld %lld\n",K,t1-t0); test_simd(); } // right-justifier void aus(__m128i A) { unsigned char B; _mm_storeu_si128((__m128i*)B,A); for (int u=0; u<16; u++) printf("%02x:",B[u]); printf("\n"); } inline __m128i right_justify(__m128i input, int& ct) { __m128i zeros, subzeros, tens, seq; zeros = _mm_set_epi32(0x30303030,0x30303030,0x30303030,0x30303030); subzeros = _mm_set_epi32(0x2f2f2f2f,0x2f2f2f2f,0x2f2f2f2f,0x2f2f2f2f); tens = _mm_set_epi32(0x3a3a3a3a,0x3a3a3a3a,0x3a3a3a3a,0x3a3a3a3a); __m128i sixteens = _mm_set_epi32(0x10101010,0x10101010,0x10101010,0x10101010); seq = _mm_set_epi32(0x0f0e0d0c,0x0b0a0908,0x07060504,0x03020100); __m128i rev = _mm_set_epi32(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); __m128i mask = _mm_cmpeq_epi8(rev,rev); __m128i m70 = _mm_set_epi32(0x70707070,0x70707070,0x70707070,0x70707070); __m128i below_0 = _mm_cmpgt_epi8(input, subzeros); __m128i above_9 = _mm_cmplt_epi8(input, tens); __m128i valid = _mm_and_si128(below_0, above_9); int plink = _mm_movemask_epi8(valid); int shift = __builtin_ffs(plink^0xffff)-1; ct=shift; __m128i sklunk = _mm_set_epi8(shift,shift,shift,shift,shift,shift,shift,shift,shift,shift,shift,shift,shift,shift,shift,shift); // I don't know how you'd do the 128-bit-wide shift without pshufb // but it's quite pretty with pshufb: __m128i ss2 = _mm_add_epi8(sklunk, seq); __m128i ss3 = _mm_add_epi8(m70, ss2); __m128i shift2 = _mm_shuffle_epi8(input, ss2); __m128i shift3 = _mm_shuffle_epi8(zeros, ss3); __m128i masks = _mm_shuffle_epi8(mask, ss3); __m128i yibble = _mm_or_si128(shift3,_mm_andnot_si128(masks, shift2)); return yibble; } inline long long u64_from_rj(__m128i input) { __m128i zeros = _mm_set_epi32(0x30303030,0x30303030,0x30303030,0x30303030); __m128i mask1 = _mm_set_epi32(0xff00ff00,0xff00ff00,0xff00ff00,0xff00ff00); __m128i mask2 = _mm_set_epi32(0xffff0000,0xffff0000,0xffff0000,0xffff0000); __m128i hundreds = _mm_set_epi32(0x00000064,0x00000064,0x00000064,0x00000064); // aus(input); __m128i corr = _mm_sub_epi32(input, zeros); // multiply alternate digits by 10 // can use 16-bit-wide shifts since everything is less than 10 __m128i scaled = _mm_add_epi8(_mm_slli_epi16(corr,11),_mm_slli_epi16(corr,9)); __m128i step1 = _mm_and_si128(mask1,_mm_add_epi8(corr,scaled)); // multiply alternate digit-pairs by 100 and add __m128i step2 = _mm_add_epi16(_mm_slli_epi32(_mm_mullo_epi16(_mm_srli_epi16(step1,8), hundreds),16), _mm_srli_epi16(step1,8)); step2 = _mm_srli_epi32(_mm_and_si128(mask2, step2),16); // aus(step2); // do final step in integer registers unsigned int Q; _mm_storeu_si128((__m128i*)Q,step2); long long a12,a8,a4,a0; a0=1; a4=10000; a8=a4*a4; a12=a8*a4; // printf("%d %d %d %d\n",Q,Q,Q,Q); return a0*Q + a4*Q + a8*Q + a12*Q; } void test_simd(void) { FILE* f = fopen("bibble","rb"); char* world = new char; fread(world,1,15889012,f); long long K = 0; long long t0 = rdtsc(); char* ptr = world; int ct; for (int q=0; q<1000000; q++) { __m128i input = _mm_loadu_si128 ((__m128i*)ptr); __m128i justified = right_justify(input, ct); K^=u64_from_rj(justified); ptr += (1+ct); } long long t1 = rdtsc(); printf("%lld %lld\n",K,t1-t0); }   2009-03-13, 14:08 #3 fivemack (loop (#_fork))   Feb 2006 Cambridge, England 6,379 Posts Hmm. sscanf behaves really, really badly on very long strings - it looks as if it does something strlen-like first, so if I use code like Code:  FILE* f = fopen("bibble","rb"); char* world = new char; fread(world,1,15889012,f); fclose(f); unsigned long long J,K=0; u64 t0= rdtsc(); char* ptr = world; for (int u=0; u<1000; u++) { sscanf(ptr,"%lld",&J); while (*ptr>=48 && *ptr<=57) ptr++; ptr++; K^=J; } u64 t1 = rdtsc(); printf("%lld %lld\n",K,t1-t0); I get timings like 20M cycles per sscanf. If I make sure that all the strings are null-terminated by something like Code:  FILE* f = fopen("bibble","rb"); char* world = new char; fread(world,1,15889012,f); fclose(f); unsigned long long J,K=0; u64 t0= rdtsc(); for (int u=0; u<15889012;u++) if (world[u]==10) world[u]=0; char* ptr = world; for (int u=0; u<1000000; u++) { sscanf(ptr,"%lld",&J); while (*ptr>=48 && *ptr<=57) ptr++; ptr++; K^=J; } u64 t1 = rdtsc(); printf("%lld %lld\n",K,t1-t0); it's down to a much saner ~900 cycles per sscanf.   2009-03-13, 22:46 #4 fivemack (loop (#_fork))   Feb 2006 Cambridge, England 143538 Posts Pesky thing! I hadn't realised that Phenom (and apparently even Shanghai) don't support the SSSE3 instructions introduced on core2; in particular, they don't support PSHUFB so my shifting trick doesn't work.   2009-03-19, 00:43 #5 __HRB__   Dec 2008 Boycotting the Soapbox 10110100002 Posts 10-20 cycles - all incl. This is how: Get rid of strlen(), scanf(), etc. (assuming that the string ends in 0x0a00) 1. PCMPEQB 0x00, PCMPEQD 0x00 + loop while MOVMSKPS!=0 2. PCMPEQB 0x0a, PCMPEQD 0x00 + loop while MOVMSKPS!=0 Do the entire conversion in SSE registers: 1. Load 16 bytes using MOVUPS, save to 'Y', then PSUBUSB 0x30 from all bytes ('0'-'9' will be #0-#9 and 0x0a will be also be #0). Call this value 'X'. 2. Use MOVAPSs and ANDPSs on 'X' until you end up with one digit in each dword of 4 different SSE registers and convert to float using CVTDQ2PS. 3. Multiply these 4 registers by 4 different vectors: Code: [10^0, 10^1/256, 10^2/256^2, 10^3/256^3] [10^4, 10^5/256, 10^6/256^2, 10^7/256^3] [10^8, 10^9/256, 10^10/256^2,10^11/256^3] [10^12,10^13/256,10^14/256^2,10^15/256^3] 4. Accumulate results but make sure that values are less than 10^7 apart in magnitude (single floats have 24 bits in the mantissa, which is a little more than than 10^7). Do this using ADDPS & HADDPS, replacing a couple of above MOVs with PSHUFs, and adjusting the 4 factors accordingly. 5. Convert the resulting 4 floats to doubles (CVTPS2PD) and accumulate using HADDPD twice. 6. Independently (so this doesn't hurt at all!) from doing lots of work on 'X', we do magic on 'Y' to locate the index of the highest '0x0a', e.g. with PCMPBEQ '0x0a', followed by e.g. 2-4 bitscans in GPRs. Call the result 'Z'. 7. Use 'Z' to index a 15-entry table containing 10^-1, 10^-2, 10^-3...10^-16, and do a corrective multiply followed by a conversion to int (there is no instructions to do this, so you have to do this with PSUBD, 2xPSLLRQ, ANDPS, ORPS). Since the maximum result only has 48-bits the 53-bit mantissa is sufficient. 8. Add 'Z' to pointer...done. Next value please! The Dependency chain sub-and-convert-mul-add-add-convert-add-add-mul-convert is ~35 cycles. Depending on how fast the address of the next value is determined (loop dependency) and therefore how much code can therefore possibly be done OOO, the optimal case is probably ~10 cycles and will be really tough to achieve, but doing all this in 20 cycles shouldn't require any additional tricks. I think I'd need about 16 hours of programming to do it. Have fun Last fiddled with by __HRB__ on 2009-03-19 at 01:36   2009-03-19, 03:35   #6
__HRB__

Dec 2008
Boycotting the Soapbox

24·32·5 Posts 10-20 cycles - all incl. 1st bugfixes

[
Quote:
 Originally Posted by __HRB__ 1. PCMPEQB 0x00, PCMPEQD 0x00 + loop while MOVMSKPS!=0
This would have to be:
1. PCMPEQB 0x00, PCMPEQD 0x00 + loop while MOVMSKPS==0x0f
otherwise this won't work. What was I thinking?

Quote:
 Originally Posted by __HRB__ 2. PCMPEQB 0x0a, PCMPEQD 0x00 + loop while MOVMSKPS!=0
This is going to fail if there are any 0x0a after the terminating 0x00...

But of course I've overlooked the obvious solution:

Since the algorithm requires computing the index to the start of the next integer anyway, it'd be really stupid not to loop while "first byte!=0x00". If the stream isn't 0x00 terminated, we'll do a cmp with a pointer to end of file.

Quote:
 Originally Posted by __HRB__ 8. Add 'Z' to pointer...done. Next value please!
I mean 'Z'+1, ('Z' >> 3)+1 or whatever...

Last fiddled with by __HRB__ on 2009-03-19 at 03:51 Reason: what does scanf do?   2009-03-23, 15:42 #7 fivemack (loop (#_fork))   Feb 2006 Cambridge, England 6,379 Posts A problem is that PSLLDQ requires the shift amount to be hard-coded at compile time, so you have to use something like the PSHUFB code in my example if you want to do a variable shift (see 'daft SSE restrictions' thread elsewhere) Decoupling the shift and the conversion seems like a good idea, and the extra parallelism from doing four conversions at once in floats seems useful; I'm just a little concerned that any power of 10 above 10^10 can't be stored exactly in a float (10^22 is the largest that fits exactly in a double), so I'd want to do the multiplication by the larger powers of ten once I'm working in doubles. It's not relevant in this case because the conversion probably takes longer than determining the index, but I'm reminded of some nice Nehalem string-instruction demo code from Intel which features the absurd line if (A==16) u+=16; else u+=A; Not too absurd if your strings are very long: the branch is predicted taken and that lets the OOO machinery run several iterations in parallel. Last fiddled with by fivemack on 2009-03-23 at 15:44   2009-03-23, 18:15   #8
__HRB__

Dec 2008
Boycotting the Soapbox

24×32×5 Posts Quote:
 Originally Posted by fivemack A problem is that PSLLDQ requires the shift amount to be hard-coded at compile time, so you have to use something like the PSHUFB code in my example if you want to do a variable shift (see 'daft SSE restrictions' thread elsewhere)
Thank any god that's not true!

They screwed up everything else, but the variable shift you are looking for is one of the few things they actually got right (they screwed up the "xmm, imm8" one: they could have combined it with a move at zero cost).

Every SSE-shift (that I know of) has a "xmm, imm8" and a "xmm1, xmm2" form. See, e.g. http://www.ews.uiuc.edu/~cjiang/reference/vc256.htm

Maybe your compiler doesn't have the intrinsics or you were trying to specify the variable shifts using an 'int'?

You could also do two conversions 'double to int32' (erasing the lower 32 bits of the mantissa, subtract from original will give you the lower 32 bits), to avoid messing around too much with IEEE754.

Quote:
 Originally Posted by fivemack Decoupling the shift and the conversion seems like a good idea, and the extra parallelism from doing four conversions at once in floats seems useful; I'm just a little concerned that any power of 10 above 10^10 can't be stored exactly in a float (10^22 is the largest that fits exactly in a double), so I'd want to do the multiplication by the larger powers of ten once I'm working in doubles.
Yup, you're 100% right. The only thing I paid attention to was magnitude, so this definitely needs some doctoring.

We should be able to do everything up to 10^7/256 in floats, since log(10^7)=23.2534966642. I think therefore only one double precision multiply with [10^8,10^0] will be needed and it should be possible merge this multiply with the final correction, since I don't see anything that isn't distributive.

Quote:
 Originally Posted by fivemack It's not relevant in this case because the conversion probably takes longer than determining the index, but I'm reminded of some nice Nehalem string-instruction demo code from Intel which features the absurd line if (A==16) u+=16; else u+=A; Not too absurd if your strings are very long: the branch is predicted taken and that lets the OOO machinery run several iterations in parallel.
If OOO doesn't do the trick, one could unroll the loop by hand: discover the next (or if strlen is known any) 2-4 line feeds, and do every instruction 2-4x with a different registers (if one has enough).

Of course, if you're willing to allow data-dependent speed-ups, we could also branch to code that handles all 2^8 distributions of LFs (at the cost of one mispredicted branch). A word composed of the first bit of all "pcmpeqb #LF" bytes could be used to compute the address of the jump-table & next chunk.  Thread Tools Show Printable Version Email this Page Similar Threads Thread Thread Starter Forum Replies Last Post ewmayer Mlucas 183 2019-02-25 08:17 BrainStone Mlucas 14 2017-11-19 00:59 Dubslow Programming 19 2012-05-31 17:49 davieddy Science & Technology 8 2009-02-05 05:10

All times are UTC. The time now is 10:50.

Fri Jan 22 10:50:24 UTC 2021 up 50 days, 7:01, 0 users, load averages: 1.41, 1.56, 1.72 