20090521, 14:04  #1 
(loop (#_fork))
Feb 2006
Cambridge, England
6,323 Posts 
World's second-dumbest CUDA program
The first google hit for 'CUDA sieve' provides something that doesn't compile and, when tweaked to compile, gives wrong answers. This disturbed my tranquility, so I determined to write something that worked and gave correct answers, albeit more slowly than a stunned sloth crawling backwards through treacle (3.3 seconds on a 9600GT to find the 5761455 primes <10^8). If this causes pstach to reappear, curse me roundly and give an alternative implementation of eleven thousand times the speed, I won't complain :)
Makefile: Code:
# Build the 'erato' sieve (optimized) and 'eratod' (debug: -g -G) from erato.cu,
# linking against the CUDA SDK's cutil library.
NVCC=/usr/local/cuda64/cuda/bin/nvcc
CUTILI=/usr/local/cuda64/cudasdk/common/inc
CUTILL=/usr/local/cuda64/cudasdk/lib/libcutil.a
EXE=erato
EXED=$(patsubst %,%d,$(EXE))

all: $(EXE) $(EXED)

# optimized build
%: %.cu
	$(NVCC) -I $(CUTILI) -o $@ $< $(CUTILL)

# debug build: host (-g) and device (-G) debug info
%d: %.cu
	$(NVCC) -g -G -I $(CUTILI) -o $@ $< $(CUTILL)
Code:
// -*- c -*-
#include <stdio.h>
#include <math.h>            // sqrt() in main; not guaranteed to come in via cutil
#include <cutil_inline.h>

int maxT = 0;                // maxThreadsDim[0] of the chosen device, set by InitCUDA()

/* Cross off composites in sieve[0..sieve_size): thread idx walks the
 * multiples of primes[idx], storing that prime into every slot it marks.
 * Marking starts at p*p (smaller multiples already have a smaller prime
 * factor); for odd p the stride is 2p, skipping even multiples.
 * NOTE(review): the launch below is Sieve<<<maxT, (np+maxT-1)/maxT>>>,
 * i.e. grid and block sizes swapped relative to the usual convention
 * (many blocks of few threads) — still correct thanks to the idx guard. */
__global__ static void Sieve(int *sieve, int *primes, int maxp, int sieve_size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < maxp) {
        int ithPrime = primes[idx];
        int delta = ithPrime;
        if (ithPrime != 2)
            delta = 2 * delta;                  // odd p: step over even multiples
        for (int i = ithPrime * ithPrime; i < sieve_size; i += delta)
            sieve[i] = ithPrime;
    }
}

/* Classify each number below nloc by trial division, one thread per number:
 * sieve[idx] ends up 0 if idx is prime, 1 otherwise (0 and 1 count as
 * non-prime).  Used only for the small range up to sqrt(N). */
__global__ static void Individual(int *sieve, int nloc)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < nloc) {
        /* int myprime, delta;
           if (idx == 0) {myprime=2;delta=2;}
           if (idx == 1) {myprime=3;}
           if (idx > 2)  {myprime = 6*(idx/2) + 2*(idx%2) - 1;}
           if (idx != 0) delta = 2*myprime;
           for (int i=myprime*myprime; i<sieve_size; i+=delta) sieve[i] = 1; */
        // check individual primes by trial division
        sieve[idx] = 0;
        int b = 2;
        if (idx == 0 || idx == 1) sieve[idx] = 1;   // 0 and 1 are not prime
        if (idx == 2 || idx == 3) sieve[idx] = 0;   // redundant (already 0), kept for clarity
        while (b * b <= idx) {
            if (idx % b == 0) { sieve[idx] = 1; break; }  // factor found: stop early
            b++;
        }
    }
}

/* Pick the first device with compute capability >= 1.0, print its
 * properties, and record maxThreadsDim[0] in the global maxT.
 * Returns false (with a message on stderr) if no usable device exists. */
bool InitCUDA(void)
{
    int count = 0;
    int i = 0;
    cudaGetDeviceCount(&count);
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                printf("Device %d supports CUDA %d.%d\n", i, prop.major, prop.minor);
                printf("It has warp size %d, %d regs per block, %d threads per block\n",
                       prop.warpSize, prop.regsPerBlock, prop.maxThreadsPerBlock);
                printf("max Threads %d x %d x %d\n", prop.maxThreadsDim[0],
                       prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
                maxT = prop.maxThreadsDim[0];
                printf("max Grid %d x %d x %d\n", prop.maxGridSize[0],
                       prop.maxGridSize[1], prop.maxGridSize[2]);
                /* totalConstMem is size_t; cast so %d is well-defined on 64-bit */
                printf("total constant memory %d\n", (int)prop.totalConstMem);
                break;
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}

int main(int argc, char **argv)
{
    const int N = 100000000;     // count primes below 10^8
    int Nsmall = sqrt(N);        // 10^4: sieving primes up to sqrt(N) suffice
    int *device_small_sieve, *device_big_sieve;

    if (!InitCUDA()) {
        return 0;
    }

    CUDA_SAFE_CALL(cudaMalloc((void**)&device_small_sieve, sizeof(int) * Nsmall));
    CUDA_SAFE_CALL(cudaMemset(device_small_sieve, 51, sizeof(int) * Nsmall));

    unsigned int shandle;
    cutCreateTimer(&shandle);
    cutStartTimer(shandle);
    /* grid = maxT blocks of (Nsmall+maxT-1)/maxT threads: swapped from the
       usual <<<blocks, threads>>> convention but covers all Nsmall indices */
    Individual<<<maxT, (Nsmall + maxT - 1) / maxT, 0>>>(device_small_sieve, Nsmall);
    CUDA_SAFE_CALL(cudaThreadSynchronize());
    cutStopTimer(shandle);
    printf("%f milliseconds for small primes\n", cutGetTimerValue(shandle));

    int *host_copy_of_smallsieve = (int*)malloc(Nsmall * sizeof(int));
    printf("%p\n", host_copy_of_smallsieve);
    fflush(stdout);
    CUDA_SAFE_CALL(cudaMemcpy(host_copy_of_smallsieve, device_small_sieve,
                              sizeof(int) * Nsmall, cudaMemcpyDeviceToHost));
    printf("%p\n", host_copy_of_smallsieve);
    fflush(stdout);

    // OK. We've got an array with 0 at the small primes
    int np = 0, test = 0;
    for (int k = 0; k < Nsmall; k++) {
        test = host_copy_of_smallsieve[k];
        np += 1 - host_copy_of_smallsieve[k];   // slot is 0 for prime, 1 otherwise
    }
    /* sanity check: Nsmall-1 = 9999 is composite, so the last flag must be 1 */
    if (test != 1) { printf("Something impossible just happened\n\n"); exit(1); }
    printf("%d small primes (< %d) found\n", np, Nsmall);

    int *primes = (int*)malloc(np * sizeof(int));
    int pptr = 0;
    for (int k = 0; k < Nsmall; k++)
        if (host_copy_of_smallsieve[k] == 0) primes[pptr++] = k;

    // except that we needed the primes on the device
    int *device_primes;
    CUDA_SAFE_CALL(cudaMalloc((void**)&device_primes, sizeof(int) * np));
    CUDA_SAFE_CALL(cudaMemcpy(device_primes, primes, sizeof(int) * np,
                              cudaMemcpyHostToDevice));

    // now, set up the big array on the device
    CUDA_SAFE_CALL(cudaMalloc((void**)&device_big_sieve, sizeof(int) * N));
    /* fill every byte with 51 = 0x33: slots never written by Sieve() still
       hold 0x33333333 afterwards and are counted as prime below */
    CUDA_SAFE_CALL(cudaMemset(device_big_sieve, 51, sizeof(int) * N));

    unsigned int thandle;
    cutCreateTimer(&thandle);
    cutStartTimer(thandle);
    Sieve<<<maxT, (np + maxT - 1) / maxT, 0>>>(device_big_sieve, device_primes, np, N);
    cudaThreadSynchronize();
    cutStopTimer(thandle);
    printf("%f milliseconds for big sieve\n", cutGetTimerValue(thandle));

    int *host_sieve = (int*)malloc(N * sizeof(int));
    cudaMemcpy(host_sieve, device_big_sieve, sizeof(int) * N, cudaMemcpyDeviceToHost);
    cudaFree(device_big_sieve);
    cudaFree(device_small_sieve);
    cudaFree(device_primes);

    int nbig = 0;
    for (int i = 2; i < N; ++i)
        if (host_sieve[i] == 0x33333333) nbig++;    // untouched slot => prime
    printf("%d big primes (< %d) found\n", nbig, N);
    return 0;
}
Last fiddled with by fivemack on 2009-05-21 at 14:09
20090711, 00:50  #2 
Sep 2008
Kansas
6151_{8} Posts 
Integer divide
I saw on one of the nVidia forums an integer divide uses 140 clock cycles. It is better to use bitwise shifts whenever possible. (Some compiler optimizations may do that for you.)

20090711, 20:35  #3 
Tribal Bullet
Oct 2004
3×1,163 Posts 
Integer division is not on the critical path for a properly written prime sieve; concurrent byte-wide memory access is.

20090712, 17:28  #4 
Jan 2008
France
2^{3}×67 Posts 
I think Tom's program was a joke, in fact I'd be willing to bet a beer it was

20090712, 18:54  #5 
(loop (#_fork))
Feb 2006
Cambridge, England
14263_{8} Posts 
It wasn't entirely a joke; I was wondering whether, in the finest traditions of Usenet where posting a bad answer is the best way to get a good answer, if I posted a lousy CUDA program I'd get useful advice as to how to write a less lousy one. It seems I posted too lousy a program, or am perhaps the only person with CUDA set up (and it was on a machine at work which has since been rebuilt ...) who reads the forum regularly.
I'm actually sieving in the Sieve<<<>>> kernel call; used trial division in the step to get the primes up to sqrt(N) because I found that it was quicker than sieving while you were dealing with very small primes. I'm well aware the algorithm I'm using is stupid, I tried a few techniques to make it less stupid which made things significantly worse. 
20090712, 19:14  #6 
Bamboozled!
"πΊππ·π·π"
May 2003
Down not across
2×3×1,699 Posts 

20090712, 21:14  #7 
Jul 2003
So Cal
2^{2}×3^{3}×19 Posts 

20090712, 21:30  #8 
(loop (#_fork))
Feb 2006
Cambridge, England
6,323 Posts 
This may be a rude question, but have you actually written much code on your CUDA setups? Google finds me lots of people who have run CUFFT and mention how large the gigaflops numbers that come out are, a few people doing the same for linpack, and the developers of some Russian GIS software which uses CUDA for all the heavy lifting, but nothing which looks like a particularly vibrant CUDAwriting community.
There are lots of papers full of what after a while read like the same commonplaces on how to write good CUDA code, but I've only really seen the one nVidia conference presentation where they take slow code and demonstrate how it may be changed into fast code, and there are several steps there that I don't quite follow and several others where I can't see how they could be made relevant for sieving. 
20090712, 22:15  #9 
Jul 2003
So Cal
4004_{8} Posts 
I've written just a few simple programs. Besides the obvious things like have lots of threads to mask latencies and access global memory as little as possible, getting the best performance really seems to be trial and error. I was involved in the integration of distributed.net's RC5 CUDA core. I found that seemingly trivial changes like changing an always positive integer from signed to unsigned, and assigning blockDim.x, blockIdx.x, and threadIdx.x to an integer before doing ANY math made huge differences in the speed of the code. In fact, just the move to CUDA 2.2 has halved the speed of the RC5 code. I recently asked why on the nVidia forums but haven't gotten any replies.

20090713, 07:34  #10 
Jul 2003
So Cal
2^{2}·3^{3}·19 Posts 
OK, I'll bite. I'm testing on a 9600GSO. The card didn't have enough memory to run your code as written. I got an error when trying to allocate the big sieve array. I modified the code to use unsigned chars rather than ints and only store 0 or 1 instead of the prime itself. Running that version gave
Code:
Device 0 supports CUDA 1.1 It has warp size 32, 8192 regs per block, 512 threads per block max Threads 512 x 512 x 64 max Grid 65535 x 65535 x 1 total constant memory 65536 0.433000 milliseconds for small primes 0x2588530 0x2588530 1229 small primes (< 10000) found 4743.630859 milliseconds for big sieve 5761455 big primes (< 100000000) found Code:
Device 0 supports CUDA 1.1 It has warp size 32, 8192 regs per block, 512 threads per block max Threads 512 x 512 x 64 max Grid 65535 x 65535 x 1 total constant memory 65536 0.418000 milliseconds for small primes 0x8c3020 0x8c3020 1229 small primes (< 10000) found 1676.081055 milliseconds for big sieve 5761455 big primes (< 100000000) found Here's the modified code: Code:
#include <stdio.h>
#include <math.h>            // sqrt() in main; not guaranteed to come in via cutil
#include <cutil_inline.h>

typedef unsigned char u8;    // one byte per candidate instead of one int

int maxT = 0;                // maxThreadsDim[0] of the chosen device, set by InitCUDA()

/* Odds-only sieve: slot i of the half-size array represents the odd number
 * 2i+1.  Thread idx (idx > 0, so the prime 2 at primes[0] is skipped) marks
 * the odd multiples of p = primes[idx], starting from 3p, whose slot index
 * is (3p-1)/2 = (3*p)>>1 for odd p; stride p in slot space is stride 2p in
 * value space.
 * NOTE(review): launched as Sieve<<<maxT, (np+maxT-1)/maxT>>> — grid/block
 * swapped from convention, still correct because of the bounds guard. */
__global__ static void Sieve(u8 *sieve, int *primes, int maxp, int sieve_size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if ((idx > 0) && (idx < maxp)) {
        int ithPrime = primes[idx];
        for (int i = (3 * ithPrime) >> 1; i < sieve_size; i += ithPrime)
            // i = (ithPrime-1)/2 + ithPrime, though the compiler knew this
            sieve[i] = 1;
    }
}

/* Classify each number below nloc by trial division, one thread per number:
 * sieve[idx] ends up 0 if idx is prime, 1 otherwise (0 and 1 count as
 * non-prime).  Used only for the small range up to sqrt(N). */
__global__ static void Individual(u8 *sieve, int nloc)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < nloc) {
        /* int myprime, delta;
           if (idx == 0) {myprime=2;delta=2;}
           if (idx == 1) {myprime=3;}
           if (idx > 2)  {myprime = 6*(idx/2) + 2*(idx%2) - 1;}
           if (idx != 0) delta = 2*myprime;
           for (int i=myprime*myprime; i<sieve_size; i+=delta) sieve[i] = 1; */
        // check individual primes by trial division
        sieve[idx] = 0;
        int b = 2;
        if (idx == 0 || idx == 1) sieve[idx] = 1;   // 0 and 1 are not prime
        if (idx == 2 || idx == 3) sieve[idx] = 0;   // redundant (already 0), kept for clarity
        while (b * b <= idx) {
            if (idx % b == 0) { sieve[idx] = 1; break; }  // factor found: stop early
            b++;
        }
    }
}

/* Pick the first device with compute capability >= 1.0, print its
 * properties, and record maxThreadsDim[0] in the global maxT.
 * Returns false (with a message on stderr) if no usable device exists. */
bool InitCUDA(void)
{
    int count = 0;
    int i = 0;
    cudaGetDeviceCount(&count);
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                printf("Device %d supports CUDA %d.%d\n", i, prop.major, prop.minor);
                printf("It has warp size %d, %d regs per block, %d threads per block\n",
                       prop.warpSize, prop.regsPerBlock, prop.maxThreadsPerBlock);
                printf("max Threads %d x %d x %d\n", prop.maxThreadsDim[0],
                       prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
                maxT = prop.maxThreadsDim[0];
                printf("max Grid %d x %d x %d\n", prop.maxGridSize[0],
                       prop.maxGridSize[1], prop.maxGridSize[2]);
                /* totalConstMem is size_t; cast so %d is well-defined on 64-bit */
                printf("total constant memory %d\n", (int)prop.totalConstMem);
                break;
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}

int main(int argc, char **argv)
{
    const int N = 100000000;     // count primes below 10^8
    int Nsmall = sqrt(N);        // 10^4: sieving primes up to sqrt(N) suffice
    u8 *device_small_sieve, *device_big_sieve;

    if (!InitCUDA()) {
        return 0;
    }

    CUDA_SAFE_CALL(cudaMalloc((void**)&device_small_sieve, sizeof(u8) * Nsmall));
    CUDA_SAFE_CALL(cudaMemset(device_small_sieve, 0, sizeof(u8) * Nsmall));

    unsigned int shandle;
    cutCreateTimer(&shandle);
    cutStartTimer(shandle);
    /* grid = maxT blocks of (Nsmall+maxT-1)/maxT threads: swapped from the
       usual <<<blocks, threads>>> convention but covers all Nsmall indices */
    Individual<<<maxT, (Nsmall + maxT - 1) / maxT, 0>>>(device_small_sieve, Nsmall);
    CUDA_SAFE_CALL(cudaThreadSynchronize());
    cutStopTimer(shandle);
    printf("%f milliseconds for small primes\n", cutGetTimerValue(shandle));

    u8 *host_copy_of_smallsieve = (u8*)malloc(Nsmall * sizeof(u8));
    printf("%p\n", host_copy_of_smallsieve);
    fflush(stdout);
    CUDA_SAFE_CALL(cudaMemcpy(host_copy_of_smallsieve, device_small_sieve,
                              sizeof(u8) * Nsmall, cudaMemcpyDeviceToHost));
    printf("%p\n", host_copy_of_smallsieve);
    fflush(stdout);

    // OK. We've got an array with 0 at the small primes
    int np = 0, test = 0;
    for (int k = 0; k < Nsmall; k++) {
        test = host_copy_of_smallsieve[k];
        np += 1 - host_copy_of_smallsieve[k];   // slot is 0 for prime, 1 otherwise
    }
    /* sanity check: Nsmall-1 = 9999 is composite, so the last flag must be 1 */
    if (test != 1) { printf("Something impossible just happened\n\n"); exit(1); }
    printf("%d small primes (< %d) found\n", np, Nsmall);

    int *primes = (int*)malloc(np * sizeof(int));
    int pptr = 0;
    for (int k = 0; k < Nsmall; k++)
        if (host_copy_of_smallsieve[k] == 0) primes[pptr++] = k;

    // except that we needed the primes on the device
    int *device_primes;
    CUDA_SAFE_CALL(cudaMalloc((void**)&device_primes, sizeof(int) * np));
    CUDA_SAFE_CALL(cudaMemcpy(device_primes, primes, sizeof(int) * np,
                              cudaMemcpyHostToDevice));

    // now, set up the big array on the device: odds only, so N/2 bytes
    CUDA_SAFE_CALL(cudaMalloc((void**)&device_big_sieve, sizeof(u8) * N/2));
    CUDA_SAFE_CALL(cudaMemset(device_big_sieve, 0, sizeof(u8) * N/2));

    unsigned int thandle;
    cutCreateTimer(&thandle);
    cutStartTimer(thandle);
    Sieve<<<maxT, (np + maxT - 1) / maxT, 0>>>(device_big_sieve, device_primes, np, N/2);
    cudaThreadSynchronize();
    cutStopTimer(thandle);
    printf("%f milliseconds for big sieve\n", cutGetTimerValue(thandle));

    u8 *host_sieve = (u8*)malloc(N/2 * sizeof(u8));
    cudaMemcpy(host_sieve, device_big_sieve, sizeof(u8) * N/2, cudaMemcpyDeviceToHost);
    cudaFree(device_big_sieve);
    cudaFree(device_small_sieve);
    cudaFree(device_primes);

    int nbig = 0;
    /* Slot 0 represents the value 1, which is never marked and so gets
       counted; that exactly compensates for the prime 2, which the
       odds-only array cannot represent — the total comes out right. */
    for (int i = 0; i < N/2; ++i)
        if (host_sieve[i] == 0) { nbig++; }  // printf("%d\n",(i==0)?2:2*i+1);}
    printf("%d big primes (< %d) found\n", nbig, N);
    return 0;
}
20090713, 08:24  #11  
Bamboozled!
"πΊππ·π·π"
May 2003
Down not across
23722_{8} Posts 
Quote:
When I get some time I plan on porting Ernst's code for trial factoring of the larger MM numbers. Unfortunately, I'm too busy with Real Work(tm). Paul 

Thread Tools  
Similar Threads  
Thread  Thread Starter  Forum  Replies  Last Post 
mfaktc: a CUDA program for Mersenne prefactoring  TheJudger  GPU Computing  3434  20201107 08:01 
The P1 factoring CUDA program  firejuggler  GPU Computing  752  20200908 16:15 
End of the world as we know it (in music)  firejuggler  Lounge  3  20121222 01:43 
World Cup Soccer  davieddy  Hobbies  111  20110528 19:21 
World's dumbest CUDA program?  xilman  Programming  1  20091116 10:26 