mersenneforum.org  

Go Back   mersenneforum.org > Extra Stuff > Programming

Reply
 
Thread Tools
Old 2009-05-21, 14:04   #1
fivemack
(loop (#_fork))
 
fivemack's Avatar
 
Feb 2006
Cambridge, England

6,323 Posts
Default World's second-dumbest CUDA program

The first google hit for 'CUDA sieve' provides something that doesn't compile and, when tweaked to compile, gives wrong answers. This disturbed my tranquility, so I determined to write something that worked and gave correct answers, albeit more slowly than a stunned sloth crawling backwards through treacle (3.3 seconds on a 9600GT to find the 5761455 primes <10^8). If this causes pstach to reappear, curse me roundly and give an alternative implementation of eleven thousand times the speed, I won't complain :)

Makefile:
Code:
# Paths to the CUDA toolkit compiler and the SDK's cutil helper library
# (cutil provides CUDA_SAFE_CALL and the cut*Timer functions used by erato.cu).
NVCC=/usr/local/cuda64/cuda/bin/nvcc
CUTILI=/usr/local/cuda64/cuda-sdk/common/inc
CUTILL=/usr/local/cuda64/cuda-sdk/lib/libcutil.a

# Build both an optimized binary (erato) and a debug binary (erato-d).
EXE=erato
EXED=$(patsubst %,%-d,$(EXE))

all: $(EXE) $(EXED)

# Optimized build: compile the .cu source straight to an executable.
%: %.cu
	$(NVCC) -I $(CUTILI) -o $@ $< $(CUTILL)

# Debug build: -g adds host debug info, -G adds device debug info
# (and disables device optimization).
%-d: %.cu
	$(NVCC) -g -G -I $(CUTILI) -o $@ $< $(CUTILL)
erato.cu:
Code:
// *-* c-mode *-*

#include <stdio.h>
#include <cutil_inline.h>

int maxT = 0;

// One thread per small prime: thread idx takes primes[idx] and writes that
// prime into every sieve slot it divides, starting from its square.  Odd
// primes step by 2*p so only odd multiples are touched; 2 steps by 2.
__global__ static void Sieve(int * sieve,int* primes, int maxp, int sieve_size)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= maxp)
    return;

  const int p = primes[tid];
  const int step = (p == 2) ? p : 2 * p;   // skip even multiples of odd primes
  for (int n = p * p; n < sieve_size; n += step)
    sieve[n] = p;
}

// Classify the integers 0..nloc-1 by trial division, one thread per
// candidate: sieve[idx] ends up 0 when idx is prime and 1 when it is
// composite (0 and 1 are marked composite by convention).
__global__ static void Individual(int * sieve, int nloc)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < nloc)
    {
      // check individual primes by trial division
      sieve[idx] = 0;
      int b = 2;
      if (idx==0 || idx==1) sieve[idx] = 1;  // 0 and 1 are not prime
      if (idx==2 || idx==3) sieve[idx] = 0;  // smallest primes; loop below never runs for them
      while (b*b <= idx)
	{
	  if (idx%b == 0)
	    {
	      sieve[idx] = 1;
	      break;  // one divisor suffices; without this every composite divides all the way to sqrt(idx)
	    }
	  b++;
	}
    }
}

// Find the first device with compute capability >= 1.0, report its
// properties, select it with cudaSetDevice, and stash its maximum block
// x-dimension in the global maxT (used by main() to shape kernel launches).
// Returns true on success, false when no suitable device exists.
bool InitCUDA(void)
{
  int count = 0;
  int i = 0;

  cudaGetDeviceCount(&count);
  if(count == 0) {
    fprintf(stderr, "There is no device.\n");
    return false;
  }

  for(i = 0; i < count; i++) {
    cudaDeviceProp prop;
    if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
      if(prop.major >= 1) {
	printf("Device %d supports CUDA %d.%d\n",i, prop.major, prop.minor);
	printf("It has warp size %d, %d regs per block, %d threads per block\n",prop.warpSize, prop.regsPerBlock, prop.maxThreadsPerBlock);
	printf("max Threads %d x %d x %d\n",prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
	// NOTE(review): main() uses maxT as a threads-per-block count; this
	// assumes maxThreadsDim[0] <= maxThreadsPerBlock — confirm per device.
	maxT = prop.maxThreadsDim[0];
	printf("max Grid %d x %d x %d\n",prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
	// totalConstMem is a size_t: printing it with plain %d is undefined
	// behaviour on 64-bit hosts, so cast explicitly.
	printf("total constant memory %lu\n",(unsigned long)prop.totalConstMem);
	break;
      }
    }
  }
  if(i == count) {
    fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
    return false;
  }
  cudaSetDevice(i);
  return true;
}

// Count the primes below N with a two-stage GPU sieve: trial-divide to find
// the primes below sqrt(N), then launch one thread per small prime to cross
// off its multiples in the full-size array.  Entries the sieve never touches
// keep the 0x33333333 fill pattern and are counted as prime.
int main(int argc, char** argv)
{
  const int N = 100000000;                  // count primes below this bound
  const int Nsmall = (int)sqrt((double)N);  // primes up to sqrt(N) suffice to sieve to N

  int *device_small_sieve, *device_big_sieve;

  if(!InitCUDA())
    {
      return 0;
    }
  // memset fills every byte with 51 (0x33); Individual() overwrites every
  // entry with 0 (prime) or 1 (composite) anyway, so the fill just makes an
  // unwritten cell detectable.
  CUDA_SAFE_CALL(cudaMalloc((void**) &device_small_sieve, sizeof(int) * Nsmall));
  CUDA_SAFE_CALL(cudaMemset(device_small_sieve, 51, sizeof(int)*Nsmall));

  unsigned int shandle;
  cutCreateTimer(&shandle);
  cutStartTimer(shandle);
  // Conventional <<<blocks, threads>>> order.  (The original passed these
  // reversed, which still covered every index but ran many tiny blocks.)
  Individual<<<(Nsmall+maxT-1)/maxT, maxT, 0>>> (device_small_sieve,Nsmall);
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  cutStopTimer(shandle);
  printf("%f milliseconds for small primes\n",cutGetTimerValue(shandle));

  int* host_copy_of_smallsieve = (int*)malloc(Nsmall*sizeof(int));
  printf("%p\n",host_copy_of_smallsieve); fflush(stdout);
  CUDA_SAFE_CALL(cudaMemcpy(host_copy_of_smallsieve,
			    device_small_sieve,
			    sizeof(int) * Nsmall,
			    cudaMemcpyDeviceToHost));
  printf("%p\n",host_copy_of_smallsieve); fflush(stdout);

  // OK.  We've got an array with 0 at the small primes

  int np = 0, test = 0;
  for (int k=0; k<Nsmall; k++) {test |= host_copy_of_smallsieve[k]; np += 1-host_copy_of_smallsieve[k];}
  if (test != 1) {printf("Something impossible just happened\n\n"); exit(1);}  // kernel must write only 0s and 1s
  printf("%d small primes (< %d) found\n", np, Nsmall);
  int* primes = (int*)malloc(np*sizeof(int));
  int pptr = 0;
  for (int k=0; k<Nsmall; k++)
    if (host_copy_of_smallsieve[k] == 0)
      primes[pptr++]=k;
  free(host_copy_of_smallsieve);  // leaked in the original

  // except that we needed the primes on the device

  int* device_primes;
  CUDA_SAFE_CALL(cudaMalloc((void**) &device_primes, sizeof(int) * np));
  CUDA_SAFE_CALL(cudaMemcpy(device_primes, primes, sizeof(int)*np, cudaMemcpyHostToDevice));
  free(primes);  // device copy made; host copy no longer needed

  // now, set up the big array on the device

  CUDA_SAFE_CALL(cudaMalloc((void**) &device_big_sieve, sizeof(int) * N));
  CUDA_SAFE_CALL(cudaMemset(device_big_sieve, 51, sizeof(int)*N));

  unsigned int thandle;
  cutCreateTimer(&thandle);
  cutStartTimer(thandle);
  Sieve<<<(np+maxT-1)/maxT, maxT, 0>>>(device_big_sieve, device_primes, np, N);
  CUDA_SAFE_CALL(cudaThreadSynchronize());  // was unchecked: a kernel fault would go unnoticed
  cutStopTimer(thandle);
  printf("%f milliseconds for big sieve\n",cutGetTimerValue(thandle));

  // 400 MB host buffer: check the allocation before handing it to cudaMemcpy.
  int* host_sieve = (int*)malloc(N*sizeof(int));
  if (host_sieve == NULL) { fprintf(stderr, "host allocation of big sieve failed\n"); exit(1); }
  CUDA_SAFE_CALL(cudaMemcpy(host_sieve, device_big_sieve, sizeof(int) * N, cudaMemcpyDeviceToHost));

  cudaFree(device_big_sieve);
  cudaFree(device_small_sieve);
  cudaFree(device_primes);

  // Entries still holding the 0x33 byte fill were never crossed off: prime.
  int nbig = 0;
  for(int i=2;i < N;++i)
    if (host_sieve[i] == 0x33333333) nbig++;
  printf("%d big primes (< %d) found\n",nbig,N);
  free(host_sieve);  // leaked in the original

  return 0;
}

Last fiddled with by fivemack on 2009-05-21 at 14:09
fivemack is offline   Reply With Quote
Old 2009-07-11, 00:50   #2
RichD
 
RichD's Avatar
 
Sep 2008
Kansas

61518 Posts
Default Integer divide

I saw on one of the nVidia forums an integer divide uses 140 clock cycles. It is better to use bit-wise shifts whenever possible. (Some compiler optimizations may do that for you.)
RichD is offline   Reply With Quote
Old 2009-07-11, 20:35   #3
jasonp
Tribal Bullet
 
jasonp's Avatar
 
Oct 2004

3×1,163 Posts
Default

Integer division is not on the critical path for a properly written prime sieve, concurrent byte-wide memory access is.
jasonp is offline   Reply With Quote
Old 2009-07-12, 17:28   #4
ldesnogu
 
ldesnogu's Avatar
 
Jan 2008
France

23×67 Posts
Default

I think Tom's program was a joke; in fact, I'd be willing to bet a beer that it was.
ldesnogu is offline   Reply With Quote
Old 2009-07-12, 18:54   #5
fivemack
(loop (#_fork))
 
fivemack's Avatar
 
Feb 2006
Cambridge, England

142638 Posts
Default

It wasn't entirely a joke; I was wondering whether, in the finest traditions of Usenet where posting a bad answer is the best way to get a good answer, if I posted a lousy CUDA program I'd get useful advice as to how to write a less lousy one. It seems I posted too lousy a program, or am perhaps the only person with CUDA set up (and it was on a machine at work which has since been rebuilt ...) who reads the forum regularly.

I'm actually sieving in the Sieve<<<>>> kernel call; used trial division in the step to get the primes up to sqrt(N) because I found that it was quicker than sieving while you were dealing with very small primes. I'm well aware the algorithm I'm using is stupid, I tried a few techniques to make it less stupid which made things significantly worse.
fivemack is offline   Reply With Quote
Old 2009-07-12, 19:14   #6
xilman
Bamboozled!
 
xilman's Avatar
 
"π’‰Ίπ’ŒŒπ’‡·π’†·π’€­"
May 2003
Down not across

2×3×1,699 Posts
Default

Quote:
Originally Posted by fivemack View Post
or am perhaps the only person with CUDA set up (and it was on a machine at work which has since been rebuilt ...) who reads the forum regularly.
[aol]Me too!!?!!![/aol]

Paul
xilman is offline   Reply With Quote
Old 2009-07-12, 21:14   #7
frmky
 
frmky's Avatar
 
Jul 2003
So Cal

22×33×19 Posts
Default

Quote:
Originally Posted by xilman View Post
[aol]Me too!!?!!![/aol]

Paul
[aol]Me three!!! CUDA ROXXORS![/aol]
frmky is online now   Reply With Quote
Old 2009-07-12, 21:30   #8
fivemack
(loop (#_fork))
 
fivemack's Avatar
 
Feb 2006
Cambridge, England

6,323 Posts
Default

This may be a rude question, but have you actually written much code on your CUDA setups? Google finds me lots of people who have run CUFFT and mention how large the gigaflops numbers that come out are, a few people doing the same for linpack, and the developers of some Russian GIS software which uses CUDA for all the heavy lifting, but nothing which looks like a particularly vibrant CUDA-writing community.

There are lots of papers full of what after a while read like the same commonplaces on how to write good CUDA code, but I've only really seen the one nVidia conference presentation where they take slow code and demonstrate how it may be changed into fast code, and there are several steps there that I don't quite follow and several others where I can't see how they could be made relevant for sieving.
fivemack is offline   Reply With Quote
Old 2009-07-12, 22:15   #9
frmky
 
frmky's Avatar
 
Jul 2003
So Cal

40048 Posts
Default

I've written just a few simple programs. Besides the obvious things like have lots of threads to mask latencies and access global memory as little as possible, getting the best performance really seems to be trial and error. I was involved in the integration of distributed.net's RC5 CUDA core. I found that seemingly trivial changes like changing an always positive integer from signed to unsigned, and assigning blockDim.x, blockIdx.x, and threadIdx.x to an integer before doing ANY math made huge differences in the speed of the code. In fact, just the move to CUDA 2.2 has halved the speed of the RC5 code. I recently asked why on the nVidia forums but haven't gotten any replies.
frmky is online now   Reply With Quote
Old 2009-07-13, 07:34   #10
frmky
 
frmky's Avatar
 
Jul 2003
So Cal

22·33·19 Posts
Default

OK, I'll bite. I'm testing on a 9600GSO. The card didn't have enough memory to run your code as written. I got an error when trying to allocate the big sieve array. I modified the code to use unsigned chars rather than ints and only store 0 or 1 instead of the prime itself. Running that version gave

Code:
Device 0 supports CUDA 1.1
It has warp size 32, 8192 regs per block, 512 threads per block
max Threads 512 x 512 x 64
max Grid 65535 x 65535 x 1
total constant memory 65536
0.433000 milliseconds for small primes
0x2588530
0x2588530
1229 small primes (< 10000) found
4743.630859 milliseconds for big sieve
5761455 big primes (< 100000000) found
I then added the simple optimization of excluding all evens except 2 from the sieve array. Each element i>0 refers to the odd number 2*i+1, treating element 0 as a special case. This simplifies the Sieve() routine and removes the longest running (by a bit more than a factor of 2) thread. The result is

Code:
Device 0 supports CUDA 1.1
It has warp size 32, 8192 regs per block, 512 threads per block
max Threads 512 x 512 x 64
max Grid 65535 x 65535 x 1
total constant memory 65536
0.418000 milliseconds for small primes
0x8c3020
0x8c3020
1229 small primes (< 10000) found
1676.081055 milliseconds for big sieve
5761455 big primes (< 100000000) found
At least this is reasonably on par with the more optimized C implementations at http://primes.utm.edu/links/programs...C_source_code/

Here's the modified code:

Code:
#include <stdio.h>
#include <cutil_inline.h>

typedef unsigned char u8;

int maxT = 0;

// Cross off odd multiples of primes[idx] in an odds-only sieve where slot i
// represents the odd number 2*i+1.  idx 0 (the prime 2) is skipped: even
// numbers have no slot at all.
__global__ static void Sieve(u8 * sieve,int * primes, int maxp, int sieve_size)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if ((idx>0) && (idx < maxp))
    {
      int ithPrime = primes[idx];
      // Start at p*p (slot (p*p)>>1, exact since p*p is odd): any smaller
      // composite p*m has a prime factor m' < p and is already crossed off
      // by m's thread.  The original started at 3*p and re-marked those.
      // p <= sqrt(N) < 10^4 here, so p*p fits comfortably in an int.
      for(int i=(ithPrime*ithPrime)>>1; i < sieve_size; i+=ithPrime)
        sieve[i] = 1;
    }
}

// Trial-division primality test, one thread per candidate:
// sieve[n] <- 0 if n is prime, 1 otherwise (0 and 1 count as composite).
__global__ static void Individual(u8 * sieve, int nloc)
{
  int n = blockIdx.x * blockDim.x + threadIdx.x;
  if (n >= nloc)
    return;

  u8 composite = 0;
  if (n < 2)
    composite = 1;                 // 0 and 1 are not prime
  else
    for (int d = 2; d * d <= n; d++)
      if (n % d == 0)
        {
          composite = 1;           // found a divisor: stop immediately
          break;
        }
  sieve[n] = composite;
}

// Find the first device with compute capability >= 1.0, report its
// properties, select it with cudaSetDevice, and stash its maximum block
// x-dimension in the global maxT (used by main() to shape kernel launches).
// Returns true on success, false when no suitable device exists.
bool InitCUDA(void)
{
  int count = 0;
  int i = 0;

  cudaGetDeviceCount(&count);
  if(count == 0) {
    fprintf(stderr, "There is no device.\n");
    return false;
  }

  for(i = 0; i < count; i++) {
    cudaDeviceProp prop;
    if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
      if(prop.major >= 1) {
        printf("Device %d supports CUDA %d.%d\n",i, prop.major, prop.minor);
        printf("It has warp size %d, %d regs per block, %d threads per block\n",prop.warpSize, prop.regsPerBlock, prop.maxThreadsPerBlock);
        printf("max Threads %d x %d x %d\n",prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        // NOTE(review): main() uses maxT as a threads-per-block count; this
        // assumes maxThreadsDim[0] <= maxThreadsPerBlock — confirm per device.
        maxT = prop.maxThreadsDim[0];
        printf("max Grid %d x %d x %d\n",prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
        // totalConstMem is a size_t: printing it with plain %d is undefined
        // behaviour on 64-bit hosts, so cast explicitly.
        printf("total constant memory %lu\n",(unsigned long)prop.totalConstMem);
        break;
      }
    }
  }
  if(i == count) {
    fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
    return false;
  }
  cudaSetDevice(i);
  return true;
}

// Count primes below N with an odds-only two-stage GPU sieve: trial-divide
// for the primes up to sqrt(N), then cross off odd multiples in a byte
// array where slot i stands for the odd number 2*i+1 (slot 0 is reused to
// stand for the prime 2).  Slots still 0 at the end are prime.
int main(int argc, char** argv)
{
  const int N = 100000000;                  // count primes below this bound
  const int Nsmall = (int)sqrt((double)N);  // primes up to sqrt(N) drive the big sieve

  u8 *device_small_sieve, *device_big_sieve;

  if(!InitCUDA())
    {
      return 0;
    }
  CUDA_SAFE_CALL(cudaMalloc((void**) &device_small_sieve, sizeof(u8) * Nsmall));
  CUDA_SAFE_CALL(cudaMemset(device_small_sieve, 0, sizeof(u8)*Nsmall));

  unsigned int shandle;
  cutCreateTimer(&shandle);
  cutStartTimer(shandle);
  // Conventional <<<blocks, threads>>> order.  (The original passed these
  // reversed, which still covered every index but ran many tiny blocks.)
  Individual<<<(Nsmall+maxT-1)/maxT, maxT, 0>>> (device_small_sieve,Nsmall);
  CUDA_SAFE_CALL(cudaThreadSynchronize());
  cutStopTimer(shandle);
  printf("%f milliseconds for small primes\n",cutGetTimerValue(shandle));

  u8* host_copy_of_smallsieve = (u8*)malloc(Nsmall*sizeof(u8));
  printf("%p\n",host_copy_of_smallsieve); fflush(stdout);
  CUDA_SAFE_CALL(cudaMemcpy(host_copy_of_smallsieve,
                            device_small_sieve,
                            sizeof(u8) * Nsmall,
                            cudaMemcpyDeviceToHost));
  printf("%p\n",host_copy_of_smallsieve); fflush(stdout);

  // OK.  We've got an array with 0 at the small primes

  int np = 0, test = 0;
  for (int k=0; k<Nsmall; k++) {test |= host_copy_of_smallsieve[k]; np += 1-host_copy_of_smallsieve[k];}
  if (test != 1) {printf("Something impossible just happened\n\n"); exit(1);}  // kernel writes only 0 or 1
  printf("%d small primes (< %d) found\n", np, Nsmall);
  int* primes = (int*)malloc(np*sizeof(int));
  int pptr = 0;
  for (int k=0; k<Nsmall; k++)
    if (host_copy_of_smallsieve[k] == 0)
      primes[pptr++]=k;
  free(host_copy_of_smallsieve);  // leaked in the original

  // except that we needed the primes on the device

  int* device_primes;
  CUDA_SAFE_CALL(cudaMalloc((void**) &device_primes, sizeof(int) * np));
  CUDA_SAFE_CALL(cudaMemcpy(device_primes, primes, sizeof(int)*np, cudaMemcpyHostToDevice));
  free(primes);  // device copy made; host copy no longer needed

  // Odds-only big sieve: N/2 bytes, slot i <-> odd number 2*i+1.

  CUDA_SAFE_CALL(cudaMalloc((void**) &device_big_sieve, sizeof(u8) * N/2));
  CUDA_SAFE_CALL(cudaMemset(device_big_sieve, 0, sizeof(u8)*N/2));

  unsigned int thandle;
  cutCreateTimer(&thandle);
  cutStartTimer(thandle);
  Sieve<<<(np+maxT-1)/maxT, maxT, 0>>>(device_big_sieve, device_primes, np, N/2);
  CUDA_SAFE_CALL(cudaThreadSynchronize());  // was unchecked: a kernel fault would go unnoticed
  cutStopTimer(thandle);
  printf("%f milliseconds for big sieve\n",cutGetTimerValue(thandle));

  u8* host_sieve = (u8*)malloc(N/2*sizeof(u8));
  if (host_sieve == NULL) { fprintf(stderr, "host allocation of big sieve failed\n"); exit(1); }
  CUDA_SAFE_CALL(cudaMemcpy(host_sieve, device_big_sieve, sizeof(u8) * N/2, cudaMemcpyDeviceToHost));

  cudaFree(device_big_sieve);
  cudaFree(device_small_sieve);
  cudaFree(device_primes);

  // Slot 0 stands for the prime 2; slot i>0 for 2*i+1.  Untouched slots are prime.
  int nbig = 0;
  for(int i=0;i < N/2;++i)
    if (host_sieve[i] == 0) {nbig++;} //printf("%d\n",(i==0)?2:2*i+1);}
  printf("%d big primes (< %d) found\n",nbig,N);
  free(host_sieve);  // leaked in the original

  return 0;
}
frmky is online now   Reply With Quote
Old 2009-07-13, 08:24   #11
xilman
Bamboozled!
 
xilman's Avatar
 
"π’‰Ίπ’ŒŒπ’‡·π’†·π’€­"
May 2003
Down not across

237228 Posts
Default

Quote:
Originally Posted by fivemack View Post
This may be a rude question, but have you actually written much code on your CUDA setups?
I've written some non-trivial crypto code for a Tesla. It runs roughly 50 times faster than it does on the cpu hosting the Tesla.

When I get some time I plan on porting Ernst's code for trial factoring of the larger MM numbers. Unfortunately, I'm too busy with Real Work(tm).


Paul
xilman is offline   Reply With Quote
Reply

Thread Tools


Similar Threads
Thread Thread Starter Forum Replies Last Post
mfaktc: a CUDA program for Mersenne prefactoring TheJudger GPU Computing 3434 2020-11-07 08:01
The P-1 factoring CUDA program firejuggler GPU Computing 752 2020-09-08 16:15
End of the world as we know it (in music) firejuggler Lounge 3 2012-12-22 01:43
World Cup Soccer davieddy Hobbies 111 2011-05-28 19:21
World's dumbest CUDA program? xilman Programming 1 2009-11-16 10:26

All times are UTC. The time now is 06:50.

Thu Nov 26 06:50:31 UTC 2020 up 77 days, 4:01, 3 users, load averages: 1.76, 1.80, 1.74

Powered by vBulletin® Version 3.8.11
Copyright ©2000 - 2020, Jelsoft Enterprises Ltd.

This forum has received and complied with 0 (zero) government requests for information.

Permission is granted to copy, distribute and/or modify this document under the terms of the GNU Free Documentation License, Version 1.2 or any later version published by the Free Software Foundation.
A copy of the license is included in the FAQ.