mersenneforum.org

mersenneforum.org (https://www.mersenneforum.org/index.php)
-   Programming (https://www.mersenneforum.org/forumdisplay.php?f=29)
-   -   World's second-dumbest CUDA program (https://www.mersenneforum.org/showthread.php?t=11900)

fivemack 2009-05-21 14:04

World's second-dumbest CUDA program
 
The first google hit for 'CUDA sieve' provides something that doesn't compile and, when tweaked to compile, gives wrong answers. This disturbed my tranquility, so I determined to write something that worked and gave correct answers, albeit more slowly than a stunned sloth crawling backwards through treacle (3.3 seconds on a 9600GT to find the 5761455 primes <10^8). If this causes pstach to reappear, curse me roundly and give an alternative implementation of eleven thousand times the speed, I won't complain :)

Makefile:
[code]
# Paths to the CUDA toolkit compiler and the SDK's CUTIL helper library
NVCC=/usr/local/cuda64/cuda/bin/nvcc
CUTILI=/usr/local/cuda64/cuda-sdk/common/inc
CUTILL=/usr/local/cuda64/cuda-sdk/lib/libcutil.a

EXE=erato
# Debug builds get a -d suffix (erato -> erato-d)
EXED=$(patsubst %,%-d,$(EXE))

all: $(EXE) $(EXED)

# Release build: one .cu file per executable
# NOTE(review): recipe lines must begin with a TAB character -- the
# indentation appears to have been lost when this was pasted; restore it.
%: %.cu
$(NVCC) -I $(CUTILI) -o $@ $< $(CUTILL)

# Debug build: -g for host symbols, -G for device symbols
%-d: %.cu
$(NVCC) -g -G -I $(CUTILI) -o $@ $< $(CUTILL)
[/code]

erato.cu:
[code]
// *-* c-mode *-*

#include <stdio.h>
#include <cutil_inline.h>

int maxT = 0;

// Crosses composites off `sieve` by marking multiples of each small prime.
// One thread per small prime: thread idx handles primes[idx].  Launched 1-D;
// the caller must supply at least maxp total threads (extras fall through
// the guard below).
//   sieve      - device array of sieve_size ints; entries never written here
//                keep the caller's memset fill pattern, which marks "prime"
//   primes     - device array holding the maxp primes below sqrt(sieve_size)
//   maxp       - number of entries in `primes`
//   sieve_size - number of entries in `sieve` (the sieving limit N)
__global__ static void Sieve(int * sieve,int* primes, int maxp, int sieve_size)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < maxp)
{
int ithPrime = primes[idx];
int delta = ithPrime;
// Odd primes step by 2p: the even multiples are already covered by p = 2.
if (ithPrime != 2) delta = 2*delta;
// Start at p*p: every smaller composite has a smaller prime factor and is
// therefore marked by another thread.
for(int i=ithPrime*ithPrime;i < sieve_size;i+=delta)
sieve[i] = ithPrime;
}
}

// Classifies every index below nloc as prime or composite by trial division,
// one thread per candidate.  Writes sieve[idx] = 0 for primes and 1 for
// composites; 0 and 1 are marked composite by convention.  Launched 1-D;
// the caller must supply at least nloc total threads.
__global__ static void Individual(int * sieve, int nloc)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < nloc)
{
// Abandoned sieving variant, kept for reference.
/* int myprime, delta;
if (idx == 0) {myprime=2;delta=2;}
if (idx == 1) {myprime=3;}
if (idx > 2) {myprime = 6*(idx/2) + 2*(idx%2) - 1;}
if (idx!=0) delta=2*myprime;
for (int i=myprime*myprime; i<sieve_size; i+=delta)
sieve[i] = 1;*/

// check individual primes by trial division

sieve[idx] = 0;
int b = 2;
if (idx==0 || idx==1) sieve[idx] = 1;  // 0 and 1 are not prime
if (idx==2 || idx==3) sieve[idx] = 0;  // redundant (loop below never fires) but harmless
while (b*b <= idx)
{
if (idx%b == 0)
// Stop at the first divisor found: without the break every thread kept
// trial-dividing all the way to sqrt(idx) even after the answer was
// known.  (The revised version later in this thread does the same.)
{sieve[idx] = 1; break;}
b++;
}
}
}

// Finds the first CUDA-capable device, prints its capabilities, records the
// maximum x-dimension threads-per-block count in the global maxT, and selects
// that device.  Returns false (with a message on stderr) when no usable
// device exists.
bool InitCUDA(void)
{
int count = 0;
int i = 0;

cudaGetDeviceCount(&count);
if(count == 0) {
fprintf(stderr, "There is no device.\n");
return false;
}

for(i = 0; i < count; i++) {
cudaDeviceProp prop;
if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
if(prop.major >= 1) {
printf("Device %d supports CUDA %d.%d\n",i, prop.major, prop.minor);
printf("It has warp size %d, %d regs per block, %d threads per block\n",prop.warpSize, prop.regsPerBlock, prop.maxThreadsPerBlock);
printf("max Threads %d x %d x %d\n",prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
maxT = prop.maxThreadsDim[0];
printf("max Grid %d x %d x %d\n",prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
// totalConstMem is a size_t: %zu, not %d (undefined behavior on LP64)
printf("total constant memory %zu\n",prop.totalConstMem);
break;
}
}
}
if(i == count) {
fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
return false;
}
cudaSetDevice(i);
return true;
}

// Driver: finds the primes below N on the GPU in two stages.
//  1) Individual<<<>>> marks primality of everything below sqrt(N) by trial
//     division; the small primes are copied back to the host and packed.
//  2) Sieve<<<>>> crosses off multiples of each small prime in an N-entry
//     device array; entries never written (still holding the cudaMemset
//     fill pattern 0x33333333) are the primes below N.
int main(int argc, char** argv)
{
const int N = 100000000;
int Nsmall = sqrt(N);

int *device_small_sieve, *device_big_sieve;

if(!InitCUDA())
{
return 0;
}
CUDA_SAFE_CALL(cudaMalloc((void**) &device_small_sieve, sizeof(int) * Nsmall));
// Fill with byte 51 (0x33): any int the kernel never touches reads 0x33333333.
CUDA_SAFE_CALL(cudaMemset(device_small_sieve, 51, sizeof(int)*Nsmall));

unsigned int shandle;
cutCreateTimer(&shandle);
cutStartTimer(shandle);
// NOTE: grid and block arguments look transposed (maxT blocks of ~Nsmall/maxT
// threads) but the product still covers all Nsmall candidates, so the guard
// in the kernel keeps this correct.
Individual<<<maxT,(Nsmall+maxT-1)/maxT, 0>>> (device_small_sieve,Nsmall);
CUDA_SAFE_CALL(cudaThreadSynchronize());
cutStopTimer(shandle);
printf("%f milliseconds for small primes\n",cutGetTimerValue(shandle));

int* host_copy_of_smallsieve = (int*)malloc(Nsmall*sizeof(int));
if (host_copy_of_smallsieve == NULL) {fprintf(stderr, "host malloc failed\n"); exit(1);}
printf("%p\n",host_copy_of_smallsieve); fflush(stdout);
CUDA_SAFE_CALL(cudaMemcpy(host_copy_of_smallsieve,
device_small_sieve,
sizeof(int) * Nsmall,
cudaMemcpyDeviceToHost));
printf("%p\n",host_copy_of_smallsieve); fflush(stdout);

// OK. We've got an array with 0 at the small primes

// Sanity check: the kernel wrote every entry, so OR-ing all flags must
// give exactly 1 (some composite exists below Nsmall).
int np = 0, test = 0;
for (int k=0; k<Nsmall; k++) {test |= host_copy_of_smallsieve[k]; np += 1-host_copy_of_smallsieve[k];}
if (test != 1) {printf("Something impossible just happened\n\n"); exit(1);}
printf("%d small primes (< %d) found\n", np, Nsmall);
int* primes = (int*)malloc(np*sizeof(int));
if (primes == NULL) {fprintf(stderr, "host malloc failed\n"); exit(1);}
int pptr = 0;
for (int k=0; k<Nsmall; k++)
if (host_copy_of_smallsieve[k] == 0)
primes[pptr++]=k;

// except that we needed the primes on the device

int* device_primes;
CUDA_SAFE_CALL(cudaMalloc((void**) &device_primes, sizeof(int) * np));
CUDA_SAFE_CALL(cudaMemcpy(device_primes, primes, sizeof(int)*np, cudaMemcpyHostToDevice));

// now, set up the big array on the device

CUDA_SAFE_CALL(cudaMalloc((void**) &device_big_sieve, sizeof(int) * N));
CUDA_SAFE_CALL(cudaMemset(device_big_sieve, 51, sizeof(int)*N));

unsigned int thandle;
cutCreateTimer(&thandle);
cutStartTimer(thandle);
Sieve<<<maxT, (np+maxT-1)/maxT, 0>>>(device_big_sieve, device_primes, np, N);
CUDA_SAFE_CALL(cudaThreadSynchronize());
cutStopTimer(thandle);
printf("%f milliseconds for big sieve\n",cutGetTimerValue(thandle));

// 400 MB host buffer: check the allocation and the copy instead of
// silently counting garbage if either fails.
int* host_sieve = (int*)malloc(N*sizeof(int));
if (host_sieve == NULL) {fprintf(stderr, "host malloc failed\n"); exit(1);}
CUDA_SAFE_CALL(cudaMemcpy(host_sieve, device_big_sieve, sizeof(int) * N, cudaMemcpyDeviceToHost));

cudaFree(device_big_sieve);
cudaFree(device_small_sieve);
cudaFree(device_primes);

// Untouched entries still hold the 0x33333333 fill pattern => prime.
int nbig = 0;
for(int i=2;i < N;++i)
if (host_sieve[i] == 0x33333333) nbig++;
printf("%d big primes (< %d) found\n",nbig,N);

free(host_sieve);
free(primes);
free(host_copy_of_smallsieve);

return 0;
}
[/code]

RichD 2009-07-11 00:50

Integer divide
 
I saw on one of the nVidia forums an integer divide uses 140 clock cycles. It is better to use bit-wise shifts whenever possible. (Some compiler optimizations may do that for you.)

jasonp 2009-07-11 20:35

Integer division is not on the critical path for a properly written prime sieve, concurrent byte-wide memory access is.

ldesnogu 2009-07-12 17:28

I think Tom's program was a joke, in fact I'd be willing to bet a beer it was :smile:

fivemack 2009-07-12 18:54

It wasn't entirely a joke; I was wondering whether, in the finest traditions of Usenet where posting a bad answer is the best way to get a good answer, posting a lousy CUDA program would get me useful advice on how to write a less lousy one. It seems I posted too lousy a program, or am perhaps the only person with CUDA set up (and it was on a machine at work which has since been rebuilt ...) who reads the forum regularly.

I'm actually sieving in the Sieve<<<>>> kernel call; used trial division in the step to get the primes up to sqrt(N) because I found that it was quicker than sieving while you were dealing with very small primes. I'm well aware the algorithm I'm using is stupid, I tried a few techniques to make it less stupid which made things significantly worse.

xilman 2009-07-12 19:14

[QUOTE=fivemack;180728]or am perhaps the only person with CUDA set up (and it was on a machine at work which has since been rebuilt ...) who reads the forum regularly.[/QUOTE][aol]Me too!!?!!![/aol]

Paul

frmky 2009-07-12 21:14

[QUOTE=xilman;180731][aol]Me too!!?!!![/aol]

Paul[/QUOTE]

[aol]Me three!!! CUDA ROXXORS![/aol]

fivemack 2009-07-12 21:30

This may be a rude question, but have you actually written much code on your CUDA setups? Google finds me lots of people who have run CUFFT and mention how large the gigaflops numbers that come out are, a few people doing the same for linpack, and the developers of some Russian GIS software which uses CUDA for all the heavy lifting, but nothing which looks like a particularly vibrant CUDA-writing community.

There are lots of papers full of what after a while read like the same commonplaces on how to write good CUDA code, but I've only really seen the one nVidia conference presentation where they take slow code and demonstrate how it may be changed into fast code, and there are several steps there that I don't quite follow and several others where I can't see how they could be made relevant for sieving.

frmky 2009-07-12 22:15

I've written just a few simple programs. Besides the obvious things like have lots of threads to mask latencies and access global memory as little as possible, getting the best performance really seems to be trial and error. I was involved in the integration of distributed.net's RC5 CUDA core. I found that seemingly trivial changes like changing an always positive integer from signed to unsigned, and assigning blockDim.x, blockIdx.x, and threadIdx.x to an integer before doing ANY math made huge differences in the speed of the code. In fact, just the move to CUDA 2.2 has halved the speed of the RC5 code. I recently asked why on the [URL="http://forums.nvidia.com/index.php?showtopic=101641&pid=564237&mode=threaded&start=#entry564237"]nVidia forums[/URL] but haven't gotten any replies.

frmky 2009-07-13 07:34

OK, I'll bite. I'm testing on a 9600GSO. The card didn't have enough memory to run your code as written. I got an error when trying to allocate the big sieve array. I modified the code to use unsigned chars rather than ints and only store 0 or 1 instead of the prime itself. Running that version gave

[CODE]Device 0 supports CUDA 1.1
It has warp size 32, 8192 regs per block, 512 threads per block
max Threads 512 x 512 x 64
max Grid 65535 x 65535 x 1
total constant memory 65536
0.433000 milliseconds for small primes
0x2588530
0x2588530
1229 small primes (< 10000) found
4743.630859 milliseconds for big sieve
5761455 big primes (< 100000000) found
[/CODE]

I then added the simple optimization of excluding all evens except 2 from the sieve array. Each element i>0 refers to the odd number 2*i+1, treating element 0 as a special case. This simplifies the Sieve() routine and removes the longest running (by a bit more than a factor of 2) thread. The result is

[CODE]Device 0 supports CUDA 1.1
It has warp size 32, 8192 regs per block, 512 threads per block
max Threads 512 x 512 x 64
max Grid 65535 x 65535 x 1
total constant memory 65536
0.418000 milliseconds for small primes
0x8c3020
0x8c3020
1229 small primes (< 10000) found
1676.081055 milliseconds for big sieve
5761455 big primes (< 100000000) found
[/CODE]

At least this is reasonably on par with the more optimized C implementations at [URL="http://primes.utm.edu/links/programs/sieves/Eratosthenes/C_source_code/"]http://primes.utm.edu/links/programs/sieves/Eratosthenes/C_source_code/[/URL]

Here's the modified code:

[CODE]#include <stdio.h>
#include <cutil_inline.h>

typedef unsigned char u8;

int maxT = 0;

// Crosses off odd composites in a half-size sieve: element i>0 represents the
// odd number 2*i+1 (evens are not stored at all; the caller treats element 0
// as the prime 2).  One thread per small prime; thread 0 -- prime 2 -- is
// skipped by the guard because even numbers have no slot to mark.
__global__ static void Sieve(u8 * sieve,int * primes, int maxp, int sieve_size)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if ((idx>0) && (idx < maxp))
{
int ithPrime = primes[idx];
// First marked multiple is 3p, at index (3p-1)/2 == (3*p)>>1 for odd p.
// Stepping by p in index space steps by 2p in value, staying on odds.
for(int i=(3*ithPrime)>>1 ;i < sieve_size; i+=ithPrime)
// i = (ithPrime-1)/2 + ithPrime though the compiler knew this
sieve[i] = 1;
}
}

// Classifies every index below nloc as prime or composite by trial division,
// one thread per candidate.  Writes sieve[idx] = 0 for primes and 1 for
// composites; 0 and 1 are marked composite by convention.  Launched 1-D;
// the caller must supply at least nloc total threads.
__global__ static void Individual(u8 * sieve, int nloc)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < nloc)
{
// Abandoned sieving variant, kept for reference.
/* int myprime, delta;
if (idx == 0) {myprime=2;delta=2;}
if (idx == 1) {myprime=3;}
if (idx > 2) {myprime = 6*(idx/2) + 2*(idx%2) - 1;}
if (idx!=0) delta=2*myprime;
for (int i=myprime*myprime; i<sieve_size; i+=delta)
sieve[i] = 1;*/

// check individual primes by trial division

sieve[idx] = 0;
int b = 2;
if (idx==0 || idx==1) sieve[idx] = 1;  // 0 and 1 are not prime
if (idx==2 || idx==3) sieve[idx] = 0;  // redundant (loop below never fires) but harmless
while (b*b <= idx)
{
if (idx%b == 0)
// first divisor found: composite, stop early
{sieve[idx] = 1; break;}
b++;
}
}
}

// Finds the first CUDA-capable device, prints its capabilities, records the
// maximum x-dimension threads-per-block count in the global maxT, and selects
// that device.  Returns false (with a message on stderr) when no usable
// device exists.
bool InitCUDA(void)
{
int count = 0;
int i = 0;

cudaGetDeviceCount(&count);
if(count == 0) {
fprintf(stderr, "There is no device.\n");
return false;
}

for(i = 0; i < count; i++) {
cudaDeviceProp prop;
if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
if(prop.major >= 1) {
printf("Device %d supports CUDA %d.%d\n",i, prop.major, prop.minor);
printf("It has warp size %d, %d regs per block, %d threads per block\n",prop.warpSize, prop.regsPerBlock, prop.maxThreadsPerBlock);
printf("max Threads %d x %d x %d\n",prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
maxT = prop.maxThreadsDim[0];
printf("max Grid %d x %d x %d\n",prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
// totalConstMem is a size_t: %zu, not %d (undefined behavior on LP64)
printf("total constant memory %zu\n",prop.totalConstMem);
break;
}
}
}
if(i == count) {
fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
return false;
}
cudaSetDevice(i);
return true;
}

// Driver for the odds-only variant: finds the primes below N on the GPU.
//  1) Individual<<<>>> marks primality of everything below sqrt(N) by trial
//     division; the small primes are copied back to the host and packed.
//  2) Sieve<<<>>> crosses off odd composites in an N/2-entry device array
//     where element i>0 represents 2*i+1 (element 0 stands in for the
//     prime 2); entries still 0 afterwards are prime.
int main(int argc, char** argv)
{
const int N = 100000000;
int Nsmall = sqrt(N);

u8 *device_small_sieve, *device_big_sieve;

if(!InitCUDA())
{
return 0;
}
CUDA_SAFE_CALL(cudaMalloc((void**) &device_small_sieve, sizeof(u8) * Nsmall));
// Zero-fill is safe here: the Individual kernel writes every entry anyway.
CUDA_SAFE_CALL(cudaMemset(device_small_sieve, 0, sizeof(u8)*Nsmall));

unsigned int shandle;
cutCreateTimer(&shandle);
cutStartTimer(shandle);
// NOTE(review): grid/block arguments look transposed (maxT blocks of
// ~Nsmall/maxT threads) but the product still covers Nsmall candidates,
// so the in-kernel guard keeps this correct.
Individual<<<maxT,(Nsmall+maxT-1)/maxT, 0>>> (device_small_sieve,Nsmall);
CUDA_SAFE_CALL(cudaThreadSynchronize());
cutStopTimer(shandle);
printf("%f milliseconds for small primes\n",cutGetTimerValue(shandle));

u8* host_copy_of_smallsieve = (u8*)malloc(Nsmall*sizeof(u8));
printf("%p\n",host_copy_of_smallsieve); fflush(stdout);
CUDA_SAFE_CALL(cudaMemcpy(host_copy_of_smallsieve,
device_small_sieve,
sizeof(u8) * Nsmall,
cudaMemcpyDeviceToHost));
printf("%p\n",host_copy_of_smallsieve); fflush(stdout);

// OK. We've got an array with 0 at the small primes

// Sanity check: OR of all flags must be exactly 1, and each 0 entry is a
// small prime.
int np = 0, test = 0;
for (int k=0; k<Nsmall; k++) {test |= host_copy_of_smallsieve[k]; np += 1-host_copy_of_smallsieve[k];}
if (test != 1) {printf("Something impossible just happened\n\n"); exit(1);}
printf("%d small primes (< %d) found\n", np, Nsmall);
int* primes = (int*)malloc(np*sizeof(int));
int pptr = 0;
for (int k=0; k<Nsmall; k++)
if (host_copy_of_smallsieve[k] == 0)
primes[pptr++]=k;

// except that we needed the primes on the device

int* device_primes;
CUDA_SAFE_CALL(cudaMalloc((void**) &device_primes, sizeof(int) * np));
CUDA_SAFE_CALL(cudaMemcpy(device_primes, primes, sizeof(int)*np, cudaMemcpyHostToDevice));

// now, set up the big array on the device

// Half-size array: only odd numbers are represented (element i <-> 2*i+1).
CUDA_SAFE_CALL(cudaMalloc((void**) &device_big_sieve, sizeof(u8) * N/2));
CUDA_SAFE_CALL(cudaMemset(device_big_sieve, 0, sizeof(u8)*N/2));

unsigned int thandle;
cutCreateTimer(&thandle);
cutStartTimer(thandle);
Sieve<<<maxT, (np+maxT-1)/maxT, 0>>>(device_big_sieve, device_primes, np, N/2);
cudaThreadSynchronize();
cutStopTimer(thandle);
printf("%f milliseconds for big sieve\n",cutGetTimerValue(thandle));

u8* host_sieve = (u8*)malloc(N/2*sizeof(u8));
cudaMemcpy(host_sieve, device_big_sieve, sizeof(u8) * N/2, cudaMemcpyDeviceToHost);

cudaFree(device_big_sieve);
cudaFree(device_small_sieve);
cudaFree(device_primes);

// Count survivors; index 0 (never written by Sieve) stands in for the
// prime 2, so the total matches the full-sieve version's count.
int nbig = 0;
for(int i=0;i < N/2;++i)
if (host_sieve[i] == 0) {nbig++;} //printf("%d\n",(i==0)?2:2*i+1);}
printf("%d big primes (< %d) found\n",nbig,N);

return 0;
}
[/CODE]

xilman 2009-07-13 08:24

[QUOTE=fivemack;180742]This may be a rude question, but have you actually written much code on your CUDA setups?[/QUOTE]I've written some non-trivial crypto code for a Tesla. It runs roughly 50 times faster than it does on the cpu hosting the Tesla.

When I get some time I plan on porting Ernst's code for trial factoring of the larger MM numbers. Unfortunately, I'm too busy with Real Work(tm).


Paul


All times are UTC. The time now is 00:27.

Powered by vBulletin® Version 3.8.11
Copyright ©2000 - 2021, Jelsoft Enterprises Ltd.