for( int i = 0 ; i < SIZE ; i++ ){ a[i] = 1.0 ; }
cudaMemcpy(dev_a, a, SIZE * sizeof(float), cudaMemcpyHostToDevice ) ;
cudaEventCreate( &start ) ; cudaEventCreate( &stop ) ; cudaEventRecord( start, 0 ) ;
sumOfSquares_gpu0<<
for(int i = 0 ; i < BLOCK_NUM * THREAD_NUM ; i++ ){ sum += result[i] ; }
cudaEventRecord( stop, 0 ) ; cudaEventSynchronize( stop ) ;
cudaEventElapsedTime( &elapsedTime, start, stop ) ;
printf( \
printf( \
cudaEventDestroy( start ) ; cudaEventDestroy( stop ) ;
free( a ) ;
cudaFree( dev_a ) ;
cudaFree( dev_result ) ;
return 0 ; } 2、#include
#include
// [extraction residue: page number 11]
// Problem size and launch geometry shared by the kernel and the host code.
// Each macro needs its own line: a preprocessor directive extends to the end of
// the line, so stacking several #define directives on one line makes the first
// macro swallow the rest as its replacement text and leaves the others undefined.
#define SIZE 1048576     // number of float elements in the input array
#define BLOCK_NUM 32     // number of per-block partial results the host allocates and sums
#define THREAD_NUM 256   // threads per block; also the length of the kernel's shared array
// Per-block sum of squares.
//
// Every thread walks the input with a stride of (gridDim.x * blockDim.x),
// accumulating a[i] * a[i] in a register, then publishes its partial sum to
// shared memory. After the barrier, thread 0 of each block adds the block's
// partial sums in index order and writes the total to result[blockIdx.x].
//
// Parameters:
//   a      - device pointer to n input floats
//   n      - number of elements in a
//   result - device pointer to at least gridDim.x floats (one per block)
//
// Launch requirements: 1-D grid and block, blockDim.x <= THREAD_NUM (the static
// shared array has THREAD_NUM slots). The index and stride come from the actual
// launch geometry rather than the BLOCK_NUM / THREAD_NUM macros, so any launch
// that satisfies this covers each element exactly once.
__global__ void sumOfSquares_gpu0( float *a, int n, float *result ){
    __shared__ float shared[THREAD_NUM];

    const int tid     = threadIdx.x;
    const int bid     = blockIdx.x;
    const int threads = blockDim.x;
    const int stride  = gridDim.x * threads;

    // Accumulate in a register; shared memory is written once per thread.
    float partial = 0.0f;
    for( int i = bid * threads + tid ; i < n ; i += stride ){
        partial += a[i] * a[i];
    }
    shared[tid] = partial;

    // All partial sums must be visible before thread 0 reads them.
    __syncthreads();

    if( tid == 0 ){
        // Only the slots that were actually written (0 .. threads-1) are read.
        float blockSum = shared[0];
        for( int i = 1 ; i < threads ; i++ ){
            blockSum += shared[i];
        }
        result[bid] = blockSum;
    }
}
// Host driver for the sumOfSquares_gpu0 variant that produces one partial sum
// per block: allocates host and device buffers, sets every a[i] to 1.0, uploads
// the input, copies BLOCK_NUM partial results back and adds them up in a double.
// NOTE(review): the kernel-launch statement and both printf calls below are cut
// off in this copy of the file (looks like document-extraction damage), so the
// function does not compile as written - restore them from the original listing.
int main( int argc, char **argv ) {
cudaEvent_t start, stop ; float elapsedTime ;
float *result, *dev_result, *a, *dev_a ;
// Device buffers: BLOCK_NUM floats for the results, SIZE floats for the input.
cudaMalloc( (void**)&dev_result, BLOCK_NUM * sizeof(float) ) ; cudaMalloc( (void**)&dev_a, SIZE * sizeof(float) ) ;
// Host-side counterparts of the two device buffers.
a = (float*)malloc( SIZE * sizeof( float ) ) ;
result = (float*)malloc( BLOCK_NUM * sizeof( float ) ) ;
for( int i = 0 ; i < SIZE ; i++ ){ a[i] = 1.0 ; }
// NOTE(review): the bare "12" below is not code; presumably a page number left
// behind by extraction - confirm and remove.
12
cudaMemcpy(dev_a, a, SIZE * sizeof(float), cudaMemcpyHostToDevice ) ;
// Create both timing events and record the start event on stream 0.
cudaEventCreate( &start ) ; cudaEventCreate( &stop ) ; cudaEventRecord( start, 0 ) ;
// NOTE(review): the launch is truncated after "<<"; the launch configuration and
// the argument list are missing.
sumOfSquares_gpu0<<
// Copy the BLOCK_NUM per-block results to the host and accumulate them.
cudaMemcpy(result, dev_result, BLOCK_NUM * sizeof(float), cudaMemcpyDeviceToHost ) ; double sum = 0.0 ;
for(int i = 0 ; i < BLOCK_NUM ; i++ ){ sum += result[i] ; }
// The stop event is recorded after the copy-back and the host loop, so the
// measured interval covers those steps as well as the kernel.
cudaEventRecord( stop, 0 ) ; cudaEventSynchronize( stop ) ;
// NOTE(review): both printf calls on the next line are truncated right after
// the opening parenthesis; their format strings and arguments are missing.
cudaEventElapsedTime( &elapsedTime, start, stop ) ; printf( \ printf( \ cudaEventDestroy( start ) ; cudaEventDestroy( stop ) ;
free( a ) ;
cudaFree( dev_a ) ;
cudaFree( dev_result ) ;
// NOTE(review): `result` is malloc'ed above but no matching free( result ) is visible.
// NOTE(review): the text after the closing brace appears to be the heading and
// first include of the next listing, with the header name missing.
return 0 ; } 1、#include
#include
// Single-block sum of squares with one partial sum per thread.
//
// Thread t accumulates a[i] * a[i] over i = t, t + blockDim.x, t + 2*blockDim.x, ...
// while i < n, and stores its total in result[t]. The caller adds up the
// blockDim.x partial sums.
//
// Parameters:
//   a      - device pointer to n input floats
//   n      - number of elements in a
//   result - device pointer to at least blockDim.x floats
//
// The stride is the actual block size rather than the THREAD_NUM macro, so the
// input is fully covered for any <<<1, T>>> launch. The index ignores blockIdx,
// so launching more than one block only repeats the same work.
__global__ void sumOfSquares_gpu0( float *a, int n, float *result ){
    const int tid    = threadIdx.x;
    const int stride = blockDim.x;

    float sum = 0.0f;
    for( int i = tid ; i < n ; i += stride ){
        sum += a[i] * a[i];
    }
    result[tid] = sum;
}
// [extraction residue: page number 13]
// Host driver for the single-block sumOfSquares_gpu0 variant: allocates host and
// device buffers, sets every a[i] to 1.0, uploads the input, launches one block
// of THREAD_NUM threads, copies the THREAD_NUM per-thread partial sums back and
// adds them up in a double on the host.
// NOTE(review): both printf calls below are cut off in this copy of the file
// (looks like document-extraction damage), so the function does not compile as
// written - restore them from the original listing.
int main( int argc, char **argv ) {
cudaEvent_t start, stop ; float elapsedTime ;
float *result, *dev_result, *a, *dev_a ;
// Device buffers: THREAD_NUM floats for the results, SIZE floats for the input.
cudaMalloc( (void**)&dev_result, THREAD_NUM * sizeof(float) ) ; cudaMalloc( (void**)&dev_a, SIZE * sizeof(float) ) ;
// Host-side counterparts of the two device buffers.
a = (float*)malloc( SIZE * sizeof( float ) ) ;
result = (float*)malloc( THREAD_NUM * sizeof( float ) ) ;
for( int i = 0 ; i < SIZE ; i++ ){ a[i] = 1.0 ; }
cudaMemcpy(dev_a, a, SIZE * sizeof(float), cudaMemcpyHostToDevice ) ;
// Create both timing events and record the start event on stream 0.
cudaEventCreate( &start ) ; cudaEventCreate( &stop ) ; cudaEventRecord( start, 0 ) ;
// Launch with a grid of 1 block and THREAD_NUM threads, then copy the per-thread results to the host.
sumOfSquares_gpu0<<<1,THREAD_NUM>>>( dev_a, SIZE, dev_result ) ; cudaMemcpy(result, dev_result, THREAD_NUM * sizeof(float), cudaMemcpyDeviceToHost ) ; double sum = 0.0 ;
for(int i = 0 ; i < THREAD_NUM ; i++ ){ sum += result[i] ; }
// The stop event is recorded after the copy-back and the host loop, so the
// measured interval covers those steps as well as the kernel.
cudaEventRecord( stop, 0 ) ; cudaEventSynchronize( stop ) ;
// NOTE(review): both printf calls on the next line are truncated right after
// the opening parenthesis; their format strings and arguments are missing.
cudaEventElapsedTime( &elapsedTime, start, stop ) ; printf( \ printf( \ cudaEventDestroy( start ) ; cudaEventDestroy( stop ) ; free( a ) ;
cudaFree( dev_a ) ;
// NOTE(review): `result` is malloc'ed above but no matching free( result ) is visible.
// NOTE(review): the text after the closing brace appears to be the heading and
// first include of the next listing, with the header name missing.
cudaFree( dev_result ) ; return 0 ; } 4、#include
// [extraction residue: page number 14]
#include
// Problem size and launch geometry shared by the kernel and the host code.
// Each macro needs its own line: a preprocessor directive extends to the end of
// the line, so stacking several #define directives on one line makes the first
// macro swallow the rest as its replacement text and leaves the others undefined.
#define SIZE 1048576     // number of float elements in the input array
#define BLOCK_NUM 32     // number of per-block partial results the host allocates and sums
#define THREAD_NUM 256   // threads per block; also the length of the kernel's shared array
// Per-block sum of squares with an in-block tree reduction.
//
// Every thread walks the input with a stride of (gridDim.x * blockDim.x),
// accumulating a[i] * a[i] in a register, then publishes its partial sum to
// shared memory. The block then repeatedly folds the upper half of the live
// range onto the lower half until one value remains, and thread 0 writes it
// to result[blockIdx.x].
//
// Parameters:
//   a      - device pointer to n input floats
//   n      - number of elements in a
//   result - device pointer to at least gridDim.x floats (one per block)
//
// Launch requirements: 1-D grid and block, blockDim.x <= THREAD_NUM (the static
// shared array has THREAD_NUM slots). Indexing uses the actual launch geometry,
// and the fold rounds the half-width up, so block sizes that are not a power of
// two are reduced correctly. For power-of-two sizes the pairing is the usual
// halving scheme.
__global__ void sumOfSquares_gpu0( float *a, int n, float *result ){
    __shared__ float shared[THREAD_NUM];

    const int tid     = threadIdx.x;
    const int bid     = blockIdx.x;
    const int threads = blockDim.x;
    const int stride  = gridDim.x * threads;

    // Accumulate in a register; shared memory is written once per thread.
    float partial = 0.0f;
    for( int i = bid * threads + tid ; i < n ; i += stride ){
        partial += a[i] * a[i];
    }
    shared[tid] = partial;
    __syncthreads();

    // `active` is the number of live partial sums in shared[0 .. active-1].
    // Slots [half, active) are folded onto [0, active-half). With an odd
    // `active`, the middle slot has no partner and is carried to the next round.
    for( int active = threads ; active > 1 ; ){
        const int half = ( active + 1 ) / 2;
        if( tid + half < active ){
            shared[tid] += shared[tid + half];
        }
        // Barrier is outside the branch so every thread in the block reaches it.
        __syncthreads();
        active = half;
    }

    if( tid == 0 ){
        result[bid] = shared[0];
    }
}
// Host driver for the tree-reduction sumOfSquares_gpu0 variant: allocates host
// and device buffers, sets every a[i] to 1.0, uploads the input and adds up
// BLOCK_NUM entries of `result` on the host.
// NOTE(review): the kernel-launch statement and both printf calls below are cut
// off in this copy of the file (looks like document-extraction damage), so the
// function does not compile as written - restore them from the original listing.
int main( int argc, char **argv ) {
cudaEvent_t start, stop ; float elapsedTime ;
float *result, *dev_result, *a, *dev_a ;
// Device buffers: BLOCK_NUM floats for the results, SIZE floats for the input.
cudaMalloc( (void**)&dev_result, BLOCK_NUM * sizeof(float) ) ; cudaMalloc( (void**)&dev_a, SIZE * sizeof(float) ) ;
// Host-side counterparts of the two device buffers.
a = (float*)malloc( SIZE * sizeof( float ) ) ;
result = (float*)malloc( BLOCK_NUM * sizeof( float ) ) ;
// NOTE(review): the bare "15" below is not code; presumably a page number left
// behind by extraction - confirm and remove.
15
for( int i = 0 ; i < SIZE ; i++ ){ a[i] = 1.0 ; }
// Upload the input, create both timing events and record the start event on stream 0.
cudaMemcpy(dev_a, a, SIZE * sizeof(float), cudaMemcpyHostToDevice ) ; cudaEventCreate( &start ) ; cudaEventCreate( &stop ) ; cudaEventRecord( start, 0 ) ;
// NOTE(review): the launch is truncated after "<<". No device-to-host copy into
// `result` and no declaration of `sum` are visible before the loop below;
// presumably they were on the lost part of this line - confirm against the original.
sumOfSquares_gpu0<<
for(int i = 0 ; i < BLOCK_NUM ; i++ ){ sum += result[i] ; }
// The stop event is recorded after the host loop, so the measured interval
// covers it as well.
cudaEventRecord( stop, 0 ) ; cudaEventSynchronize( stop ) ;
// NOTE(review): both printf calls on the next line are truncated right after
// the opening parenthesis; their format strings and arguments are missing.
cudaEventElapsedTime( &elapsedTime, start, stop ) ; printf( \ printf( \ cudaEventDestroy( start ) ; cudaEventDestroy( stop ) ; free( a ) ;
cudaFree( dev_a ) ;
// NOTE(review): `result` is malloc'ed above but no matching free( result ) is visible.
cudaFree( dev_result ) ; return 0 ; }
// [extraction residue: page number 16]