for( int i = 0 ; i < SIZE ; i++ ){ a[i] = 1.0 ; }
cudaMemcpy(dev_a, a, SIZE * sizeof(float), cudaMemcpyHostToDevice ) ; cudaEventCreate( &start ) ; cudaEventCreate( &stop ) ; cudaEventRecord( start, 0 ) ;
sumOfSquares_gpu0<<
for(int i = 0 ; i < BLOCK_NUM ; i++ ){ sum += result[i] ; }
cudaEventRecord( stop, 0 ) ; cudaEventSynchronize( stop ) ;
cudaEventElapsedTime( &elapsedTime, start, stop ) ; printf( \ printf( \ cudaEventDestroy( start ) ; cudaEventDestroy( stop ) ; free( a ) ;
cudaFree( dev_a ) ;
cudaFree( dev_result ) ; return 0 ; }
16