Skip to content
This repository has been archived by the owner on Oct 19, 2020. It is now read-only.

Commit

Permalink
Browse files Browse the repository at this point in the history
Working reduction.
  • Loading branch information
Andreas Kloeckner committed Jul 30, 2009
1 parent 3a71fa9 commit 57eff9f
Show file tree
Hide file tree
Showing 8 changed files with 292 additions and 16 deletions.
2 changes: 2 additions & 0 deletions example/.gitignore
@@ -0,0 +1,2 @@
multiply_matrix
dot_product
6 changes: 3 additions & 3 deletions example/CMakeLists.txt
@@ -1,5 +1,5 @@
add_executable(multiply_matrix
multiply_matrix.cpp)

add_executable(multiply_matrix multiply_matrix.cpp)
TARGET_LINK_LIBRARIES(multiply_matrix iterativecuda)

add_executable(dot_product dot_product.cpp)
TARGET_LINK_LIBRARIES(dot_product iterativecuda)
69 changes: 69 additions & 0 deletions example/dot_product.cpp
@@ -0,0 +1,69 @@
/*
Iterative CUDA is licensed to you under the MIT/X Consortium license:
Copyright (c) 2009 Andreas Kloeckner.
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the Software), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/




#include <iterative-cuda.hpp>
#include <iostream>
#include <cstdlib>




using namespace iterative_cuda;

int main(int argc, char **argv)
{
typedef float value_type;

const unsigned n = 20000000;
value_type *x = new value_type[n];
value_type *y = new value_type[n];

for (int i = 0; i < n; ++i)
{
x[i] = drand48();
y[i] = drand48();
}

double ref_dot = 0;

for (int i = 0; i < n; ++i)
ref_dot += double(x[i])*double(y[i]);

typedef gpu_vector<value_type> vec_t;
vec_t x_gpu(x, n);
vec_t y_gpu(y, n);

std::auto_ptr<vec_t> result(x_gpu.dot(y_gpu));

value_type gpu_dot;
result->to_cpu(&gpu_dot);

std::cerr << (ref_dot-gpu_dot)/ref_dot << std::endl;

return 0;
}

Binary file removed example/multiply_matrix
Binary file not shown.
23 changes: 10 additions & 13 deletions example/multiply_matrix.cpp
Expand Up @@ -41,18 +41,18 @@ int main(int argc, char **argv)
std::cerr << "usage: " << argv[0] << " matrix.mtx" << std::endl;
return 1;
}
typedef float entry_type;
typedef cpu_sparse_csr_matrix<entry_type> cpu_mat_type;
typedef gpu_sparse_pkt_matrix<entry_type> gpu_mat_type;
typedef float value_type;
typedef cpu_sparse_csr_matrix<value_type> cpu_mat_type;
typedef gpu_sparse_pkt_matrix<value_type> gpu_mat_type;
std::auto_ptr<cpu_mat_type> cpu_mat(
cpu_mat_type::read_matrix_market_file(argv[1]));

gpu_mat_type gpu_mat(*cpu_mat);

// build host vectors
entry_type *x = new entry_type[gpu_mat.column_count()];
entry_type *y1 = new entry_type[gpu_mat.row_count()];
entry_type *y2 = new entry_type[gpu_mat.row_count()];
value_type *x = new value_type[gpu_mat.column_count()];
value_type *y1 = new value_type[gpu_mat.row_count()];
value_type *y2 = new value_type[gpu_mat.row_count()];

for (int i = 0; i < gpu_mat.column_count(); ++i)
x[i] = drand48();
Expand All @@ -63,11 +63,8 @@ int main(int argc, char **argv)
}

// do gpu matrix multiply
gpu_vector<entry_type> x_gpu(gpu_mat.column_count());
gpu_vector<entry_type> y_gpu(gpu_mat.row_count());

x_gpu.from_cpu(x);
y_gpu.from_cpu(y2);
gpu_vector<value_type> x_gpu(x, gpu_mat.column_count());
gpu_vector<value_type> y_gpu(y2, gpu_mat.row_count());

gpu_mat(y_gpu, x_gpu);

Expand All @@ -77,8 +74,8 @@ int main(int argc, char **argv)
// compute error
(*cpu_mat)(y1, x);

entry_type error = 0;
entry_type norm = 0;
value_type error = 0;
value_type norm = 0;

for (int i = 0; i < gpu_mat.row_count(); ++i)
{
Expand Down
3 changes: 3 additions & 0 deletions include/iterative-cuda.hpp
Expand Up @@ -68,6 +68,7 @@ namespace iterative_cuda

public:
gpu_vector(index_type size);
gpu_vector(value_type *cpu, index_type size);
gpu_vector(gpu_vector const &src);
~gpu_vector();

Expand All @@ -78,6 +79,8 @@ namespace iterative_cuda

value_type *ptr();
const value_type *ptr() const;

gpu_vector *dot(gpu_vector const &b) const;
};


Expand Down
22 changes: 22 additions & 0 deletions src/gpu-vector.hpp
Expand Up @@ -32,6 +32,7 @@ SOFTWARE.

#include <iterative-cuda.hpp>
#include "helpers.hpp"
#include "reduction.hpp"



Expand Down Expand Up @@ -59,6 +60,18 @@ namespace iterative_cuda



template <typename VT, typename IT>
gpu_vector<VT, IT>::gpu_vector(value_type *cpu, index_type size)
: pimpl(new gpu_vector_pimpl<VT, IT>)
{
ICUDA_CHK(cudaMalloc, ((void **) &pimpl->gpu_data, size*sizeof(value_type)));
pimpl->size = size;
from_cpu(cpu);
}




template <typename VT, typename IT>
gpu_vector<VT, IT>::gpu_vector(gpu_vector const &src)
: pimpl(new gpu_vector_pimpl<VT, IT>)
Expand Down Expand Up @@ -121,6 +134,15 @@ namespace iterative_cuda
template <typename VT, typename IT>
const gpu_vector<VT, IT>::value_type *gpu_vector<VT, IT>::ptr() const
{ return pimpl->gpu_data; }




template <typename VT, typename IT>
gpu_vector<VT, IT> *gpu_vector<VT, IT>::dot(gpu_vector const &b) const
{
return inner_product(*this, b);
}
}


Expand Down
183 changes: 183 additions & 0 deletions src/reduction.hpp
@@ -0,0 +1,183 @@
/*
Iterative CUDA is licensed to you under the MIT/X Consortium license:
Copyright (c) 2009 Andreas Kloeckner.
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the Software), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Based on code by Mark Harris at Nvidia.
*/




#ifndef _AAFADFJ_ITERATIVE_CUDA_REDUCTION_HPP_SEEN
#define _AAFADFJ_ITERATIVE_CUDA_REDUCTION_HPP_SEEN




#include <memory>




namespace iterative_cuda
{
#define MAKE_REDUCTION_KERNEL(KERNEL_NAME, ARG_DECL, READ_AND_MAP, REDUCE) \
template <class ValueType, unsigned BlockSize> \
__global__ void KERNEL_NAME##_kernel(ValueType *out, \
ARG_DECL, unsigned int seq_count, unsigned int n) \
{ \
__shared__ ValueType sdata[BlockSize]; \
\
unsigned int tid = threadIdx.x; \
unsigned int i = blockIdx.x*BlockSize*seq_count + tid; \
\
ValueType acc = 0; \
for (unsigned s = 0; s < seq_count; ++s) \
{ \
if (i >= n) \
break; \
acc = REDUCE(acc, READ_AND_MAP(i)); \
\
i += BlockSize; \
} \
\
sdata[tid] = acc; \
\
__syncthreads(); \
\
if (BlockSize >= 512) \
{ \
if (tid < 256) { sdata[tid] = REDUCE(sdata[tid], sdata[tid + 256]); } \
__syncthreads(); \
} \
\
if (BlockSize >= 256) \
{ \
if (tid < 128) { sdata[tid] = REDUCE(sdata[tid], sdata[tid + 128]); } \
__syncthreads(); \
} \
\
if (BlockSize >= 128) \
{ \
if (tid < 64) { sdata[tid] = REDUCE(sdata[tid], sdata[tid + 64]); } \
__syncthreads(); \
} \
\
if (tid < 32) \
{ \
if (BlockSize >= 64) sdata[tid] = REDUCE(sdata[tid], sdata[tid + 32]); \
if (BlockSize >= 32) sdata[tid] = REDUCE(sdata[tid], sdata[tid + 16]); \
if (BlockSize >= 16) sdata[tid] = REDUCE(sdata[tid], sdata[tid + 8]); \
if (BlockSize >= 8) sdata[tid] = REDUCE(sdata[tid], sdata[tid + 4]); \
if (BlockSize >= 4) sdata[tid] = REDUCE(sdata[tid], sdata[tid + 2]); \
if (BlockSize >= 2) sdata[tid] = REDUCE(sdata[tid], sdata[tid + 1]); \
} \
\
if (tid == 0) out[blockIdx.x] = sdata[0]; \
} \




#define INNER_PRODUCT_STAGE1_ARGS ValueType const *a, ValueType const *b
#define INNER_PRODUCT_READ_AND_MAP(i) a[i]*b[i]
#define INNER_PRODUCT_STAGE2_ARGS ValueType const *a
#define STRAIGHT_READ(i) a[i]
#define SUM_REDUCE(a, b) a+b

MAKE_REDUCTION_KERNEL(inner_product_stage1,
INNER_PRODUCT_STAGE1_ARGS,
INNER_PRODUCT_READ_AND_MAP, SUM_REDUCE);
MAKE_REDUCTION_KERNEL(inner_product_stage2,
INNER_PRODUCT_STAGE2_ARGS,
STRAIGHT_READ, SUM_REDUCE);




template <class VT, class IT>
gpu_vector<VT, IT> *inner_product(
gpu_vector<VT, IT> const &a,
gpu_vector<VT, IT> const &b)
{
const unsigned BLOCK_SIZE = 512;
const unsigned MAX_BLOCK_COUNT = 1024;
const unsigned SMALL_SEQ_COUNT = 4;

typedef gpu_vector<VT, IT> vec_t;

std::auto_ptr<vec_t> result;

while (true)
{
unsigned sz;
if (!result.get())
// stage 1
sz = a.size();
else
// stage 2..n
sz = result->size();

unsigned block_count, seq_count;
if (sz <= BLOCK_SIZE*SMALL_SEQ_COUNT*MAX_BLOCK_COUNT)
{
unsigned total_block_size = SMALL_SEQ_COUNT*BLOCK_SIZE;
block_count = (sz + total_block_size - 1) / total_block_size;
seq_count = SMALL_SEQ_COUNT;
}
else
{
block_count = MAX_BLOCK_COUNT;
unsigned macroblock_size = block_count*BLOCK_SIZE;
seq_count = (sz + macroblock_size - 1) / macroblock_size;
}

if (!result.get())
{
// stage 1
result = std::auto_ptr<vec_t>(new vec_t(block_count));

inner_product_stage1_kernel<VT, BLOCK_SIZE>
<<<block_count, BLOCK_SIZE>>>(result->ptr(), a.ptr(), b.ptr(),
seq_count, sz);
}
else
{
// stage 2..n
std::auto_ptr<vec_t> new_result(new vec_t(block_count));

inner_product_stage2_kernel<VT, BLOCK_SIZE>
<<<block_count, BLOCK_SIZE>>>(new_result->ptr(), result->ptr(),
seq_count, sz);

result = new_result;
}
if (block_count == 1)
return result.release();
}
}
}




#endif

0 comments on commit 57eff9f

Please sign in to comment.