Skip to content
This repository has been archived by the owner on Oct 19, 2020. It is now read-only.

Commit

Permalink
Browse files Browse the repository at this point in the history
CG appears to work.
  • Loading branch information
Andreas Kloeckner committed Jul 31, 2009
1 parent 9e03ba8 commit b80d6d6
Show file tree
Hide file tree
Showing 9 changed files with 181 additions and 36 deletions.
1 change: 1 addition & 0 deletions example/.gitignore
@@ -1,2 +1,3 @@
multiply_matrix
dot_product
solve
7 changes: 5 additions & 2 deletions example/CMakeLists.txt
@@ -1,5 +1,8 @@
add_executable(multiply_matrix multiply_matrix.cpp)
TARGET_LINK_LIBRARIES(multiply_matrix iterativecuda)
target_link_libraries(multiply_matrix iterativecuda)

add_executable(dot_product dot_product.cpp)
TARGET_LINK_LIBRARIES(dot_product iterativecuda)
target_link_libraries(dot_product iterativecuda)

add_executable(solve solve.cpp)
target_link_libraries(solve iterativecuda)
100 changes: 100 additions & 0 deletions example/solve.cpp
@@ -0,0 +1,100 @@
/*
Iterative CUDA is licensed to you under the MIT/X Consortium license:
Copyright (c) 2009 Andreas Kloeckner.
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the Software), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/




#include <iterative-cuda.hpp>
#include <iostream>
#include <cstdlib>




using namespace iterative_cuda;

int main(int argc, char **argv)
{
if (argc != 2)
{
std::cerr << "usage: " << argv[0] << " matrix.mtx" << std::endl;
return 1;
}
typedef float value_type;
typedef cpu_sparse_csr_matrix<value_type> cpu_mat_type;
typedef gpu_sparse_pkt_matrix<value_type> gpu_mat_type;
std::auto_ptr<cpu_mat_type> cpu_mat(
cpu_mat_type::read_matrix_market_file(argv[1]));

gpu_mat_type gpu_mat(*cpu_mat);

// build host vectors
unsigned n = gpu_mat.row_count();
value_type *b = new value_type[n];
value_type *x = new value_type[n];
value_type *b2 = new value_type[n];

for (int i = 0; i < gpu_mat.column_count(); ++i)
{
b[i] = drand48();
}

// transfer vectors to gpu
typedef gpu_vector<value_type> vec_t;
vec_t b_gpu(b, n);
vec_t x_gpu(n);
vec_t b2_gpu(n);

x_gpu.fill(0);
b2_gpu.fill(0);

std::cout << "begin solve" << std::endl;

value_type tol;
if (sizeof(value_type) < 8) // ick
tol = 1e-4;
else
tol = 1e-8;

unsigned it_count;
gpu_cg(gpu_mat,
identity_preconditioner<vec_t>(),
x_gpu, b_gpu, tol, /*max_iterations*/ 0, &it_count);

gpu_mat(b2_gpu, x_gpu);

vec_t residual_gpu(n);
residual_gpu.set_to_linear_combination(1, b2_gpu, -1, b_gpu);

std::auto_ptr<vec_t> res_norm_gpu(residual_gpu.dot(residual_gpu));
value_type res_norm;
res_norm_gpu->to_cpu(&res_norm);

std::cout << "residual norm " << res_norm << std::endl;
std::cout << "iterations " << it_count << std::endl;

return 0;
}


13 changes: 8 additions & 5 deletions include/iterative-cuda.hpp
Expand Up @@ -67,7 +67,7 @@ namespace iterative_cuda
std::auto_ptr<gpu_vector_pimpl<value_type, index_type> > pimpl;

public:
gpu_vector(index_type size);
explicit gpu_vector(index_type size);
gpu_vector(value_type *cpu, index_type size);
gpu_vector(gpu_vector const &src);
~gpu_vector();
Expand All @@ -82,6 +82,7 @@ namespace iterative_cuda
value_type *ptr();
const value_type *ptr() const;

void fill(value_type x);
void set_to_linear_combination(
value_type a,
gpu_vector const &x,
Expand Down Expand Up @@ -188,7 +189,7 @@ namespace iterative_cuda
public:
typedef GpuVector gpu_vector_type;

void operator()(gpu_vector_type &result, gpu_vector_type const &op)
void operator()(gpu_vector_type &result, gpu_vector_type const &op) const
{
result = op;
}
Expand All @@ -214,7 +215,7 @@ namespace iterative_cuda
// keeps a reference to vec
diagonal_preconditioner(gpu_vector_type const &vec);

void operator()(gpu_vector_type &result, gpu_vector_type const &op);
void operator()(gpu_vector_type &result, gpu_vector_type const &op) const;
};


Expand All @@ -226,10 +227,12 @@ namespace iterative_cuda
template <typename ValueType, typename IndexType, typename Operator, typename Preconditioner>
void gpu_cg(
const Operator &a,
gpu_vector<ValueType, IndexType> const &x,
const Preconditioner &m_inv,
gpu_vector<ValueType, IndexType> &x,
gpu_vector<ValueType, IndexType> const &b,
ValueType tol=1e-8,
const Preconditioner *m_inv=0);
unsigned max_iterations=0,
unsigned *itcount=0);
}


Expand Down
27 changes: 16 additions & 11 deletions src/cg.hpp
Expand Up @@ -41,12 +41,12 @@ namespace iterative_cuda
template <typename ValueType, typename IndexType, typename Operator, typename Preconditioner>
inline void gpu_cg(
const Operator &a,
const Preconditioner m_inv,
gpu_vector<ValueType, IndexType> const &x,
const Preconditioner &m_inv,
gpu_vector<ValueType, IndexType> &x,
gpu_vector<ValueType, IndexType> const &b,
ValueType tol=1e-8,
unsigned max_iterations=0
)
ValueType tol,
unsigned max_iterations,
unsigned *it_count)
{
typedef gpu_vector<ValueType, IndexType> vector_t;
typedef vector_t gpu_scalar_t; // such is life. :/
Expand All @@ -59,7 +59,7 @@ namespace iterative_cuda

std::auto_ptr<vector_t> norm_b_squared_gpu(b.dot(b));
scalar_t norm_b_squared;
norm_b_squared_gpu.to_cpu(&norm_b_squared);
norm_b_squared_gpu->to_cpu(&norm_b_squared);

if (norm_b_squared == 0)
{
Expand All @@ -78,21 +78,22 @@ namespace iterative_cuda
unsigned iterations = 0;
vector_t ax(n);
vector_t residual(n);
ax.fill(0);
a(ax, x);
residual.set_to_linear_combination(1, b, -1, ax);

vector_t d(n);
m_inv(d, residual);

std::auto_ptr<gpu_scalar_t> delta_new(
residual.dot(d));
std::auto_ptr<gpu_scalar_t> delta_new(residual.dot(d));

scalar_t delta_0;
delta_new->to_cpu(&delta_0);

while (iterations < max_iterations)
{
vector_t q(n);
q.fill(0);
a(q, d);

gpu_scalar_t alpha(1);
Expand All @@ -104,6 +105,7 @@ namespace iterative_cuda

if (calculate_real_residual)
{
ax.fill(0);
a(ax, x);
residual.set_to_linear_combination(1, b, -1, ax);
}
Expand All @@ -114,16 +116,16 @@ namespace iterative_cuda
m_inv(s, residual);

std::auto_ptr<gpu_scalar_t> delta_old(delta_new);
delta_new = std::auto_ptr<gpu_scalar_t>(
residual.dot(s));
delta_new = std::auto_ptr<gpu_scalar_t>(residual.dot(s));

if (calculate_real_residual)
{
// Only terminate the loop on the basis of a "real" residual.

scalar_t delta_new_host;
delta_new->to_cpu(&delta_new_host);
if (std::abs(delta_new) < tol*tol * std::abs(delta_0))
std::cout << delta_new_host << std::endl;
if (std::abs(delta_new_host) < tol*tol * std::abs(delta_0))
break;
}

Expand All @@ -137,6 +139,9 @@ namespace iterative_cuda

if (iterations == max_iterations)
throw std::runtime_error("cg failed to converge");

if (it_count)
*it_count = iterations;
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/diag-preconditioner.hpp
Expand Up @@ -61,7 +61,7 @@ namespace iterative_cuda

template <class GpuVector>
inline void diagonal_preconditioner<GpuVector>::operator()(
gpu_vector_type &result, gpu_vector_type const &op)
gpu_vector_type &result, gpu_vector_type const &op) const
{
multiply(result, op, *pimpl->vec);
}
Expand Down
27 changes: 27 additions & 0 deletions src/elementwise.hpp
Expand Up @@ -225,6 +225,33 @@ namespace iterative_cuda
divide_kernel<VT><<<grid, block>>>(
z.ptr(), x.ptr(), y.ptr(), x.size());
}




template <class ValueType>
__global__ void fill_kernel(
ValueType *z, ValueType x, unsigned n)
{
unsigned tid = threadIdx.x;
unsigned total_threads = gridDim.x*blockDim.x;
unsigned cta_start = blockDim.x*blockIdx.x;
unsigned i;

for (i = cta_start + tid; i < n; i += total_threads)
z[i] = x;
}




template <class VT, class IT>
void fill(gpu_vector<VT, IT> &z, VT x)
{
dim3 grid, block;
splay(z.size(), grid, block);
fill_kernel<VT><<<grid, block>>>(z.ptr(), x, z.size());
}
}


Expand Down
9 changes: 9 additions & 0 deletions src/gpu-vector.hpp
Expand Up @@ -198,6 +198,15 @@ namespace iterative_cuda



template <typename VT, typename IT>
inline void gpu_vector<VT, IT>::fill(value_type x)
{
iterative_cuda::fill(*this, x);
}




template <typename VT, typename IT>
inline void gpu_vector<VT, IT>::set_to_linear_combination(
value_type a,
Expand Down
31 changes: 14 additions & 17 deletions src/instantiation.cu
Expand Up @@ -35,32 +35,29 @@ SOFTWARE.



using namespace iterative_cuda;




namespace iterative_cuda
{
#define INSTANTIATE(TP) \
template class gpu_vector<TP>; \
template class cpu_sparse_csr_matrix<TP>; \
template class gpu_sparse_pkt_matrix<TP>; \
template class diagonal_preconditioner<gpu_vector<TP> >; \
void gpu_cg( \
template void gpu_cg( \
const gpu_sparse_pkt_matrix<TP> &a, \
const identity_preconditioner<gpu_vector<TP> > m_inv, \
gpu_vector<TP> const &x, \
const identity_preconditioner<gpu_vector<TP> > &m_inv, \
gpu_vector<TP> &x, \
gpu_vector<TP> const &b, \
TP tol, \
unsigned max_iterations); \
void gpu_cg( \
TP tol, unsigned max_iterations, unsigned *it_count); \
template void gpu_cg( \
const gpu_sparse_pkt_matrix<TP> &a, \
const diagonal_preconditioner<gpu_vector<TP> > m_inv, \
gpu_vector<TP> const &x, \
const diagonal_preconditioner<gpu_vector<TP> > &m_inv, \
gpu_vector<TP> &x, \
gpu_vector<TP> const &b, \
TP tol, \
unsigned max_iterations);
TP tol, unsigned max_iterations, unsigned *it_count);



INSTANTIATE(float);
INSTANTIATE(double);

INSTANTIATE(float);
INSTANTIATE(double);
}

0 comments on commit b80d6d6

Please sign in to comment.