Guarded divide, improved convergence checking. (report by Keith Kelley)
inducer committed May 21, 2010
1 parent e4c9833 commit e6413cf
Showing 3 changed files with 57 additions and 4 deletions.
17 changes: 17 additions & 0 deletions include/iterative-cuda.hpp
@@ -33,6 +33,8 @@ SOFTWARE.

#include <utility>
#include <memory>
#include <iostream>
#include <vector>
#include <stdint.h>


@@ -105,6 +107,21 @@ namespace iterative_cuda
    gpu_vector *dot(gpu_vector const &b) const;
  };

  template <typename ValueType, typename IndexType>
  inline
  std::ostream &operator<<(std::ostream &os,
      gpu_vector<ValueType, IndexType> const &vec)
  {
    std::vector<ValueType> tmp(vec.size());
    vec.to_cpu(tmp.data());
    os << '[';
    for (unsigned i = 0; i < vec.size(); ++i)
      os << tmp[i] << ' ';
    os << ']';

    return os;
  }
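For reference, a minimal usage sketch of the new stream inserter (the vector name, element type, and size constructor below are illustrative assumptions, not part of this commit):

    #include <iostream>
    #include <iterative-cuda.hpp>

    // operator<< copies the data to the host and prints it as "[x0 x1 ... ]".
    iterative_cuda::gpu_vector<float, int> v(16);  // hypothetical size constructor
    // ... fill v on the device ...
    std::cout << v << std::endl;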




12 changes: 8 additions & 4 deletions src/cg.hpp
@@ -30,6 +30,7 @@ SOFTWARE.



#include <algorithm>
#include <iterative-cuda.hpp>
#include "reduction.hpp"

@@ -57,6 +58,8 @@ namespace iterative_cuda

    unsigned n = a.row_count();

+   unsigned real_resid_interval = std::min(n, unsigned(50));

    std::auto_ptr<vector_t> norm_b_squared_gpu(b.dot(b));
    scalar_t norm_b_squared;
    norm_b_squared_gpu->to_cpu(&norm_b_squared);
@@ -68,7 +71,7 @@
    }

    if (max_iterations == 0)
-     max_iterations = n;
+     max_iterations = 2*n;

    // typed up from J.R. Shewchuk,
    // An Introduction to the Conjugate Gradient Method
@@ -99,11 +102,11 @@

    gpu_scalar_t alpha(1);
    std::auto_ptr<gpu_scalar_t> d_dot_q(d.dot(q));
-   divide(alpha, *delta_new, *d_dot_q);
+   guarded_divide(alpha, *delta_new, *d_dot_q);

    x.set_to_linear_combination(1, x, 1, alpha, d);

-   bool calculate_real_residual = iterations % 50 == 0;
+   bool calculate_real_residual = iterations % real_resid_interval == 0;

    if (calculate_real_residual)
    {
@@ -126,12 +129,13 @@

      scalar_t delta_new_host;
      delta_new->to_cpu(&delta_new_host);

      if (std::abs(delta_new_host) < tol*tol * std::abs(delta_0))
        break;
    }

    gpu_scalar_t beta(1);
-   divide(beta, *delta_new, *delta_old);
+   guarded_divide(beta, *delta_new, *delta_old);

    d.set_to_linear_combination(1, s, 1, beta, d);

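For context, the stopping test above compares squared quantities: ignoring preconditioning, delta_new holds the residual dot product r·r = ||r||² and delta_0 its initial value, so the check amounts to a relative residual tolerance:

    |\delta_{\mathrm{new}}| < \mathrm{tol}^2 \, |\delta_0|
    \iff \|r\|^2 < \mathrm{tol}^2 \, \|r_0\|^2
    \iff \|r\| / \|r_0\| < \mathrm{tol}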
32 changes: 32 additions & 0 deletions src/elementwise.hpp
@@ -229,6 +229,38 @@ namespace iterative_cuda



  template <class ValueType>
  __global__ void guarded_divide_kernel(
      ValueType *z, ValueType const *x, ValueType const *y, unsigned n)
  {
    unsigned tid = threadIdx.x;
    unsigned total_threads = gridDim.x*blockDim.x;
    unsigned cta_start = blockDim.x*blockIdx.x;
    unsigned i;

    // Grid-stride loop: each thread handles every total_threads-th entry;
    // a zero denominator yields 0 instead of Inf/NaN.
    for (i = cta_start + tid; i < n; i += total_threads)
      z[i] = y[i] == 0 ? 0 : (x[i]/y[i]);
  }




  template <class VT, class IT>
  void guarded_divide(
      gpu_vector<VT, IT> &z,
      gpu_vector<VT, IT> const &x,
      gpu_vector<VT, IT> const &y)
  {
    dim3 grid, block;
    splay(x.size(), grid, block);
    guarded_divide_kernel<VT><<<grid, block>>>(
        z.ptr(), x.ptr(), y.ptr(), x.size());
  }
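A host-side reference of the same guard is handy for unit-testing the kernel; this is a sketch mirroring the commit's semantics (a zero denominator maps to zero), not code from the repository:

    #include <cstddef>
    #include <vector>

    // CPU mirror of guarded_divide_kernel: elementwise x/y, but a zero
    // denominator produces 0 rather than Inf/NaN.
    template <class VT>
    void guarded_divide_host(std::vector<VT> &z,
        std::vector<VT> const &x, std::vector<VT> const &y)
    {
      for (std::size_t i = 0; i < z.size(); ++i)
        z[i] = (y[i] == VT(0)) ? VT(0) : x[i] / y[i];
    }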




  template <class ValueType>
  __global__ void fill_kernel(
      ValueType *z, ValueType x, unsigned n)
