Guarded divide, improved convergence checking. (report by Keith Kelley)
inducer committed May 21, 2010
1 parent e4c9833 commit e6413cf
Showing 3 changed files with 57 additions and 4 deletions.
17 changes: 17 additions & 0 deletions include/iterative-cuda.hpp
@@ -33,6 +33,8 @@ SOFTWARE.

#include <utility>
#include <memory>
#include <iostream>
#include <vector>
#include <stdint.h>


@@ -105,6 +107,21 @@ namespace iterative_cuda
    gpu_vector *dot(gpu_vector const &b) const;
  };

  template <typename ValueType, typename IndexType>
  inline
  std::ostream &operator<<(std::ostream &os,
      gpu_vector<ValueType, IndexType> const &vec)
  {
    std::vector<ValueType> tmp(vec.size());
    vec.to_cpu(tmp.data());
    os << '[';
    for (unsigned i = 0; i < vec.size(); ++i)
      os << tmp[i] << ' ';
    os << ']';

    return os;
  }
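For reference, a minimal usage sketch of the new stream inserter (the vector name, element type, and size constructor below are illustrative assumptions, not part of this commit):

    #include <iostream>
    #include <iterative-cuda.hpp>

    // operator<< copies the data to the host and prints it as "[x0 x1 ... ]".
    iterative_cuda::gpu_vector<float, int> v(16);  // hypothetical size constructor
    // ... fill v on the device ...
    std::cout << v << std::endl;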




12 changes: 8 additions & 4 deletions src/cg.hpp
@@ -30,6 +30,7 @@ SOFTWARE.



#include <algorithm>
#include <iterative-cuda.hpp>
#include "reduction.hpp"

@@ -57,6 +58,8 @@ namespace iterative_cuda

    unsigned n = a.row_count();

+   unsigned real_resid_interval = std::min(n, unsigned(50));

    std::auto_ptr<vector_t> norm_b_squared_gpu(b.dot(b));
    scalar_t norm_b_squared;
    norm_b_squared_gpu->to_cpu(&norm_b_squared);
@@ -68,7 +71,7 @@
    }

    if (max_iterations == 0)
-     max_iterations = n;
+     max_iterations = 2*n;

    // typed up from J.R. Shewchuk,
    // An Introduction to the Conjugate Gradient Method
@@ -99,11 +102,11 @@

    gpu_scalar_t alpha(1);
    std::auto_ptr<gpu_scalar_t> d_dot_q(d.dot(q));
-   divide(alpha, *delta_new, *d_dot_q);
+   guarded_divide(alpha, *delta_new, *d_dot_q);

    x.set_to_linear_combination(1, x, 1, alpha, d);

-   bool calculate_real_residual = iterations % 50 == 0;
+   bool calculate_real_residual = iterations % real_resid_interval == 0;

    if (calculate_real_residual)
    {
@@ -126,12 +129,13 @@

      scalar_t delta_new_host;
      delta_new->to_cpu(&delta_new_host);

      if (std::abs(delta_new_host) < tol*tol * std::abs(delta_0))
        break;
    }

    gpu_scalar_t beta(1);
-   divide(beta, *delta_new, *delta_old);
+   guarded_divide(beta, *delta_new, *delta_old);

    d.set_to_linear_combination(1, s, 1, beta, d);

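For context, the stopping test above compares squared quantities: ignoring preconditioning, delta_new holds the residual dot product r·r = ||r||² and delta_0 its initial value, so the check amounts to a relative residual tolerance:

    |\delta_{\mathrm{new}}| < \mathrm{tol}^2 \, |\delta_0|
    \iff \|r\|^2 < \mathrm{tol}^2 \, \|r_0\|^2
    \iff \|r\| / \|r_0\| < \mathrm{tol}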
32 changes: 32 additions & 0 deletions src/elementwise.hpp
@@ -229,6 +229,38 @@ namespace iterative_cuda



  template <class ValueType>
  __global__ void guarded_divide_kernel(
      ValueType *z, ValueType const *x, ValueType const *y, unsigned n)
  {
    unsigned tid = threadIdx.x;
    unsigned total_threads = gridDim.x*blockDim.x;
    unsigned cta_start = blockDim.x*blockIdx.x;
    unsigned i;

    // Grid-stride loop: each thread handles every total_threads-th entry;
    // a zero denominator yields 0 instead of Inf/NaN.
    for (i = cta_start + tid; i < n; i += total_threads)
      z[i] = y[i] == 0 ? 0 : (x[i]/y[i]);
  }




  template <class VT, class IT>
  void guarded_divide(
      gpu_vector<VT, IT> &z,
      gpu_vector<VT, IT> const &x,
      gpu_vector<VT, IT> const &y)
  {
    dim3 grid, block;
    splay(x.size(), grid, block);
    guarded_divide_kernel<VT><<<grid, block>>>(
        z.ptr(), x.ptr(), y.ptr(), x.size());
  }
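A host-side reference of the same guard is handy for unit-testing the kernel; this is a sketch mirroring the commit's semantics (a zero denominator maps to zero), not code from the repository:

    #include <cstddef>
    #include <vector>

    // CPU mirror of guarded_divide_kernel: elementwise x/y, but a zero
    // denominator produces 0 rather than Inf/NaN.
    template <class VT>
    void guarded_divide_host(std::vector<VT> &z,
        std::vector<VT> const &x, std::vector<VT> const &y)
    {
      for (std::size_t i = 0; i < z.size(); ++i)
        z[i] = (y[i] == VT(0)) ? VT(0) : x[i] / y[i];
    }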




  template <class ValueType>
  __global__ void fill_kernel(
      ValueType *z, ValueType x, unsigned n)
