Skip to content
This repository has been archived by the owner on Oct 19, 2020. It is now read-only.

Commit

Permalink
Browse files Browse the repository at this point in the history
Solving with diagonal preconditioner works.
  • Loading branch information
Andreas Kloeckner committed Jul 31, 2009
1 parent b80d6d6 commit 3874d4b
Show file tree
Hide file tree
Showing 9 changed files with 119 additions and 52 deletions.
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -9,3 +9,4 @@ CMakeFiles/
Makefile
cmake_install.cmake
*.linkinfo
*.log
17 changes: 12 additions & 5 deletions example/multiply_matrix.cpp
Expand Up @@ -28,6 +28,7 @@ SOFTWARE.
#include <iterative-cuda.hpp>
#include <iostream>
#include <cstdlib>
#include <cmath>



Expand Down Expand Up @@ -59,14 +60,20 @@ int main(int argc, char **argv)
for (int i = 0; i < gpu_mat.row_count(); ++i)
{
y1[i] = 0;
y2[i] = 0;
}

// do gpu matrix multiply
gpu_vector<value_type> x_gpu(x, gpu_mat.column_count());
gpu_vector<value_type> y_gpu(y2, gpu_mat.row_count());
gpu_vector<value_type> y_gpu(gpu_mat.row_count());

gpu_mat(y_gpu, x_gpu);
gpu_vector<value_type> x_perm_gpu(gpu_mat.column_count());
gpu_vector<value_type> y_perm_gpu(gpu_mat.row_count());
gpu_mat.permute(x_perm_gpu, x_gpu);

y_perm_gpu.fill(0);
gpu_mat(y_perm_gpu, x_perm_gpu);

gpu_mat.unpermute(y_gpu, y_perm_gpu);

y_gpu.to_cpu(y2);
synchronize_gpu();
Expand All @@ -80,9 +87,9 @@ int main(int argc, char **argv)
for (int i = 0; i < gpu_mat.row_count(); ++i)
{
error += (y1[i]-y2[i])*(y1[i]-y2[i]);
norm += x[i]*x[i];
norm += y1[i]*y1[i];
}
std::cerr << error/norm << std::endl;
std::cerr << sqrt(error/norm) << std::endl;

delete[] x;
delete[] y1;
Expand Down
80 changes: 56 additions & 24 deletions example/solve.cpp
Expand Up @@ -28,6 +28,7 @@ SOFTWARE.
#include <iterative-cuda.hpp>
#include <iostream>
#include <cstdlib>
#include <cmath>



Expand All @@ -36,64 +37,95 @@ using namespace iterative_cuda;

// Driver: load a Matrix Market matrix, build a Jacobi (diagonal) preconditioner,
// run preconditioned CG on the GPU in the matrix's permuted ordering, then
// verify the solution on the CPU via the relative residual ||b - A x|| / ||b||.
//
// NOTE(review): this block was recovered from a rendered diff whose +/- markers
// were lost. Several lines are present in BOTH their pre- and post-change form
// (duplicate typedefs of value_type; duplicate declarations of n, x, b2;
// old and new gpu_cg call arguments; two assignments in the tolerance branch).
// It does not compile as shown — the duplicated spans are flagged inline and
// must be disambiguated against the original commit before use.
int main(int argc, char **argv)
{
// Fixed seed so the random right-hand side b is reproducible across runs.
srand48(49583344);

if (argc != 2)
{
std::cerr << "usage: " << argv[0] << " matrix.mtx" << std::endl;
return 1;
}
// NOTE(review): diff residue — old (float) and new (double) versions of the
// same typedef; only one can remain.
typedef float value_type;
typedef double value_type;
typedef cpu_sparse_csr_matrix<value_type> cpu_mat_type;
typedef gpu_sparse_pkt_matrix<value_type> gpu_mat_type;
typedef gpu_vector<value_type> gvec_t;

std::auto_ptr<cpu_mat_type> cpu_mat(
cpu_mat_type::read_matrix_market_file(argv[1]));

unsigned n = cpu_mat->row_count();

// Build the Jacobi preconditioner: invert the matrix diagonal on the host,
// then upload the reciprocals to the GPU.
value_type *diag = new value_type[n];
value_type *inv_diag = new value_type[n];
cpu_mat->extract_diagonal(diag);
for (int i = 0; i < n; ++i)
{
inv_diag[i] = 1./diag[i];
}
gvec_t inv_diag_gpu(inv_diag, n);
delete[] inv_diag;
delete[] diag;

gpu_mat_type gpu_mat(*cpu_mat);

// The packet matrix works in a permuted row ordering; permute the inverse
// diagonal into that ordering before wrapping it as a preconditioner.
gvec_t perm_inv_diag_gpu(n);
gpu_mat.permute(perm_inv_diag_gpu, inv_diag_gpu);
diagonal_preconditioner<gvec_t> diag_pre(perm_inv_diag_gpu);

// build host vectors
// NOTE(review): diff residue — n is redeclared here (old version used
// gpu_mat.row_count(), new version declares it earlier from cpu_mat).
unsigned n = gpu_mat.row_count();
value_type *b = new value_type[n];
// NOTE(review): diff residue — x and b2 are also declared again later
// (L165/L168 region); only one declaration of each belongs in the final code.
value_type *x = new value_type[n];
value_type *b2 = new value_type[n];

// NOTE(review): diff residue — old loop header (over column_count) and new
// loop header (over n) are both present.
for (int i = 0; i < gpu_mat.column_count(); ++i)
{
for (int i = 0; i < n; ++i)
b[i] = drand48();
}

// transfer vectors to gpu
// NOTE(review): diff residue — the vec_t block is the old version, the
// gvec_t block below is its replacement (b is now permuted before solving).
typedef gpu_vector<value_type> vec_t;
vec_t b_gpu(b, n);
vec_t x_gpu(n);
vec_t b2_gpu(n);
gvec_t b_gpu(b, n);
gvec_t b_perm_gpu(n);
gpu_mat.permute(b_perm_gpu, b_gpu);

// Zero initial guess for CG.
x_gpu.fill(0);
b2_gpu.fill(0);
gvec_t x_perm_gpu(n);
x_perm_gpu.fill(0);

std::cout << "begin solve" << std::endl;

// Pick a convergence tolerance appropriate to the scalar precision.
// NOTE(review): diff residue — both forms of the size test and both double
// tolerances (1e-8 old, 1e-12 new) are present.
value_type tol;
if (sizeof(value_type) < 8) // ick
if (sizeof(value_type) < sizeof(double))
tol = 1e-4;
else
tol = 1e-8;
tol = 1e-12;

// NOTE(review): diff residue — the gpu_cg call mixes the old arguments
// (identity preconditioner, unpermuted x/b) with the new ones (diag_pre,
// permuted x/b); the final code passes only diag_pre and the *_perm vectors.
unsigned it_count;
gpu_cg(gpu_mat,
identity_preconditioner<vec_t>(),
x_gpu, b_gpu, tol, /*max_iterations*/ 0, &it_count);
// identity_preconditioner<gvec_t>(),
diag_pre,
x_perm_gpu, b_perm_gpu, tol, /*max_iterations*/ 0, &it_count);

// NOTE(review): diff residue — old GPU-side check (gpu_mat applied to x_gpu)
// vs new un-permute of the solution; x_gpu is also redeclared here.
gpu_mat(b2_gpu, x_gpu);
gvec_t x_gpu(n);
gpu_mat.unpermute(x_gpu, x_perm_gpu);

// NOTE(review): diff residue — old GPU residual computation (next two lines)
// superseded by the CPU-side check below.
vec_t residual_gpu(n);
residual_gpu.set_to_linear_combination(1, b2_gpu, -1, b_gpu);
value_type *x = new value_type[n];
x_gpu.to_cpu(x);

// Recompute b2 = A*x on the CPU to validate the GPU solve independently.
value_type *b2 = new value_type[n];
for (int i = 0; i < n; ++i)
b2[i] = 0;
(*cpu_mat)(b2, x);

// NOTE(review): diff residue — GPU dot-product residual norm (old) vs the
// accumulated error/norm pair (new).
std::auto_ptr<vec_t> res_norm_gpu(residual_gpu.dot(residual_gpu));
value_type res_norm;
res_norm_gpu->to_cpu(&res_norm);
value_type error = 0;
value_type norm = 0;

std::cout << "residual norm " << res_norm << std::endl;
for (int i = 0; i < gpu_mat.row_count(); ++i)
{
error += (b[i]-b2[i])*(b[i]-b2[i]);
norm += b[i]*b[i];
}

// Relative residual in the 2-norm: sqrt(||b - A x||^2 / ||b||^2).
std::cerr << "residual norm " << sqrt(error/norm) << std::endl;
std::cout << "iterations " << it_count << std::endl;

delete[] b;
delete[] x;
delete[] b2;

return 0;
}

Expand Down
11 changes: 9 additions & 2 deletions include/iterative-cuda.hpp
Expand Up @@ -76,13 +76,19 @@ namespace iterative_cuda

index_type size() const;

void from_cpu(value_type *cpu);
void to_cpu(value_type *cpu);
void from_cpu(value_type const *cpu);
void to_cpu(value_type *cpu) const;

value_type *ptr();
const value_type *ptr() const;

void fill(value_type x);
void set_to_product(
gpu_vector const &x,
gpu_vector const &y);
void set_to_quotient(
gpu_vector const &x,
gpu_vector const &y);
void set_to_linear_combination(
value_type a,
gpu_vector const &x,
Expand Down Expand Up @@ -214,6 +220,7 @@ namespace iterative_cuda
public:
// keeps a reference to vec
diagonal_preconditioner(gpu_vector_type const &vec);
~diagonal_preconditioner();

void operator()(gpu_vector_type &result, gpu_vector_type const &op) const;
};
Expand Down
4 changes: 2 additions & 2 deletions src/cg.hpp
Expand Up @@ -77,9 +77,10 @@ namespace iterative_cuda

unsigned iterations = 0;
vector_t ax(n);
vector_t residual(n);
ax.fill(0);
a(ax, x);

vector_t residual(n);
residual.set_to_linear_combination(1, b, -1, ax);

vector_t d(n);
Expand Down Expand Up @@ -124,7 +125,6 @@ namespace iterative_cuda

scalar_t delta_new_host;
delta_new->to_cpu(&delta_new_host);
std::cout << delta_new_host << std::endl;
if (std::abs(delta_new_host) < tol*tol * std::abs(delta_0))
break;
}
Expand Down
4 changes: 3 additions & 1 deletion src/cpu-sparse-matrix.hpp
Expand Up @@ -133,7 +133,9 @@ namespace iterative_cuda
{
const index_type j = mat.Aj[jj];
if (i == j)
d[i] = mat.Ax[jj];
{
d[i] += mat.Ax[jj];
}
}
}
}
Expand Down
10 changes: 9 additions & 1 deletion src/diag-preconditioner.hpp
Expand Up @@ -59,11 +59,19 @@ namespace iterative_cuda



// Empty out-of-class destructor definition. NOTE(review): presumably defined
// here (rather than defaulted in the class body) so it is emitted where the
// pimpl member's complete type is visible — confirm against the class
// definition in iterative-cuda.hpp.
template <class GpuVector>
inline diagonal_preconditioner<GpuVector>::
~diagonal_preconditioner()
{ }




// Apply the diagonal preconditioner: result = D^{-1} * op, as an elementwise
// product with the stored inverse-diagonal vector (*pimpl->vec).
template <class GpuVector>
inline void diagonal_preconditioner<GpuVector>::operator()(
gpu_vector_type &result, gpu_vector_type const &op) const
{
// NOTE(review): diff residue — the next two lines are the pre-change and
// post-change argument orders of the same call; only one (the second, per
// the commit) belongs in the final code.
multiply(result, op, *pimpl->vec);
multiply(result, *pimpl->vec, op);
}
}

Expand Down
6 changes: 3 additions & 3 deletions src/gpu-sparse-matrix.hpp
Expand Up @@ -87,7 +87,7 @@ namespace iterative_cuda
for (index_type i = 0; i < block_count; ++i)
if (block_occupancy[i] > rows_per_packet)
{
std::cerr << "Metis partition invalid, retrying..." << std::endl;
std::cerr << "Metis partition invalid, retrying with more parts..." << std::endl;
partition_ok = false;
block_count += 2 + int(1.02*block_count);
break;
Expand Down Expand Up @@ -138,7 +138,7 @@ namespace iterative_cuda
vector_type const &src) const
{
gather_device(dest.ptr(), src.ptr(),
pimpl->matrix.permute_old_to_new, row_count());
pimpl->matrix.permute_new_to_old, row_count());
}


Expand All @@ -150,7 +150,7 @@ namespace iterative_cuda
vector_type const &src) const
{
gather_device(dest.ptr(), src.ptr(),
pimpl->matrix.permute_new_to_old, row_count());
pimpl->matrix.permute_old_to_new, row_count());
}


Expand Down
38 changes: 24 additions & 14 deletions src/gpu-vector.hpp
Expand Up @@ -163,7 +163,7 @@ namespace iterative_cuda


template <typename VT, typename IT>
inline void gpu_vector<VT, IT>::from_cpu(value_type *cpu)
inline void gpu_vector<VT, IT>::from_cpu(value_type const *cpu)
{
ICUDA_CHK(cudaMemcpy, (ptr(), cpu,
size()*sizeof(value_type),
Expand All @@ -174,7 +174,7 @@ namespace iterative_cuda


template <typename VT, typename IT>
inline void gpu_vector<VT, IT>::to_cpu(value_type *cpu)
inline void gpu_vector<VT, IT>::to_cpu(value_type *cpu) const
{
ICUDA_CHK(cudaMemcpy, (cpu, ptr(),
size()*sizeof(value_type),
Expand All @@ -200,9 +200,25 @@ namespace iterative_cuda

template <typename VT, typename IT>
inline void gpu_vector<VT, IT>::fill(value_type x)
{
iterative_cuda::fill(*this, x);
}
{ iterative_cuda::fill(*this, x); }




// Overwrite this vector with the elementwise product of the two operands,
// delegating to the free multiply() helper.
template <typename VT, typename IT>
inline void gpu_vector<VT, IT>::set_to_product(
    gpu_vector const &lhs,
    gpu_vector const &rhs)
{
  multiply(*this, lhs, rhs);
}




// Overwrite this vector with the elementwise quotient x / y, delegating to the
// free divide() helper. Marked inline for consistency with every other member
// definition in this header (fill, set_to_product, dot, ...).
template <typename VT, typename IT>
inline void gpu_vector<VT, IT>::set_to_quotient(
    gpu_vector const &x,
    gpu_vector const &y)
{ divide(*this, x, y); }



Expand All @@ -213,9 +229,7 @@ namespace iterative_cuda
gpu_vector const &x,
value_type b,
gpu_vector const &y)
{
lc2(*this, a, x, b, y);
}
{ lc2(*this, a, x, b, y); }



Expand All @@ -227,18 +241,14 @@ namespace iterative_cuda
value_type b0,
gpu_vector const &b1,
gpu_vector const &y)
{
lc2(*this, a, x, b0, b1, y);
}
{ lc2(*this, a, x, b0, b1, y); }




template <typename VT, typename IT>
inline gpu_vector<VT, IT> *gpu_vector<VT, IT>::dot(gpu_vector const &b) const
{
return inner_product(*this, b);
}
{ return inner_product(*this, b); }
}


Expand Down

0 comments on commit 3874d4b

Please sign in to comment.