ViennaCL - The Vienna Computing Library  1.2.0
spai-dynamic.hpp
#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP
#define VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP

/* =========================================================================
   Copyright (c) 2010-2011, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.

   -----------------
   ViennaCL - The Vienna Computing Library
   -----------------

   Project Head:    Karl Rupp    rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the PDF manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */
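
/** @file spai-dynamic.hpp
    @brief Dynamic part of the sparse approximate inverse (SPAI) preconditioner:
           adaptive augmentation of the block sparsity pattern and the
           corresponding update of the per-block QR factorizations, both on the
           CPU and on the GPU (OpenCL).
*/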

#include <utility>
#include <iostream>
#include <fstream>
#include <string>
#include <algorithm>
#include <vector>
#include <cmath>      //std::sqrt, std::abs
#include <map>
// Boost.uBLAS includes
#include "boost/numeric/ublas/vector.hpp"
#include "boost/numeric/ublas/matrix.hpp"
#include "boost/numeric/ublas/matrix_proxy.hpp"
#include "boost/numeric/ublas/vector_proxy.hpp"
#include "boost/numeric/ublas/storage.hpp"
#include "boost/numeric/ublas/io.hpp"
#include "boost/numeric/ublas/lu.hpp"
#include "boost/numeric/ublas/triangular.hpp"
#include "boost/numeric/ublas/matrix_expression.hpp"
// ViennaCL includes
#include "viennacl/linalg/prod.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/scalar.hpp"
#include "viennacl/linalg/cg.hpp"
#include "viennacl/linalg/ilu.hpp"
#include "viennacl/ocl/backend.hpp"
// SPAI helper headers: the block_matrix/block_vector containers, the spai_tag
// and the QR routines (single_qr, apply_q_trans_mat, block_qr, ...) used below
#include "viennacl/linalg/detail/spai/block_matrix.hpp"
#include "viennacl/linalg/detail/spai/block_vector.hpp"
#include "viennacl/linalg/detail/spai/qr.hpp"
#include "viennacl/linalg/detail/spai/spai-static.hpp"
#include "viennacl/linalg/detail/spai/spai_tag.hpp"


namespace viennacl
{
  namespace linalg
  {
    namespace detail
    {
      namespace spai
      {

        /** @brief Pair of (candidate index, improvement score) used when ranking new column indices */
        typedef std::pair<unsigned int, double> PairT;

        /** @brief Orders (index, score) pairs by descending score */
        struct CompareSecond
        {
          bool operator()(const PairT& left, const PairT& right)
          {
            return left.second > right.second;
          }
        };
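
        // Example: sorting {(4, 0.1), (2, 0.9), (7, 0.5)} with CompareSecond
        // yields {(2, 0.9), (7, 0.5), (4, 0.1)}, so the most promising
        // candidate indices come first.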

        /** @brief Initializes the dense block A_out = A(I, J) from the sparse matrix A_in */
        template<typename SparseMatrixType, typename DenseMatrixType>
        void initProjectSubMatrix(const SparseMatrixType& A_in, const std::vector<unsigned int>& J,
                                  std::vector<unsigned int>& I, DenseMatrixType& A_out)
        {
          A_out.resize(I.size(), J.size(), false);
          for(size_t j = 0; j < J.size(); ++j)
            for(size_t i = 0; i < I.size(); ++i)
              A_out(i,j) = A_in(I[i], J[j]);
        }

        /** @brief Determines whether the index ind is already contained in the index set J */
        inline bool isInIndexSet(const std::vector<unsigned int>& J, const unsigned int& ind)
        {
          return (std::find(J.begin(), J.end(), ind) != J.end());
        }
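
        // Usage sketch (illustrative only): extracting a dense 2x1 block
        // A(I, J) from a uBLAS sparse matrix with the two helpers above.
        //
        //   boost::numeric::ublas::compressed_matrix<double> A(5, 5);
        //   A(0, 0) = 2.0; A(1, 1) = 3.0; A(4, 0) = 1.0;
        //   std::vector<unsigned int> J(1, 0);           //J = {0}
        //   std::vector<unsigned int> I;                 //I = {0, 4}
        //   I.push_back(0); I.push_back(4);
        //   boost::numeric::ublas::matrix<double> A_IJ;
        //   initProjectSubMatrix(A, J, I, A_IJ);         //A_IJ = [2; 1]
        //   bool b = isInIndexSet(J, 0);                 //true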

        /** @brief Composition of the new R factor: stacks the old R, the upper part of Q'*A(I, J~) and the newly computed block R_n */
        template<typename MatrixType>
        void composeNewR(const MatrixType& A, const MatrixType& R_n, MatrixType& R)
        {
          typedef typename MatrixType::value_type ScalarType;
          size_t row_n = R_n.size1() - (A.size1() - R.size2());
          MatrixType C = boost::numeric::ublas::zero_matrix<ScalarType>(R.size1() + row_n, R.size2() + A.size2());
          //write the original R into the new composite R
          boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, R.size1()), boost::numeric::ublas::range(0, R.size2())) += R;
          //write the upper part of Q'*A(I, J~): all columns, and as many rows as R has columns
          boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, R.size2()),
                                         boost::numeric::ublas::range(R.size2(), R.size2() + A.size2())) +=
            boost::numeric::ublas::project(A, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(0, A.size2()));
          //add the freshly QR-decomposed block to the composite R
          if(R_n.size1() > 0 && R_n.size2() > 0)
            boost::numeric::ublas::project(C, boost::numeric::ublas::range(R.size2(), R.size1() + row_n),
                                           boost::numeric::ublas::range(R.size2(), R.size2() + A.size2())) += R_n;
          R = C;
        }
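
        // The composite matrix assembled above has the block structure
        //
        //       [ R   (Q'*A(I, J~))_upper ]
        //   C = [ 0          R_n          ]
        //
        // which is again upper trapezoidal and serves as the R factor of the
        // enlarged least-squares problem.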

        /** @brief Appends the new coefficient vector v_n (from the latest QR factorization) to v */
        template<typename VectorType>
        void composeNewVector(const VectorType& v_n, VectorType& v)
        {
          typedef typename VectorType::value_type ScalarType;
          VectorType w = boost::numeric::ublas::zero_vector<ScalarType>(v.size() + v_n.size());
          boost::numeric::ublas::project(w, boost::numeric::ublas::range(0, v.size())) += v;
          boost::numeric::ublas::project(w, boost::numeric::ublas::range(v.size(), v.size() + v_n.size())) += v_n;
          v = w;
        }

        /** @brief Computes the Euclidean norm of a sparse vector */
        template<typename SparseVectorType, typename ScalarType>
        void sparse_norm_2(const SparseVectorType& v, ScalarType& norm)
        {
          norm = static_cast<ScalarType>(0);   //do not rely on the caller having zeroed the accumulator
          for(typename SparseVectorType::const_iterator vec_it = v.begin(); vec_it != v.end(); ++vec_it)
            norm += (vec_it->second) * (vec_it->second);
          norm = std::sqrt(norm);
        }

        /** @brief Computes the inner product of two sparse vectors whose iterators traverse the indices in ascending order */
        template<typename SparseVectorType, typename ScalarType>
        void sparse_inner_prod(const SparseVectorType& v1, const SparseVectorType& v2, ScalarType& res_v)
        {
          res_v = static_cast<ScalarType>(0);  //do not rely on the caller having zeroed the accumulator
          typename SparseVectorType::const_iterator v_it1 = v1.begin();
          typename SparseVectorType::const_iterator v_it2 = v2.begin();
          //merge the two sorted index sequences, accumulating products on matching indices
          while((v_it1 != v1.end()) && (v_it2 != v2.end()))
          {
            if(v_it1->first == v_it2->first)
            {
              res_v += (v_it1->second) * (v_it2->second);
              ++v_it1;
              ++v_it2;
            }
            else if(v_it1->first < v_it2->first)
              ++v_it1;
            else
              ++v_it2;
          }
        }
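
        // Usage sketch (illustrative): the sparse vector type is expected to
        // iterate over (index, value) pairs in ascending index order; a
        // std::map<unsigned int, double> behaves that way:
        //
        //   std::map<unsigned int, double> u, w;
        //   u[0] = 1.0; u[3] = 2.0;
        //   w[3] = 4.0; w[5] = -1.0;
        //   double dot = 0.0, nrm = 0.0;
        //   sparse_inner_prod(u, w, dot);   //only index 3 matches: dot = 8.0
        //   sparse_norm_2(w, nrm);          //nrm = sqrt(16 + 1) = sqrt(17)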

        /** @brief Builds the augmented column index set J_u: candidate columns with a large residual entry, ranked by expected residual reduction
        *
        * Returns true if at least one new column index was accepted.
        */
        template <typename SparseVectorType, typename ScalarType>
        bool buildAugmentedIndexSet(const std::vector<SparseVectorType>& A_v_c,
                                    const SparseVectorType& res,
                                    std::vector<unsigned int>& J,
                                    std::vector<unsigned int>& J_u,
                                    const spai_tag& tag)
        {
          std::vector<std::pair<unsigned int, ScalarType> > p;
          size_t cur_size = 0;
          ScalarType inprod, norm2;
          for(typename SparseVectorType::const_iterator res_it = res.begin(); res_it != res.end(); ++res_it)
          {
            if(!isInIndexSet(J, res_it->first) && (std::abs(res_it->second) > tag.getResidualThreshold()))
            {
              inprod = norm2 = 0;
              sparse_inner_prod(res, A_v_c[res_it->first], inprod);
              sparse_norm_2(A_v_c[res_it->first], norm2);
              p.push_back(std::pair<unsigned int, ScalarType>(res_it->first, (inprod*inprod)/(norm2*norm2)));
            }
          }

          std::sort(p.begin(), p.end(), CompareSecond());
          //accept at most J.size() new indices, best candidates first
          while((cur_size < J.size()) && (p.size() > 0))
          {
            J_u.push_back(p[0].first);
            p.erase(p.begin());
            cur_size++;
          }
          p.clear();
          return (cur_size > 0);
        }
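
        // The score used above is the classical SPAI fill-in criterion: for a
        // candidate column a_j of A, the achievable residual reduction is
        // proportional to rho_j = (r, a_j)^2 / ||a_j||^2, so the candidates
        // are sorted by rho_j and only the best J.size() of them are admitted
        // per update step.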

        /** @brief Extends the row index set: collects all rows touched by the new columns J_n that are contained neither in I nor in I_n */
        template<typename SparseVectorType>
        void buildNewRowSet(const std::vector<SparseVectorType>& A_v_c, const std::vector<unsigned int>& I,
                            const std::vector<unsigned int>& J_n, std::vector<unsigned int>& I_n)
        {
          for(size_t i = 0; i < J_n.size(); ++i)
          {
            for(typename SparseVectorType::const_iterator col_it = A_v_c[J_n[i]].begin(); col_it != A_v_c[J_n[i]].end(); ++col_it)
            {
              if(!isInIndexSet(I, col_it->first) && !isInIndexSet(I_n, col_it->first))
                I_n.push_back(col_it->first);
            }
          }
        }
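
        // Example: if the new column set is J_n = {3}, column 3 has nonzeros
        // in rows {1, 4}, and I = {1}, then only row 4 is appended to I_n.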

        /** @brief Stacks the lower part of Q'*A(I, J~) on top of A(I~, J~) as input for the next QR factorization */
        template<typename MatrixType>
        void QRBlockComposition(const MatrixType& A_I_J, const MatrixType& A_I_J_u, MatrixType& A_I_u_J_u)
        {
          typedef typename MatrixType::value_type ScalarType;
          size_t row_n1 = A_I_J_u.size1() - A_I_J.size2();
          size_t row_n2 = A_I_u_J_u.size1();
          size_t row_n = row_n1 + row_n2;
          size_t col_n = A_I_J_u.size2();
          MatrixType C = boost::numeric::ublas::zero_matrix<ScalarType>(row_n, col_n);
          boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, row_n1), boost::numeric::ublas::range(0, col_n)) +=
            boost::numeric::ublas::project(A_I_J_u, boost::numeric::ublas::range(A_I_J.size2(), A_I_J_u.size1()),
                                           boost::numeric::ublas::range(0, col_n));

          boost::numeric::ublas::project(C, boost::numeric::ublas::range(row_n1, row_n1 + row_n2),
                                         boost::numeric::ublas::range(0, col_n)) += A_I_u_J_u;
          A_I_u_J_u = C;
        }
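
        // The block assembled here,
        //
        //       [ (Q'*A(I, J~))_lower ]
        //   C = [      A(I~, J~)      ]
        //
        // is exactly the matrix whose QR factorization delivers the new block
        // R_n consumed by composeNewR() above.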

        /** @brief CPU-based dynamic update of the SPAI preconditioner: augments the index sets of all blocks and updates their QR factorizations */
        template<typename SparseMatrixType, typename SparseVectorType, typename DenseMatrixType, typename VectorType>
        void block_update(const SparseMatrixType& A, const std::vector<SparseVectorType>& A_v_c,
                          std::vector<SparseVectorType>& g_res,
                          std::vector<bool>& g_is_update,
                          std::vector<std::vector<unsigned int> >& g_I,
                          std::vector<std::vector<unsigned int> >& g_J,
                          std::vector<VectorType>& g_b_v,
                          std::vector<DenseMatrixType>& g_A_I_J,
                          spai_tag const & tag)
        {
          typedef typename DenseMatrixType::value_type ScalarType;
          //set of new column indices
          std::vector<std::vector<unsigned int> > g_J_u(g_J.size());
          //set of new row indices
          std::vector<std::vector<unsigned int> > g_I_u(g_J.size());
          //matrix A(I, J~), cf. Kallischko, pp. 31-32
          std::vector<DenseMatrixType> g_A_I_J_u(g_J.size());
          //matrix A(I~, J~), cf. Kallischko
          std::vector<DenseMatrixType> g_A_I_u_J_u(g_J.size());
          //new vector of beta coefficients from the QR factorization
          std::vector<VectorType> g_b_v_u(g_J.size());
#ifdef _OPENMP
          #pragma omp parallel for
#endif
          for(std::size_t i = 0; i < g_J.size(); ++i)
          {
            if(g_is_update[i])
            {
              if(buildAugmentedIndexSet<SparseVectorType, ScalarType>(A_v_c, g_res[i], g_J[i], g_J_u[i], tag))
              {
                //initialize the matrix A(I, J~)
                initProjectSubMatrix(A, g_J_u[i], g_I[i], g_A_I_J_u[i]);
                //multiply Q' with A(I, J~)
                apply_q_trans_mat(g_A_I_J[i], g_b_v[i], g_A_I_J_u[i]);
                //build the new row index set I~
                buildNewRowSet(A_v_c, g_I[i], g_J_u[i], g_I_u[i]);
                initProjectSubMatrix(A, g_J_u[i], g_I_u[i], g_A_I_u_J_u[i]);
                //compose the block for the new QR factorization
                QRBlockComposition(g_A_I_J[i], g_A_I_J_u[i], g_A_I_u_J_u[i]);
                //QR factorization
                single_qr(g_A_I_u_J_u[i], g_b_v_u[i]);
                //compose the new R and the new vector b_v
                composeNewR(g_A_I_J_u[i], g_A_I_u_J_u[i], g_A_I_J[i]);
                composeNewVector(g_b_v_u[i], g_b_v[i]);
                //merge the index sets I and J
                g_J[i].insert(g_J[i].end(), g_J_u[i].begin(), g_J_u[i].end());
                g_I[i].insert(g_I[i].end(), g_I_u[i].begin(), g_I_u[i].end());
              }
              else
              {
                g_is_update[i] = false;
              }
            }
          }
        }
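
        // One dynamic update sweep therefore performs, for every block i:
        //   1. augment J_i by the most promising new columns J~,
        //   2. project A onto (I_i, J~) and apply the stored Q',
        //   3. extend the rows to I~ and QR-factorize the composed lower block,
        //   4. merge the R factors, the beta vectors and the index sets.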

        /**************************************************** GPU SPAI Update ****************************************************************/

        /** @brief Performs Q' * A(I, J~) on the GPU for all blocks
        *
        * The OpenCL kernels are taken from the precompiled spai program
        * (viennacl::linalg::kernels::spai).
        */
        template<typename ScalarType>
        void block_q_multiplication(const std::vector<std::vector<unsigned int> >& g_J_u,
                                    const std::vector<std::vector<unsigned int> >& g_I,
                                    block_matrix& g_A_I_J_vcl,
                                    block_vector& g_bv_vcl,
                                    block_matrix& g_A_I_J_u_vcl,
                                    std::vector<cl_uint>& g_is_update,
                                    const unsigned int cur_iter)
        {
          //the maximum block extents determine the work-group size and the local memory requirement
          unsigned int local_r_n, local_c_n;
          get_max_block_size(g_I, local_r_n);
          get_max_block_size(g_J_u, local_c_n);

          viennacl::ocl::handle<cl_mem> g_is_update_vcl = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                                            static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
                                                            &(g_is_update[0]));
          viennacl::ocl::kernel& block_q_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "block_q_mult");
          block_q_kernel.local_work_size(0, local_c_n);
          block_q_kernel.global_work_size(0, 256);
          viennacl::ocl::enqueue(block_q_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(),
                                                g_bv_vcl.handle(), g_bv_vcl.handle1(),
                                                g_A_I_J_vcl.handle1(), g_A_I_J_u_vcl.handle1(), g_is_update_vcl,
                                                viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType)*(local_r_n*local_c_n))),
                                                static_cast<cl_uint>(g_I.size())));
        }

        /** @brief Assembles the row index sets g_I_q for the QR blocks: the rows of A(I, J) below the square R part, followed by the new rows I~ */
        inline void assemble_qr_row_inds(const std::vector<std::vector<unsigned int> >& g_I, const std::vector<std::vector<unsigned int> >& g_J,
                                         const std::vector<std::vector<unsigned int> >& g_I_u,
                                         std::vector<std::vector<unsigned int> >& g_I_q)
        {
#ifdef _OPENMP
          #pragma omp parallel for
#endif
          for(std::size_t i = 0; i < g_I.size(); ++i)
          {
            for(std::size_t j = g_J[i].size(); j < g_I[i].size(); ++j)
              g_I_q[i].push_back(g_I[i][j]);

            for(std::size_t j = 0; j < g_I_u[i].size(); ++j)
              g_I_q[i].push_back(g_I_u[i][j]);
          }
        }
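
        // Example: g_I[i] = {0, 2, 5}, g_J[i] = {1}, g_I_u[i] = {7}
        //   ==> g_I_q[i] = {2, 5, 7}: the rows of the old block that lie
        //       below the square R part, followed by the newly added rows.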

        /** @brief Assembles the composite blocks [ (Q'*A(I, J~))_lower ; A(I~, J~) ] on the GPU as input for the block QR factorization */
        template<typename ScalarType>
        void assemble_qr_block(const std::vector<std::vector<unsigned int> >& g_J,
                               const std::vector<std::vector<unsigned int> >& g_I,
                               const std::vector<std::vector<unsigned int> >& g_J_u,
                               const std::vector<std::vector<unsigned int> >& g_I_u,
                               std::vector<std::vector<unsigned int> >& g_I_q,
                               block_matrix& g_A_I_J_u_vcl,
                               viennacl::ocl::handle<cl_mem>& matrix_dimensions,
                               block_matrix& g_A_I_u_J_u_vcl,
                               std::vector<cl_uint>& g_is_update,
                               const bool is_empty_block,
                               const unsigned int cur_iter)
        {
          assemble_qr_row_inds(g_I, g_J, g_I_u, g_I_q);
          unsigned int sz_blocks;
          std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
          std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
          compute_blocks_size(g_I_q, g_J_u, sz_blocks, blocks_ind, matrix_dims);
          std::vector<ScalarType> con_A_I_J_q(sz_blocks, static_cast<ScalarType>(0));

          block_matrix g_A_I_J_q_vcl;
          //allocate GPU memory for the composite QR blocks
          g_A_I_J_q_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                     static_cast<unsigned int>(sizeof(ScalarType)*sz_blocks),
                                     &(con_A_I_J_q[0]));
          g_A_I_J_q_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                      static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size())),
                                      &(matrix_dims[0]));
          g_A_I_J_q_vcl.handle2() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                      static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size() + 1)),
                                      &(blocks_ind[0]));
          viennacl::ocl::handle<cl_mem> g_is_update_vcl = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                                            static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
                                                            &(g_is_update[0]));

          if(!is_empty_block)
          {
            viennacl::ocl::kernel& qr_assembly_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "block_qr_assembly");
            qr_assembly_kernel.local_work_size(0, 1);
            qr_assembly_kernel.global_work_size(0, 256);
            viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions,
                                                      g_A_I_J_u_vcl.handle(),
                                                      g_A_I_J_u_vcl.handle2(),
                                                      g_A_I_J_u_vcl.handle1(),
                                                      g_A_I_u_J_u_vcl.handle(),
                                                      g_A_I_u_J_u_vcl.handle2(),
                                                      g_A_I_u_J_u_vcl.handle1(),
                                                      g_A_I_J_q_vcl.handle(),
                                                      g_A_I_J_q_vcl.handle2(),
                                                      g_A_I_J_q_vcl.handle1(),
                                                      g_is_update_vcl,
                                                      static_cast<unsigned int>(g_I.size())));
          }
          else
          {
            //dedicated kernel for the case of an empty block A(I~, J~)
            viennacl::ocl::kernel& qr_assembly_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "block_qr_assembly_1");
            qr_assembly_kernel.local_work_size(0, 1);
            qr_assembly_kernel.global_work_size(0, 256);
            viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions, g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(),
                                                      g_A_I_J_u_vcl.handle1(),
                                                      g_A_I_J_q_vcl.handle(),
                                                      g_A_I_J_q_vcl.handle2(), g_A_I_J_q_vcl.handle1(),
                                                      g_is_update_vcl,
                                                      static_cast<unsigned int>(g_I.size())));
          }
          //let g_A_I_u_J_u_vcl refer to the newly assembled composite blocks
          g_A_I_u_J_u_vcl.handle() = g_A_I_J_q_vcl.handle();
          g_A_I_u_J_u_vcl.handle1() = g_A_I_J_q_vcl.handle1();
          g_A_I_u_J_u_vcl.handle2() = g_A_I_J_q_vcl.handle2();
        }

        /** @brief Assembles the new R factors and beta vectors of all blocks on the GPU after the block QR factorization */
        template<typename ScalarType>
        void assemble_r(std::vector<std::vector<unsigned int> >& g_I, std::vector<std::vector<unsigned int> >& g_J,
                        block_matrix& g_A_I_J_vcl,
                        block_matrix& g_A_I_J_u_vcl,
                        block_matrix& g_A_I_u_J_u_vcl,
                        block_vector& g_bv_vcl,
                        block_vector& g_bv_vcl_u,
                        std::vector<cl_uint>& g_is_update,
                        const unsigned int cur_iter)
        {
          std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
          std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
          std::vector<cl_uint> start_bv_r_inds(g_I.size() + 1, 0);
          unsigned int sz_blocks, bv_size;
          compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);
          get_size(g_J, bv_size);
          init_start_inds(g_J, start_bv_r_inds);
          std::vector<ScalarType> con_A_I_J_r(sz_blocks, static_cast<ScalarType>(0));
          std::vector<ScalarType> b_v_r(bv_size, static_cast<ScalarType>(0));

          block_matrix g_A_I_J_r_vcl;
          block_vector g_bv_r_vcl;
          //allocate GPU memory for the composite R factors and beta vectors
          g_A_I_J_r_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                     static_cast<unsigned int>(sizeof(ScalarType)*sz_blocks),
                                     &(con_A_I_J_r[0]));
          g_A_I_J_r_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                      static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size())),
                                      &(matrix_dims[0]));
          g_A_I_J_r_vcl.handle2() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                      static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size() + 1)),
                                      &(blocks_ind[0]));
          g_bv_r_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                  static_cast<unsigned int>(sizeof(ScalarType)*bv_size),
                                  &(b_v_r[0]));
          g_bv_r_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                   static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
                                   &(start_bv_r_inds[0]));
          viennacl::ocl::handle<cl_mem> g_is_update_vcl = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                                            static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
                                                            &(g_is_update[0]));
          viennacl::ocl::kernel& r_assembly_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "block_r_assembly");
          r_assembly_kernel.local_work_size(0, 1);
          r_assembly_kernel.global_work_size(0, 256);

          viennacl::ocl::enqueue(r_assembly_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(),
                                                   g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), g_A_I_J_u_vcl.handle1(),
                                                   g_A_I_u_J_u_vcl.handle(), g_A_I_u_J_u_vcl.handle2(), g_A_I_u_J_u_vcl.handle1(),
                                                   g_A_I_J_r_vcl.handle(), g_A_I_J_r_vcl.handle2(), g_A_I_J_r_vcl.handle1(),
                                                   g_is_update_vcl, static_cast<cl_uint>(g_I.size())));

          viennacl::ocl::kernel & bv_assembly_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "block_bv_assembly");
          bv_assembly_kernel.local_work_size(0, 1);
          bv_assembly_kernel.global_work_size(0, 256);
          viennacl::ocl::enqueue(bv_assembly_kernel(g_bv_vcl.handle(), g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_bv_vcl_u.handle(),
                                                    g_bv_vcl_u.handle1(), g_A_I_J_u_vcl.handle1(),
                                                    g_bv_r_vcl.handle(), g_bv_r_vcl.handle1(), g_A_I_J_r_vcl.handle1(), g_is_update_vcl,
                                                    static_cast<cl_uint>(g_I.size())));
          //redirect the block handles to the freshly assembled data
          g_bv_vcl.handle() = g_bv_r_vcl.handle();
          g_bv_vcl.handle1() = g_bv_r_vcl.handle1();

          g_A_I_J_vcl.handle() = g_A_I_J_r_vcl.handle();
          g_A_I_J_vcl.handle2() = g_A_I_J_r_vcl.handle2();
          g_A_I_J_vcl.handle1() = g_A_I_J_r_vcl.handle1();
        }
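
        // After this call, g_A_I_J_vcl and g_bv_vcl refer to the freshly
        // assembled composite R factors and beta vectors; the previous device
        // buffers are dropped along with the old handles they were bound to.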

        /** @brief GPU-based dynamic update of the SPAI preconditioner for a compressed_matrix */
        template<typename ScalarType, unsigned int MAT_ALIGNMENT, typename SparseVectorType>
        void block_update(const viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT>& A, const std::vector<SparseVectorType>& A_v_c,
                          std::vector<cl_uint>& g_is_update,
                          std::vector<SparseVectorType>& g_res,
                          std::vector<std::vector<unsigned int> >& g_J,
                          std::vector<std::vector<unsigned int> >& g_I,
                          block_matrix& g_A_I_J_vcl,
                          block_vector& g_bv_vcl,
                          spai_tag const & tag,
                          const unsigned int cur_iter)
        {
          //updated index set for columns
          std::vector<std::vector<unsigned int> > g_J_u(g_J.size());
          //updated index set for rows
          std::vector<std::vector<unsigned int> > g_I_u(g_J.size());
          //mixed index set of old and updated row indices
          std::vector<std::vector<unsigned int> > g_I_q(g_J.size());
          //GPU memory for A(I, J~)
          block_matrix g_A_I_J_u_vcl;
          //GPU memory for A(I~, J~)
          block_matrix g_A_I_u_J_u_vcl;
          bool is_empty_block;
          //GPU memory for the new b_v
          block_vector g_bv_u_vcl;
#ifdef _OPENMP
          #pragma omp parallel for
#endif
          for(std::size_t i = 0; i < g_J.size(); ++i)
          {
            if(g_is_update[i])
            {
              if(buildAugmentedIndexSet<SparseVectorType, ScalarType>(A_v_c, g_res[i], g_J[i], g_J_u[i], tag))
                buildNewRowSet(A_v_c, g_I[i], g_J_u[i], g_I_u[i]);
            }
          }
          //assemble the new A(I, J~) blocks on the GPU and multiply them with Q'
          block_assembly(A, g_J_u, g_I, g_A_I_J_u_vcl, g_is_update, is_empty_block, cur_iter);
          block_q_multiplication<ScalarType>(g_J_u, g_I, g_A_I_J_vcl, g_bv_vcl, g_A_I_J_u_vcl, g_is_update, cur_iter);
          //assemble A(I~, J~) and compose the blocks for the new QR factorization
          block_assembly(A, g_J_u, g_I_u, g_A_I_u_J_u_vcl, g_is_update, is_empty_block, cur_iter);
          assemble_qr_block<ScalarType>(g_J, g_I, g_J_u, g_I_u, g_I_q, g_A_I_J_u_vcl, g_A_I_J_vcl.handle1(),
                                        g_A_I_u_J_u_vcl, g_is_update, is_empty_block, cur_iter);

          block_qr<ScalarType>(g_I_q, g_J_u, g_A_I_u_J_u_vcl, g_bv_u_vcl, g_is_update, cur_iter);
          //concatenation of the new and old index sets
#ifdef _OPENMP
          #pragma omp parallel for
#endif
          for(std::size_t i = 0; i < g_J.size(); ++i)
          {
            g_J[i].insert(g_J[i].end(), g_J_u[i].begin(), g_J_u[i].end());
            g_I[i].insert(g_I[i].end(), g_I_u[i].begin(), g_I_u[i].end());
          }
          assemble_r<ScalarType>(g_I, g_J, g_A_I_J_vcl, g_A_I_J_u_vcl, g_A_I_u_J_u_vcl, g_bv_vcl, g_bv_u_vcl, g_is_update, cur_iter);
        }
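
        // In contrast to the CPU variant above, only the index-set growth runs
        // on the host (optionally parallelized with OpenMP); block assembly,
        // the Q' multiplication, the QR factorization and the final R/b_v
        // composition are all executed as OpenCL kernels on the device.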

      } //namespace spai
    } //namespace detail
  } //namespace linalg
} //namespace viennacl
#endif