ViennaCL - The Vienna Computing Library  1.5.0
viennacl/linalg/detail/spai/spai-dynamic.hpp
Go to the documentation of this file.
00001 #ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP
00002 #define VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP
00003 
00004 /* =========================================================================
00005    Copyright (c) 2010-2013, Institute for Microelectronics,
00006                             Institute for Analysis and Scientific Computing,
00007                             TU Wien.
00008    Portions of this software are copyright by UChicago Argonne, LLC.
00009 
00010                             -----------------
00011                   ViennaCL - The Vienna Computing Library
00012                             -----------------
00013 
00014    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
00015 
00016    (A list of authors and contributors can be found in the PDF manual)
00017 
00018    License:         MIT (X11), see file LICENSE in the base directory
00019 ============================================================================= */
00020 
00027 #include <utility>
00028 #include <iostream>
00029 #include <fstream>
00030 #include <string>
00031 #include <algorithm>
00032 #include <vector>
00033 #include <math.h>
00034 #include <map>
00035 //#include "block_matrix.hpp"
00036 //#include "block_vector.hpp"
00037 //#include "benchmark-utils.hpp"
00038 #include "boost/numeric/ublas/vector.hpp"
00039 #include "boost/numeric/ublas/matrix.hpp"
00040 #include "boost/numeric/ublas/matrix_proxy.hpp"
00041 #include "boost/numeric/ublas/vector_proxy.hpp"
00042 #include "boost/numeric/ublas/storage.hpp"
00043 #include "boost/numeric/ublas/io.hpp"
00044 #include "boost/numeric/ublas/lu.hpp"
00045 #include "boost/numeric/ublas/triangular.hpp"
00046 #include "boost/numeric/ublas/matrix_expression.hpp"
00047 // ViennaCL includes
00048 #include "viennacl/linalg/prod.hpp"
00049 #include "viennacl/matrix.hpp"
00050 #include "viennacl/compressed_matrix.hpp"
00051 #include "viennacl/linalg/sparse_matrix_operations.hpp"
00052 #include "viennacl/linalg/matrix_operations.hpp"
00053 #include "viennacl/scalar.hpp"
00054 #include "viennacl/linalg/cg.hpp"
00055 #include "viennacl/linalg/inner_prod.hpp"
00056 #include "viennacl/linalg/ilu.hpp"
00057 #include "viennacl/ocl/backend.hpp"
00058 
00059 #include "viennacl/linalg/detail/spai/block_matrix.hpp"
00060 #include "viennacl/linalg/detail/spai/block_vector.hpp"
00061 #include "viennacl/linalg/detail/spai/qr.hpp"
00062 #include "viennacl/linalg/detail/spai/spai-static.hpp"
00063 #include "viennacl/linalg/detail/spai/spai.hpp"
00064 #include "viennacl/linalg/detail/spai/spai_tag.hpp"
00065 #include "viennacl/linalg/opencl/kernels/spai.hpp"
00066 
00067 namespace viennacl
00068 {
00069     namespace linalg
00070     {
00071       namespace detail
00072       {
00073         namespace spai
00074         {
00075 
00077           struct CompareSecond
00078           {
00079             template <typename T1, typename T2>
00080             bool operator()(std::pair<T1, T2> const & left, std::pair<T1, T2> const & right)
00081             {
00082               return static_cast<double>(left.second) > static_cast<double>(right.second);
00083             }
00084           };
00085 
00086 
00092           template<typename MatrixType>
00093           void composeNewR(const MatrixType& A, const MatrixType& R_n, MatrixType& R)
00094           {
00095             typedef typename MatrixType::value_type ScalarType;
00096             vcl_size_t row_n = R_n.size1() - (A.size1() - R.size2());
00097             MatrixType C = boost::numeric::ublas::zero_matrix<ScalarType>(R.size1() + row_n, R.size2() + A.size2());
00098             //write original R to new Composite R
00099             boost::numeric::ublas::project(C, boost::numeric::ublas::range(0,R.size1()), boost::numeric::ublas::range(0, R.size2())) += R;
00100             //write upper part of Q'*A_I_\hatJ, all columns and number of rows that equals to R.size2()
00101             boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(R.size2(),
00102                                                                                                                       R.size2() + A.size2())) +=
00103             boost::numeric::ublas::project(A, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(0, A.size2()));
00104             //adding decomposed(QR) block to Composite R
00105             if(R_n.size1() > 0 && R_n.size2() > 0)
00106                 boost::numeric::ublas::project(C, boost::numeric::ublas::range(R.size2(), R.size1() + row_n),
00107                                               boost::numeric::ublas::range(R.size2(), R.size2() + A.size2())) += R_n;
00108             R = C;
00109           }
00110 
00115           template<typename VectorType>
00116           void composeNewVector(const VectorType& v_n, VectorType& v)
00117           {
00118             typedef typename VectorType::value_type ScalarType;
00119             VectorType w  = boost::numeric::ublas::zero_vector<ScalarType>(v.size() + v_n.size());
00120             boost::numeric::ublas::project(w, boost::numeric::ublas::range(0, v.size())) += v;
00121             boost::numeric::ublas::project(w, boost::numeric::ublas::range(v.size(), v.size() + v_n.size())) += v_n;
00122             v = w;
00123           }
00124 
00129           template<typename SparseVectorType, typename ScalarType>
00130           void sparse_norm_2(const SparseVectorType& v, ScalarType& norm)
00131           {
00132             for(typename SparseVectorType::const_iterator vec_it  = v.begin(); vec_it != v.end(); ++vec_it)
00133               norm += (vec_it->second)*(vec_it->second);
00134 
00135             norm = std::sqrt(norm);
00136           }
00137 
00143           template<typename SparseVectorType, typename ScalarType>
00144           void sparse_inner_prod(const SparseVectorType& v1, const SparseVectorType& v2, ScalarType& res_v)
00145           {
00146             typename SparseVectorType::const_iterator v_it1 = v1.begin();
00147             typename SparseVectorType::const_iterator v_it2 = v2.begin();
00148             while((v_it1 != v1.end())&&(v_it2 != v2.end()))
00149             {
00150               if(v_it1->first == v_it2->first)
00151               {
00152                 res_v += (v_it1->second)*(v_it2->second);
00153                 ++v_it1;
00154                 ++v_it2;
00155               }
00156               else if(v_it1->first < v_it2->first)
00157                 ++v_it1;
00158               else
00159                 ++v_it2;
00160             }
00161           }
00162 
00170           template <typename SparseVectorType, typename ScalarType>
00171           bool buildAugmentedIndexSet(const std::vector<SparseVectorType>& A_v_c,
00172                                       const SparseVectorType& res,
00173                                       std::vector<unsigned int>& J,
00174                                       std::vector<unsigned int>& J_u,
00175                                       const spai_tag& tag)
00176           {
00177             std::vector<std::pair<unsigned int, ScalarType> > p;
00178             vcl_size_t cur_size = 0;
00179             ScalarType inprod, norm2;
00180             //print_sparse_vector(res);
00181             for(typename SparseVectorType::const_iterator res_it = res.begin(); res_it != res.end(); ++res_it)
00182             {
00183               if(!isInIndexSet(J, res_it->first) && (std::fabs(res_it->second) > tag.getResidualThreshold()))
00184               {
00185                 inprod = norm2 = 0;
00186                 sparse_inner_prod(res, A_v_c[res_it->first], inprod);
00187                 sparse_norm_2(A_v_c[res_it->first], norm2);
00188                 p.push_back(std::pair<unsigned int, ScalarType>(res_it->first, (inprod*inprod)/(norm2*norm2)));
00189               }
00190             }
00191 
00192             std::sort(p.begin(), p.end(), CompareSecond());
00193             while ((cur_size < J.size())&&(p.size() > 0))
00194             {
00195               J_u.push_back(p[0].first);
00196               p.erase(p.begin());
00197               cur_size++;
00198             }
00199             p.clear();
00200             return (cur_size > 0);
00201           }
00202 
00209           template<typename SparseVectorType>
00210           void buildNewRowSet(const std::vector<SparseVectorType>& A_v_c, const std::vector<unsigned int>& I,
00211                               const std::vector<unsigned int>& J_n, std::vector<unsigned int>& I_n)
00212           {
00213             for(vcl_size_t i = 0; i < J_n.size(); ++i)
00214             {
00215               for(typename SparseVectorType::const_iterator col_it = A_v_c[J_n[i]].begin(); col_it!=A_v_c[J_n[i]].end(); ++col_it)
00216               {
00217                 if(!isInIndexSet(I, col_it->first)&&!isInIndexSet(I_n, col_it->first))
00218                   I_n.push_back(col_it->first);
00219               }
00220             }
00221           }
00222 
00228           template<typename MatrixType>
00229           void QRBlockComposition(const MatrixType& A_I_J, const MatrixType& A_I_J_u, MatrixType& A_I_u_J_u)
00230           {
00231             typedef typename MatrixType::value_type ScalarType;
00232             vcl_size_t row_n1 = A_I_J_u.size1() - A_I_J.size2();
00233             vcl_size_t row_n2 = A_I_u_J_u.size1();
00234             vcl_size_t row_n = row_n1 + row_n2;
00235             vcl_size_t col_n = A_I_J_u.size2();
00236             MatrixType C = boost::numeric::ublas::zero_matrix<ScalarType>(row_n, col_n);
00237             boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, row_n1), boost::numeric::ublas::range(0, col_n)) +=
00238             boost::numeric::ublas::project(A_I_J_u, boost::numeric::ublas::range(A_I_J.size2(), A_I_J_u.size1()),
00239                                           boost::numeric::ublas::range(0, col_n));
00240 
00241             boost::numeric::ublas::project(C, boost::numeric::ublas::range(row_n1, row_n1 + row_n2),
00242                                           boost::numeric::ublas::range(0, col_n)) += A_I_u_J_u;
00243             A_I_u_J_u = C;
00244           }
00245 
00257           template<typename SparseMatrixType, typename SparseVectorType, typename DenseMatrixType, typename VectorType>
00258           void block_update(const SparseMatrixType& A, const std::vector<SparseVectorType>& A_v_c,
00259                             std::vector<SparseVectorType>& g_res,
00260                             std::vector<bool>& g_is_update,
00261                             std::vector<std::vector<unsigned int> >& g_I,
00262                             std::vector<std::vector<unsigned int> >& g_J,
00263                             std::vector<VectorType>& g_b_v,
00264                             std::vector<DenseMatrixType>& g_A_I_J,
00265                             spai_tag const & tag)
00266           {
00267             typedef typename DenseMatrixType::value_type ScalarType;
00268             //set of new column indices
00269             std::vector<std::vector<unsigned int> > g_J_u(g_J.size());
00270             //set of new row indices
00271             std::vector<std::vector<unsigned int> > g_I_u(g_J.size());
00272             //matrix A(I, \tilde J), cf. Kallischko p.31-32
00273             std::vector<DenseMatrixType> g_A_I_J_u(g_J.size());
00274             //matrix A(\tilde I, \tilde J), cf. Kallischko
00275             std::vector<DenseMatrixType> g_A_I_u_J_u(g_J.size());
00276             //new vector of beta coefficients from QR factorization
00277             std::vector<VectorType> g_b_v_u(g_J.size());
00278 #ifdef VIENNACL_WITH_OPENMP
00279             #pragma omp parallel for
00280 #endif
00281             for(long i = 0; i < static_cast<long>(g_J.size()); ++i)
00282             {
00283               if(g_is_update[i])
00284               {
00285                 if(buildAugmentedIndexSet<SparseVectorType, ScalarType>(A_v_c, g_res[i], g_J[i], g_J_u[i], tag))
00286                 {
00287                   //initialize matrix A_I_\hatJ
00288                   initProjectSubMatrix(A, g_J_u[i], g_I[i], g_A_I_J_u[i]);
00289                   //multiplication of Q'*A_I_\hatJ
00290                   apply_q_trans_mat(g_A_I_J[i], g_b_v[i], g_A_I_J_u[i]);
00291                   //building new rows index set \hatI
00292                   buildNewRowSet(A_v_c, g_I[i], g_J_u[i], g_I_u[i]);
00293                   initProjectSubMatrix(A, g_J_u[i], g_I_u[i], g_A_I_u_J_u[i]);
00294                   //composition of block for new QR factorization
00295                   QRBlockComposition(g_A_I_J[i], g_A_I_J_u[i], g_A_I_u_J_u[i]);
00296                   //QR factorization
00297                   single_qr(g_A_I_u_J_u[i], g_b_v_u[i]);
00298                   //composition of new R and new vector b_v
00299                   composeNewR(g_A_I_J_u[i], g_A_I_u_J_u[i], g_A_I_J[i]);
00300                   composeNewVector(g_b_v_u[i], g_b_v[i]);
00301                   //composition of new sets: I and J
00302                   g_J[i].insert(g_J[i].end(), g_J_u[i].begin(), g_J_u[i].end());
00303                   g_I[i].insert(g_I[i].end(), g_I_u[i].begin(), g_I_u[i].end());
00304                 }
00305                 else
00306                 {
00307                   g_is_update[i] = false;
00308                 }
00309               }
00310             }
00311           }
00312           /**************************************************** GPU SPAI Update ****************************************************************/
00313 
00314 
00315           //performs Q'*A(I, \tilde J) on GPU
00325           template<typename ScalarType>
00326           void block_q_multiplication(const std::vector<std::vector<unsigned int> >& g_J_u,
00327                                       const std::vector<std::vector<unsigned int> >& g_I,
00328                                       block_matrix& g_A_I_J_vcl,
00329                                       block_vector& g_bv_vcl,
00330                                       block_matrix& g_A_I_J_u_vcl,
00331                                       std::vector<cl_uint>& g_is_update,
00332                                       viennacl::context ctx)
00333           {
00334             viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
00335             unsigned int local_r_n = 0;
00336             unsigned int local_c_n = 0;
00337             unsigned int sz_blocks = 0;
00338             get_max_block_size(g_I, local_r_n);
00339             get_max_block_size(g_J_u, local_c_n);
00340             //for debug
00341             std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
00342             std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
00343             compute_blocks_size(g_I, g_J_u, sz_blocks, blocks_ind, matrix_dims);
00344             //std::vector<ScalarType> con_A_I_J(sz_blocks, static_cast<ScalarType>(0));
00345 
00346             viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00347                                                                                      static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
00348                                                                                      &(g_is_update[0]));
00349             viennacl::linalg::opencl::kernels::spai<ScalarType>::init(opencl_ctx);
00350             viennacl::ocl::kernel& block_q_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<ScalarType>::program_name(), "block_q_mult");
00351             block_q_kernel.local_work_size(0, local_c_n);
00352             block_q_kernel.global_work_size(0, 256);
00353             viennacl::ocl::enqueue(block_q_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(),
00354                                                   g_bv_vcl.handle(),
00355                                                   g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_A_I_J_u_vcl.handle1(), g_is_update_vcl,
00356                                                   viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType)*(local_r_n*local_c_n))),
00357                                                   static_cast<cl_uint>(g_I.size())));
00358           }
00359 
00366           template <typename SizeType>
00367           void assemble_qr_row_inds(const std::vector<std::vector<SizeType> >& g_I, const std::vector<std::vector<SizeType> > g_J,
00368                                     const std::vector<std::vector<SizeType> >& g_I_u,
00369                                     std::vector<std::vector<SizeType> >& g_I_q)
00370           {
00371 #ifdef VIENNACL_WITH_OPENMP
00372             #pragma omp parallel for
00373 #endif
00374             for(long i = 0; i < static_cast<long>(g_I.size()); ++i)
00375             {
00376               for(vcl_size_t j = g_J[i].size(); j < g_I[i].size(); ++j)
00377                   g_I_q[i].push_back(g_I[i][j]);
00378 
00379               for(vcl_size_t j = 0; j < g_I_u[i].size(); ++j)
00380                   g_I_q[i].push_back(g_I_u[i][j]);
00381             }
00382           }
00383 
00398           template<typename ScalarType>
00399           void assemble_qr_block(
00400                                 const std::vector<std::vector<unsigned int> >& g_J,
00401                                 const std::vector<std::vector<unsigned int> >& g_I,
00402                                 const std::vector<std::vector<unsigned int> >& g_J_u,
00403                                 const std::vector<std::vector<unsigned int> >& g_I_u,
00404                                 std::vector<std::vector<unsigned int> >& g_I_q,
00405                                 block_matrix& g_A_I_J_u_vcl,
00406                                 viennacl::ocl::handle<cl_mem>& matrix_dimensions,
00407                                 block_matrix& g_A_I_u_J_u_vcl,
00408                                 std::vector<cl_uint>& g_is_update,
00409                                 const bool is_empty_block,
00410                                 viennacl::context ctx)
00411           {
00412             viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
00413 
00414             //std::vector<std::vector<unsigned int> > g_I_q(g_I.size());
00415             assemble_qr_row_inds(g_I, g_J, g_I_u, g_I_q);
00416             unsigned int sz_blocks;
00417             std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
00418             std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
00419             compute_blocks_size(g_I_q, g_J_u, sz_blocks, blocks_ind, matrix_dims);
00420             std::vector<ScalarType> con_A_I_J_q(sz_blocks, static_cast<ScalarType>(0));
00421 
00422             block_matrix g_A_I_J_q_vcl;
00423             //need to allocate memory for QR block
00424             g_A_I_J_q_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00425                                                               static_cast<unsigned int>(sizeof(ScalarType)*sz_blocks),
00426                                                               &(con_A_I_J_q[0]));
00427             g_A_I_J_q_vcl.handle().context(opencl_ctx);
00428 
00429             g_A_I_J_q_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00430                                                                static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size())),
00431                                                                &(matrix_dims[0]));
00432             g_A_I_J_q_vcl.handle1().context(opencl_ctx);
00433 
00434             g_A_I_J_q_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00435                                                                 static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size() + 1)),
00436                                                                 &(blocks_ind[0]));
00437             g_A_I_J_q_vcl.handle2().context(opencl_ctx);
00438 
00439             viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00440                                                                                      static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
00441                                                                                      &(g_is_update[0]));
00442 
00443             viennacl::linalg::opencl::kernels::spai<ScalarType>::init(opencl_ctx);
00444             if(!is_empty_block)
00445             {
00446               viennacl::ocl::kernel& qr_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<ScalarType>::program_name(), "block_qr_assembly");
00447               qr_assembly_kernel.local_work_size(0, 1);
00448               qr_assembly_kernel.global_work_size(0, 256);
00449               viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions,
00450                                                         g_A_I_J_u_vcl.handle(),
00451                                                         g_A_I_J_u_vcl.handle2(),
00452                                                         g_A_I_J_u_vcl.handle1(),
00453                                                         g_A_I_u_J_u_vcl.handle(),
00454                                                         g_A_I_u_J_u_vcl.handle2(),
00455                                                         g_A_I_u_J_u_vcl.handle1(),
00456                                                         g_A_I_J_q_vcl.handle(),
00457                                                         g_A_I_J_q_vcl.handle2(),
00458                                                         g_A_I_J_q_vcl.handle1(),
00459                                                         g_is_update_vcl,
00460                                                         static_cast<unsigned int>(g_I.size())));
00461             }
00462             else
00463             {
00464               viennacl::ocl::kernel& qr_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<ScalarType>::program_name(), "block_qr_assembly_1");
00465               qr_assembly_kernel.local_work_size(0, 1);
00466               qr_assembly_kernel.global_work_size(0, 256);
00467               viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions, g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(),
00468                                                         g_A_I_J_u_vcl.handle1(),
00469                                                         g_A_I_J_q_vcl.handle(),
00470                                                         g_A_I_J_q_vcl.handle2(), g_A_I_J_q_vcl.handle1(),
00471                                                         g_is_update_vcl,
00472                                                         static_cast<unsigned int>(g_I.size())));
00473             }
00474             g_A_I_u_J_u_vcl.handle() = g_A_I_J_q_vcl.handle();
00475             g_A_I_u_J_u_vcl.handle1() = g_A_I_J_q_vcl.handle1();
00476             g_A_I_u_J_u_vcl.handle2() = g_A_I_J_q_vcl.handle2();
00477           }
00478 
00490           template<typename ScalarType>
00491           void assemble_r(std::vector<std::vector<unsigned int> >& g_I, std::vector<std::vector<unsigned int> >& g_J,
00492                           block_matrix& g_A_I_J_vcl,
00493                           block_matrix& g_A_I_J_u_vcl,
00494                           block_matrix& g_A_I_u_J_u_vcl,
00495                           block_vector& g_bv_vcl,
00496                           block_vector& g_bv_vcl_u,
00497                           std::vector<cl_uint>& g_is_update,
00498                           viennacl::context ctx)
00499           {
00500             viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
00501             std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
00502             std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
00503             std::vector<cl_uint> start_bv_r_inds(g_I.size() + 1, 0);
00504             unsigned int sz_blocks, bv_size;
00505             compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);
00506             get_size(g_J, bv_size);
00507             init_start_inds(g_J, start_bv_r_inds);
00508             std::vector<ScalarType> con_A_I_J_r(sz_blocks, static_cast<ScalarType>(0));
00509             std::vector<ScalarType> b_v_r(bv_size, static_cast<ScalarType>(0));
00510 
00511             block_matrix g_A_I_J_r_vcl;
00512             block_vector g_bv_r_vcl;
00513             g_A_I_J_r_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00514                                                               static_cast<unsigned int>(sizeof(ScalarType)*sz_blocks),
00515                                                               &(con_A_I_J_r[0]));
00516             g_A_I_J_r_vcl.handle().context(opencl_ctx);
00517 
00518             g_A_I_J_r_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00519                                                                static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size())),
00520                                                                &(matrix_dims[0]));
00521             g_A_I_J_r_vcl.handle1().context(opencl_ctx);
00522 
00523             g_A_I_J_r_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00524                                                                static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size() + 1)),
00525                                                                &(blocks_ind[0]));
00526             g_A_I_J_r_vcl.handle2().context(opencl_ctx);
00527 
00528             g_bv_r_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00529                                                            static_cast<unsigned int>(sizeof(ScalarType)*bv_size),
00530                                                            &(b_v_r[0]));
00531             g_bv_r_vcl.handle().context(opencl_ctx);
00532 
00533             g_bv_r_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00534                                                             static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
00535                                                             &(start_bv_r_inds[0]));
00536             g_bv_r_vcl.handle().context(opencl_ctx);
00537 
00538             viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00539                                                                                      static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
00540                                                                                      &(g_is_update[0]));
00541             viennacl::linalg::opencl::kernels::spai<ScalarType>::init(opencl_ctx);
00542             viennacl::ocl::kernel& r_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<ScalarType>::program_name(), "block_r_assembly");
00543             r_assembly_kernel.local_work_size(0, 1);
00544             r_assembly_kernel.global_work_size(0, 256);
00545 
00546             viennacl::ocl::enqueue(r_assembly_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(),
00547                                                     g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), g_A_I_J_u_vcl.handle1(),
00548                                                     g_A_I_u_J_u_vcl.handle(), g_A_I_u_J_u_vcl.handle2(), g_A_I_u_J_u_vcl.handle1(),
00549                                                     g_A_I_J_r_vcl.handle(), g_A_I_J_r_vcl.handle2(), g_A_I_J_r_vcl.handle1(),
00550                                                     g_is_update_vcl, static_cast<cl_uint>(g_I.size())));
00551 
00552             viennacl::ocl::kernel & bv_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<ScalarType>::program_name(), "block_bv_assembly");
00553             bv_assembly_kernel.local_work_size(0, 1);
00554             bv_assembly_kernel.global_work_size(0, 256);
00555             viennacl::ocl::enqueue(bv_assembly_kernel(g_bv_vcl.handle(), g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_bv_vcl_u.handle(),
00556                                                       g_bv_vcl_u.handle1(), g_A_I_J_u_vcl.handle1(),
00557                                                       g_bv_r_vcl.handle(), g_bv_r_vcl.handle1(), g_A_I_J_r_vcl.handle1(), g_is_update_vcl,
00558                                                       static_cast<cl_uint>(g_I.size())));
00559             g_bv_vcl.handle() = g_bv_r_vcl.handle();
00560             g_bv_vcl.handle1() = g_bv_r_vcl.handle1();
00561 
00562             g_A_I_J_vcl.handle() = g_A_I_J_r_vcl.handle();
00563             g_A_I_J_vcl.handle2() = g_A_I_J_r_vcl.handle2();
00564             g_A_I_J_vcl.handle1() = g_A_I_J_r_vcl.handle1();
00565           }
00566 
00578           template<typename ScalarType, unsigned int MAT_ALIGNMENT, typename SparseVectorType>
00579           void block_update(const viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT>& A, const std::vector<SparseVectorType>& A_v_c,
00580                             std::vector<cl_uint>& g_is_update,
00581                             std::vector<SparseVectorType>& g_res,
00582                             std::vector<std::vector<unsigned int> >& g_J,
00583                             std::vector<std::vector<unsigned int> >& g_I,
00584                             block_matrix& g_A_I_J_vcl,
00585                             block_vector& g_bv_vcl,
00586                             spai_tag const & tag)
00587           {
00588             viennacl::context ctx = viennacl::traits::context(A);
00589             //updated index set for columns
00590             std::vector<std::vector<unsigned int> > g_J_u(g_J.size());
00591             //updated index set for rows
00592             std::vector<std::vector<unsigned int> > g_I_u(g_J.size());
00593             //mixed index set of old and updated indices for rows
00594             std::vector<std::vector<unsigned int> > g_I_q(g_J.size());
00595             //GPU memory for A_I_\hatJ
00596             block_matrix g_A_I_J_u_vcl;
00597             //GPU memory for A_\hatI_\hatJ
00598             block_matrix g_A_I_u_J_u_vcl;
00599             bool is_empty_block;
00600             //GPU memory for new b_v
00601             block_vector g_bv_u_vcl;
00602 #ifdef VIENNACL_WITH_OPENMP
00603             #pragma omp parallel for
00604 #endif
00605             for(long i = 0; i < static_cast<long>(g_J.size()); ++i)
00606             {
00607               if(g_is_update[i])
00608               {
00609                 if(buildAugmentedIndexSet<SparseVectorType, ScalarType>(A_v_c, g_res[i], g_J[i], g_J_u[i], tag))
00610                     buildNewRowSet(A_v_c, g_I[i], g_J_u[i], g_I_u[i]);
00611               }
00612             }
00613             //assemble new A_I_J_u blocks on GPU and multiply them with Q'
00614             block_assembly(A, g_J_u, g_I, g_A_I_J_u_vcl, g_is_update, is_empty_block);
00615             //I have matrix A_I_J_u ready..
00616             block_q_multiplication<ScalarType>(g_J_u, g_I, g_A_I_J_vcl, g_bv_vcl, g_A_I_J_u_vcl, g_is_update, ctx);
00617             //assemble A_\hatI_\hatJ
00618             block_assembly(A, g_J_u, g_I_u, g_A_I_u_J_u_vcl, g_is_update, is_empty_block);
00619             assemble_qr_block<ScalarType>(g_J, g_I, g_J_u, g_I_u, g_I_q, g_A_I_J_u_vcl, g_A_I_J_vcl.handle1(),
00620                                           g_A_I_u_J_u_vcl, g_is_update, is_empty_block, ctx);
00621 
00622             block_qr<ScalarType>(g_I_q, g_J_u, g_A_I_u_J_u_vcl, g_bv_u_vcl, g_is_update, ctx);
00623             //concatanation of new and old indices
00624 #ifdef VIENNACL_WITH_OPENMP
00625             #pragma omp parallel for
00626 #endif
00627             for(long i = 0; i < static_cast<long>(g_J.size()); ++i)
00628             {
00629               g_J[i].insert(g_J[i].end(), g_J_u[i].begin(), g_J_u[i].end());
00630               g_I[i].insert(g_I[i].end(), g_I_u[i].begin(), g_I_u[i].end());
00631             }
00632             assemble_r<ScalarType>(g_I, g_J, g_A_I_J_vcl, g_A_I_J_u_vcl, g_A_I_u_J_u_vcl,  g_bv_vcl,  g_bv_u_vcl, g_is_update, ctx);
00633           }
00634 
00635         }
00636       }
00637     }
00638 }
00639 #endif