ViennaCL - The Vienna Computing Library
1.5.0
#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP
#define VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP

/* =========================================================================
   Copyright (c) 2010-2013, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the PDF manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/detail/spai/spai_dynamic.hpp
    @brief Implementation of the dynamic SPAI update. Provides the routines for automatic pattern updates. Experimental.
*/

#include <utility>
#include <iostream>
#include <fstream>
#include <string>
#include <algorithm>
#include <vector>
#include <cmath>
#include <map>

#include "boost/numeric/ublas/vector.hpp"
#include "boost/numeric/ublas/matrix.hpp"
#include "boost/numeric/ublas/matrix_proxy.hpp"
#include "boost/numeric/ublas/vector_proxy.hpp"
#include "boost/numeric/ublas/storage.hpp"
#include "boost/numeric/ublas/io.hpp"
#include "boost/numeric/ublas/lu.hpp"
#include "boost/numeric/ublas/triangular.hpp"
#include "boost/numeric/ublas/matrix_expression.hpp"

// ViennaCL includes
#include "viennacl/linalg/prod.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/linalg/sparse_matrix_operations.hpp"
#include "viennacl/linalg/matrix_operations.hpp"
#include "viennacl/scalar.hpp"
#include "viennacl/linalg/cg.hpp"
#include "viennacl/linalg/inner_prod.hpp"
#include "viennacl/linalg/ilu.hpp"
#include "viennacl/ocl/backend.hpp"

#include "viennacl/linalg/detail/spai/block_matrix.hpp"
#include "viennacl/linalg/detail/spai/block_vector.hpp"
#include "viennacl/linalg/detail/spai/qr.hpp"
#include "viennacl/linalg/detail/spai/spai-static.hpp"
#include "viennacl/linalg/detail/spai/spai.hpp"
#include "viennacl/linalg/detail/spai/spai_tag.hpp"
#include "viennacl/linalg/opencl/kernels/spai.hpp"

namespace viennacl
{
namespace linalg
{
namespace detail
{
namespace spai
{

/** @brief Helper functor: orders (index, value) pairs by descending second component. */
struct CompareSecond
{
  template <typename T1, typename T2>
  bool operator()(std::pair<T1, T2> const & left, std::pair<T1, T2> const & right)
  {
    return static_cast<double>(left.second) > static_cast<double>(right.second);
  }
};

/** @brief Composes a new upper triangular factor R out of the previous R, the matrix Q^T * A(I, J~) and the newly factored block R_n.
 *
 * @param A    matrix Q^T * A(I, J~)
 * @param R_n  triangular factor of the newly decomposed lower block
 * @param R    previous factor; overwritten with the composite factor
 */
template <typename MatrixType>
void composeNewR(const MatrixType& A, const MatrixType& R_n, MatrixType& R)
{
  typedef typename MatrixType::value_type ScalarType;

  vcl_size_t row_n = R_n.size1() - (A.size1() - R.size2());
  MatrixType C = boost::numeric::ublas::zero_matrix<ScalarType>(R.size1() + row_n, R.size2() + A.size2());

  // write the original R to the new composite R
  boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, R.size1()),
                                    boost::numeric::ublas::range(0, R.size2())) += R;
  // write the upper part of Q^T * A(I, J~): all columns, and as many rows as R has columns
  boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, R.size2()),
                                    boost::numeric::ublas::range(R.size2(), R.size2() + A.size2())) +=
    boost::numeric::ublas::project(A, boost::numeric::ublas::range(0, R.size2()),
                                      boost::numeric::ublas::range(0, A.size2()));
  // add the newly QR-decomposed block to the composite R
  if (R_n.size1() > 0 && R_n.size2() > 0)
    boost::numeric::ublas::project(C, boost::numeric::ublas::range(R.size2(), R.size1() + row_n),
                                      boost::numeric::ublas::range(R.size2(), R.size2() + A.size2())) += R_n;
  R = C;
}
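// Illustrative sketch (not part of the original source): writing q = R.size2(),
// composeNewR() lays out the composite factor C as
//
//         [ R | rows 0..q-1 of Q^T A(I, J~) ]
//     C = [ 0 | R_n                         ]
//
// so the leading part of C is again upper triangular and C replaces R in the
// next update step. (R may also carry Householder data below its diagonal,
// which is copied along with it.)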
/** @brief Composes a new vector of QR beta coefficients out of the previous coefficients and those of the newly factored block (necessary for the recovery of Q).
 *
 * @param v_n  beta coefficients from the new QR factorization
 * @param v    previous coefficients; overwritten with the concatenation
 */
template <typename VectorType>
void composeNewVector(const VectorType& v_n, VectorType& v)
{
  typedef typename VectorType::value_type ScalarType;

  VectorType w = boost::numeric::ublas::zero_vector<ScalarType>(v.size() + v_n.size());
  boost::numeric::ublas::project(w, boost::numeric::ublas::range(0, v.size())) += v;
  boost::numeric::ublas::project(w, boost::numeric::ublas::range(v.size(), v.size() + v_n.size())) += v_n;
  v = w;
}

/** @brief Euclidean norm of a sparse vector. The result is accumulated into norm, which the caller must initialize to zero. */
template <typename SparseVectorType, typename ScalarType>
void sparse_norm_2(const SparseVectorType& v, ScalarType& norm)
{
  for (typename SparseVectorType::const_iterator vec_it = v.begin(); vec_it != v.end(); ++vec_it)
    norm += (vec_it->second) * (vec_it->second);

  norm = std::sqrt(norm);
}

/** @brief Dot product of two sparse vectors. The result is accumulated into res_v, which the caller must initialize to zero. */
template <typename SparseVectorType, typename ScalarType>
void sparse_inner_prod(const SparseVectorType& v1, const SparseVectorType& v2, ScalarType& res_v)
{
  typename SparseVectorType::const_iterator v_it1 = v1.begin();
  typename SparseVectorType::const_iterator v_it2 = v2.begin();
  while ((v_it1 != v1.end()) && (v_it2 != v2.end()))
  {
    if (v_it1->first == v_it2->first)
    {
      res_v += (v_it1->second) * (v_it2->second);
      ++v_it1;
      ++v_it2;
    }
    else if (v_it1->first < v_it2->first)
      ++v_it1;
    else
      ++v_it2;
  }
}
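// Illustrative usage (not part of the original source): sparse_norm_2() and
// sparse_inner_prod() work with any sparse vector type that iterates over
// (index, value) pairs, e.g. a std::map<unsigned int, double>:
//
//   std::map<unsigned int, double> u, v;
//   u[0] = 3.0; u[2] = 4.0;
//   v[2] = 2.0; v[5] = 1.0;
//   double norm = 0, dot = 0;        // both helpers accumulate into their
//   sparse_norm_2(u, norm);          // output, so it must be zeroed first:
//   sparse_inner_prod(u, v, dot);    // norm == 5.0, dot == 4.0 * 2.0 == 8.0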
/** @brief Builds the new set of column indices J_u for a given residual, cf. Kallischko dissertation p.31-32.
 *
 * Candidate columns are the residual entries above the residual threshold that are not yet in J;
 * they are ranked by the residual reduction (r^T A e_k)^2 / ||A e_k||^2 and at most |J| of them are accepted.
 *
 * @return false if no new column indices were found
 */
template <typename SparseVectorType, typename ScalarType>
bool buildAugmentedIndexSet(const std::vector<SparseVectorType>& A_v_c,
                            const SparseVectorType& res,
                            std::vector<unsigned int>& J,
                            std::vector<unsigned int>& J_u,
                            const spai_tag& tag)
{
  std::vector<std::pair<unsigned int, ScalarType> > p;
  vcl_size_t cur_size = 0;
  ScalarType inprod, norm2;

  for (typename SparseVectorType::const_iterator res_it = res.begin(); res_it != res.end(); ++res_it)
  {
    if (!isInIndexSet(J, res_it->first) && (std::fabs(res_it->second) > tag.getResidualThreshold()))
    {
      inprod = norm2 = 0;
      sparse_inner_prod(res, A_v_c[res_it->first], inprod);
      sparse_norm_2(A_v_c[res_it->first], norm2);
      p.push_back(std::pair<unsigned int, ScalarType>(res_it->first, (inprod * inprod) / (norm2 * norm2)));
    }
  }

  std::sort(p.begin(), p.end(), CompareSecond());
  while ((cur_size < J.size()) && (p.size() > 0))
  {
    J_u.push_back(p[0].first);
    p.erase(p.begin());
    cur_size++;
  }
  p.clear();
  return (cur_size > 0);
}

/** @brief Builds the set of new row indices I_n: all rows touched by the new columns J_n that are neither in I nor already in I_n. */
template <typename SparseVectorType>
void buildNewRowSet(const std::vector<SparseVectorType>& A_v_c, const std::vector<unsigned int>& I,
                    const std::vector<unsigned int>& J_n, std::vector<unsigned int>& I_n)
{
  for (vcl_size_t i = 0; i < J_n.size(); ++i)
  {
    for (typename SparseVectorType::const_iterator col_it = A_v_c[J_n[i]].begin(); col_it != A_v_c[J_n[i]].end(); ++col_it)
    {
      if (!isInIndexSet(I, col_it->first) && !isInIndexSet(I_n, col_it->first))
        I_n.push_back(col_it->first);
    }
  }
}

/** @brief Composes the block for the new QR factorization: the rows of Q^T * A(I, J~) below the current R stacked on top of A(I~, J~), cf. Kallischko dissertation p.82.
 *
 * @param A_I_J      previously QR-factored block (its column count equals the size of the current R)
 * @param A_I_J_u    matrix Q^T * A(I, J~)
 * @param A_I_u_J_u  matrix A(I~, J~); overwritten with the composed block
 */
template <typename MatrixType>
void QRBlockComposition(const MatrixType& A_I_J, const MatrixType& A_I_J_u, MatrixType& A_I_u_J_u)
{
  typedef typename MatrixType::value_type ScalarType;

  vcl_size_t row_n1 = A_I_J_u.size1() - A_I_J.size2();
  vcl_size_t row_n2 = A_I_u_J_u.size1();
  vcl_size_t row_n  = row_n1 + row_n2;
  vcl_size_t col_n  = A_I_J_u.size2();

  MatrixType C = boost::numeric::ublas::zero_matrix<ScalarType>(row_n, col_n);
  boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, row_n1),
                                    boost::numeric::ublas::range(0, col_n)) +=
    boost::numeric::ublas::project(A_I_J_u, boost::numeric::ublas::range(A_I_J.size2(), A_I_J_u.size1()),
                                            boost::numeric::ublas::range(0, col_n));

  boost::numeric::ublas::project(C, boost::numeric::ublas::range(row_n1, row_n1 + row_n2),
                                    boost::numeric::ublas::range(0, col_n)) += A_I_u_J_u;
  A_I_u_J_u = C;
}
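// Illustrative sketch (not part of the original source): QRBlockComposition()
// stacks the rows of Q^T * A(I, J~) that lie below the current R on top of
// the block A(I~, J~):
//
//         [ rows q.. of Q^T A(I, J~) ]       q = A_I_J.size2()
//     C = [ A(I~, J~)                ]
//
// single_qr() then factors exactly this block in block_update() below.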
/** @brief CPU-based dynamic update of the SPAI preconditioner: augments the index sets of every block that still improves and updates its QR data.
 *
 * @param A            initial sparse matrix
 * @param A_v_c        column-major vectorized copy of A
 * @param g_res        container of per-block residuals
 * @param g_is_update  container of update flags
 * @param g_I          container of row index sets
 * @param g_J          container of column index sets
 * @param g_b_v        container of QR beta coefficients
 * @param g_A_I_J      container of QR-factored blocks
 * @param tag          SPAI configuration tag
 */
template <typename SparseMatrixType, typename SparseVectorType, typename DenseMatrixType, typename VectorType>
void block_update(const SparseMatrixType& A, const std::vector<SparseVectorType>& A_v_c,
                  std::vector<SparseVectorType>& g_res,
                  std::vector<bool>& g_is_update,
                  std::vector<std::vector<unsigned int> >& g_I,
                  std::vector<std::vector<unsigned int> >& g_J,
                  std::vector<VectorType>& g_b_v,
                  std::vector<DenseMatrixType>& g_A_I_J,
                  spai_tag const & tag)
{
  typedef typename DenseMatrixType::value_type ScalarType;

  // set of new column indices
  std::vector<std::vector<unsigned int> > g_J_u(g_J.size());
  // set of new row indices
  std::vector<std::vector<unsigned int> > g_I_u(g_J.size());
  // matrix A(I, J~), cf. Kallischko p.31-32
  std::vector<DenseMatrixType> g_A_I_J_u(g_J.size());
  // matrix A(I~, J~), cf. Kallischko
  std::vector<DenseMatrixType> g_A_I_u_J_u(g_J.size());
  // new vector of beta coefficients from the QR factorization
  std::vector<VectorType> g_b_v_u(g_J.size());

#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for
#endif
  for (long i = 0; i < static_cast<long>(g_J.size()); ++i)
  {
    if (g_is_update[i])
    {
      if (buildAugmentedIndexSet<SparseVectorType, ScalarType>(A_v_c, g_res[i], g_J[i], g_J_u[i], tag))
      {
        // initialize matrix A(I, J~)
        initProjectSubMatrix(A, g_J_u[i], g_I[i], g_A_I_J_u[i]);
        // multiplication Q^T * A(I, J~)
        apply_q_trans_mat(g_A_I_J[i], g_b_v[i], g_A_I_J_u[i]);
        // build the new row index set I~
        buildNewRowSet(A_v_c, g_I[i], g_J_u[i], g_I_u[i]);
        initProjectSubMatrix(A, g_J_u[i], g_I_u[i], g_A_I_u_J_u[i]);
        // compose the block for the new QR factorization
        QRBlockComposition(g_A_I_J[i], g_A_I_J_u[i], g_A_I_u_J_u[i]);
        // QR factorization
        single_qr(g_A_I_u_J_u[i], g_b_v_u[i]);
        // compose the new R and the new vector b_v
        composeNewR(g_A_I_J_u[i], g_A_I_u_J_u[i], g_A_I_J[i]);
        composeNewVector(g_b_v_u[i], g_b_v[i]);
        // merge the new indices into I and J
        g_J[i].insert(g_J[i].end(), g_J_u[i].begin(), g_J_u[i].end());
        g_I[i].insert(g_I[i].end(), g_I_u[i].begin(), g_I_u[i].end());
      }
      else
      {
        // no new column indices found: this block is done
        g_is_update[i] = false;
      }
    }
  }
}
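// Summary of one adaptive update pass, per block i with g_is_update[i] set
// (following the steps above):
//   1. pick new columns J~ from the residual     (buildAugmentedIndexSet)
//   2. form A(I, J~) and apply Q^T               (initProjectSubMatrix, apply_q_trans_mat)
//   3. pick the new rows I~ touched by J~        (buildNewRowSet)
//   4. stack and factor the fresh block          (QRBlockComposition, single_qr)
//   5. merge factors and index sets              (composeNewR, composeNewVector)
// A block that yields no new column indices is frozen via g_is_update.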
/**************************************************** GPU SPAI Update ****************************************************************/

/** @brief Performs the multiplication Q^T * A(I, J~) on the GPU.
 *
 * @param g_J_u          container of new column index sets
 * @param g_I            container of row index sets
 * @param g_A_I_J_vcl    QR-factored blocks on the device
 * @param g_bv_vcl       QR beta coefficients on the device
 * @param g_A_I_J_u_vcl  blocks A(I, J~) on the device; overwritten with Q^T * A(I, J~)
 * @param g_is_update    container of update flags
 * @param ctx            context of the computation
 */
template <typename ScalarType>
void block_q_multiplication(const std::vector<std::vector<unsigned int> >& g_J_u,
                            const std::vector<std::vector<unsigned int> >& g_I,
                            block_matrix& g_A_I_J_vcl,
                            block_vector& g_bv_vcl,
                            block_matrix& g_A_I_J_u_vcl,
                            std::vector<cl_uint>& g_is_update,
                            viennacl::context ctx)
{
  viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
  unsigned int local_r_n = 0;
  unsigned int local_c_n = 0;
  unsigned int sz_blocks = 0;
  get_max_block_size(g_I, local_r_n);
  get_max_block_size(g_J_u, local_c_n);

  std::vector<cl_uint> matrix_dims(g_I.size() * 2, static_cast<cl_uint>(0));
  std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
  compute_blocks_size(g_I, g_J_u, sz_blocks, blocks_ind, matrix_dims);

  viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                                           static_cast<unsigned int>(sizeof(cl_uint) * g_is_update.size()),
                                                                           &(g_is_update[0]));
  viennacl::linalg::opencl::kernels::spai<ScalarType>::init(opencl_ctx);
  viennacl::ocl::kernel& block_q_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<ScalarType>::program_name(), "block_q_mult");
  block_q_kernel.local_work_size(0, local_c_n);
  block_q_kernel.global_work_size(0, 256);
  viennacl::ocl::enqueue(block_q_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(),
                                        g_bv_vcl.handle(), g_bv_vcl.handle1(),
                                        g_A_I_J_vcl.handle1(), g_A_I_J_u_vcl.handle1(), g_is_update_vcl,
                                        viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType) * local_r_n * local_c_n)),
                                        static_cast<cl_uint>(g_I.size())));
}

/** @brief Assembles the row index sets I_q for the new QR blocks: the rows of A(I, J~) below the current R, followed by the new rows I~.
 *
 * @param g_I    container of row index sets
 * @param g_J    container of column index sets
 * @param g_I_u  container of new row index sets
 * @param g_I_q  output container of row index sets for the new QR blocks
 */
template <typename SizeType>
void assemble_qr_row_inds(const std::vector<std::vector<SizeType> >& g_I, const std::vector<std::vector<SizeType> >& g_J,
                          const std::vector<std::vector<SizeType> >& g_I_u,
                          std::vector<std::vector<SizeType> >& g_I_q)
{
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for
#endif
  for (long i = 0; i < static_cast<long>(g_I.size()); ++i)
  {
    for (vcl_size_t j = g_J[i].size(); j < g_I[i].size(); ++j)
      g_I_q[i].push_back(g_I[i][j]);

    for (vcl_size_t j = 0; j < g_I_u[i].size(); ++j)
      g_I_q[i].push_back(g_I_u[i][j]);
  }
}
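// Worked example (hypothetical values, not from the original source): for a
// block i with
//   g_J[i]   = {0, 3}        two columns already covered by R
//   g_I[i]   = {0, 3, 7}     current rows
//   g_I_u[i] = {1, 4}        new rows
// assemble_qr_row_inds() appends g_I_q[i] = {7, 1, 4}: the rows of A(I, J~)
// below the R part, followed by the new rows of A(I~, J~).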
/** @brief Performs the assembly of the new QR blocks: glues the lower part of Q^T * A(I, J~) together with A(I~, J~).
 *
 * @param g_J                container of column index sets
 * @param g_I                container of row index sets
 * @param g_J_u              container of new column index sets
 * @param g_I_u              container of new row index sets
 * @param g_I_q              output container of row index sets of the new QR blocks
 * @param g_A_I_J_u_vcl      blocks Q^T * A(I, J~) on the device
 * @param matrix_dimensions  handle to the dimensions of the previously factored blocks
 * @param g_A_I_u_J_u_vcl    blocks A(I~, J~) on the device; on exit holds the assembled QR blocks
 * @param g_is_update        container of update flags
 * @param is_empty_block     flag indicating whether the A(I~, J~) blocks are empty
 * @param ctx                context of the computation
 */
template <typename ScalarType>
void assemble_qr_block(const std::vector<std::vector<unsigned int> >& g_J,
                       const std::vector<std::vector<unsigned int> >& g_I,
                       const std::vector<std::vector<unsigned int> >& g_J_u,
                       const std::vector<std::vector<unsigned int> >& g_I_u,
                       std::vector<std::vector<unsigned int> >& g_I_q,
                       block_matrix& g_A_I_J_u_vcl,
                       viennacl::ocl::handle<cl_mem>& matrix_dimensions,
                       block_matrix& g_A_I_u_J_u_vcl,
                       std::vector<cl_uint>& g_is_update,
                       const bool is_empty_block,
                       viennacl::context ctx)
{
  viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());

  assemble_qr_row_inds(g_I, g_J, g_I_u, g_I_q);
  unsigned int sz_blocks;
  std::vector<cl_uint> matrix_dims(g_I.size() * 2, static_cast<cl_uint>(0));
  std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
  compute_blocks_size(g_I_q, g_J_u, sz_blocks, blocks_ind, matrix_dims);
  std::vector<ScalarType> con_A_I_J_q(sz_blocks, static_cast<ScalarType>(0));

  // allocate memory for the QR blocks
  block_matrix g_A_I_J_q_vcl;
  g_A_I_J_q_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                    static_cast<unsigned int>(sizeof(ScalarType) * sz_blocks),
                                                    &(con_A_I_J_q[0]));
  g_A_I_J_q_vcl.handle().context(opencl_ctx);

  g_A_I_J_q_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                     static_cast<unsigned int>(sizeof(cl_uint) * 2 * static_cast<unsigned int>(g_I.size())),
                                                     &(matrix_dims[0]));
  g_A_I_J_q_vcl.handle1().context(opencl_ctx);

  g_A_I_J_q_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                     static_cast<unsigned int>(sizeof(cl_uint) * static_cast<unsigned int>(g_I.size() + 1)),
                                                     &(blocks_ind[0]));
  g_A_I_J_q_vcl.handle2().context(opencl_ctx);

  viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                                           static_cast<unsigned int>(sizeof(cl_uint) * g_is_update.size()),
                                                                           &(g_is_update[0]));

  viennacl::linalg::opencl::kernels::spai<ScalarType>::init(opencl_ctx);
  if (!is_empty_block)
  {
    viennacl::ocl::kernel& qr_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<ScalarType>::program_name(), "block_qr_assembly");
    qr_assembly_kernel.local_work_size(0, 1);
    qr_assembly_kernel.global_work_size(0, 256);
    viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions,
                                              g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), g_A_I_J_u_vcl.handle1(),
                                              g_A_I_u_J_u_vcl.handle(), g_A_I_u_J_u_vcl.handle2(), g_A_I_u_J_u_vcl.handle1(),
                                              g_A_I_J_q_vcl.handle(), g_A_I_J_q_vcl.handle2(), g_A_I_J_q_vcl.handle1(),
                                              g_is_update_vcl,
                                              static_cast<unsigned int>(g_I.size())));
  }
  else
  {
    viennacl::ocl::kernel& qr_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<ScalarType>::program_name(), "block_qr_assembly_1");
    qr_assembly_kernel.local_work_size(0, 1);
    qr_assembly_kernel.global_work_size(0, 256);
    viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions,
                                              g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), g_A_I_J_u_vcl.handle1(),
                                              g_A_I_J_q_vcl.handle(), g_A_I_J_q_vcl.handle2(), g_A_I_J_q_vcl.handle1(),
                                              g_is_update_vcl,
                                              static_cast<unsigned int>(g_I.size())));
  }
  // the assembled QR blocks replace A(I~, J~)
  g_A_I_u_J_u_vcl.handle()  = g_A_I_J_q_vcl.handle();
  g_A_I_u_J_u_vcl.handle1() = g_A_I_J_q_vcl.handle1();
  g_A_I_u_J_u_vcl.handle2() = g_A_I_J_q_vcl.handle2();
}
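// Storage convention relied upon above (a sketch of the block_matrix layout as
// this file uses it): handle() holds the entries of all blocks concatenated,
// handle1() the per-block dimensions (two cl_uints: rows, cols), and handle2()
// the starting offset of each block. For two blocks of sizes 3x2 and 2x2,
// compute_blocks_size() would give
//   matrix_dims = {3,2, 2,2},  blocks_ind = {0, 6, 10},  sz_blocks = 10.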
/** @brief Performs the assembly of the new R factors and beta vectors on the GPU.
 *
 * @param g_I              container of row index sets
 * @param g_J              container of column index sets
 * @param g_A_I_J_vcl      previously factored blocks on the device; on exit holds the composite R factors
 * @param g_A_I_J_u_vcl    blocks Q^T * A(I, J~) on the device
 * @param g_A_I_u_J_u_vcl  newly factored QR blocks on the device
 * @param g_bv_vcl         QR beta coefficients on the device; on exit holds the concatenated coefficients
 * @param g_bv_vcl_u       new QR beta coefficients on the device
 * @param g_is_update      container of update flags
 * @param ctx              context of the computation
 */
template <typename ScalarType>
void assemble_r(std::vector<std::vector<unsigned int> >& g_I, std::vector<std::vector<unsigned int> >& g_J,
                block_matrix& g_A_I_J_vcl,
                block_matrix& g_A_I_J_u_vcl,
                block_matrix& g_A_I_u_J_u_vcl,
                block_vector& g_bv_vcl,
                block_vector& g_bv_vcl_u,
                std::vector<cl_uint>& g_is_update,
                viennacl::context ctx)
{
  viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
  std::vector<cl_uint> matrix_dims(g_I.size() * 2, static_cast<cl_uint>(0));
  std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
  std::vector<cl_uint> start_bv_r_inds(g_I.size() + 1, 0);
  unsigned int sz_blocks, bv_size;
  compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);
  get_size(g_J, bv_size);
  init_start_inds(g_J, start_bv_r_inds);
  std::vector<ScalarType> con_A_I_J_r(sz_blocks, static_cast<ScalarType>(0));
  std::vector<ScalarType> b_v_r(bv_size, static_cast<ScalarType>(0));

  block_matrix g_A_I_J_r_vcl;
  block_vector g_bv_r_vcl;
  g_A_I_J_r_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                    static_cast<unsigned int>(sizeof(ScalarType) * sz_blocks),
                                                    &(con_A_I_J_r[0]));
  g_A_I_J_r_vcl.handle().context(opencl_ctx);

  g_A_I_J_r_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                     static_cast<unsigned int>(sizeof(cl_uint) * 2 * static_cast<unsigned int>(g_I.size())),
                                                     &(matrix_dims[0]));
  g_A_I_J_r_vcl.handle1().context(opencl_ctx);

  g_A_I_J_r_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                     static_cast<unsigned int>(sizeof(cl_uint) * static_cast<unsigned int>(g_I.size() + 1)),
                                                     &(blocks_ind[0]));
  g_A_I_J_r_vcl.handle2().context(opencl_ctx);

  g_bv_r_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                 static_cast<unsigned int>(sizeof(ScalarType) * bv_size),
                                                 &(b_v_r[0]));
  g_bv_r_vcl.handle().context(opencl_ctx);

  g_bv_r_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                  static_cast<unsigned int>(sizeof(cl_uint) * (g_I.size() + 1)),
                                                  &(start_bv_r_inds[0]));
  g_bv_r_vcl.handle1().context(opencl_ctx);

  viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                                           static_cast<unsigned int>(sizeof(cl_uint) * g_is_update.size()),
                                                                           &(g_is_update[0]));
  viennacl::linalg::opencl::kernels::spai<ScalarType>::init(opencl_ctx);
  viennacl::ocl::kernel& r_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<ScalarType>::program_name(), "block_r_assembly");
  r_assembly_kernel.local_work_size(0, 1);
  r_assembly_kernel.global_work_size(0, 256);

  viennacl::ocl::enqueue(r_assembly_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(),
                                           g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), g_A_I_J_u_vcl.handle1(),
                                           g_A_I_u_J_u_vcl.handle(), g_A_I_u_J_u_vcl.handle2(), g_A_I_u_J_u_vcl.handle1(),
                                           g_A_I_J_r_vcl.handle(), g_A_I_J_r_vcl.handle2(), g_A_I_J_r_vcl.handle1(),
                                           g_is_update_vcl, static_cast<cl_uint>(g_I.size())));

  viennacl::ocl::kernel & bv_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<ScalarType>::program_name(), "block_bv_assembly");
  bv_assembly_kernel.local_work_size(0, 1);
  bv_assembly_kernel.global_work_size(0, 256);
  viennacl::ocl::enqueue(bv_assembly_kernel(g_bv_vcl.handle(), g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(),
                                            g_bv_vcl_u.handle(), g_bv_vcl_u.handle1(), g_A_I_J_u_vcl.handle1(),
                                            g_bv_r_vcl.handle(), g_bv_r_vcl.handle1(), g_A_I_J_r_vcl.handle1(),
                                            g_is_update_vcl,
                                            static_cast<cl_uint>(g_I.size())));

  // the assembled buffers replace the previous factors
  g_bv_vcl.handle()  = g_bv_r_vcl.handle();
  g_bv_vcl.handle1() = g_bv_r_vcl.handle1();

  g_A_I_J_vcl.handle()  = g_A_I_J_r_vcl.handle();
  g_A_I_J_vcl.handle2() = g_A_I_J_r_vcl.handle2();
  g_A_I_J_vcl.handle1() = g_A_I_J_r_vcl.handle1();
}
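// Sketch of the b_v index bookkeeping used above (inferred from the get_size()
// and init_start_inds() usage): one beta coefficient is stored per column, so
// the start indices are prefix sums of the column counts. For
// g_J = {{0,3}, {1,2,4}} this would give bv_size = 5 and
// start_bv_r_inds = {0, 2, 5}.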
/** @brief GPU-based dynamic update of the SPAI preconditioner for the blocks that still improve.
 *
 * @param A            initial sparse matrix
 * @param A_v_c        column-major vectorized copy of A
 * @param g_is_update  container of update flags
 * @param g_res        container of per-block residuals
 * @param g_J          container of column index sets
 * @param g_I          container of row index sets
 * @param g_A_I_J_vcl  QR-factored blocks on the device
 * @param g_bv_vcl     QR beta coefficients on the device
 * @param tag          SPAI configuration tag
 */
template <typename ScalarType, unsigned int MAT_ALIGNMENT, typename SparseVectorType>
void block_update(const viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT>& A, const std::vector<SparseVectorType>& A_v_c,
                  std::vector<cl_uint>& g_is_update,
                  std::vector<SparseVectorType>& g_res,
                  std::vector<std::vector<unsigned int> >& g_J,
                  std::vector<std::vector<unsigned int> >& g_I,
                  block_matrix& g_A_I_J_vcl,
                  block_vector& g_bv_vcl,
                  spai_tag const & tag)
{
  viennacl::context ctx = viennacl::traits::context(A);

  // updated index set for columns
  std::vector<std::vector<unsigned int> > g_J_u(g_J.size());
  // updated index set for rows
  std::vector<std::vector<unsigned int> > g_I_u(g_J.size());
  // mixed index set of old and updated row indices
  std::vector<std::vector<unsigned int> > g_I_q(g_J.size());
  // GPU memory for A(I, J~)
  block_matrix g_A_I_J_u_vcl;
  // GPU memory for A(I~, J~)
  block_matrix g_A_I_u_J_u_vcl;
  bool is_empty_block;
  // GPU memory for the new b_v
  block_vector g_bv_u_vcl;

#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for
#endif
  for (long i = 0; i < static_cast<long>(g_J.size()); ++i)
  {
    if (g_is_update[i])
    {
      if (buildAugmentedIndexSet<SparseVectorType, ScalarType>(A_v_c, g_res[i], g_J[i], g_J_u[i], tag))
        buildNewRowSet(A_v_c, g_I[i], g_J_u[i], g_I_u[i]);
    }
  }
  // assemble the new A(I, J~) blocks on the GPU and multiply them by Q^T
  block_assembly(A, g_J_u, g_I, g_A_I_J_u_vcl, g_is_update, is_empty_block);
  block_q_multiplication<ScalarType>(g_J_u, g_I, g_A_I_J_vcl, g_bv_vcl, g_A_I_J_u_vcl, g_is_update, ctx);
  // assemble A(I~, J~) and glue it to the lower part of Q^T * A(I, J~)
  block_assembly(A, g_J_u, g_I_u, g_A_I_u_J_u_vcl, g_is_update, is_empty_block);
  assemble_qr_block<ScalarType>(g_J, g_I, g_J_u, g_I_u, g_I_q, g_A_I_J_u_vcl, g_A_I_J_vcl.handle1(),
                                g_A_I_u_J_u_vcl, g_is_update, is_empty_block, ctx);

  // QR factorization of the assembled blocks
  block_qr<ScalarType>(g_I_q, g_J_u, g_A_I_u_J_u_vcl, g_bv_u_vcl, g_is_update, ctx);

  // concatenation of the new and old indices
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for
#endif
  for (long i = 0; i < static_cast<long>(g_J.size()); ++i)
  {
    g_J[i].insert(g_J[i].end(), g_J_u[i].begin(), g_J_u[i].end());
    g_I[i].insert(g_I[i].end(), g_I_u[i].begin(), g_I_u[i].end());
  }
  assemble_r<ScalarType>(g_I, g_J, g_A_I_J_vcl, g_A_I_J_u_vcl, g_A_I_u_J_u_vcl, g_bv_vcl, g_bv_u_vcl, g_is_update, ctx);
}

} // namespace spai
} // namespace detail
} // namespace linalg
} // namespace viennacl

#endif
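// Usage sketch (illustrative only; see viennacl/linalg/spai.hpp for the public
// interface, and note the routines in this file are internal). The dynamic
// update implemented here runs behind the SPAI preconditioner whenever the tag
// enables pattern updates:
//
//   viennacl::compressed_matrix<double> A;   // assembled elsewhere
//   viennacl::linalg::spai_tag tag;          // default settings
//   viennacl::linalg::spai_precond<viennacl::compressed_matrix<double> > P(A, tag);
//   // P can then be passed as the preconditioner to an iterative solver, e.g.
//   // viennacl::linalg::solve(A, b, viennacl::linalg::bicgstab_tag(), P);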