ViennaCL - The Vienna Computing Library  1.5.0
viennacl/linalg/opencl/sparse_matrix_operations.hpp
Go to the documentation of this file.
00001 #ifndef VIENNACL_LINALG_OPENCL_SPARSE_MATRIX_OPERATIONS_HPP_
00002 #define VIENNACL_LINALG_OPENCL_SPARSE_MATRIX_OPERATIONS_HPP_
00003 
00004 /* =========================================================================
00005    Copyright (c) 2010-2013, Institute for Microelectronics,
00006                             Institute for Analysis and Scientific Computing,
00007                             TU Wien.
00008    Portions of this software are copyright by UChicago Argonne, LLC.
00009 
00010                             -----------------
00011                   ViennaCL - The Vienna Computing Library
00012                             -----------------
00013 
00014    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
00015 
00016    (A list of authors and contributors can be found in the PDF manual)
00017 
00018    License:         MIT (X11), see file LICENSE in the base directory
00019 ============================================================================= */
00020 
00025 #include "viennacl/forwards.h"
00026 #include "viennacl/ocl/device.hpp"
00027 #include "viennacl/ocl/handle.hpp"
00028 #include "viennacl/ocl/kernel.hpp"
00029 #include "viennacl/scalar.hpp"
00030 #include "viennacl/vector.hpp"
00031 #include "viennacl/tools/tools.hpp"
00032 #include "viennacl/linalg/opencl/kernels/compressed_matrix.hpp"
00033 #include "viennacl/linalg/opencl/kernels/coordinate_matrix.hpp"
00034 #include "viennacl/linalg/opencl/kernels/ell_matrix.hpp"
00035 #include "viennacl/linalg/opencl/kernels/hyb_matrix.hpp"
00036 #include "viennacl/linalg/opencl/kernels/compressed_compressed_matrix.hpp"
00037 #include "viennacl/linalg/opencl/common.hpp"
00038 
00039 namespace viennacl
00040 {
00041   namespace linalg
00042   {
00043     namespace opencl
00044     {
00045 
00046       //
00047       // Compressed matrix
00048       //
00049 
00050       namespace detail
00051       {
00052         template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT>
00053         void row_info(compressed_matrix<SCALARTYPE, MAT_ALIGNMENT> const & mat,
00054                       vector_base<SCALARTYPE> & vec,
00055                       viennacl::linalg::detail::row_info_types info_selector)
00056         {
00057           viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
00058           viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::init(ctx);
00059           viennacl::ocl::kernel & row_info_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::program_name(), "row_info_extractor");
00060 
00061           viennacl::ocl::enqueue(row_info_kernel(mat.handle1().opencl_handle(), mat.handle2().opencl_handle(), mat.handle().opencl_handle(),
00062                                                  viennacl::traits::opencl_handle(vec),
00063                                                  cl_uint(mat.size1()),
00064                                                  cl_uint(info_selector)
00065                                                 )
00066                                 );
00067         }
00068       }
00069 
00078       template<class TYPE, unsigned int ALIGNMENT>
00079       void prod_impl(const viennacl::compressed_matrix<TYPE, ALIGNMENT> & mat,
00080                      const viennacl::vector_base<TYPE> & vec,
00081                            viennacl::vector_base<TYPE> & result)
00082       {
00083         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
00084         viennacl::linalg::opencl::kernels::compressed_matrix<TYPE>::init(ctx);
00085         std::stringstream ss;
00086         ss << "vec_mul";
00087         if (ALIGNMENT == 4)
00088           ss << "4";
00089         if (ALIGNMENT == 8)
00090           ss << "8";
00091 
00092         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<TYPE>::program_name(), ss.str());
00093 
00094         viennacl::ocl::packed_cl_uint layout_vec;
00095         layout_vec.start  = cl_uint(viennacl::traits::start(vec));
00096         layout_vec.stride = cl_uint(viennacl::traits::stride(vec));
00097         layout_vec.size   = cl_uint(viennacl::traits::size(vec));
00098         layout_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
00099 
00100         viennacl::ocl::packed_cl_uint layout_result;
00101         layout_result.start  = cl_uint(viennacl::traits::start(result));
00102         layout_result.stride = cl_uint(viennacl::traits::stride(result));
00103         layout_result.size   = cl_uint(viennacl::traits::size(result));
00104         layout_result.internal_size   = cl_uint(viennacl::traits::internal_size(result));
00105 
00106         viennacl::ocl::enqueue(k(mat.handle1().opencl_handle(), mat.handle2().opencl_handle(), mat.handle().opencl_handle(),
00107                                 vec, layout_vec,
00108                                 result, layout_result
00109                                 ));
00110       }
00111 
00112 
00121       template< typename TYPE, unsigned int ALIGNMENT, typename F1, typename F2>
00122       void prod_impl(const viennacl::compressed_matrix<TYPE, ALIGNMENT> & sp_mat,
00123                      const viennacl::matrix_base<TYPE, F1> & d_mat,
00124                            viennacl::matrix_base<TYPE, F2> & result) {
00125 
00126         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_mat).context());
00127         viennacl::linalg::opencl::kernels::compressed_matrix<TYPE>::init(ctx);
00128         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<TYPE>::program_name(),
00129                                                    detail::sparse_dense_matmult_kernel_name(false, is_row_major<F1>::value, is_row_major<F2>::value));
00130 
00131         viennacl::ocl::enqueue(k(sp_mat.handle1().opencl_handle(), sp_mat.handle2().opencl_handle(), sp_mat.handle().opencl_handle(),
00132                                  viennacl::traits::opencl_handle(d_mat),
00133                                  cl_uint(viennacl::traits::start1(d_mat)),          cl_uint(viennacl::traits::start2(d_mat)),
00134                                  cl_uint(viennacl::traits::stride1(d_mat)),         cl_uint(viennacl::traits::stride2(d_mat)),
00135                                  cl_uint(viennacl::traits::size1(d_mat)),           cl_uint(viennacl::traits::size2(d_mat)),
00136                                  cl_uint(viennacl::traits::internal_size1(d_mat)),  cl_uint(viennacl::traits::internal_size2(d_mat)),
00137                                  viennacl::traits::opencl_handle(result),
00138                                  cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)),
00139                                  cl_uint(viennacl::traits::stride1(result)),        cl_uint(viennacl::traits::stride2(result)),
00140                                  cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
00141                                  cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result)) ));
00142       }
00143 
00153       template< typename TYPE, unsigned int ALIGNMENT, typename F1, typename F2>
00154       void prod_impl(const viennacl::compressed_matrix<TYPE, ALIGNMENT> & sp_mat,
00155                      const viennacl::matrix_expression< const viennacl::matrix_base<TYPE, F1>,
00156                                                         const viennacl::matrix_base<TYPE, F1>,
00157                                                         viennacl::op_trans > & d_mat,
00158                       viennacl::matrix_base<TYPE, F2> & result) {
00159 
00160         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_mat).context());
00161         viennacl::linalg::opencl::kernels::compressed_matrix<TYPE>::init(ctx);
00162         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<TYPE>::program_name(),
00163                                                    detail::sparse_dense_matmult_kernel_name(true, is_row_major<F1>::value, is_row_major<F2>::value));
00164 
00165         viennacl::ocl::enqueue(k(sp_mat.handle1().opencl_handle(), sp_mat.handle2().opencl_handle(), sp_mat.handle().opencl_handle(),
00166                                  viennacl::traits::opencl_handle(d_mat.lhs()),
00167                                  cl_uint(viennacl::traits::start1(d_mat.lhs())),          cl_uint(viennacl::traits::start2(d_mat.lhs())),
00168                                  cl_uint(viennacl::traits::stride1(d_mat.lhs())),         cl_uint(viennacl::traits::stride2(d_mat.lhs())),
00169                                  cl_uint(viennacl::traits::size1(d_mat.lhs())),           cl_uint(viennacl::traits::size2(d_mat.lhs())),
00170                                  cl_uint(viennacl::traits::internal_size1(d_mat.lhs())),  cl_uint(viennacl::traits::internal_size2(d_mat.lhs())),
00171                                  viennacl::traits::opencl_handle(result),
00172                                  cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)),
00173                                  cl_uint(viennacl::traits::stride1(result)),        cl_uint(viennacl::traits::stride2(result)),
00174                                  cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
00175                                  cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result)) ) );
00176       }
00177 
00178 
00179 
00180       // triangular solvers
00181 
00187       template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT>
00188       void inplace_solve(compressed_matrix<SCALARTYPE, MAT_ALIGNMENT> const & L,
00189                          vector_base<SCALARTYPE> & vec,
00190                          viennacl::linalg::unit_lower_tag)
00191       {
00192         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L).context());
00193         viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::init(ctx);
00194         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::program_name(), "unit_lu_forward");
00195 
00196         k.local_work_size(0, 128);
00197         k.global_work_size(0, k.local_work_size());
00198         viennacl::ocl::enqueue(k(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(),
00199                                  viennacl::traits::opencl_handle(vec),
00200                                  cl_uint(L.size1())
00201                                 )
00202                               );
00203       }
00204 
00210       template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT>
00211       void inplace_solve(compressed_matrix<SCALARTYPE, MAT_ALIGNMENT> const & L,
00212                          vector_base<SCALARTYPE> & vec,
00213                          viennacl::linalg::lower_tag)
00214       {
00215         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L).context());
00216         viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::init(ctx);
00217 
00218         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::program_name(), "lu_forward");
00219 
00220         k.local_work_size(0, 128);
00221         k.global_work_size(0, k.local_work_size());
00222         viennacl::ocl::enqueue(k(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(),
00223                                  viennacl::traits::opencl_handle(vec),
00224                                  cl_uint(L.size1())
00225                                 )
00226                               );
00227       }
00228 
00229 
00235       template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT>
00236       void inplace_solve(compressed_matrix<SCALARTYPE, MAT_ALIGNMENT> const & U,
00237                          vector_base<SCALARTYPE> & vec,
00238                          viennacl::linalg::unit_upper_tag)
00239       {
00240         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(U).context());
00241         viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::init(ctx);
00242         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::program_name(), "unit_lu_backward");
00243 
00244         k.local_work_size(0, 128);
00245         k.global_work_size(0, k.local_work_size());
00246         viennacl::ocl::enqueue(k(U.handle1().opencl_handle(), U.handle2().opencl_handle(), U.handle().opencl_handle(),
00247                                  viennacl::traits::opencl_handle(vec),
00248                                  cl_uint(U.size1())
00249                                 )
00250                               );
00251       }
00252 
00258       template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT>
00259       void inplace_solve(compressed_matrix<SCALARTYPE, MAT_ALIGNMENT> const & U,
00260                          vector_base<SCALARTYPE> & vec,
00261                          viennacl::linalg::upper_tag)
00262       {
00263         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(U).context());
00264         viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::init(ctx);
00265 
00266         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::program_name(), "lu_backward");
00267 
00268         k.local_work_size(0, 128);
00269         k.global_work_size(0, k.local_work_size());
00270         viennacl::ocl::enqueue(k(U.handle1().opencl_handle(), U.handle2().opencl_handle(), U.handle().opencl_handle(),
00271                                  viennacl::traits::opencl_handle(vec),
00272                                  cl_uint(U.size1())
00273                                 )
00274                               );
00275       }
00276 
00277 
00278 
00279 
00280 
00281       // transposed triangular solvers
00282 
00283       namespace detail
00284       {
00285         //
00286         // block solves
00287         //
00288         template<typename ScalarType, unsigned int MAT_ALIGNMENT>
00289         void block_inplace_solve(const matrix_expression<const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
00290                                                          const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
00291                                                          op_trans> & L,
00292                                  viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
00293                                  vector_base<ScalarType> const & /* L_diagonal */,  //ignored
00294                                  vector_base<ScalarType> & vec,
00295                                  viennacl::linalg::unit_lower_tag)
00296         {
00297           viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L.lhs()).context());
00298           viennacl::linalg::opencl::kernels::compressed_matrix<ScalarType>::init(ctx);
00299           viennacl::ocl::kernel & block_solve_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<ScalarType>::program_name(), "block_trans_unit_lu_forward");
00300           block_solve_kernel.global_work_size(0, num_blocks * block_solve_kernel.local_work_size(0));
00301 
00302           viennacl::ocl::enqueue(block_solve_kernel(L.lhs().handle1().opencl_handle(),
00303                                                     L.lhs().handle2().opencl_handle(),
00304                                                     L.lhs().handle().opencl_handle(),
00305                                                     block_indices.opencl_handle(),
00306                                                     vec,
00307                                                     static_cast<cl_uint>(vec.size())));
00308         }
00309 
00310 
00311         template<typename ScalarType, unsigned int MAT_ALIGNMENT>
00312         void block_inplace_solve(const matrix_expression<const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
00313                                                          const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
00314                                                          op_trans> & U,
00315                                  viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
00316                                  vector_base<ScalarType> const & U_diagonal,
00317                                  vector_base<ScalarType> & vec,
00318                                  viennacl::linalg::upper_tag)
00319         {
00320           viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(U.lhs()).context());
00321           viennacl::linalg::opencl::kernels::compressed_matrix<ScalarType>::init(ctx);
00322           viennacl::ocl::kernel & block_solve_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<ScalarType>::program_name(), "block_trans_lu_backward");
00323           block_solve_kernel.global_work_size(0, num_blocks * block_solve_kernel.local_work_size(0));
00324 
00325           viennacl::ocl::enqueue(block_solve_kernel(U.lhs().handle1().opencl_handle(),
00326                                                     U.lhs().handle2().opencl_handle(),
00327                                                     U.lhs().handle().opencl_handle(),
00328                                                     U_diagonal,
00329                                                     block_indices.opencl_handle(),
00330                                                     vec,
00331                                                     static_cast<cl_uint>(vec.size())));
00332         }
00333 
00334 
00335       }
00336 
00337 
00343       template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT>
00344       void inplace_solve(matrix_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
00345                                             const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
00346                                             op_trans> const & proxy_L,
00347                          vector_base<SCALARTYPE> & vec,
00348                          viennacl::linalg::unit_lower_tag)
00349       {
00350         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_L.lhs()).context());
00351         viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::init(ctx);
00352         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::program_name(), "trans_unit_lu_forward");
00353 
00354         k.local_work_size(0, 128);
00355         k.global_work_size(0, k.local_work_size());
00356         viennacl::ocl::enqueue(k(proxy_L.lhs().handle1().opencl_handle(), proxy_L.lhs().handle2().opencl_handle(), proxy_L.lhs().handle().opencl_handle(),
00357                                  viennacl::traits::opencl_handle(vec),
00358                                  cl_uint(proxy_L.lhs().size1())
00359                                 )
00360                               );
00361       }
00362 
00363 
00369       template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT>
00370       void inplace_solve(matrix_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
00371                                             const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
00372                                             op_trans> const & proxy_L,
00373                          vector_base<SCALARTYPE> & vec,
00374                          viennacl::linalg::lower_tag)
00375       {
00376         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_L.lhs()).context());
00377         viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::init(ctx);
00378 
00379         viennacl::vector<SCALARTYPE> diagonal(vec.size());
00380         detail::row_info(proxy_L.lhs(), diagonal, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
00381 
00382         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::program_name(), "trans_lu_forward");
00383 
00384         k.local_work_size(0, 128);
00385         k.global_work_size(0, k.local_work_size());
00386         viennacl::ocl::enqueue(k(proxy_L.lhs().handle1().opencl_handle(), proxy_L.lhs().handle2().opencl_handle(), proxy_L.lhs().handle().opencl_handle(),
00387                                  viennacl::traits::opencl_handle(diagonal),
00388                                  viennacl::traits::opencl_handle(vec),
00389                                  cl_uint(proxy_L.lhs().size1())
00390                                 )
00391                               );
00392       }
00393 
00399       template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT>
00400       void inplace_solve(matrix_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
00401                                             const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
00402                                             op_trans> const & proxy_U,
00403                          vector_base<SCALARTYPE> & vec,
00404                          viennacl::linalg::unit_upper_tag)
00405       {
00406         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_U.lhs()).context());
00407         viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::init(ctx);
00408         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::program_name(), "trans_unit_lu_backward");
00409 
00410         k.local_work_size(0, 128);
00411         k.global_work_size(0, k.local_work_size());
00412         viennacl::ocl::enqueue(k(proxy_U.lhs().handle1().opencl_handle(), proxy_U.lhs().handle2().opencl_handle(), proxy_U.lhs().handle().opencl_handle(),
00413                                  viennacl::traits::opencl_handle(vec),
00414                                  cl_uint(proxy_U.lhs().size1())
00415                                 )
00416                               );
00417       }
00418 
00419 
00425       template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT>
00426       void inplace_solve(matrix_expression< const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
00427                                             const compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>,
00428                                             op_trans> const & proxy_U,
00429                          vector_base<SCALARTYPE> & vec,
00430                          viennacl::linalg::upper_tag)
00431       {
00432         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(proxy_U.lhs()).context());
00433         viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::init(ctx);
00434 
00435         viennacl::vector<SCALARTYPE> diagonal(vec.size());
00436         detail::row_info(proxy_U.lhs(), diagonal, viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);
00437 
00438         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<SCALARTYPE>::program_name(), "trans_lu_backward");
00439 
00440         k.local_work_size(0, 128);
00441         k.global_work_size(0, k.local_work_size());
00442         viennacl::ocl::enqueue(k(proxy_U.lhs().handle1().opencl_handle(), proxy_U.lhs().handle2().opencl_handle(), proxy_U.lhs().handle().opencl_handle(),
00443                                  viennacl::traits::opencl_handle(diagonal),
00444                                  viennacl::traits::opencl_handle(vec),
00445                                  cl_uint(proxy_U.lhs().size1())
00446                                 )
00447                               );
00448       }
00449 
00450 
00451       //
00452       // Compressed Compressed matrix
00453       //
00454 
00463       template<class TYPE>
00464       void prod_impl(const viennacl::compressed_compressed_matrix<TYPE> & mat,
00465                      const viennacl::vector_base<TYPE> & vec,
00466                            viennacl::vector_base<TYPE> & result)
00467       {
00468         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
00469         viennacl::linalg::opencl::kernels::compressed_compressed_matrix<TYPE>::init(ctx);
00470         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_compressed_matrix<TYPE>::program_name(), "vec_mul");
00471 
00472         result.clear();
00473 
00474         viennacl::ocl::packed_cl_uint layout_vec;
00475         layout_vec.start  = cl_uint(viennacl::traits::start(vec));
00476         layout_vec.stride = cl_uint(viennacl::traits::stride(vec));
00477         layout_vec.size   = cl_uint(viennacl::traits::size(vec));
00478         layout_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
00479 
00480         viennacl::ocl::packed_cl_uint layout_result;
00481         layout_result.start  = cl_uint(viennacl::traits::start(result));
00482         layout_result.stride = cl_uint(viennacl::traits::stride(result));
00483         layout_result.size   = cl_uint(viennacl::traits::size(result));
00484         layout_result.internal_size   = cl_uint(viennacl::traits::internal_size(result));
00485 
00486         viennacl::ocl::enqueue(k(mat.handle1().opencl_handle(), mat.handle3().opencl_handle(), mat.handle2().opencl_handle(), mat.handle().opencl_handle(), cl_uint(mat.nnz1()),
00487                                  vec, layout_vec,
00488                                  result, layout_result
00489                                 ));
00490       }
00491 
00492 
00493       //
00494       // Coordinate matrix
00495       //
00496 
00497       namespace detail
00498       {
00499         template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT>
00500         void row_info(coordinate_matrix<SCALARTYPE, MAT_ALIGNMENT> const & mat,
00501                       vector_base<SCALARTYPE> & vec,
00502                       viennacl::linalg::detail::row_info_types info_selector)
00503         {
00504           viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
00505           viennacl::linalg::opencl::kernels::coordinate_matrix<SCALARTYPE>::init(ctx);
00506           viennacl::ocl::kernel & row_info_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::coordinate_matrix<SCALARTYPE>::program_name(), "row_info_extractor");
00507           unsigned int thread_num = 256; //k.local_work_size(0);
00508 
00509           row_info_kernel.local_work_size(0, thread_num);
00510 
00511           row_info_kernel.global_work_size(0, 64 * thread_num);  //64 work groups are hard-coded for now. Gives reasonable performance in most cases
00512           viennacl::ocl::enqueue(row_info_kernel(mat.handle12().opencl_handle(), mat.handle().opencl_handle(), mat.handle3().opencl_handle(),
00513                                                  viennacl::traits::opencl_handle(vec),
00514                                                  cl_uint(info_selector),
00515                                                  viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
00516                                                  viennacl::ocl::local_mem(sizeof(SCALARTYPE)*thread_num)) );
00517         }
00518       }
00519 
00528       template<class SCALARTYPE, unsigned int ALIGNMENT>
00529       void prod_impl(const viennacl::coordinate_matrix<SCALARTYPE, ALIGNMENT> & mat,
00530                      const viennacl::vector_base<SCALARTYPE> & vec,
00531                            viennacl::vector_base<SCALARTYPE> & result)
00532       {
00533         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
00534         viennacl::linalg::opencl::kernels::coordinate_matrix<SCALARTYPE>::init(ctx);
00535 
00536         result.clear();
00537 
00538         viennacl::ocl::packed_cl_uint layout_vec;
00539         layout_vec.start  = cl_uint(viennacl::traits::start(vec));
00540         layout_vec.stride = cl_uint(viennacl::traits::stride(vec));
00541         layout_vec.size   = cl_uint(viennacl::traits::size(vec));
00542         layout_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
00543 
00544         viennacl::ocl::packed_cl_uint layout_result;
00545         layout_result.start  = cl_uint(viennacl::traits::start(result));
00546         layout_result.stride = cl_uint(viennacl::traits::stride(result));
00547         layout_result.size   = cl_uint(viennacl::traits::size(result));
00548         layout_result.internal_size   = cl_uint(viennacl::traits::internal_size(result));
00549 
00550         //std::cout << "prod(coordinate_matrix" << ALIGNMENT << ", vector) called with internal_nnz=" << mat.internal_nnz() << std::endl;
00551 
00552         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::coordinate_matrix<SCALARTYPE>::program_name(), "vec_mul");
00553         unsigned int thread_num = 256; //k.local_work_size(0);
00554 
00555         k.local_work_size(0, thread_num);
00556 
00557         k.global_work_size(0, 64 * thread_num);  //64 work groups are hard-coded for now. Gives reasonable performance in most cases
00558         //k.global_work_size(0, thread_num);  //Only one work group
00559         viennacl::ocl::enqueue(k(mat.handle12().opencl_handle(), mat.handle().opencl_handle(), mat.handle3().opencl_handle(),
00560                                  viennacl::traits::opencl_handle(vec),
00561                                  layout_vec,
00562                                  viennacl::traits::opencl_handle(result),
00563                                  layout_result,
00564                                  viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
00565                                  viennacl::ocl::local_mem(sizeof(SCALARTYPE)*thread_num)) );
00566 
00567       }
00568 
00569 
00578       template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
00579       void prod_impl(const viennacl::coordinate_matrix<NumericT, ALIGNMENT> & mat,
00580                      const viennacl::matrix_base<NumericT, F1> & d_mat,
00581                            viennacl::matrix_base<NumericT, F2> & result)
00582       {
00583         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
00584         viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::init(ctx);
00585 
00586         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::program_name(),
00587                                                    detail::sparse_dense_matmult_kernel_name(false, is_row_major<F1>::value, is_row_major<F2>::value));
00588 
00589         result.clear();
00590 
00591         unsigned int thread_num = 256; //k.local_work_size(0);
00592         k.local_work_size(0, thread_num);
00593         k.global_work_size(0, 64 * thread_num);  //64 work groups are hard-coded for now. Gives reasonable performance in most cases
00594 
00595         viennacl::ocl::enqueue(k(mat.handle12().opencl_handle(), mat.handle().opencl_handle(), mat.handle3().opencl_handle(),
00596                                  viennacl::traits::opencl_handle(d_mat),
00597                                  cl_uint(viennacl::traits::start1(d_mat)),          cl_uint(viennacl::traits::start2(d_mat)),
00598                                  cl_uint(viennacl::traits::stride1(d_mat)),         cl_uint(viennacl::traits::stride2(d_mat)),
00599                                  cl_uint(viennacl::traits::size1(d_mat)),           cl_uint(viennacl::traits::size2(d_mat)),
00600                                  cl_uint(viennacl::traits::internal_size1(d_mat)),  cl_uint(viennacl::traits::internal_size2(d_mat)),
00601                                  viennacl::traits::opencl_handle(result),
00602                                  cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)),
00603                                  cl_uint(viennacl::traits::stride1(result)),        cl_uint(viennacl::traits::stride2(result)),
00604                                  cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
00605                                  cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result)),
00606                                  viennacl::ocl::local_mem(sizeof(cl_uint)*k.local_work_size(0)),
00607                                  viennacl::ocl::local_mem(sizeof(NumericT)*k.local_work_size(0))) );
00608 
00609       }
00610 
00619       template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
00620       void prod_impl(const viennacl::coordinate_matrix<NumericT, ALIGNMENT> & mat,
00621                      const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
00622                                                         const viennacl::matrix_base<NumericT, F1>,
00623                                                         viennacl::op_trans > & d_mat,
00624                            viennacl::matrix_base<NumericT, F2> & result)
00625       {
00626         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
00627         viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::init(ctx);
00628 
00629         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::coordinate_matrix<NumericT>::program_name(),
00630                                                    detail::sparse_dense_matmult_kernel_name(true, is_row_major<F1>::value, is_row_major<F2>::value));
00631 
00632         result.clear();
00633 
00634         unsigned int thread_num = 256; //k.local_work_size(0);
00635         k.local_work_size(0, thread_num);
00636         k.global_work_size(0, 64 * thread_num);  //64 work groups are hard-coded for now. Gives reasonable performance in most cases
00637 
00638         viennacl::ocl::enqueue(k(mat.handle12().opencl_handle(), mat.handle().opencl_handle(), mat.handle3().opencl_handle(),
00639                                  viennacl::traits::opencl_handle(d_mat),
00640                                  cl_uint(viennacl::traits::start1(d_mat.lhs())),          cl_uint(viennacl::traits::start2(d_mat.lhs())),
00641                                  cl_uint(viennacl::traits::stride1(d_mat.lhs())),         cl_uint(viennacl::traits::stride2(d_mat.lhs())),
00642                                  cl_uint(viennacl::traits::size1(d_mat.lhs())),           cl_uint(viennacl::traits::size2(d_mat.lhs())),
00643                                  cl_uint(viennacl::traits::internal_size1(d_mat.lhs())),  cl_uint(viennacl::traits::internal_size2(d_mat.lhs())),
00644                                  viennacl::traits::opencl_handle(result),
00645                                  cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)),
00646                                  cl_uint(viennacl::traits::stride1(result)),        cl_uint(viennacl::traits::stride2(result)),
00647                                  cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
00648                                  cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result)),
00649                                  viennacl::ocl::local_mem(sizeof(cl_uint)*k.local_work_size(0)),
00650                                  viennacl::ocl::local_mem(sizeof(NumericT)*k.local_work_size(0))) );
00651 
00652       }
00653 
00654 
00655       //
00656       // ELL Matrix
00657       //
00658 
00659       template<class TYPE, unsigned int ALIGNMENT>
00660       void prod_impl( const viennacl::ell_matrix<TYPE, ALIGNMENT> & mat,
00661                       const viennacl::vector_base<TYPE> & vec,
00662                       viennacl::vector_base<TYPE> & result)
00663       {
00664         assert(mat.size1() == result.size());
00665         assert(mat.size2() == vec.size());
00666 
00667         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
00668         viennacl::linalg::opencl::kernels::ell_matrix<TYPE>::init(ctx);
00669         result.clear();
00670 
00671         viennacl::ocl::packed_cl_uint layout_vec;
00672         layout_vec.start  = cl_uint(viennacl::traits::start(vec));
00673         layout_vec.stride = cl_uint(viennacl::traits::stride(vec));
00674         layout_vec.size   = cl_uint(viennacl::traits::size(vec));
00675         layout_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
00676 
00677         viennacl::ocl::packed_cl_uint layout_result;
00678         layout_result.start  = cl_uint(viennacl::traits::start(result));
00679         layout_result.stride = cl_uint(viennacl::traits::stride(result));
00680         layout_result.size   = cl_uint(viennacl::traits::size(result));
00681         layout_result.internal_size   = cl_uint(viennacl::traits::internal_size(result));
00682 
00683         std::stringstream ss;
00684         ss << "vec_mul_" << 1;//(ALIGNMENT != 1?4:1);
00685         viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ell_matrix<TYPE>::program_name(), "vec_mul");
00686 
00687         unsigned int thread_num = 128;
00688         unsigned int group_num = 256;
00689 
00690         k.local_work_size(0, thread_num);
00691         k.global_work_size(0, thread_num * group_num);
00692 
00693         viennacl::ocl::enqueue(k(mat.handle2().opencl_handle(),
00694                                  mat.handle().opencl_handle(),
00695                                  viennacl::traits::opencl_handle(vec),
00696                                  layout_vec,
00697                                  viennacl::traits::opencl_handle(result),
00698                                  layout_result,
00699                                  cl_uint(mat.size1()),
00700                                  cl_uint(mat.size2()),
00701                                  cl_uint(mat.internal_size1()),
00702                                  cl_uint(mat.maxnnz()),
00703                                  cl_uint(mat.internal_maxnnz())
00704                                 )
00705         );
00706 
00707 
00708       }
00709 
00719       template<class ScalarType, unsigned int ALIGNMENT, class NumericT, typename F1, typename F2 >
00720       void prod_impl(const viennacl::ell_matrix<ScalarType, ALIGNMENT> & sp_mat,
00721                      const viennacl::matrix_base<NumericT, F1> & d_mat,
00722                            viennacl::matrix_base<NumericT, F2> & result) {
00723 
00724         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_mat).context());
00725         viennacl::linalg::opencl::kernels::ell_matrix<ScalarType>::init(ctx);
00726         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ell_matrix<ScalarType>::program_name(),
00727                                                    detail::sparse_dense_matmult_kernel_name(false, is_row_major<F1>::value, is_row_major<F2>::value));
00728 
00729         //unsigned int thread_num = 128;
00730         //unsigned int group_num = 256;
00731         //
00732         //k.local_work_size(0, thread_num);
00733         //k.global_work_size(0, thread_num * group_num);
00734 
00735         viennacl::ocl::enqueue(k(sp_mat.handle2().opencl_handle(), sp_mat.handle().opencl_handle(),
00736                                  cl_uint(sp_mat.size1()),
00737                                  cl_uint(sp_mat.size2()),
00738                                  cl_uint(sp_mat.internal_size1()),
00739                                  cl_uint(sp_mat.maxnnz()),
00740                                  cl_uint(sp_mat.internal_maxnnz()),
00741                                  viennacl::traits::opencl_handle(d_mat),
00742                                  cl_uint(viennacl::traits::start1(d_mat)),          cl_uint(viennacl::traits::start2(d_mat)),
00743                                  cl_uint(viennacl::traits::stride1(d_mat)),         cl_uint(viennacl::traits::stride2(d_mat)),
00744                                  cl_uint(viennacl::traits::size1(d_mat)),           cl_uint(viennacl::traits::size2(d_mat)),
00745                                  cl_uint(viennacl::traits::internal_size1(d_mat)),  cl_uint(viennacl::traits::internal_size2(d_mat)),
00746                                  viennacl::traits::opencl_handle(result),
00747                                  cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)),
00748                                  cl_uint(viennacl::traits::stride1(result)),        cl_uint(viennacl::traits::stride2(result)),
00749                                  cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
00750                                  cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result))
00751                                 )
00752                               );
00753       }
00754 
00764       template<class ScalarType, unsigned int ALIGNMENT, class NumericT, typename F1, typename F2>
00765       void prod_impl(const viennacl::ell_matrix<ScalarType, ALIGNMENT> & sp_mat,
00766                      const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
00767                                                         const viennacl::matrix_base<NumericT, F1>,
00768                                                         viennacl::op_trans > & d_mat,
00769                            viennacl::matrix_base<NumericT, F2> & result) {
00770 
00771         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(sp_mat).context());
00772         viennacl::linalg::opencl::kernels::ell_matrix<ScalarType>::init(ctx);
00773         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ell_matrix<ScalarType>::program_name(),
00774                                                    detail::sparse_dense_matmult_kernel_name(true, is_row_major<F1>::value, is_row_major<F2>::value));
00775 
00776         //unsigned int thread_num = 128;
00777         //unsigned int group_num = 256;
00778         //
00779         //k.local_work_size(0, thread_num);
00780         //k.global_work_size(0, thread_num * group_num);
00781 
00782         viennacl::ocl::enqueue(k(sp_mat.handle2().opencl_handle(), sp_mat.handle().opencl_handle(),
00783                                  cl_uint(sp_mat.size1()),
00784                                  cl_uint(sp_mat.size2()),
00785                                  cl_uint(sp_mat.internal_size1()),
00786                                  cl_uint(sp_mat.maxnnz()),
00787                                  cl_uint(sp_mat.internal_maxnnz()),
00788                                  viennacl::traits::opencl_handle(d_mat.lhs()),
00789                                  cl_uint(viennacl::traits::start1(d_mat.lhs())),          cl_uint(viennacl::traits::start2(d_mat.lhs())),
00790                                  cl_uint(viennacl::traits::stride1(d_mat.lhs())),         cl_uint(viennacl::traits::stride2(d_mat.lhs())),
00791                                  cl_uint(viennacl::traits::size1(d_mat.lhs())),           cl_uint(viennacl::traits::size2(d_mat.lhs())),
00792                                  cl_uint(viennacl::traits::internal_size1(d_mat.lhs())),  cl_uint(viennacl::traits::internal_size2(d_mat.lhs())),
00793                                  viennacl::traits::opencl_handle(result),
00794                                  cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)),
00795                                  cl_uint(viennacl::traits::stride1(result)),        cl_uint(viennacl::traits::stride2(result)),
00796                                  cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
00797                                  cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result))
00798                                 )
00799                               );
00800       }
00801 
00802       //
00803       // Hybrid Matrix
00804       //
00805 
00806       template<class TYPE, unsigned int ALIGNMENT>
00807       void prod_impl( const viennacl::hyb_matrix<TYPE, ALIGNMENT>& mat,
00808                       const viennacl::vector_base<TYPE>& vec,
00809                       viennacl::vector_base<TYPE>& result)
00810       {
00811         assert(mat.size1() == result.size());
00812         assert(mat.size2() == vec.size());
00813 
00814         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
00815         viennacl::linalg::opencl::kernels::hyb_matrix<TYPE>::init(ctx);
00816 
00817         viennacl::ocl::packed_cl_uint layout_vec;
00818         layout_vec.start  = cl_uint(viennacl::traits::start(vec));
00819         layout_vec.stride = cl_uint(viennacl::traits::stride(vec));
00820         layout_vec.size   = cl_uint(viennacl::traits::size(vec));
00821         layout_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
00822 
00823         viennacl::ocl::packed_cl_uint layout_result;
00824         layout_result.start  = cl_uint(viennacl::traits::start(result));
00825         layout_result.stride = cl_uint(viennacl::traits::stride(result));
00826         layout_result.size   = cl_uint(viennacl::traits::size(result));
00827         layout_result.internal_size   = cl_uint(viennacl::traits::internal_size(result));
00828 
00829         viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::hyb_matrix<TYPE>::program_name(), "vec_mul");
00830 
00831         unsigned int thread_num = 256;
00832         unsigned int group_num = 32;
00833 
00834         k.local_work_size(0, thread_num);
00835         k.global_work_size(0, thread_num * group_num);
00836 
00837         viennacl::ocl::enqueue(k(mat.handle2().opencl_handle(),
00838                                  mat.handle().opencl_handle(),
00839                                  mat.handle3().opencl_handle(),
00840                                  mat.handle4().opencl_handle(),
00841                                  mat.handle5().opencl_handle(),
00842                                  viennacl::traits::opencl_handle(vec),
00843                                  layout_vec,
00844                                  viennacl::traits::opencl_handle(result),
00845                                  layout_result,
00846                                  cl_uint(mat.size1()),
00847                                  cl_uint(mat.internal_size1()),
00848                                  cl_uint(mat.ell_nnz()),
00849                                  cl_uint(mat.internal_ellnnz())
00850                                 )
00851         );
00852       }
00853 
00854       template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
00855       void prod_impl( const viennacl::hyb_matrix<NumericT, ALIGNMENT>& mat,
00856                       const viennacl::matrix_base<NumericT, F1> & d_mat,
00857                             viennacl::matrix_base<NumericT, F2> & result)
00858       {
00859         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
00860         viennacl::linalg::opencl::kernels::hyb_matrix<NumericT>::init(ctx);
00861         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::hyb_matrix<NumericT>::program_name(),
00862                                                    detail::sparse_dense_matmult_kernel_name(false, is_row_major<F1>::value, is_row_major<F2>::value));
00863 
00864         unsigned int thread_num = 256;
00865         unsigned int group_num = 32;
00866 
00867         k.local_work_size(0, thread_num);
00868         k.global_work_size(0, thread_num * group_num);
00869 
00870         viennacl::ocl::enqueue(k(mat.handle2().opencl_handle(),
00871                                  mat.handle().opencl_handle(),
00872                                  mat.handle3().opencl_handle(),
00873                                  mat.handle4().opencl_handle(),
00874                                  mat.handle5().opencl_handle(),
00875                                  cl_uint(mat.size1()),
00876                                  cl_uint(mat.internal_size1()),
00877                                  cl_uint(mat.ell_nnz()),
00878                                  cl_uint(mat.internal_ellnnz()),
00879                                  viennacl::traits::opencl_handle(d_mat),
00880                                  cl_uint(viennacl::traits::start1(d_mat)),          cl_uint(viennacl::traits::start2(d_mat)),
00881                                  cl_uint(viennacl::traits::stride1(d_mat)),         cl_uint(viennacl::traits::stride2(d_mat)),
00882                                  cl_uint(viennacl::traits::size1(d_mat)),           cl_uint(viennacl::traits::size2(d_mat)),
00883                                  cl_uint(viennacl::traits::internal_size1(d_mat)),  cl_uint(viennacl::traits::internal_size2(d_mat)),
00884                                  viennacl::traits::opencl_handle(result),
00885                                  cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)),
00886                                  cl_uint(viennacl::traits::stride1(result)),        cl_uint(viennacl::traits::stride2(result)),
00887                                  cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
00888                                  cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result))
00889                                 )
00890         );
00891       }
00892 
00893       template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
00894       void prod_impl( const viennacl::hyb_matrix<NumericT, ALIGNMENT>& mat,
00895                       const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
00896                                                          const viennacl::matrix_base<NumericT, F1>,
00897                                                          viennacl::op_trans > & d_mat,
00898                             viennacl::matrix_base<NumericT, F2> & result)
00899       {
00900         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
00901         viennacl::linalg::opencl::kernels::hyb_matrix<NumericT>::init(ctx);
00902         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::hyb_matrix<NumericT>::program_name(),
00903                                                    detail::sparse_dense_matmult_kernel_name(true, is_row_major<F1>::value, is_row_major<F2>::value));
00904 
00905         unsigned int thread_num = 256;
00906         unsigned int group_num = 32;
00907 
00908         k.local_work_size(0, thread_num);
00909         k.global_work_size(0, thread_num * group_num);
00910 
00911         viennacl::ocl::enqueue(k(mat.handle2().opencl_handle(),
00912                                  mat.handle().opencl_handle(),
00913                                  mat.handle3().opencl_handle(),
00914                                  mat.handle4().opencl_handle(),
00915                                  mat.handle5().opencl_handle(),
00916                                  cl_uint(mat.size1()),
00917                                  cl_uint(mat.internal_size1()),
00918                                  cl_uint(mat.ell_nnz()),
00919                                  cl_uint(mat.internal_ellnnz()),
00920                                  viennacl::traits::opencl_handle(d_mat.lhs()),
00921                                  cl_uint(viennacl::traits::start1(d_mat.lhs())),          cl_uint(viennacl::traits::start2(d_mat.lhs())),
00922                                  cl_uint(viennacl::traits::stride1(d_mat.lhs())),         cl_uint(viennacl::traits::stride2(d_mat.lhs())),
00923                                  cl_uint(viennacl::traits::size1(d_mat.lhs())),           cl_uint(viennacl::traits::size2(d_mat.lhs())),
00924                                  cl_uint(viennacl::traits::internal_size1(d_mat.lhs())),  cl_uint(viennacl::traits::internal_size2(d_mat.lhs())),
00925                                  viennacl::traits::opencl_handle(result),
00926                                  cl_uint(viennacl::traits::start1(result)),         cl_uint(viennacl::traits::start2(result)),
00927                                  cl_uint(viennacl::traits::stride1(result)),        cl_uint(viennacl::traits::stride2(result)),
00928                                  cl_uint(viennacl::traits::size1(result)),          cl_uint(viennacl::traits::size2(result)),
00929                                  cl_uint(viennacl::traits::internal_size1(result)), cl_uint(viennacl::traits::internal_size2(result))
00930                                 )
00931         );
00932       }
00933 
00934 
00935     } // namespace opencl
00936   } //namespace linalg
00937 } //namespace viennacl
00938 
00939 
00940 #endif