ViennaCL - The Vienna Computing Library  1.5.0
viennacl/linalg/cuda/direct_solve.hpp
Go to the documentation of this file.
00001 #ifndef VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
00002 #define VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP
00003 
00004 /* =========================================================================
00005    Copyright (c) 2010-2013, Institute for Microelectronics,
00006                             Institute for Analysis and Scientific Computing,
00007                             TU Wien.
00008    Portions of this software are copyright by UChicago Argonne, LLC.
00009 
00010                             -----------------
00011                   ViennaCL - The Vienna Computing Library
00012                             -----------------
00013 
00014    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
00015 
00016    (A list of authors and contributors can be found in the PDF manual)
00017 
00018    License:         MIT (X11), see file LICENSE in the base directory
00019 ============================================================================= */
00020 
00025 #include "viennacl/forwards.h"
00026 #include "viennacl/vector.hpp"
00027 #include "viennacl/matrix.hpp"
00028 
00029 
00030 #include "viennacl/linalg/cuda/common.hpp"
00031 
00032 
00033 namespace viennacl
00034 {
00035   namespace linalg
00036   {
00037     namespace cuda
00038     {
00039 
00040       template <typename T>
00041       __global__ void matrix_matrix_upper_solve_kernel(
00042                 const T * A,
00043                 unsigned int A_start1, unsigned int A_start2,
00044                 unsigned int A_inc1,   unsigned int A_inc2,
00045                 unsigned int A_size1,  unsigned int A_size2,
00046                 unsigned int A_internal_size1, unsigned int A_internal_size2,
00047                 bool row_major_A,
00048                 bool transpose_A,
00049 
00050                 T * B,
00051                 unsigned int B_start1, unsigned int B_start2,
00052                 unsigned int B_inc1,   unsigned int B_inc2,
00053                 unsigned int B_size1,  unsigned int B_size2,
00054                 unsigned int B_internal_size1, unsigned int B_internal_size2,
00055                 bool row_major_B,
00056                 bool transpose_B,
00057 
00058                 bool unit_diagonal)
00059       {
00060         T temp;
00061         T entry_A;
00062 
00063         for (unsigned int row_cnt = 0; row_cnt < A_size1; ++row_cnt)
00064         {
00065           unsigned int row = A_size1 - 1 - row_cnt;
00066 
00067           if (!unit_diagonal)
00068           {
00069             __syncthreads();
00070 
00071             if (threadIdx.x == 0)
00072             {
00073               if (row_major_B && transpose_B)
00074                 B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (row * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
00075                                                                                                                     : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
00076               else if (row_major_B && !transpose_B)
00077                 B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
00078                                                                                                                     : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
00079               else if (!row_major_B && transpose_B)
00080                 B[(blockIdx.x * B_inc1 + B_start1) + (row * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
00081                                                                                                                     : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
00082               else //if (!row_major_B && !transpose_B)
00083                 B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
00084                                                                                                                     : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
00085             }
00086           }
00087 
00088           __syncthreads();
00089 
00090           if (row_major_B && transpose_B)
00091             temp = B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (row * B_inc2 + B_start2)];
00092           else if (row_major_B && !transpose_B)
00093             temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
00094           else if (!row_major_B && transpose_B)
00095             temp = B[(blockIdx.x * B_inc1 + B_start1) + (row * B_inc2 + B_start2) * B_internal_size1];
00096           else //if (!row_major_B && !transpose_B)
00097             temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];
00098 
00099           //eliminate column of op(A) with index 'row' in parallel: " << std::endl;
00100           for  (unsigned int elim = threadIdx.x; elim < row; elim += blockDim.x)
00101           {
00102             if (row_major_A && transpose_A)
00103               entry_A = A[(row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)];
00104             else if (row_major_A && !transpose_A)
00105               entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
00106             else if (!row_major_A && transpose_A)
00107               entry_A = A[(row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1];
00108             else //if (!row_major_A && !transpose_A)
00109               entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
00110 
00111             if (row_major_B && transpose_B)
00112               B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (elim * B_inc2 + B_start2)] -= temp * entry_A;
00113             else if (row_major_B && !transpose_B)
00114               B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
00115             else if (!row_major_B && transpose_B)
00116               B[(blockIdx.x * B_inc1 + B_start1) + (elim * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
00117             else //if (!row_major_B && !transpose_B)
00118               B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
00119 
00120           }
00121         }
00122       }
00123 
00124 
00125 
00126       template <typename T>
00127       __global__ void matrix_matrix_lower_solve_kernel(
00128                 const T * A,
00129                 unsigned int A_start1, unsigned int A_start2,
00130                 unsigned int A_inc1,   unsigned int A_inc2,
00131                 unsigned int A_size1,  unsigned int A_size2,
00132                 unsigned int A_internal_size1, unsigned int A_internal_size2,
00133                 bool row_major_A,
00134                 bool transpose_A,
00135 
00136                 T * B,
00137                 unsigned int B_start1, unsigned int B_start2,
00138                 unsigned int B_inc1,   unsigned int B_inc2,
00139                 unsigned int B_size1,  unsigned int B_size2,
00140                 unsigned int B_internal_size1, unsigned int B_internal_size2,
00141                 bool row_major_B,
00142                 bool transpose_B,
00143 
00144                 bool unit_diagonal)
00145       {
00146         T temp;
00147         T entry_A;
00148 
00149         for (unsigned int row = 0; row < A_size1; ++row)
00150         {
00151 
00152           if (!unit_diagonal)
00153           {
00154             __syncthreads();
00155 
00156             if (threadIdx.x == 0)
00157             {
00158               if (row_major_B && transpose_B)
00159                 B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (row * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
00160                                                                                                                     : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
00161               else if (row_major_B && !transpose_B)
00162                 B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
00163                                                                                                                     : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
00164               else if (!row_major_B && transpose_B)
00165                 B[(blockIdx.x * B_inc1 + B_start1) + (row * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
00166                                                                                                                     : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
00167               else //if (!row_major_B && !transpose_B)
00168                 B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]
00169                                                                                                                     : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1];
00170             }
00171           }
00172 
00173           __syncthreads();
00174 
00175           if (row_major_B && transpose_B)
00176             temp = B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (row * B_inc2 + B_start2)];
00177           else if (row_major_B && !transpose_B)
00178             temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)];
00179           else if (!row_major_B && transpose_B)
00180             temp = B[(blockIdx.x * B_inc1 + B_start1) + (row * B_inc2 + B_start2) * B_internal_size1];
00181           else //if (!row_major_B && !transpose_B)
00182             temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1];
00183 
00184           //eliminate column of op(A) with index 'row' in parallel: " << std::endl;
00185           for  (unsigned int elim = row + threadIdx.x + 1; elim < A_size1; elim += blockDim.x)
00186           {
00187             if (row_major_A && transpose_A)
00188               entry_A = A[(row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)];
00189             else if (row_major_A && !transpose_A)
00190               entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
00191             else if (!row_major_A && transpose_A)
00192               entry_A = A[(row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1];
00193             else //if (!row_major_A && !transpose_A)
00194               entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
00195 
00196             if (row_major_B && transpose_B)
00197               B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (elim * B_inc2 + B_start2)] -= temp * entry_A;
00198             else if (row_major_B && !transpose_B)
00199               B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A;
00200             else if (!row_major_B && transpose_B)
00201               B[(blockIdx.x * B_inc1 + B_start1) + (elim * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
00202             else //if (!row_major_B && !transpose_B)
00203               B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A;
00204 
00205           }
00206         }
00207       }
00208 
00209 
00210 
00211 
00212 
00213 
00214       namespace detail
00215       {
00216         template <typename T>
00217         bool is_unit_solve(T const & tag) { return false; }
00218 
00219         inline bool is_unit_solve(viennacl::linalg::unit_lower_tag) { return true; }
00220         inline bool is_unit_solve(viennacl::linalg::unit_upper_tag) { return true; }
00221 
00222         template <typename T>
00223         bool is_upper_solve(T const & tag) { return false; }
00224 
00225         inline bool is_upper_solve(viennacl::linalg::upper_tag) { return true; }
00226         inline bool is_upper_solve(viennacl::linalg::unit_upper_tag) { return true; }
00227 
00228         template <typename M1, typename M2, typename SolverTag>
00229         void inplace_solve_impl(M1 const & A, bool transpose_A,
00230                                 M2 & B,       bool transpose_B,
00231                                 SolverTag const & tag)
00232         {
00233           typedef typename viennacl::result_of::cpu_value_type<M1>::type        value_type;
00234 
00235           dim3 threads(128);
00236           dim3 grid( transpose_B ? B.size1() : B.size2() );
00237 
00238           if (is_upper_solve(tag))
00239           {
00240             matrix_matrix_upper_solve_kernel<<<grid,threads>>>(detail::cuda_arg<value_type>(A),
00241                                                                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
00242                                                                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
00243                                                                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
00244                                                                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
00245                                                                bool(viennacl::is_row_major<M1>::value),
00246                                                                transpose_A,
00247 
00248                                                                detail::cuda_arg<value_type>(B),
00249                                                                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
00250                                                                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
00251                                                                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
00252                                                                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
00253                                                                bool(viennacl::is_row_major<M2>::value),
00254                                                                transpose_B,
00255 
00256                                                                is_unit_solve(tag)
00257                                                               );
00258           }
00259           else
00260           {
00261             matrix_matrix_lower_solve_kernel<<<grid,threads>>>(detail::cuda_arg<value_type>(A),
00262                                                                static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
00263                                                                static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
00264                                                                static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
00265                                                                static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
00266                                                                bool(viennacl::is_row_major<M1>::value),
00267                                                                transpose_A,
00268 
00269                                                                detail::cuda_arg<value_type>(B),
00270                                                                static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
00271                                                                static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
00272                                                                static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
00273                                                                static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
00274                                                                bool(viennacl::is_row_major<M2>::value),
00275                                                                transpose_B,
00276 
00277                                                                is_unit_solve(tag)
00278                                                               );
00279           }
00280 
00281         }
00282       }
00283 
00284 
00285       //
00286       // Note: By convention, all size checks are performed in the calling frontend. No need to double-check here.
00287       //
00288 
00290 
00296       template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
00297       void inplace_solve(const matrix_base<NumericT, F1> & A, matrix_base<NumericT, F2> & B, SOLVERTAG tag)
00298       {
00299         detail::inplace_solve_impl(A, false,
00300                                    B, false, tag);
00301       }
00302 
00309       template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
00310       void inplace_solve(const matrix_base<NumericT, F1> & A,
00311                          matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> proxy_B,
00312                          SOLVERTAG tag)
00313       {
00314         detail::inplace_solve_impl(A, false,
00315                                    const_cast<matrix_base<NumericT, F2> &>(proxy_B.lhs()), true, tag);
00316       }
00317 
00318       //upper triangular solver for transposed lower triangular matrices
00325       template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
00326       void inplace_solve(const matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & proxy_A,
00327                          matrix_base<NumericT, F2> & B,
00328                          SOLVERTAG tag)
00329       {
00330         detail::inplace_solve_impl(const_cast<matrix_base<NumericT, F1> &>(proxy_A.lhs()), true,
00331                                    B, false, tag);
00332       }
00333 
00340       template <typename NumericT, typename F1, typename F2, typename SOLVERTAG>
00341       void inplace_solve(const matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & proxy_A,
00342                                matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans>   proxy_B,
00343                          SOLVERTAG tag)
00344       {
00345         detail::inplace_solve_impl(const_cast<matrix_base<NumericT, F1> &>(proxy_A.lhs()), true,
00346                                    const_cast<matrix_base<NumericT, F2> &>(proxy_B.lhs()), true, tag);
00347       }
00348 
00349 
00350 
00351       //
00352       //  Solve on vector
00353       //
00354 
00355       template <typename T>
00356       __global__ void triangular_substitute_inplace_row_kernel(
00357                 T const * A,
00358                 unsigned int A_start1, unsigned int A_start2,
00359                 unsigned int A_inc1,   unsigned int A_inc2,
00360                 unsigned int A_size1,  unsigned int A_size2,
00361                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
00362                 T * v,
00363                 unsigned int v_start,
00364                 unsigned int v_inc,
00365                 unsigned int v_size,
00366 
00367                 unsigned int options)
00368       {
00369         T temp;
00370         unsigned int unit_diagonal_flag  = (options & (1 << 0));
00371         unsigned int transposed_access_A = (options & (1 << 1));
00372         unsigned int is_lower_solve      = (options & (1 << 2));
00373         unsigned int row;
00374         for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)    //Note: A required to be square
00375         {
00376           row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
00377           if (!unit_diagonal_flag)
00378           {
00379             __syncthreads();
00380             if (threadIdx.x == 0)
00381               v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)];
00382           }
00383 
00384           __syncthreads();
00385 
00386           temp = v[row * v_inc + v_start];
00387 
00388           for (int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
00389                   elim < (is_lower_solve ? A_size1 : row);
00390                   elim += blockDim.x)
00391             v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row  * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2))
00392                                                                       : ((elim * A_inc1 + A_start1) * A_internal_size2 + (row  * A_inc2 + A_start2))];
00393         }
00394       }
00395 
00396 
00397       template <typename T>
00398       __global__ void triangular_substitute_inplace_col_kernel(
00399                 T const * A,
00400                 unsigned int A_start1, unsigned int A_start2,
00401                 unsigned int A_inc1,   unsigned int A_inc2,
00402                 unsigned int A_size1,  unsigned int A_size2,
00403                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
00404                 T * v,
00405                 unsigned int v_start,
00406                 unsigned int v_inc,
00407                 unsigned int v_size,
00408                 unsigned int options)
00409       {
00410         T temp;
00411         unsigned int unit_diagonal_flag  = (options & (1 << 0));
00412         unsigned int transposed_access_A = (options & (1 << 1));
00413         unsigned int is_lower_solve      = (options & (1 << 2));
00414         unsigned int row;
00415         for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)    //Note: A required to be square
00416         {
00417           row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1);
00418           if (!unit_diagonal_flag)
00419           {
00420             __syncthreads();
00421             if (threadIdx.x == 0)
00422               v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1];
00423           }
00424 
00425           __syncthreads();
00426 
00427           temp = v[row * v_inc + v_start];
00428 
00429           for (int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x);
00430                   elim < (is_lower_solve ? A_size1 : row);
00431                   elim += blockDim.x)
00432             v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row  * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1)
00433                                                                       : ((elim * A_inc1 + A_start1) + (row  * A_inc2 + A_start2) * A_internal_size1)];
00434         }
00435       }
00436 
00437 
00438       namespace detail
00439       {
00440         inline unsigned int get_option_for_solver_tag(viennacl::linalg::upper_tag)      { return 0; }
00441         inline unsigned int get_option_for_solver_tag(viennacl::linalg::unit_upper_tag) { return (1 << 0); }
00442         inline unsigned int get_option_for_solver_tag(viennacl::linalg::lower_tag)      { return (1 << 2); }
00443         inline unsigned int get_option_for_solver_tag(viennacl::linalg::unit_lower_tag) { return (1 << 2) | (1 << 0); }
00444 
00445         template <typename MatrixType, typename VectorType>
00446         void inplace_solve_vector_impl(MatrixType const & mat,
00447                                        VectorType & vec,
00448                                        unsigned int options)
00449         {
00450           typedef typename viennacl::result_of::cpu_value_type<MatrixType>::type        value_type;
00451 
00452           if (viennacl::is_row_major<MatrixType>::value)
00453           {
00454             triangular_substitute_inplace_row_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(mat),
00455                                                                  static_cast<unsigned int>(viennacl::traits::start1(mat)),         static_cast<unsigned int>(viennacl::traits::start2(mat)),
00456                                                                  static_cast<unsigned int>(viennacl::traits::stride1(mat)),        static_cast<unsigned int>(viennacl::traits::stride2(mat)),
00457                                                                  static_cast<unsigned int>(viennacl::traits::size1(mat)),          static_cast<unsigned int>(viennacl::traits::size2(mat)),
00458                                                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
00459                                                                  detail::cuda_arg<value_type>(vec),
00460                                                                  static_cast<unsigned int>(viennacl::traits::start(vec)),
00461                                                                  static_cast<unsigned int>(viennacl::traits::stride(vec)),
00462                                                                  static_cast<unsigned int>(viennacl::traits::size(vec)),
00463                                                                  options
00464                                                                 );
00465           }
00466           else
00467           {
00468             triangular_substitute_inplace_col_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(mat),
00469                                                                  static_cast<unsigned int>(viennacl::traits::start1(mat)),         static_cast<unsigned int>(viennacl::traits::start2(mat)),
00470                                                                  static_cast<unsigned int>(viennacl::traits::stride1(mat)),        static_cast<unsigned int>(viennacl::traits::stride2(mat)),
00471                                                                  static_cast<unsigned int>(viennacl::traits::size1(mat)),          static_cast<unsigned int>(viennacl::traits::size2(mat)),
00472                                                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
00473                                                                  detail::cuda_arg<value_type>(vec),
00474                                                                  static_cast<unsigned int>(viennacl::traits::start(vec)),
00475                                                                  static_cast<unsigned int>(viennacl::traits::stride(vec)),
00476                                                                  static_cast<unsigned int>(viennacl::traits::size(vec)),
00477                                                                  options
00478                                                                 );
00479           }
00480         }
00481 
00482       }
00483 
00489       template <typename NumericT, typename F, typename SOLVERTAG>
00490       void inplace_solve(const matrix_base<NumericT, F> & mat,
00491                                vector_base<NumericT> & vec,
00492                          SOLVERTAG)
00493       {
00494         unsigned int options = detail::get_option_for_solver_tag(SOLVERTAG());
00495 
00496         detail::inplace_solve_vector_impl(mat, vec, options);
00497       }
00498 
00499 
00500 
00501 
00507       template <typename NumericT, typename F, typename SOLVERTAG>
00508       void inplace_solve(const matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans> & proxy,
00509                          vector_base<NumericT> & vec,
00510                          SOLVERTAG)
00511       {
00512         unsigned int options = detail::get_option_for_solver_tag(SOLVERTAG()) | 0x02;  //add transpose-flag
00513 
00514         detail::inplace_solve_vector_impl(proxy.lhs(), vec, options);
00515       }
00516 
00517 
00518 
00519     }
00520   }
00521 }
00522 
00523 #endif