ViennaCL - The Vienna Computing Library  1.5.0
viennacl/linalg/cuda/matrix_operations_row.hpp
Go to the documentation of this file.
00001 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
00002 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
00003 
00004 /* =========================================================================
00005    Copyright (c) 2010-2013, Institute for Microelectronics,
00006                             Institute for Analysis and Scientific Computing,
00007                             TU Wien.
00008    Portions of this software are copyright by UChicago Argonne, LLC.
00009 
00010                             -----------------
00011                   ViennaCL - The Vienna Computing Library
00012                             -----------------
00013 
00014    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
00015 
00016    (A list of authors and contributors can be found in the PDF manual)
00017 
00018    License:         MIT (X11), see file LICENSE in the base directory
00019 ============================================================================= */
00020 
00026 namespace viennacl
00027 {
00028   namespace linalg
00029   {
00030     namespace cuda
00031     {
00032       //
00033       // am
00034       //
00035 
00036       // alpha on CPU
/**
 * Scaled assignment for row-major matrices: A = B * alpha or A = B / alpha,
 * with the scalar passed by value.
 *
 * Element (row, col) of a matrix X is addressed as
 *   X[(row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2].
 *
 * options2: bit 0 set -> alpha is negated before use;
 *           bit 1 set -> B is divided by alpha instead of multiplied.
 *
 * Only the x-dimension of the launch configuration is read: block b processes rows
 * b, b + gridDim.x, ... and thread t of a block processes columns t, t + blockDim.x, ...
 * A_internal_size1 and B_internal_size1 are accepted but not used.
 *
 * NOTE(review): A presumably may alias B (in-place scaling), so no __restrict__
 * qualifiers are used -- confirm against the callers.
 */
template <typename T>
__global__ void am_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          T fac2,
          unsigned int options2,
          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  T alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  // Identical for every thread, so the branch in the inner loop never diverges.
  const bool divide_by_alpha = (options2 & (1 << 1)) != 0;

  // blockIdx.x / threadIdx.x are used directly instead of splitting a flattened
  // global id by blockDim.x: same values, but no 32-bit wrap of blockIdx.x * blockDim.x.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    // 64-bit row offsets keep the linear index from wrapping on very large buffers.
    const size_t A_row = (static_cast<size_t>(row) * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const size_t B_row = (static_cast<size_t>(row) * B_inc1 + B_start1) * B_internal_size2 + B_start2;

    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      const size_t A_idx = A_row + static_cast<size_t>(col) * A_inc2;
      const size_t B_idx = B_row + static_cast<size_t>(col) * B_inc2;

      if (divide_by_alpha)
        A[A_idx] = B[B_idx] / alpha;
      else
        A[A_idx] = B[B_idx] * alpha;
    }
  }
}
00072 
00073       // alpha on GPU
/**
 * Scaled assignment for row-major matrices: A = B * alpha or A = B / alpha,
 * with the scalar read through the pointer fac2.
 *
 * fac2 is dereferenced inside the kernel, so it must point to memory readable
 * from device code.
 *
 * Element (row, col) of a matrix X is addressed as
 *   X[(row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2].
 *
 * options2: bit 0 set -> alpha is negated before use;
 *           bit 1 set -> B is divided by alpha instead of multiplied.
 *
 * Only the x-dimension of the launch configuration is read: block b processes rows
 * b, b + gridDim.x, ... and thread t of a block processes columns t, t + blockDim.x, ...
 * A_internal_size1 and B_internal_size1 are accepted but not used.
 *
 * NOTE(review): A presumably may alias B (in-place scaling), so no __restrict__
 * qualifiers are used -- confirm against the callers.
 */
template <typename T>
__global__ void am_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * fac2,
          unsigned int options2,
          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  T alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  // Identical for every thread, so the branch in the inner loop never diverges.
  const bool divide_by_alpha = (options2 & (1 << 1)) != 0;

  // blockIdx.x / threadIdx.x are used directly instead of splitting a flattened
  // global id by blockDim.x: same values, but no 32-bit wrap of blockIdx.x * blockDim.x.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    // 64-bit row offsets keep the linear index from wrapping on very large buffers.
    const size_t A_row = (static_cast<size_t>(row) * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const size_t B_row = (static_cast<size_t>(row) * B_inc1 + B_start1) * B_internal_size2 + B_start2;

    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      const size_t A_idx = A_row + static_cast<size_t>(col) * A_inc2;
      const size_t B_idx = B_row + static_cast<size_t>(col) * B_inc2;

      if (divide_by_alpha)
        A[A_idx] = B[B_idx] / alpha;
      else
        A[A_idx] = B[B_idx] * alpha;
    }
  }
}
00109 
00110 
00111       //
00112       // ambm
00113       //
00114 
00115       // alpha and beta on CPU
/**
 * Linear combination for row-major matrices: A = B (*|/) alpha + C (*|/) beta,
 * with both scalars passed by value.
 *
 * Element (row, col) of a matrix X is addressed as
 *   X[(row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2].
 *
 * options2 controls alpha, options3 controls beta:
 *   bit 0 set -> the scalar is negated before use;
 *   bit 1 set -> the operand is divided by the scalar instead of multiplied.
 *
 * Only the x-dimension of the launch configuration is read: block b processes rows
 * b, b + gridDim.x, ... and thread t of a block processes columns t, t + blockDim.x, ...
 * The *_internal_size1 parameters are accepted but not used.
 *
 * NOTE(review): A presumably may alias B or C, so no __restrict__ qualifiers are
 * used -- confirm against the callers.
 */
template <typename T>
__global__ void ambm_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          T fac2,
          unsigned int options2,
          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          T fac3,
          unsigned int options3,
          const T * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  T alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  // Identical for every thread, so the branches in the inner loop never diverge.
  const bool divide_by_alpha = (options2 & (1 << 1)) != 0;
  const bool divide_by_beta  = (options3 & (1 << 1)) != 0;

  // blockIdx.x / threadIdx.x are used directly instead of splitting a flattened
  // global id by blockDim.x: same values, but no 32-bit wrap of blockIdx.x * blockDim.x.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    // 64-bit row offsets keep the linear index from wrapping on very large buffers.
    const size_t A_row = (static_cast<size_t>(row) * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const size_t B_row = (static_cast<size_t>(row) * B_inc1 + B_start1) * B_internal_size2 + B_start2;
    const size_t C_row = (static_cast<size_t>(row) * C_inc1 + C_start1) * C_internal_size2 + C_start2;

    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      const size_t A_idx = A_row + static_cast<size_t>(col) * A_inc2;
      const size_t B_idx = B_row + static_cast<size_t>(col) * B_inc2;
      const size_t C_idx = C_row + static_cast<size_t>(col) * C_inc2;

      if (divide_by_alpha)
      {
        if (divide_by_beta)
          A[A_idx] = B[B_idx] / alpha + C[C_idx] / beta;
        else
          A[A_idx] = B[B_idx] / alpha + C[C_idx] * beta;
      }
      else
      {
        if (divide_by_beta)
          A[A_idx] = B[B_idx] * alpha + C[C_idx] / beta;
        else
          A[A_idx] = B[B_idx] * alpha + C[C_idx] * beta;
      }
    }
  }
}
00188 
00189 
00190       // alpha on CPU, beta on GPU
/**
 * Linear combination for row-major matrices: A = B (*|/) alpha + C (*|/) beta,
 * with alpha passed by value and beta read through the pointer fac3.
 *
 * fac3 is dereferenced inside the kernel, so it must point to memory readable
 * from device code.
 *
 * Element (row, col) of a matrix X is addressed as
 *   X[(row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2].
 *
 * options2 controls alpha, options3 controls beta:
 *   bit 0 set -> the scalar is negated before use;
 *   bit 1 set -> the operand is divided by the scalar instead of multiplied.
 *
 * Only the x-dimension of the launch configuration is read: block b processes rows
 * b, b + gridDim.x, ... and thread t of a block processes columns t, t + blockDim.x, ...
 * The *_internal_size1 parameters are accepted but not used.
 *
 * NOTE(review): A presumably may alias B or C, so no __restrict__ qualifiers are
 * used -- confirm against the callers.
 */
template <typename T>
__global__ void ambm_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          T fac2,
          unsigned int options2,
          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const T * fac3,
          unsigned int options3,
          const T * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  T alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  // Identical for every thread, so the branches in the inner loop never diverge.
  const bool divide_by_alpha = (options2 & (1 << 1)) != 0;
  const bool divide_by_beta  = (options3 & (1 << 1)) != 0;

  // blockIdx.x / threadIdx.x are used directly instead of splitting a flattened
  // global id by blockDim.x: same values, but no 32-bit wrap of blockIdx.x * blockDim.x.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    // 64-bit row offsets keep the linear index from wrapping on very large buffers.
    const size_t A_row = (static_cast<size_t>(row) * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const size_t B_row = (static_cast<size_t>(row) * B_inc1 + B_start1) * B_internal_size2 + B_start2;
    const size_t C_row = (static_cast<size_t>(row) * C_inc1 + C_start1) * C_internal_size2 + C_start2;

    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      const size_t A_idx = A_row + static_cast<size_t>(col) * A_inc2;
      const size_t B_idx = B_row + static_cast<size_t>(col) * B_inc2;
      const size_t C_idx = C_row + static_cast<size_t>(col) * C_inc2;

      if (divide_by_alpha)
      {
        if (divide_by_beta)
          A[A_idx] = B[B_idx] / alpha + C[C_idx] / beta;
        else
          A[A_idx] = B[B_idx] / alpha + C[C_idx] * beta;
      }
      else
      {
        if (divide_by_beta)
          A[A_idx] = B[B_idx] * alpha + C[C_idx] / beta;
        else
          A[A_idx] = B[B_idx] * alpha + C[C_idx] * beta;
      }
    }
  }
}
00263 
00264       // alpha on GPU, beta on CPU
/**
 * Linear combination for row-major matrices: A = B (*|/) alpha + C (*|/) beta,
 * with alpha read through the pointer fac2 and beta passed by value.
 *
 * fac2 is dereferenced inside the kernel, so it must point to memory readable
 * from device code.
 *
 * Element (row, col) of a matrix X is addressed as
 *   X[(row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2].
 *
 * options2 controls alpha, options3 controls beta:
 *   bit 0 set -> the scalar is negated before use;
 *   bit 1 set -> the operand is divided by the scalar instead of multiplied.
 *
 * Only the x-dimension of the launch configuration is read: block b processes rows
 * b, b + gridDim.x, ... and thread t of a block processes columns t, t + blockDim.x, ...
 * The *_internal_size1 parameters are accepted but not used.
 *
 * NOTE(review): A presumably may alias B or C, so no __restrict__ qualifiers are
 * used -- confirm against the callers.
 */
template <typename T>
__global__ void ambm_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * fac2,
          unsigned int options2,
          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          T fac3,
          unsigned int options3,
          const T * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  T alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  // Identical for every thread, so the branches in the inner loop never diverge.
  const bool divide_by_alpha = (options2 & (1 << 1)) != 0;
  const bool divide_by_beta  = (options3 & (1 << 1)) != 0;

  // blockIdx.x / threadIdx.x are used directly instead of splitting a flattened
  // global id by blockDim.x: same values, but no 32-bit wrap of blockIdx.x * blockDim.x.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    // 64-bit row offsets keep the linear index from wrapping on very large buffers.
    const size_t A_row = (static_cast<size_t>(row) * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const size_t B_row = (static_cast<size_t>(row) * B_inc1 + B_start1) * B_internal_size2 + B_start2;
    const size_t C_row = (static_cast<size_t>(row) * C_inc1 + C_start1) * C_internal_size2 + C_start2;

    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      const size_t A_idx = A_row + static_cast<size_t>(col) * A_inc2;
      const size_t B_idx = B_row + static_cast<size_t>(col) * B_inc2;
      const size_t C_idx = C_row + static_cast<size_t>(col) * C_inc2;

      if (divide_by_alpha)
      {
        if (divide_by_beta)
          A[A_idx] = B[B_idx] / alpha + C[C_idx] / beta;
        else
          A[A_idx] = B[B_idx] / alpha + C[C_idx] * beta;
      }
      else
      {
        if (divide_by_beta)
          A[A_idx] = B[B_idx] * alpha + C[C_idx] / beta;
        else
          A[A_idx] = B[B_idx] * alpha + C[C_idx] * beta;
      }
    }
  }
}
00337 
00338 
00339       // alpha and beta on GPU
/**
 * Linear combination for row-major matrices: A = B (*|/) alpha + C (*|/) beta,
 * with both scalars read through the pointers fac2 and fac3.
 *
 * fac2 and fac3 are dereferenced inside the kernel, so they must point to memory
 * readable from device code.
 *
 * Element (row, col) of a matrix X is addressed as
 *   X[(row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2].
 *
 * options2 controls alpha, options3 controls beta:
 *   bit 0 set -> the scalar is negated before use;
 *   bit 1 set -> the operand is divided by the scalar instead of multiplied.
 *
 * Only the x-dimension of the launch configuration is read: block b processes rows
 * b, b + gridDim.x, ... and thread t of a block processes columns t, t + blockDim.x, ...
 * The *_internal_size1 parameters are accepted but not used.
 *
 * NOTE(review): A presumably may alias B or C, so no __restrict__ qualifiers are
 * used -- confirm against the callers.
 */
template <typename T>
__global__ void ambm_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * fac2,
          unsigned int options2,
          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const T * fac3,
          unsigned int options3,
          const T * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  T alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  // Identical for every thread, so the branches in the inner loop never diverge.
  const bool divide_by_alpha = (options2 & (1 << 1)) != 0;
  const bool divide_by_beta  = (options3 & (1 << 1)) != 0;

  // blockIdx.x / threadIdx.x are used directly instead of splitting a flattened
  // global id by blockDim.x: same values, but no 32-bit wrap of blockIdx.x * blockDim.x.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    // 64-bit row offsets keep the linear index from wrapping on very large buffers.
    const size_t A_row = (static_cast<size_t>(row) * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const size_t B_row = (static_cast<size_t>(row) * B_inc1 + B_start1) * B_internal_size2 + B_start2;
    const size_t C_row = (static_cast<size_t>(row) * C_inc1 + C_start1) * C_internal_size2 + C_start2;

    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      const size_t A_idx = A_row + static_cast<size_t>(col) * A_inc2;
      const size_t B_idx = B_row + static_cast<size_t>(col) * B_inc2;
      const size_t C_idx = C_row + static_cast<size_t>(col) * C_inc2;

      if (divide_by_alpha)
      {
        if (divide_by_beta)
          A[A_idx] = B[B_idx] / alpha + C[C_idx] / beta;
        else
          A[A_idx] = B[B_idx] / alpha + C[C_idx] * beta;
      }
      else
      {
        if (divide_by_beta)
          A[A_idx] = B[B_idx] * alpha + C[C_idx] / beta;
        else
          A[A_idx] = B[B_idx] * alpha + C[C_idx] * beta;
      }
    }
  }
}
00412 
00413 
00414       //
00415       // ambm_m
00416       //
00417 
00418       // alpha and beta on CPU
/**
 * Accumulating linear combination for row-major matrices:
 * A += B (*|/) alpha + C (*|/) beta, with both scalars passed by value.
 *
 * Element (row, col) of a matrix X is addressed as
 *   X[(row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2].
 *
 * options2 controls alpha, options3 controls beta:
 *   bit 0 set -> the scalar is negated before use;
 *   bit 1 set -> the operand is divided by the scalar instead of multiplied.
 *
 * Only the x-dimension of the launch configuration is read: block b processes rows
 * b, b + gridDim.x, ... and thread t of a block processes columns t, t + blockDim.x, ...
 * The *_internal_size1 parameters are accepted but not used.
 *
 * NOTE(review): A presumably may alias B or C, so no __restrict__ qualifiers are
 * used -- confirm against the callers.
 */
template <typename T>
__global__ void ambm_m_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          T fac2,
          unsigned int options2,
          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          T fac3,
          unsigned int options3,
          const T * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  T alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  // Identical for every thread, so the branches in the inner loop never diverge.
  const bool divide_by_alpha = (options2 & (1 << 1)) != 0;
  const bool divide_by_beta  = (options3 & (1 << 1)) != 0;

  // blockIdx.x / threadIdx.x are used directly instead of splitting a flattened
  // global id by blockDim.x: same values, but no 32-bit wrap of blockIdx.x * blockDim.x.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    // 64-bit row offsets keep the linear index from wrapping on very large buffers.
    const size_t A_row = (static_cast<size_t>(row) * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const size_t B_row = (static_cast<size_t>(row) * B_inc1 + B_start1) * B_internal_size2 + B_start2;
    const size_t C_row = (static_cast<size_t>(row) * C_inc1 + C_start1) * C_internal_size2 + C_start2;

    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      const size_t A_idx = A_row + static_cast<size_t>(col) * A_inc2;
      const size_t B_idx = B_row + static_cast<size_t>(col) * B_inc2;
      const size_t C_idx = C_row + static_cast<size_t>(col) * C_inc2;

      if (divide_by_alpha)
      {
        if (divide_by_beta)
          A[A_idx] += B[B_idx] / alpha + C[C_idx] / beta;
        else
          A[A_idx] += B[B_idx] / alpha + C[C_idx] * beta;
      }
      else
      {
        if (divide_by_beta)
          A[A_idx] += B[B_idx] * alpha + C[C_idx] / beta;
        else
          A[A_idx] += B[B_idx] * alpha + C[C_idx] * beta;
      }
    }
  }
}
00491 
00492 
00493       // alpha on CPU, beta on GPU
/**
 * Accumulating linear combination for row-major matrices:
 * A += B (*|/) alpha + C (*|/) beta, with alpha passed by value and beta read
 * through the pointer fac3.
 *
 * fac3 is dereferenced inside the kernel, so it must point to memory readable
 * from device code.
 *
 * Element (row, col) of a matrix X is addressed as
 *   X[(row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2].
 *
 * options2 controls alpha, options3 controls beta:
 *   bit 0 set -> the scalar is negated before use;
 *   bit 1 set -> the operand is divided by the scalar instead of multiplied.
 *
 * Only the x-dimension of the launch configuration is read: block b processes rows
 * b, b + gridDim.x, ... and thread t of a block processes columns t, t + blockDim.x, ...
 * The *_internal_size1 parameters are accepted but not used.
 *
 * NOTE(review): A presumably may alias B or C, so no __restrict__ qualifiers are
 * used -- confirm against the callers.
 */
template <typename T>
__global__ void ambm_m_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          T fac2,
          unsigned int options2,
          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const T * fac3,
          unsigned int options3,
          const T * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  T alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  // Identical for every thread, so the branches in the inner loop never diverge.
  const bool divide_by_alpha = (options2 & (1 << 1)) != 0;
  const bool divide_by_beta  = (options3 & (1 << 1)) != 0;

  // blockIdx.x / threadIdx.x are used directly instead of splitting a flattened
  // global id by blockDim.x: same values, but no 32-bit wrap of blockIdx.x * blockDim.x.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    // 64-bit row offsets keep the linear index from wrapping on very large buffers.
    const size_t A_row = (static_cast<size_t>(row) * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const size_t B_row = (static_cast<size_t>(row) * B_inc1 + B_start1) * B_internal_size2 + B_start2;
    const size_t C_row = (static_cast<size_t>(row) * C_inc1 + C_start1) * C_internal_size2 + C_start2;

    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      const size_t A_idx = A_row + static_cast<size_t>(col) * A_inc2;
      const size_t B_idx = B_row + static_cast<size_t>(col) * B_inc2;
      const size_t C_idx = C_row + static_cast<size_t>(col) * C_inc2;

      if (divide_by_alpha)
      {
        if (divide_by_beta)
          A[A_idx] += B[B_idx] / alpha + C[C_idx] / beta;
        else
          A[A_idx] += B[B_idx] / alpha + C[C_idx] * beta;
      }
      else
      {
        if (divide_by_beta)
          A[A_idx] += B[B_idx] * alpha + C[C_idx] / beta;
        else
          A[A_idx] += B[B_idx] * alpha + C[C_idx] * beta;
      }
    }
  }
}
00566 
00567       // alpha on GPU, beta on CPU
/**
 * Accumulating linear combination for row-major matrices:
 * A += B (*|/) alpha + C (*|/) beta, with alpha read through the pointer fac2
 * and beta passed by value.
 *
 * fac2 is dereferenced inside the kernel, so it must point to memory readable
 * from device code.
 *
 * Element (row, col) of a matrix X is addressed as
 *   X[(row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2].
 *
 * options2 controls alpha, options3 controls beta:
 *   bit 0 set -> the scalar is negated before use;
 *   bit 1 set -> the operand is divided by the scalar instead of multiplied.
 *
 * Only the x-dimension of the launch configuration is read: block b processes rows
 * b, b + gridDim.x, ... and thread t of a block processes columns t, t + blockDim.x, ...
 * The *_internal_size1 parameters are accepted but not used.
 *
 * NOTE(review): A presumably may alias B or C, so no __restrict__ qualifiers are
 * used -- confirm against the callers.
 */
template <typename T>
__global__ void ambm_m_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * fac2,
          unsigned int options2,
          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          T fac3,
          unsigned int options3,
          const T * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  T alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  // Identical for every thread, so the branches in the inner loop never diverge.
  const bool divide_by_alpha = (options2 & (1 << 1)) != 0;
  const bool divide_by_beta  = (options3 & (1 << 1)) != 0;

  // blockIdx.x / threadIdx.x are used directly instead of splitting a flattened
  // global id by blockDim.x: same values, but no 32-bit wrap of blockIdx.x * blockDim.x.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    // 64-bit row offsets keep the linear index from wrapping on very large buffers.
    const size_t A_row = (static_cast<size_t>(row) * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const size_t B_row = (static_cast<size_t>(row) * B_inc1 + B_start1) * B_internal_size2 + B_start2;
    const size_t C_row = (static_cast<size_t>(row) * C_inc1 + C_start1) * C_internal_size2 + C_start2;

    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      const size_t A_idx = A_row + static_cast<size_t>(col) * A_inc2;
      const size_t B_idx = B_row + static_cast<size_t>(col) * B_inc2;
      const size_t C_idx = C_row + static_cast<size_t>(col) * C_inc2;

      if (divide_by_alpha)
      {
        if (divide_by_beta)
          A[A_idx] += B[B_idx] / alpha + C[C_idx] / beta;
        else
          A[A_idx] += B[B_idx] / alpha + C[C_idx] * beta;
      }
      else
      {
        if (divide_by_beta)
          A[A_idx] += B[B_idx] * alpha + C[C_idx] / beta;
        else
          A[A_idx] += B[B_idx] * alpha + C[C_idx] * beta;
      }
    }
  }
}
00640 
00641 
00642       // alpha and beta on GPU
// Accumulating update A(i,j) += (B(i,j) op alpha) + (C(i,j) op beta), with the
// scalars alpha and beta read through the pointers fac2 and fac3.
//
// options2 / options3 are bit fields applied to alpha / beta respectively:
//   bit 0 set -> the scalar is negated before use
//   bit 1 set -> the matrix entry is divided by the scalar instead of multiplied
//
// Every operand X in {A, B, C} is addressed as
//   (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Only the x-dimension of the launch configuration is read: rows advance by
// gridDim.x and columns by blockDim.x. The *_internal_size1 parameters are not
// referenced by the body.
template <typename T>
__global__ void ambm_m_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * fac2,
          unsigned int options2,
          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const T * fac3,
          unsigned int options3,
          const T * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  // Bit 0: sign flip of the scalar.
  T alpha = *fac2;
  T beta  = *fac3;
  if (options2 & (1 << 0)) alpha = -alpha;
  if (options3 & (1 << 0)) beta  = -beta;

  // Bit 1: divide by the scalar rather than multiply.
  const bool divide_by_alpha = (options2 & (1 << 1)) != 0;
  const bool divide_by_beta  = (options3 & (1 << 1)) != 0;

  const unsigned int tid       = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int first_row = tid / blockDim.x;
  const unsigned int first_col = tid % blockDim.x;

  for (unsigned int i = first_row; i < A_size1; i += gridDim.x)
  {
    // Row-dependent part of each flat index, computed once per row.
    const unsigned int a_base = (i * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int b_base = (i * B_inc1 + B_start1) * B_internal_size2 + B_start2;
    const unsigned int c_base = (i * C_inc1 + C_start1) * C_internal_size2 + C_start2;

    for (unsigned int j = first_col; j < A_size2; j += blockDim.x)
    {
      const T b_entry = B[b_base + j * B_inc2];
      const T c_entry = C[c_base + j * C_inc2];

      const T b_term = divide_by_alpha ? (b_entry / alpha) : (b_entry * alpha);
      const T c_term = divide_by_beta  ? (c_entry / beta)  : (c_entry * beta);

      A[a_base + j * A_inc2] += b_term + c_term;
    }
  }
}
00715 
00716       //
00717       // assignments
00718       //
00719 
// Fills the A_size1 x A_size2 range of A with the value alpha.
//
// Entry (row, col) is stored at
//   (row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2.
// Only the x-dimension of the launch configuration is read: rows advance by
// gridDim.x and columns by blockDim.x. A_internal_size1 is not referenced.
template <typename T>
__global__ void matrix_row_assign_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,
          T alpha)
{
  const unsigned int tid       = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int first_row = tid / blockDim.x;
  const unsigned int first_col = tid % blockDim.x;

  for (unsigned int i = first_row; i < A_size1; i += gridDim.x)
  {
    // Row-dependent part of the flat index, computed once per row.
    const unsigned int a_base = (i * A_inc1 + A_start1) * A_internal_size2 + A_start2;

    for (unsigned int j = first_col; j < A_size2; j += blockDim.x)
      A[a_base + j * A_inc2] = alpha;
  }
}
00736 
00737 
// Writes alpha to the diagonal entries A(d,d) of the A_size1 x A_size2 range of A.
//
// Entry (d, d) is stored at
//   (d * A_inc1 + A_start1) * A_internal_size2 + d * A_inc2 + A_start2.
// The diagonal has min(A_size1, A_size2) entries. Bounding the loop by A_size1
// alone would, for a matrix with more rows than columns, address column
// indices >= A_size2 and thereby overwrite unrelated entries or run past the
// buffer. Square and wide matrices behave exactly as before.
//
// Threads are enumerated over the whole 1D grid and advance by
// blockDim.x * gridDim.x. A_internal_size1 is not referenced.
template <typename T>
__global__ void matrix_row_diagonal_assign_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,
          T alpha)
{
  // Number of diagonal entries that actually lie inside the matrix.
  const unsigned int diag_len = (A_size1 < A_size2) ? A_size1 : A_size2;

  const unsigned int gid    = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int stride = blockDim.x * gridDim.x;

  for (unsigned int d = gid; d < diag_len; d += stride)
    A[(d * A_inc1 + A_start1) * A_internal_size2 + d * A_inc2 + A_start2] = alpha;
}
00752 
00753       //
00754       // binary element-wise operations
00755       //
00756 
// Binary element-wise operation A(i,j) = B(i,j) <op> C(i,j), selected by op_type:
//   0 -> product, 1 -> division, 2 -> pow(B(i,j), C(i,j)).
// Any other op_type leaves A untouched.
//
// Every operand X in {A, B, C} is addressed as
//   (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Only the x-dimension of the launch configuration is read: rows advance by
// gridDim.x and columns by blockDim.x. The *_internal_size1 parameters are not
// referenced by the body.
template <typename T>
__global__ void element_op_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const T * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2,

          unsigned int op_type) //0: product, 1: division, 2: pow
{
  const unsigned int tid       = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int first_row = tid / blockDim.x;
  const unsigned int first_col = tid % blockDim.x;

  for (unsigned int i = first_row; i < A_size1; i += gridDim.x)
  {
    // Row-dependent part of each flat index, computed once per row.
    const unsigned int a_base = (i * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int b_base = (i * B_inc1 + B_start1) * B_internal_size2 + B_start2;
    const unsigned int c_base = (i * C_inc1 + C_start1) * C_internal_size2 + C_start2;

    for (unsigned int j = first_col; j < A_size2; j += blockDim.x)
    {
      const unsigned int a_idx = a_base + j * A_inc2;
      const unsigned int b_idx = b_base + j * B_inc2;
      const unsigned int c_idx = c_base + j * C_inc2;

      switch (op_type)
      {
        case 2:  A[a_idx] = pow(B[b_idx], C[c_idx]); break;
        case 1:  A[a_idx] = B[b_idx] / C[c_idx];     break;
        case 0:  A[a_idx] = B[b_idx] * C[c_idx];     break;
        default: break; // unknown operation: nothing is written
      }
    }
  }
}
00805 
// Binary element-wise operation A(i,j) = B(i,j) <op> C(i,j), selected by op_type:
//   0 -> product, 1 -> division.
// Any other op_type (including 2, which this kernel does not implement)
// leaves A untouched.
//
// Every operand X in {A, B, C} is addressed as
//   (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Only the x-dimension of the launch configuration is read: rows advance by
// gridDim.x and columns by blockDim.x. The *_internal_size1 parameters are not
// referenced by the body.
template <typename T>
__global__ void element_op_int_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const T * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2,

          unsigned int op_type) //0: product, 1: division, 2: pow
{
  const unsigned int tid       = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int first_row = tid / blockDim.x;
  const unsigned int first_col = tid % blockDim.x;

  for (unsigned int i = first_row; i < A_size1; i += gridDim.x)
  {
    // Row-dependent part of each flat index, computed once per row.
    const unsigned int a_base = (i * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int b_base = (i * B_inc1 + B_start1) * B_internal_size2 + B_start2;
    const unsigned int c_base = (i * C_inc1 + C_start1) * C_internal_size2 + C_start2;

    for (unsigned int j = first_col; j < A_size2; j += blockDim.x)
    {
      const unsigned int a_idx = a_base + j * A_inc2;
      const unsigned int b_idx = b_base + j * B_inc2;
      const unsigned int c_idx = c_base + j * C_inc2;

      switch (op_type)
      {
        case 1:  A[a_idx] = B[b_idx] / C[c_idx]; break;
        case 0:  A[a_idx] = B[b_idx] * C[c_idx]; break;
        default: break; // unsupported operation: nothing is written
      }
    }
  }
}
00846 
00847       //
00848       // unary element-wise operations
00849       //
00850 
00851       // abs
// Element-wise A(i,j) = abs(B(i,j)) over the A_size1 x A_size2 range of A.
//
// Both operands X in {A, B} are addressed as
//   (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Only the x-dimension of the launch configuration is read: rows advance by
// gridDim.x and columns by blockDim.x. A_internal_size1 and B_internal_size1
// are not referenced by the body.
template <typename T>
__global__ void matrix_row_element_abs_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  const unsigned int tid       = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int first_row = tid / blockDim.x;
  const unsigned int first_col = tid % blockDim.x;

  for (unsigned int i = first_row; i < A_size1; i += gridDim.x)
  {
    // Row-dependent part of each flat index, computed once per row.
    const unsigned int a_base = (i * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int b_base = (i * B_inc1 + B_start1) * B_internal_size2 + B_start2;

    for (unsigned int j = first_col; j < A_size2; j += blockDim.x)
      A[a_base + j * A_inc2] = abs(B[b_base + j * B_inc2]);
  }
}
00872 
00873 
00874       // acos
// Element-wise A(i,j) = acos(B(i,j)) over the A_size1 x A_size2 range of A.
//
// Both operands X in {A, B} are addressed as
//   (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Only the x-dimension of the launch configuration is read: rows advance by
// gridDim.x and columns by blockDim.x. A_internal_size1 and B_internal_size1
// are not referenced by the body.
template <typename T>
__global__ void matrix_row_element_acos_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  const unsigned int tid       = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int first_row = tid / blockDim.x;
  const unsigned int first_col = tid % blockDim.x;

  for (unsigned int i = first_row; i < A_size1; i += gridDim.x)
  {
    // Row-dependent part of each flat index, computed once per row.
    const unsigned int a_base = (i * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int b_base = (i * B_inc1 + B_start1) * B_internal_size2 + B_start2;

    for (unsigned int j = first_col; j < A_size2; j += blockDim.x)
      A[a_base + j * A_inc2] = acos(B[b_base + j * B_inc2]);
  }
}
00895 
00896 
00897       // asin
// Element-wise A(i,j) = asin(B(i,j)) over the A_size1 x A_size2 range of A.
//
// Both operands X in {A, B} are addressed as
//   (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Only the x-dimension of the launch configuration is read: rows advance by
// gridDim.x and columns by blockDim.x. A_internal_size1 and B_internal_size1
// are not referenced by the body.
template <typename T>
__global__ void matrix_row_element_asin_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  const unsigned int tid       = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int first_row = tid / blockDim.x;
  const unsigned int first_col = tid % blockDim.x;

  for (unsigned int i = first_row; i < A_size1; i += gridDim.x)
  {
    // Row-dependent part of each flat index, computed once per row.
    const unsigned int a_base = (i * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int b_base = (i * B_inc1 + B_start1) * B_internal_size2 + B_start2;

    for (unsigned int j = first_col; j < A_size2; j += blockDim.x)
      A[a_base + j * A_inc2] = asin(B[b_base + j * B_inc2]);
  }
}
00918 
00919 
00920       // atan
// Element-wise A(i,j) = atan(B(i,j)) over the A_size1 x A_size2 range of A.
//
// Both operands X in {A, B} are addressed as
//   (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Only the x-dimension of the launch configuration is read: rows advance by
// gridDim.x and columns by blockDim.x. A_internal_size1 and B_internal_size1
// are not referenced by the body.
template <typename T>
__global__ void matrix_row_element_atan_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  const unsigned int tid       = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int first_row = tid / blockDim.x;
  const unsigned int first_col = tid % blockDim.x;

  for (unsigned int i = first_row; i < A_size1; i += gridDim.x)
  {
    // Row-dependent part of each flat index, computed once per row.
    const unsigned int a_base = (i * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int b_base = (i * B_inc1 + B_start1) * B_internal_size2 + B_start2;

    for (unsigned int j = first_col; j < A_size2; j += blockDim.x)
      A[a_base + j * A_inc2] = atan(B[b_base + j * B_inc2]);
  }
}
00941 
00942 
00943       // ceil
// Element-wise A(i,j) = ceil(B(i,j)) over the A_size1 x A_size2 range of A.
//
// Both operands X in {A, B} are addressed as
//   (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Only the x-dimension of the launch configuration is read: rows advance by
// gridDim.x and columns by blockDim.x. A_internal_size1 and B_internal_size1
// are not referenced by the body.
template <typename T>
__global__ void matrix_row_element_ceil_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  const unsigned int tid       = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int first_row = tid / blockDim.x;
  const unsigned int first_col = tid % blockDim.x;

  for (unsigned int i = first_row; i < A_size1; i += gridDim.x)
  {
    // Row-dependent part of each flat index, computed once per row.
    const unsigned int a_base = (i * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int b_base = (i * B_inc1 + B_start1) * B_internal_size2 + B_start2;

    for (unsigned int j = first_col; j < A_size2; j += blockDim.x)
      A[a_base + j * A_inc2] = ceil(B[b_base + j * B_inc2]);
  }
}
00964 
00965 
00966       // cos
// Element-wise A(i,j) = cos(B(i,j)) over the A_size1 x A_size2 range of A.
//
// Both operands X in {A, B} are addressed as
//   (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Only the x-dimension of the launch configuration is read: rows advance by
// gridDim.x and columns by blockDim.x. A_internal_size1 and B_internal_size1
// are not referenced by the body.
template <typename T>
__global__ void matrix_row_element_cos_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  const unsigned int tid       = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int first_row = tid / blockDim.x;
  const unsigned int first_col = tid % blockDim.x;

  for (unsigned int i = first_row; i < A_size1; i += gridDim.x)
  {
    // Row-dependent part of each flat index, computed once per row.
    const unsigned int a_base = (i * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int b_base = (i * B_inc1 + B_start1) * B_internal_size2 + B_start2;

    for (unsigned int j = first_col; j < A_size2; j += blockDim.x)
      A[a_base + j * A_inc2] = cos(B[b_base + j * B_inc2]);
  }
}
00987 
00988 
00989       // cosh
// Element-wise A(i,j) = cosh(B(i,j)) over the A_size1 x A_size2 range of A.
//
// Both operands X in {A, B} are addressed as
//   (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Only the x-dimension of the launch configuration is read: rows advance by
// gridDim.x and columns by blockDim.x. A_internal_size1 and B_internal_size1
// are not referenced by the body.
template <typename T>
__global__ void matrix_row_element_cosh_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  const unsigned int tid       = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int first_row = tid / blockDim.x;
  const unsigned int first_col = tid % blockDim.x;

  for (unsigned int i = first_row; i < A_size1; i += gridDim.x)
  {
    // Row-dependent part of each flat index, computed once per row.
    const unsigned int a_base = (i * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int b_base = (i * B_inc1 + B_start1) * B_internal_size2 + B_start2;

    for (unsigned int j = first_col; j < A_size2; j += blockDim.x)
      A[a_base + j * A_inc2] = cosh(B[b_base + j * B_inc2]);
  }
}
01010 
01011 
01012       // exp
// Element-wise A(i,j) = exp(B(i,j)) over the A_size1 x A_size2 range of A.
//
// Both operands X in {A, B} are addressed as
//   (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Only the x-dimension of the launch configuration is read: rows advance by
// gridDim.x and columns by blockDim.x. A_internal_size1 and B_internal_size1
// are not referenced by the body.
template <typename T>
__global__ void matrix_row_element_exp_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  const unsigned int tid       = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int first_row = tid / blockDim.x;
  const unsigned int first_col = tid % blockDim.x;

  for (unsigned int i = first_row; i < A_size1; i += gridDim.x)
  {
    // Row-dependent part of each flat index, computed once per row.
    const unsigned int a_base = (i * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int b_base = (i * B_inc1 + B_start1) * B_internal_size2 + B_start2;

    for (unsigned int j = first_col; j < A_size2; j += blockDim.x)
      A[a_base + j * A_inc2] = exp(B[b_base + j * B_inc2]);
  }
}
01033 
01034 
01035       // fabs
// Element-wise A(i,j) = fabs(B(i,j)) over the A_size1 x A_size2 range of A.
//
// Both operands X in {A, B} are addressed as
//   (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Only the x-dimension of the launch configuration is read: rows advance by
// gridDim.x and columns by blockDim.x. A_internal_size1 and B_internal_size1
// are not referenced by the body.
template <typename T>
__global__ void matrix_row_element_fabs_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  const unsigned int tid       = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int first_row = tid / blockDim.x;
  const unsigned int first_col = tid % blockDim.x;

  for (unsigned int i = first_row; i < A_size1; i += gridDim.x)
  {
    // Row-dependent part of each flat index, computed once per row.
    const unsigned int a_base = (i * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int b_base = (i * B_inc1 + B_start1) * B_internal_size2 + B_start2;

    for (unsigned int j = first_col; j < A_size2; j += blockDim.x)
      A[a_base + j * A_inc2] = fabs(B[b_base + j * B_inc2]);
  }
}
01056 
01057 
01058       // floor
// Element-wise A(i,j) = floor(B(i,j)) over the A_size1 x A_size2 range of A.
//
// Both operands X in {A, B} are addressed as
//   (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Only the x-dimension of the launch configuration is read: rows advance by
// gridDim.x and columns by blockDim.x. A_internal_size1 and B_internal_size1
// are not referenced by the body.
template <typename T>
__global__ void matrix_row_element_floor_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  const unsigned int tid       = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int first_row = tid / blockDim.x;
  const unsigned int first_col = tid % blockDim.x;

  for (unsigned int i = first_row; i < A_size1; i += gridDim.x)
  {
    // Row-dependent part of each flat index, computed once per row.
    const unsigned int a_base = (i * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int b_base = (i * B_inc1 + B_start1) * B_internal_size2 + B_start2;

    for (unsigned int j = first_col; j < A_size2; j += blockDim.x)
      A[a_base + j * A_inc2] = floor(B[b_base + j * B_inc2]);
  }
}
01079 
01080 
01081       // log
// Element-wise A(i,j) = log(B(i,j)) over the A_size1 x A_size2 range of A.
//
// Both operands X in {A, B} are addressed as
//   (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Only the x-dimension of the launch configuration is read: rows advance by
// gridDim.x and columns by blockDim.x. A_internal_size1 and B_internal_size1
// are not referenced by the body.
template <typename T>
__global__ void matrix_row_element_log_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  const unsigned int tid       = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int first_row = tid / blockDim.x;
  const unsigned int first_col = tid % blockDim.x;

  for (unsigned int i = first_row; i < A_size1; i += gridDim.x)
  {
    // Row-dependent part of each flat index, computed once per row.
    const unsigned int a_base = (i * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int b_base = (i * B_inc1 + B_start1) * B_internal_size2 + B_start2;

    for (unsigned int j = first_col; j < A_size2; j += blockDim.x)
      A[a_base + j * A_inc2] = log(B[b_base + j * B_inc2]);
  }
}
01102 
01103 
01104       // log10
// Element-wise A(i,j) = log10(B(i,j)) over the A_size1 x A_size2 range of A.
//
// Both operands X in {A, B} are addressed as
//   (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Only the x-dimension of the launch configuration is read: rows advance by
// gridDim.x and columns by blockDim.x. A_internal_size1 and B_internal_size1
// are not referenced by the body.
template <typename T>
__global__ void matrix_row_element_log10_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  const unsigned int tid       = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int first_row = tid / blockDim.x;
  const unsigned int first_col = tid % blockDim.x;

  for (unsigned int i = first_row; i < A_size1; i += gridDim.x)
  {
    // Row-dependent part of each flat index, computed once per row.
    const unsigned int a_base = (i * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int b_base = (i * B_inc1 + B_start1) * B_internal_size2 + B_start2;

    for (unsigned int j = first_col; j < A_size2; j += blockDim.x)
      A[a_base + j * A_inc2] = log10(B[b_base + j * B_inc2]);
  }
}
01125 
01126 
01127       // sin
// Element-wise A(i,j) = sin(B(i,j)) over the A_size1 x A_size2 range of A.
//
// Both operands X in {A, B} are addressed as
//   (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Only the x-dimension of the launch configuration is read: rows advance by
// gridDim.x and columns by blockDim.x. A_internal_size1 and B_internal_size1
// are not referenced by the body.
template <typename T>
__global__ void matrix_row_element_sin_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  const unsigned int tid       = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int first_row = tid / blockDim.x;
  const unsigned int first_col = tid % blockDim.x;

  for (unsigned int i = first_row; i < A_size1; i += gridDim.x)
  {
    // Row-dependent part of each flat index, computed once per row.
    const unsigned int a_base = (i * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int b_base = (i * B_inc1 + B_start1) * B_internal_size2 + B_start2;

    for (unsigned int j = first_col; j < A_size2; j += blockDim.x)
      A[a_base + j * A_inc2] = sin(B[b_base + j * B_inc2]);
  }
}
01148 
01149 
01150       // sinh
// Element-wise A(i,j) = sinh(B(i,j)) over the A_size1 x A_size2 range of A.
//
// Both operands X in {A, B} are addressed as
//   (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Only the x-dimension of the launch configuration is read: rows advance by
// gridDim.x and columns by blockDim.x. A_internal_size1 and B_internal_size1
// are not referenced by the body.
template <typename T>
__global__ void matrix_row_element_sinh_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  const unsigned int tid       = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int first_row = tid / blockDim.x;
  const unsigned int first_col = tid % blockDim.x;

  for (unsigned int i = first_row; i < A_size1; i += gridDim.x)
  {
    // Row-dependent part of each flat index, computed once per row.
    const unsigned int a_base = (i * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int b_base = (i * B_inc1 + B_start1) * B_internal_size2 + B_start2;

    for (unsigned int j = first_col; j < A_size2; j += blockDim.x)
      A[a_base + j * A_inc2] = sinh(B[b_base + j * B_inc2]);
  }
}
01171 
01172 
01173       // sqrt
// Element-wise A(i,j) = sqrt(B(i,j)) over the A_size1 x A_size2 range of A.
//
// Both operands X in {A, B} are addressed as
//   (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Only the x-dimension of the launch configuration is read: rows advance by
// gridDim.x and columns by blockDim.x. A_internal_size1 and B_internal_size1
// are not referenced by the body.
template <typename T>
__global__ void matrix_row_element_sqrt_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  const unsigned int tid       = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int first_row = tid / blockDim.x;
  const unsigned int first_col = tid % blockDim.x;

  for (unsigned int i = first_row; i < A_size1; i += gridDim.x)
  {
    // Row-dependent part of each flat index, computed once per row.
    const unsigned int a_base = (i * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int b_base = (i * B_inc1 + B_start1) * B_internal_size2 + B_start2;

    for (unsigned int j = first_col; j < A_size2; j += blockDim.x)
      A[a_base + j * A_inc2] = sqrt(B[b_base + j * B_inc2]);
  }
}
01194 
01195 
01196       // tan
// Element-wise A(i,j) = tan(B(i,j)) over the A_size1 x A_size2 range of A.
//
// Both operands X in {A, B} are addressed as
//   (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Only the x-dimension of the launch configuration is read: rows advance by
// gridDim.x and columns by blockDim.x. A_internal_size1 and B_internal_size1
// are not referenced by the body.
template <typename T>
__global__ void matrix_row_element_tan_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  const unsigned int tid       = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int first_row = tid / blockDim.x;
  const unsigned int first_col = tid % blockDim.x;

  for (unsigned int i = first_row; i < A_size1; i += gridDim.x)
  {
    // Row-dependent part of each flat index, computed once per row.
    const unsigned int a_base = (i * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int b_base = (i * B_inc1 + B_start1) * B_internal_size2 + B_start2;

    for (unsigned int j = first_col; j < A_size2; j += blockDim.x)
      A[a_base + j * A_inc2] = tan(B[b_base + j * B_inc2]);
  }
}
01217 
01218 
01219       // tanh
// Element-wise A(i,j) = tanh(B(i,j)) over the A_size1 x A_size2 range of A.
//
// Both operands X in {A, B} are addressed as
//   (row * X_inc1 + X_start1) * X_internal_size2 + col * X_inc2 + X_start2.
// Only the x-dimension of the launch configuration is read: rows advance by
// gridDim.x and columns by blockDim.x. A_internal_size1 and B_internal_size1
// are not referenced by the body.
template <typename T>
__global__ void matrix_row_element_tanh_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  const unsigned int tid       = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int first_row = tid / blockDim.x;
  const unsigned int first_col = tid % blockDim.x;

  for (unsigned int i = first_row; i < A_size1; i += gridDim.x)
  {
    // Row-dependent part of each flat index, computed once per row.
    const unsigned int a_base = (i * A_inc1 + A_start1) * A_internal_size2 + A_start2;
    const unsigned int b_base = (i * B_inc1 + B_start1) * B_internal_size2 + B_start2;

    for (unsigned int j = first_col; j < A_size2; j += blockDim.x)
      A[a_base + j * A_inc2] = tanh(B[b_base + j * B_inc2]);
  }
}
01240 
01241 
01242 
01243       //
01244       // matrix-vector product
01245       //
01246 
01247       // Computes result = A * v for a matrix A stored row by row (A_internal_cols elements per row).
01248       // Block b handles rows b, b + gridDim.x, ...; each thread accumulates a partial dot product over
01249       // columns threadIdx.x, threadIdx.x + blockDim.x, ... and the partials are summed in shared memory.
01250       //   A_row_* / A_col_* : start offset, increment and extent of the used range per dimension
01251       //   v_start, v_inc / result_start, result_inc : offset and increment inside v / result
01252       //   A_internal_rows, v_size and result_size are accepted but not read by this kernel.
01253       // Launch requirements: blockDim.x <= 128 (capacity of the scratch array) and blockDim.x a
01254       // power of two (otherwise the halving reduction leaves some partial sums out).
01255       template <typename T>
01256       __global__ void vec_mul_row_kernel(
01257                 const T * A,
01258                 unsigned int A_row_start, unsigned int A_col_start,
01259                 unsigned int A_row_inc, unsigned int A_col_inc,
01260                 unsigned int A_row_size, unsigned int A_col_size,
01261                 unsigned int A_internal_rows, unsigned int A_internal_cols,
01262                 const T * v, unsigned int v_start, unsigned int v_inc, unsigned int v_size,
01263                 T * result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
01264       {
01265         __shared__ T work[128];
01266         // threadIdx.x < blockDim.x, so the quotient below is the same for all threads of a block: every
01267         // thread runs the same number of row iterations and the barriers are reached by the whole block.
01268         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
01269         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
01270         unsigned int lid = threadIdx.x;
01271         for (unsigned int row = row_gid; row < A_row_size; row += gridDim.x)
01272         {
01273           T dot_prod = 0;
01274           for (unsigned int col = col_gid; col < A_col_size; col += blockDim.x)
01275             dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col];
01276           work[lid] = dot_prod;
01277           for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1)
01278           {
01279             __syncthreads();  // partial sums of the previous step must be visible before they are read
01280             if (lid < stride)
01281               work[lid] += work[lid + stride];
01282           }
01283           if (lid == 0)
01284             result[row * result_inc + result_start] = work[0];
01285           // Barrier before the next row: without it a thread could store its next partial sum into
01286           // work[] while thread 0 is still reading work[1] in the last reduction step of this row.
01287           __syncthreads();
01288         }
01289       }
01290 
01291 
01292       // Transposed matrix-vector product: result(j) = sum over i of A(i,j) * v(i), with i < A_row_size
01293       // and j < A_col_size. A(i,j) is read from A[(j * A_col_inc + A_col_start) + (i * A_row_inc + A_row_start) * A_internal_cols].
01294       // Each thread produces complete entries of result; A_internal_rows, v_size and result_size are not read.
01295       template <typename T>
01296       __global__ void trans_vec_mul_row_kernel(
01297                 const T * A,
01298                 unsigned int A_row_start, unsigned int A_col_start,
01299                 unsigned int A_row_inc, unsigned int A_col_inc,
01300                 unsigned int A_row_size, unsigned int A_col_size,
01301                 unsigned int A_internal_rows, unsigned int A_internal_cols,
01302                 const T * v, unsigned int v_start, unsigned int v_inc, unsigned int v_size,
01303                 T * result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
01304       {
01305         unsigned int const thread_count = gridDim.x * blockDim.x;  // distance between entries of one thread
01306         unsigned int out = blockIdx.x * blockDim.x + threadIdx.x;  // index into result == column of A
01307         while (out < A_col_size)
01308         {
01309           unsigned int const col_offset = out * A_col_inc + A_col_start;
01310           T acc = 0;
01311           for (unsigned int k = 0; k < A_row_size; ++k)  // k runs over the rows of A and the entries of v
01312           {
01313             unsigned int const row_offset = (k * A_row_inc + A_row_start) * A_internal_cols;
01314             acc += A[col_offset + row_offset] * v[v_start + v_inc * k];
01315           }
01316           result[out * result_inc + result_start] = acc;
01317           out += thread_count;
01318         }
01319       }
01320 
01321 
01322       //
01323       // matrix-matrix products
01324       //
01325 
01326 
01327 
01328 
01329       //
01330       // scaled rank-1-update
01331       //
01332 
01333       // alpha on CPU
01334       // Scaled rank-1 update A(i,j) += alpha * vec1(i) * vec2(j) for i < A_size1 and j < A_size2.
01335       // alpha starts as val (passed by value) and is modified according to options2:
01336       //   bit 0 set: alpha = -alpha
01337       //   bit 1 set: alpha = 1 / alpha   (applied after the sign change)
01338       // Entry (i,j) of A is A[(i * A_inc1 + A_start1) * A_internal_size2 + j * A_inc2 + A_start2];
01339       // vec1(i) is vec1[i * inc1 + start1], vec2(j) is vec2[j * inc2 + start2].
01340       // A_internal_size1, size1 and size2 are not read.
01341       template <typename T>
01342       __global__ void scaled_rank1_update_row_kernel(
01343                 T * A,
01344                 unsigned int A_start1, unsigned int A_start2,
01345                 unsigned int A_inc1, unsigned int A_inc2,
01346                 unsigned int A_size1, unsigned int A_size2,
01347                 unsigned int A_internal_size1, unsigned int A_internal_size2,
01348                 T val, unsigned int options2,
01349                 const T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
01350                 const T * vec2, unsigned int start2, unsigned int inc2, unsigned int size2)
01351       {
01352         bool const negate = (options2 & (1 << 0)) != 0;
01353         bool const invert = (options2 & (1 << 1)) != 0;
01354         T scale = val;
01355         if (negate)
01356           scale = -scale;
01357         if (invert)
01358           scale = ((T)(1)) / scale;
01359         unsigned int const linear_id = blockIdx.x * blockDim.x + threadIdx.x;
01360         unsigned int const first_row = linear_id / blockDim.x;  // later rows follow in steps of gridDim.x
01361         unsigned int const first_col = linear_id % blockDim.x;  // later columns follow in steps of blockDim.x
01362         for (unsigned int i = first_row; i < A_size1; i += gridDim.x)
01363         {
01364           // alpha * vec1(i) does not depend on the column, so it is formed once per row.
01365           T const row_factor = scale * vec1[i * inc1 + start1];
01366           unsigned int const row_base = (i * A_inc1 + A_start1) * A_internal_size2 + A_start2;
01367           for (unsigned int j = first_col; j < A_size2; j += blockDim.x)
01368             A[row_base + j * A_inc2] += row_factor * vec2[j * inc2 + start2];
01369         }
01370       }
01371 
01372 
01373       // alpha on GPU
01374       // Scaled rank-1 update A(i,j) += alpha * vec1(i) * vec2(j) for i < A_size1 and j < A_size2.
01375       // alpha starts as *val (read through the pointer inside the kernel) and is modified by options2:
01376       //   bit 0 set: alpha = -alpha
01377       //   bit 1 set: alpha = 1 / alpha   (applied after the sign change)
01378       // Entry (i,j) of A is A[(i * A_inc1 + A_start1) * A_internal_size2 + j * A_inc2 + A_start2];
01379       // vec1(i) is vec1[i * inc1 + start1], vec2(j) is vec2[j * inc2 + start2].
01380       // A_internal_size1, size1 and size2 are not read.
01381       template <typename T>
01382       __global__ void scaled_rank1_update_row_kernel(
01383                 T * A,
01384                 unsigned int A_start1, unsigned int A_start2,
01385                 unsigned int A_inc1, unsigned int A_inc2,
01386                 unsigned int A_size1, unsigned int A_size2,
01387                 unsigned int A_internal_size1, unsigned int A_internal_size2,
01388                 const T * val, unsigned int options2,
01389                 const T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
01390                 const T * vec2, unsigned int start2, unsigned int inc2, unsigned int size2)
01391       {
01392         bool const flip_sign       = (options2 & (1 << 0)) != 0;
01393         bool const take_reciprocal = (options2 & (1 << 1)) != 0;
01394         T factor = *val;
01395         if (flip_sign)
01396           factor = -factor;
01397         if (take_reciprocal)
01398           factor = ((T)(1)) / factor;
01399         unsigned int const global_id = blockIdx.x * blockDim.x + threadIdx.x;
01400         unsigned int const row_begin = global_id / blockDim.x;  // later rows follow in steps of gridDim.x
01401         unsigned int const col_begin = global_id % blockDim.x;  // later columns follow in steps of blockDim.x
01402         for (unsigned int r = row_begin; r < A_size1; r += gridDim.x)
01403         {
01404           // alpha * vec1(r) does not depend on the column, so it is formed once per row.
01405           T const scaled_entry = factor * vec1[r * inc1 + start1];
01406           unsigned int const row_offset = (r * A_inc1 + A_start1) * A_internal_size2 + A_start2;
01407           for (unsigned int c = col_begin; c < A_size2; c += blockDim.x)
01408             A[row_offset + c * A_inc2] += scaled_entry * vec2[c * inc2 + start2];
01409         }
01410       }
01411 
01412 
01413 
01414     } // namespace cuda
01415   } //namespace linalg
01416 } //namespace viennacl
01417 
01418 
01419 #endif