ViennaCL - The Vienna Computing Library  1.5.0
viennacl/linalg/cuda/matrix_operations_prod.hpp
Go to the documentation of this file.
00001 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_
00002 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_
00003 
00004 /* =========================================================================
00005    Copyright (c) 2010-2013, Institute for Microelectronics,
00006                             Institute for Analysis and Scientific Computing,
00007                             TU Wien.
00008    Portions of this software are copyright by UChicago Argonne, LLC.
00009 
00010                             -----------------
00011                   ViennaCL - The Vienna Computing Library
00012                             -----------------
00013 
00014    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
00015 
00016    (A list of authors and contributors can be found in the PDF manual)
00017 
00018    License:         MIT (X11), see file LICENSE in the base directory
00019 ============================================================================= */
00020 
00028 namespace viennacl
00029 {
00030   namespace linalg
00031   {
00032     namespace cuda
00033     {
00034 
00035       // matrix-matrix multiplication C = A * B
00036       // matrix layouts: C...col_major, A...col_major, B...col_major
/**
 * @brief Shared-memory tiled product  C = alpha * A * B + beta * C.
 *
 * Storage layouts: A, B and C are all column-major (element (i,j) of X lives at
 * (i * X_row_inc + X_row_start) + (j * X_col_inc + X_col_start) * X_internal_rows).
 *
 * Launch layout: blockIdx.x selects a 16-row tile of C, blockIdx.y a 16-column
 * tile of C; each thread (threadIdx.x, threadIdx.y) produces one entry of C.
 * The tile edge is hard-coded to 16 and the two shared buffers hold
 * 16 * 17 = 272 elements each, so the kernel is written for 16x16 thread blocks.
 *
 * @param alpha                      Scalar applied to the product A * B.
 * @param A, B                       Input matrices (read only).
 * @param X_row_start, X_col_start   Index of the first row/column used in X's buffer.
 * @param X_row_inc, X_col_inc       Stride (in rows/columns) between consecutive used entries.
 * @param X_row_size, X_col_size     Logical number of rows/columns of X.
 * @param X_internal_rows            Number of buffer rows; distance between adjacent columns.
 * @param beta                       Scalar applied to the previous contents of C.
 *                                   If beta == 0, C is never read.
 * @param C                          Result matrix.
 *
 * A_internal_cols, B_internal_cols, C_row_size, C_col_size and C_internal_cols
 * are accepted for a uniform signature but are not read by this kernel.
 */
template <typename T>
__global__ void matrix_matrix_col_col_col_prod_AA_kernel(
          T alpha,
          const T * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const T * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          T beta,
          T * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // One tile of A and one tile of B; rows are stored with a stride of 17
  // (block_size + 1), presumably to avoid shared-memory bank conflicts.
  __shared__ T bufA[272];
  __shared__ T bufB[272];

  const vcl_size_t block_size    = 16;
  const vcl_size_t row_block_id  = blockIdx.x;
  const vcl_size_t col_block_id  = blockIdx.y;
  const vcl_size_t row_thread_id = threadIdx.x;
  const vcl_size_t col_thread_id = threadIdx.y;

  // All offsets are formed in vcl_size_t. The start-column product used to be
  // evaluated in unsigned int and could wrap before being widened
  // (assumes vcl_size_t is wider than unsigned int on the target - confirm against its typedef).
  vcl_size_t       aBegin = (row_block_id * block_size * A_row_inc + A_row_start)
                          + static_cast<vcl_size_t>(A_col_start) * A_internal_rows;
  const vcl_size_t aStep  = block_size * A_col_inc * A_internal_rows;   // next tile: 16 columns to the right
  vcl_size_t       bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
  const vcl_size_t bStep  = block_size * B_row_inc;                     // next tile: 16 rows down

  // Number of tiles along the contracted dimension (columns of A), rounded up.
  const vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;

  // Position of this thread's element inside a tile: row offset threadIdx.x, column offset threadIdx.y.
  const vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
  const vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  const vcl_size_t row_slot = row_thread_id * (block_size + 1);
  const vcl_size_t col_slot = col_thread_id * (block_size + 1);

  T Csub = 0;
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // bufA[i * 17 + k] = A(tile_row + i, k0 + k); entries outside A are zero-filled
    const bool a_inside = (block * block_size + col_thread_id < A_col_size)
                       && (row_block_id * block_size + row_thread_id < A_row_size);
    bufA[row_slot + col_thread_id] = a_inside ? A[aBegin + aOffset] : T(0);

    // bufB[j * 17 + k] = B(k0 + k, tile_col + j); entries outside B are zero-filled
    const bool b_inside = (block * block_size + row_thread_id < B_row_size)
                       && (col_block_id * block_size + col_thread_id < B_col_size);
    bufB[col_slot + row_thread_id] = b_inside ? B[bBegin + bOffset] : T(0);

    __syncthreads();   // both tiles must be complete before any thread reads them

    const T * bufAptr = bufA + row_slot;
    const T * bufBptr = bufB + col_slot;
    // Fixed trip count of 16, accumulated in the same order as the former hand-unrolled code.
    #pragma unroll
    for (vcl_size_t k = 0; k < block_size; ++k)
      Csub += bufAptr[k] * bufBptr[k];

    __syncthreads();   // nobody may overwrite the tiles while others still read them

    aBegin += aStep;
    bBegin += bStep;
  }

  // Global coordinates of the entry of C owned by this thread.
  const vcl_size_t C_row = row_block_id * blockDim.x + row_thread_id;
  const vcl_size_t C_col = col_block_id * blockDim.y + col_thread_id;
  if (C_row < A_row_size && C_col < B_col_size)
  {
    // Column-major offset into C, computed once and in vcl_size_t (was unsigned int).
    const vcl_size_t C_index = C_row * C_row_inc + C_row_start + (C_col * C_col_inc + C_col_start) * C_internal_rows;
    C[C_index] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[C_index];
  }
}
00121 
00122       // matrix-matrix multiplication C = A * B^T
00123       // matrix layouts: C...col_major, A...col_major, B...col_major
/**
 * @brief Shared-memory tiled product  C = alpha * A * B^T + beta * C.
 *
 * Storage layouts: A, B and C are all column-major (element (i,j) of X lives at
 * (i * X_row_inc + X_row_start) + (j * X_col_inc + X_col_start) * X_internal_rows).
 *
 * Launch layout: blockIdx.x selects a 16-row tile of C, blockIdx.y a 16-column
 * tile of C (i.e. 16 rows of B); each thread produces one entry of C.
 * The tile edge is hard-coded to 16 and the two shared buffers hold
 * 16 * 17 = 272 elements each, so the kernel is written for 16x16 thread blocks.
 *
 * @param alpha                      Scalar applied to the product A * B^T.
 * @param A, B                       Input matrices (read only).
 * @param X_row_start, X_col_start   Index of the first row/column used in X's buffer.
 * @param X_row_inc, X_col_inc       Stride (in rows/columns) between consecutive used entries.
 * @param X_row_size, X_col_size     Logical number of rows/columns of X.
 * @param X_internal_rows            Number of buffer rows; distance between adjacent columns.
 * @param beta                       Scalar applied to the previous contents of C.
 *                                   If beta == 0, C is never read.
 * @param C                          Result matrix.
 *
 * A_internal_cols, B_internal_cols, C_row_size, C_col_size and C_internal_cols
 * are accepted for a uniform signature but are not read by this kernel.
 */
template <typename T>
__global__ void matrix_matrix_col_col_col_prod_AT_kernel(
          T alpha,
          const T * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const T * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          T beta,
          T * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // One tile of A and one tile of B; rows are stored with a stride of 17
  // (block_size + 1), presumably to avoid shared-memory bank conflicts.
  __shared__ T bufA[272];
  __shared__ T bufB[272];

  const vcl_size_t block_size    = 16;
  const vcl_size_t row_block_id  = blockIdx.x;
  const vcl_size_t col_block_id  = blockIdx.y;
  const vcl_size_t row_thread_id = threadIdx.x;
  const vcl_size_t col_thread_id = threadIdx.y;

  // All offsets are formed in vcl_size_t. The start-column products used to be
  // evaluated in unsigned int and could wrap before being widened
  // (assumes vcl_size_t is wider than unsigned int on the target - confirm against its typedef).
  vcl_size_t       aBegin = (row_block_id * block_size * A_row_inc + A_row_start)
                          + static_cast<vcl_size_t>(A_col_start) * A_internal_rows;
  const vcl_size_t aStep  = block_size * A_col_inc * A_internal_rows;   // next tile: 16 columns to the right
  vcl_size_t       bBegin = (col_block_id * block_size * B_row_inc + B_row_start)
                          + static_cast<vcl_size_t>(B_col_start) * B_internal_rows;
  const vcl_size_t bStep  = block_size * B_internal_rows * B_col_inc;   // next tile: 16 columns to the right

  // Number of tiles along the contracted dimension (columns of A), rounded up.
  const vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;

  // Position of this thread's element inside a tile: row offset threadIdx.x, column offset threadIdx.y.
  const vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
  const vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  const vcl_size_t row_slot = row_thread_id * (block_size + 1);
  const vcl_size_t col_slot = col_thread_id * (block_size + 1);

  T Csub = 0;
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // bufA[i * 17 + k] = A(tile_row + i, k0 + k); entries outside A are zero-filled
    const bool a_inside = (block * block_size + col_thread_id < A_col_size)
                       && (row_block_id * block_size + row_thread_id < A_row_size);
    bufA[row_slot + col_thread_id] = a_inside ? A[aBegin + aOffset] : T(0);

    // bufB[j * 17 + k] = B(tile_col + j, k0 + k); entries outside B are zero-filled
    const bool b_inside = (block * block_size + col_thread_id < B_col_size)
                       && (col_block_id * block_size + row_thread_id < B_row_size);
    bufB[row_slot + col_thread_id] = b_inside ? B[bBegin + bOffset] : T(0);

    __syncthreads();   // both tiles must be complete before any thread reads them

    const T * bufAptr = bufA + row_slot;
    const T * bufBptr = bufB + col_slot;
    // Fixed trip count of 16, accumulated in the same order as the former hand-unrolled code.
    #pragma unroll
    for (vcl_size_t k = 0; k < block_size; ++k)
      Csub += bufAptr[k] * bufBptr[k];

    __syncthreads();   // nobody may overwrite the tiles while others still read them

    aBegin += aStep;
    bBegin += bStep;
  }

  // Global coordinates of the entry of C owned by this thread.
  const vcl_size_t C_row = row_block_id * blockDim.x + row_thread_id;
  const vcl_size_t C_col = col_block_id * blockDim.y + col_thread_id;
  if (C_row < A_row_size && C_col < B_row_size)
  {
    // Column-major offset into C, computed once and in vcl_size_t (was unsigned int).
    const vcl_size_t C_index = C_row * C_row_inc + C_row_start + (C_col * C_col_inc + C_col_start) * C_internal_rows;
    C[C_index] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[C_index];
  }
}
00208 
00209       // matrix-matrix multiplication C = A^T * B
00210       // matrix layouts: C...col_major, A...col_major, B...col_major
/**
 * @brief Shared-memory tiled product  C = alpha * A^T * B + beta * C.
 *
 * Storage layouts: A, B and C are all column-major (element (i,j) of X lives at
 * (i * X_row_inc + X_row_start) + (j * X_col_inc + X_col_start) * X_internal_rows).
 *
 * Launch layout: blockIdx.x selects a 16-row tile of C (i.e. 16 columns of A),
 * blockIdx.y a 16-column tile of C; each thread produces one entry of C.
 * The tile edge is hard-coded to 16 and the two shared buffers hold
 * 16 * 17 = 272 elements each, so the kernel is written for 16x16 thread blocks.
 *
 * @param alpha                      Scalar applied to the product A^T * B.
 * @param A, B                       Input matrices (read only).
 * @param X_row_start, X_col_start   Index of the first row/column used in X's buffer.
 * @param X_row_inc, X_col_inc       Stride (in rows/columns) between consecutive used entries.
 * @param X_row_size, X_col_size     Logical number of rows/columns of X.
 * @param X_internal_rows            Number of buffer rows; distance between adjacent columns.
 * @param beta                       Scalar applied to the previous contents of C.
 *                                   If beta == 0, C is never read.
 * @param C                          Result matrix.
 *
 * A_internal_cols, B_internal_cols, C_row_size, C_col_size and C_internal_cols
 * are accepted for a uniform signature but are not read by this kernel.
 */
template <typename T>
__global__ void matrix_matrix_col_col_col_prod_TA_kernel(
          T alpha,
          const T * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const T * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          T beta,
          T * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // One tile of A and one tile of B; rows are stored with a stride of 17
  // (block_size + 1), presumably to avoid shared-memory bank conflicts.
  __shared__ T bufA[272];
  __shared__ T bufB[272];

  const vcl_size_t block_size    = 16;
  const vcl_size_t row_block_id  = blockIdx.x;
  const vcl_size_t col_block_id  = blockIdx.y;
  const vcl_size_t row_thread_id = threadIdx.x;
  const vcl_size_t col_thread_id = threadIdx.y;

  vcl_size_t       aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
  const vcl_size_t aStep  = block_size * A_row_inc;                     // next tile: 16 rows down
  vcl_size_t       bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
  const vcl_size_t bStep  = block_size * B_row_inc;                     // next tile: 16 rows down

  // Number of tiles along the contracted dimension (rows of A), rounded up.
  const vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;

  // Position of this thread's element inside a tile: row offset threadIdx.x, column offset threadIdx.y.
  const vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
  const vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  const vcl_size_t row_slot = row_thread_id * (block_size + 1);
  const vcl_size_t col_slot = col_thread_id * (block_size + 1);

  T Csub = 0;
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // bufA[i * 17 + k] = A(k0 + k, tile_row + i); entries outside A are zero-filled
    const bool a_inside = (block * block_size + row_thread_id < A_row_size)
                       && (row_block_id * block_size + col_thread_id < A_col_size);
    bufA[col_slot + row_thread_id] = a_inside ? A[aBegin + aOffset] : T(0);

    // bufB[j * 17 + k] = B(k0 + k, tile_col + j); entries outside B are zero-filled
    const bool b_inside = (block * block_size + row_thread_id < B_row_size)
                       && (col_block_id * block_size + col_thread_id < B_col_size);
    bufB[col_slot + row_thread_id] = b_inside ? B[bBegin + bOffset] : T(0);

    __syncthreads();   // both tiles must be complete before any thread reads them

    const T * bufAptr = bufA + row_slot;
    const T * bufBptr = bufB + col_slot;
    // Fixed trip count of 16, accumulated in the same order as the former hand-unrolled code.
    #pragma unroll
    for (vcl_size_t k = 0; k < block_size; ++k)
      Csub += bufAptr[k] * bufBptr[k];

    __syncthreads();   // nobody may overwrite the tiles while others still read them

    aBegin += aStep;
    bBegin += bStep;
  }

  // Global coordinates of the entry of C owned by this thread.
  const vcl_size_t C_row = row_block_id * blockDim.x + row_thread_id;
  const vcl_size_t C_col = col_block_id * blockDim.y + col_thread_id;
  if (C_row < A_col_size && C_col < B_col_size)
  {
    // Column-major offset into C, computed once and in vcl_size_t. The original
    // formed it in unsigned int, which can wrap for large buffers
    // (assumes vcl_size_t is wider than unsigned int on the target - confirm against its typedef).
    const vcl_size_t C_index = C_row * C_row_inc + C_row_start + (C_col * C_col_inc + C_col_start) * C_internal_rows;
    C[C_index] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[C_index];
  }
}
00295 
00296       // matrix-matrix multiplication C = A^T * B^T
00297       // matrix layouts: C...col_major, A...col_major, B...col_major
/**
 * @brief Shared-memory tiled product  C = alpha * A^T * B^T + beta * C.
 *
 * Storage layouts: A, B and C are all column-major (element (i,j) of X lives at
 * (i * X_row_inc + X_row_start) + (j * X_col_inc + X_col_start) * X_internal_rows).
 *
 * Launch layout: blockIdx.x selects a 16-row tile of C (i.e. 16 columns of A),
 * blockIdx.y a 16-column tile of C (i.e. 16 rows of B); each thread produces
 * one entry of C. The tile edge is hard-coded to 16 and the two shared buffers
 * hold 16 * 17 = 272 elements each, so the kernel is written for 16x16 thread blocks.
 *
 * @param alpha                      Scalar applied to the product A^T * B^T.
 * @param A, B                       Input matrices (read only).
 * @param X_row_start, X_col_start   Index of the first row/column used in X's buffer.
 * @param X_row_inc, X_col_inc       Stride (in rows/columns) between consecutive used entries.
 * @param X_row_size, X_col_size     Logical number of rows/columns of X.
 * @param X_internal_rows            Number of buffer rows; distance between adjacent columns.
 * @param beta                       Scalar applied to the previous contents of C.
 *                                   If beta == 0, C is never read.
 * @param C                          Result matrix.
 *
 * A_internal_cols, B_internal_cols, C_row_size, C_col_size and C_internal_cols
 * are accepted for a uniform signature but are not read by this kernel.
 */
template <typename T>
__global__ void matrix_matrix_col_col_col_prod_TT_kernel(
          T alpha,
          const T * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const T * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          T beta,
          T * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // One tile of A and one tile of B; rows are stored with a stride of 17
  // (block_size + 1), presumably to avoid shared-memory bank conflicts.
  __shared__ T bufA[272];
  __shared__ T bufB[272];

  const vcl_size_t block_size    = 16;
  const vcl_size_t row_block_id  = blockIdx.x;
  const vcl_size_t col_block_id  = blockIdx.y;
  const vcl_size_t row_thread_id = threadIdx.x;
  const vcl_size_t col_thread_id = threadIdx.y;

  // All offsets are formed in vcl_size_t. The start-column product of B used to
  // be evaluated in unsigned int and could wrap before being widened
  // (assumes vcl_size_t is wider than unsigned int on the target - confirm against its typedef).
  vcl_size_t       aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
  const vcl_size_t aStep  = block_size * A_row_inc;                     // next tile: 16 rows down
  vcl_size_t       bBegin = (col_block_id * block_size * B_row_inc + B_row_start)
                          + static_cast<vcl_size_t>(B_col_start) * B_internal_rows;
  const vcl_size_t bStep  = block_size * B_internal_rows * B_col_inc;   // next tile: 16 columns to the right

  // Number of tiles along the contracted dimension (rows of A), rounded up.
  const vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;

  // Position of this thread's element inside a tile: row offset threadIdx.x, column offset threadIdx.y.
  const vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
  const vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  const vcl_size_t row_slot = row_thread_id * (block_size + 1);
  const vcl_size_t col_slot = col_thread_id * (block_size + 1);

  T Csub = 0;
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // bufA[i * 17 + k] = A(k0 + k, tile_row + i); entries outside A are zero-filled
    const bool a_inside = (block * block_size + row_thread_id < A_row_size)
                       && (row_block_id * block_size + col_thread_id < A_col_size);
    bufA[col_slot + row_thread_id] = a_inside ? A[aBegin + aOffset] : T(0);

    // bufB[j * 17 + k] = B(tile_col + j, k0 + k); entries outside B are zero-filled
    const bool b_inside = (block * block_size + col_thread_id < B_col_size)
                       && (col_block_id * block_size + row_thread_id < B_row_size);
    bufB[row_slot + col_thread_id] = b_inside ? B[bBegin + bOffset] : T(0);

    __syncthreads();   // both tiles must be complete before any thread reads them

    const T * bufAptr = bufA + row_slot;
    const T * bufBptr = bufB + col_slot;
    // Fixed trip count of 16, accumulated in the same order as the former hand-unrolled code.
    #pragma unroll
    for (vcl_size_t k = 0; k < block_size; ++k)
      Csub += bufAptr[k] * bufBptr[k];

    __syncthreads();   // nobody may overwrite the tiles while others still read them

    aBegin += aStep;
    bBegin += bStep;
  }

  // Global coordinates of the entry of C owned by this thread.
  const vcl_size_t C_row = row_block_id * blockDim.x + row_thread_id;
  const vcl_size_t C_col = col_block_id * blockDim.y + col_thread_id;
  if (C_row < A_col_size && C_col < B_row_size)
  {
    // Column-major offset into C, computed once and in vcl_size_t (was unsigned int).
    const vcl_size_t C_index = C_row * C_row_inc + C_row_start + (C_col * C_col_inc + C_col_start) * C_internal_rows;
    C[C_index] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[C_index];
  }
}
00382 
00383 
00384 
00386 
00387 
00388 
00389 
00390       // matrix-matrix multiplication C = A * B
00391       // matrix layouts: C...row_major, A...col_major, B...col_major
/**
 * @brief Shared-memory tiled product  C = alpha * A * B + beta * C  with a row-major result.
 *
 * Storage layouts: A and B are column-major (element (i,j) of X lives at
 * (i * X_row_inc + X_row_start) + (j * X_col_inc + X_col_start) * X_internal_rows);
 * C is row-major (element (i,j) lives at
 * (i * C_row_inc + C_row_start) * C_internal_cols + j * C_col_inc + C_col_start).
 *
 * Launch layout: blockIdx.x selects a 16-row tile of C, blockIdx.y a 16-column
 * tile of C; each thread (threadIdx.x, threadIdx.y) produces one entry of C.
 * The tile edge is hard-coded to 16 and the two shared buffers hold
 * 16 * 17 = 272 elements each, so the kernel is written for 16x16 thread blocks.
 *
 * @param alpha                      Scalar applied to the product A * B.
 * @param A, B                       Input matrices (read only).
 * @param X_row_start, X_col_start   Index of the first row/column used in X's buffer.
 * @param X_row_inc, X_col_inc       Stride (in rows/columns) between consecutive used entries.
 * @param X_row_size, X_col_size     Logical number of rows/columns of X.
 * @param A_internal_rows, B_internal_rows  Number of buffer rows; distance between adjacent columns.
 * @param C_internal_cols            Number of buffer columns of C; distance between adjacent rows.
 * @param beta                       Scalar applied to the previous contents of C.
 *                                   If beta == 0, C is never read.
 * @param C                          Result matrix.
 *
 * A_internal_cols, B_internal_cols, C_row_size, C_col_size and C_internal_rows
 * are accepted for a uniform signature but are not read by this kernel.
 */
template <typename T>
__global__ void matrix_matrix_row_col_col_prod_AA_kernel(
          T alpha,
          const T * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const T * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          T beta,
          T * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // One tile of A and one tile of B; rows are stored with a stride of 17
  // (block_size + 1), presumably to avoid shared-memory bank conflicts.
  __shared__ T bufA[272];
  __shared__ T bufB[272];

  const vcl_size_t block_size    = 16;
  const vcl_size_t row_block_id  = blockIdx.x;
  const vcl_size_t col_block_id  = blockIdx.y;
  const vcl_size_t row_thread_id = threadIdx.x;
  const vcl_size_t col_thread_id = threadIdx.y;

  // All offsets are formed in vcl_size_t. The start-column product used to be
  // evaluated in unsigned int and could wrap before being widened
  // (assumes vcl_size_t is wider than unsigned int on the target - confirm against its typedef).
  vcl_size_t       aBegin = (row_block_id * block_size * A_row_inc + A_row_start)
                          + static_cast<vcl_size_t>(A_col_start) * A_internal_rows;
  const vcl_size_t aStep  = block_size * A_col_inc * A_internal_rows;   // next tile: 16 columns to the right
  vcl_size_t       bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
  const vcl_size_t bStep  = block_size * B_row_inc;                     // next tile: 16 rows down

  // Number of tiles along the contracted dimension (columns of A), rounded up.
  const vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;

  // Position of this thread's element inside a tile: row offset threadIdx.x, column offset threadIdx.y.
  const vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
  const vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  const vcl_size_t row_slot = row_thread_id * (block_size + 1);
  const vcl_size_t col_slot = col_thread_id * (block_size + 1);

  T Csub = 0;
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // bufA[i * 17 + k] = A(tile_row + i, k0 + k); entries outside A are zero-filled
    const bool a_inside = (block * block_size + col_thread_id < A_col_size)
                       && (row_block_id * block_size + row_thread_id < A_row_size);
    bufA[row_slot + col_thread_id] = a_inside ? A[aBegin + aOffset] : T(0);

    // bufB[j * 17 + k] = B(k0 + k, tile_col + j); entries outside B are zero-filled
    const bool b_inside = (block * block_size + row_thread_id < B_row_size)
                       && (col_block_id * block_size + col_thread_id < B_col_size);
    bufB[col_slot + row_thread_id] = b_inside ? B[bBegin + bOffset] : T(0);

    __syncthreads();   // both tiles must be complete before any thread reads them

    const T * bufAptr = bufA + row_slot;
    const T * bufBptr = bufB + col_slot;
    // Fixed trip count of 16, accumulated in the same order as the former hand-unrolled code.
    #pragma unroll
    for (vcl_size_t k = 0; k < block_size; ++k)
      Csub += bufAptr[k] * bufBptr[k];

    __syncthreads();   // nobody may overwrite the tiles while others still read them

    aBegin += aStep;
    bBegin += bStep;
  }

  // Global coordinates of the entry of C owned by this thread.
  const vcl_size_t C_row = row_block_id * blockDim.x + row_thread_id;
  const vcl_size_t C_col = col_block_id * blockDim.y + col_thread_id;
  if (C_row < A_row_size && C_col < B_col_size)
  {
    // Row-major offset into C, computed once and in vcl_size_t (was unsigned int).
    const vcl_size_t C_index = (C_row * C_row_inc + C_row_start) * C_internal_cols + C_col * C_col_inc + C_col_start;
    C[C_index] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[C_index];
  }
}
00476 
00477       // matrix-matrix multiplication C = A * B^T
00478       // matrix layouts: C...row_major, A...col_major, B...col_major
/**
 * @brief Shared-memory tiled product  C = alpha * A * B^T + beta * C  with a row-major result.
 *
 * Storage layouts: A and B are column-major (element (i,j) of X lives at
 * (i * X_row_inc + X_row_start) + (j * X_col_inc + X_col_start) * X_internal_rows);
 * C is row-major (element (i,j) lives at
 * (i * C_row_inc + C_row_start) * C_internal_cols + j * C_col_inc + C_col_start).
 *
 * Launch layout: blockIdx.x selects a 16-row tile of C, blockIdx.y a 16-column
 * tile of C (i.e. 16 rows of B); each thread produces one entry of C.
 * The tile edge is hard-coded to 16 and the two shared buffers hold
 * 16 * 17 = 272 elements each, so the kernel is written for 16x16 thread blocks.
 *
 * @param alpha                      Scalar applied to the product A * B^T.
 * @param A, B                       Input matrices (read only).
 * @param X_row_start, X_col_start   Index of the first row/column used in X's buffer.
 * @param X_row_inc, X_col_inc       Stride (in rows/columns) between consecutive used entries.
 * @param X_row_size, X_col_size     Logical number of rows/columns of X.
 * @param A_internal_rows, B_internal_rows  Number of buffer rows; distance between adjacent columns.
 * @param C_internal_cols            Number of buffer columns of C; distance between adjacent rows.
 * @param beta                       Scalar applied to the previous contents of C.
 *                                   If beta == 0, C is never read.
 * @param C                          Result matrix.
 *
 * A_internal_cols, B_internal_cols, C_row_size, C_col_size and C_internal_rows
 * are accepted for a uniform signature but are not read by this kernel.
 */
template <typename T>
__global__ void matrix_matrix_row_col_col_prod_AT_kernel(
          T alpha,
          const T * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const T * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          T beta,
          T * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // One tile of A and one tile of B; rows are stored with a stride of 17
  // (block_size + 1), presumably to avoid shared-memory bank conflicts.
  __shared__ T bufA[272];
  __shared__ T bufB[272];

  const vcl_size_t block_size    = 16;
  const vcl_size_t row_block_id  = blockIdx.x;
  const vcl_size_t col_block_id  = blockIdx.y;
  const vcl_size_t row_thread_id = threadIdx.x;
  const vcl_size_t col_thread_id = threadIdx.y;

  // All offsets are formed in vcl_size_t. The start-column products used to be
  // evaluated in unsigned int and could wrap before being widened
  // (assumes vcl_size_t is wider than unsigned int on the target - confirm against its typedef).
  vcl_size_t       aBegin = (row_block_id * block_size * A_row_inc + A_row_start)
                          + static_cast<vcl_size_t>(A_col_start) * A_internal_rows;
  const vcl_size_t aStep  = block_size * A_col_inc * A_internal_rows;   // next tile: 16 columns to the right
  vcl_size_t       bBegin = (col_block_id * block_size * B_row_inc + B_row_start)
                          + static_cast<vcl_size_t>(B_col_start) * B_internal_rows;
  const vcl_size_t bStep  = block_size * B_internal_rows * B_col_inc;   // next tile: 16 columns to the right

  // Number of tiles along the contracted dimension (columns of A), rounded up.
  const vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;

  // Position of this thread's element inside a tile: row offset threadIdx.x, column offset threadIdx.y.
  const vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
  const vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  const vcl_size_t row_slot = row_thread_id * (block_size + 1);
  const vcl_size_t col_slot = col_thread_id * (block_size + 1);

  T Csub = 0;
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // bufA[i * 17 + k] = A(tile_row + i, k0 + k); entries outside A are zero-filled
    const bool a_inside = (block * block_size + col_thread_id < A_col_size)
                       && (row_block_id * block_size + row_thread_id < A_row_size);
    bufA[row_slot + col_thread_id] = a_inside ? A[aBegin + aOffset] : T(0);

    // bufB[j * 17 + k] = B(tile_col + j, k0 + k); entries outside B are zero-filled
    const bool b_inside = (block * block_size + col_thread_id < B_col_size)
                       && (col_block_id * block_size + row_thread_id < B_row_size);
    bufB[row_slot + col_thread_id] = b_inside ? B[bBegin + bOffset] : T(0);

    __syncthreads();   // both tiles must be complete before any thread reads them

    const T * bufAptr = bufA + row_slot;
    const T * bufBptr = bufB + col_slot;
    // Fixed trip count of 16, accumulated in the same order as the former hand-unrolled code.
    #pragma unroll
    for (vcl_size_t k = 0; k < block_size; ++k)
      Csub += bufAptr[k] * bufBptr[k];

    __syncthreads();   // nobody may overwrite the tiles while others still read them

    aBegin += aStep;
    bBegin += bStep;
  }

  // Global coordinates of the entry of C owned by this thread.
  const vcl_size_t C_row = row_block_id * blockDim.x + row_thread_id;
  const vcl_size_t C_col = col_block_id * blockDim.y + col_thread_id;
  if (C_row < A_row_size && C_col < B_row_size)
  {
    // Row-major offset into C, computed once and in vcl_size_t (was unsigned int).
    const vcl_size_t C_index = (C_row * C_row_inc + C_row_start) * C_internal_cols + C_col * C_col_inc + C_col_start;
    C[C_index] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[C_index];
  }
}
00563 
00564       // matrix-matrix multiplication C = alpha * A^T * B + beta * C, one thread per entry of C, accumulated over 16x16 shared-memory tiles
00565       // matrix layouts: C...row_major, A...col_major, B...col_major
00566       template <typename T>
00567       __global__ void matrix_matrix_row_col_col_prod_TA_kernel(
00568                 T alpha,  // factor applied to the accumulated product
00569                 const T * A,  // entry (r,c) is read at (r * A_row_inc + A_row_start) + (c * A_col_inc + A_col_start) * A_internal_rows
00570                 unsigned int A_row_start,
00571                 unsigned int A_col_start,
00572                 unsigned int A_row_inc,
00573                 unsigned int A_col_inc,
00574                 unsigned int A_row_size,  // bound on the row index of A; also determines the number of tiles along the reduction
00575                 unsigned int A_col_size,  // bound on the column index of A and on the row index of C
00576                 unsigned int A_internal_rows,  // multiplier of the column index in the address of A
00577                 unsigned int A_internal_cols,  // not referenced in this kernel
00578                 const T * B,  // entry (r,c) is read at (r * B_row_inc + B_row_start) + (c * B_col_inc + B_col_start) * B_internal_rows
00579                 unsigned int B_row_start,
00580                 unsigned int B_col_start,
00581                 unsigned int B_row_inc,
00582                 unsigned int B_col_inc,
00583                 unsigned int B_row_size,  // bound on the row index of B
00584                 unsigned int B_col_size,  // bound on the column index of B and on the column index of C
00585                 unsigned int B_internal_rows,  // multiplier of the column index in the address of B
00586                 unsigned int B_internal_cols,  // not referenced in this kernel
00587                 T beta,  // factor applied to the previous value of C; C is not read when beta == 0
00588                 T * C,  // entry (i,j) is written at (i * C_row_inc + C_row_start) * C_internal_cols + j * C_col_inc + C_col_start
00589                 unsigned int C_row_start,
00590                 unsigned int C_col_start,
00591                 unsigned int C_row_inc,
00592                 unsigned int C_col_inc,
00593                 unsigned int C_row_size,  // not referenced in this kernel
00594                 unsigned int C_col_size,  // not referenced in this kernel
00595                 unsigned int C_internal_rows,  // not referenced in this kernel
00596                 unsigned int C_internal_cols)  // multiplier of the row index in the address of C
00597       {
00598 
00599         __shared__ T bufA[272];  // 272 = 16 * 17: one 16x16 tile stored with a row pitch of block_size + 1
00600         __shared__ T bufB[272];  // same shape as bufA; the extra column is presumably padding against shared-memory bank conflicts
00601 
00602         vcl_size_t block_size = 16;//get_local_size(0); NOTE(review): tile edge is hard-coded, so a 16x16 thread block appears to be required - confirm at the launch site
00603         vcl_size_t row_block_id = blockIdx.x;
00604         vcl_size_t col_block_id = blockIdx.y;
00605         vcl_size_t row_thread_id = threadIdx.x;
00606         vcl_size_t col_thread_id = threadIdx.y;
00607         vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;  // address of A at row 0, column row_block_id * 16
00608         vcl_size_t aStep = block_size * A_row_inc;  // next tile of A lies 16 rows further down
00609         vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;  // address of B at row 0, column col_block_id * 16
00610         vcl_size_t bStep = block_size * B_row_inc;  // next tile of B lies 16 rows further down
00611         vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;  // number of tiles along the reduction index, rounded up
00612         T Csub = 0;  // running sum for this thread's entry of C
00613         vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;  // this thread's element within a tile of A: row threadIdx.x, column threadIdx.y
00614         vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;  // this thread's element within a tile of B: row threadIdx.x, column threadIdx.y
00615 
00616         vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);  // despite the name, the factor is block_size + 1 (the tile pitch)
00617         vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
00618         for (vcl_size_t block = 0;
00619                 block < block_num;
00620                 ++block)
00621         {
00622           bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;  // bufA[threadIdx.y][threadIdx.x] = A(block*16 + threadIdx.x, row_block_id*16 + threadIdx.y), 0 outside the matrix
00623           bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;  // bufB[threadIdx.y][threadIdx.x] = B(block*16 + threadIdx.x, col_block_id*16 + threadIdx.y), 0 outside the matrix
00624           __syncthreads();  // both tiles must be completely written before any thread reads them
00625           T * bufAptr = bufA + row_thread_id_times_block_size;  // tile row threadIdx.x of bufA
00626           T * bufBptr = bufB + col_thread_id_times_block_size;  // tile row threadIdx.y of bufB
00627             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;  // 16 manually unrolled multiply-adds, one per position along the reduction index within the tile
00628             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00629             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00630             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00631             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00632             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00633             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00634             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00635             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00636             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00637             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00638             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00639             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00640             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00641             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00642             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00643           __syncthreads();  // all reads must finish before the next iteration overwrites the tiles
00644           aBegin += aStep;
00645           bBegin += bStep;
00646         }
00647         if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)  // only threads whose (i,j) lies inside the A_col_size x B_col_size result write
00648           C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];  // NOTE(review): this address is computed in unsigned int arithmetic and could wrap for very large buffers - confirm
00649       }
00650 
00651       // matrix-matrix multiplication C = alpha * A^T * B^T + beta * C, one thread per entry of C, accumulated over 16x16 shared-memory tiles
00652       // matrix layouts: C...row_major, A...col_major, B...col_major
00653       template <typename T>
00654       __global__ void matrix_matrix_row_col_col_prod_TT_kernel(
00655                 T alpha,  // factor applied to the accumulated product
00656                 const T * A,  // entry (r,c) is read at (r * A_row_inc + A_row_start) + (c * A_col_inc + A_col_start) * A_internal_rows
00657                 unsigned int A_row_start,
00658                 unsigned int A_col_start,
00659                 unsigned int A_row_inc,
00660                 unsigned int A_col_inc,
00661                 unsigned int A_row_size,  // bound on the row index of A; also determines the number of tiles along the reduction
00662                 unsigned int A_col_size,  // bound on the column index of A and on the row index of C
00663                 unsigned int A_internal_rows,  // multiplier of the column index in the address of A
00664                 unsigned int A_internal_cols,  // not referenced in this kernel
00665                 const T * B,  // entry (r,c) is read at (r * B_row_inc + B_row_start) + (c * B_col_inc + B_col_start) * B_internal_rows
00666                 unsigned int B_row_start,
00667                 unsigned int B_col_start,
00668                 unsigned int B_row_inc,
00669                 unsigned int B_col_inc,
00670                 unsigned int B_row_size,  // bound on the row index of B and on the column index of C
00671                 unsigned int B_col_size,  // bound on the column index of B
00672                 unsigned int B_internal_rows,  // multiplier of the column index in the address of B
00673                 unsigned int B_internal_cols,  // not referenced in this kernel
00674                 T beta,  // factor applied to the previous value of C; C is not read when beta == 0
00675                 T * C,  // entry (i,j) is written at (i * C_row_inc + C_row_start) * C_internal_cols + j * C_col_inc + C_col_start
00676                 unsigned int C_row_start,
00677                 unsigned int C_col_start,
00678                 unsigned int C_row_inc,
00679                 unsigned int C_col_inc,
00680                 unsigned int C_row_size,  // not referenced in this kernel
00681                 unsigned int C_col_size,  // not referenced in this kernel
00682                 unsigned int C_internal_rows,  // not referenced in this kernel
00683                 unsigned int C_internal_cols)  // multiplier of the row index in the address of C
00684       {
00685 
00686         __shared__ T bufA[272];  // 272 = 16 * 17: one 16x16 tile stored with a row pitch of block_size + 1
00687         __shared__ T bufB[272];  // same shape as bufA; the extra column is presumably padding against shared-memory bank conflicts
00688 
00689         vcl_size_t block_size = 16;//get_local_size(0); NOTE(review): tile edge is hard-coded, so a 16x16 thread block appears to be required - confirm at the launch site
00690         vcl_size_t row_block_id = blockIdx.x;
00691         vcl_size_t col_block_id = blockIdx.y;
00692         vcl_size_t row_thread_id = threadIdx.x;
00693         vcl_size_t col_thread_id = threadIdx.y;
00694         vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;  // address of A at row 0, column row_block_id * 16
00695         vcl_size_t aStep = block_size * A_row_inc;  // next tile of A lies 16 rows further down
00696         vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;  // address of B at row col_block_id * 16, column 0
00697         vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;  // next tile of B lies 16 columns further right
00698         vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;  // number of tiles along the reduction index, rounded up
00699         T Csub = 0;  // running sum for this thread's entry of C
00700         vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;  // this thread's element within a tile of A: row threadIdx.x, column threadIdx.y
00701         vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;  // this thread's element within a tile of B: row threadIdx.x, column threadIdx.y
00702 
00703         vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);  // despite the name, the factor is block_size + 1 (the tile pitch)
00704         vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
00705         for (vcl_size_t block = 0;
00706                 block < block_num;
00707                 ++block)
00708         {
00709           bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;  // bufA[threadIdx.y][threadIdx.x] = A(block*16 + threadIdx.x, row_block_id*16 + threadIdx.y), 0 outside the matrix
00710           bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;  // bufB[threadIdx.x][threadIdx.y] = B(col_block_id*16 + threadIdx.x, block*16 + threadIdx.y), 0 outside the matrix
00711           __syncthreads();  // both tiles must be completely written before any thread reads them
00712           T * bufAptr = bufA + row_thread_id_times_block_size;  // tile row threadIdx.x of bufA
00713           T * bufBptr = bufB + col_thread_id_times_block_size;  // tile row threadIdx.y of bufB
00714             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;  // 16 manually unrolled multiply-adds, one per position along the reduction index within the tile
00715             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00716             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00717             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00718             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00719             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00720             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00721             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00722             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00723             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00724             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00725             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00726             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00727             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00728             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00729             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00730           __syncthreads();  // all reads must finish before the next iteration overwrites the tiles
00731           aBegin += aStep;
00732           bBegin += bStep;
00733         }
00734         if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)  // only threads whose (i,j) lies inside the A_col_size x B_row_size result write
00735           C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];  // NOTE(review): this address is computed in unsigned int arithmetic and could wrap for very large buffers - confirm
00736       }
00737 
00738 
00739 
00740 
00742 
00743 
00744 
00745 
00746       // matrix-matrix multiplication C = alpha * A * B + beta * C, one thread per entry of C, accumulated over 16x16 shared-memory tiles
00747       // matrix layouts: C...col_major, A...col_major, B...row_major
00748       template <typename T>
00749       __global__ void matrix_matrix_col_col_row_prod_AA_kernel(
00750                 T alpha,  // factor applied to the accumulated product
00751                 const T * A,  // entry (r,c) is read at (r * A_row_inc + A_row_start) + (c * A_col_inc + A_col_start) * A_internal_rows
00752                 unsigned int A_row_start,
00753                 unsigned int A_col_start,
00754                 unsigned int A_row_inc,
00755                 unsigned int A_col_inc,
00756                 unsigned int A_row_size,  // bound on the row index of A and on the row index of C
00757                 unsigned int A_col_size,  // bound on the column index of A; also determines the number of tiles along the reduction
00758                 unsigned int A_internal_rows,  // multiplier of the column index in the address of A
00759                 unsigned int A_internal_cols,  // not referenced in this kernel
00760                 const T * B,  // entry (r,c) is read at (r * B_row_inc + B_row_start) * B_internal_cols + (c * B_col_inc + B_col_start)
00761                 unsigned int B_row_start,
00762                 unsigned int B_col_start,
00763                 unsigned int B_row_inc,
00764                 unsigned int B_col_inc,
00765                 unsigned int B_row_size,  // bound on the row index of B
00766                 unsigned int B_col_size,  // bound on the column index of B and on the column index of C
00767                 unsigned int B_internal_rows,  // not referenced in this kernel
00768                 unsigned int B_internal_cols,  // multiplier of the row index in the address of B
00769                 T beta,  // factor applied to the previous value of C; C is not read when beta == 0
00770                 T * C,  // entry (i,j) is written at (i * C_row_inc + C_row_start) + (j * C_col_inc + C_col_start) * C_internal_rows
00771                 unsigned int C_row_start,
00772                 unsigned int C_col_start,
00773                 unsigned int C_row_inc,
00774                 unsigned int C_col_inc,
00775                 unsigned int C_row_size,  // not referenced in this kernel
00776                 unsigned int C_col_size,  // not referenced in this kernel
00777                 unsigned int C_internal_rows,  // multiplier of the column index in the address of C
00778                 unsigned int C_internal_cols)  // not referenced in this kernel
00779       {
00780 
00781         __shared__ T bufA[272];  // 272 = 16 * 17: one 16x16 tile stored with a row pitch of block_size + 1
00782         __shared__ T bufB[272];  // same shape as bufA; the extra column is presumably padding against shared-memory bank conflicts
00783 
00784         vcl_size_t block_size = 16;//get_local_size(0); NOTE(review): tile edge is hard-coded, so a 16x16 thread block appears to be required - confirm at the launch site
00785         vcl_size_t row_block_id = blockIdx.x;
00786         vcl_size_t col_block_id = blockIdx.y;
00787         vcl_size_t row_thread_id = threadIdx.x;
00788         vcl_size_t col_thread_id = threadIdx.y;
00789         vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;  // address of A at row row_block_id * 16, column 0
00790         vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;  // next tile of A lies 16 columns further right
00791         vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;  // address of B at row 0, column col_block_id * 16
00792         vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;  // next tile of B lies 16 rows further down
00793         vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;  // number of tiles along the reduction index, rounded up
00794         T Csub = 0;  // running sum for this thread's entry of C
00795         vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;  // this thread's element within a tile of A: row threadIdx.x, column threadIdx.y
00796         vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;  // this thread's element within a tile of B: row threadIdx.y, column threadIdx.x
00797 
00798         vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);  // despite the name, the factor is block_size + 1 (the tile pitch)
00799         vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
00800         for (vcl_size_t block = 0;
00801                 block < block_num;
00802                 ++block)
00803         {
00804           bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;  // bufA[threadIdx.x][threadIdx.y] = A(row_block_id*16 + threadIdx.x, block*16 + threadIdx.y), 0 outside the matrix
00805           bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;  // bufB[threadIdx.x][threadIdx.y] = B(block*16 + threadIdx.y, col_block_id*16 + threadIdx.x), 0 outside the matrix
00806           __syncthreads();  // both tiles must be completely written before any thread reads them
00807           T * bufAptr = bufA + row_thread_id_times_block_size;  // tile row threadIdx.x of bufA
00808           T * bufBptr = bufB + col_thread_id_times_block_size;  // tile row threadIdx.y of bufB
00809             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;  // 16 manually unrolled multiply-adds, one per position along the reduction index within the tile
00810             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00811             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00812             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00813             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00814             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00815             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00816             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00817             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00818             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00819             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00820             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00821             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00822             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00823             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00824             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00825           __syncthreads();  // all reads must finish before the next iteration overwrites the tiles
00826           aBegin += aStep;
00827           bBegin += bStep;
00828         }
00829         if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)  // only threads whose (i,j) lies inside the A_row_size x B_col_size result write
00830           C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];  // NOTE(review): this address is computed in unsigned int arithmetic and could wrap for very large buffers - confirm
00831       }
00832 
00833       // matrix-matrix multiplication C = alpha * A * B^T + beta * C, one thread per entry of C, accumulated over 16x16 shared-memory tiles
00834       // matrix layouts: C...col_major, A...col_major, B...row_major
00835       template <typename T>
00836       __global__ void matrix_matrix_col_col_row_prod_AT_kernel(
00837                 T alpha,  // factor applied to the accumulated product
00838                 const T * A,  // entry (r,c) is read at (r * A_row_inc + A_row_start) + (c * A_col_inc + A_col_start) * A_internal_rows
00839                 unsigned int A_row_start,
00840                 unsigned int A_col_start,
00841                 unsigned int A_row_inc,
00842                 unsigned int A_col_inc,
00843                 unsigned int A_row_size,  // bound on the row index of A and on the row index of C
00844                 unsigned int A_col_size,  // bound on the column index of A; also determines the number of tiles along the reduction
00845                 unsigned int A_internal_rows,  // multiplier of the column index in the address of A
00846                 unsigned int A_internal_cols,  // not referenced in this kernel
00847                 const T * B,  // entry (r,c) is read at (r * B_row_inc + B_row_start) * B_internal_cols + (c * B_col_inc + B_col_start)
00848                 unsigned int B_row_start,
00849                 unsigned int B_col_start,
00850                 unsigned int B_row_inc,
00851                 unsigned int B_col_inc,
00852                 unsigned int B_row_size,  // bound on the row index of B and on the column index of C
00853                 unsigned int B_col_size,  // bound on the column index of B
00854                 unsigned int B_internal_rows,  // not referenced in this kernel
00855                 unsigned int B_internal_cols,  // multiplier of the row index in the address of B
00856                 T beta,  // factor applied to the previous value of C; C is not read when beta == 0
00857                 T * C,  // entry (i,j) is written at (i * C_row_inc + C_row_start) + (j * C_col_inc + C_col_start) * C_internal_rows
00858                 unsigned int C_row_start,
00859                 unsigned int C_col_start,
00860                 unsigned int C_row_inc,
00861                 unsigned int C_col_inc,
00862                 unsigned int C_row_size,  // not referenced in this kernel
00863                 unsigned int C_col_size,  // not referenced in this kernel
00864                 unsigned int C_internal_rows,  // multiplier of the column index in the address of C
00865                 unsigned int C_internal_cols)  // not referenced in this kernel
00866       {
00867 
00868         __shared__ T bufA[272];  // 272 = 16 * 17: one 16x16 tile stored with a row pitch of block_size + 1
00869         __shared__ T bufB[272];  // same shape as bufA; the extra column is presumably padding against shared-memory bank conflicts
00870 
00871         vcl_size_t block_size = 16;//get_local_size(0); NOTE(review): tile edge is hard-coded, so a 16x16 thread block appears to be required - confirm at the launch site
00872         vcl_size_t row_block_id = blockIdx.x;
00873         vcl_size_t col_block_id = blockIdx.y;
00874         vcl_size_t row_thread_id = threadIdx.x;
00875         vcl_size_t col_thread_id = threadIdx.y;
00876         vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;  // address of A at row row_block_id * 16, column 0
00877         vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;  // next tile of A lies 16 columns further right
00878         vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;  // address of B at row col_block_id * 16, column 0
00879         vcl_size_t bStep = block_size * B_col_inc;  // next tile of B lies 16 columns further right
00880         vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;  // number of tiles along the reduction index, rounded up
00881         T Csub = 0;  // running sum for this thread's entry of C
00882         vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;  // this thread's element within a tile of A: row threadIdx.x, column threadIdx.y
00883         vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;  // this thread's element within a tile of B: row threadIdx.y, column threadIdx.x
00884 
00885         vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);  // despite the name, the factor is block_size + 1 (the tile pitch)
00886         vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
00887         for (vcl_size_t block = 0;
00888                 block < block_num;
00889                 ++block)
00890         {
00891           bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;  // bufA[threadIdx.x][threadIdx.y] = A(row_block_id*16 + threadIdx.x, block*16 + threadIdx.y), 0 outside the matrix
00892           bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;  // bufB[threadIdx.y][threadIdx.x] = B(col_block_id*16 + threadIdx.y, block*16 + threadIdx.x), 0 outside the matrix
00893           __syncthreads();  // both tiles must be completely written before any thread reads them
00894           T * bufAptr = bufA + row_thread_id_times_block_size;  // tile row threadIdx.x of bufA
00895           T * bufBptr = bufB + col_thread_id_times_block_size;  // tile row threadIdx.y of bufB
00896             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;  // 16 manually unrolled multiply-adds, one per position along the reduction index within the tile
00897             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00898             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00899             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00900             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00901             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00902             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00903             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00904             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00905             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00906             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00907             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00908             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00909             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00910             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00911             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00912           __syncthreads();  // all reads must finish before the next iteration overwrites the tiles
00913           aBegin += aStep;
00914           bBegin += bStep;
00915         }
00916         if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)  // only threads whose (i,j) lies inside the A_row_size x B_row_size result write
00917           C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];  // NOTE(review): this address is computed in unsigned int arithmetic and could wrap for very large buffers - confirm
00918       }
00919 
00920       // matrix-matrix multiplication C = alpha * A^T * B + beta * C, one thread per entry of C, accumulated over 16x16 shared-memory tiles
00921       // matrix layouts: C...col_major, A...col_major, B...row_major
00922       template <typename T>
00923       __global__ void matrix_matrix_col_col_row_prod_TA_kernel(
00924                 T alpha,  // factor applied to the accumulated product
00925                 const T * A,  // entry (r,c) is read at (r * A_row_inc + A_row_start) + (c * A_col_inc + A_col_start) * A_internal_rows
00926                 unsigned int A_row_start,
00927                 unsigned int A_col_start,
00928                 unsigned int A_row_inc,
00929                 unsigned int A_col_inc,
00930                 unsigned int A_row_size,  // bound on the row index of A; also determines the number of tiles along the reduction
00931                 unsigned int A_col_size,  // bound on the column index of A and on the row index of C
00932                 unsigned int A_internal_rows,  // multiplier of the column index in the address of A
00933                 unsigned int A_internal_cols,  // not referenced in this kernel
00934                 const T * B,  // entry (r,c) is read at (r * B_row_inc + B_row_start) * B_internal_cols + (c * B_col_inc + B_col_start)
00935                 unsigned int B_row_start,
00936                 unsigned int B_col_start,
00937                 unsigned int B_row_inc,
00938                 unsigned int B_col_inc,
00939                 unsigned int B_row_size,  // bound on the row index of B
00940                 unsigned int B_col_size,  // bound on the column index of B and on the column index of C
00941                 unsigned int B_internal_rows,  // not referenced in this kernel
00942                 unsigned int B_internal_cols,  // multiplier of the row index in the address of B
00943                 T beta,  // factor applied to the previous value of C; C is not read when beta == 0
00944                 T * C,  // entry (i,j) is written at (i * C_row_inc + C_row_start) + (j * C_col_inc + C_col_start) * C_internal_rows
00945                 unsigned int C_row_start,
00946                 unsigned int C_col_start,
00947                 unsigned int C_row_inc,
00948                 unsigned int C_col_inc,
00949                 unsigned int C_row_size,  // not referenced in this kernel
00950                 unsigned int C_col_size,  // not referenced in this kernel
00951                 unsigned int C_internal_rows,  // multiplier of the column index in the address of C
00952                 unsigned int C_internal_cols)  // not referenced in this kernel
00953       {
00954 
00955         __shared__ T bufA[272];  // 272 = 16 * 17: one 16x16 tile stored with a row pitch of block_size + 1
00956         __shared__ T bufB[272];  // same shape as bufA; the extra column is presumably padding against shared-memory bank conflicts
00957 
00958         vcl_size_t block_size = 16;//get_local_size(0); NOTE(review): tile edge is hard-coded, so a 16x16 thread block appears to be required - confirm at the launch site
00959         vcl_size_t row_block_id = blockIdx.x;
00960         vcl_size_t col_block_id = blockIdx.y;
00961         vcl_size_t row_thread_id = threadIdx.x;
00962         vcl_size_t col_thread_id = threadIdx.y;
00963         vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;  // address of A at row 0, column row_block_id * 16
00964         vcl_size_t aStep = block_size * A_row_inc;  // next tile of A lies 16 rows further down
00965         vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;  // address of B at row 0, column col_block_id * 16
00966         vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;  // next tile of B lies 16 rows further down
00967         vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;  // number of tiles along the reduction index, rounded up
00968         T Csub = 0;  // running sum for this thread's entry of C
00969         vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;  // this thread's element within a tile of A: row threadIdx.x, column threadIdx.y
00970         vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;  // this thread's element within a tile of B: row threadIdx.y, column threadIdx.x
00971 
00972         vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);  // despite the name, the factor is block_size + 1 (the tile pitch)
00973         vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
00974         for (vcl_size_t block = 0;
00975                 block < block_num;
00976                 ++block)
00977         {
00978           bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;  // bufA[threadIdx.y][threadIdx.x] = A(block*16 + threadIdx.x, row_block_id*16 + threadIdx.y), 0 outside the matrix
00979           bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;  // bufB[threadIdx.x][threadIdx.y] = B(block*16 + threadIdx.y, col_block_id*16 + threadIdx.x), 0 outside the matrix
00980           __syncthreads();  // both tiles must be completely written before any thread reads them
00981           T * bufAptr = bufA + row_thread_id_times_block_size;  // tile row threadIdx.x of bufA
00982           T * bufBptr = bufB + col_thread_id_times_block_size;  // tile row threadIdx.y of bufB
00983             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;  // 16 manually unrolled multiply-adds, one per position along the reduction index within the tile
00984             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00985             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00986             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00987             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00988             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00989             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00990             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00991             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00992             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00993             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00994             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00995             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00996             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00997             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00998             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
00999           __syncthreads();  // all reads must finish before the next iteration overwrites the tiles
01000           aBegin += aStep;
01001           bBegin += bStep;
01002         }
01003         if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)  // only threads whose (i,j) lies inside the A_col_size x B_col_size result write
01004           C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];  // NOTE(review): this address is computed in unsigned int arithmetic and could wrap for very large buffers - confirm
01005       }
01006 
01007       // matrix-matrix multiplication C = alpha * A^T * B^T + beta * C, one thread per entry of C, accumulated over 16x16 shared-memory tiles
01008       // matrix layouts: C...col_major, A...col_major, B...row_major
01009       template <typename T>
01010       __global__ void matrix_matrix_col_col_row_prod_TT_kernel(
01011                 T alpha,  // factor applied to the accumulated product
01012                 const T * A,  // entry (r,c) is read at (r * A_row_inc + A_row_start) + (c * A_col_inc + A_col_start) * A_internal_rows
01013                 unsigned int A_row_start,
01014                 unsigned int A_col_start,
01015                 unsigned int A_row_inc,
01016                 unsigned int A_col_inc,
01017                 unsigned int A_row_size,  // bound on the row index of A; also determines the number of tiles along the reduction
01018                 unsigned int A_col_size,  // bound on the column index of A and on the row index of C
01019                 unsigned int A_internal_rows,  // multiplier of the column index in the address of A
01020                 unsigned int A_internal_cols,  // not referenced in this kernel
01021                 const T * B,  // entry (r,c) is read at (r * B_row_inc + B_row_start) * B_internal_cols + (c * B_col_inc + B_col_start)
01022                 unsigned int B_row_start,
01023                 unsigned int B_col_start,
01024                 unsigned int B_row_inc,
01025                 unsigned int B_col_inc,
01026                 unsigned int B_row_size,  // bound on the row index of B and on the column index of C
01027                 unsigned int B_col_size,  // bound on the column index of B
01028                 unsigned int B_internal_rows,  // not referenced in this kernel
01029                 unsigned int B_internal_cols,  // multiplier of the row index in the address of B
01030                 T beta,  // factor applied to the previous value of C; C is not read when beta == 0
01031                 T * C,  // entry (i,j) is written at (i * C_row_inc + C_row_start) + (j * C_col_inc + C_col_start) * C_internal_rows
01032                 unsigned int C_row_start,
01033                 unsigned int C_col_start,
01034                 unsigned int C_row_inc,
01035                 unsigned int C_col_inc,
01036                 unsigned int C_row_size,  // not referenced in this kernel
01037                 unsigned int C_col_size,  // not referenced in this kernel
01038                 unsigned int C_internal_rows,  // multiplier of the column index in the address of C
01039                 unsigned int C_internal_cols)  // not referenced in this kernel
01040       {
01041 
01042         __shared__ T bufA[272];  // 272 = 16 * 17: one 16x16 tile stored with a row pitch of block_size + 1
01043         __shared__ T bufB[272];  // same shape as bufA; the extra column is presumably padding against shared-memory bank conflicts
01044 
01045         vcl_size_t block_size = 16;//get_local_size(0); NOTE(review): tile edge is hard-coded, so a 16x16 thread block appears to be required - confirm at the launch site
01046         vcl_size_t row_block_id = blockIdx.x;
01047         vcl_size_t col_block_id = blockIdx.y;
01048         vcl_size_t row_thread_id = threadIdx.x;
01049         vcl_size_t col_thread_id = threadIdx.y;
01050         vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;  // address of A at row 0, column row_block_id * 16
01051         vcl_size_t aStep = block_size * A_row_inc;  // next tile of A lies 16 rows further down
01052         vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;  // address of B at row col_block_id * 16, column 0
01053         vcl_size_t bStep = block_size * B_col_inc;  // next tile of B lies 16 columns further right
01054         vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;  // number of tiles along the reduction index, rounded up
01055         T Csub = 0;  // running sum for this thread's entry of C
01056         vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;  // this thread's element within a tile of A: row threadIdx.x, column threadIdx.y
01057         vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;  // this thread's element within a tile of B: row threadIdx.y, column threadIdx.x
01058 
01059         vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);  // despite the name, the factor is block_size + 1 (the tile pitch)
01060         vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
01061         for (vcl_size_t block = 0;
01062                 block < block_num;
01063                 ++block)
01064         {
01065           bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;  // bufA[threadIdx.y][threadIdx.x] = A(block*16 + threadIdx.x, row_block_id*16 + threadIdx.y), 0 outside the matrix
01066           bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;  // bufB[threadIdx.y][threadIdx.x] = B(col_block_id*16 + threadIdx.y, block*16 + threadIdx.x), 0 outside the matrix
01067           __syncthreads();  // both tiles must be completely written before any thread reads them
01068           T * bufAptr = bufA + row_thread_id_times_block_size;  // tile row threadIdx.x of bufA
01069           T * bufBptr = bufB + col_thread_id_times_block_size;  // tile row threadIdx.y of bufB
01070             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;  // 16 manually unrolled multiply-adds, one per position along the reduction index within the tile
01071             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01072             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01073             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01074             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01075             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01076             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01077             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01078             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01079             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01080             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01081             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01082             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01083             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01084             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01085             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01086           __syncthreads();  // all reads must finish before the next iteration overwrites the tiles
01087           aBegin += aStep;
01088           bBegin += bStep;
01089         }
01090         if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)  // only threads whose (i,j) lies inside the A_col_size x B_row_size result write
01091           C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];  // NOTE(review): this address is computed in unsigned int arithmetic and could wrap for very large buffers - confirm
01092       }
01093 
01094 
01095 
01097 
01098 
01099 
01100 
01101       // matrix-matrix multiplication C = A * B
01102       // matrix layouts: C...row_major, A...col_major, B...row_major
01103       template <typename T>
01104       __global__ void matrix_matrix_row_col_row_prod_AA_kernel(
01105                 T alpha,
01106                 const T * A,
01107                 unsigned int A_row_start,
01108                 unsigned int A_col_start,
01109                 unsigned int A_row_inc,
01110                 unsigned int A_col_inc,
01111                 unsigned int A_row_size,
01112                 unsigned int A_col_size,
01113                 unsigned int A_internal_rows,
01114                 unsigned int A_internal_cols,
01115                 const T * B,
01116                 unsigned int B_row_start,
01117                 unsigned int B_col_start,
01118                 unsigned int B_row_inc,
01119                 unsigned int B_col_inc,
01120                 unsigned int B_row_size,
01121                 unsigned int B_col_size,
01122                 unsigned int B_internal_rows,
01123                 unsigned int B_internal_cols,
01124                 T beta,
01125                 T * C,
01126                 unsigned int C_row_start,
01127                 unsigned int C_col_start,
01128                 unsigned int C_row_inc,
01129                 unsigned int C_col_inc,
01130                 unsigned int C_row_size,
01131                 unsigned int C_col_size,
01132                 unsigned int C_internal_rows,
01133                 unsigned int C_internal_cols)
01134       {
01135 
01136         __shared__ T bufA[272];
01137         __shared__ T bufB[272];
01138 
01139         vcl_size_t block_size = 16;//get_local_size(0);
01140         vcl_size_t row_block_id = blockIdx.x;
01141         vcl_size_t col_block_id = blockIdx.y;
01142         vcl_size_t row_thread_id = threadIdx.x;
01143         vcl_size_t col_thread_id = threadIdx.y;
01144         vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
01145         vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
01146         vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
01147         vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
01148         vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
01149         T Csub = 0;
01150         vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
01151         vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
01152 
01153         vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
01154         vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
01155         for (vcl_size_t block = 0;
01156                 block < block_num;
01157                 ++block)
01158         {
01159           bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
01160           bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
01161           __syncthreads();
01162           T * bufAptr = bufA + row_thread_id_times_block_size;
01163           T * bufBptr = bufB + col_thread_id_times_block_size;
01164             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01165             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01166             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01167             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01168             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01169             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01170             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01171             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01172             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01173             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01174             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01175             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01176             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01177             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01178             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01179             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01180           __syncthreads();
01181           aBegin += aStep;
01182           bBegin += bStep;
01183         }
01184         if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
01185           C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
01186       }
01187 
01188       // matrix-matrix multiplication C = A * B^T
01189       // matrix layouts: C...row_major, A...col_major, B...row_major
01190       template <typename T>
01191       __global__ void matrix_matrix_row_col_row_prod_AT_kernel(
01192                 T alpha,
01193                 const T * A,
01194                 unsigned int A_row_start,
01195                 unsigned int A_col_start,
01196                 unsigned int A_row_inc,
01197                 unsigned int A_col_inc,
01198                 unsigned int A_row_size,
01199                 unsigned int A_col_size,
01200                 unsigned int A_internal_rows,
01201                 unsigned int A_internal_cols,
01202                 const T * B,
01203                 unsigned int B_row_start,
01204                 unsigned int B_col_start,
01205                 unsigned int B_row_inc,
01206                 unsigned int B_col_inc,
01207                 unsigned int B_row_size,
01208                 unsigned int B_col_size,
01209                 unsigned int B_internal_rows,
01210                 unsigned int B_internal_cols,
01211                 T beta,
01212                 T * C,
01213                 unsigned int C_row_start,
01214                 unsigned int C_col_start,
01215                 unsigned int C_row_inc,
01216                 unsigned int C_col_inc,
01217                 unsigned int C_row_size,
01218                 unsigned int C_col_size,
01219                 unsigned int C_internal_rows,
01220                 unsigned int C_internal_cols)
01221       {
01222 
01223         __shared__ T bufA[272];
01224         __shared__ T bufB[272];
01225 
01226         vcl_size_t block_size = 16;//get_local_size(0);
01227         vcl_size_t row_block_id = blockIdx.x;
01228         vcl_size_t col_block_id = blockIdx.y;
01229         vcl_size_t row_thread_id = threadIdx.x;
01230         vcl_size_t col_thread_id = threadIdx.y;
01231         vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
01232         vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
01233         vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
01234         vcl_size_t bStep = block_size * B_col_inc;
01235         vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
01236         T Csub = 0;
01237         vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
01238         vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
01239 
01240         vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
01241         vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
01242         for (vcl_size_t block = 0;
01243                 block < block_num;
01244                 ++block)
01245         {
01246           bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
01247           bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
01248           __syncthreads();
01249           T * bufAptr = bufA + row_thread_id_times_block_size;
01250           T * bufBptr = bufB + col_thread_id_times_block_size;
01251             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01252             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01253             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01254             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01255             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01256             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01257             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01258             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01259             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01260             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01261             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01262             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01263             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01264             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01265             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01266             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01267           __syncthreads();
01268           aBegin += aStep;
01269           bBegin += bStep;
01270         }
01271         if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
01272           C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
01273       }
01274 
01275       // matrix-matrix multiplication C = A^T * B
01276       // matrix layouts: C...row_major, A...col_major, B...row_major
01277       template <typename T>
01278       __global__ void matrix_matrix_row_col_row_prod_TA_kernel(
01279                 T alpha,
01280                 const T * A,
01281                 unsigned int A_row_start,
01282                 unsigned int A_col_start,
01283                 unsigned int A_row_inc,
01284                 unsigned int A_col_inc,
01285                 unsigned int A_row_size,
01286                 unsigned int A_col_size,
01287                 unsigned int A_internal_rows,
01288                 unsigned int A_internal_cols,
01289                 const T * B,
01290                 unsigned int B_row_start,
01291                 unsigned int B_col_start,
01292                 unsigned int B_row_inc,
01293                 unsigned int B_col_inc,
01294                 unsigned int B_row_size,
01295                 unsigned int B_col_size,
01296                 unsigned int B_internal_rows,
01297                 unsigned int B_internal_cols,
01298                 T beta,
01299                 T * C,
01300                 unsigned int C_row_start,
01301                 unsigned int C_col_start,
01302                 unsigned int C_row_inc,
01303                 unsigned int C_col_inc,
01304                 unsigned int C_row_size,
01305                 unsigned int C_col_size,
01306                 unsigned int C_internal_rows,
01307                 unsigned int C_internal_cols)
01308       {
01309 
01310         __shared__ T bufA[272];
01311         __shared__ T bufB[272];
01312 
01313         vcl_size_t block_size = 16;//get_local_size(0);
01314         vcl_size_t row_block_id = blockIdx.x;
01315         vcl_size_t col_block_id = blockIdx.y;
01316         vcl_size_t row_thread_id = threadIdx.x;
01317         vcl_size_t col_thread_id = threadIdx.y;
01318         vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
01319         vcl_size_t aStep = block_size * A_row_inc;
01320         vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
01321         vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
01322         vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
01323         T Csub = 0;
01324         vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
01325         vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
01326 
01327         vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
01328         vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
01329         for (vcl_size_t block = 0;
01330                 block < block_num;
01331                 ++block)
01332         {
01333           bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
01334           bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
01335           __syncthreads();
01336           T * bufAptr = bufA + row_thread_id_times_block_size;
01337           T * bufBptr = bufB + col_thread_id_times_block_size;
01338             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01339             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01340             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01341             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01342             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01343             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01344             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01345             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01346             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01347             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01348             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01349             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01350             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01351             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01352             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01353             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01354           __syncthreads();
01355           aBegin += aStep;
01356           bBegin += bStep;
01357         }
01358         if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
01359           C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
01360       }
01361 
01362       // matrix-matrix multiplication C = A^T * B^T
01363       // matrix layouts: C...row_major, A...col_major, B...row_major
01364       template <typename T>
01365       __global__ void matrix_matrix_row_col_row_prod_TT_kernel(
01366                 T alpha,
01367                 const T * A,
01368                 unsigned int A_row_start,
01369                 unsigned int A_col_start,
01370                 unsigned int A_row_inc,
01371                 unsigned int A_col_inc,
01372                 unsigned int A_row_size,
01373                 unsigned int A_col_size,
01374                 unsigned int A_internal_rows,
01375                 unsigned int A_internal_cols,
01376                 const T * B,
01377                 unsigned int B_row_start,
01378                 unsigned int B_col_start,
01379                 unsigned int B_row_inc,
01380                 unsigned int B_col_inc,
01381                 unsigned int B_row_size,
01382                 unsigned int B_col_size,
01383                 unsigned int B_internal_rows,
01384                 unsigned int B_internal_cols,
01385                 T beta,
01386                 T * C,
01387                 unsigned int C_row_start,
01388                 unsigned int C_col_start,
01389                 unsigned int C_row_inc,
01390                 unsigned int C_col_inc,
01391                 unsigned int C_row_size,
01392                 unsigned int C_col_size,
01393                 unsigned int C_internal_rows,
01394                 unsigned int C_internal_cols)
01395       {
01396 
01397         __shared__ T bufA[272];
01398         __shared__ T bufB[272];
01399 
01400         vcl_size_t block_size = 16;//get_local_size(0);
01401         vcl_size_t row_block_id = blockIdx.x;
01402         vcl_size_t col_block_id = blockIdx.y;
01403         vcl_size_t row_thread_id = threadIdx.x;
01404         vcl_size_t col_thread_id = threadIdx.y;
01405         vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start;
01406         vcl_size_t aStep = block_size * A_row_inc;
01407         vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
01408         vcl_size_t bStep = block_size * B_col_inc;
01409         vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
01410         T Csub = 0;
01411         vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
01412         vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
01413 
01414         vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
01415         vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
01416         for (vcl_size_t block = 0;
01417                 block < block_num;
01418                 ++block)
01419         {
01420           bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
01421           bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
01422           __syncthreads();
01423           T * bufAptr = bufA + row_thread_id_times_block_size;
01424           T * bufBptr = bufB + col_thread_id_times_block_size;
01425             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01426             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01427             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01428             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01429             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01430             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01431             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01432             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01433             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01434             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01435             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01436             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01437             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01438             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01439             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01440             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01441           __syncthreads();
01442           aBegin += aStep;
01443           bBegin += bStep;
01444         }
01445         if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
01446           C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
01447       }
01448 
01449 
01450 
01451 
01452 
01454 
01455 
01456 
01457 
01458 
01459 
01460       // matrix-matrix multiplication C = A * B
01461       // matrix layouts: C...col_major, A...row_major, B...col_major
01462       template <typename T>
01463       __global__ void matrix_matrix_col_row_col_prod_AA_kernel(
01464                 T alpha,
01465                 const T * A,
01466                 unsigned int A_row_start,
01467                 unsigned int A_col_start,
01468                 unsigned int A_row_inc,
01469                 unsigned int A_col_inc,
01470                 unsigned int A_row_size,
01471                 unsigned int A_col_size,
01472                 unsigned int A_internal_rows,
01473                 unsigned int A_internal_cols,
01474                 const T * B,
01475                 unsigned int B_row_start,
01476                 unsigned int B_col_start,
01477                 unsigned int B_row_inc,
01478                 unsigned int B_col_inc,
01479                 unsigned int B_row_size,
01480                 unsigned int B_col_size,
01481                 unsigned int B_internal_rows,
01482                 unsigned int B_internal_cols,
01483                 T beta,
01484                 T * C,
01485                 unsigned int C_row_start,
01486                 unsigned int C_col_start,
01487                 unsigned int C_row_inc,
01488                 unsigned int C_col_inc,
01489                 unsigned int C_row_size,
01490                 unsigned int C_col_size,
01491                 unsigned int C_internal_rows,
01492                 unsigned int C_internal_cols)
01493       {
01494 
01495         __shared__ T bufA[272];
01496         __shared__ T bufB[272];
01497 
01498         vcl_size_t block_size = 16;//get_local_size(0);
01499         vcl_size_t row_block_id = blockIdx.x;
01500         vcl_size_t col_block_id = blockIdx.y;
01501         vcl_size_t row_thread_id = threadIdx.x;
01502         vcl_size_t col_thread_id = threadIdx.y;
01503         vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
01504         vcl_size_t aStep = block_size * A_col_inc;
01505         vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
01506         vcl_size_t bStep = block_size * B_row_inc;
01507         vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
01508         T Csub = 0;
01509         vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
01510         vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
01511 
01512         vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
01513         vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
01514         for (vcl_size_t block = 0;
01515                 block < block_num;
01516                 ++block)
01517         {
01518           bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
01519           bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
01520           __syncthreads();
01521           T * bufAptr = bufA + row_thread_id_times_block_size;
01522           T * bufBptr = bufB + col_thread_id_times_block_size;
01523             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01524             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01525             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01526             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01527             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01528             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01529             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01530             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01531             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01532             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01533             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01534             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01535             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01536             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01537             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01538             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01539           __syncthreads();
01540           aBegin += aStep;
01541           bBegin += bStep;
01542         }
01543         if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
01544           C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
01545       }
01546 
01547       // matrix-matrix multiplication C = A * B^T
01548       // matrix layouts: C...col_major, A...row_major, B...col_major
01549       template <typename T>
01550       __global__ void matrix_matrix_col_row_col_prod_AT_kernel(
01551                 T alpha,
01552                 const T * A,
01553                 unsigned int A_row_start,
01554                 unsigned int A_col_start,
01555                 unsigned int A_row_inc,
01556                 unsigned int A_col_inc,
01557                 unsigned int A_row_size,
01558                 unsigned int A_col_size,
01559                 unsigned int A_internal_rows,
01560                 unsigned int A_internal_cols,
01561                 const T * B,
01562                 unsigned int B_row_start,
01563                 unsigned int B_col_start,
01564                 unsigned int B_row_inc,
01565                 unsigned int B_col_inc,
01566                 unsigned int B_row_size,
01567                 unsigned int B_col_size,
01568                 unsigned int B_internal_rows,
01569                 unsigned int B_internal_cols,
01570                 T beta,
01571                 T * C,
01572                 unsigned int C_row_start,
01573                 unsigned int C_col_start,
01574                 unsigned int C_row_inc,
01575                 unsigned int C_col_inc,
01576                 unsigned int C_row_size,
01577                 unsigned int C_col_size,
01578                 unsigned int C_internal_rows,
01579                 unsigned int C_internal_cols)
01580       {
01581 
01582         __shared__ T bufA[272];
01583         __shared__ T bufB[272];
01584 
01585         vcl_size_t block_size = 16;//get_local_size(0);
01586         vcl_size_t row_block_id = blockIdx.x;
01587         vcl_size_t col_block_id = blockIdx.y;
01588         vcl_size_t row_thread_id = threadIdx.x;
01589         vcl_size_t col_thread_id = threadIdx.y;
01590         vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
01591         vcl_size_t aStep = block_size * A_col_inc;
01592         vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
01593         vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
01594         vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
01595         T Csub = 0;
01596         vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
01597         vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows;
01598 
01599         vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
01600         vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
01601         for (vcl_size_t block = 0;
01602                 block < block_num;
01603                 ++block)
01604         {
01605           bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
01606           bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
01607           __syncthreads();
01608           T * bufAptr = bufA + row_thread_id_times_block_size;
01609           T * bufBptr = bufB + col_thread_id_times_block_size;
01610             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01611             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01612             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01613             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01614             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01615             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01616             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01617             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01618             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01619             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01620             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01621             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01622             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01623             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01624             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01625             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
01626           __syncthreads();
01627           aBegin += aStep;
01628           bBegin += bStep;
01629         }
01630         if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
01631           C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
01632       }
01633 
01634       // matrix-matrix multiplication C = A^T * B
01635       // matrix layouts: C...col_major, A...row_major, B...col_major
// Tiled kernel computing C = alpha * A^T * B + beta * C on strided sub-matrices
// (A is addressed row-major, B and C column-major).
//
// Work distribution: blockIdx.x/threadIdx.x select the row of C, blockIdx.y/threadIdx.y the
// column of C; each thread produces one entry of C. The tile edge is hard-coded to 16 and every
// thread loads exactly one entry of each tile, so the kernel assumes 16 x 16 thread blocks.
//
// Parameters (same pattern for X = A, B, C):
//   X_row_start, X_col_start   offset of the sub-matrix inside the buffer
//   X_row_inc,   X_col_inc     stride between consecutive sub-matrix rows / columns
//   X_row_size,  X_col_size    logical extents used for bounds checks (the C extents are not read)
//   X_internal_rows/_cols      buffer dimensions; only A_internal_cols, B_internal_rows and
//                              C_internal_rows enter the address computations
//   alpha, beta                scaling factors; C is not read when beta == 0
template <typename T>
__global__ void matrix_matrix_col_row_col_prod_TA_kernel(
          T alpha,
          const T * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const T * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          T beta,
          T * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // One tile per operand: 16 rows of 16 values, each row padded to 17 entries (16 * 17 = 272).
  // NOTE(review): the padding is presumably there to avoid shared-memory bank conflicts.
  __shared__ T bufA[272];
  __shared__ T bufB[272];

  const vcl_size_t block_size    = 16;   // tile edge length
  const vcl_size_t row_block_id  = blockIdx.x;
  const vcl_size_t col_block_id  = blockIdx.y;
  const vcl_size_t row_thread_id = threadIdx.x;
  const vcl_size_t col_thread_id = threadIdx.y;

  // Offset of the first tile of each operand for this block, and the distance between
  // successive tiles along the summation index (rows of A, rows of B).
  vcl_size_t       aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
  const vcl_size_t aStep  = block_size * A_row_inc * A_internal_cols;
  vcl_size_t       bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
  const vcl_size_t bStep  = block_size * B_row_inc;
  const vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;   // ceil(A_row_size / 16)

  // Position of this thread's element inside the current tile of A and of B.
  const vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  const vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  // Start of the padded shared-memory row selected by threadIdx.x / threadIdx.y.
  const vcl_size_t row_tile_offset = row_thread_id * (block_size + 1);
  const vcl_size_t col_tile_offset = col_thread_id * (block_size + 1);

  T Csub = 0;
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // Stage one element of each tile; positions outside the operands are filled with zero
    // so that they do not contribute to the dot product below.
    bufA[row_tile_offset + col_thread_id] =
        ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size))
        ? A[aBegin + aOffset] : 0;
    bufB[col_tile_offset + row_thread_id] =
        ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size))
        ? B[bBegin + bOffset] : 0;
    __syncthreads();   // both tiles must be complete before any thread reads them

    const T * bufAptr = bufA + row_tile_offset;
    const T * bufBptr = bufB + col_tile_offset;
    // Fixed trip count of 16; same accumulation order as the former hand-unrolled sequence.
    #pragma unroll
    for (unsigned int k = 0; k < 16; ++k)
      Csub += bufAptr[k] * bufBptr[k];
    __syncthreads();   // all reads finished before the tiles are overwritten in the next pass

    aBegin += aStep;
    bBegin += bStep;
  }

  // Global coordinates of the C entry owned by this thread. They are formed in vcl_size_t, like
  // the A and B offsets above, so the linear offset is not evaluated in 32-bit unsigned int
  // arithmetic (which wraps once the offset exceeds 2^32 - 1).
  // NOTE(review): relies on vcl_size_t being wider than 32 bits - confirm against its typedef.
  const vcl_size_t c_row = row_block_id * blockDim.x + row_thread_id;
  const vcl_size_t c_col = col_block_id * blockDim.y + col_thread_id;
  if (c_row < A_col_size && c_col < B_col_size)
  {
    const vcl_size_t c_index = c_row * C_row_inc + C_row_start + (c_col * C_col_inc + C_col_start) * C_internal_rows;
    C[c_index] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[c_index];
  }
}
01720 
01721       // matrix-matrix multiplication C = A^T * B^T
01722       // matrix layouts: C...col_major, A...row_major, B...col_major
// Tiled kernel computing C = alpha * A^T * B^T + beta * C on strided sub-matrices
// (A is addressed row-major, B and C column-major).
//
// Work distribution: blockIdx.x/threadIdx.x select the row of C, blockIdx.y/threadIdx.y the
// column of C; each thread produces one entry of C. The tile edge is hard-coded to 16 and every
// thread loads exactly one entry of each tile, so the kernel assumes 16 x 16 thread blocks.
//
// Parameters (same pattern for X = A, B, C):
//   X_row_start, X_col_start   offset of the sub-matrix inside the buffer
//   X_row_inc,   X_col_inc     stride between consecutive sub-matrix rows / columns
//   X_row_size,  X_col_size    logical extents used for bounds checks (the C extents are not read)
//   X_internal_rows/_cols      buffer dimensions; only A_internal_cols, B_internal_rows and
//                              C_internal_rows enter the address computations
//   alpha, beta                scaling factors; C is not read when beta == 0
template <typename T>
__global__ void matrix_matrix_col_row_col_prod_TT_kernel(
          T alpha,
          const T * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const T * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          T beta,
          T * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // One tile per operand: 16 rows of 16 values, each row padded to 17 entries (16 * 17 = 272).
  // NOTE(review): the padding is presumably there to avoid shared-memory bank conflicts.
  __shared__ T bufA[272];
  __shared__ T bufB[272];

  const vcl_size_t block_size    = 16;   // tile edge length
  const vcl_size_t row_block_id  = blockIdx.x;
  const vcl_size_t col_block_id  = blockIdx.y;
  const vcl_size_t row_thread_id = threadIdx.x;
  const vcl_size_t col_thread_id = threadIdx.y;

  // Offset of the first tile of each operand for this block, and the distance between
  // successive tiles along the summation index (rows of A, columns of B).
  vcl_size_t       aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
  const vcl_size_t aStep  = block_size * A_row_inc * A_internal_cols;
  vcl_size_t       bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
  const vcl_size_t bStep  = block_size * B_internal_rows * B_col_inc;
  const vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;   // ceil(A_row_size / 16)

  // Position of this thread's element inside the current tile of A and of B.
  const vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  const vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  // Start of the padded shared-memory row selected by threadIdx.x / threadIdx.y.
  const vcl_size_t row_tile_offset = row_thread_id * (block_size + 1);
  const vcl_size_t col_tile_offset = col_thread_id * (block_size + 1);

  T Csub = 0;
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // Stage one element of each tile; positions outside the operands are filled with zero
    // so that they do not contribute to the dot product below.
    bufA[row_tile_offset + col_thread_id] =
        ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size))
        ? A[aBegin + aOffset] : 0;
    bufB[row_tile_offset + col_thread_id] =
        ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size))
        ? B[bBegin + bOffset] : 0;
    __syncthreads();   // both tiles must be complete before any thread reads them

    const T * bufAptr = bufA + row_tile_offset;
    const T * bufBptr = bufB + col_tile_offset;
    // Fixed trip count of 16; same accumulation order as the former hand-unrolled sequence.
    #pragma unroll
    for (unsigned int k = 0; k < 16; ++k)
      Csub += bufAptr[k] * bufBptr[k];
    __syncthreads();   // all reads finished before the tiles are overwritten in the next pass

    aBegin += aStep;
    bBegin += bStep;
  }

  // Global coordinates of the C entry owned by this thread. They are formed in vcl_size_t, like
  // the A and B offsets above, so the linear offset is not evaluated in 32-bit unsigned int
  // arithmetic (which wraps once the offset exceeds 2^32 - 1).
  // NOTE(review): relies on vcl_size_t being wider than 32 bits - confirm against its typedef.
  const vcl_size_t c_row = row_block_id * blockDim.x + row_thread_id;
  const vcl_size_t c_col = col_block_id * blockDim.y + col_thread_id;
  if (c_row < A_col_size && c_col < B_row_size)
  {
    const vcl_size_t c_index = c_row * C_row_inc + C_row_start + (c_col * C_col_inc + C_col_start) * C_internal_rows;
    C[c_index] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[c_index];
  }
}
01807 
01808 
01809 
01810 
01812 
01813 
01814 
01815 
01816       // matrix-matrix multiplication C = A * B
01817       // matrix layouts: C...row_major, A...row_major, B...col_major
// Tiled kernel computing C = alpha * A * B + beta * C on strided sub-matrices
// (A and C are addressed row-major, B column-major).
//
// Work distribution: blockIdx.x/threadIdx.x select the row of C, blockIdx.y/threadIdx.y the
// column of C; each thread produces one entry of C. The tile edge is hard-coded to 16 and every
// thread loads exactly one entry of each tile, so the kernel assumes 16 x 16 thread blocks.
//
// Parameters (same pattern for X = A, B, C):
//   X_row_start, X_col_start   offset of the sub-matrix inside the buffer
//   X_row_inc,   X_col_inc     stride between consecutive sub-matrix rows / columns
//   X_row_size,  X_col_size    logical extents used for bounds checks (the C extents are not read)
//   X_internal_rows/_cols      buffer dimensions; only A_internal_cols, B_internal_rows and
//                              C_internal_cols enter the address computations
//   alpha, beta                scaling factors; C is not read when beta == 0
template <typename T>
__global__ void matrix_matrix_row_row_col_prod_AA_kernel(
          T alpha,
          const T * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const T * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          T beta,
          T * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // One tile per operand: 16 rows of 16 values, each row padded to 17 entries (16 * 17 = 272).
  // NOTE(review): the padding is presumably there to avoid shared-memory bank conflicts.
  __shared__ T bufA[272];
  __shared__ T bufB[272];

  const vcl_size_t block_size    = 16;   // tile edge length
  const vcl_size_t row_block_id  = blockIdx.x;
  const vcl_size_t col_block_id  = blockIdx.y;
  const vcl_size_t row_thread_id = threadIdx.x;
  const vcl_size_t col_thread_id = threadIdx.y;

  // Offset of the first tile of each operand for this block, and the distance between
  // successive tiles along the summation index (columns of A, rows of B).
  vcl_size_t       aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
  const vcl_size_t aStep  = block_size * A_col_inc;
  vcl_size_t       bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
  const vcl_size_t bStep  = block_size * B_row_inc;
  const vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;   // ceil(A_col_size / 16)

  // Position of this thread's element inside the current tile of A and of B.
  const vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  const vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  // Start of the padded shared-memory row selected by threadIdx.x / threadIdx.y.
  const vcl_size_t row_tile_offset = row_thread_id * (block_size + 1);
  const vcl_size_t col_tile_offset = col_thread_id * (block_size + 1);

  T Csub = 0;
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // Stage one element of each tile; positions outside the operands are filled with zero
    // so that they do not contribute to the dot product below.
    bufA[col_tile_offset + row_thread_id] =
        ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size))
        ? A[aBegin + aOffset] : 0;
    bufB[col_tile_offset + row_thread_id] =
        ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size))
        ? B[bBegin + bOffset] : 0;
    __syncthreads();   // both tiles must be complete before any thread reads them

    const T * bufAptr = bufA + row_tile_offset;
    const T * bufBptr = bufB + col_tile_offset;
    // Fixed trip count of 16; same accumulation order as the former hand-unrolled sequence.
    #pragma unroll
    for (unsigned int k = 0; k < 16; ++k)
      Csub += bufAptr[k] * bufBptr[k];
    __syncthreads();   // all reads finished before the tiles are overwritten in the next pass

    aBegin += aStep;
    bBegin += bStep;
  }

  // Global coordinates of the C entry owned by this thread. They are formed in vcl_size_t, like
  // the A and B offsets above, so the linear offset is not evaluated in 32-bit unsigned int
  // arithmetic (which wraps once the offset exceeds 2^32 - 1).
  // NOTE(review): relies on vcl_size_t being wider than 32 bits - confirm against its typedef.
  const vcl_size_t c_row = row_block_id * blockDim.x + row_thread_id;
  const vcl_size_t c_col = col_block_id * blockDim.y + col_thread_id;
  if (c_row < A_row_size && c_col < B_col_size)
  {
    const vcl_size_t c_index = (c_row * C_row_inc + C_row_start) * C_internal_cols + c_col * C_col_inc + C_col_start;
    C[c_index] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[c_index];
  }
}
01902 
01903       // matrix-matrix multiplication C = A * B^T
01904       // matrix layouts: C...row_major, A...row_major, B...col_major
// Tiled kernel computing C = alpha * A * B^T + beta * C on strided sub-matrices
// (A and C are addressed row-major, B column-major).
//
// Work distribution: blockIdx.x/threadIdx.x select the row of C, blockIdx.y/threadIdx.y the
// column of C; each thread produces one entry of C. The tile edge is hard-coded to 16 and every
// thread loads exactly one entry of each tile, so the kernel assumes 16 x 16 thread blocks.
//
// Parameters (same pattern for X = A, B, C):
//   X_row_start, X_col_start   offset of the sub-matrix inside the buffer
//   X_row_inc,   X_col_inc     stride between consecutive sub-matrix rows / columns
//   X_row_size,  X_col_size    logical extents used for bounds checks (the C extents are not read)
//   X_internal_rows/_cols      buffer dimensions; only A_internal_cols, B_internal_rows and
//                              C_internal_cols enter the address computations
//   alpha, beta                scaling factors; C is not read when beta == 0
template <typename T>
__global__ void matrix_matrix_row_row_col_prod_AT_kernel(
          T alpha,
          const T * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const T * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          T beta,
          T * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // One tile per operand: 16 rows of 16 values, each row padded to 17 entries (16 * 17 = 272).
  // NOTE(review): the padding is presumably there to avoid shared-memory bank conflicts.
  __shared__ T bufA[272];
  __shared__ T bufB[272];

  const vcl_size_t block_size    = 16;   // tile edge length
  const vcl_size_t row_block_id  = blockIdx.x;
  const vcl_size_t col_block_id  = blockIdx.y;
  const vcl_size_t row_thread_id = threadIdx.x;
  const vcl_size_t col_thread_id = threadIdx.y;

  // Offset of the first tile of each operand for this block, and the distance between
  // successive tiles along the summation index (columns of A, columns of B).
  vcl_size_t       aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
  const vcl_size_t aStep  = block_size * A_col_inc;
  vcl_size_t       bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
  const vcl_size_t bStep  = block_size * B_internal_rows * B_col_inc;
  const vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;   // ceil(A_col_size / 16)

  // Position of this thread's element inside the current tile of A and of B.
  const vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  const vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  // Start of the padded shared-memory row selected by threadIdx.x / threadIdx.y.
  const vcl_size_t row_tile_offset = row_thread_id * (block_size + 1);
  const vcl_size_t col_tile_offset = col_thread_id * (block_size + 1);

  T Csub = 0;
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // Stage one element of each tile; positions outside the operands are filled with zero
    // so that they do not contribute to the dot product below.
    bufA[col_tile_offset + row_thread_id] =
        ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size))
        ? A[aBegin + aOffset] : 0;
    bufB[row_tile_offset + col_thread_id] =
        ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size))
        ? B[bBegin + bOffset] : 0;
    __syncthreads();   // both tiles must be complete before any thread reads them

    const T * bufAptr = bufA + row_tile_offset;
    const T * bufBptr = bufB + col_tile_offset;
    // Fixed trip count of 16; same accumulation order as the former hand-unrolled sequence.
    #pragma unroll
    for (unsigned int k = 0; k < 16; ++k)
      Csub += bufAptr[k] * bufBptr[k];
    __syncthreads();   // all reads finished before the tiles are overwritten in the next pass

    aBegin += aStep;
    bBegin += bStep;
  }

  // Global coordinates of the C entry owned by this thread. They are formed in vcl_size_t, like
  // the A and B offsets above, so the linear offset is not evaluated in 32-bit unsigned int
  // arithmetic (which wraps once the offset exceeds 2^32 - 1).
  // NOTE(review): relies on vcl_size_t being wider than 32 bits - confirm against its typedef.
  const vcl_size_t c_row = row_block_id * blockDim.x + row_thread_id;
  const vcl_size_t c_col = col_block_id * blockDim.y + col_thread_id;
  if (c_row < A_row_size && c_col < B_row_size)
  {
    const vcl_size_t c_index = (c_row * C_row_inc + C_row_start) * C_internal_cols + c_col * C_col_inc + C_col_start;
    C[c_index] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[c_index];
  }
}
01989 
01990       // matrix-matrix multiplication C = A^T * B
01991       // matrix layouts: C...row_major, A...row_major, B...col_major
// Tiled kernel computing C = alpha * A^T * B + beta * C on strided sub-matrices
// (A and C are addressed row-major, B column-major).
//
// Work distribution: blockIdx.x/threadIdx.x select the row of C, blockIdx.y/threadIdx.y the
// column of C; each thread produces one entry of C. The tile edge is hard-coded to 16 and every
// thread loads exactly one entry of each tile, so the kernel assumes 16 x 16 thread blocks.
//
// Parameters (same pattern for X = A, B, C):
//   X_row_start, X_col_start   offset of the sub-matrix inside the buffer
//   X_row_inc,   X_col_inc     stride between consecutive sub-matrix rows / columns
//   X_row_size,  X_col_size    logical extents used for bounds checks (the C extents are not read)
//   X_internal_rows/_cols      buffer dimensions; only A_internal_cols, B_internal_rows and
//                              C_internal_cols enter the address computations
//   alpha, beta                scaling factors; C is not read when beta == 0
template <typename T>
__global__ void matrix_matrix_row_row_col_prod_TA_kernel(
          T alpha,
          const T * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const T * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          T beta,
          T * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // One tile per operand: 16 rows of 16 values, each row padded to 17 entries (16 * 17 = 272).
  // NOTE(review): the padding is presumably there to avoid shared-memory bank conflicts.
  __shared__ T bufA[272];
  __shared__ T bufB[272];

  const vcl_size_t block_size    = 16;   // tile edge length
  const vcl_size_t row_block_id  = blockIdx.x;
  const vcl_size_t col_block_id  = blockIdx.y;
  const vcl_size_t row_thread_id = threadIdx.x;
  const vcl_size_t col_thread_id = threadIdx.y;

  // Offset of the first tile of each operand for this block, and the distance between
  // successive tiles along the summation index (rows of A, rows of B).
  vcl_size_t       aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
  const vcl_size_t aStep  = block_size * A_row_inc * A_internal_cols;
  vcl_size_t       bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
  const vcl_size_t bStep  = block_size * B_row_inc;
  const vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;   // ceil(A_row_size / 16)

  // Position of this thread's element inside the current tile of A and of B.
  const vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  const vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  // Start of the padded shared-memory row selected by threadIdx.x / threadIdx.y.
  const vcl_size_t row_tile_offset = row_thread_id * (block_size + 1);
  const vcl_size_t col_tile_offset = col_thread_id * (block_size + 1);

  T Csub = 0;
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // Stage one element of each tile; positions outside the operands are filled with zero
    // so that they do not contribute to the dot product below.
    bufA[row_tile_offset + col_thread_id] =
        ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size))
        ? A[aBegin + aOffset] : 0;
    bufB[col_tile_offset + row_thread_id] =
        ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size))
        ? B[bBegin + bOffset] : 0;
    __syncthreads();   // both tiles must be complete before any thread reads them

    const T * bufAptr = bufA + row_tile_offset;
    const T * bufBptr = bufB + col_tile_offset;
    // Fixed trip count of 16; same accumulation order as the former hand-unrolled sequence.
    #pragma unroll
    for (unsigned int k = 0; k < 16; ++k)
      Csub += bufAptr[k] * bufBptr[k];
    __syncthreads();   // all reads finished before the tiles are overwritten in the next pass

    aBegin += aStep;
    bBegin += bStep;
  }

  // Global coordinates of the C entry owned by this thread. They are formed in vcl_size_t, like
  // the A and B offsets above, so the linear offset is not evaluated in 32-bit unsigned int
  // arithmetic (which wraps once the offset exceeds 2^32 - 1).
  // NOTE(review): relies on vcl_size_t being wider than 32 bits - confirm against its typedef.
  const vcl_size_t c_row = row_block_id * blockDim.x + row_thread_id;
  const vcl_size_t c_col = col_block_id * blockDim.y + col_thread_id;
  if (c_row < A_col_size && c_col < B_col_size)
  {
    const vcl_size_t c_index = (c_row * C_row_inc + C_row_start) * C_internal_cols + c_col * C_col_inc + C_col_start;
    C[c_index] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[c_index];
  }
}
02076 
02077       // matrix-matrix multiplication C = A^T * B^T
02078       // matrix layouts: C...row_major, A...row_major, B...col_major
// Tiled kernel computing C = alpha * A^T * B^T + beta * C on strided sub-matrices
// (A and C are addressed row-major, B column-major).
//
// Work distribution: blockIdx.x/threadIdx.x select the row of C, blockIdx.y/threadIdx.y the
// column of C; each thread produces one entry of C. The tile edge is hard-coded to 16 and every
// thread loads exactly one entry of each tile, so the kernel assumes 16 x 16 thread blocks.
//
// Parameters (same pattern for X = A, B, C):
//   X_row_start, X_col_start   offset of the sub-matrix inside the buffer
//   X_row_inc,   X_col_inc     stride between consecutive sub-matrix rows / columns
//   X_row_size,  X_col_size    logical extents used for bounds checks (the C extents are not read)
//   X_internal_rows/_cols      buffer dimensions; only A_internal_cols, B_internal_rows and
//                              C_internal_cols enter the address computations
//   alpha, beta                scaling factors; C is not read when beta == 0
template <typename T>
__global__ void matrix_matrix_row_row_col_prod_TT_kernel(
          T alpha,
          const T * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const T * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          T beta,
          T * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{
  // One tile per operand: 16 rows of 16 values, each row padded to 17 entries (16 * 17 = 272).
  // NOTE(review): the padding is presumably there to avoid shared-memory bank conflicts.
  __shared__ T bufA[272];
  __shared__ T bufB[272];

  const vcl_size_t block_size    = 16;   // tile edge length
  const vcl_size_t row_block_id  = blockIdx.x;
  const vcl_size_t col_block_id  = blockIdx.y;
  const vcl_size_t row_thread_id = threadIdx.x;
  const vcl_size_t col_thread_id = threadIdx.y;

  // Offset of the first tile of each operand for this block, and the distance between
  // successive tiles along the summation index (rows of A, columns of B).
  vcl_size_t       aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
  const vcl_size_t aStep  = block_size * A_row_inc * A_internal_cols;
  vcl_size_t       bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows;
  const vcl_size_t bStep  = block_size * B_internal_rows * B_col_inc;
  const vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;   // ceil(A_row_size / 16)

  // Position of this thread's element inside the current tile of A and of B.
  const vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  const vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  // Start of the padded shared-memory row selected by threadIdx.x / threadIdx.y.
  const vcl_size_t row_tile_offset = row_thread_id * (block_size + 1);
  const vcl_size_t col_tile_offset = col_thread_id * (block_size + 1);

  T Csub = 0;
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // Stage one element of each tile; positions outside the operands are filled with zero
    // so that they do not contribute to the dot product below.
    bufA[row_tile_offset + col_thread_id] =
        ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size))
        ? A[aBegin + aOffset] : 0;
    bufB[row_tile_offset + col_thread_id] =
        ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size))
        ? B[bBegin + bOffset] : 0;
    __syncthreads();   // both tiles must be complete before any thread reads them

    const T * bufAptr = bufA + row_tile_offset;
    const T * bufBptr = bufB + col_tile_offset;
    // Fixed trip count of 16; same accumulation order as the former hand-unrolled sequence.
    #pragma unroll
    for (unsigned int k = 0; k < 16; ++k)
      Csub += bufAptr[k] * bufBptr[k];
    __syncthreads();   // all reads finished before the tiles are overwritten in the next pass

    aBegin += aStep;
    bBegin += bStep;
  }

  // Global coordinates of the C entry owned by this thread. They are formed in vcl_size_t, like
  // the A and B offsets above, so the linear offset is not evaluated in 32-bit unsigned int
  // arithmetic (which wraps once the offset exceeds 2^32 - 1).
  // NOTE(review): relies on vcl_size_t being wider than 32 bits - confirm against its typedef.
  const vcl_size_t c_row = row_block_id * blockDim.x + row_thread_id;
  const vcl_size_t c_col = col_block_id * blockDim.y + col_thread_id;
  if (c_row < A_col_size && c_col < B_row_size)
  {
    const vcl_size_t c_index = (c_row * C_row_inc + C_row_start) * C_internal_cols + c_col * C_col_inc + C_col_start;
    C[c_index] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[c_index];
  }
}
02163 
02164 
02165 
02166 
02167 
02169 
02170 
02171 
02172 
02173 
02174 
02175       // matrix-matrix multiplication C = alpha * A * B + beta * C (C is not read when beta == 0)
02176       // matrix layouts: C...col_major, A...row_major, B...row_major
02177       // Every thread accumulates one entry of C; A and B are staged through padded shared-memory tiles.
02178       // The tile edge is hard-coded to 16, so the kernel is written for 16x16 thread blocks.
02179       template <typename T>
02180       __global__ void matrix_matrix_col_row_row_prod_AA_kernel(
02181                 T alpha,
02182                 const T * A,
02183                 unsigned int A_row_start,
02184                 unsigned int A_col_start,
02185                 unsigned int A_row_inc,
02186                 unsigned int A_col_inc,
02187                 unsigned int A_row_size,
02188                 unsigned int A_col_size,
02189                 unsigned int A_internal_rows,
02190                 unsigned int A_internal_cols,
02191                 const T * B,
02192                 unsigned int B_row_start,
02193                 unsigned int B_col_start,
02194                 unsigned int B_row_inc,
02195                 unsigned int B_col_inc,
02196                 unsigned int B_row_size,
02197                 unsigned int B_col_size,
02198                 unsigned int B_internal_rows,
02199                 unsigned int B_internal_cols,
02200                 T beta,
02201                 T * C,
02202                 unsigned int C_row_start,
02203                 unsigned int C_col_start,
02204                 unsigned int C_row_inc,
02205                 unsigned int C_col_inc,
02206                 unsigned int C_row_size,
02207                 unsigned int C_col_size,
02208                 unsigned int C_internal_rows,
02209                 unsigned int C_internal_cols)
02210       {
02211         // Shared-memory tiles: 16 rows at a padded pitch of 17 entries (16 * 17 = 272).
02212         __shared__ T tileA[272];
02213         __shared__ T tileB[272];
02214
02215         const vcl_size_t tile_dim = 16;
02216         const vcl_size_t tile_pitch = tile_dim + 1;
02217         const vcl_size_t tx = threadIdx.x;
02218         const vcl_size_t ty = threadIdx.y;
02219         const vcl_size_t tile_row = blockIdx.x;
02220         const vcl_size_t tile_col = blockIdx.y;
02221
02222         // Element of A / B loaded by this thread (tile origin + per-thread offset) and the step to the next tile.
02223         vcl_size_t a_index = ((tile_row * tile_dim * A_row_inc + A_row_start) * A_internal_cols + A_col_start)
02224                            + (tx * A_col_inc + ty * A_row_inc * A_internal_cols);
02225         vcl_size_t b_index = ((tile_col * tile_dim * B_col_inc + B_col_start) + B_row_start * B_internal_cols)
02226                            + (tx * B_col_inc + ty * B_row_inc * B_internal_cols);
02227         const vcl_size_t a_advance = tile_dim * A_col_inc;
02228         const vcl_size_t b_advance = tile_dim * B_internal_cols * B_row_inc;
02229         const vcl_size_t tile_count = (A_col_size + tile_dim - 1) / tile_dim;
02230
02231         T acc = 0;
02232         for (vcl_size_t tile = 0; tile < tile_count; ++tile)
02233         {
02234           const vcl_size_t k_base = tile * tile_dim;
02235           // Stage the current tiles; positions outside the matrix ranges are filled with zero.
02236           const bool a_inside = (k_base + tx < A_col_size) && (tile_row * tile_dim + ty < A_row_size);
02237           const bool b_inside = (k_base + ty < B_row_size) && (tile_col * tile_dim + tx < B_col_size);
02238           tileA[ty * tile_pitch + tx] = a_inside ? A[a_index] : 0;
02239           tileB[tx * tile_pitch + ty] = b_inside ? B[b_index] : 0;
02240           __syncthreads();   // tiles are complete before any thread reads them
02241
02242           // 16-term dot product of tileA row tx with tileB row ty.
02243           const T * a_vals = tileA + tx * tile_pitch;
02244           const T * b_vals = tileB + ty * tile_pitch;
02245           for (vcl_size_t k = 0; k < tile_dim; ++k)
02246             acc += a_vals[k] * b_vals[k];
02247           __syncthreads();   // all reads are done before the tiles get overwritten
02248           a_index += a_advance;
02249           b_index += b_advance;
02250         }
02251
02252         // Write the result; threads mapped outside the result range store nothing.
02253         const unsigned int out_row = blockIdx.x * blockDim.x + threadIdx.x;
02254         const unsigned int out_col = blockIdx.y * blockDim.y + threadIdx.y;
02255         if (out_row < A_row_size && out_col < B_col_size)
02256         {
02257           const unsigned int c_index = out_row * C_row_inc + C_row_start + (out_col * C_col_inc + C_col_start) * C_internal_rows;
02258           C[c_index] = (beta == 0) ? alpha * acc : alpha * acc + beta * C[c_index];
02259         }
02260       }
02261 
02262       // matrix-matrix multiplication C = alpha * A * B^T + beta * C (C is not read when beta == 0)
02263       // matrix layouts: C...col_major, A...row_major, B...row_major
02264       // Every thread accumulates one entry of C; A and B are staged through padded shared-memory tiles.
02265       // The tile edge is hard-coded to 16, so the kernel is written for 16x16 thread blocks.
02266       template <typename T>
02267       __global__ void matrix_matrix_col_row_row_prod_AT_kernel(
02268                 T alpha,
02269                 const T * A,
02270                 unsigned int A_row_start,
02271                 unsigned int A_col_start,
02272                 unsigned int A_row_inc,
02273                 unsigned int A_col_inc,
02274                 unsigned int A_row_size,
02275                 unsigned int A_col_size,
02276                 unsigned int A_internal_rows,
02277                 unsigned int A_internal_cols,
02278                 const T * B,
02279                 unsigned int B_row_start,
02280                 unsigned int B_col_start,
02281                 unsigned int B_row_inc,
02282                 unsigned int B_col_inc,
02283                 unsigned int B_row_size,
02284                 unsigned int B_col_size,
02285                 unsigned int B_internal_rows,
02286                 unsigned int B_internal_cols,
02287                 T beta,
02288                 T * C,
02289                 unsigned int C_row_start,
02290                 unsigned int C_col_start,
02291                 unsigned int C_row_inc,
02292                 unsigned int C_col_inc,
02293                 unsigned int C_row_size,
02294                 unsigned int C_col_size,
02295                 unsigned int C_internal_rows,
02296                 unsigned int C_internal_cols)
02297       {
02298         // Shared-memory tiles: 16 rows at a padded pitch of 17 entries (16 * 17 = 272).
02299         __shared__ T tileA[272];
02300         __shared__ T tileB[272];
02301
02302         const vcl_size_t tile_dim = 16;
02303         const vcl_size_t tile_pitch = tile_dim + 1;
02304         const vcl_size_t tx = threadIdx.x;
02305         const vcl_size_t ty = threadIdx.y;
02306         const vcl_size_t tile_row = blockIdx.x;
02307         const vcl_size_t tile_col = blockIdx.y;
02308
02309         // Element of A / B loaded by this thread (tile origin + per-thread offset) and the step to the next tile.
02310         vcl_size_t a_index = ((tile_row * tile_dim * A_row_inc + A_row_start) * A_internal_cols + A_col_start)
02311                            + (tx * A_col_inc + ty * A_row_inc * A_internal_cols);
02312         vcl_size_t b_index = ((tile_col * tile_dim * B_row_inc + B_row_start) * B_internal_cols + B_col_start)
02313                            + (tx * B_col_inc + ty * B_row_inc * B_internal_cols);
02314         const vcl_size_t a_advance = tile_dim * A_col_inc;
02315         const vcl_size_t b_advance = tile_dim * B_col_inc;
02316         const vcl_size_t tile_count = (A_col_size + tile_dim - 1) / tile_dim;
02317
02318         T acc = 0;
02319         for (vcl_size_t tile = 0; tile < tile_count; ++tile)
02320         {
02321           const vcl_size_t k_base = tile * tile_dim;
02322           // Stage the current tiles; positions outside the matrix ranges are filled with zero.
02323           const bool a_inside = (k_base + tx < A_col_size) && (tile_row * tile_dim + ty < A_row_size);
02324           const bool b_inside = (k_base + tx < B_col_size) && (tile_col * tile_dim + ty < B_row_size);
02325           tileA[ty * tile_pitch + tx] = a_inside ? A[a_index] : 0;
02326           tileB[ty * tile_pitch + tx] = b_inside ? B[b_index] : 0;
02327           __syncthreads();   // tiles are complete before any thread reads them
02328
02329           // 16-term dot product of tileA row tx with tileB row ty.
02330           const T * a_vals = tileA + tx * tile_pitch;
02331           const T * b_vals = tileB + ty * tile_pitch;
02332           for (vcl_size_t k = 0; k < tile_dim; ++k)
02333             acc += a_vals[k] * b_vals[k];
02334           __syncthreads();   // all reads are done before the tiles get overwritten
02335           a_index += a_advance;
02336           b_index += b_advance;
02337         }
02338
02339         // Write the result; threads mapped outside the result range store nothing.
02340         const unsigned int out_row = blockIdx.x * blockDim.x + threadIdx.x;
02341         const unsigned int out_col = blockIdx.y * blockDim.y + threadIdx.y;
02342         if (out_row < A_row_size && out_col < B_row_size)
02343         {
02344           const unsigned int c_index = out_row * C_row_inc + C_row_start + (out_col * C_col_inc + C_col_start) * C_internal_rows;
02345           C[c_index] = (beta == 0) ? alpha * acc : alpha * acc + beta * C[c_index];
02346         }
02347       }
02348 
02349       // matrix-matrix multiplication C = alpha * A^T * B + beta * C (C is not read when beta == 0)
02350       // matrix layouts: C...col_major, A...row_major, B...row_major
02351       // Every thread accumulates one entry of C; A and B are staged through padded shared-memory tiles.
02352       // The tile edge is hard-coded to 16, so the kernel is written for 16x16 thread blocks.
02353       template <typename T>
02354       __global__ void matrix_matrix_col_row_row_prod_TA_kernel(
02355                 T alpha,
02356                 const T * A,
02357                 unsigned int A_row_start,
02358                 unsigned int A_col_start,
02359                 unsigned int A_row_inc,
02360                 unsigned int A_col_inc,
02361                 unsigned int A_row_size,
02362                 unsigned int A_col_size,
02363                 unsigned int A_internal_rows,
02364                 unsigned int A_internal_cols,
02365                 const T * B,
02366                 unsigned int B_row_start,
02367                 unsigned int B_col_start,
02368                 unsigned int B_row_inc,
02369                 unsigned int B_col_inc,
02370                 unsigned int B_row_size,
02371                 unsigned int B_col_size,
02372                 unsigned int B_internal_rows,
02373                 unsigned int B_internal_cols,
02374                 T beta,
02375                 T * C,
02376                 unsigned int C_row_start,
02377                 unsigned int C_col_start,
02378                 unsigned int C_row_inc,
02379                 unsigned int C_col_inc,
02380                 unsigned int C_row_size,
02381                 unsigned int C_col_size,
02382                 unsigned int C_internal_rows,
02383                 unsigned int C_internal_cols)
02384       {
02385         // Shared-memory tiles: 16 rows at a padded pitch of 17 entries (16 * 17 = 272).
02386         __shared__ T tileA[272];
02387         __shared__ T tileB[272];
02388
02389         const vcl_size_t tile_dim = 16;
02390         const vcl_size_t tile_pitch = tile_dim + 1;
02391         const vcl_size_t tx = threadIdx.x;
02392         const vcl_size_t ty = threadIdx.y;
02393         const vcl_size_t tile_row = blockIdx.x;
02394         const vcl_size_t tile_col = blockIdx.y;
02395
02396         // Element of A / B loaded by this thread (tile origin + per-thread offset) and the step to the next tile.
02397         vcl_size_t a_index = ((tile_row * tile_dim * A_col_inc + A_col_start) + A_row_start * A_internal_cols)
02398                            + (tx * A_col_inc + ty * A_row_inc * A_internal_cols);
02399         vcl_size_t b_index = ((tile_col * tile_dim * B_col_inc + B_col_start) + B_row_start * B_internal_cols)
02400                            + (tx * B_col_inc + ty * B_row_inc * B_internal_cols);
02401         const vcl_size_t a_advance = tile_dim * A_row_inc * A_internal_cols;
02402         const vcl_size_t b_advance = tile_dim * B_internal_cols * B_row_inc;
02403         const vcl_size_t tile_count = (A_row_size + tile_dim - 1) / tile_dim;
02404
02405         T acc = 0;
02406         for (vcl_size_t tile = 0; tile < tile_count; ++tile)
02407         {
02408           const vcl_size_t k_base = tile * tile_dim;
02409           // Stage the current tiles; positions outside the matrix ranges are filled with zero.
02410           const bool a_inside = (k_base + ty < A_row_size) && (tile_row * tile_dim + tx < A_col_size);
02411           const bool b_inside = (k_base + ty < B_row_size) && (tile_col * tile_dim + tx < B_col_size);
02412           tileA[tx * tile_pitch + ty] = a_inside ? A[a_index] : 0;
02413           tileB[tx * tile_pitch + ty] = b_inside ? B[b_index] : 0;
02414           __syncthreads();   // tiles are complete before any thread reads them
02415
02416           // 16-term dot product of tileA row tx with tileB row ty.
02417           const T * a_vals = tileA + tx * tile_pitch;
02418           const T * b_vals = tileB + ty * tile_pitch;
02419           for (vcl_size_t k = 0; k < tile_dim; ++k)
02420             acc += a_vals[k] * b_vals[k];
02421           __syncthreads();   // all reads are done before the tiles get overwritten
02422           a_index += a_advance;
02423           b_index += b_advance;
02424         }
02425
02426         // Write the result; threads mapped outside the result range store nothing.
02427         const unsigned int out_row = blockIdx.x * blockDim.x + threadIdx.x;
02428         const unsigned int out_col = blockIdx.y * blockDim.y + threadIdx.y;
02429         if (out_row < A_col_size && out_col < B_col_size)
02430         {
02431           const unsigned int c_index = out_row * C_row_inc + C_row_start + (out_col * C_col_inc + C_col_start) * C_internal_rows;
02432           C[c_index] = (beta == 0) ? alpha * acc : alpha * acc + beta * C[c_index];
02433         }
02434       }
02435 
02436       // matrix-matrix multiplication C = alpha * A^T * B^T + beta * C (C is not read when beta == 0)
02437       // matrix layouts: C...col_major, A...row_major, B...row_major
02438       // Every thread accumulates one entry of C; A and B are staged through padded shared-memory tiles.
02439       // The tile edge is hard-coded to 16, so the kernel is written for 16x16 thread blocks.
02440       template <typename T>
02441       __global__ void matrix_matrix_col_row_row_prod_TT_kernel(
02442                 T alpha,
02443                 const T * A,
02444                 unsigned int A_row_start,
02445                 unsigned int A_col_start,
02446                 unsigned int A_row_inc,
02447                 unsigned int A_col_inc,
02448                 unsigned int A_row_size,
02449                 unsigned int A_col_size,
02450                 unsigned int A_internal_rows,
02451                 unsigned int A_internal_cols,
02452                 const T * B,
02453                 unsigned int B_row_start,
02454                 unsigned int B_col_start,
02455                 unsigned int B_row_inc,
02456                 unsigned int B_col_inc,
02457                 unsigned int B_row_size,
02458                 unsigned int B_col_size,
02459                 unsigned int B_internal_rows,
02460                 unsigned int B_internal_cols,
02461                 T beta,
02462                 T * C,
02463                 unsigned int C_row_start,
02464                 unsigned int C_col_start,
02465                 unsigned int C_row_inc,
02466                 unsigned int C_col_inc,
02467                 unsigned int C_row_size,
02468                 unsigned int C_col_size,
02469                 unsigned int C_internal_rows,
02470                 unsigned int C_internal_cols)
02471       {
02472         // Shared-memory tiles: 16 rows at a padded pitch of 17 entries (16 * 17 = 272).
02473         __shared__ T tileA[272];
02474         __shared__ T tileB[272];
02475
02476         const vcl_size_t tile_dim = 16;
02477         const vcl_size_t tile_pitch = tile_dim + 1;
02478         const vcl_size_t tx = threadIdx.x;
02479         const vcl_size_t ty = threadIdx.y;
02480         const vcl_size_t tile_row = blockIdx.x;
02481         const vcl_size_t tile_col = blockIdx.y;
02482
02483         // Element of A / B loaded by this thread (tile origin + per-thread offset) and the step to the next tile.
02484         vcl_size_t a_index = ((tile_row * tile_dim * A_col_inc + A_col_start) + A_row_start * A_internal_cols)
02485                            + (tx * A_col_inc + ty * A_row_inc * A_internal_cols);
02486         vcl_size_t b_index = ((tile_col * tile_dim * B_row_inc + B_row_start) * B_internal_cols + B_col_start)
02487                            + (tx * B_col_inc + ty * B_row_inc * B_internal_cols);
02488         const vcl_size_t a_advance = tile_dim * A_row_inc * A_internal_cols;
02489         const vcl_size_t b_advance = tile_dim * B_col_inc;
02490         const vcl_size_t tile_count = (A_row_size + tile_dim - 1) / tile_dim;
02491
02492         T acc = 0;
02493         for (vcl_size_t tile = 0; tile < tile_count; ++tile)
02494         {
02495           const vcl_size_t k_base = tile * tile_dim;
02496           // Stage the current tiles; positions outside the matrix ranges are filled with zero.
02497           const bool a_inside = (k_base + ty < A_row_size) && (tile_row * tile_dim + tx < A_col_size);
02498           const bool b_inside = (k_base + tx < B_col_size) && (tile_col * tile_dim + ty < B_row_size);
02499           tileA[tx * tile_pitch + ty] = a_inside ? A[a_index] : 0;
02500           tileB[ty * tile_pitch + tx] = b_inside ? B[b_index] : 0;
02501           __syncthreads();   // tiles are complete before any thread reads them
02502
02503           // 16-term dot product of tileA row tx with tileB row ty.
02504           const T * a_vals = tileA + tx * tile_pitch;
02505           const T * b_vals = tileB + ty * tile_pitch;
02506           for (vcl_size_t k = 0; k < tile_dim; ++k)
02507             acc += a_vals[k] * b_vals[k];
02508           __syncthreads();   // all reads are done before the tiles get overwritten
02509           a_index += a_advance;
02510           b_index += b_advance;
02511         }
02512
02513         // Write the result; threads mapped outside the result range store nothing.
02514         const unsigned int out_row = blockIdx.x * blockDim.x + threadIdx.x;
02515         const unsigned int out_col = blockIdx.y * blockDim.y + threadIdx.y;
02516         if (out_row < A_col_size && out_col < B_row_size)
02517         {
02518           const unsigned int c_index = out_row * C_row_inc + C_row_start + (out_col * C_col_inc + C_col_start) * C_internal_rows;
02519           C[c_index] = (beta == 0) ? alpha * acc : alpha * acc + beta * C[c_index];
02520         }
02521       }
02522 
02523 
02524 
02525 
02526 
02528 
02529 
02530 
02531 
02532       // matrix-matrix multiplication C = alpha * A * B + beta * C (C is not read when beta == 0)
02533       // matrix layouts: C...row_major, A...row_major, B...row_major
02534       // Every thread accumulates one entry of C; A and B are staged through padded shared-memory tiles.
02535       // The tile edge is hard-coded to 16, so the kernel is written for 16x16 thread blocks.
02536       template <typename T>
02537       __global__ void matrix_matrix_row_row_row_prod_AA_kernel(
02538                 T alpha,
02539                 const T * A,
02540                 unsigned int A_row_start,
02541                 unsigned int A_col_start,
02542                 unsigned int A_row_inc,
02543                 unsigned int A_col_inc,
02544                 unsigned int A_row_size,
02545                 unsigned int A_col_size,
02546                 unsigned int A_internal_rows,
02547                 unsigned int A_internal_cols,
02548                 const T * B,
02549                 unsigned int B_row_start,
02550                 unsigned int B_col_start,
02551                 unsigned int B_row_inc,
02552                 unsigned int B_col_inc,
02553                 unsigned int B_row_size,
02554                 unsigned int B_col_size,
02555                 unsigned int B_internal_rows,
02556                 unsigned int B_internal_cols,
02557                 T beta,
02558                 T * C,
02559                 unsigned int C_row_start,
02560                 unsigned int C_col_start,
02561                 unsigned int C_row_inc,
02562                 unsigned int C_col_inc,
02563                 unsigned int C_row_size,
02564                 unsigned int C_col_size,
02565                 unsigned int C_internal_rows,
02566                 unsigned int C_internal_cols)
02567       {
02568         // Shared-memory tiles: 16 rows at a padded pitch of 17 entries (16 * 17 = 272).
02569         __shared__ T tileA[272];
02570         __shared__ T tileB[272];
02571
02572         const vcl_size_t tile_dim = 16;
02573         const vcl_size_t tile_pitch = tile_dim + 1;
02574         const vcl_size_t tx = threadIdx.x;
02575         const vcl_size_t ty = threadIdx.y;
02576         const vcl_size_t tile_row = blockIdx.x;
02577         const vcl_size_t tile_col = blockIdx.y;
02578
02579         // Element of A / B loaded by this thread (tile origin + per-thread offset) and the step to the next tile.
02580         vcl_size_t a_index = ((tile_row * tile_dim * A_row_inc + A_row_start) * A_internal_cols + A_col_start)
02581                            + (tx * A_col_inc + ty * A_row_inc * A_internal_cols);
02582         vcl_size_t b_index = ((tile_col * tile_dim * B_col_inc + B_col_start) + B_row_start * B_internal_cols)
02583                            + (tx * B_col_inc + ty * B_row_inc * B_internal_cols);
02584         const vcl_size_t a_advance = tile_dim * A_col_inc;
02585         const vcl_size_t b_advance = tile_dim * B_internal_cols * B_row_inc;
02586         const vcl_size_t tile_count = (A_col_size + tile_dim - 1) / tile_dim;
02587
02588         T acc = 0;
02589         for (vcl_size_t tile = 0; tile < tile_count; ++tile)
02590         {
02591           const vcl_size_t k_base = tile * tile_dim;
02592           // Stage the current tiles; positions outside the matrix ranges are filled with zero.
02593           const bool a_inside = (k_base + tx < A_col_size) && (tile_row * tile_dim + ty < A_row_size);
02594           const bool b_inside = (k_base + ty < B_row_size) && (tile_col * tile_dim + tx < B_col_size);
02595           tileA[ty * tile_pitch + tx] = a_inside ? A[a_index] : 0;
02596           tileB[tx * tile_pitch + ty] = b_inside ? B[b_index] : 0;
02597           __syncthreads();   // tiles are complete before any thread reads them
02598
02599           // 16-term dot product of tileA row tx with tileB row ty.
02600           const T * a_vals = tileA + tx * tile_pitch;
02601           const T * b_vals = tileB + ty * tile_pitch;
02602           for (vcl_size_t k = 0; k < tile_dim; ++k)
02603             acc += a_vals[k] * b_vals[k];
02604           __syncthreads();   // all reads are done before the tiles get overwritten
02605           a_index += a_advance;
02606           b_index += b_advance;
02607         }
02608
02609         // Write the result; threads mapped outside the result range store nothing.
02610         const unsigned int out_row = blockIdx.x * blockDim.x + threadIdx.x;
02611         const unsigned int out_col = blockIdx.y * blockDim.y + threadIdx.y;
02612         if (out_row < A_row_size && out_col < B_col_size)
02613         {
02614           const unsigned int c_index = (out_row * C_row_inc + C_row_start) * C_internal_cols + out_col * C_col_inc + C_col_start;
02615           C[c_index] = (beta == 0) ? alpha * acc : alpha * acc + beta * C[c_index];
02616         }
02617       }
02618 
02619       // matrix-matrix multiplication C = alpha * A * B^T + beta * C (C is not read when beta == 0)
02620       // matrix layouts: C...row_major, A...row_major, B...row_major
02621       // Every thread accumulates one entry of C; A and B are staged through padded shared-memory tiles.
02622       // The tile edge is hard-coded to 16, so the kernel is written for 16x16 thread blocks.
02623       template <typename T>
02624       __global__ void matrix_matrix_row_row_row_prod_AT_kernel(
02625                 T alpha,
02626                 const T * A,
02627                 unsigned int A_row_start,
02628                 unsigned int A_col_start,
02629                 unsigned int A_row_inc,
02630                 unsigned int A_col_inc,
02631                 unsigned int A_row_size,
02632                 unsigned int A_col_size,
02633                 unsigned int A_internal_rows,
02634                 unsigned int A_internal_cols,
02635                 const T * B,
02636                 unsigned int B_row_start,
02637                 unsigned int B_col_start,
02638                 unsigned int B_row_inc,
02639                 unsigned int B_col_inc,
02640                 unsigned int B_row_size,
02641                 unsigned int B_col_size,
02642                 unsigned int B_internal_rows,
02643                 unsigned int B_internal_cols,
02644                 T beta,
02645                 T * C,
02646                 unsigned int C_row_start,
02647                 unsigned int C_col_start,
02648                 unsigned int C_row_inc,
02649                 unsigned int C_col_inc,
02650                 unsigned int C_row_size,
02651                 unsigned int C_col_size,
02652                 unsigned int C_internal_rows,
02653                 unsigned int C_internal_cols)
02654       {
02655         // Shared-memory tiles: 16 rows at a padded pitch of 17 entries (16 * 17 = 272).
02656         __shared__ T tileA[272];
02657         __shared__ T tileB[272];
02658
02659         const vcl_size_t tile_dim = 16;
02660         const vcl_size_t tile_pitch = tile_dim + 1;
02661         const vcl_size_t tx = threadIdx.x;
02662         const vcl_size_t ty = threadIdx.y;
02663         const vcl_size_t tile_row = blockIdx.x;
02664         const vcl_size_t tile_col = blockIdx.y;
02665
02666         // Element of A / B loaded by this thread (tile origin + per-thread offset) and the step to the next tile.
02667         vcl_size_t a_index = ((tile_row * tile_dim * A_row_inc + A_row_start) * A_internal_cols + A_col_start)
02668                            + (tx * A_col_inc + ty * A_row_inc * A_internal_cols);
02669         vcl_size_t b_index = ((tile_col * tile_dim * B_row_inc + B_row_start) * B_internal_cols + B_col_start)
02670                            + (tx * B_col_inc + ty * B_row_inc * B_internal_cols);
02671         const vcl_size_t a_advance = tile_dim * A_col_inc;
02672         const vcl_size_t b_advance = tile_dim * B_col_inc;
02673         const vcl_size_t tile_count = (A_col_size + tile_dim - 1) / tile_dim;
02674
02675         T acc = 0;
02676         for (vcl_size_t tile = 0; tile < tile_count; ++tile)
02677         {
02678           const vcl_size_t k_base = tile * tile_dim;
02679           // Stage the current tiles; positions outside the matrix ranges are filled with zero.
02680           const bool a_inside = (k_base + tx < A_col_size) && (tile_row * tile_dim + ty < A_row_size);
02681           const bool b_inside = (k_base + tx < B_col_size) && (tile_col * tile_dim + ty < B_row_size);
02682           tileA[ty * tile_pitch + tx] = a_inside ? A[a_index] : 0;
02683           tileB[ty * tile_pitch + tx] = b_inside ? B[b_index] : 0;
02684           __syncthreads();   // tiles are complete before any thread reads them
02685
02686           // 16-term dot product of tileA row tx with tileB row ty.
02687           const T * a_vals = tileA + tx * tile_pitch;
02688           const T * b_vals = tileB + ty * tile_pitch;
02689           for (vcl_size_t k = 0; k < tile_dim; ++k)
02690             acc += a_vals[k] * b_vals[k];
02691           __syncthreads();   // all reads are done before the tiles get overwritten
02692           a_index += a_advance;
02693           b_index += b_advance;
02694         }
02695
02696         // Write the result; threads mapped outside the result range store nothing.
02697         const unsigned int out_row = blockIdx.x * blockDim.x + threadIdx.x;
02698         const unsigned int out_col = blockIdx.y * blockDim.y + threadIdx.y;
02699         if (out_row < A_row_size && out_col < B_row_size)
02700         {
02701           const unsigned int c_index = (out_row * C_row_inc + C_row_start) * C_internal_cols + out_col * C_col_inc + C_col_start;
02702           C[c_index] = (beta == 0) ? alpha * acc : alpha * acc + beta * C[c_index];
02703         }
02704       }
02705 
02706       // matrix-matrix multiplication C = A^T * B
02707       // matrix layouts: C...row_major, A...row_major, B...row_major
02708       template <typename T>
02709       __global__ void matrix_matrix_row_row_row_prod_TA_kernel(
02710                 T alpha,
02711                 const T * A,
02712                 unsigned int A_row_start,
02713                 unsigned int A_col_start,
02714                 unsigned int A_row_inc,
02715                 unsigned int A_col_inc,
02716                 unsigned int A_row_size,
02717                 unsigned int A_col_size,
02718                 unsigned int A_internal_rows,
02719                 unsigned int A_internal_cols,
02720                 const T * B,
02721                 unsigned int B_row_start,
02722                 unsigned int B_col_start,
02723                 unsigned int B_row_inc,
02724                 unsigned int B_col_inc,
02725                 unsigned int B_row_size,
02726                 unsigned int B_col_size,
02727                 unsigned int B_internal_rows,
02728                 unsigned int B_internal_cols,
02729                 T beta,
02730                 T * C,
02731                 unsigned int C_row_start,
02732                 unsigned int C_col_start,
02733                 unsigned int C_row_inc,
02734                 unsigned int C_col_inc,
02735                 unsigned int C_row_size,
02736                 unsigned int C_col_size,
02737                 unsigned int C_internal_rows,
02738                 unsigned int C_internal_cols)
02739       {
02740 
02741         __shared__ T bufA[272];
02742         __shared__ T bufB[272];
02743 
02744         vcl_size_t block_size = 16;//get_local_size(0);
02745         vcl_size_t row_block_id = blockIdx.x;
02746         vcl_size_t col_block_id = blockIdx.y;
02747         vcl_size_t row_thread_id = threadIdx.x;
02748         vcl_size_t col_thread_id = threadIdx.y;
02749         vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
02750         vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
02751         vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
02752         vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
02753         vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
02754         T Csub = 0;
02755         vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
02756         vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
02757 
02758         vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
02759         vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
02760         for (vcl_size_t block = 0;
02761                 block < block_num;
02762                 ++block)
02763         {
02764           bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
02765           bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
02766           __syncthreads();
02767           T * bufAptr = bufA + row_thread_id_times_block_size;
02768           T * bufBptr = bufB + col_thread_id_times_block_size;
02769             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02770             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02771             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02772             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02773             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02774             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02775             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02776             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02777             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02778             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02779             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02780             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02781             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02782             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02783             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02784             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02785           __syncthreads();
02786           aBegin += aStep;
02787           bBegin += bStep;
02788         }
02789         if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
02790           C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
02791       }
02792 
02793       // matrix-matrix multiplication C = A^T * B^T
02794       // matrix layouts: C...row_major, A...row_major, B...row_major
02795       template <typename T>
02796       __global__ void matrix_matrix_row_row_row_prod_TT_kernel(
02797                 T alpha,
02798                 const T * A,
02799                 unsigned int A_row_start,
02800                 unsigned int A_col_start,
02801                 unsigned int A_row_inc,
02802                 unsigned int A_col_inc,
02803                 unsigned int A_row_size,
02804                 unsigned int A_col_size,
02805                 unsigned int A_internal_rows,
02806                 unsigned int A_internal_cols,
02807                 const T * B,
02808                 unsigned int B_row_start,
02809                 unsigned int B_col_start,
02810                 unsigned int B_row_inc,
02811                 unsigned int B_col_inc,
02812                 unsigned int B_row_size,
02813                 unsigned int B_col_size,
02814                 unsigned int B_internal_rows,
02815                 unsigned int B_internal_cols,
02816                 T beta,
02817                 T * C,
02818                 unsigned int C_row_start,
02819                 unsigned int C_col_start,
02820                 unsigned int C_row_inc,
02821                 unsigned int C_col_inc,
02822                 unsigned int C_row_size,
02823                 unsigned int C_col_size,
02824                 unsigned int C_internal_rows,
02825                 unsigned int C_internal_cols)
02826       {
02827 
02828         __shared__ T bufA[272];
02829         __shared__ T bufB[272];
02830 
02831         vcl_size_t block_size = 16;//get_local_size(0);
02832         vcl_size_t row_block_id = blockIdx.x;
02833         vcl_size_t col_block_id = blockIdx.y;
02834         vcl_size_t row_thread_id = threadIdx.x;
02835         vcl_size_t col_thread_id = threadIdx.y;
02836         vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
02837         vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
02838         vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
02839         vcl_size_t bStep = block_size * B_col_inc;
02840         vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
02841         T Csub = 0;
02842         vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
02843         vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;
02844 
02845         vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
02846         vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
02847         for (vcl_size_t block = 0;
02848                 block < block_num;
02849                 ++block)
02850         {
02851           bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
02852           bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
02853           __syncthreads();
02854           T * bufAptr = bufA + row_thread_id_times_block_size;
02855           T * bufBptr = bufB + col_thread_id_times_block_size;
02856             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02857             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02858             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02859             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02860             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02861             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02862             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02863             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02864             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02865             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02866             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02867             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02868             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02869             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02870             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02871             Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
02872           __syncthreads();
02873           aBegin += aStep;
02874           bBegin += bStep;
02875         }
02876         if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
02877           C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
02878       }
02879 
02880 
02881     } // namespace cuda
02882   } //namespace linalg
02883 } //namespace viennacl
02884 
02885 
02886 #endif