ViennaCL - The Vienna Computing Library  1.5.0
viennacl/linalg/opencl/kernels/matrix_prod.hpp
Go to the documentation of this file.
00001 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_PROD_HPP
00002 #define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_PROD_HPP
00003 
00004 #include "viennacl/tools/tools.hpp"
00005 #include "viennacl/ocl/kernel.hpp"
00006 #include "viennacl/ocl/platform.hpp"
00007 #include "viennacl/ocl/utils.hpp"
00008 
00009 #include "viennacl/linalg/opencl/kernels/matrix.hpp"
00010 
00013 namespace viennacl
00014 {
00015   namespace linalg
00016   {
00017     namespace opencl
00018     {
00019       namespace kernels
00020       {
00021 
00022         template <typename StringType>
00023         void generate_matrix_prod_blas3(StringType & source, std::string const & numeric_string,
00024                                         bool row_major_A, bool row_major_B, bool row_major_C,
00025                                         bool transpose_A, bool transpose_B)
00026         {
00027           //start OpenCL code:
00028           source.append("__kernel void prod_");
00029           if (transpose_A)
00030             source.append("T");
00031           else
00032             source.append("A");
00033           if (transpose_B)
00034             source.append("T");
00035           else
00036             source.append("A");
00037 
00038           source.append("( \n");
00039           source.append("  "); source.append(numeric_string); source.append(" alpha, \n");
00040           source.append("  __global const "); source.append(numeric_string); source.append(" * A, \n");
00041           source.append("  unsigned int A_row_start, \n");
00042           source.append("  unsigned int A_col_start, \n");
00043           source.append("  unsigned int A_row_inc, \n");
00044           source.append("  unsigned int A_col_inc, \n");
00045           source.append("  unsigned int A_row_size, \n");   //number of elements starting from row_start!
00046           source.append("  unsigned int A_col_size, \n");
00047           source.append("  unsigned int A_internal_rows, \n");
00048           source.append("  unsigned int A_internal_cols, \n");
00049 
00050           source.append("  __global const "); source.append(numeric_string); source.append(" * B,   \n");
00051           source.append("  unsigned int B_row_start, \n");
00052           source.append("  unsigned int B_col_start, \n");
00053           source.append("  unsigned int B_row_inc, \n");
00054           source.append("  unsigned int B_col_inc, \n");
00055           source.append("  unsigned int B_row_size, \n");
00056           source.append("  unsigned int B_col_size, \n");
00057           source.append("  unsigned int B_internal_rows, \n");
00058           source.append("  unsigned int B_internal_cols, \n");
00059 
00060           source.append("  "); source.append(numeric_string); source.append(" beta, \n");
00061           source.append("  __global "); source.append(numeric_string); source.append(" * C, \n");
00062           source.append("  unsigned int C_row_start, \n");
00063           source.append("  unsigned int C_col_start, \n");
00064           source.append("  unsigned int C_row_inc, \n");
00065           source.append("  unsigned int C_col_inc, \n");
00066           source.append("  unsigned int C_row_size, \n");
00067           source.append("  unsigned int C_col_size, \n");
00068           source.append("  unsigned int C_internal_rows, \n");
00069           source.append("  unsigned int C_internal_cols)  \n");
00070           source.append("{  \n");
00071 
00072           source.append("  __local "); source.append(numeric_string); source.append(" bufA[272]; \n"); // 16 * 17
00073           source.append("  __local "); source.append(numeric_string); source.append(" bufB[272]; \n"); // 16 * 17
00074 
00075           source.append("  size_t block_size = 16; \n"); //get_local_size(0);
00076 
00077           source.append("  size_t row_block_id = get_group_id(0); \n");
00078           source.append("  size_t col_block_id = get_group_id(1); \n");
00079           source.append("  size_t row_thread_id = get_local_id(0); \n");
00080           source.append("  size_t col_thread_id = get_local_id(1); \n");
00081 
00082           //traverse block row of A (taking mem layout and transpose operation into account)
00083           if (row_major_A && transpose_A)
00084           {
00085             source.append("  size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols; \n");
00086             source.append("  size_t aStep = block_size * A_row_inc * A_internal_cols; \n");
00087           }
00088           else if (row_major_A && !transpose_A)
00089           {
00090             source.append("  size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start; \n");
00091             source.append("  size_t aStep = block_size * A_col_inc; \n");
00092           }
00093           else if (!row_major_A && transpose_A)
00094           {
00095             source.append("  size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; \n");
00096             source.append("  size_t aStep = block_size * A_row_inc; \n");
00097           }
00098           else if (!row_major_A && !transpose_A)
00099           {
00100             source.append("  size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows; \n");
00101             source.append("  size_t aStep = block_size * A_col_inc * A_internal_rows; \n");
00102           }
00103 
00104 
00105           if (row_major_B && transpose_B)
00106           {
00107             source.append("  size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start; \n");
00108             source.append("  size_t bStep = block_size * B_col_inc; \n");
00109           }
00110           else if (row_major_B && !transpose_B)
00111           {
00112             source.append("  size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols; \n");
00113             source.append("  size_t bStep = block_size * B_internal_cols * B_row_inc; \n");
00114           }
00115           else if (!row_major_B && transpose_B)
00116           {
00117             source.append("  size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows; \n");
00118             source.append("  size_t bStep = block_size * B_internal_rows * B_col_inc; \n");
00119           }
00120           else if (!row_major_B && !transpose_B)
00121           {
00122             source.append("  size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start; \n");
00123             source.append("  size_t bStep = block_size * B_row_inc; \n");
00124           }
00125 
00126 
00127           if (transpose_A)
00128             source.append("  size_t block_num = (A_row_size + block_size - 1) / block_size; \n");
00129           else
00130             source.append("  size_t block_num = (A_col_size + block_size - 1) / block_size; \n");
00131 
00132           source.append("  "); source.append(numeric_string); source.append(" Csub = 0; \n");
00133 
00134           //offset of the the memory access by the thread relative to the beginning of the block:
00135           if (row_major_A)
00136             source.append("  size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols; \n");
00137           else
00138             source.append("  size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; \n");
00139 
00140           if (row_major_B)
00141             source.append("  size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; \n");
00142           else
00143             source.append("  size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc *  B_internal_rows; \n");
00144 
00145           source.append("  size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); \n");
00146           source.append("  size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); \n");
00147 
00148           source.append("  for (size_t block = 0; \n");
00149           source.append("           block < block_num; \n");
00150           source.append("           ++block) \n");
00151           source.append("  { \n");
00152 
00153           //read block from A and check for access within matrix:
00154 
00155           if (transpose_A && row_major_A)
00156             source.append("    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; \n");
00157           else if (transpose_A && !row_major_A)
00158             source.append("    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; \n");
00159           else if (!transpose_A && row_major_A)
00160             source.append("    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; \n");
00161           else if (!transpose_A && !row_major_A)
00162             source.append("    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; \n");
00163 
00164 
00165           if (transpose_B && row_major_B)
00166             source.append("    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; \n");
00167           else if (transpose_B && !row_major_B)
00168             source.append("    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; \n");
00169           else if (!transpose_B && row_major_B)
00170             source.append("    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0; \n");
00171           else if (!transpose_B && !row_major_B)
00172             source.append("    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0; \n");
00173 
00174           //computation of block-matrix-matrix product is the same for all cases:
00175           source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
00176 
00177           //loop unrolling:
00178           source.append("    __local "); source.append(numeric_string); source.append(" * bufAptr = bufA + row_thread_id_times_block_size; \n");
00179           source.append("    __local "); source.append(numeric_string); source.append(" * bufBptr = bufB + col_thread_id_times_block_size; \n");
00180 
00181           for (size_t unroll = 0; unroll < 16; ++unroll) {
00182             source.append("      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; \n");
00183           }
00184 
00185           source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
00186           source.append("    aBegin += aStep; \n");
00187           source.append("    bBegin += bStep; \n");
00188           source.append("  } \n");
00189 
00190 
00191           if (transpose_A)
00192           {
00193             source.append("  if (get_global_id(0) < A_col_size && ");
00194           }
00195           else
00196           {
00197             source.append("  if (get_global_id(0) < A_row_size && ");
00198           }
00199 
00200           if (transpose_B)
00201           {
00202             source.append("get_global_id(1) < B_row_size) \n");
00203           }
00204           else
00205           {
00206             source.append("get_global_id(1) < B_col_size) \n");
00207           }
00208 
00209           if (row_major_C)
00210           {
00211             source.append("    C[(get_global_id(0) * C_row_inc + C_row_start) * C_internal_cols + get_global_id(1) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(get_global_id(0) * C_row_inc + C_row_start) * C_internal_cols + get_global_id(1) * C_col_inc + C_col_start]; \n");
00212           }
00213           else
00214           {
00215             source.append("    C[get_global_id(0) * C_row_inc + C_row_start + (get_global_id(1) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[get_global_id(0) * C_row_inc + C_row_start + (get_global_id(1) * C_col_inc + C_col_start) * C_internal_rows]; \n");
00216           }
00217           source.append("} \n");
00218         }
00219 
00220         template <typename StringType>
00221         void generate_matrix_prod16_blas3(StringType & source, std::string const & numeric_string,
00222                                         bool row_major_A, bool row_major_B, bool row_major_C,
00223                                         bool transpose_A, bool transpose_B)
00224         {
00225           //vcl_size_t vector_size =  4;
00226           vcl_size_t block_size  = 16;
00227 
00228           //start OpenCL code:
00229           source.append("__kernel void prod16_");
00230           if (transpose_A)
00231             source.append("T");
00232           else
00233             source.append("A");
00234           if (transpose_B)
00235             source.append("T");
00236           else
00237             source.append("A");
00238 
00239           source.append("( "); source.append(numeric_string); source.append(" alpha, \n");
00240           source.append("   __global const "); source.append(numeric_string); source.append(" * A, \n");
00241           source.append("   unsigned int A_row_start, \n");
00242           source.append("   unsigned int A_col_start, \n");
00243           source.append("   unsigned int A_row_inc, \n");
00244           source.append("   unsigned int A_col_inc, \n");
00245           source.append("   unsigned int A_row_size, \n");   //number of elements starting from row_start, using an increment of A_row_inc
00246           source.append("   unsigned int A_col_size, \n");
00247           source.append("   unsigned int A_internal_rows, \n");
00248           source.append("   unsigned int A_internal_cols, \n");
00249           source.append("   __global const "); source.append(numeric_string); source.append(" * B,   \n");
00250           source.append("   unsigned int B_row_start, \n");
00251           source.append("   unsigned int B_col_start, \n");
00252           source.append("   unsigned int B_row_inc, \n");
00253           source.append("   unsigned int B_col_inc, \n");
00254           source.append("   unsigned int B_row_size, \n");
00255           source.append("   unsigned int B_col_size, \n");
00256           source.append("   unsigned int B_internal_rows, \n");
00257           source.append("   unsigned int B_internal_cols, \n");
00258           source.append("   "); source.append(numeric_string); source.append(" beta, \n");
00259           source.append("   __global "); source.append(numeric_string); source.append(" * C, \n");
00260           source.append("   unsigned int C_row_start, \n");
00261           source.append("   unsigned int C_col_start, \n");
00262           source.append("   unsigned int C_row_inc, \n");
00263           source.append("   unsigned int C_col_inc, \n");
00264           source.append("   unsigned int C_row_size, \n");
00265           source.append("   unsigned int C_col_size, \n");
00266           source.append("   unsigned int C_internal_rows, \n");
00267           source.append("   unsigned int C_internal_cols)  \n");
00268           source.append("{  \n");
00269           //do not forgot to change block_size !!!
00270           source.append("  size_t row_block_id = get_group_id(1); \n");    //refers to the row index in op(A), op(B)
00271           source.append("  size_t col_block_id = get_group_id(0); \n");    //refers to the col index in op(A), op(B)
00272           source.append("  size_t row_thread_id = get_local_id(1); \n");
00273           source.append("  size_t col_thread_id = get_local_id(0); \n");
00274 
00275           source.append("  __local "); source.append(numeric_string); source.append(" As[256]; \n");
00276 
00277           source.append("  "); source.append(numeric_string); source.append(" cv[16] = {");
00278           for (vcl_size_t i=0; i<block_size-1; ++i)
00279             source.append("0,");
00280           source.append("0}; \n");
00281 
00282           //traverse block row of A (taking mem layout and transpose operation into account)
00283           if (row_major_A && transpose_A)
00284           {
00285             source.append("  size_t aBegin = (row_block_id * 16 * A_col_inc + A_col_start) + A_row_start * A_internal_cols; \n");
00286             source.append("  size_t aStep = 16 * A_internal_cols * A_row_inc; \n");
00287             source.append("  size_t aEnd = aBegin + A_internal_cols * A_row_inc * A_row_size; \n");
00288           }
00289           else if (row_major_A && !transpose_A)
00290           {
00291             source.append("  size_t aBegin = (row_block_id * 16 * A_row_inc + A_row_start) * A_internal_cols + A_col_start; \n");
00292             source.append("  size_t aStep = 16 * A_col_inc; \n");
00293             source.append("  size_t aEnd = aBegin + A_col_inc * A_col_size; \n");
00294           }
00295           else if (!row_major_A && transpose_A)
00296           {
00297             source.append("  size_t aBegin = (row_block_id * 16 * A_col_inc + A_col_start) * A_internal_rows + A_row_start; \n");
00298             source.append("  size_t aStep = 16 * A_row_inc; \n");
00299             source.append("  size_t aEnd = aBegin + A_row_inc * A_row_size; \n");
00300           }
00301           else if (!row_major_A && !transpose_A)
00302           {
00303             source.append("  size_t aBegin = (row_block_id * 16 * A_row_inc + A_row_start) + A_col_start * A_internal_rows; \n");
00304             source.append("  size_t aStep = 16 * A_internal_rows * A_col_inc; \n");
00305             source.append("  size_t aEnd = aBegin + A_internal_rows * A_col_inc * A_col_size; \n");
00306           }
00307 
00308 
00309           if (row_major_B && transpose_B)
00310           {
00311             source.append("  size_t bBegin = (col_block_id * 64 * B_row_inc + B_row_start) * B_internal_cols + B_col_start; \n");
00312             source.append("  size_t bStep = 16 * B_col_inc; \n");
00313           }
00314           else if (row_major_B && !transpose_B)
00315           {
00316             source.append("  size_t bBegin = (col_block_id * 64 * B_col_inc + B_col_start) + B_row_start * B_internal_cols; \n");
00317             source.append("  size_t bStep = 16 * B_row_inc * B_internal_cols; \n");
00318           }
00319           else if (!row_major_B && transpose_B)
00320           {
00321             source.append("  size_t bBegin = (col_block_id * 64 * B_row_inc + B_row_start) + B_col_start * B_internal_rows; \n");
00322             source.append("  size_t bStep = 16 * B_col_inc * B_internal_rows; \n");
00323           }
00324           else if (!row_major_B && !transpose_B)
00325           {
00326             source.append("  size_t bBegin = (col_block_id * 64 * B_col_inc + B_col_start) * B_internal_rows + B_row_start; \n");
00327             source.append("  size_t bStep = 16 * B_row_inc; \n");
00328           }
00329 
00330           source.append("  for(size_t a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep) {  \n");
00331 
00332           // copy blocks of op(A) to shared memory (op(A) is column-major in shared memory then)
00333           source.append("    for(size_t i = 0; i < 4; i++)   \n");
00334           if (row_major_A && transpose_A)
00335             source.append("      As[ (i*4 + row_thread_id) + 16 * col_thread_id] = (A[a + A_col_inc * (i * 4 + row_thread_id) + A_internal_cols * A_row_inc * col_thread_id]);");
00336           else if (row_major_A && !transpose_A)
00337             source.append("      As[ (i*4 + row_thread_id) + 16 * col_thread_id] = (A[a + A_internal_cols * A_row_inc * (i * 4 + row_thread_id) + A_col_inc * col_thread_id]);");
00338           else if (!row_major_A && transpose_A)
00339             source.append("      As[ (i*4 + row_thread_id) + 16 * col_thread_id] = (A[a + A_internal_rows * A_col_inc * (i * 4 + row_thread_id) + A_row_inc * col_thread_id]);");
00340           else if (!row_major_A && !transpose_A)
00341             source.append("      As[ (i*4 + row_thread_id) + 16 * col_thread_id] = (A[a + A_row_inc * (i * 4 + row_thread_id) + A_internal_rows * A_col_inc * col_thread_id]);");
00342 
00343           source.append("    barrier(CLK_LOCAL_MEM_FENCE);  \n");
00344 
00345           // initialize memory pointers
00346           source.append("    __local  "); source.append(numeric_string); source.append(" *ap = As;  \n");
00347           if (row_major_B && transpose_B)
00348           {
00349             source.append("    __global const "); source.append(numeric_string); source.append(" *bp = B + (b + (16 * row_thread_id + col_thread_id) * B_row_inc * B_internal_cols);  \n");
00350           }
00351           else if (row_major_B && !transpose_B)
00352           {
00353             source.append("    __global const "); source.append(numeric_string); source.append(" *bp = B + (b + (16 * row_thread_id + col_thread_id) * B_col_inc);  \n");
00354           }
00355           else if (!row_major_B && transpose_B)
00356           {
00357             source.append("    __global const "); source.append(numeric_string); source.append(" *bp = B + (b + (16 * row_thread_id + col_thread_id) * B_row_inc);  \n");
00358           }
00359           else if (!row_major_B && !transpose_B)
00360           {
00361             source.append("    __global const "); source.append(numeric_string); source.append(" *bp = B + (b + (16 * row_thread_id + col_thread_id) * B_col_inc * B_internal_rows);  \n");
00362           }
00363 
00364           // run computations
00365           source.append("    for(size_t i = 0; i < 16; i++) {  \n");
00366           if (row_major_B && transpose_B)
00367           {
00368             source.append("      "); source.append(numeric_string); source.append(" bv = bp[i * B_col_inc];  \n");
00369           }
00370           else if (row_major_B && !transpose_B)
00371           {
00372             source.append("      "); source.append(numeric_string); source.append(" bv = bp[i * B_row_inc * B_internal_cols];  \n");
00373           }
00374           else if (!row_major_B && transpose_B)
00375           {
00376             source.append("      "); source.append(numeric_string); source.append(" bv = bp[i * B_col_inc * B_internal_rows];  \n");
00377           }
00378           else if (!row_major_B && !transpose_B)
00379           {
00380             source.append("      "); source.append(numeric_string); source.append(" bv = bp[i * B_row_inc];  \n");
00381           }
00382 
00383           source.append("      for(size_t k = 0; k < 16; k++)   \n");
00384           source.append("           cv[k] += ap[k] * bv;  \n");
00385 
00386           source.append("      ap += 16;  \n");
00387           source.append("    }  \n");
00388 
00389           source.append("    barrier(CLK_LOCAL_MEM_FENCE);  \n");
00390           source.append("  }  \n");
00391 
00392           // write to C
00393           if (row_major_C)
00394           {
00395               source.append("  int c = C_internal_cols * (C_row_inc * 16 * row_block_id + C_row_start) + 64 * C_col_inc * col_block_id + C_col_start  \n");  //block column index
00396               source.append("          + C_col_inc * (16 * row_thread_id + col_thread_id);  \n");
00397           }
00398           else
00399           {
00400               source.append("  int c = C_row_inc * 16 * row_block_id + C_row_start + (64 * C_col_inc * col_block_id + C_col_start) * C_internal_rows  \n");   // block column index
00401               source.append("          + C_internal_rows * C_col_inc * (16 * row_thread_id + col_thread_id);  \n");
00402           }
00403 
00404           source.append("  for(size_t i = 0; i < 16; i++) {  \n");
00405 
00406           if (row_major_C)
00407           {
00408             source.append("    C[c] = (beta == 0) ? alpha * cv[i] : alpha * cv[i] + beta * C[c];  \n");
00409             source.append("      c += C_internal_cols * C_row_inc;  \n");
00410           }
00411           else
00412           {
00413             source.append("    C[c] = (beta == 0) ? alpha * cv[i] : alpha * cv[i] + beta * C[c];  \n");
00414             source.append("      c += C_row_inc;  \n");
00415           }
00416 
00417           source.append("  }  \n");
00418           source.append("}  \n");
00419 
00420         }
00421 
00422 
00423         // main kernel class
00430         template <class NumericT, typename F_A, typename F_B, typename F_C>
00431         struct matrix_prod
00432         {
00433           static std::string program_name()
00434           {
00435             return viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_prod_" + detail::type_to_string(F_A()) + detail::type_to_string(F_B()) + detail::type_to_string(F_C());
00436           }
00437 
00438           static void init(viennacl::ocl::context & ctx)
00439           {
00440             viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
00441             std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
00442             bool row_major_A = viennacl::is_row_major<F_A>::value;
00443             bool row_major_B = viennacl::is_row_major<F_B>::value;
00444             bool row_major_C = viennacl::is_row_major<F_C>::value;
00445 
00446 
00447             static std::map<cl_context, bool> init_done;
00448             if (!init_done[ctx.handle().get()])
00449             {
00450               std::string source;
00451               source.reserve(8192);
00452 
00453               viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
00454 
00455               // only generate for floating points (forces error for integers)
00456               if (numeric_string == "float" || numeric_string == "double")
00457               {
00458                 generate_matrix_prod_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, false, false);
00459                 generate_matrix_prod_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, false, true);
00460                 generate_matrix_prod_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, true, false);
00461                 generate_matrix_prod_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, true, true);
00462 
00463                 generate_matrix_prod16_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, false, false);
00464                 generate_matrix_prod16_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, false, true);
00465                 generate_matrix_prod16_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, true, false);
00466                 generate_matrix_prod16_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, true, true);
00467 
00468               }
00469 
00470               std::string prog_name = program_name();
00471               #ifdef VIENNACL_BUILD_INFO
00472               std::cout << "Creating program " << prog_name << std::endl;
00473               #endif
00474               ctx.add_program(source, prog_name);
00475               init_done[ctx.handle().get()] = true;
00476             } //if
00477           } //init
00478         };
00479 
00480       }  // namespace kernels
00481     }  // namespace opencl
00482   }  // namespace linalg
00483 }  // namespace viennacl
00484 #endif
00485