ViennaCL - The Vienna Computing Library
1.5.0
|
00001 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_PROD_HPP 00002 #define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_PROD_HPP 00003 00004 #include "viennacl/tools/tools.hpp" 00005 #include "viennacl/ocl/kernel.hpp" 00006 #include "viennacl/ocl/platform.hpp" 00007 #include "viennacl/ocl/utils.hpp" 00008 00009 #include "viennacl/linalg/opencl/kernels/matrix.hpp" 00010 00013 namespace viennacl 00014 { 00015 namespace linalg 00016 { 00017 namespace opencl 00018 { 00019 namespace kernels 00020 { 00021 00022 template <typename StringType> 00023 void generate_matrix_prod_blas3(StringType & source, std::string const & numeric_string, 00024 bool row_major_A, bool row_major_B, bool row_major_C, 00025 bool transpose_A, bool transpose_B) 00026 { 00027 //start OpenCL code: 00028 source.append("__kernel void prod_"); 00029 if (transpose_A) 00030 source.append("T"); 00031 else 00032 source.append("A"); 00033 if (transpose_B) 00034 source.append("T"); 00035 else 00036 source.append("A"); 00037 00038 source.append("( \n"); 00039 source.append(" "); source.append(numeric_string); source.append(" alpha, \n"); 00040 source.append(" __global const "); source.append(numeric_string); source.append(" * A, \n"); 00041 source.append(" unsigned int A_row_start, \n"); 00042 source.append(" unsigned int A_col_start, \n"); 00043 source.append(" unsigned int A_row_inc, \n"); 00044 source.append(" unsigned int A_col_inc, \n"); 00045 source.append(" unsigned int A_row_size, \n"); //number of elements starting from row_start! 00046 source.append(" unsigned int A_col_size, \n"); 00047 source.append(" unsigned int A_internal_rows, \n"); 00048 source.append(" unsigned int A_internal_cols, \n"); 00049 00050 source.append(" __global const "); source.append(numeric_string); source.append(" * B, \n"); 00051 source.append(" unsigned int B_row_start, \n"); 00052 source.append(" unsigned int B_col_start, \n"); 00053 source.append(" unsigned int B_row_inc, \n"); 00054 source.append(" unsigned int B_col_inc, \n"); 00055 source.append(" unsigned int B_row_size, \n"); 00056 source.append(" unsigned int B_col_size, \n"); 00057 source.append(" unsigned int B_internal_rows, \n"); 00058 source.append(" unsigned int B_internal_cols, \n"); 00059 00060 source.append(" "); source.append(numeric_string); source.append(" beta, \n"); 00061 source.append(" __global "); source.append(numeric_string); source.append(" * C, \n"); 00062 source.append(" unsigned int C_row_start, \n"); 00063 source.append(" unsigned int C_col_start, \n"); 00064 source.append(" unsigned int C_row_inc, \n"); 00065 source.append(" unsigned int C_col_inc, \n"); 00066 source.append(" unsigned int C_row_size, \n"); 00067 source.append(" unsigned int C_col_size, \n"); 00068 source.append(" unsigned int C_internal_rows, \n"); 00069 source.append(" unsigned int C_internal_cols) \n"); 00070 source.append("{ \n"); 00071 00072 source.append(" __local "); source.append(numeric_string); source.append(" bufA[272]; \n"); // 16 * 17 00073 source.append(" __local "); source.append(numeric_string); source.append(" bufB[272]; \n"); // 16 * 17 00074 00075 source.append(" size_t block_size = 16; \n"); //get_local_size(0); 00076 00077 source.append(" size_t row_block_id = get_group_id(0); \n"); 00078 source.append(" size_t col_block_id = get_group_id(1); \n"); 00079 source.append(" size_t row_thread_id = get_local_id(0); \n"); 00080 source.append(" size_t col_thread_id = get_local_id(1); \n"); 00081 00082 //traverse block row of A (taking mem layout and transpose operation into account) 00083 if (row_major_A && transpose_A) 00084 { 00085 source.append(" size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols; \n"); 00086 source.append(" size_t aStep = block_size * A_row_inc * A_internal_cols; \n"); 00087 } 00088 else if (row_major_A && !transpose_A) 00089 { 00090 source.append(" size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start; \n"); 00091 source.append(" size_t aStep = block_size * A_col_inc; \n"); 00092 } 00093 else if (!row_major_A && transpose_A) 00094 { 00095 source.append(" size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; \n"); 00096 source.append(" size_t aStep = block_size * A_row_inc; \n"); 00097 } 00098 else if (!row_major_A && !transpose_A) 00099 { 00100 source.append(" size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows; \n"); 00101 source.append(" size_t aStep = block_size * A_col_inc * A_internal_rows; \n"); 00102 } 00103 00104 00105 if (row_major_B && transpose_B) 00106 { 00107 source.append(" size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start; \n"); 00108 source.append(" size_t bStep = block_size * B_col_inc; \n"); 00109 } 00110 else if (row_major_B && !transpose_B) 00111 { 00112 source.append(" size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols; \n"); 00113 source.append(" size_t bStep = block_size * B_internal_cols * B_row_inc; \n"); 00114 } 00115 else if (!row_major_B && transpose_B) 00116 { 00117 source.append(" size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows; \n"); 00118 source.append(" size_t bStep = block_size * B_internal_rows * B_col_inc; \n"); 00119 } 00120 else if (!row_major_B && !transpose_B) 00121 { 00122 source.append(" size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start; \n"); 00123 source.append(" size_t bStep = block_size * B_row_inc; \n"); 00124 } 00125 00126 00127 if (transpose_A) 00128 source.append(" size_t block_num = (A_row_size + block_size - 1) / block_size; \n"); 00129 else 00130 source.append(" size_t block_num = (A_col_size + block_size - 1) / block_size; \n"); 00131 00132 source.append(" "); source.append(numeric_string); source.append(" Csub = 0; \n"); 00133 00134 //offset of the the memory access by the thread relative to the beginning of the block: 00135 if (row_major_A) 00136 source.append(" size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols; \n"); 00137 else 00138 source.append(" size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; \n"); 00139 00140 if (row_major_B) 00141 source.append(" size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; \n"); 00142 else 00143 source.append(" size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; \n"); 00144 00145 source.append(" size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); \n"); 00146 source.append(" size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); \n"); 00147 00148 source.append(" for (size_t block = 0; \n"); 00149 source.append(" block < block_num; \n"); 00150 source.append(" ++block) \n"); 00151 source.append(" { \n"); 00152 00153 //read block from A and check for access within matrix: 00154 00155 if (transpose_A && row_major_A) 00156 source.append(" bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; \n"); 00157 else if (transpose_A && !row_major_A) 00158 source.append(" bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; \n"); 00159 else if (!transpose_A && row_major_A) 00160 source.append(" bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; \n"); 00161 else if (!transpose_A && !row_major_A) 00162 source.append(" bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; \n"); 00163 00164 00165 if (transpose_B && row_major_B) 00166 source.append(" bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; \n"); 00167 else if (transpose_B && !row_major_B) 00168 source.append(" bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; \n"); 00169 else if (!transpose_B && row_major_B) 00170 source.append(" bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0; \n"); 00171 else if (!transpose_B && !row_major_B) 00172 source.append(" bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0; \n"); 00173 00174 //computation of block-matrix-matrix product is the same for all cases: 00175 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00176 00177 //loop unrolling: 00178 source.append(" __local "); source.append(numeric_string); source.append(" * bufAptr = bufA + row_thread_id_times_block_size; \n"); 00179 source.append(" __local "); source.append(numeric_string); source.append(" * bufBptr = bufB + col_thread_id_times_block_size; \n"); 00180 00181 for (size_t unroll = 0; unroll < 16; ++unroll) { 00182 source.append(" Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; \n"); 00183 } 00184 00185 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00186 source.append(" aBegin += aStep; \n"); 00187 source.append(" bBegin += bStep; \n"); 00188 source.append(" } \n"); 00189 00190 00191 if (transpose_A) 00192 { 00193 source.append(" if (get_global_id(0) < A_col_size && "); 00194 } 00195 else 00196 { 00197 source.append(" if (get_global_id(0) < A_row_size && "); 00198 } 00199 00200 if (transpose_B) 00201 { 00202 source.append("get_global_id(1) < B_row_size) \n"); 00203 } 00204 else 00205 { 00206 source.append("get_global_id(1) < B_col_size) \n"); 00207 } 00208 00209 if (row_major_C) 00210 { 00211 source.append(" C[(get_global_id(0) * C_row_inc + C_row_start) * C_internal_cols + get_global_id(1) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(get_global_id(0) * C_row_inc + C_row_start) * C_internal_cols + get_global_id(1) * C_col_inc + C_col_start]; \n"); 00212 } 00213 else 00214 { 00215 source.append(" C[get_global_id(0) * C_row_inc + C_row_start + (get_global_id(1) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[get_global_id(0) * C_row_inc + C_row_start + (get_global_id(1) * C_col_inc + C_col_start) * C_internal_rows]; \n"); 00216 } 00217 source.append("} \n"); 00218 } 00219 00220 template <typename StringType> 00221 void generate_matrix_prod16_blas3(StringType & source, std::string const & numeric_string, 00222 bool row_major_A, bool row_major_B, bool row_major_C, 00223 bool transpose_A, bool transpose_B) 00224 { 00225 //vcl_size_t vector_size = 4; 00226 vcl_size_t block_size = 16; 00227 00228 //start OpenCL code: 00229 source.append("__kernel void prod16_"); 00230 if (transpose_A) 00231 source.append("T"); 00232 else 00233 source.append("A"); 00234 if (transpose_B) 00235 source.append("T"); 00236 else 00237 source.append("A"); 00238 00239 source.append("( "); source.append(numeric_string); source.append(" alpha, \n"); 00240 source.append(" __global const "); source.append(numeric_string); source.append(" * A, \n"); 00241 source.append(" unsigned int A_row_start, \n"); 00242 source.append(" unsigned int A_col_start, \n"); 00243 source.append(" unsigned int A_row_inc, \n"); 00244 source.append(" unsigned int A_col_inc, \n"); 00245 source.append(" unsigned int A_row_size, \n"); //number of elements starting from row_start, using an increment of A_row_inc 00246 source.append(" unsigned int A_col_size, \n"); 00247 source.append(" unsigned int A_internal_rows, \n"); 00248 source.append(" unsigned int A_internal_cols, \n"); 00249 source.append(" __global const "); source.append(numeric_string); source.append(" * B, \n"); 00250 source.append(" unsigned int B_row_start, \n"); 00251 source.append(" unsigned int B_col_start, \n"); 00252 source.append(" unsigned int B_row_inc, \n"); 00253 source.append(" unsigned int B_col_inc, \n"); 00254 source.append(" unsigned int B_row_size, \n"); 00255 source.append(" unsigned int B_col_size, \n"); 00256 source.append(" unsigned int B_internal_rows, \n"); 00257 source.append(" unsigned int B_internal_cols, \n"); 00258 source.append(" "); source.append(numeric_string); source.append(" beta, \n"); 00259 source.append(" __global "); source.append(numeric_string); source.append(" * C, \n"); 00260 source.append(" unsigned int C_row_start, \n"); 00261 source.append(" unsigned int C_col_start, \n"); 00262 source.append(" unsigned int C_row_inc, \n"); 00263 source.append(" unsigned int C_col_inc, \n"); 00264 source.append(" unsigned int C_row_size, \n"); 00265 source.append(" unsigned int C_col_size, \n"); 00266 source.append(" unsigned int C_internal_rows, \n"); 00267 source.append(" unsigned int C_internal_cols) \n"); 00268 source.append("{ \n"); 00269 //do not forgot to change block_size !!! 00270 source.append(" size_t row_block_id = get_group_id(1); \n"); //refers to the row index in op(A), op(B) 00271 source.append(" size_t col_block_id = get_group_id(0); \n"); //refers to the col index in op(A), op(B) 00272 source.append(" size_t row_thread_id = get_local_id(1); \n"); 00273 source.append(" size_t col_thread_id = get_local_id(0); \n"); 00274 00275 source.append(" __local "); source.append(numeric_string); source.append(" As[256]; \n"); 00276 00277 source.append(" "); source.append(numeric_string); source.append(" cv[16] = {"); 00278 for (vcl_size_t i=0; i<block_size-1; ++i) 00279 source.append("0,"); 00280 source.append("0}; \n"); 00281 00282 //traverse block row of A (taking mem layout and transpose operation into account) 00283 if (row_major_A && transpose_A) 00284 { 00285 source.append(" size_t aBegin = (row_block_id * 16 * A_col_inc + A_col_start) + A_row_start * A_internal_cols; \n"); 00286 source.append(" size_t aStep = 16 * A_internal_cols * A_row_inc; \n"); 00287 source.append(" size_t aEnd = aBegin + A_internal_cols * A_row_inc * A_row_size; \n"); 00288 } 00289 else if (row_major_A && !transpose_A) 00290 { 00291 source.append(" size_t aBegin = (row_block_id * 16 * A_row_inc + A_row_start) * A_internal_cols + A_col_start; \n"); 00292 source.append(" size_t aStep = 16 * A_col_inc; \n"); 00293 source.append(" size_t aEnd = aBegin + A_col_inc * A_col_size; \n"); 00294 } 00295 else if (!row_major_A && transpose_A) 00296 { 00297 source.append(" size_t aBegin = (row_block_id * 16 * A_col_inc + A_col_start) * A_internal_rows + A_row_start; \n"); 00298 source.append(" size_t aStep = 16 * A_row_inc; \n"); 00299 source.append(" size_t aEnd = aBegin + A_row_inc * A_row_size; \n"); 00300 } 00301 else if (!row_major_A && !transpose_A) 00302 { 00303 source.append(" size_t aBegin = (row_block_id * 16 * A_row_inc + A_row_start) + A_col_start * A_internal_rows; \n"); 00304 source.append(" size_t aStep = 16 * A_internal_rows * A_col_inc; \n"); 00305 source.append(" size_t aEnd = aBegin + A_internal_rows * A_col_inc * A_col_size; \n"); 00306 } 00307 00308 00309 if (row_major_B && transpose_B) 00310 { 00311 source.append(" size_t bBegin = (col_block_id * 64 * B_row_inc + B_row_start) * B_internal_cols + B_col_start; \n"); 00312 source.append(" size_t bStep = 16 * B_col_inc; \n"); 00313 } 00314 else if (row_major_B && !transpose_B) 00315 { 00316 source.append(" size_t bBegin = (col_block_id * 64 * B_col_inc + B_col_start) + B_row_start * B_internal_cols; \n"); 00317 source.append(" size_t bStep = 16 * B_row_inc * B_internal_cols; \n"); 00318 } 00319 else if (!row_major_B && transpose_B) 00320 { 00321 source.append(" size_t bBegin = (col_block_id * 64 * B_row_inc + B_row_start) + B_col_start * B_internal_rows; \n"); 00322 source.append(" size_t bStep = 16 * B_col_inc * B_internal_rows; \n"); 00323 } 00324 else if (!row_major_B && !transpose_B) 00325 { 00326 source.append(" size_t bBegin = (col_block_id * 64 * B_col_inc + B_col_start) * B_internal_rows + B_row_start; \n"); 00327 source.append(" size_t bStep = 16 * B_row_inc; \n"); 00328 } 00329 00330 source.append(" for(size_t a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep) { \n"); 00331 00332 // copy blocks of op(A) to shared memory (op(A) is column-major in shared memory then) 00333 source.append(" for(size_t i = 0; i < 4; i++) \n"); 00334 if (row_major_A && transpose_A) 00335 source.append(" As[ (i*4 + row_thread_id) + 16 * col_thread_id] = (A[a + A_col_inc * (i * 4 + row_thread_id) + A_internal_cols * A_row_inc * col_thread_id]);"); 00336 else if (row_major_A && !transpose_A) 00337 source.append(" As[ (i*4 + row_thread_id) + 16 * col_thread_id] = (A[a + A_internal_cols * A_row_inc * (i * 4 + row_thread_id) + A_col_inc * col_thread_id]);"); 00338 else if (!row_major_A && transpose_A) 00339 source.append(" As[ (i*4 + row_thread_id) + 16 * col_thread_id] = (A[a + A_internal_rows * A_col_inc * (i * 4 + row_thread_id) + A_row_inc * col_thread_id]);"); 00340 else if (!row_major_A && !transpose_A) 00341 source.append(" As[ (i*4 + row_thread_id) + 16 * col_thread_id] = (A[a + A_row_inc * (i * 4 + row_thread_id) + A_internal_rows * A_col_inc * col_thread_id]);"); 00342 00343 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00344 00345 // initialize memory pointers 00346 source.append(" __local "); source.append(numeric_string); source.append(" *ap = As; \n"); 00347 if (row_major_B && transpose_B) 00348 { 00349 source.append(" __global const "); source.append(numeric_string); source.append(" *bp = B + (b + (16 * row_thread_id + col_thread_id) * B_row_inc * B_internal_cols); \n"); 00350 } 00351 else if (row_major_B && !transpose_B) 00352 { 00353 source.append(" __global const "); source.append(numeric_string); source.append(" *bp = B + (b + (16 * row_thread_id + col_thread_id) * B_col_inc); \n"); 00354 } 00355 else if (!row_major_B && transpose_B) 00356 { 00357 source.append(" __global const "); source.append(numeric_string); source.append(" *bp = B + (b + (16 * row_thread_id + col_thread_id) * B_row_inc); \n"); 00358 } 00359 else if (!row_major_B && !transpose_B) 00360 { 00361 source.append(" __global const "); source.append(numeric_string); source.append(" *bp = B + (b + (16 * row_thread_id + col_thread_id) * B_col_inc * B_internal_rows); \n"); 00362 } 00363 00364 // run computations 00365 source.append(" for(size_t i = 0; i < 16; i++) { \n"); 00366 if (row_major_B && transpose_B) 00367 { 00368 source.append(" "); source.append(numeric_string); source.append(" bv = bp[i * B_col_inc]; \n"); 00369 } 00370 else if (row_major_B && !transpose_B) 00371 { 00372 source.append(" "); source.append(numeric_string); source.append(" bv = bp[i * B_row_inc * B_internal_cols]; \n"); 00373 } 00374 else if (!row_major_B && transpose_B) 00375 { 00376 source.append(" "); source.append(numeric_string); source.append(" bv = bp[i * B_col_inc * B_internal_rows]; \n"); 00377 } 00378 else if (!row_major_B && !transpose_B) 00379 { 00380 source.append(" "); source.append(numeric_string); source.append(" bv = bp[i * B_row_inc]; \n"); 00381 } 00382 00383 source.append(" for(size_t k = 0; k < 16; k++) \n"); 00384 source.append(" cv[k] += ap[k] * bv; \n"); 00385 00386 source.append(" ap += 16; \n"); 00387 source.append(" } \n"); 00388 00389 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00390 source.append(" } \n"); 00391 00392 // write to C 00393 if (row_major_C) 00394 { 00395 source.append(" int c = C_internal_cols * (C_row_inc * 16 * row_block_id + C_row_start) + 64 * C_col_inc * col_block_id + C_col_start \n"); //block column index 00396 source.append(" + C_col_inc * (16 * row_thread_id + col_thread_id); \n"); 00397 } 00398 else 00399 { 00400 source.append(" int c = C_row_inc * 16 * row_block_id + C_row_start + (64 * C_col_inc * col_block_id + C_col_start) * C_internal_rows \n"); // block column index 00401 source.append(" + C_internal_rows * C_col_inc * (16 * row_thread_id + col_thread_id); \n"); 00402 } 00403 00404 source.append(" for(size_t i = 0; i < 16; i++) { \n"); 00405 00406 if (row_major_C) 00407 { 00408 source.append(" C[c] = (beta == 0) ? alpha * cv[i] : alpha * cv[i] + beta * C[c]; \n"); 00409 source.append(" c += C_internal_cols * C_row_inc; \n"); 00410 } 00411 else 00412 { 00413 source.append(" C[c] = (beta == 0) ? alpha * cv[i] : alpha * cv[i] + beta * C[c]; \n"); 00414 source.append(" c += C_row_inc; \n"); 00415 } 00416 00417 source.append(" } \n"); 00418 source.append("} \n"); 00419 00420 } 00421 00422 00423 // main kernel class 00430 template <class NumericT, typename F_A, typename F_B, typename F_C> 00431 struct matrix_prod 00432 { 00433 static std::string program_name() 00434 { 00435 return viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_prod_" + detail::type_to_string(F_A()) + detail::type_to_string(F_B()) + detail::type_to_string(F_C()); 00436 } 00437 00438 static void init(viennacl::ocl::context & ctx) 00439 { 00440 viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx); 00441 std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply(); 00442 bool row_major_A = viennacl::is_row_major<F_A>::value; 00443 bool row_major_B = viennacl::is_row_major<F_B>::value; 00444 bool row_major_C = viennacl::is_row_major<F_C>::value; 00445 00446 00447 static std::map<cl_context, bool> init_done; 00448 if (!init_done[ctx.handle().get()]) 00449 { 00450 std::string source; 00451 source.reserve(8192); 00452 00453 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source); 00454 00455 // only generate for floating points (forces error for integers) 00456 if (numeric_string == "float" || numeric_string == "double") 00457 { 00458 generate_matrix_prod_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, false, false); 00459 generate_matrix_prod_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, false, true); 00460 generate_matrix_prod_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, true, false); 00461 generate_matrix_prod_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, true, true); 00462 00463 generate_matrix_prod16_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, false, false); 00464 generate_matrix_prod16_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, false, true); 00465 generate_matrix_prod16_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, true, false); 00466 generate_matrix_prod16_blas3(source, numeric_string, row_major_A, row_major_B, row_major_C, true, true); 00467 00468 } 00469 00470 std::string prog_name = program_name(); 00471 #ifdef VIENNACL_BUILD_INFO 00472 std::cout << "Creating program " << prog_name << std::endl; 00473 #endif 00474 ctx.add_program(source, prog_name); 00475 init_done[ctx.handle().get()] = true; 00476 } //if 00477 } //init 00478 }; 00479 00480 } // namespace kernels 00481 } // namespace opencl 00482 } // namespace linalg 00483 } // namespace viennacl 00484 #endif 00485