ViennaCL - The Vienna Computing Library
1.5.0
|
00001 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP 00002 #define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP 00003 00004 #include "viennacl/tools/tools.hpp" 00005 #include "viennacl/ocl/kernel.hpp" 00006 #include "viennacl/ocl/platform.hpp" 00007 #include "viennacl/ocl/utils.hpp" 00008 00011 namespace viennacl 00012 { 00013 namespace linalg 00014 { 00015 namespace opencl 00016 { 00017 namespace kernels 00018 { 00019 00021 00023 enum ambm_scalar_type 00024 { 00025 VIENNACL_AMBM_NONE = 0, // vector does not exist/contribute 00026 VIENNACL_AMBM_CPU, 00027 VIENNACL_AMBM_GPU 00028 }; 00029 00031 struct ambm_config 00032 { 00033 ambm_config() : with_stride_and_range(true), is_row_major(true), a(VIENNACL_AMBM_CPU), b(VIENNACL_AMBM_NONE) {} 00034 00035 bool with_stride_and_range; 00036 bool is_row_major; 00037 std::string assign_op; 00038 ambm_scalar_type a; 00039 ambm_scalar_type b; 00040 }; 00041 00042 // just returns the for-loop 00043 template <typename StringType> 00044 void generate_ambm_impl2(StringType & source, ambm_config const & cfg, bool mult_alpha, bool mult_beta) 00045 { 00046 source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0);\n"); 00047 source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0);\n"); 00048 if (cfg.is_row_major) 00049 { 00050 source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n"); 00051 source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n"); 00052 } 00053 else 00054 { 00055 source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n"); 00056 source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n"); 00057 } 00058 00059 if (cfg.with_stride_and_range) 00060 { 00061 if (cfg.is_row_major) 00062 source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] "); 00063 else 00064 source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] "); 00065 source.append(cfg.assign_op); 00066 if (cfg.is_row_major) 00067 source.append(" B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] "); 00068 else 00069 source.append(" B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] "); 00070 00071 if (mult_alpha) 00072 source.append("* alpha "); 00073 else 00074 source.append("/ alpha "); 00075 if (cfg.b != VIENNACL_AMBM_NONE) 00076 { 00077 if (cfg.is_row_major) 00078 source.append("+ C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] "); 00079 else 00080 source.append("+ C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] "); 00081 if (mult_beta) 00082 source.append("* beta"); 00083 else 00084 source.append("/ beta"); 00085 } 00086 } 00087 else 00088 { 00089 if (cfg.is_row_major) 00090 source.append(" A[row * A_internal_size2 + col] "); 00091 else 00092 source.append(" A[row + col * A_internal_size1] "); 00093 source.append(cfg.assign_op); 00094 if (cfg.is_row_major) 00095 source.append(" B[row * B_internal_size2 + col] "); 00096 else 00097 source.append(" B[row + col * B_internal_size1] "); 00098 00099 if (mult_alpha) 00100 source.append("* alpha "); 00101 else 00102 source.append("/ alpha "); 00103 if (cfg.b != VIENNACL_AMBM_NONE) 00104 { 00105 if (cfg.is_row_major) 00106 source.append("+ C[row * C_internal_size2 + col] "); 00107 else 00108 source.append("+ C[row + col * C_internal_size2] "); 00109 if (mult_beta) 00110 source.append("* beta"); 00111 else 00112 source.append("/ beta"); 00113 } 00114 } 00115 source.append("; \n"); 00116 } 00117 00118 template <typename StringType> 00119 void generate_ambm_impl(StringType & source, std::string const & numeric_string, ambm_config const & cfg) 00120 { 00121 source.append("__kernel void am"); 00122 if (cfg.b != VIENNACL_AMBM_NONE) 00123 source.append("bm"); 00124 if (cfg.assign_op != "=") 00125 source.append("_m"); 00126 00127 if (cfg.a == VIENNACL_AMBM_CPU) 00128 source.append("_cpu"); 00129 else if (cfg.a == VIENNACL_AMBM_GPU) 00130 source.append("_gpu"); 00131 00132 if (cfg.b == VIENNACL_AMBM_CPU) 00133 source.append("_cpu"); 00134 else if (cfg.b == VIENNACL_AMBM_GPU) 00135 source.append("_gpu"); 00136 source.append("( \n"); 00137 source.append(" __global "); source.append(numeric_string); source.append(" * A, \n"); 00138 source.append(" unsigned int A_start1, unsigned int A_start2, \n"); 00139 source.append(" unsigned int A_inc1, unsigned int A_inc2, \n"); 00140 source.append(" unsigned int A_size1, unsigned int A_size2, \n"); 00141 source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n"); 00142 if (cfg.a == VIENNACL_AMBM_CPU) 00143 { 00144 source.append(" "); source.append(numeric_string); source.append(" fac2, \n"); 00145 } 00146 else if (cfg.a == VIENNACL_AMBM_GPU) 00147 { 00148 source.append(" __global "); source.append(numeric_string); source.append(" * fac2, \n"); 00149 } 00150 source.append(" unsigned int options2, \n"); // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse 00151 source.append(" __global const "); source.append(numeric_string); source.append(" * B, \n"); 00152 source.append(" unsigned int B_start1, unsigned int B_start2, \n"); 00153 source.append(" unsigned int B_inc1, unsigned int B_inc2, \n"); 00154 source.append(" unsigned int B_internal_size1, unsigned int B_internal_size2"); 00155 00156 if (cfg.b != VIENNACL_AMBM_NONE) 00157 { 00158 source.append(", \n\n"); 00159 if (cfg.b == VIENNACL_AMBM_CPU) 00160 { 00161 source.append(" "); source.append(numeric_string); source.append(" fac3, \n"); 00162 } 00163 else if (cfg.b == VIENNACL_AMBM_GPU) 00164 { 00165 source.append(" __global "); source.append(numeric_string); source.append(" * fac3, \n"); 00166 } 00167 source.append(" unsigned int options3, \n"); // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse 00168 source.append(" __global const "); source.append(numeric_string); source.append(" * C, \n"); 00169 source.append(" unsigned int C_start1, unsigned int C_start2, \n"); 00170 source.append(" unsigned int C_inc1, unsigned int C_inc2, \n"); 00171 source.append(" unsigned int C_internal_size1, unsigned int C_internal_size2 \n"); 00172 } 00173 source.append(") { \n"); 00174 00175 if (cfg.a == VIENNACL_AMBM_CPU) 00176 { 00177 source.append(" "); source.append(numeric_string); source.append(" alpha = fac2; \n"); 00178 } 00179 else if (cfg.a == VIENNACL_AMBM_GPU) 00180 { 00181 source.append(" "); source.append(numeric_string); source.append(" alpha = fac2[0]; \n"); 00182 } 00183 source.append(" if (options2 & (1 << 0)) \n"); 00184 source.append(" alpha = -alpha; \n"); 00185 source.append(" \n"); 00186 00187 if (cfg.b == VIENNACL_AMBM_CPU) 00188 { 00189 source.append(" "); source.append(numeric_string); source.append(" beta = fac3; \n"); 00190 } 00191 else if (cfg.b == VIENNACL_AMBM_GPU) 00192 { 00193 source.append(" "); source.append(numeric_string); source.append(" beta = fac3[0]; \n"); 00194 } 00195 if (cfg.b != VIENNACL_AMBM_NONE) 00196 { 00197 source.append(" if (options3 & (1 << 0)) \n"); 00198 source.append(" beta = -beta; \n"); 00199 source.append(" \n"); 00200 } 00201 source.append(" if (options2 & (1 << 1)) { \n"); 00202 if (cfg.b != VIENNACL_AMBM_NONE) 00203 { 00204 source.append(" if (options3 & (1 << 1)) {\n"); 00205 generate_ambm_impl2(source, cfg, false, false); 00206 source.append(" } else {\n"); 00207 generate_ambm_impl2(source, cfg, false, true); 00208 source.append(" } \n"); 00209 } 00210 else 00211 generate_ambm_impl2(source, cfg, false, true); 00212 source.append(" } else { \n"); 00213 if (cfg.b != VIENNACL_AMBM_NONE) 00214 { 00215 source.append(" if (options3 & (1 << 1)) {\n"); 00216 generate_ambm_impl2(source, cfg, true, false); 00217 source.append(" } else {\n"); 00218 generate_ambm_impl2(source, cfg, true, true); 00219 source.append(" } \n"); 00220 } 00221 else 00222 generate_ambm_impl2(source, cfg, true, true); 00223 source.append(" } \n"); 00224 source.append("} \n"); 00225 } 00226 00227 template <typename StringType> 00228 void generate_ambm(StringType & source, std::string const & numeric_string, bool is_row_major) 00229 { 00230 ambm_config cfg; 00231 cfg.assign_op = "="; 00232 cfg.with_stride_and_range = true; 00233 cfg.is_row_major = is_row_major; 00234 00235 // am 00236 cfg.b = VIENNACL_AMBM_NONE; cfg.a = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg); 00237 cfg.b = VIENNACL_AMBM_NONE; cfg.a = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg); 00238 00239 // ambm 00240 cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg); 00241 cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg); 00242 cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg); 00243 cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg); 00244 00245 // ambm_m 00246 cfg.assign_op = "+="; 00247 00248 cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg); 00249 cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg); 00250 cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg); 00251 cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg); 00252 } 00253 00254 template <typename StringType> 00255 void generate_assign_cpu(StringType & source, std::string const & numeric_string, bool is_row_major) 00256 { 00257 source.append("__kernel void assign_cpu( \n"); 00258 source.append(" __global "); source.append(numeric_string); source.append(" * A, \n"); 00259 source.append(" unsigned int A_start1, unsigned int A_start2, \n"); 00260 source.append(" unsigned int A_inc1, unsigned int A_inc2, \n"); 00261 source.append(" unsigned int A_size1, unsigned int A_size2, \n"); 00262 source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n"); 00263 source.append(" "); source.append(numeric_string); source.append(" alpha) \n"); 00264 source.append("{ \n"); 00265 if (is_row_major) 00266 { 00267 source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0);\n"); 00268 source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0);\n"); 00269 source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n"); 00270 source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n"); 00271 source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = alpha; \n"); 00272 } 00273 else 00274 { 00275 source.append(" unsigned int row_gid = get_global_id(0) % get_local_size(0);\n"); 00276 source.append(" unsigned int col_gid = get_global_id(0) / get_local_size(0);\n"); 00277 source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n"); 00278 source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n"); 00279 source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = alpha; \n"); 00280 } 00281 source.append("} \n"); 00282 } 00283 00284 template <typename StringType> 00285 void generate_diagonal_assign_cpu(StringType & source, std::string const & numeric_string, bool is_row_major) 00286 { 00287 source.append("__kernel void diagonal_assign_cpu( \n"); 00288 source.append(" __global "); source.append(numeric_string); source.append(" * A, \n"); 00289 source.append(" unsigned int A_start1, unsigned int A_start2, \n"); 00290 source.append(" unsigned int A_inc1, unsigned int A_inc2, \n"); 00291 source.append(" unsigned int A_size1, unsigned int A_size2, \n"); 00292 source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n"); 00293 source.append(" "); source.append(numeric_string); source.append(" alpha) \n"); 00294 source.append("{ \n"); 00295 source.append(" for (unsigned int idx = get_global_id(0); idx < min(A_size1, A_size2); idx += get_global_size(0))\n"); 00296 if (is_row_major) 00297 source.append(" A[(idx * A_inc1 + A_start1) * A_internal_size2 + (idx * A_inc2 + A_start2)] = alpha; \n"); 00298 else 00299 source.append(" A[(idx * A_inc1 + A_start1) + (idx * A_inc2 + A_start2) * A_internal_size1] = alpha; \n"); 00300 source.append("} \n"); 00301 } 00302 00303 template <typename StringType> 00304 void generate_element_op(StringType & source, std::string const & numeric_string, bool is_row_major) 00305 { 00306 source.append("__kernel void element_op( \n"); 00307 source.append(" __global "); source.append(numeric_string); source.append(" * A, \n"); 00308 source.append(" unsigned int A_start1, unsigned int A_start2, \n"); 00309 source.append(" unsigned int A_inc1, unsigned int A_inc2, \n"); 00310 source.append(" unsigned int A_size1, unsigned int A_size2, \n"); 00311 source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n"); 00312 source.append(" __global "); source.append(numeric_string); source.append(" * B, \n"); 00313 source.append(" unsigned int B_start1, unsigned int B_start2, \n"); 00314 source.append(" unsigned int B_inc1, unsigned int B_inc2, \n"); 00315 source.append(" unsigned int B_internal_size1, unsigned int B_internal_size2, \n"); 00316 source.append(" __global "); source.append(numeric_string); source.append(" * C, \n"); 00317 source.append(" unsigned int C_start1, unsigned int C_start2, \n"); 00318 source.append(" unsigned int C_inc1, unsigned int C_inc2, \n"); 00319 source.append(" unsigned int C_internal_size1, unsigned int C_internal_size2, \n"); 00320 source.append(" unsigned int op_type) \n"); //0: product, 1: division, 2: pow 00321 source.append("{ \n"); 00322 if (is_row_major) 00323 { 00324 source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0);\n"); 00325 source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0);\n"); 00326 source.append(" if (op_type == 2) {"); 00327 if (numeric_string == "float" || numeric_string == "double") 00328 { 00329 source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n"); 00330 source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n"); 00331 source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n"); 00332 source.append(" pow(B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)], \n"); 00333 source.append(" C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]); \n"); 00334 } 00335 source.append(" } else if (op_type == 1) {"); 00336 source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n"); 00337 source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n"); 00338 source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n"); 00339 source.append(" B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / \n"); 00340 source.append(" C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; \n"); 00341 source.append(" } else if (op_type == 0) {"); 00342 source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n"); 00343 source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n"); 00344 source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n"); 00345 source.append(" B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * \n"); 00346 source.append(" C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; \n"); 00347 source.append(" }"); 00348 } 00349 else 00350 { 00351 source.append(" unsigned int row_gid = get_global_id(0) % get_local_size(0);\n"); 00352 source.append(" unsigned int col_gid = get_global_id(0) / get_local_size(0);\n"); 00353 source.append(" if (op_type == 2) {"); 00354 if (numeric_string == "float" || numeric_string == "double") 00355 { 00356 source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n"); 00357 source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n"); 00358 source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = \n"); 00359 source.append(" pow(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1], \n"); 00360 source.append(" C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]); \n"); 00361 } 00362 source.append(" } else if (op_type == 1) {"); 00363 source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n"); 00364 source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n"); 00365 source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = \n"); 00366 source.append(" B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / \n"); 00367 source.append(" C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]; \n"); 00368 source.append(" } else if (op_type == 0) {"); 00369 source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n"); 00370 source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n"); 00371 source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = \n"); 00372 source.append(" B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * \n"); 00373 source.append(" C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]; \n"); 00374 source.append(" }"); 00375 } 00376 source.append("} \n"); 00377 } 00378 00379 00380 template <typename StringType> 00381 void generate_fft(StringType & source, std::string const & numeric_string, bool is_row_major) 00382 { 00383 // naive fourier transform (quadratic complexity, use for reference only) 00384 source.append("__kernel void fft_direct(__global "); source.append(numeric_string); source.append("2 *input, \n"); 00385 source.append(" __global "); source.append(numeric_string); source.append("2 *output, \n"); 00386 source.append(" unsigned int size, \n"); 00387 source.append(" unsigned int stride, \n"); 00388 source.append(" unsigned int batch_num, \n"); 00389 source.append(" "); source.append(numeric_string); source.append(" sign) { \n"); 00390 source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n"); 00391 source.append(" \n"); 00392 source.append(" for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n"); 00393 source.append(" for(unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) { \n"); 00394 source.append(" "); source.append(numeric_string); source.append("2 f = 0.0f; \n"); 00395 source.append(" \n"); 00396 source.append(" for(unsigned int n = 0; n < size; n++) { \n"); 00397 source.append(" "); source.append(numeric_string); source.append("2 in = "); 00398 if (is_row_major) 00399 source.append("input[batch_id * stride + n]; \n"); //input index here 00400 else 00401 source.append("input[n * stride + batch_id]; \n"); //input index here 00402 source.append(" \n"); 00403 source.append(" "); source.append(numeric_string); source.append(" sn, cs; \n"); 00404 source.append(" "); source.append(numeric_string); source.append(" arg = sign * 2 * NUM_PI * k / size * n; \n"); 00405 source.append(" sn = sincos(arg, &cs); \n"); 00406 source.append(" \n"); 00407 source.append(" "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n"); 00408 source.append(" f = f + ("); source.append(numeric_string); source.append("2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x); \n"); 00409 source.append(" } \n"); 00410 source.append(" \n"); 00411 if (is_row_major) 00412 source.append(" output[batch_id * stride + k] = f; \n"); // output index here 00413 else 00414 source.append(" output[k * stride + batch_id] = f; \n"); // output index here 00415 source.append(" } \n"); 00416 source.append(" } \n"); 00417 source.append("} \n"); 00418 00419 source.append(" \n"); 00420 00421 source.append("__kernel void fft_radix2(__global "); source.append(numeric_string); source.append("2* input, \n"); 00422 source.append(" unsigned int s, \n"); 00423 source.append(" unsigned int bit_size, \n"); 00424 source.append(" unsigned int size, \n"); 00425 source.append(" unsigned int stride, \n"); 00426 source.append(" unsigned int batch_num, \n"); 00427 source.append(" "); source.append(numeric_string); source.append(" sign) { \n"); 00428 source.append(" \n"); 00429 source.append(" unsigned int ss = 1 << s; \n"); 00430 source.append(" unsigned int half_size = size >> 1; \n"); 00431 source.append(" \n"); 00432 source.append(" "); source.append(numeric_string); source.append(" cs, sn; \n"); 00433 source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n"); 00434 source.append(" \n"); 00435 source.append(" unsigned int glb_id = get_global_id(0); \n"); 00436 source.append(" unsigned int glb_sz = get_global_size(0); \n"); 00437 00438 source.append(" for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n"); 00439 source.append(" for(unsigned int tid = glb_id; tid < half_size; tid += glb_sz) { \n"); 00440 source.append(" unsigned int group = (tid & (ss - 1)); \n"); 00441 source.append(" unsigned int pos = ((tid >> s) << (s + 1)) + group; \n"); 00442 00443 if (is_row_major) 00444 { 00445 source.append(" unsigned int offset = batch_id * stride + pos; \n"); 00446 source.append(" "); source.append(numeric_string); source.append("2 in1 = input[offset]; \n"); //index 00447 source.append(" "); source.append(numeric_string); source.append("2 in2 = input[offset + ss]; \n");//index 00448 } 00449 else 00450 { 00451 source.append(" unsigned int offset = pos * stride + batch_id; \n"); 00452 source.append(" "); source.append(numeric_string); source.append("2 in1 = input[offset]; \n"); //index 00453 source.append(" "); source.append(numeric_string); source.append("2 in2 = input[offset + ss * stride]; \n");//index 00454 } 00455 00456 source.append(" "); source.append(numeric_string); source.append(" arg = group * sign * NUM_PI / ss; \n"); 00457 00458 source.append(" sn = sincos(arg, &cs); \n"); 00459 00460 source.append(" "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n"); 00461 00462 source.append(" "); source.append(numeric_string); source.append("2 tmp = ("); source.append(numeric_string); source.append("2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n"); 00463 00464 if (is_row_major) 00465 source.append(" input[offset + ss] = in1 - tmp; \n");//index 00466 else 00467 source.append(" input[offset + ss * stride] = in1 - tmp; \n");//index 00468 source.append(" input[offset] = in1 + tmp; \n");//index 00469 source.append(" } \n"); 00470 source.append(" } \n"); 00471 source.append("} \n"); 00472 00473 source.append(" \n"); 00474 00475 source.append(" unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) { \n"); 00476 source.append(" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n"); 00477 source.append(" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n"); 00478 source.append(" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n"); 00479 source.append(" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n"); 00480 source.append(" v = (v >> 16) | (v << 16); \n"); 00481 source.append(" \n"); 00482 source.append(" v = v >> (32 - bit_size); \n"); 00483 source.append(" \n"); 00484 source.append(" return v; \n"); 00485 source.append(" } \n"); 00486 00487 source.append(" __kernel void fft_radix2_local(__global "); source.append(numeric_string); source.append("2* input, \n"); 00488 source.append(" __local "); source.append(numeric_string); source.append("2* lcl_input, \n"); 00489 source.append(" unsigned int bit_size, \n"); 00490 source.append(" unsigned int size, \n"); 00491 source.append(" unsigned int stride, \n"); 00492 source.append(" unsigned int batch_num, \n"); 00493 source.append(" "); source.append(numeric_string); source.append(" sign) { \n"); 00494 00495 source.append(" unsigned int grp_id = get_group_id(0); \n"); 00496 source.append(" unsigned int grp_num = get_num_groups(0); \n"); 00497 00498 source.append(" unsigned int lcl_sz = get_local_size(0); \n"); 00499 source.append(" unsigned int lcl_id = get_local_id(0); \n"); 00500 source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n"); 00501 00502 source.append(" for(unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) { \n"); 00503 //unsigned int base_offset = stride * batch_id; \n"); 00504 //copy chunk of global memory to local \n"); 00505 source.append(" for(unsigned int p = lcl_id; p < size; p += lcl_sz) { \n"); 00506 source.append(" unsigned int v = get_reorder_num(p, bit_size); \n"); 00507 if (is_row_major) 00508 source.append(" lcl_input[v] = input[batch_id * stride + p]; \n"); //index 00509 else 00510 source.append(" lcl_input[v] = input[p * stride + batch_id]; \n"); //index 00511 source.append(" } \n"); 00512 00513 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00514 00515 //performs Cooley-Tukey FFT on local array 00516 source.append(" for(unsigned int s = 0; s < bit_size; s++) { \n"); 00517 source.append(" unsigned int ss = 1 << s; \n"); 00518 00519 source.append(" "); source.append(numeric_string); source.append(" cs, sn; \n"); 00520 00521 source.append(" for(unsigned int tid = lcl_id; tid < size; tid += lcl_sz) { \n"); 00522 source.append(" unsigned int group = (tid & (ss - 1)); \n"); 00523 source.append(" unsigned int pos = ((tid >> s) << (s + 1)) + group; \n"); 00524 00525 source.append(" "); source.append(numeric_string); source.append("2 in1 = lcl_input[pos]; \n"); 00526 source.append(" "); source.append(numeric_string); source.append("2 in2 = lcl_input[pos + ss]; \n"); 00527 00528 source.append(" "); source.append(numeric_string); source.append(" arg = group * sign * NUM_PI / ss; \n"); 00529 00530 source.append(" sn = sincos(arg, &cs); \n"); 00531 source.append(" "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n"); 00532 00533 source.append(" "); source.append(numeric_string); source.append("2 tmp = ("); source.append(numeric_string); source.append("2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n"); 00534 00535 source.append(" lcl_input[pos + ss] = in1 - tmp; \n"); 00536 source.append(" lcl_input[pos] = in1 + tmp; \n"); 00537 source.append(" } \n"); 00538 00539 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00540 source.append(" } \n"); 00541 00542 //copy local array back to global memory 00543 source.append(" for(unsigned int p = lcl_id; p < size; p += lcl_sz) { \n"); 00544 if (is_row_major) 00545 source.append(" input[batch_id * stride + p] = lcl_input[p]; \n");//index 00546 else 00547 source.append(" input[p * stride + batch_id] = lcl_input[p]; \n");//index 00548 source.append(" } \n"); 00549 source.append(" } \n"); 00550 source.append(" } \n"); 00551 00552 source.append(" \n"); 00553 00554 // 00555 // Performs reordering of input data in bit-reversal order 00556 // Probably it's better to do in host side, 00557 // 00558 source.append("unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) { \n"); 00559 source.append(" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n"); 00560 source.append(" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n"); 00561 source.append(" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n"); 00562 source.append(" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n"); 00563 source.append(" v = (v >> 16) | (v << 16); \n"); 00564 00565 source.append(" v = v >> (32 - bit_size); \n"); 00566 00567 source.append(" return v; \n"); 00568 source.append("} \n"); 00569 00570 source.append("__kernel void fft_reorder(__global "); source.append(numeric_string); source.append("2* input, \n"); 00571 source.append(" unsigned int bit_size, \n"); 00572 source.append(" unsigned int size, \n"); 00573 source.append(" unsigned int stride, \n"); 00574 source.append(" int batch_num) { \n"); 00575 00576 source.append(" unsigned int glb_id = get_global_id(0); \n"); 00577 source.append(" unsigned int glb_sz = get_global_size(0); \n"); 00578 00579 source.append(" for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n"); 00580 source.append(" for(unsigned int i = glb_id; i < size; i += glb_sz) { \n"); 00581 source.append(" unsigned int v = get_reorder_num_2(i, bit_size); \n"); 00582 00583 source.append(" if(i < v) {\n"); 00584 if (is_row_major) 00585 { 00586 source.append(" "); source.append(numeric_string); source.append("2 tmp = input[batch_id * stride + i]; \n"); // index 00587 source.append(" input[batch_id * stride + i] = input[batch_id * stride + v]; \n"); //index 00588 source.append(" input[batch_id * stride + v] = tmp; \n"); //index 00589 } 00590 else 00591 { 00592 source.append(" "); source.append(numeric_string); source.append("2 tmp = input[i * stride + batch_id]; \n"); // index 00593 source.append(" input[i * stride + batch_id] = input[v * stride + batch_id]; \n"); //index 00594 source.append(" input[v * stride + batch_id] = tmp; \n"); //index 00595 } 00596 source.append(" } \n"); 00597 source.append(" } \n"); 00598 source.append(" } \n"); 00599 source.append("} \n"); 00600 } 00601 00602 template <typename StringType> 00603 void generate_lu(StringType & source, std::string const & numeric_string, bool is_row_major) 00604 { 00605 source.append("__kernel void lu_factorize( \n"); 00606 source.append(" __global "); source.append(numeric_string); source.append(" * matrix, \n"); 00607 source.append(" unsigned int matrix_rows, \n"); 00608 source.append(" unsigned int matrix_cols, \n"); 00609 source.append(" unsigned int matrix_internal_rows, \n"); 00610 source.append(" unsigned int matrix_internal_cols) \n"); 00611 source.append("{ \n"); 00612 source.append(" "); source.append(numeric_string); source.append(" temp; \n"); 00613 00614 if (is_row_major) 00615 { 00616 source.append(" unsigned rowi; \n"); 00617 source.append(" unsigned rowk; \n"); 00618 source.append(" for (unsigned int i=1; i<matrix_rows; ++i) \n"); 00619 source.append(" { \n"); 00620 source.append(" rowi = i * matrix_internal_cols; \n"); 00621 source.append(" for (unsigned int k=0; k<i; ++k) \n"); 00622 source.append(" { \n"); 00623 source.append(" rowk = k * matrix_internal_cols; \n"); 00624 source.append(" if (get_global_id(0) == 0) \n"); 00625 source.append(" matrix[rowi + k] /= matrix[rowk + k]; \n"); 00626 00627 source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n"); 00628 source.append(" temp = matrix[rowi + k]; \n"); 00629 00630 //parallel subtraction: 00631 source.append(" for (unsigned int j=k+1 + get_global_id(0); j<matrix_rows; j += get_global_size(0)) \n"); 00632 source.append(" matrix[rowi + j] -= temp * matrix[rowk + j]; \n"); 00633 } 00634 else 00635 { 00636 source.append(" for (unsigned int i=1; i<matrix_rows; ++i) \n"); 00637 source.append(" { \n"); 00638 source.append(" for (unsigned int k=0; k<i; ++k) \n"); 00639 source.append(" { \n"); 00640 00641 source.append(" if (get_global_id(0) == 0) \n"); 00642 source.append(" matrix[i + k*matrix_internal_rows] /= matrix[k + k*matrix_internal_rows]; \n"); 00643 00644 source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n"); 00645 source.append(" temp = matrix[i + k*matrix_internal_rows]; \n"); 00646 00647 //parallel subtraction: 00648 source.append(" for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0)) \n"); 00649 source.append(" matrix[i + j*matrix_internal_rows] -= temp * matrix[k + j*matrix_internal_rows]; \n"); 00650 } 00651 source.append(" }"); 00652 source.append(" }"); 00653 source.append("}"); 00654 } 00655 00656 00657 template <typename StringType> 00658 void generate_scaled_rank1_update(StringType & source, std::string const & numeric_string, bool is_row_major, bool alpha_on_cpu) 00659 { 00660 source.append("__kernel void scaled_rank1_update_"); alpha_on_cpu ? source.append("cpu") : source.append("gpu"); source.append("( \n"); 00661 source.append(" __global "); source.append(numeric_string); source.append(" * A, \n"); 00662 source.append(" unsigned int A_start1, unsigned int A_start2, \n"); 00663 source.append(" unsigned int A_inc1, unsigned int A_inc2, \n"); 00664 source.append(" unsigned int A_size1, unsigned int A_size2, \n"); 00665 source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n"); 00666 00667 if (alpha_on_cpu) { 00668 source.append(" "); source.append(numeric_string); source.append(" val, \n"); 00669 } else { 00670 source.append(" __global const "); source.append(numeric_string); source.append(" *val, \n"); 00671 } 00672 source.append(" unsigned int options2, \n"); 00673 00674 source.append(" __global const "); source.append(numeric_string); source.append(" * vec1, \n"); 00675 source.append(" unsigned int start1, \n"); 00676 source.append(" unsigned int inc1, \n"); 00677 source.append(" unsigned int size1, \n"); 00678 00679 source.append(" __global const "); source.append(numeric_string); source.append(" * vec2, \n"); 00680 source.append(" unsigned int start2, \n"); 00681 source.append(" unsigned int inc2, \n"); 00682 source.append(" unsigned int size2) \n"); 00683 source.append("{ \n"); 00684 00685 if (alpha_on_cpu) { 00686 source.append(" "); source.append(numeric_string); source.append(" alpha = val; \n"); 00687 } else { 00688 source.append(" "); source.append(numeric_string); source.append(" alpha = val[0]; \n"); 00689 } 00690 source.append(" if (options2 & (1 << 0)) \n"); 00691 source.append(" alpha = -alpha; \n"); 00692 00693 source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0); \n"); 00694 source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0); \n"); 00695 00696 source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) \n"); 00697 source.append(" { \n"); 00698 source.append(" "); source.append(numeric_string); source.append(" tmp = vec1[row * inc1 + start1];"); 00699 source.append(" tmp = (options2 & (1 << 1)) ? tmp / alpha : tmp * alpha;"); 00700 source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) \n"); 00701 if (is_row_major) 00702 source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2]; \n"); 00703 else 00704 source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2]; \n"); 00705 source.append(" } \n"); 00706 source.append("} \n"); 00707 } 00708 00709 template <typename StringType> 00710 void generate_trans_vec_mul(StringType & source, std::string const & numeric_string, bool is_row_major) 00711 { 00712 source.append("__kernel void trans_vec_mul( \n"); 00713 source.append(" __global const "); source.append(numeric_string); source.append(" * A, \n"); 00714 source.append(" unsigned int A_row_start, unsigned int A_col_start, \n"); 00715 source.append(" unsigned int A_row_inc, unsigned int A_col_inc, \n"); 00716 source.append(" unsigned int A_row_size, unsigned int A_col_size, \n"); 00717 source.append(" unsigned int A_internal_rows, unsigned int A_internal_cols, \n"); 00718 source.append(" __global const "); source.append(numeric_string); source.append(" * v, \n"); 00719 source.append(" unsigned int v_start, unsigned int v_inc, unsigned int v_size, \n"); 00720 source.append(" __global "); source.append(numeric_string); source.append(" * result, \n"); 00721 source.append(" unsigned int result_start, unsigned int result_inc, unsigned int result_size, \n"); 00722 source.append(" __local "); source.append(numeric_string); source.append(" * work) \n"); 00723 source.append("{ \n"); 00724 if (is_row_major) 00725 { 00726 source.append(" for (unsigned int row = get_global_id(0); row < A_col_size; row += get_global_size(0)) \n"); 00727 source.append(" { \n"); 00728 source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n"); 00729 source.append(" for (unsigned int col = 0; col < A_row_size; ++col) \n"); 00730 source.append(" dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col]; \n"); 00731 source.append(" result[row * result_inc + result_start] = dot_prod; \n"); 00732 } 00733 else 00734 { 00735 source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0); \n"); 00736 source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0); \n"); 00737 source.append(" unsigned int lid = get_local_id(0); \n"); 00738 00739 source.append(" for (unsigned int row = row_gid; row < A_col_size; row += get_num_groups(0)) \n"); 00740 source.append(" { \n"); 00741 source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n"); 00742 source.append(" for (unsigned int col = col_gid; col < A_row_size; col+=get_local_size(0)) \n"); 00743 source.append(" dot_prod += A[(row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col]; \n"); 00744 source.append(" work[lid] = dot_prod; \n"); 00745 00746 source.append(" for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1){ \n"); 00747 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00748 source.append(" if(lid < stride) \n"); 00749 source.append(" work[lid] += work[lid+stride]; \n"); 00750 source.append(" } \n"); 00751 00752 source.append(" if(lid == 0) \n"); 00753 source.append(" result[row * result_inc + result_start] = work[0]; \n"); 00754 } 00755 source.append(" } \n"); 00756 source.append("} \n"); 00757 } 00758 00759 template <typename StringType> 00760 void generate_triangular_substitute_inplace(StringType & source, std::string const & numeric_string, bool is_row_major) 00761 { 00762 source.append("__kernel void triangular_substitute_inplace( \n"); 00763 source.append(" __global "); source.append(numeric_string); source.append(" * A, \n"); 00764 source.append(" unsigned int A_start1, unsigned int A_start2, \n"); 00765 source.append(" unsigned int A_inc1, unsigned int A_inc2, \n"); 00766 source.append(" unsigned int A_size1, unsigned int A_size2, \n"); 00767 source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n"); 00768 source.append(" __global "); source.append(numeric_string); source.append(" * v, \n"); 00769 source.append(" unsigned int v_start, \n"); 00770 source.append(" unsigned int v_inc, \n"); 00771 source.append(" unsigned int v_size, \n"); 00772 source.append(" unsigned int options) \n"); 00773 source.append("{ \n"); 00774 source.append(" "); source.append(numeric_string); source.append(" temp; \n"); 00775 source.append(" unsigned int unit_diagonal_flag = (options & (1 << 0)); \n"); 00776 source.append(" unsigned int transposed_access_A = (options & (1 << 1)); \n"); 00777 source.append(" unsigned int is_lower_solve = (options & (1 << 2)); \n"); 00778 source.append(" unsigned int row; \n"); 00779 source.append(" for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed) \n"); //Note: A required to be square 00780 source.append(" { \n"); 00781 source.append(" row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1); \n"); 00782 source.append(" if (!unit_diagonal_flag) \n"); 00783 source.append(" { \n"); 00784 source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n"); 00785 source.append(" if (get_global_id(0) == 0) \n"); 00786 if (is_row_major) 00787 source.append(" v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; \n"); 00788 else 00789 source.append(" v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; \n"); 00790 source.append(" } \n"); 00791 00792 source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n"); 00793 00794 source.append(" temp = v[row * v_inc + v_start]; \n"); 00795 00796 source.append(" for (int elim = (is_lower_solve ? (row + get_global_id(0) + 1) : get_global_id(0)); \n"); 00797 source.append(" elim < (is_lower_solve ? A_size1 : row); \n"); 00798 source.append(" elim += get_global_size(0)) \n"); 00799 if (is_row_major) 00800 { 00801 source.append(" v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)) \n"); 00802 source.append(" : ((elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2))]; \n"); 00803 } 00804 else 00805 { 00806 source.append(" v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1) \n"); 00807 source.append(" : ((elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1)]; \n"); 00808 } 00809 source.append(" } \n"); 00810 source.append("} \n"); 00811 } 00812 00813 template <typename StringType> 00814 void generate_vec_mul(StringType & source, std::string const & numeric_string, bool is_row_major) 00815 { 00816 source.append("__kernel void vec_mul( \n"); 00817 source.append(" __global const "); source.append(numeric_string); source.append(" * A, \n"); 00818 source.append(" unsigned int A_row_start, unsigned int A_col_start, \n"); 00819 source.append(" unsigned int A_row_inc, unsigned int A_col_inc, \n"); 00820 source.append(" unsigned int A_row_size, unsigned int A_col_size, \n"); 00821 source.append(" unsigned int A_internal_rows, unsigned int A_internal_cols, \n"); 00822 source.append(" __global const "); source.append(numeric_string); source.append(" * v, \n"); 00823 source.append(" unsigned int v_start, unsigned int v_inc, unsigned int v_size, \n"); 00824 source.append(" __global "); source.append(numeric_string); source.append(" * result, \n"); 00825 source.append(" unsigned int result_start, unsigned int result_inc, unsigned int result_size, \n"); 00826 source.append(" __local "); source.append(numeric_string); source.append(" * work) \n"); 00827 source.append("{ \n"); 00828 if (is_row_major) 00829 { 00830 source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0); \n"); 00831 source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0); \n"); 00832 source.append(" unsigned int lid = get_local_id(0); \n"); 00833 00834 source.append(" for (unsigned int row = row_gid; row < A_row_size; row += get_num_groups(0)) \n"); 00835 source.append(" { \n"); 00836 source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n"); 00837 source.append(" for (unsigned int col = col_gid; col < A_col_size; col+=get_local_size(0)) \n"); 00838 source.append(" dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col]; \n"); 00839 source.append(" work[lid] = dot_prod; \n"); 00840 00841 source.append(" for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1){ \n"); 00842 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00843 source.append(" if(lid < stride) \n"); 00844 source.append(" work[lid] += work[lid+stride]; \n"); 00845 source.append(" } \n"); 00846 00847 source.append(" if(lid == 0) \n"); 00848 source.append(" result[row * result_inc + result_start] = work[0]; \n"); 00849 00850 } 00851 else 00852 { 00853 source.append(" for (unsigned int row = get_global_id(0); row < A_row_size; row += get_global_size(0)) \n"); 00854 source.append(" { \n"); 00855 source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n"); 00856 source.append(" for (unsigned int col = 0; col < A_col_size; ++col) \n"); 00857 source.append(" dot_prod += A[(row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * col]; \n"); 00858 source.append(" result[row * result_inc + result_start] = dot_prod; \n"); 00859 } 00860 source.append(" } \n"); 00861 source.append("} \n"); 00862 } 00863 00864 namespace detail 00865 { 00866 inline std::string type_to_string(viennacl::row_major) { return "row"; } 00867 inline std::string type_to_string(viennacl::column_major) { return "col"; } 00868 } 00869 00871 00872 // main kernel class 00874 template <typename NumericT, typename F> 00875 struct matrix 00876 { 00877 static std::string program_name() 00878 { 00879 return viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_" + detail::type_to_string(F()); 00880 } 00881 00882 static void init(viennacl::ocl::context & ctx) 00883 { 00884 viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx); 00885 std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply(); 00886 bool is_row_major = viennacl::is_row_major<F>::value; 00887 00888 static std::map<cl_context, bool> init_done; 00889 if (!init_done[ctx.handle().get()]) 00890 { 00891 std::string source; 00892 source.reserve(8192); 00893 00894 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source); 00895 00896 // fully parametrized kernels: 00897 generate_ambm(source, numeric_string, is_row_major); 00898 00899 // kernels with mostly predetermined skeleton: 00900 generate_assign_cpu(source, numeric_string, is_row_major); 00901 generate_diagonal_assign_cpu(source, numeric_string, is_row_major); 00902 generate_element_op(source, numeric_string, is_row_major); 00903 generate_scaled_rank1_update(source, numeric_string, is_row_major, true); 00904 generate_scaled_rank1_update(source, numeric_string, is_row_major, false); 00905 generate_trans_vec_mul(source, numeric_string, is_row_major); 00906 generate_vec_mul(source, numeric_string, is_row_major); 00907 00908 if (numeric_string == "float" || numeric_string == "double") 00909 { 00910 generate_fft(source, numeric_string, is_row_major); 00911 generate_lu(source, numeric_string, is_row_major); 00912 generate_triangular_substitute_inplace(source, numeric_string, is_row_major); 00913 } 00914 00915 std::string prog_name = program_name(); 00916 #ifdef VIENNACL_BUILD_INFO 00917 std::cout << "Creating program " << prog_name << std::endl; 00918 #endif 00919 ctx.add_program(source, prog_name); 00920 init_done[ctx.handle().get()] = true; 00921 } //if 00922 } //init 00923 }; 00924 00925 } // namespace kernels 00926 } // namespace opencl 00927 } // namespace linalg 00928 } // namespace viennacl 00929 #endif 00930