// ViennaCL - The Vienna Computing Library, version 1.5.0
00001 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_HPP 00002 #define VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_HPP 00003 00004 #include "viennacl/tools/tools.hpp" 00005 #include "viennacl/ocl/kernel.hpp" 00006 #include "viennacl/ocl/platform.hpp" 00007 #include "viennacl/ocl/utils.hpp" 00008 00011 namespace viennacl 00012 { 00013 namespace linalg 00014 { 00015 namespace opencl 00016 { 00017 namespace kernels 00018 { 00019 00021 00023 enum avbv_scalar_type 00024 { 00025 VIENNACL_AVBV_NONE = 0, // vector does not exist/contribute 00026 VIENNACL_AVBV_CPU, 00027 VIENNACL_AVBV_GPU 00028 }; 00029 00031 struct avbv_config 00032 { 00033 avbv_config() : with_stride_and_range(true), a(VIENNACL_AVBV_CPU), b(VIENNACL_AVBV_NONE) {} 00034 00035 bool with_stride_and_range; 00036 std::string assign_op; 00037 avbv_scalar_type a; 00038 avbv_scalar_type b; 00039 }; 00040 00041 // just returns the for-loop 00042 template <typename StringType> 00043 void generate_avbv_impl2(StringType & source, std::string const & /*numeric_string*/, avbv_config const & cfg, bool mult_alpha, bool mult_beta) 00044 { 00045 source.append(" for (unsigned int i = get_global_id(0); i < size1.z; i += get_global_size(0)) \n"); 00046 if (cfg.with_stride_and_range) 00047 { 00048 source.append(" vec1[i*size1.y+size1.x] "); source.append(cfg.assign_op); source.append(" vec2[i*size2.y+size2.x] "); 00049 if (mult_alpha) 00050 source.append("* alpha "); 00051 else 00052 source.append("/ alpha "); 00053 if (cfg.b != VIENNACL_AVBV_NONE) 00054 { 00055 source.append("+ vec3[i*size3.y+size3.x] "); 00056 if (mult_beta) 00057 source.append("* beta"); 00058 else 00059 source.append("/ beta"); 00060 } 00061 } 00062 else 00063 { 00064 source.append(" vec1[i] "); source.append(cfg.assign_op); source.append(" vec2[i] "); 00065 if (mult_alpha) 00066 source.append("* alpha "); 00067 else 00068 source.append("/ alpha "); 00069 if (cfg.b != VIENNACL_AVBV_NONE) 00070 { 00071 source.append("+ vec3[i] "); 00072 if (mult_beta) 00073 
source.append("* beta"); 00074 else 00075 source.append("/ beta"); 00076 } 00077 } 00078 source.append("; \n"); 00079 } 00080 00081 template <typename StringType> 00082 void generate_avbv_impl(StringType & source, std::string const & numeric_string, avbv_config const & cfg) 00083 { 00084 source.append("__kernel void av"); 00085 if (cfg.b != VIENNACL_AVBV_NONE) 00086 source.append("bv"); 00087 if (cfg.assign_op != "=") 00088 source.append("_v"); 00089 00090 if (cfg.a == VIENNACL_AVBV_CPU) 00091 source.append("_cpu"); 00092 else if (cfg.a == VIENNACL_AVBV_GPU) 00093 source.append("_gpu"); 00094 00095 if (cfg.b == VIENNACL_AVBV_CPU) 00096 source.append("_cpu"); 00097 else if (cfg.b == VIENNACL_AVBV_GPU) 00098 source.append("_gpu"); 00099 source.append("( \n"); 00100 source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n"); 00101 source.append(" uint4 size1, \n"); 00102 source.append(" \n"); 00103 if (cfg.a == VIENNACL_AVBV_CPU) 00104 { 00105 source.append(" "); source.append(numeric_string); source.append(" fac2, \n"); 00106 } 00107 else if (cfg.a == VIENNACL_AVBV_GPU) 00108 { 00109 source.append(" __global "); source.append(numeric_string); source.append(" * fac2, \n"); 00110 } 00111 source.append(" unsigned int options2, \n"); // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse 00112 source.append(" __global const "); source.append(numeric_string); source.append(" * vec2, \n"); 00113 source.append(" uint4 size2"); 00114 00115 if (cfg.b != VIENNACL_AVBV_NONE) 00116 { 00117 source.append(", \n\n"); 00118 if (cfg.b == VIENNACL_AVBV_CPU) 00119 { 00120 source.append(" "); source.append(numeric_string); source.append(" fac3, \n"); 00121 } 00122 else if (cfg.b == VIENNACL_AVBV_GPU) 00123 { 00124 source.append(" __global "); source.append(numeric_string); source.append(" * fac3, \n"); 00125 } 00126 source.append(" unsigned int options3, \n"); // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take 
inverse 00127 source.append(" __global const "); source.append(numeric_string); source.append(" * vec3, \n"); 00128 source.append(" uint4 size3 \n"); 00129 } 00130 source.append(") { \n"); 00131 00132 if (cfg.a == VIENNACL_AVBV_CPU) 00133 { 00134 source.append(" "); source.append(numeric_string); source.append(" alpha = fac2; \n"); 00135 } 00136 else if (cfg.a == VIENNACL_AVBV_GPU) 00137 { 00138 source.append(" "); source.append(numeric_string); source.append(" alpha = fac2[0]; \n"); 00139 } 00140 source.append(" if (options2 & (1 << 0)) \n"); 00141 source.append(" alpha = -alpha; \n"); 00142 source.append(" \n"); 00143 00144 if (cfg.b == VIENNACL_AVBV_CPU) 00145 { 00146 source.append(" "); source.append(numeric_string); source.append(" beta = fac3; \n"); 00147 } 00148 else if (cfg.b == VIENNACL_AVBV_GPU) 00149 { 00150 source.append(" "); source.append(numeric_string); source.append(" beta = fac3[0]; \n"); 00151 } 00152 if (cfg.b != VIENNACL_AVBV_NONE) 00153 { 00154 source.append(" if (options3 & (1 << 0)) \n"); 00155 source.append(" beta = -beta; \n"); 00156 source.append(" \n"); 00157 } 00158 source.append(" if (options2 & (1 << 1)) { \n"); 00159 if (cfg.b != VIENNACL_AVBV_NONE) 00160 { 00161 source.append(" if (options3 & (1 << 1)) {\n"); 00162 generate_avbv_impl2(source, numeric_string, cfg, false, false); 00163 source.append(" } else {\n"); 00164 generate_avbv_impl2(source, numeric_string, cfg, false, true); 00165 source.append(" } \n"); 00166 } 00167 else 00168 generate_avbv_impl2(source, numeric_string, cfg, false, true); 00169 source.append(" } else { \n"); 00170 if (cfg.b != VIENNACL_AVBV_NONE) 00171 { 00172 source.append(" if (options3 & (1 << 1)) {\n"); 00173 generate_avbv_impl2(source, numeric_string, cfg, true, false); 00174 source.append(" } else {\n"); 00175 generate_avbv_impl2(source, numeric_string, cfg, true, true); 00176 source.append(" } \n"); 00177 } 00178 else 00179 generate_avbv_impl2(source, numeric_string, cfg, true, true); 00180 
source.append(" } \n"); 00181 source.append("} \n"); 00182 } 00183 00184 template <typename StringType> 00185 void generate_avbv(StringType & source, std::string const & numeric_string) 00186 { 00187 avbv_config cfg; 00188 cfg.assign_op = "="; 00189 cfg.with_stride_and_range = true; 00190 00191 // av 00192 cfg.b = VIENNACL_AVBV_NONE; cfg.a = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg); 00193 cfg.b = VIENNACL_AVBV_NONE; cfg.a = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg); 00194 00195 // avbv 00196 cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg); 00197 cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg); 00198 cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg); 00199 cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg); 00200 00201 // avbv 00202 cfg.assign_op = "+="; 00203 00204 cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg); 00205 cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg); 00206 cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg); 00207 cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg); 00208 } 00209 00210 template <typename StringType> 00211 void generate_plane_rotation(StringType & source, std::string const & numeric_string) 00212 { 00213 source.append("__kernel void plane_rotation( \n"); 00214 source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n"); 00215 source.append(" unsigned int start1, \n"); 00216 source.append(" unsigned int inc1, \n"); 00217 source.append(" unsigned int size1, \n"); 00218 source.append(" __global "); source.append(numeric_string); source.append(" * 
vec2, \n"); 00219 source.append(" unsigned int start2, \n"); 00220 source.append(" unsigned int inc2, \n"); 00221 source.append(" unsigned int size2, \n"); 00222 source.append(" "); source.append(numeric_string); source.append(" alpha, \n"); 00223 source.append(" "); source.append(numeric_string); source.append(" beta) \n"); 00224 source.append("{ \n"); 00225 source.append(" "); source.append(numeric_string); source.append(" tmp1 = 0; \n"); 00226 source.append(" "); source.append(numeric_string); source.append(" tmp2 = 0; \n"); 00227 source.append(" \n"); 00228 source.append(" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n"); 00229 source.append(" { \n"); 00230 source.append(" tmp1 = vec1[i*inc1+start1]; \n"); 00231 source.append(" tmp2 = vec2[i*inc2+start2]; \n"); 00232 source.append(" \n"); 00233 source.append(" vec1[i*inc1+start1] = alpha * tmp1 + beta * tmp2; \n"); 00234 source.append(" vec2[i*inc2+start2] = alpha * tmp2 - beta * tmp1; \n"); 00235 source.append(" } \n"); 00236 source.append(" \n"); 00237 source.append("} \n"); 00238 } 00239 00240 template <typename StringType> 00241 void generate_vector_swap(StringType & source, std::string const & numeric_string) 00242 { 00243 source.append("__kernel void swap( \n"); 00244 source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n"); 00245 source.append(" unsigned int start1, \n"); 00246 source.append(" unsigned int inc1, \n"); 00247 source.append(" unsigned int size1, \n"); 00248 source.append(" __global "); source.append(numeric_string); source.append(" * vec2, \n"); 00249 source.append(" unsigned int start2, \n"); 00250 source.append(" unsigned int inc2, \n"); 00251 source.append(" unsigned int size2 \n"); 00252 source.append(" ) \n"); 00253 source.append("{ \n"); 00254 source.append(" "); source.append(numeric_string); source.append(" tmp; \n"); 00255 source.append(" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n"); 
00256 source.append(" { \n"); 00257 source.append(" tmp = vec2[i*inc2+start2]; \n"); 00258 source.append(" vec2[i*inc2+start2] = vec1[i*inc1+start1]; \n"); 00259 source.append(" vec1[i*inc1+start1] = tmp; \n"); 00260 source.append(" } \n"); 00261 source.append("} \n"); 00262 } 00263 00264 template <typename StringType> 00265 void generate_assign_cpu(StringType & source, std::string const & numeric_string) 00266 { 00267 source.append("__kernel void assign_cpu( \n"); 00268 source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n"); 00269 source.append(" unsigned int start1, \n"); 00270 source.append(" unsigned int inc1, \n"); 00271 source.append(" unsigned int size1, \n"); 00272 source.append(" unsigned int internal_size1, \n"); 00273 source.append(" "); source.append(numeric_string); source.append(" alpha) \n"); 00274 source.append("{ \n"); 00275 source.append(" for (unsigned int i = get_global_id(0); i < internal_size1; i += get_global_size(0)) \n"); 00276 source.append(" vec1[i*inc1+start1] = (i < size1) ? 
alpha : 0; \n"); 00277 source.append("} \n"); 00278 00279 } 00280 00281 template <typename StringType> 00282 void generate_inner_prod(StringType & source, std::string const & numeric_string, vcl_size_t vector_num) 00283 { 00284 std::stringstream ss; 00285 ss << vector_num; 00286 std::string vector_num_string = ss.str(); 00287 00288 source.append("__kernel void inner_prod"); source.append(vector_num_string); source.append("( \n"); 00289 source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n"); 00290 source.append(" uint4 params_x, \n"); 00291 for (vcl_size_t i=0; i<vector_num; ++i) 00292 { 00293 ss.str(""); 00294 ss << i; 00295 source.append(" __global const "); source.append(numeric_string); source.append(" * y"); source.append(ss.str()); source.append(", \n"); 00296 source.append(" uint4 params_y"); source.append(ss.str()); source.append(", \n"); 00297 } 00298 source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n"); 00299 source.append(" __global "); source.append(numeric_string); source.append(" * group_buffer) \n"); 00300 source.append("{ \n"); 00301 source.append(" unsigned int entries_per_thread = (params_x.z - 1) / get_global_size(0) + 1; \n"); 00302 source.append(" unsigned int vec_start_index = get_group_id(0) * get_local_size(0) * entries_per_thread; \n"); 00303 source.append(" unsigned int vec_stop_index = min((unsigned int)((get_group_id(0) + 1) * get_local_size(0) * entries_per_thread), params_x.z); \n"); 00304 00305 // compute partial results within group: 00306 for (vcl_size_t i=0; i<vector_num; ++i) 00307 { 00308 ss.str(""); 00309 ss << i; 00310 source.append(" "); source.append(numeric_string); source.append(" tmp"); source.append(ss.str()); source.append(" = 0; \n"); 00311 } 00312 source.append(" for (unsigned int i = vec_start_index + get_local_id(0); i < vec_stop_index; i += get_local_size(0)) { \n"); 00313 source.append(" "); source.append(numeric_string); source.append(" 
val_x = x[i*params_x.y + params_x.x]; \n"); 00314 for (vcl_size_t i=0; i<vector_num; ++i) 00315 { 00316 ss.str(""); 00317 ss << i; 00318 source.append(" tmp"); source.append(ss.str()); source.append(" += val_x * y"); source.append(ss.str()); source.append("[i * params_y"); source.append(ss.str()); source.append(".y + params_y"); source.append(ss.str()); source.append(".x]; \n"); 00319 } 00320 source.append(" } \n"); 00321 for (vcl_size_t i=0; i<vector_num; ++i) 00322 { 00323 ss.str(""); 00324 ss << i; 00325 source.append(" tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0)] = tmp"); source.append(ss.str()); source.append("; \n"); 00326 } 00327 00328 // now run reduction: 00329 source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n"); 00330 source.append(" { \n"); 00331 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00332 source.append(" if (get_local_id(0) < stride) { \n"); 00333 for (vcl_size_t i=0; i<vector_num; ++i) 00334 { 00335 ss.str(""); 00336 ss << i; 00337 source.append(" tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0)] += tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0) + stride]; \n"); 00338 } 00339 source.append(" } \n"); 00340 source.append(" } \n"); 00341 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00342 00343 source.append(" if (get_local_id(0) == 0) { \n"); 00344 for (vcl_size_t i=0; i<vector_num; ++i) 00345 { 00346 ss.str(""); 00347 ss << i; 00348 source.append(" group_buffer[get_group_id(0) + "); source.append(ss.str()); source.append(" * get_num_groups(0)] = tmp_buffer["); source.append(ss.str()); source.append(" * get_local_size(0)]; \n"); 00349 } 00350 source.append(" } \n"); 00351 source.append("} \n"); 00352 00353 } 00354 00355 template <typename StringType> 00356 void generate_norm(StringType & source, std::string const & numeric_string) 00357 { 00358 bool 
is_float_or_double = (numeric_string == "float" || numeric_string == "double"); 00359 00360 source.append(numeric_string); source.append(" impl_norm( \n"); 00361 source.append(" __global const "); source.append(numeric_string); source.append(" * vec, \n"); 00362 source.append(" unsigned int start1, \n"); 00363 source.append(" unsigned int inc1, \n"); 00364 source.append(" unsigned int size1, \n"); 00365 source.append(" unsigned int norm_selector, \n"); 00366 source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer) \n"); 00367 source.append("{ \n"); 00368 source.append(" "); source.append(numeric_string); source.append(" tmp = 0; \n"); 00369 source.append(" if (norm_selector == 1) \n"); //norm_1 00370 source.append(" { \n"); 00371 source.append(" for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n"); 00372 if (is_float_or_double) 00373 source.append(" tmp += fabs(vec[i*inc1 + start1]); \n"); 00374 else 00375 source.append(" tmp += abs(vec[i*inc1 + start1]); \n"); 00376 source.append(" } \n"); 00377 source.append(" else if (norm_selector == 2) \n"); //norm_2 00378 source.append(" { \n"); 00379 source.append(" "); source.append(numeric_string); source.append(" vec_entry = 0; \n"); 00380 source.append(" for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n"); 00381 source.append(" { \n"); 00382 source.append(" vec_entry = vec[i*inc1 + start1]; \n"); 00383 source.append(" tmp += vec_entry * vec_entry; \n"); 00384 source.append(" } \n"); 00385 source.append(" } \n"); 00386 source.append(" else if (norm_selector == 0) \n"); //norm_inf 00387 source.append(" { \n"); 00388 source.append(" for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n"); 00389 if (is_float_or_double) 00390 source.append(" tmp = fmax(fabs(vec[i*inc1 + start1]), tmp); \n"); 00391 else 00392 { 00393 source.append(" tmp = max(("); source.append(numeric_string); source.append(")abs(vec[i*inc1 + start1]), 
tmp); \n"); 00394 } 00395 source.append(" } \n"); 00396 00397 source.append(" tmp_buffer[get_local_id(0)] = tmp; \n"); 00398 00399 source.append(" if (norm_selector > 0) \n"); //norm_1 or norm_2: 00400 source.append(" { \n"); 00401 source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n"); 00402 source.append(" { \n"); 00403 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00404 source.append(" if (get_local_id(0) < stride) \n"); 00405 source.append(" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride]; \n"); 00406 source.append(" } \n"); 00407 source.append(" return tmp_buffer[0]; \n"); 00408 source.append(" } \n"); 00409 00410 //norm_inf: 00411 source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n"); 00412 source.append(" { \n"); 00413 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00414 source.append(" if (get_local_id(0) < stride) \n"); 00415 if (is_float_or_double) 00416 source.append(" tmp_buffer[get_local_id(0)] = fmax(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]); \n"); 00417 else 00418 source.append(" tmp_buffer[get_local_id(0)] = max(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]); \n"); 00419 source.append(" } \n"); 00420 00421 source.append(" return tmp_buffer[0]; \n"); 00422 source.append("}; \n"); 00423 00424 source.append("__kernel void norm( \n"); 00425 source.append(" __global const "); source.append(numeric_string); source.append(" * vec, \n"); 00426 source.append(" unsigned int start1, \n"); 00427 source.append(" unsigned int inc1, \n"); 00428 source.append(" unsigned int size1, \n"); 00429 source.append(" unsigned int norm_selector, \n"); 00430 source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n"); 00431 source.append(" __global "); source.append(numeric_string); source.append(" * group_buffer) \n"); 00432 source.append("{ \n"); 00433 source.append(" "); 
source.append(numeric_string); source.append(" tmp = impl_norm(vec, \n"); 00434 source.append(" ( get_group_id(0) * size1) / get_num_groups(0) * inc1 + start1, \n"); 00435 source.append(" inc1, \n"); 00436 source.append(" ( (1 + get_group_id(0)) * size1) / get_num_groups(0) \n"); 00437 source.append(" - ( get_group_id(0) * size1) / get_num_groups(0), \n"); 00438 source.append(" norm_selector, \n"); 00439 source.append(" tmp_buffer); \n"); 00440 00441 source.append(" if (get_local_id(0) == 0) \n"); 00442 source.append(" group_buffer[get_group_id(0)] = tmp; \n"); 00443 source.append("} \n"); 00444 00445 } 00446 00447 template <typename StringType> 00448 void generate_inner_prod_sum(StringType & source, std::string const & numeric_string) 00449 { 00450 // sums the array 'vec1' and writes to result. Makes use of a single work-group only. 00451 source.append("__kernel void sum_inner_prod( \n"); 00452 source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n"); 00453 source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n"); 00454 source.append(" __global "); source.append(numeric_string); source.append(" * result, \n"); 00455 source.append(" unsigned int start_result, \n"); 00456 source.append(" unsigned int inc_result) \n"); 00457 source.append("{ \n"); 00458 source.append(" tmp_buffer[get_local_id(0)] = vec1[get_global_id(0)]; \n"); 00459 00460 source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n"); 00461 source.append(" { \n"); 00462 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00463 source.append(" if (get_local_id(0) < stride) \n"); 00464 source.append(" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0) + stride]; \n"); 00465 source.append(" } \n"); 00466 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00467 00468 source.append(" if (get_local_id(0) == 0) \n"); 00469 source.append(" result[start_result + inc_result * get_group_id(0)] = 
tmp_buffer[0]; \n"); 00470 source.append("} \n"); 00471 00472 } 00473 00474 template <typename StringType> 00475 void generate_sum(StringType & source, std::string const & numeric_string) 00476 { 00477 // sums the array 'vec1' and writes to result. Makes use of a single work-group only. 00478 source.append("__kernel void sum( \n"); 00479 source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n"); 00480 source.append(" unsigned int start1, \n"); 00481 source.append(" unsigned int inc1, \n"); 00482 source.append(" unsigned int size1, \n"); 00483 source.append(" unsigned int option, \n"); //0: use fmax, 1: just sum, 2: sum and return sqrt of sum 00484 source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n"); 00485 source.append(" __global "); source.append(numeric_string); source.append(" * result) \n"); 00486 source.append("{ \n"); 00487 source.append(" "); source.append(numeric_string); source.append(" thread_sum = 0; \n"); 00488 source.append(" "); source.append(numeric_string); source.append(" tmp = 0; \n"); 00489 source.append(" for (unsigned int i = get_local_id(0); i<size1; i += get_local_size(0)) \n"); 00490 source.append(" { \n"); 00491 source.append(" if (option > 0) \n"); 00492 source.append(" thread_sum += vec1[i*inc1+start1]; \n"); 00493 source.append(" else \n"); 00494 source.append(" { \n"); 00495 source.append(" tmp = vec1[i*inc1+start1]; \n"); 00496 source.append(" tmp = (tmp < 0) ? -tmp : tmp; \n"); 00497 source.append(" thread_sum = (thread_sum > tmp) ? 
thread_sum : tmp; \n"); 00498 source.append(" } \n"); 00499 source.append(" } \n"); 00500 00501 source.append(" tmp_buffer[get_local_id(0)] = thread_sum; \n"); 00502 00503 source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n"); 00504 source.append(" { \n"); 00505 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00506 source.append(" if (get_local_id(0) < stride) \n"); 00507 source.append(" { \n"); 00508 source.append(" if (option > 0) \n"); 00509 source.append(" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0) + stride]; \n"); 00510 source.append(" else \n"); 00511 source.append(" tmp_buffer[get_local_id(0)] = (tmp_buffer[get_local_id(0)] > tmp_buffer[get_local_id(0) + stride]) ? tmp_buffer[get_local_id(0)] : tmp_buffer[get_local_id(0) + stride]; \n"); 00512 source.append(" } \n"); 00513 source.append(" } \n"); 00514 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00515 00516 source.append(" if (get_global_id(0) == 0) \n"); 00517 source.append(" { \n"); 00518 if (numeric_string == "float" || numeric_string == "double") 00519 { 00520 source.append(" if (option == 2) \n"); 00521 source.append(" *result = sqrt(tmp_buffer[0]); \n"); 00522 source.append(" else \n"); 00523 } 00524 source.append(" *result = tmp_buffer[0]; \n"); 00525 source.append(" } \n"); 00526 source.append("} \n"); 00527 00528 } 00529 00530 template <typename StringType> 00531 void generate_index_norm_inf(StringType & source, std::string const & numeric_string) 00532 { 00533 //index_norm_inf: 00534 source.append("unsigned int index_norm_inf_impl( \n"); 00535 source.append(" __global const "); source.append(numeric_string); source.append(" * vec, \n"); 00536 source.append(" unsigned int start1, \n"); 00537 source.append(" unsigned int inc1, \n"); 00538 source.append(" unsigned int size1, \n"); 00539 source.append(" __local "); source.append(numeric_string); source.append(" * entry_buffer, \n"); 00540 source.append(" __local unsigned int * 
index_buffer) \n"); 00541 source.append("{ \n"); 00542 //step 1: fill buffer: 00543 source.append(" "); source.append(numeric_string); source.append(" cur_max = 0; \n"); 00544 source.append(" "); source.append(numeric_string); source.append(" tmp; \n"); 00545 source.append(" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n"); 00546 source.append(" { \n"); 00547 if (numeric_string == "float" || numeric_string == "double") 00548 source.append(" tmp = fabs(vec[i*inc1+start1]); \n"); 00549 else 00550 source.append(" tmp = abs(vec[i*inc1+start1]); \n"); 00551 source.append(" if (cur_max < tmp) \n"); 00552 source.append(" { \n"); 00553 source.append(" entry_buffer[get_global_id(0)] = tmp; \n"); 00554 source.append(" index_buffer[get_global_id(0)] = i; \n"); 00555 source.append(" cur_max = tmp; \n"); 00556 source.append(" } \n"); 00557 source.append(" } \n"); 00558 00559 //step 2: parallel reduction: 00560 source.append(" for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2) \n"); 00561 source.append(" { \n"); 00562 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00563 source.append(" if (get_global_id(0) < stride) \n"); 00564 source.append(" { \n"); 00565 //find the first occurring index 00566 source.append(" if (entry_buffer[get_global_id(0)] < entry_buffer[get_global_id(0)+stride]) \n"); 00567 source.append(" { \n"); 00568 source.append(" index_buffer[get_global_id(0)] = index_buffer[get_global_id(0)+stride]; \n"); 00569 source.append(" entry_buffer[get_global_id(0)] = entry_buffer[get_global_id(0)+stride]; \n"); 00570 source.append(" } \n"); 00571 source.append(" } \n"); 00572 source.append(" } \n"); 00573 source.append(" \n"); 00574 source.append(" return index_buffer[0]; \n"); 00575 source.append("} \n"); 00576 00577 source.append("__kernel void index_norm_inf( \n"); 00578 source.append(" __global "); source.append(numeric_string); source.append(" * vec, \n"); 00579 source.append(" unsigned int start1, \n"); 
00580 source.append(" unsigned int inc1, \n"); 00581 source.append(" unsigned int size1, \n"); 00582 source.append(" __local "); source.append(numeric_string); source.append(" * entry_buffer, \n"); 00583 source.append(" __local unsigned int * index_buffer, \n"); 00584 source.append(" __global unsigned int * result) \n"); 00585 source.append("{ \n"); 00586 source.append(" entry_buffer[get_global_id(0)] = 0; \n"); 00587 source.append(" index_buffer[get_global_id(0)] = 0; \n"); 00588 source.append(" unsigned int tmp = index_norm_inf_impl(vec, start1, inc1, size1, entry_buffer, index_buffer); \n"); 00589 source.append(" if (get_global_id(0) == 0) *result = tmp; \n"); 00590 source.append("} \n"); 00591 00592 } 00593 00594 00596 00597 // main kernel class 00599 template <class TYPE> 00600 struct vector 00601 { 00602 static std::string program_name() 00603 { 00604 return viennacl::ocl::type_to_string<TYPE>::apply() + "_vector"; 00605 } 00606 00607 static void init(viennacl::ocl::context & ctx) 00608 { 00609 viennacl::ocl::DOUBLE_PRECISION_CHECKER<TYPE>::apply(ctx); 00610 std::string numeric_string = viennacl::ocl::type_to_string<TYPE>::apply(); 00611 00612 static std::map<cl_context, bool> init_done; 00613 if (!init_done[ctx.handle().get()]) 00614 { 00615 std::string source; 00616 source.reserve(8192); 00617 00618 viennacl::ocl::append_double_precision_pragma<TYPE>(ctx, source); 00619 00620 // fully parametrized kernels: 00621 generate_avbv(source, numeric_string); 00622 00623 // kernels with mostly predetermined skeleton: 00624 generate_plane_rotation(source, numeric_string); 00625 generate_vector_swap(source, numeric_string); 00626 generate_assign_cpu(source, numeric_string); 00627 00628 generate_inner_prod(source, numeric_string, 1); 00629 generate_norm(source, numeric_string); 00630 generate_sum(source, numeric_string); 00631 generate_index_norm_inf(source, numeric_string); 00632 00633 std::string prog_name = program_name(); 00634 #ifdef VIENNACL_BUILD_INFO 00635 
std::cout << "Creating program " << prog_name << std::endl; 00636 #endif 00637 ctx.add_program(source, prog_name); 00638 init_done[ctx.handle().get()] = true; 00639 } //if 00640 } //init 00641 }; 00642 00643 // class with kernels for multiple inner products. 00645 template <class TYPE> 00646 struct vector_multi_inner_prod 00647 { 00648 static std::string program_name() 00649 { 00650 return viennacl::ocl::type_to_string<TYPE>::apply() + "_vector_multi"; 00651 } 00652 00653 static void init(viennacl::ocl::context & ctx) 00654 { 00655 viennacl::ocl::DOUBLE_PRECISION_CHECKER<TYPE>::apply(ctx); 00656 std::string numeric_string = viennacl::ocl::type_to_string<TYPE>::apply(); 00657 00658 static std::map<cl_context, bool> init_done; 00659 if (!init_done[ctx.handle().get()]) 00660 { 00661 std::string source; 00662 source.reserve(8192); 00663 00664 viennacl::ocl::append_double_precision_pragma<TYPE>(ctx, source); 00665 00666 generate_inner_prod(source, numeric_string, 2); 00667 generate_inner_prod(source, numeric_string, 3); 00668 generate_inner_prod(source, numeric_string, 4); 00669 generate_inner_prod(source, numeric_string, 8); 00670 00671 generate_inner_prod_sum(source, numeric_string); 00672 00673 std::string prog_name = program_name(); 00674 #ifdef VIENNACL_BUILD_INFO 00675 std::cout << "Creating program " << prog_name << std::endl; 00676 #endif 00677 ctx.add_program(source, prog_name); 00678 init_done[ctx.handle().get()] = true; 00679 } //if 00680 } //init 00681 }; 00682 00683 } // namespace kernels 00684 } // namespace opencl 00685 } // namespace linalg 00686 } // namespace viennacl 00687 #endif 00688