ViennaCL - The Vienna Computing Library
1.5.0
|
00001 #ifndef VIENNACL_GENERATOR_GENERATE_SCALAR_REDUCTION_HPP 00002 #define VIENNACL_GENERATOR_GENERATE_SCALAR_REDUCTION_HPP 00003 00004 /* ========================================================================= 00005 Copyright (c) 2010-2013, Institute for Microelectronics, 00006 Institute for Analysis and Scientific Computing, 00007 TU Wien. 00008 Portions of this software are copyright by UChicago Argonne, LLC. 00009 00010 ----------------- 00011 ViennaCL - The Vienna Computing Library 00012 ----------------- 00013 00014 Project Head: Karl Rupp rupp@iue.tuwien.ac.at 00015 00016 (A list of authors and contributors can be found in the PDF manual) 00017 00018 License: MIT (X11), see file LICENSE in the base directory 00019 ============================================================================= */ 00020 00021 00027 #include <vector> 00028 00029 #include "viennacl/backend/opencl.hpp" 00030 00031 #include "viennacl/scheduler/forwards.h" 00032 00033 #include "viennacl/generator/helpers.hpp" 00034 #include "viennacl/generator/utils.hpp" 00035 00036 #include "viennacl/generator/profile_base.hpp" 00037 00038 #include "viennacl/tools/tools.hpp" 00039 00040 namespace viennacl{ 00041 00042 namespace generator{ 00043 00045 class scalar_reduction : public profile_base{ 00046 private: 00047 typedef std::vector<std::pair<const char *, viennacl::ocl::handle<cl_mem> > > temporaries_type; 00048 00049 static void fill_scalartypes(statements_type statements, std::vector<const char *> & res){ 00050 res.reserve(statements.size()); 00051 for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){ 00052 if (it->second.lhs.type_family == scheduler::SCALAR_TYPE_FAMILY) 00053 { 00054 switch(it->second.lhs.numeric_type){ 00055 case scheduler::FLOAT_TYPE: 00056 res.push_back("float"); 00057 break; 00058 case scheduler::DOUBLE_TYPE: 00059 res.push_back("double"); 00060 break; 00061 default: 00062 res.push_back(""); 00063 break; 00064 } 00065 } 00066 else 00067 { 00068 res.push_back(""); 00069 } 00070 } 00071 } 00072 00073 public: 00074 00075 vcl_size_t lmem_used(vcl_size_t scalartype_size) const { 00076 return local_size_1_*scalartype_size; 00077 } 00078 00079 void init_temporaries(statements_type const & statements) const { 00080 if(temporaries_.empty()){ 00081 //set temporary buffer argument 00082 for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){ 00083 scheduler::statement::container_type const & array = it->first.array(); 00084 vcl_size_t size_of_scalartype; 00085 const char * scalartype_name; 00086 if (array[0].lhs.type_family != scheduler::SCALAR_TYPE_FAMILY) throw "not implemented"; 00087 switch(array[0].lhs.numeric_type){ 00088 case scheduler::FLOAT_TYPE: scalartype_name = "float"; size_of_scalartype = sizeof(float); break; 00089 case scheduler::DOUBLE_TYPE: scalartype_name = "double"; size_of_scalartype = sizeof(double); break; 00090 default: throw "not implemented"; 00091 } 00092 for(scheduler::statement::container_type::const_iterator iit = array.begin() ; iit != array.end() ; ++iit){ 00093 if(iit->op.type==scheduler::OPERATION_BINARY_INNER_PROD_TYPE){ 00094 temporaries_.push_back(std::make_pair(scalartype_name, viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, static_cast<unsigned int>(num_groups_*size_of_scalartype)))); 00095 } 00096 } 00097 } 00098 } 00099 } 00100 00101 void set_size_argument(viennacl::scheduler::statement const & s, viennacl::scheduler::statement_node const & /*root_node*/, unsigned int & n_arg, viennacl::ocl::kernel & k) const { 00102 scheduler::statement::container_type exprs = s.array(); 00103 for(scheduler::statement::container_type::iterator it = exprs.begin() ; it != exprs.end() ; ++it){ 00104 if(it->op.type==scheduler::OPERATION_BINARY_INNER_PROD_TYPE){ 00105 //set size argument 00106 scheduler::statement_node const * current_node = &(*it); 00107 00108 vcl_size_t vector_size = 0; 00109 //The LHS of the prod is a vector 00110 if(current_node->lhs.type_family==scheduler::VECTOR_TYPE_FAMILY) 00111 { 00112 vector_size = utils::call_on_vector(current_node->lhs, utils::internal_size_fun()); 00113 } 00114 else{ 00115 //The LHS of the prod is a vector expression 00116 current_node = &exprs[current_node->lhs.node_index]; 00117 if(current_node->lhs.type_family==scheduler::VECTOR_TYPE_FAMILY) 00118 { 00119 vector_size = cl_uint(utils::call_on_vector(current_node->lhs, utils::internal_size_fun())); 00120 } 00121 else if(current_node->rhs.type_family==scheduler::VECTOR_TYPE_FAMILY) 00122 { 00123 vector_size = cl_uint(utils::call_on_vector(current_node->lhs, utils::internal_size_fun())); 00124 } 00125 else{ 00126 assert(false && bool("unexpected expression tree")); 00127 } 00128 } 00129 k.arg(n_arg++, cl_uint(vector_size/vector_size_)); 00130 } 00131 } 00132 } 00133 00134 public: 00136 scalar_reduction(unsigned int vectorization, unsigned int local_size, unsigned int num_groups, unsigned int decomposition) : profile_base(vectorization, local_size, 1, 2), num_groups_(num_groups), decomposition_(decomposition){ } 00137 00138 00139 static std::string csv_format() { 00140 return "Vec,LSize,NumGroups,GlobalDecomposition"; 00141 } 00142 00143 std::string csv_representation() const{ 00144 std::ostringstream oss; 00145 oss << vector_size_ 00146 << "," << local_size_1_ 00147 << "," << num_groups_ 00148 << "," << decomposition_; 00149 return oss.str(); 00150 } 00151 00152 unsigned int num_groups() const { return num_groups_; } 00153 00154 00155 unsigned int decomposition() const { return decomposition_; } 00156 00157 00158 void configure_range_enqueue_arguments(vcl_size_t kernel_id, statements_type const & statements, viennacl::ocl::kernel & k, unsigned int & n_arg) const{ 00159 00160 //create temporaries 00161 init_temporaries(statements); 00162 00163 //configure ND range 00164 if(kernel_id==0){ 00165 configure_local_sizes(k, 0); 00166 00167 vcl_size_t gsize = local_size_1_*num_groups_; 00168 k.global_work_size(0,gsize); 00169 k.global_work_size(1,1); 00170 } 00171 else{ 00172 configure_local_sizes(k, 1); 00173 00174 k.global_work_size(0,local_size_1_); 00175 k.global_work_size(1,1); 00176 } 00177 00178 //set arguments 00179 set_size_argument(statements.front().first, statements.front().second, n_arg, k); 00180 for(temporaries_type::iterator it = temporaries_.begin() ; it != temporaries_.end() ; ++it){ 00181 k.arg(n_arg++, it->second); 00182 } 00183 } 00184 00185 void kernel_arguments(statements_type const & statements, std::string & arguments_string) const{ 00186 init_temporaries(statements); 00187 arguments_string += detail::generate_value_kernel_argument("unsigned int", "N"); 00188 for(temporaries_type::iterator it = temporaries_.begin() ; it != temporaries_.end() ; ++it){ 00189 arguments_string += detail::generate_pointer_kernel_argument("__global", it->first, "temp" + utils::to_string(std::distance(temporaries_.begin(), it))); 00190 } 00191 } 00192 00193 private: 00194 00195 void core_0(utils::kernel_generation_stream& stream, std::vector<detail::mapped_scalar_reduction*> exprs, std::vector<const char *> const & scalartypes, statements_type const & /*statements*/, std::vector<detail::mapping_type> const & /*mapping*/) const { 00196 00197 stream << "unsigned int lid = get_local_id(0);" << std::endl; 00198 00199 for(vcl_size_t k = 0 ; k < exprs.size() ; ++k) 00200 stream << scalartypes[k] << " sum" << k << " = 0;" << std::endl; 00201 00202 if(decomposition_){ 00203 stream << "for(unsigned int i = get_global_id(0) ; i < N ; i += get_global_size(0)){" << std::endl; 00204 } 00205 else{ 00206 stream << "unsigned int chunk_size = (N + get_num_groups(0)-1)/get_num_groups(0);" << std::endl; 00207 stream << "unsigned int chunk_start = get_group_id(0)*chunk_size;" << std::endl; 00208 stream << "unsigned int chunk_end = min(chunk_start+chunk_size, N);" << std::endl; 00209 stream << "for(unsigned int i = chunk_start + get_local_id(0) ; i < chunk_end ; i += get_local_size(0)){" << std::endl; 00210 } 00211 stream.inc_tab(); 00212 00213 //Fetch vector entry 00214 std::set<std::string> fetched; 00215 00216 for(std::vector<detail::mapped_scalar_reduction*>::iterator it = exprs.begin() ; it != exprs.end() ; ++it){ 00217 viennacl::scheduler::statement const & statement = (*it)->statement(); 00218 viennacl::scheduler::statement_node const & root_node = (*it)->root_node(); 00219 detail::fetch_all_lhs(fetched,statement,root_node, std::make_pair("i", "0"),vector_size_,stream,(*it)->mapping()); 00220 detail::fetch_all_rhs(fetched,statement,root_node, std::make_pair("i", "0"),vector_size_,stream,(*it)->mapping()); 00221 } 00222 00223 00224 //Update sums; 00225 for(std::vector<detail::mapped_scalar_reduction*>::iterator it = exprs.begin() ; it != exprs.end() ; ++it){ 00226 viennacl::scheduler::statement const & statement = (*it)->statement(); 00227 viennacl::scheduler::statement_node const & root_node = (*it)->root_node(); 00228 if(vector_size_ > 1){ 00229 for(unsigned int a = 0 ; a < vector_size_ ; ++a){ 00230 std::string str; 00231 detail::generate_all_lhs(statement,root_node,std::make_pair("i","0"),a,str,(*it)->mapping()); 00232 str += "*"; 00233 detail::generate_all_rhs(statement,root_node,std::make_pair("i","0"),a,str,(*it)->mapping()); 00234 stream << " sum" << std::distance(exprs.begin(),it) << " += " << str << ";" << std::endl; 00235 } 00236 } 00237 else{ 00238 std::string str; 00239 detail::generate_all_lhs(statement,root_node,std::make_pair("i","0"),-1,str,(*it)->mapping()); 00240 str += "*"; 00241 detail::generate_all_rhs(statement,root_node,std::make_pair("i","0"),-1,str,(*it)->mapping()); 00242 stream << " sum" << std::distance(exprs.begin(),it) << " += " << str << ";" << std::endl; 00243 } 00244 } 00245 00246 00247 stream.dec_tab(); 00248 stream << "}" << std::endl; 00249 //Declare and fill local memory 00250 for(vcl_size_t k = 0 ; k < exprs.size() ; ++k) 00251 stream << "__local " << scalartypes[k] << " buf" << k << "[" << local_size_1_ << "];" << std::endl; 00252 00253 for(vcl_size_t k = 0 ; k < exprs.size() ; ++k) 00254 stream << "buf" << k << "[lid] = sum" << k << ";" << std::endl; 00255 00256 //Reduce local memory 00257 for(vcl_size_t stride = local_size_1_/2 ; stride>1 ; stride /=2){ 00258 stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl; 00259 stream << "if(lid < " << stride << "){" << std::endl; 00260 stream.inc_tab(); 00261 for(vcl_size_t k = 0 ; k < exprs.size() ; ++k){ 00262 stream << "buf" << k << "[lid] += buf" << k << "[lid + " << stride << "];" << std::endl; 00263 } 00264 stream.dec_tab(); 00265 stream << "}" << std::endl; 00266 } 00267 00268 //Last reduction and write back to temporary buffer 00269 stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl; 00270 stream << "if(lid==0){" << std::endl; 00271 stream.inc_tab(); 00272 for(vcl_size_t k = 0 ; k < exprs.size() ; ++k) 00273 stream << "buf" << k << "[0] += buf" << k << "[1];" << std::endl; 00274 00275 for(vcl_size_t k = 0 ; k < exprs.size() ; ++k) 00276 stream << "temp"<< k << "[get_group_id(0)] = buf" << k << "[0];" << std::endl; 00277 00278 stream.dec_tab(); 00279 stream << "}" << std::endl; 00280 } 00281 00282 00283 void core_1(utils::kernel_generation_stream& stream, std::vector<detail::mapped_scalar_reduction*> exprs, std::vector<const char *> scalartypes, statements_type const & statements, std::vector<detail::mapping_type> const & mapping) const { 00284 stream << "unsigned int lid = get_local_id(0);" << std::endl; 00285 00286 for(vcl_size_t k = 0 ; k < exprs.size() ; ++k) 00287 stream << "__local " << scalartypes[k] << " buf" << k << "[" << local_size_1_ << "];" << std::endl; 00288 00289 for(vcl_size_t k = 0 ; k < exprs.size() ; ++k) 00290 stream << scalartypes[0] << " sum" << k << " = 0;" << std::endl; 00291 00292 stream << "for(unsigned int i = lid ; i < " << num_groups_ << " ; i += get_local_size(0)){" << std::endl; 00293 stream.inc_tab(); 00294 for(vcl_size_t k = 0 ; k < exprs.size() ; ++k) 00295 stream << "sum" << k << " += temp" << k << "[i];" << std::endl; 00296 stream.dec_tab(); 00297 stream << "}" << std::endl; 00298 00299 for(vcl_size_t k = 0 ; k < exprs.size() ; ++k) 00300 stream << "buf" << k << "[lid] = sum" << k << ";" << std::endl; 00301 00302 //Reduce local memory 00303 for(vcl_size_t stride = local_size_1_/2 ; stride>1 ; stride /=2){ 00304 stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl; 00305 stream << "if(lid < " << stride << "){" << std::endl; 00306 stream.inc_tab(); 00307 for(vcl_size_t k = 0 ; k < exprs.size() ; ++k){ 00308 stream << "buf" << k << "[lid] += buf" << k << "[lid + " << stride << "];" << std::endl; 00309 } 00310 stream.dec_tab(); 00311 stream << "}" << std::endl; 00312 } 00313 00314 stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl; 00315 stream << "if(lid==0){" << std::endl; 00316 stream.inc_tab(); 00317 for(vcl_size_t k = 0 ; k < exprs.size() ; ++k){ 00318 stream << "buf" << k << "[0] += buf" << k << "[1];" << std::endl; 00319 exprs[k]->access_name("buf"+utils::to_string(k)+"[0]"); 00320 } 00321 00322 vcl_size_t i = 0; 00323 for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){ 00324 std::string str; 00325 detail::traverse(it->first, it->second, detail::expression_generation_traversal(std::make_pair("0", "0"), -1, str, mapping[i++]), false); 00326 stream << str << ";" << std::endl; 00327 } 00328 00329 stream.dec_tab(); 00330 stream << "}" << std::endl; 00331 } 00332 00333 void core(vcl_size_t kernel_id, utils::kernel_generation_stream& stream, statements_type const & statements, std::vector<detail::mapping_type> const & mapping) const { 00334 std::vector<detail::mapped_scalar_reduction*> exprs; 00335 for(std::vector<detail::mapping_type>::const_iterator it = mapping.begin() ; it != mapping.end() ; ++it) 00336 for(detail::mapping_type::const_iterator iit = it->begin() ; iit != it->end() ; ++iit) 00337 if(detail::mapped_scalar_reduction * p = dynamic_cast<detail::mapped_scalar_reduction*>(iit->second.get())) 00338 exprs.push_back(p); 00339 00340 std::vector<const char *> scalartypes; 00341 fill_scalartypes(statements, scalartypes); 00342 00343 if(kernel_id==0){ 00344 core_0(stream,exprs,scalartypes,statements,mapping); 00345 } 00346 else{ 00347 core_1(stream,exprs,scalartypes,statements,mapping); 00348 } 00349 } 00350 00351 private: 00352 unsigned int num_groups_; 00353 unsigned int decomposition_; 00354 mutable temporaries_type temporaries_; 00355 }; 00356 00357 00358 } 00359 00360 } 00361 00362 #endif