ViennaCL - The Vienna Computing Library  1.5.0
viennacl/generator/generate.hpp
Go to the documentation of this file.
00001 #ifndef VIENNACL_GENERATOR_GENERATE_HPP
00002 #define VIENNACL_GENERATOR_GENERATE_HPP
00003 
00004 /* =========================================================================
00005    Copyright (c) 2010-2013, Institute for Microelectronics,
00006                             Institute for Analysis and Scientific Computing,
00007                             TU Wien.
00008    Portions of this software are copyright by UChicago Argonne, LLC.
00009 
00010                             -----------------
00011                   ViennaCL - The Vienna Computing Library
00012                             -----------------
00013 
00014    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
00015 
00016    (A list of authors and contributors can be found in the PDF manual)
00017 
00018    License:         MIT (X11), see file LICENSE in the base directory
00019 ============================================================================= */
00020 
00021 
00026 #include <cstring>
00027 #include <vector>
00028 #include <typeinfo>
00029 
00030 #include "viennacl/scheduler/forwards.h"
00031 #include "viennacl/generator/forwards.h"
00032 
00033 #include "viennacl/generator/profiles.hpp"
00034 #include "viennacl/generator/statement_representation_functor.hpp"
00035 #include "viennacl/generator/set_arguments_functor.hpp"
00036 #include "viennacl/generator/map_functor.hpp"
00037 
00038 #include "viennacl/tools/tools.hpp"
00039 
00040 namespace viennacl{
00041 
00042   namespace generator{
00043 
00047     class code_generator{
00048       public:
00050         typedef std::pair<expression_type, vcl_size_t> forced_profile_key_type;
00051       private:
00052         typedef std::pair<expression_descriptor, generator::profile_base::statements_type> representation_node_type;
00053         typedef std::vector<representation_node_type> statements_type;
00054         typedef std::map<forced_profile_key_type, tools::shared_ptr<profile_base> > forced_profiles_type;
00055 
00060         static bool is_flow_transposed(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node){
00061           viennacl::scheduler::statement::container_type const & expr = statement.array();
00062           if(root_node.op.type==viennacl::scheduler::OPERATION_UNARY_TRANS_TYPE)
00063             return root_node.lhs.subtype==viennacl::scheduler::DENSE_ROW_MATRIX_TYPE;
00064           else{
00065             bool res = root_node.lhs.subtype==viennacl::scheduler::DENSE_COL_MATRIX_TYPE || root_node.rhs.subtype==viennacl::scheduler::DENSE_COL_MATRIX_TYPE;
00066             if(root_node.lhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
00067               res = res || is_lhs_flow_transposed(statement, expr[root_node.lhs.node_index]);
00068             if(root_node.rhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
00069               res = res || is_lhs_flow_transposed(statement, expr[root_node.rhs.node_index]);
00070             return res;
00071           }
00072         }
00073 
00075         static bool is_lhs_flow_transposed(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node){
00076           scheduler::statement::container_type const & expr = statement.array();
00077           if(root_node.lhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
00078             return is_flow_transposed(statement, expr[root_node.lhs.node_index]);
00079           else
00080             return root_node.lhs.subtype==viennacl::scheduler::DENSE_COL_MATRIX_TYPE;
00081         }
00082 
00084         static bool is_rhs_flow_transposed(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node){
00085           viennacl::scheduler::statement::container_type const & expr = statement.array();
00086           if(root_node.rhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
00087             return is_flow_transposed(statement, expr[root_node.rhs.node_index]);
00088           else
00089             return root_node.rhs.subtype==viennacl::scheduler::DENSE_COL_MATRIX_TYPE;
00090         }
00091 
00093         static void fill_expression_descriptor_scalar(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node, expression_descriptor & descriptor){
00094           viennacl::scheduler::statement::container_type const & expr = statement.array();
00095           bool is_invalid = (root_node.op.type == viennacl::scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE)
00096                           || (descriptor.type_family==SCALAR_REDUCE_FAMILY && root_node.op.type == viennacl::scheduler::OPERATION_BINARY_INNER_PROD_TYPE);
00097           if(is_invalid){
00098             descriptor.type_family = INVALID_EXPRESSION_FAMILY;
00099             descriptor.type = INVALID_EXPRESSION_TYPE;
00100           }
00101           else if(root_node.op.type==viennacl::scheduler::OPERATION_BINARY_INNER_PROD_TYPE){
00102             descriptor.type_family = SCALAR_REDUCE_FAMILY;
00103             descriptor.type = SCALAR_REDUCE_TYPE;
00104           }
00105           if(descriptor.type_family!=INVALID_EXPRESSION_FAMILY && root_node.lhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
00106             fill_expression_descriptor_scalar(statement, expr[root_node.lhs.node_index],descriptor);
00107           if(descriptor.type_family!=INVALID_EXPRESSION_FAMILY && root_node.rhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
00108             fill_expression_descriptor_scalar(statement, expr[root_node.rhs.node_index],descriptor);
00109         }
00110 
00112         static void fill_expression_descriptor_vector(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node, expression_descriptor & descriptor){
00113           viennacl::scheduler::statement::container_type const & expr = statement.array();
00114           bool is_invalid =  (root_node.op.type == viennacl::scheduler::OPERATION_BINARY_INNER_PROD_TYPE)
00115                           || (root_node.op.type == viennacl::scheduler::OPERATION_BINARY_MAT_MAT_PROD_TYPE)
00116                           || (descriptor.type_family==VECTOR_REDUCE_FAMILY && root_node.op.type == viennacl::scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE);
00117           if(is_invalid){
00118             descriptor.type_family=INVALID_EXPRESSION_FAMILY;
00119             descriptor.type=INVALID_EXPRESSION_TYPE;
00120           }
00121           else if(root_node.op.type==viennacl::scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE){
00122             descriptor.type_family=VECTOR_REDUCE_FAMILY;
00123             if(is_lhs_flow_transposed(statement,root_node))
00124               descriptor.type=VECTOR_REDUCE_Tx_TYPE;
00125             else
00126               descriptor.type=VECTOR_REDUCE_Nx_TYPE;
00127           }
00128           if(descriptor.type_family!=INVALID_EXPRESSION_FAMILY && root_node.lhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
00129             fill_expression_descriptor_vector(statement, expr[root_node.lhs.node_index],descriptor);
00130           if(descriptor.type_family!=INVALID_EXPRESSION_FAMILY && root_node.rhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
00131             fill_expression_descriptor_vector(statement, expr[root_node.rhs.node_index],descriptor);
00132         }
00133 
00135         static void fill_expression_descriptor_matrix(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node, expression_descriptor & descriptor){
00136           viennacl::scheduler::statement::container_type const & expr = statement.array();
00137           bool is_invalid =  (root_node.op.type == viennacl::scheduler::OPERATION_BINARY_INNER_PROD_TYPE)
00138                           || (root_node.op.type == viennacl::scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE)
00139                           || (descriptor.type_family==MATRIX_PRODUCT_FAMILY && root_node.op.type == viennacl::scheduler::OPERATION_BINARY_MAT_MAT_PROD_TYPE);
00140           if(is_invalid){
00141             descriptor.type_family=INVALID_EXPRESSION_FAMILY;
00142             descriptor.type=INVALID_EXPRESSION_TYPE;
00143           }
00144           else if(root_node.op.type==viennacl::scheduler::OPERATION_BINARY_MAT_MAT_PROD_TYPE){
00145             descriptor.type_family=MATRIX_PRODUCT_FAMILY;
00146             bool lhs_trans = is_lhs_flow_transposed(statement,root_node);
00147             bool rhs_trans = is_rhs_flow_transposed(statement,root_node);
00148             if(!lhs_trans && !rhs_trans)
00149               descriptor.type=MATRIX_PRODUCT_NN_TYPE;
00150             else if(lhs_trans && !rhs_trans)
00151               descriptor.type=MATRIX_PRODUCT_TN_TYPE;
00152             else if(!lhs_trans && rhs_trans)
00153               descriptor.type=MATRIX_PRODUCT_NT_TYPE;
00154             else if(lhs_trans && rhs_trans)
00155               descriptor.type=MATRIX_PRODUCT_TT_TYPE;
00156 
00157           }
00158           if(descriptor.type_family!=INVALID_EXPRESSION_FAMILY && root_node.lhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
00159             fill_expression_descriptor_matrix(statement, expr[root_node.lhs.node_index],descriptor);
00160           if(descriptor.type_family!=INVALID_EXPRESSION_FAMILY && root_node.rhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
00161             fill_expression_descriptor_matrix(statement, expr[root_node.rhs.node_index],descriptor);
00162         }
00163 
00165         void fill_descriptor(viennacl::scheduler::statement const & statement, viennacl::scheduler::statement_node const & root_node, expression_descriptor & descriptor){
00166           viennacl::scheduler::statement_node_type_family lhs_family = root_node.lhs.type_family;
00167           descriptor.scalartype_size = utils::call_on_element(root_node.lhs, utils::scalartype_size_fun());
00168           if(lhs_family==viennacl::scheduler::VECTOR_TYPE_FAMILY){
00169             descriptor.type_family = VECTOR_SAXPY_FAMILY;
00170             descriptor.type = VECTOR_SAXPY_TYPE;
00171             fill_expression_descriptor_vector(statement,root_node,descriptor);
00172           }
00173           else if(lhs_family==viennacl::scheduler::MATRIX_TYPE_FAMILY){
00174             descriptor.type_family = MATRIX_SAXPY_FAMILY;
00175             descriptor.type = MATRIX_SAXPY_TYPE;
00176             fill_expression_descriptor_matrix(statement,root_node,descriptor);
00177           }
00178           else if(lhs_family==viennacl::scheduler::SCALAR_TYPE_FAMILY){
00179             descriptor.type_family = SCALAR_SAXPY_FAMILY;
00180             descriptor.type = SCALAR_SAXPY_TYPE;
00181             fill_expression_descriptor_scalar(statement,root_node,descriptor);
00182           }
00183         }
00184 
00189         template<class StatementsType>
00190         void set_expression_arguments(profile_base const & profile, unsigned int device_offset, StatementsType const & statements, unsigned int & kernel_id, viennacl::ocl::program & p, std::list<viennacl::ocl::kernel *> & kernels) const {
00191           for(vcl_size_t i = 0 ; i < profile.num_kernels() ; ++i){
00192             //add kernel name
00193             char str[32];
00194             std::sprintf(str,"kernel_%d_%d",device_offset,kernel_id);
00195             viennacl::ocl::kernel & kernel = p.get_kernel(str);
00196             kernels.push_back(&kernel);
00197             unsigned int current_arg = 0;
00198             //Configure ND Range and enqueue arguments
00199             profile.configure_range_enqueue_arguments(i, statements, kernel, current_arg);
00200             std::set<void *> memory;
00201             for(typename StatementsType::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
00202               detail::traverse(it->first, it->second, detail::set_arguments_functor(memory,current_arg,kernel));
00203             }
00204             ++kernel_id;
00205           }
00206         }
00207 
00209         profile_base const & get_profile(viennacl::ocl::device const & device, expression_descriptor const & descriptor) const {
00210           forced_profiles_type::const_iterator it = forced_profiles_.find(std::make_pair(descriptor.type, descriptor.scalartype_size));
00211           if(it != forced_profiles_.end())
00212             return *it->second;
00213           return *profiles::get(device,descriptor);
00214         }
00215 
00216       public:
00217 
00219         code_generator(viennacl::ocl::context const & ctx = viennacl::ocl::current_context()) : ctx_(ctx){
00220           statements_.reserve(16);
00221         }
00222 
00224         template<class T>
00225         void force_profile(forced_profile_key_type key, T const & t){
00226           forced_profiles_.insert(std::pair<forced_profile_key_type, tools::shared_ptr<profile_base> >(key, tools::shared_ptr<profile_base>(new T(t))));
00227         }
00228 
00232         bool add(scheduler::statement const & statement, scheduler::statement_node const & root_node) {
00233           expression_descriptor descriptor;
00234           fill_descriptor(statement, root_node, descriptor);
00235           if(descriptor.type_family==INVALID_EXPRESSION_FAMILY)
00236             return false;
00237           if(statements_.empty())
00238             statements_.push_back(std::make_pair(descriptor,profile_base::statements_type(1,std::make_pair(statement, root_node))));
00239           else
00240             if(statements_.back().first == descriptor)
00241               statements_.back().second.push_back(std::make_pair(statement, root_node));
00242             else
00243               statements_.push_back(std::make_pair(descriptor,profile_base::statements_type(1,std::make_pair(statement, root_node))));
00244           return true;
00245         }
00246 
00248         void configure_program(viennacl::ocl::program & p, std::list<viennacl::ocl::kernel *> & kernels) const {
00249           unsigned int kernel_id = 0;
00250           std::vector<viennacl::ocl::device>::const_iterator found = std::find(ctx_.devices().begin(),ctx_.devices().end(),ctx_.current_device());
00251           for(statements_type::const_iterator it = statements_.begin() ; it != statements_.end() ; ++it)
00252             set_expression_arguments(get_profile(ctx_.current_device(), it->first), static_cast<unsigned int>(std::distance(ctx_.devices().begin(), found)), it->second, kernel_id, p, kernels);
00253         }
00254 
00256         void make_program_name(char * program_name) const {
00257           unsigned int current_arg = 0;
00258           void* memory[64] = {NULL};
00259           for(statements_type::const_iterator it = statements_.begin() ; it != statements_.end() ; ++it){
00260             for(profile_base::statements_type::const_iterator iit = it->second.begin() ; iit != it->second.end() ; ++iit){
00261               detail::traverse(iit->first, iit->second, detail::statement_representation_functor(memory, current_arg, program_name));
00262             }
00263           }
00264           *program_name='\0';
00265         }
00266 
00268         std::string make_opencl_program_string() const {
00269           utils::kernel_generation_stream stream;
00270 
00271           //Headers generation
00272           stream << "#if defined(cl_khr_fp64)\n";
00273           stream <<  "#  pragma OPENCL EXTENSION cl_khr_fp64: enable\n";
00274           stream <<  "#elif defined(cl_amd_fp64)\n";
00275           stream <<  "#  pragma OPENCL EXTENSION cl_amd_fp64: enable\n";
00276           stream <<  "#endif\n";
00277           stream << std::endl;
00278 
00279           vcl_size_t device_offset =0;
00280           for(std::vector<viennacl::ocl::device>::const_iterator it = ctx_.devices().begin() ; it != ctx_.devices().end() ; ++it)
00281             for(statements_type::const_iterator iit = statements_.begin() ; iit != statements_.end() ; ++iit)
00282               get_profile(*it,iit->first)(stream,device_offset++,iit->second);
00283 
00284           return stream.str();
00285         }
00286 
00291         std::string make_cuda_program_string() const {
00292           //Creates OpenCL string with #ifdef and attributes
00293           utils::kernel_generation_stream stream;
00294           vcl_size_t device_offset =0;
00295           for(std::vector<viennacl::ocl::device>::const_iterator it = ctx_.devices().begin() ; it != ctx_.devices().end() ; ++it)
00296             for(statements_type::const_iterator iit = statements_.begin() ; iit != statements_.end() ; ++iit)
00297               get_profile(*it,iit->first)(stream,device_offset++,iit->second);
00298           std::string res = stream.str();
00299 
00300           viennacl::tools::find_and_replace(res,"__attribute__","//__attribute__");
00301 
00302           //Pointer
00303           viennacl::tools::find_and_replace(res, "__global float*", "float*");
00304           viennacl::tools::find_and_replace(res, "__local float*", "float*");
00305 
00306           viennacl::tools::find_and_replace(res, "__global double*", "double*");
00307           viennacl::tools::find_and_replace(res, "__local double*", "double*");
00308 
00309           //Qualifiers
00310           viennacl::tools::find_and_replace(res,"__global","__device__");
00311           viennacl::tools::find_and_replace(res,"__kernel","__global__");
00312           viennacl::tools::find_and_replace(res,"__constant","__constant__");
00313           viennacl::tools::find_and_replace(res,"__local","__shared__");
00314 
00315           //Indexing
00316           viennacl::tools::find_and_replace(res,"get_num_groups(0)","gridDim.x");
00317           viennacl::tools::find_and_replace(res,"get_num_groups(1)","gridDim.y");
00318 
00319           viennacl::tools::find_and_replace(res,"get_local_size(0)","blockDim.x");
00320           viennacl::tools::find_and_replace(res,"get_local_size(1)","blockDim.y");
00321 
00322           viennacl::tools::find_and_replace(res,"get_group_id(0)","blockIdx.x");
00323           viennacl::tools::find_and_replace(res,"get_group_id(1)","blockIdx.y");
00324 
00325           viennacl::tools::find_and_replace(res,"get_local_id(0)","threadIdx.x");
00326           viennacl::tools::find_and_replace(res,"get_local_id(1)","threadIdx.y");
00327 
00328           viennacl::tools::find_and_replace(res,"get_global_id(0)","(blockIdx.x*blockDim.x + threadIdx.x)");
00329           viennacl::tools::find_and_replace(res,"get_global_id(1)","(blockIdx.y*blockDim.y + threadIdx.y)");
00330 
00331           //Synchronization
00332           viennacl::tools::find_and_replace(res,"barrier(CLK_LOCAL_MEM_FENCE)","__syncthreads()");
00333           viennacl::tools::find_and_replace(res,"barrier(CLK_GLOBAL_MEM_FENCE)","__syncthreads()");
00334 
00335 
00336           return res;
00337         }
00338 
00339       private:
00340         statements_type statements_;
00341         viennacl::ocl::context const & ctx_;
00342         forced_profiles_type forced_profiles_;
00343     };
00344 
00351     inline viennacl::ocl::program & get_configured_program(viennacl::generator::code_generator const & generator, std::list<viennacl::ocl::kernel*> & kernels, bool force_recompilation = false){
00352       char* program_name = (char*)malloc(256*sizeof(char));
00353       generator.make_program_name(program_name);
00354       if(force_recompilation)
00355         viennacl::ocl::current_context().delete_program(program_name);
00356       if(!viennacl::ocl::current_context().has_program(program_name)){
00357         std::string source_code = generator.make_opencl_program_string();
00358     #ifdef VIENNACL_DEBUG_BUILD
00359         std::cout << "Building " << program_name << "..." << std::endl;
00360         std::cout << source_code << std::endl;
00361     #endif
00362         viennacl::ocl::current_context().add_program(source_code, program_name);
00363       }
00364       viennacl::ocl::program & p = viennacl::ocl::current_context().get_program(program_name);
00365       generator.configure_program(p, kernels);
00366       return p;
00367     }
00368 
00370     inline void enqueue(viennacl::generator::code_generator const & generator, bool force_recompilation = false){
00371       std::list<viennacl::ocl::kernel*> kernels;
00372       get_configured_program(generator, kernels, force_recompilation);
00373       for(std::list<viennacl::ocl::kernel*>::iterator it = kernels.begin() ; it != kernels.end() ; ++it){
00374         viennacl::ocl::enqueue(**it, (*it)->context().get_queue());
00375       }
00376     }
00377 
00379     inline std::string get_opencl_program_string(viennacl::scheduler::statement const & s){
00380       generator::code_generator gen;
00381       gen.add(s,s.array()[0]);
00382       return gen.make_opencl_program_string();
00383     }
00384 
00386     inline std::string get_cuda_device_code(viennacl::scheduler::statement const & s){
00387       generator::code_generator gen;
00388       gen.add(s, s.array()[0]);
00389       return gen.make_cuda_program_string();
00390     }
00391 
00393     inline void generate_enqueue_statement(viennacl::scheduler::statement const & s, scheduler::statement_node const & root_node){
00394       generator::code_generator gen;
00395       gen.add(s,root_node);
00396       viennacl::generator::enqueue(gen);
00397     }
00398 
00400     inline void generate_enqueue_statement(viennacl::scheduler::statement const & s){
00401       generate_enqueue_statement(s, s.array()[0]);
00402     }
00403 
00404   }
00405 }
00406 #endif