ViennaCL - The Vienna Computing Library  1.5.0
viennacl/generator/profile_base.hpp
Go to the documentation of this file.
00001 #ifndef VIENNACL_GENERATOR_GENERATE_TEMPLATE_BASE_BASE
00002 #define VIENNACL_GENERATOR_GENERATE_TEMPLATE_BASE_BASE
00003 
00004 /* =========================================================================
00005    Copyright (c) 2010-2013, Institute for Microelectronics,
00006                             Institute for Analysis and Scientific Computing,
00007                             TU Wien.
00008    Portions of this software are copyright by UChicago Argonne, LLC.
00009 
00010                             -----------------
00011                   ViennaCL - The Vienna Computing Library
00012                             -----------------
00013 
00014    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
00015 
00016    (A list of authors and contributors can be found in the PDF manual)
00017 
00018    License:         MIT (X11), see file LICENSE in the base directory
00019 ============================================================================= */
00020 
00021 
00027 #include <list>
00028 #include <set>
00029 
00030 #include "viennacl/ocl/backend.hpp"
00031 #include "viennacl/ocl/kernel.hpp"
00032 #include "viennacl/ocl/device.hpp"
00033 #include "viennacl/ocl/device_utils.hpp"
00034 #include "viennacl/ocl/infos.hpp"
00035 
00036 #include "viennacl/scheduler/forwards.h"
00037 
00038 #include "viennacl/generator/helpers.hpp"
00039 #include "viennacl/generator/map_functor.hpp"
00040 
00041 namespace viennacl{
00042 
00043   namespace generator{
00044 
00045 
00047     class profile_base{
00048       public:
00049         typedef std::list< std::pair<scheduler::statement, scheduler::statement_node> > statements_type;
00050 
00051       protected:
00052         friend std::ostream & operator<<(std::ostream &, profile_base const &);
00053 
00054         virtual bool invalid_impl(viennacl::ocl::device const & /*dev*/, vcl_size_t /*scalartype_size*/) const { return false; }
00055         virtual bool is_slow_impl(viennacl::ocl::device const &) const { return false; }
00056 
00057         virtual vcl_size_t lmem_used(vcl_size_t /*scalartype_size*/) const { return 0; }
00058 
00059         void configure_local_sizes(viennacl::ocl::kernel & k, vcl_size_t /*kernel_id*/) const {
00060           k.local_work_size(0,local_size_1_);
00061           k.local_work_size(1,local_size_2_);
00062         }
00063 
00064         virtual void print(std::ostream & s) const{
00065           s << csv_representation();
00066         }
00067 
00075         virtual void core(vcl_size_t kernel_id, utils::kernel_generation_stream& stream, statements_type const & statements, std::vector<detail::mapping_type> const & mapping) const = 0;
00076 
00077       public:
00079         profile_base(unsigned int vectorization, vcl_size_t local_size_1, vcl_size_t local_size_2, vcl_size_t num_kernels) : vector_size_(vectorization), local_size_1_(local_size_1), local_size_2_(local_size_2), num_kernels_(num_kernels){ }
00080 
00082         virtual ~profile_base(){ }
00083 
00085         virtual void configure_range_enqueue_arguments(vcl_size_t kernel_id, statements_type  const & statements, viennacl::ocl::kernel & k, unsigned int & n_arg) const = 0;
00086 
00087         virtual void kernel_arguments(statements_type  const & statements, std::string & arguments_string) const = 0;
00088 
00090         unsigned int vector_size() const { return vector_size_; }
00091 
00095         virtual std::string csv_representation() const = 0;
00096 
00099         bool is_slow(viennacl::ocl::device const & dev) const{
00100           bool res = false;
00101           if(dev.type()==CL_DEVICE_TYPE_GPU){
00102             vcl_size_t warp_size = 32;
00103             if(dev.vendor_id()==4098)
00104               warp_size = 64;
00105             res = static_cast<bool>(((local_size_1_*local_size_2_)%warp_size)>0);
00106           }
00107           return res || is_slow_impl(dev);
00108         }
00109 
00114         bool is_invalid(viennacl::ocl::device const & dev, vcl_size_t scalartype_size) const{
00115           //Query device informations
00116           vcl_size_t lmem_available = static_cast<vcl_size_t>(dev.local_mem_size());
00117           vcl_size_t max_workgroup_size = dev.max_work_group_size();
00118 
00119           std::vector<vcl_size_t> max_work_item_sizes = dev.max_work_item_sizes();
00120           bool invalid_work_group_sizes = local_size_1_*local_size_2_ > max_workgroup_size
00121               || local_size_1_ > max_work_item_sizes[0]
00122               || local_size_2_ > max_work_item_sizes[1]; // uses too much resources
00123 
00124           return  invalid_work_group_sizes
00125               || lmem_used(scalartype_size)>lmem_available
00126               || invalid_impl(dev, scalartype_size);
00127         }
00128 
00130         vcl_size_t num_kernels() const{ return num_kernels_; }
00131 
00138         virtual void operator()(utils::kernel_generation_stream & stream, vcl_size_t device_offset, statements_type const & statements) const {
00139           std::vector<detail::mapping_type> mapping(statements.size());
00140 
00142           std::string prototype;
00143           std::set<std::string> already_generated;
00144           kernel_arguments(statements, prototype);
00145 
00146           {
00147             std::map<void *, vcl_size_t> memory;
00148             unsigned int current_arg = 0;
00149             vcl_size_t i = 0;
00150             for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it)
00151               detail::traverse(it->first, it->second, detail::map_functor(memory,current_arg,mapping[i++]));
00152           }
00153 
00154           for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
00155             detail::traverse(it->first, it->second, detail::prototype_generation_traversal(already_generated, prototype, vector_size(), mapping[std::distance(statements.begin(), it)]));
00156           }
00157 
00158           prototype.erase(prototype.size()-1); //Last comma pruned
00159 
00160           //Generate
00161           for(vcl_size_t n = 0 ; n < num_kernels() ; ++n){
00162             //stream << "__attribute__((vec_type_hint()))" << std::endl;
00163             stream << " __attribute__((reqd_work_group_size(" << local_size_1_ << "," << local_size_2_ << "," << 1 << ")))" << std::endl;
00164             stream << "__kernel " << "void " << "kernel_" << device_offset << "_" << n << "(" << std::endl;
00165             stream << prototype << std::endl;
00166             stream << ")" << std::endl;
00167 
00168             //core:
00169             stream << "{" << std::endl;
00170             stream.inc_tab();
00171             core(n, stream, statements, mapping);
00172             stream.dec_tab();
00173             stream << "}" << std::endl;
00174           }
00175         }
00176 
00177       protected:
00178         unsigned int vector_size_;
00179         vcl_size_t local_size_1_;
00180         vcl_size_t local_size_2_;
00181         vcl_size_t num_kernels_;
00182     };
00183 
00184 
00185     inline std::ostream & operator<<(std::ostream & os, profile_base const & profile){
00186       profile.print(os);
00187       return os;
00188     }
00189 
00190   }
00191 
00192 }
00193 
00194 #endif