ViennaCL - The Vienna Computing Library  1.5.0
viennacl/linalg/opencl/kernels/matrix.hpp
Go to the documentation of this file.
00001 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
00002 #define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
00003 
00004 #include "viennacl/tools/tools.hpp"
00005 #include "viennacl/ocl/kernel.hpp"
00006 #include "viennacl/ocl/platform.hpp"
00007 #include "viennacl/ocl/utils.hpp"
00008 
00011 namespace viennacl
00012 {
00013   namespace linalg
00014   {
00015     namespace opencl
00016     {
00017       namespace kernels
00018       {
00019 
00021 
00023         enum ambm_scalar_type
00024         {
00025           VIENNACL_AMBM_NONE = 0, // vector does not exist/contribute
00026           VIENNACL_AMBM_CPU,
00027           VIENNACL_AMBM_GPU
00028         };
00029 
00031         struct ambm_config
00032         {
00033           ambm_config() : with_stride_and_range(true), is_row_major(true), a(VIENNACL_AMBM_CPU), b(VIENNACL_AMBM_NONE) {}
00034 
00035           bool with_stride_and_range;
00036           bool is_row_major;
00037           std::string      assign_op;
00038           ambm_scalar_type a;
00039           ambm_scalar_type b;
00040         };
00041 
00042         // just returns the for-loop
00043         template <typename StringType>
00044         void generate_ambm_impl2(StringType & source, ambm_config const & cfg, bool mult_alpha, bool mult_beta)
00045         {
00046           source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
00047           source.append("  unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
00048           if (cfg.is_row_major)
00049           {
00050             source.append("  for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
00051             source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
00052           }
00053           else
00054           {
00055             source.append("  for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
00056             source.append("    for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
00057           }
00058 
00059           if (cfg.with_stride_and_range)
00060           {
00061             if (cfg.is_row_major)
00062               source.append("      A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] ");
00063             else
00064               source.append("      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] ");
00065             source.append(cfg.assign_op);
00066             if (cfg.is_row_major)
00067               source.append(" B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] ");
00068             else
00069               source.append(" B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] ");
00070 
00071             if (mult_alpha)
00072               source.append("* alpha ");
00073             else
00074               source.append("/ alpha ");
00075             if (cfg.b != VIENNACL_AMBM_NONE)
00076             {
00077               if (cfg.is_row_major)
00078                 source.append("+ C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] ");
00079               else
00080                 source.append("+ C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] ");
00081               if (mult_beta)
00082                 source.append("* beta");
00083               else
00084                 source.append("/ beta");
00085             }
00086           }
00087           else
00088           {
00089             if (cfg.is_row_major)
00090               source.append("    A[row * A_internal_size2 + col] ");
00091             else
00092               source.append("    A[row + col * A_internal_size1] ");
00093             source.append(cfg.assign_op);
00094             if (cfg.is_row_major)
00095               source.append(" B[row * B_internal_size2 + col] ");
00096             else
00097               source.append(" B[row + col * B_internal_size1] ");
00098 
00099             if (mult_alpha)
00100               source.append("* alpha ");
00101             else
00102               source.append("/ alpha ");
00103             if (cfg.b != VIENNACL_AMBM_NONE)
00104             {
00105               if (cfg.is_row_major)
00106                 source.append("+ C[row * C_internal_size2 + col] ");
00107               else
00108                 source.append("+ C[row + col * C_internal_size2] ");
00109               if (mult_beta)
00110                 source.append("* beta");
00111               else
00112                 source.append("/ beta");
00113             }
00114           }
00115           source.append("; \n");
00116         }
00117 
00118         template <typename StringType>
00119         void generate_ambm_impl(StringType & source, std::string const & numeric_string, ambm_config const & cfg)
00120         {
00121           source.append("__kernel void am");
00122           if (cfg.b != VIENNACL_AMBM_NONE)
00123             source.append("bm");
00124           if (cfg.assign_op != "=")
00125             source.append("_m");
00126 
00127           if (cfg.a == VIENNACL_AMBM_CPU)
00128             source.append("_cpu");
00129           else if (cfg.a == VIENNACL_AMBM_GPU)
00130             source.append("_gpu");
00131 
00132           if (cfg.b == VIENNACL_AMBM_CPU)
00133             source.append("_cpu");
00134           else if (cfg.b == VIENNACL_AMBM_GPU)
00135             source.append("_gpu");
00136           source.append("( \n");
00137           source.append("  __global "); source.append(numeric_string); source.append(" * A, \n");
00138           source.append("  unsigned int A_start1, unsigned int A_start2, \n");
00139           source.append("  unsigned int A_inc1,   unsigned int A_inc2, \n");
00140           source.append("  unsigned int A_size1,  unsigned int A_size2, \n");
00141           source.append("  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
00142           if (cfg.a == VIENNACL_AMBM_CPU)
00143           {
00144             source.append("  "); source.append(numeric_string); source.append(" fac2, \n");
00145           }
00146           else if (cfg.a == VIENNACL_AMBM_GPU)
00147           {
00148             source.append("  __global "); source.append(numeric_string); source.append(" * fac2, \n");
00149           }
00150           source.append("  unsigned int options2, \n");  // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
00151           source.append("  __global const "); source.append(numeric_string); source.append(" * B, \n");
00152           source.append("  unsigned int B_start1, unsigned int B_start2, \n");
00153           source.append("  unsigned int B_inc1,   unsigned int B_inc2, \n");
00154           source.append("  unsigned int B_internal_size1,  unsigned int B_internal_size2");
00155 
00156           if (cfg.b != VIENNACL_AMBM_NONE)
00157           {
00158             source.append(", \n\n");
00159             if (cfg.b == VIENNACL_AMBM_CPU)
00160             {
00161               source.append("  "); source.append(numeric_string); source.append(" fac3, \n");
00162             }
00163             else if (cfg.b == VIENNACL_AMBM_GPU)
00164             {
00165               source.append("  __global "); source.append(numeric_string); source.append(" * fac3, \n");
00166             }
00167             source.append("  unsigned int options3, \n");  // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
00168             source.append("  __global const "); source.append(numeric_string); source.append(" * C, \n");
00169             source.append("  unsigned int C_start1, unsigned int C_start2, \n");
00170             source.append("  unsigned int C_inc1,   unsigned int C_inc2, \n");
00171             source.append("  unsigned int C_internal_size1,  unsigned int C_internal_size2 \n");
00172           }
00173           source.append(") { \n");
00174 
00175           if (cfg.a == VIENNACL_AMBM_CPU)
00176           {
00177             source.append("  "); source.append(numeric_string); source.append(" alpha = fac2; \n");
00178           }
00179           else if (cfg.a == VIENNACL_AMBM_GPU)
00180           {
00181             source.append("  "); source.append(numeric_string); source.append(" alpha = fac2[0]; \n");
00182           }
00183           source.append("  if (options2 & (1 << 0)) \n");
00184           source.append("    alpha = -alpha; \n");
00185           source.append(" \n");
00186 
00187           if (cfg.b == VIENNACL_AMBM_CPU)
00188           {
00189             source.append("  "); source.append(numeric_string); source.append(" beta = fac3; \n");
00190           }
00191           else if (cfg.b == VIENNACL_AMBM_GPU)
00192           {
00193             source.append("  "); source.append(numeric_string); source.append(" beta = fac3[0]; \n");
00194           }
00195           if (cfg.b != VIENNACL_AMBM_NONE)
00196           {
00197             source.append("  if (options3 & (1 << 0)) \n");
00198             source.append("    beta = -beta; \n");
00199             source.append(" \n");
00200           }
00201           source.append("  if (options2 & (1 << 1)) { \n");
00202           if (cfg.b != VIENNACL_AMBM_NONE)
00203           {
00204             source.append("    if (options3 & (1 << 1)) {\n");
00205             generate_ambm_impl2(source, cfg, false, false);
00206             source.append("    } else {\n");
00207             generate_ambm_impl2(source, cfg, false, true);
00208             source.append("    } \n");
00209           }
00210           else
00211             generate_ambm_impl2(source, cfg, false, true);
00212           source.append("  } else { \n");
00213           if (cfg.b != VIENNACL_AMBM_NONE)
00214           {
00215             source.append("    if (options3 & (1 << 1)) {\n");
00216             generate_ambm_impl2(source, cfg, true, false);
00217             source.append("    } else {\n");
00218             generate_ambm_impl2(source, cfg, true, true);
00219             source.append("    } \n");
00220           }
00221           else
00222             generate_ambm_impl2(source, cfg, true, true);
00223           source.append("  } \n");
00224           source.append("} \n");
00225         }
00226 
00227         template <typename StringType>
00228         void generate_ambm(StringType & source, std::string const & numeric_string, bool is_row_major)
00229         {
00230           ambm_config cfg;
00231           cfg.assign_op = "=";
00232           cfg.with_stride_and_range = true;
00233           cfg.is_row_major = is_row_major;
00234 
00235           // am
00236           cfg.b = VIENNACL_AMBM_NONE; cfg.a = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
00237           cfg.b = VIENNACL_AMBM_NONE; cfg.a = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
00238 
00239           // ambm
00240           cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
00241           cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
00242           cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
00243           cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
00244 
00245           // ambm_m
00246           cfg.assign_op = "+=";
00247 
00248           cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
00249           cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
00250           cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
00251           cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
00252         }
00253 
00254         template <typename StringType>
00255         void generate_assign_cpu(StringType & source, std::string const & numeric_string, bool is_row_major)
00256         {
00257           source.append("__kernel void assign_cpu( \n");
00258           source.append("  __global "); source.append(numeric_string); source.append(" * A, \n");
00259           source.append("  unsigned int A_start1, unsigned int A_start2, \n");
00260           source.append("  unsigned int A_inc1,   unsigned int A_inc2, \n");
00261           source.append("  unsigned int A_size1,  unsigned int A_size2, \n");
00262           source.append("  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
00263           source.append("  "); source.append(numeric_string); source.append(" alpha) \n");
00264           source.append("{ \n");
00265           if (is_row_major)
00266           {
00267             source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
00268             source.append("  unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
00269             source.append("  for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
00270             source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
00271             source.append("      A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = alpha; \n");
00272           }
00273           else
00274           {
00275             source.append("  unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
00276             source.append("  unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
00277             source.append("  for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
00278             source.append("    for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
00279             source.append("      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] = alpha; \n");
00280           }
00281           source.append("} \n");
00282         }
00283 
00284         template <typename StringType>
00285         void generate_diagonal_assign_cpu(StringType & source, std::string const & numeric_string, bool is_row_major)
00286         {
00287           source.append("__kernel void diagonal_assign_cpu( \n");
00288           source.append("  __global "); source.append(numeric_string); source.append(" * A, \n");
00289           source.append("  unsigned int A_start1, unsigned int A_start2, \n");
00290           source.append("  unsigned int A_inc1,   unsigned int A_inc2, \n");
00291           source.append("  unsigned int A_size1,  unsigned int A_size2, \n");
00292           source.append("  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
00293           source.append("  "); source.append(numeric_string); source.append(" alpha) \n");
00294           source.append("{ \n");
00295           source.append("  for (unsigned int idx = get_global_id(0); idx < min(A_size1, A_size2); idx += get_global_size(0))\n");
00296           if (is_row_major)
00297             source.append("    A[(idx * A_inc1 + A_start1) * A_internal_size2 + (idx * A_inc2 + A_start2)] = alpha; \n");
00298           else
00299             source.append("    A[(idx * A_inc1 + A_start1) + (idx * A_inc2 + A_start2) *  A_internal_size1] = alpha; \n");
00300           source.append("} \n");
00301         }
00302 
00303         template <typename StringType>
00304         void generate_element_op(StringType & source, std::string const & numeric_string, bool is_row_major)
00305         {
00306           source.append("__kernel void element_op( \n");
00307           source.append("  __global "); source.append(numeric_string); source.append(" * A, \n");
00308           source.append("  unsigned int A_start1, unsigned int A_start2, \n");
00309           source.append("  unsigned int A_inc1,   unsigned int A_inc2, \n");
00310           source.append("  unsigned int A_size1,  unsigned int A_size2, \n");
00311           source.append("  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
00312           source.append("  __global "); source.append(numeric_string); source.append(" * B, \n");
00313           source.append("  unsigned int B_start1, unsigned int B_start2, \n");
00314           source.append("  unsigned int B_inc1,   unsigned int B_inc2, \n");
00315           source.append("  unsigned int B_internal_size1,  unsigned int B_internal_size2, \n");
00316           source.append("  __global "); source.append(numeric_string); source.append(" * C, \n");
00317           source.append("  unsigned int C_start1, unsigned int C_start2, \n");
00318           source.append("  unsigned int C_inc1,   unsigned int C_inc2, \n");
00319           source.append("  unsigned int C_internal_size1,  unsigned int C_internal_size2, \n");
00320           source.append("  unsigned int op_type) \n"); //0: product, 1: division, 2: pow
00321           source.append("{ \n");
00322           if (is_row_major)
00323           {
00324             source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
00325             source.append("  unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
00326             source.append("  if (op_type == 2) {");
00327             if (numeric_string == "float" || numeric_string == "double")
00328             {
00329               source.append("    for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
00330               source.append("      for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
00331               source.append("        A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
00332               source.append("        pow(B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)], \n");
00333               source.append("            C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]); \n");
00334             }
00335             source.append("  } else if (op_type == 1) {");
00336             source.append("    for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
00337             source.append("      for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
00338             source.append("        A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
00339             source.append("        B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / \n");
00340             source.append("        C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; \n");
00341             source.append("  } else if (op_type == 0) {");
00342             source.append("    for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
00343             source.append("      for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
00344             source.append("        A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
00345             source.append("        B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * \n");
00346             source.append("        C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; \n");
00347             source.append("  }");
00348           }
00349           else
00350           {
00351             source.append("  unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
00352             source.append("  unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
00353             source.append("  if (op_type == 2) {");
00354             if (numeric_string == "float" || numeric_string == "double")
00355             {
00356               source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
00357               source.append("      for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
00358               source.append("        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] =  \n");
00359               source.append("          pow(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) *  B_internal_size1], \n");
00360               source.append("              C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) *  C_internal_size1]); \n");
00361             }
00362             source.append("  } else if (op_type == 1) {");
00363             source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
00364             source.append("      for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
00365             source.append("        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] =  \n");
00366             source.append("          B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) *  B_internal_size1] / \n");
00367             source.append("          C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) *  C_internal_size1]; \n");
00368             source.append("  } else if (op_type == 0) {");
00369             source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
00370             source.append("      for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
00371             source.append("        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] = \n");
00372             source.append("          B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) *  B_internal_size1] * \n");
00373             source.append("          C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) *  C_internal_size1]; \n");
00374             source.append("  }");
00375           }
00376           source.append("} \n");
00377         }
00378 
00379 
00380         template <typename StringType>
00381         void generate_fft(StringType & source, std::string const & numeric_string, bool is_row_major)
00382         {
00383           // naive fourier transform (quadratic complexity, use for reference only)
00384           source.append("__kernel void fft_direct(__global "); source.append(numeric_string); source.append("2 *input, \n");
00385           source.append("                         __global "); source.append(numeric_string); source.append("2 *output, \n");
00386           source.append("                         unsigned int size, \n");
00387           source.append("                         unsigned int stride, \n");
00388           source.append("                         unsigned int batch_num, \n");
00389           source.append("                         "); source.append(numeric_string); source.append(" sign) { \n");
00390           source.append("    const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
00391           source.append(" \n");
00392           source.append("    for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
00393           source.append("        for(unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) { \n");
00394           source.append("            "); source.append(numeric_string); source.append("2 f = 0.0f; \n");
00395           source.append(" \n");
00396           source.append("            for(unsigned int n = 0; n < size; n++) { \n");
00397           source.append("                "); source.append(numeric_string); source.append("2 in = ");
00398           if (is_row_major)
00399             source.append("input[batch_id * stride + n]; \n"); //input index here
00400           else
00401             source.append("input[n * stride + batch_id]; \n"); //input index here
00402           source.append(" \n");
00403           source.append("                "); source.append(numeric_string); source.append(" sn, cs; \n");
00404           source.append("                "); source.append(numeric_string); source.append(" arg = sign * 2 * NUM_PI * k / size * n; \n");
00405           source.append("                sn = sincos(arg, &cs); \n");
00406           source.append(" \n");
00407           source.append("                "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");
00408           source.append("                f = f + ("); source.append(numeric_string); source.append("2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x); \n");
00409           source.append("            } \n");
00410           source.append(" \n");
00411           if (is_row_major)
00412             source.append("            output[batch_id * stride + k] = f; \n"); // output index here
00413           else
00414             source.append("            output[k * stride + batch_id] = f; \n"); // output index here
00415           source.append("        } \n");
00416           source.append("    } \n");
00417           source.append("} \n");
00418 
00419           source.append(" \n"); 
00420 
00421           source.append("__kernel void fft_radix2(__global "); source.append(numeric_string); source.append("2* input, \n");
00422           source.append("                         unsigned int s, \n");
00423           source.append("                         unsigned int bit_size, \n");
00424           source.append("                         unsigned int size, \n");
00425           source.append("                         unsigned int stride, \n");
00426           source.append("                         unsigned int batch_num, \n");
00427           source.append("                         "); source.append(numeric_string); source.append(" sign) { \n");
00428           source.append(" \n");
00429           source.append("    unsigned int ss = 1 << s; \n");
00430           source.append("    unsigned int half_size = size >> 1; \n");
00431           source.append(" \n");
00432           source.append("    "); source.append(numeric_string); source.append(" cs, sn; \n");
00433           source.append("    const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
00434           source.append(" \n");
00435           source.append("    unsigned int glb_id = get_global_id(0); \n");
00436           source.append("    unsigned int glb_sz = get_global_size(0); \n");
00437 
00438           source.append("    for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
00439           source.append("        for(unsigned int tid = glb_id; tid < half_size; tid += glb_sz) { \n");
00440           source.append("            unsigned int group = (tid & (ss - 1)); \n");
00441           source.append("            unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
00442 
00443           if (is_row_major)
00444           {
00445             source.append("            unsigned int offset = batch_id * stride + pos; \n");
00446             source.append("            "); source.append(numeric_string); source.append("2 in1 = input[offset]; \n"); //index
00447             source.append("            "); source.append(numeric_string); source.append("2 in2 = input[offset + ss]; \n");//index
00448           }
00449           else
00450           {
00451             source.append("            unsigned int offset = pos * stride + batch_id; \n");
00452             source.append("            "); source.append(numeric_string); source.append("2 in1 = input[offset]; \n"); //index
00453             source.append("            "); source.append(numeric_string); source.append("2 in2 = input[offset + ss * stride]; \n");//index
00454           }
00455 
00456           source.append("            "); source.append(numeric_string); source.append(" arg = group * sign * NUM_PI / ss; \n");
00457 
00458           source.append("            sn = sincos(arg, &cs); \n");
00459 
00460           source.append("            "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");
00461 
00462           source.append("            "); source.append(numeric_string); source.append("2 tmp = ("); source.append(numeric_string); source.append("2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
00463 
00464           if (is_row_major)
00465             source.append("            input[offset + ss] = in1 - tmp; \n");//index
00466           else
00467             source.append("            input[offset + ss * stride] = in1 - tmp; \n");//index
00468           source.append("            input[offset] = in1 + tmp; \n");//index
00469           source.append("        } \n");
00470           source.append("    } \n");
00471           source.append("} \n");
00472 
00473           source.append(" \n"); 
00474 
00475           source.append(" unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) { \n");
00476           source.append("     v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
00477           source.append("     v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
00478           source.append("     v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
00479           source.append("     v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
00480           source.append("     v = (v >> 16) | (v << 16); \n");
00481           source.append("  \n");
00482           source.append("     v = v >> (32 - bit_size); \n");
00483           source.append("  \n");
00484           source.append("     return v; \n");
00485           source.append(" } \n");
00486 
00487           source.append(" __kernel void fft_radix2_local(__global "); source.append(numeric_string); source.append("2* input, \n");
00488           source.append("                                 __local "); source.append(numeric_string); source.append("2* lcl_input, \n");
00489           source.append("                                 unsigned int bit_size, \n");
00490           source.append("                                 unsigned int size, \n");
00491           source.append("                                 unsigned int stride, \n");
00492           source.append("                                 unsigned int batch_num, \n");
00493           source.append("                                 "); source.append(numeric_string); source.append(" sign) { \n");
00494 
00495           source.append("     unsigned int grp_id = get_group_id(0); \n");
00496           source.append("     unsigned int grp_num = get_num_groups(0); \n");
00497 
00498           source.append("     unsigned int lcl_sz = get_local_size(0); \n");
00499           source.append("     unsigned int lcl_id = get_local_id(0); \n");
00500           source.append("     const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
00501 
00502           source.append("     for(unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) { \n");
00503                   //unsigned int base_offset = stride * batch_id; \n");
00504                   //copy chunk of global memory to local \n");
00505           source.append("         for(unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
00506           source.append("             unsigned int v = get_reorder_num(p, bit_size); \n");
00507           if (is_row_major)
00508             source.append("             lcl_input[v] = input[batch_id * stride + p]; \n"); //index
00509           else
00510             source.append("             lcl_input[v] = input[p * stride + batch_id]; \n"); //index
00511           source.append("         } \n");
00512 
00513           source.append("         barrier(CLK_LOCAL_MEM_FENCE); \n");
00514 
00515                   //performs Cooley-Tukey FFT on local array
00516           source.append("         for(unsigned int s = 0; s < bit_size; s++) { \n");
00517           source.append("             unsigned int ss = 1 << s; \n");
00518 
00519           source.append("             "); source.append(numeric_string); source.append(" cs, sn; \n");
00520 
00521           source.append("             for(unsigned int tid = lcl_id; tid < size; tid += lcl_sz) { \n");
00522           source.append("                 unsigned int group = (tid & (ss - 1)); \n");
00523           source.append("                 unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
00524 
00525           source.append("                 "); source.append(numeric_string); source.append("2 in1 = lcl_input[pos]; \n");
00526           source.append("                 "); source.append(numeric_string); source.append("2 in2 = lcl_input[pos + ss]; \n");
00527 
00528           source.append("                 "); source.append(numeric_string); source.append(" arg = group * sign * NUM_PI / ss; \n");
00529 
00530           source.append("                 sn = sincos(arg, &cs); \n");
00531           source.append("                 "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");
00532 
00533           source.append("                 "); source.append(numeric_string); source.append("2 tmp = ("); source.append(numeric_string); source.append("2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
00534 
00535           source.append("                 lcl_input[pos + ss] = in1 - tmp; \n");
00536           source.append("                 lcl_input[pos] = in1 + tmp; \n");
00537           source.append("             } \n");
00538 
00539           source.append("             barrier(CLK_LOCAL_MEM_FENCE); \n");
00540           source.append("         } \n");
00541 
00542                   //copy local array back to global memory
00543           source.append("         for(unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
00544           if (is_row_major)
00545             source.append("             input[batch_id * stride + p] = lcl_input[p]; \n");//index
00546           else
00547             source.append("             input[p * stride + batch_id] = lcl_input[p]; \n");//index
00548           source.append("         } \n");
00549           source.append("     } \n");
00550           source.append(" } \n");
00551 
00552           source.append(" \n"); 
00553 
00554           //
00555           // Performs reordering of input data in bit-reversal order
00556           // Probably it's better to do in host side,
00557           //
00558           source.append("unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) { \n");
00559           source.append("    v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
00560           source.append("    v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
00561           source.append("    v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
00562           source.append("    v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
00563           source.append("    v = (v >> 16) | (v << 16); \n");
00564 
00565           source.append("    v = v >> (32 - bit_size); \n");
00566 
00567           source.append("    return v; \n");
00568           source.append("} \n");
00569 
00570           source.append("__kernel void fft_reorder(__global "); source.append(numeric_string); source.append("2* input, \n");
00571           source.append("                          unsigned int bit_size, \n");
00572           source.append("                          unsigned int size, \n");
00573           source.append("                          unsigned int stride, \n");
00574           source.append("                          int batch_num) { \n");
00575 
00576           source.append("    unsigned int glb_id = get_global_id(0); \n");
00577           source.append("    unsigned int glb_sz = get_global_size(0); \n");
00578 
00579           source.append("    for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
00580           source.append("        for(unsigned int i = glb_id; i < size; i += glb_sz) { \n");
00581           source.append("            unsigned int v = get_reorder_num_2(i, bit_size); \n");
00582 
00583           source.append("            if(i < v) {\n");
00584           if (is_row_major)
00585           {
00586             source.append("                "); source.append(numeric_string); source.append("2 tmp = input[batch_id * stride + i]; \n"); // index
00587             source.append("                input[batch_id * stride + i] = input[batch_id * stride + v]; \n"); //index
00588             source.append("                input[batch_id * stride + v] = tmp; \n"); //index
00589           }
00590           else
00591           {
00592             source.append("                "); source.append(numeric_string); source.append("2 tmp = input[i * stride + batch_id]; \n"); // index
00593             source.append("                input[i * stride + batch_id] = input[v * stride + batch_id]; \n"); //index
00594             source.append("                input[v * stride + batch_id] = tmp; \n"); //index
00595           }
00596           source.append("            } \n");
00597           source.append("        } \n");
00598           source.append("    } \n");
00599           source.append("} \n");
00600         }
00601 
/** @brief Appends the OpenCL source of the in-place LU factorization kernel 'lu_factorize' to the supplied string.
*
* The generated kernel performs no row exchanges: for each row i and each k < i, work item 0 divides entry (i,k)
* by the diagonal entry (k,k); after a barrier all work items subtract the scaled row k from the remaining columns of row i.
*
* NOTE(review): barrier() only synchronizes the work items of a single work group, so the kernel is presumably
*               meant to be enqueued with one work group - confirm at the call site.
*
* @param source          String-like object providing append(); the kernel text is appended to it
* @param numeric_string  OpenCL scalar type of the matrix entries (e.g. "float" or "double")
* @param is_row_major    If true, row-major index arithmetic is generated, otherwise column-major
*/
template <typename StringType>
void generate_lu(StringType & source, std::string const & numeric_string, bool is_row_major)
{
  // kernel signature and the per-work-item scratch variable
  source.append("__kernel void lu_factorize( \n");
  source.append("          __global "); source.append(numeric_string); source.append(" * matrix, \n");
  source.append("          unsigned int matrix_rows, \n");
  source.append("          unsigned int matrix_cols, \n");
  source.append("          unsigned int matrix_internal_rows, \n");
  source.append("          unsigned int matrix_internal_cols) \n");
  source.append("{ \n");
  source.append("  "); source.append(numeric_string); source.append(" temp; \n");

  if (is_row_major)
  {
    source.append("  unsigned rowi; \n");
    source.append("  unsigned rowk; \n");
    source.append("  for (unsigned int i=1; i<matrix_rows; ++i) \n");
    source.append("  { \n");
    source.append("    rowi = i * matrix_internal_cols; \n");
    source.append("    for (unsigned int k=0; k<i; ++k) \n");
    source.append("    { \n");
    source.append("      rowk = k * matrix_internal_cols; \n");
    // only one work item computes the multiplier; everybody else picks it up after the barrier
    source.append("      if (get_global_id(0) == 0) \n");
    source.append("        matrix[rowi + k] /= matrix[rowk + k]; \n");

    source.append("      barrier(CLK_GLOBAL_MEM_FENCE); \n");
    source.append("      temp = matrix[rowi + k]; \n");

    //parallel subtraction:
    // j is a column index, hence it is bounded by the number of columns (same bound as in the column-major branch)
    source.append("      for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0)) \n");
    source.append("        matrix[rowi + j] -= temp * matrix[rowk + j]; \n");
  }
  else
  {
    source.append("      for (unsigned int i=1; i<matrix_rows; ++i) \n");
    source.append("      { \n");
    source.append("        for (unsigned int k=0; k<i; ++k) \n");
    source.append("        { \n");

    // only one work item computes the multiplier; everybody else picks it up after the barrier
    source.append("          if (get_global_id(0) == 0) \n");
    source.append("            matrix[i + k*matrix_internal_rows] /= matrix[k + k*matrix_internal_rows]; \n");

    source.append("          barrier(CLK_GLOBAL_MEM_FENCE); \n");
    source.append("          temp = matrix[i + k*matrix_internal_rows]; \n");

    //parallel subtraction:
    source.append("          for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0)) \n");
    source.append("            matrix[i + j*matrix_internal_rows] -= temp * matrix[k + j*matrix_internal_rows]; \n");
  }

  // close the k-loop, the i-loop and the kernel body (shared by both layouts)
  source.append("   }");
  source.append("  }");
  source.append("}");
}
00655 
00656 
/** @brief Appends the OpenCL source of the scaled rank-1 update kernel (A += alpha * vec1 * vec2^T) to the supplied string.
*
* Two flavors are generated, 'scaled_rank1_update_cpu' (scalar passed by value) and 'scaled_rank1_update_gpu'
* (scalar read from a buffer). Inside the kernel, bit 0 of 'options2' flips the sign of alpha and bit 1 selects
* division by alpha instead of multiplication.
*
* @param source          String-like object providing append(); the kernel text is appended to it
* @param numeric_string  OpenCL scalar type of the entries (e.g. "float")
* @param is_row_major    If true, row-major index arithmetic is generated for A, otherwise column-major
* @param alpha_on_cpu    If true, the scalar is a by-value kernel argument, otherwise a __global pointer
*/
template <typename StringType>
void generate_scaled_rank1_update(StringType & source, std::string const & numeric_string, bool is_row_major, bool alpha_on_cpu)
{
  // pieces of the kernel text which depend on where alpha lives
  char const * const kernel_suffix  = alpha_on_cpu ? "cpu"              : "gpu";
  char const * const val_qualifier  = alpha_on_cpu ? "  "               : "  __global const ";
  char const * const val_declarator = alpha_on_cpu ? " val, \n"         : " *val, \n";
  char const * const alpha_init     = alpha_on_cpu ? " alpha = val; \n" : " alpha = val[0]; \n";

  // the only statement which depends on the memory layout of A
  char const * const update_statement = is_row_major
    ? "      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2]; \n"
    : "      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2]; \n";

  // signature: matrix A
  source.append("__kernel void scaled_rank1_update_");
  source.append(kernel_suffix);
  source.append("( \n"
                "  __global ");
  source.append(numeric_string);
  source.append(" * A, \n"
                "  unsigned int A_start1, unsigned int A_start2, \n"
                "  unsigned int A_inc1,   unsigned int A_inc2, \n"
                "  unsigned int A_size1,  unsigned int A_size2, \n"
                "  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");

  // signature: scalar and option flags
  source.append(val_qualifier);
  source.append(numeric_string);
  source.append(val_declarator);
  source.append("  unsigned int options2, \n");

  // signature: the two vectors
  source.append("  __global const ");
  source.append(numeric_string);
  source.append(" * vec1, \n"
                "  unsigned int start1, \n"
                "  unsigned int inc1, \n"
                "  unsigned int size1, \n"
                "  __global const ");
  source.append(numeric_string);
  source.append(" * vec2, \n"
                "  unsigned int start2, \n"
                "  unsigned int inc2, \n"
                "  unsigned int size2) \n"
                "{ \n");

  // kernel body: fetch alpha and apply the sign flag
  source.append("  ");
  source.append(numeric_string);
  source.append(alpha_init);
  source.append("  if (options2 & (1 << 0)) \n"
                "    alpha = -alpha; \n"
                "  unsigned int row_gid = get_global_id(0) / get_local_size(0); \n"
                "  unsigned int col_gid = get_global_id(0) % get_local_size(0); \n"
                "  for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) \n"
                "  { \n"
                "    ");
  source.append(numeric_string);
  // the next two statements are emitted without a line break, exactly as before
  source.append(" tmp = vec1[row * inc1 + start1];"
                "    tmp = (options2 & (1 << 1)) ? tmp / alpha : tmp * alpha;"
                "    for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) \n");
  source.append(update_statement);
  source.append("  } \n"
                "} \n");
}
00708 
/** @brief Appends the OpenCL source of the kernel 'trans_vec_mul' (result = trans(A) * v) to the supplied string.
*
* For row-major A each work item accumulates complete entries of the result on its own.
* For column-major A one work group cooperates on each result entry: partial sums are stored in the
* __local buffer 'work' and combined by a halving reduction.
*
* @param source          String-like object providing append(); the kernel text is appended to it
* @param numeric_string  OpenCL scalar type of the entries (e.g. "float")
* @param is_row_major    If true, row-major index arithmetic is generated for A, otherwise column-major
*/
template <typename StringType>
void generate_trans_vec_mul(StringType & source, std::string const & numeric_string, bool is_row_major)
{
  source.append("__kernel void trans_vec_mul( \n");
  source.append("          __global const "); source.append(numeric_string); source.append(" * A, \n");
  source.append("          unsigned int A_row_start, unsigned int A_col_start, \n");
  source.append("          unsigned int A_row_inc, unsigned int A_col_inc, \n");
  source.append("          unsigned int A_row_size, unsigned int A_col_size, \n");
  source.append("          unsigned int A_internal_rows, unsigned int A_internal_cols, \n");
  source.append("          __global const "); source.append(numeric_string); source.append(" * v, \n");
  source.append("          unsigned int v_start, unsigned int v_inc, unsigned int v_size, \n");
  source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append("          unsigned int result_start, unsigned int result_inc, unsigned int result_size, \n");
  source.append("          __local "); source.append(numeric_string); source.append(" * work) \n");
  source.append("{ \n");
  if (is_row_major)
  {
    // one work item per result entry, no local memory involved
    source.append("  for (unsigned int row = get_global_id(0); row < A_col_size; row += get_global_size(0)) \n");
    source.append("  { \n");
    source.append("    "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
    source.append("    for (unsigned int col = 0; col < A_row_size; ++col) \n");
    source.append("      dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col]; \n");
    source.append("    result[row * result_inc + result_start] = dot_prod; \n");
  }
  else
  {
    source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
    source.append("  unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
    source.append("  unsigned int lid = get_local_id(0); \n");

    source.append("  for (unsigned int row = row_gid; row < A_col_size; row += get_num_groups(0)) \n");
    source.append("  { \n");
    source.append("    "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
    source.append("    for (unsigned int col = col_gid; col < A_row_size; col+=get_local_size(0)) \n");
    source.append("      dot_prod += A[(row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col]; \n");
    source.append("    work[lid] = dot_prod; \n");

    // halving reduction of the partial sums within the work group
    source.append("    for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1){ \n");
    source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
    source.append("      if(lid < stride) \n");
    source.append("        work[lid] += work[lid+stride]; \n");
    source.append("    } \n");

    source.append("    if(lid == 0) \n");
    source.append("      result[row * result_inc + result_start] = work[0]; \n");

    // 'work' is refilled in the next pass of the row loop: without this barrier a work item could overwrite
    // its slot while work item 0 is still reading it for the final reduction step of the current row.
    source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
  }
  source.append("  } \n");
  source.append("} \n");
}
00758 
/** @brief Appends the OpenCL source of the kernel 'triangular_substitute_inplace' to the supplied string.
*
* The generated kernel overwrites v with the solution of a triangular system with matrix A. The kernel argument
* 'options' is decoded inside the kernel: bit 0 = unit diagonal, bit 1 = transposed access to A, bit 2 = lower solve.
*
* @param source          String-like object providing append(); the kernel text is appended to it
* @param numeric_string  OpenCL scalar type of the entries (e.g. "float")
* @param is_row_major    If true, row-major index arithmetic is generated for A, otherwise column-major
*/
template <typename StringType>
void generate_triangular_substitute_inplace(StringType & source, std::string const & numeric_string, bool is_row_major)
{
  // layout-dependent statements: division by the diagonal entry and the elimination update
  char const * diagonal_division;
  char const * elimination_update;
  if (is_row_major)
  {
    diagonal_division  = "        v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; \n";
    elimination_update = "      v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row  * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)) \n"
                         "                                                                : ((elim * A_inc1 + A_start1) * A_internal_size2 + (row  * A_inc2 + A_start2))]; \n";
  }
  else
  {
    diagonal_division  = "        v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; \n";
    elimination_update = "      v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row  * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1) \n"
                         "                                                                : ((elim * A_inc1 + A_start1) + (row  * A_inc2 + A_start2) * A_internal_size1)]; \n";
  }

  // kernel signature
  source.append("__kernel void triangular_substitute_inplace( \n"
                "          __global ");
  source.append(numeric_string);
  source.append(" * A, \n"
                "          unsigned int A_start1, unsigned int A_start2, \n"
                "          unsigned int A_inc1,   unsigned int A_inc2, \n"
                "          unsigned int A_size1,  unsigned int A_size2, \n"
                "          unsigned int A_internal_size1,  unsigned int A_internal_size2, \n"
                "          __global ");
  source.append(numeric_string);
  source.append(" * v, \n"
                "          unsigned int v_start, \n"
                "          unsigned int v_inc, \n"
                "          unsigned int v_size, \n"
                "          unsigned int options) \n"
                "{ \n"
                "  ");
  source.append(numeric_string);

  // decode the option flags and walk over the rows (Note: A required to be square)
  source.append(" temp; \n"
                "  unsigned int unit_diagonal_flag  = (options & (1 << 0)); \n"
                "  unsigned int transposed_access_A = (options & (1 << 1)); \n"
                "  unsigned int is_lower_solve      = (options & (1 << 2)); \n"
                "  unsigned int row; \n"
                "  for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)  \n"
                "  { \n"
                "    row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1); \n");

  // division by the diagonal entry is carried out by a single work item
  source.append("    if (!unit_diagonal_flag) \n"
                "    { \n"
                "      barrier(CLK_GLOBAL_MEM_FENCE); \n"
                "      if (get_global_id(0) == 0) \n");
  source.append(diagonal_division);
  source.append("   } \n");

  // all work items eliminate the freshly computed unknown from the remaining entries of v
  source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n"
                "    temp = v[row * v_inc + v_start]; \n"
                "    for (int elim = (is_lower_solve ? (row + get_global_id(0) + 1) : get_global_id(0)); \n"
                "             elim < (is_lower_solve ? A_size1 : row); \n"
                "             elim += get_global_size(0)) \n");
  source.append(elimination_update);
  source.append("  } \n"
                "} \n");
}
00812 
/** @brief Appends the OpenCL source of the kernel 'vec_mul' (result = A * v) to the supplied string.
*
* For row-major A one work group cooperates on each result entry: partial sums are stored in the
* __local buffer 'work' and combined by a halving reduction.
* For column-major A each work item accumulates complete entries of the result on its own.
*
* @param source          String-like object providing append(); the kernel text is appended to it
* @param numeric_string  OpenCL scalar type of the entries (e.g. "float")
* @param is_row_major    If true, row-major index arithmetic is generated for A, otherwise column-major
*/
template <typename StringType>
void generate_vec_mul(StringType & source, std::string const & numeric_string, bool is_row_major)
{
  source.append("__kernel void vec_mul( \n");
  source.append("          __global const "); source.append(numeric_string); source.append(" * A, \n");
  source.append("          unsigned int A_row_start, unsigned int A_col_start, \n");
  source.append("          unsigned int A_row_inc, unsigned int A_col_inc, \n");
  source.append("          unsigned int A_row_size, unsigned int A_col_size, \n");
  source.append("          unsigned int A_internal_rows, unsigned int A_internal_cols, \n");
  source.append("          __global const "); source.append(numeric_string); source.append(" * v, \n");
  source.append("          unsigned int v_start, unsigned int v_inc, unsigned int v_size, \n");
  source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append("          unsigned int result_start, unsigned int result_inc, unsigned int result_size, \n");
  source.append("          __local "); source.append(numeric_string); source.append(" * work) \n");
  source.append("{ \n");
  if (is_row_major)
  {
    source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0); \n");
    source.append("  unsigned int col_gid = get_global_id(0) % get_local_size(0); \n");
    source.append("  unsigned int lid = get_local_id(0); \n");

    source.append("  for (unsigned int row = row_gid; row < A_row_size; row += get_num_groups(0)) \n");
    source.append("  { \n");
    source.append("    "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
    source.append("    for (unsigned int col = col_gid; col < A_col_size; col+=get_local_size(0)) \n");
    source.append("      dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col]; \n");
    source.append("    work[lid] = dot_prod; \n");

    // halving reduction of the partial sums within the work group
    source.append("    for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1){ \n");
    source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
    source.append("      if(lid < stride) \n");
    source.append("        work[lid] += work[lid+stride]; \n");
    source.append("    } \n");

    source.append("    if(lid == 0) \n");
    source.append("      result[row * result_inc + result_start] = work[0]; \n");

    // 'work' is refilled in the next pass of the row loop: without this barrier a work item could overwrite
    // its slot while work item 0 is still reading it for the final reduction step of the current row.
    source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
  }
  else
  {
    // one work item per result entry, no local memory involved
    source.append("    for (unsigned int row = get_global_id(0); row < A_row_size; row += get_global_size(0)) \n");
    source.append("    { \n");
    source.append("      "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
    source.append("      for (unsigned int col = 0; col < A_col_size; ++col) \n");
    source.append("        dot_prod += A[(row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * col]; \n");
    source.append("      result[row * result_inc + result_start] = dot_prod; \n");
  }
  source.append("  } \n");
  source.append("} \n");
}
00863 
00864         namespace detail
00865         {
00866           inline std::string type_to_string(viennacl::row_major)    { return "row"; }
00867           inline std::string type_to_string(viennacl::column_major) { return "col"; }
00868         }
00869 
00871 
00872         // main kernel class
00874         template <typename NumericT, typename F>
00875         struct matrix
00876         {
00877           static std::string program_name()
00878           {
00879             return viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_" + detail::type_to_string(F());
00880           }
00881 
00882           static void init(viennacl::ocl::context & ctx)
00883           {
00884             viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
00885             std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
00886             bool is_row_major = viennacl::is_row_major<F>::value;
00887 
00888             static std::map<cl_context, bool> init_done;
00889             if (!init_done[ctx.handle().get()])
00890             {
00891               std::string source;
00892               source.reserve(8192);
00893 
00894               viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
00895 
00896               // fully parametrized kernels:
00897               generate_ambm(source, numeric_string, is_row_major);
00898 
00899               // kernels with mostly predetermined skeleton:
00900               generate_assign_cpu(source, numeric_string, is_row_major);
00901               generate_diagonal_assign_cpu(source, numeric_string, is_row_major);
00902               generate_element_op(source, numeric_string, is_row_major);
00903               generate_scaled_rank1_update(source, numeric_string, is_row_major, true);
00904               generate_scaled_rank1_update(source, numeric_string, is_row_major, false);
00905               generate_trans_vec_mul(source, numeric_string, is_row_major);
00906               generate_vec_mul(source, numeric_string, is_row_major);
00907 
00908               if (numeric_string == "float" || numeric_string == "double")
00909               {
00910                 generate_fft(source, numeric_string, is_row_major);
00911                 generate_lu(source, numeric_string, is_row_major);
00912                 generate_triangular_substitute_inplace(source, numeric_string, is_row_major);
00913               }
00914 
00915               std::string prog_name = program_name();
00916               #ifdef VIENNACL_BUILD_INFO
00917               std::cout << "Creating program " << prog_name << std::endl;
00918               #endif
00919               ctx.add_program(source, prog_name);
00920               init_done[ctx.handle().get()] = true;
00921             } //if
00922           } //init
00923         };
00924 
00925       }  // namespace kernels
00926     }  // namespace opencl
00927   }  // namespace linalg
00928 }  // namespace viennacl
00929 #endif
00930