ViennaCL - The Vienna Computing Library  1.5.0
viennacl/linalg/opencl/kernels/compressed_matrix.hpp
Go to the documentation of this file.
00001 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP
00002 #define VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP
00003 
00004 #include "viennacl/tools/tools.hpp"
00005 #include "viennacl/ocl/kernel.hpp"
00006 #include "viennacl/ocl/platform.hpp"
00007 #include "viennacl/ocl/utils.hpp"
00008 
00009 #include "viennacl/linalg/opencl/common.hpp"
00010 
00013 namespace viennacl
00014 {
00015   namespace linalg
00016   {
00017     namespace opencl
00018     {
00019       namespace kernels
00020       {
00021 
00023 
/**
 * Appends the OpenCL source of the kernel `block_trans_lu_backward` to `source`.
 *
 * The kernel text is fixed apart from the scalar type, which is copied verbatim
 * from `numeric_string` and used for `elements_U`, `diagonal_U`, `result` and the
 * kernel-local scalar `result_entry`.
 *
 * In the generated code each work-group reads its column range
 * [col_start, col_stop) from `block_offsets`, walks that range from the last
 * column down to the first, and finally divides `result` by `diagonal_U`
 * over the same range.
 *
 * @param source          String-like object providing `append`; receives the kernel text.
 * @param numeric_string  Spelling of the scalar type to insert into the kernel.
 */
template <typename StringType>
void generate_compressed_matrix_block_trans_lu_backward(StringType & source, std::string const & numeric_string)
{
  // Kernel signature. The *_U arrays describe the U part (note that U is transposed in memory).
  source.append(
    "__kernel void block_trans_lu_backward( \n"
    "          __global const unsigned int * row_jumper_U,  \n"
    "          __global const unsigned int * column_indices_U, \n"
    "          __global const ");
  source.append(numeric_string);
  source.append(
    " * elements_U, \n"
    "          __global const ");
  source.append(numeric_string);
  source.append(
    " * diagonal_U, \n"
    "          __global const unsigned int * block_offsets, \n"
    "          __global ");
  source.append(numeric_string);

  // Remaining parameters and the per-work-group column range.
  source.append(
    " * result, \n"
    "          unsigned int size) \n"
    "{ \n"
    "  unsigned int col_start = block_offsets[2*get_group_id(0)]; \n"
    "  unsigned int col_stop  = block_offsets[2*get_group_id(0)+1]; \n"
    "  unsigned int row_start; \n"
    "  unsigned int row_stop; \n"
    "  ");
  source.append(numeric_string);

  source.append(
    " result_entry = 0; \n"
    // empty block: nothing to do for this work-group
    "  if (col_start >= col_stop) \n"
    "    return; \n"
    // backward elimination, using U and diagonal_U
    "  for (unsigned int iter = 0; iter < col_stop - col_start; ++iter) \n"
    "  { \n"
    "    unsigned int col = (col_stop - iter) - 1; \n"
    "    result_entry = result[col] / diagonal_U[col]; \n"
    "    row_start = row_jumper_U[col]; \n"
    "    row_stop  = row_jumper_U[col + 1]; \n"
    "    for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n"
    "      result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index]; \n"
    "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
    "  } \n"
    // divide result vector by diagonal
    "  for (unsigned int col = col_start + get_local_id(0); col < col_stop; col += get_local_size(0)) \n"
    "    result[col] /= diagonal_U[col]; \n"
    "} \n");
}
00062 
/**
 * Appends the OpenCL source of the kernel `block_trans_unit_lu_forward` to `source`.
 *
 * The scalar type used for `elements_L`, `result` and the kernel-local
 * `result_entry` is copied verbatim from `numeric_string`.
 *
 * In the generated code each work-group reads its column range
 * [col_start, col_stop) from `block_offsets` and walks it front to back,
 * subtracting `result[col] * elements_L[...]` from the entries addressed by
 * `column_indices_L`. No diagonal array is passed to this kernel.
 *
 * The kernel is closed with a plain `}`. The previous text ended in `};`, which
 * left a stray semicolon at file scope of the OpenCL program and differed from
 * every other kernel generated in this file.
 *
 * @param source          String-like object providing `append`; receives the kernel text.
 * @param numeric_string  Spelling of the scalar type to insert into the kernel.
 */
template <typename StringType>
void generate_compressed_matrix_block_trans_unit_lu_forward(StringType & source, std::string const & numeric_string)
{
  source.append("__kernel void block_trans_unit_lu_forward( \n");
  source.append("          __global const unsigned int * row_jumper_L,  \n");     //L part (note that L is transposed in memory)
  source.append("          __global const unsigned int * column_indices_L, \n");
  source.append("          __global const "); source.append(numeric_string); source.append(" * elements_L, \n");
  source.append("          __global const unsigned int * block_offsets, \n");
  source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append("          unsigned int size) \n");
  source.append("{ \n");
  source.append("  unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
  source.append("  unsigned int col_stop  = block_offsets[2*get_group_id(0)+1]; \n");
  source.append("  unsigned int row_start = row_jumper_L[col_start]; \n");
  source.append("  unsigned int row_stop; \n");
  source.append("  "); source.append(numeric_string); source.append(" result_entry = 0; \n");

  // empty block: nothing to do for this work-group
  source.append("  if (col_start >= col_stop) \n");
  source.append("    return; \n");

  //forward elimination, using L:
  source.append("  for (unsigned int col = col_start; col < col_stop; ++col) \n");
  source.append("  { \n");
  source.append("    result_entry = result[col]; \n");
  source.append("    row_stop = row_jumper_L[col + 1]; \n");
  source.append("    for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n");
  source.append("      result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index]; \n");
  source.append("    row_start = row_stop; \n"); //for next iteration (avoid unnecessary loads from GPU RAM)
  source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append("  } \n");

  // close the kernel body without a trailing ';'
  source.append("} \n");
}
00096 
00097         namespace detail
00098         {
00100           template <typename StringType>
00101           void generate_compressed_matrix_dense_matrix_mult(StringType & source, std::string const & numeric_string,
00102                                                             bool B_transposed, bool B_row_major, bool C_row_major)
00103           {
00104             source.append("__kernel void ");
00105             source.append(viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major));
00106             source.append("( \n");
00107             source.append("          __global const unsigned int * sp_mat_row_indices, \n");
00108             source.append("          __global const unsigned int * sp_mat_col_indices, \n");
00109             source.append("          __global const "); source.append(numeric_string); source.append(" * sp_mat_elements, \n");
00110             source.append("          __global const "); source.append(numeric_string); source.append(" * d_mat, \n");
00111             source.append("          unsigned int d_mat_row_start, \n");
00112             source.append("          unsigned int d_mat_col_start, \n");
00113             source.append("          unsigned int d_mat_row_inc, \n");
00114             source.append("          unsigned int d_mat_col_inc, \n");
00115             source.append("          unsigned int d_mat_row_size, \n");
00116             source.append("          unsigned int d_mat_col_size, \n");
00117             source.append("          unsigned int d_mat_internal_rows, \n");
00118             source.append("          unsigned int d_mat_internal_cols, \n");
00119             source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
00120             source.append("          unsigned int result_row_start, \n");
00121             source.append("          unsigned int result_col_start, \n");
00122             source.append("          unsigned int result_row_inc, \n");
00123             source.append("          unsigned int result_col_inc, \n");
00124             source.append("          unsigned int result_row_size, \n");
00125             source.append("          unsigned int result_col_size, \n");
00126             source.append("          unsigned int result_internal_rows, \n");
00127             source.append("          unsigned int result_internal_cols) { \n");
00128 
00129               // split work rows (sparse matrix rows) to thread groups
00130             source.append("  for (unsigned int row = get_group_id(0); row < result_row_size; row += get_num_groups(0)) { \n");
00131 
00132             source.append("    unsigned int row_start = sp_mat_row_indices[row]; \n");
00133             source.append("    unsigned int row_end = sp_mat_row_indices[row+1]; \n");
00134 
00135                 // split result cols between threads in a thread group
00136             source.append("    for ( unsigned int col = get_local_id(0); col < result_col_size; col += get_local_size(0) ) { \n");
00137 
00138             source.append("      "); source.append(numeric_string); source.append(" r = 0; \n");
00139 
00140             source.append("      for (unsigned int k = row_start; k < row_end; k ++) { \n");
00141 
00142             source.append("        unsigned int j = sp_mat_col_indices[k]; \n");
00143             source.append("        "); source.append(numeric_string); source.append(" x = sp_mat_elements[k]; \n");
00144 
00145             source.append("        "); source.append(numeric_string);
00146             if (B_transposed && B_row_major)
00147               source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start +   j * d_mat_col_inc ]; \n");
00148             else if (B_transposed && !B_row_major)
00149               source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc)                       + (d_mat_col_start +  j * d_mat_col_inc) * d_mat_internal_rows ]; \n");
00150             else if (!B_transposed && B_row_major)
00151               source.append(" y = d_mat[ (d_mat_row_start +   j * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + col * d_mat_col_inc ]; \n");
00152             else
00153               source.append(" y = d_mat[ (d_mat_row_start +   j * d_mat_row_inc)                       + (d_mat_col_start + col * d_mat_col_inc) * d_mat_internal_rows ]; \n");
00154             source.append("        r += x * y; \n");
00155             source.append("      } \n");
00156 
00157             if (C_row_major)
00158               source.append("      result[ (result_row_start + row * result_row_inc) * result_internal_cols + result_col_start + col * result_col_inc ] = r; \n");
00159             else
00160               source.append("      result[ (result_row_start + row * result_row_inc)                        + (result_col_start + col * result_col_inc) * result_internal_rows ] = r; \n");
00161             source.append("    } \n");
00162             source.append("  } \n");
00163 
00164             source.append("} \n");
00165 
00166           }
00167         }
00168         template <typename StringType>
00169         void generate_compressed_matrix_dense_matrix_multiplication(StringType & source, std::string const & numeric_string)
00170         {
00171           detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, false, false);
00172           detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, false,  true);
00173           detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false,  true, false);
00174           detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false,  true,  true);
00175 
00176           detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, false, false);
00177           detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, false,  true);
00178           detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true,  true, false);
00179           detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true,  true,  true);
00180         }
00181 
/**
 * Appends the OpenCL source of the kernel `jacobi` to `source`.
 *
 * In the generated code each work-item handles rows i = get_global_id(0),
 * stepping by get_global_size(0). For every row it accumulates
 * `elements[j] * old_result[col]` over the off-diagonal entries, picks up the
 * diagonal entry, and writes
 * `weight * (rhs[i]-sum) / diag + (1-weight) * old_result[i]` to `new_result[i]`.
 *
 * Two corrections compared to the previous kernel text:
 *  - `col` is declared `unsigned int`, matching `column_indices` and the row
 *    counter `i` it is compared with (it used to be a signed `int`).
 *  - `diag` is reset to 1 at the start of every row. Previously it was only
 *    initialised once, so a row without a stored diagonal entry reused the
 *    diagonal of whichever row the same work-item had processed before.
 *
 * @param source          String-like object providing `append`; receives the kernel text.
 * @param numeric_string  Spelling of the scalar type to insert into the kernel.
 */
template <typename StringType>
void generate_compressed_matrix_jacobi(StringType & source, std::string const & numeric_string)
{
  source.append(" __kernel void jacobi( \n");
  source.append("  __global const unsigned int * row_indices, \n");
  source.append("  __global const unsigned int * column_indices, \n");
  source.append("  __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append("  "); source.append(numeric_string); source.append(" weight, \n");
  source.append("  __global const "); source.append(numeric_string); source.append(" * old_result, \n");
  source.append("  __global "); source.append(numeric_string); source.append(" * new_result, \n");
  source.append("  __global const "); source.append(numeric_string); source.append(" * rhs, \n");
  source.append("  unsigned int size) \n");
  source.append("  { \n");
  source.append("   "); source.append(numeric_string); source.append(" sum, diag=1; \n");
  source.append("   unsigned int col; \n");   // same type as column_indices[] and as the row counter i
  source.append("   for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n");
  source.append("   { \n");
  source.append("     sum = 0; \n");
  source.append("     diag = 1; \n");         // per-row default; do not carry over a previous row's diagonal
  source.append("     for (unsigned int j = row_indices[i]; j<row_indices[i+1]; j++) \n");
  source.append("     { \n");
  source.append("       col = column_indices[j]; \n");
  source.append("       if (i == col) \n");
  source.append("         diag = elements[j]; \n");
  source.append("       else \n");
  source.append("         sum += elements[j] * old_result[col]; \n");
  source.append("     } \n");
  source.append("       new_result[i] = weight * (rhs[i]-sum) / diag + (1-weight) * old_result[i]; \n");
  source.append("    } \n");
  source.append("  } \n");
}
00214 
// Appends the OpenCL source of the kernel `lu_backward` to `source`.
//
// `numeric_string` is inserted verbatim as the scalar type of `elements`, `vector`,
// the two __local value buffers and the kernel-local scalars.
//
// Structure of the generated kernel, as written below:
//  - The matrix entries are processed in windows of get_local_size(0) entries, starting
//    with the window containing the last entry and moving towards index 0.
//  - All work-items of the group load one entry each (value, column index and the vector
//    value at that column) into __local buffers.
//  - Work-item 0 alone then walks the window from its last entry to its first, updating
//    `current_vector_entry` and writing `vector[current_row]` whenever a row is complete.
//  - The three __local buffers hold 128 entries and are indexed by get_local_id(0), so the
//    kernel relies on a work-group size of at most 128.
//  - Only local ids are used (no group id). NOTE(review): presumably launched with a single
//    work-group -- confirm at the call site.
//  - NOTE(review): `size-1` and `nnz - 1` are computed on unsigned values, so the kernel
//    text assumes size > 0 and nnz > 0 -- confirm the caller guarantees this.
00215         template <typename StringType>
00216         void generate_compressed_matrix_lu_backward(StringType & source, std::string const & numeric_string)
00217         {
00218           // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format
00219           source.append("__kernel void lu_backward( \n");
00220           source.append("          __global const unsigned int * row_indices, \n");
00221           source.append("          __global const unsigned int * column_indices, \n");
00222           source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
00223           source.append("          __global "); source.append(numeric_string); source.append(" * vector, \n");
00224           source.append("          unsigned int size) \n");
00225           source.append("{ \n");
00226           source.append("  __local unsigned int col_index_buffer[128]; \n");
00227           source.append("  __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
00228           source.append("  __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");
00229 
00230           source.append("  unsigned int nnz = row_indices[size]; \n");
00231           source.append("  unsigned int current_row = size-1; \n");
00232           source.append("  unsigned int row_at_window_start = size-1; \n");
00233           source.append("  "); source.append(numeric_string); source.append(" current_vector_entry = vector[size-1]; \n");
00234           source.append("  "); source.append(numeric_string); source.append(" diagonal_entry = 0; \n");
00235           source.append("  unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n"); // first index of the window holding the last entry
00236           source.append("  unsigned int next_row = row_indices[size-1]; \n"); // entry index at which the current row is complete
00237 
00238           source.append("  unsigned int i = loop_end + get_local_id(0); \n");
00239           source.append("  while (1) \n");
00240           source.append("  { \n");
00241               //load into local (shared) memory (coalesced access):
00242           source.append("    if (i < nnz) \n");
00243           source.append("    { \n");
00244           source.append("      element_buffer[get_local_id(0)] = elements[i]; \n");
00245           source.append("      unsigned int tmp = column_indices[i]; \n");
00246           source.append("      col_index_buffer[get_local_id(0)] = tmp; \n");
00247           source.append("      vector_buffer[get_local_id(0)] = vector[tmp]; \n");
00248           source.append("    } \n");
00249 
00250           source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
00251 
00252               //now a single thread does the remaining work in shared memory:
00253           source.append("    if (get_local_id(0) == 0) \n");
00254           source.append("    { \n");
00255                 // traverse through all the loaded data from back to front:
00256           source.append("      for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
00257           source.append("      { \n");
00258           source.append("        unsigned int k = (get_local_size(0) - k2) - 1; \n");
00259 
00260           source.append("        if (i+k >= nnz) \n"); // slot past the last matrix entry: nothing was loaded for it
00261           source.append("          continue; \n");
00262 
00263           source.append("        if (col_index_buffer[k] > row_at_window_start) \n"); //column belongs to a row finished before this window was loaded: the buffered vector value is valid
00264           source.append("          current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
00265           source.append("        else if (col_index_buffer[k] > current_row) \n"); //column belongs to a row finished within this window: read the freshly written value from `vector`
00266           source.append("          current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
00267           source.append("        else if (col_index_buffer[k] == current_row) \n"); //diagonal entry of the current row
00268           source.append("          diagonal_entry = element_buffer[k]; \n");
00269 
00270           source.append("        if (i+k == next_row) \n"); //current row is finished. Write back result
00271           source.append("        { \n");
00272           source.append("          vector[current_row] = current_vector_entry / diagonal_entry; \n");
00273           source.append("          if (current_row > 0) //load next row's data \n");
00274           source.append("          { \n");
00275           source.append("            --current_row; \n");
00276           source.append("            next_row = row_indices[current_row]; \n");
00277           source.append("            current_vector_entry = vector[current_row]; \n");
00278           source.append("          } \n");
00279           source.append("        } \n");
00280 
00281 
00282           source.append("      } \n"); // for k
00283 
00284           source.append("      row_at_window_start = current_row; \n");
00285           source.append("    } \n"); // if (get_local_id(0) == 0)
00286 
00287           source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
00288 
00289           source.append("    if (i < get_local_size(0)) \n"); // the window starting at entry 0 has just been processed
00290           source.append("      break; \n");
00291 
00292           source.append("    i -= get_local_size(0); \n"); // move to the preceding window
00293           source.append("  } \n"); // while (1)
00294           source.append("} \n");
00295 
00296         }
00297 
// Appends the OpenCL source of the kernel `lu_forward` to `source`.
//
// `numeric_string` is inserted verbatim as the scalar type of `elements`, `vector`,
// the two __local value buffers and the kernel-local scalars.
//
// Structure of the generated kernel, as written below:
//  - The matrix entries are processed in windows of get_local_size(0) entries, from
//    index 0 upwards; `loop_end` always adds one extra window after the one containing
//    the last full block of entries.
//  - All work-items of the group load one entry each (value, column index and the vector
//    value at that column) into __local buffers.
//  - Work-item 0 alone then walks the window front to back, updating
//    `current_vector_entry` and writing `vector[current_row]` whenever a row is complete.
//  - The three __local buffers hold 128 entries and are indexed by get_local_id(0), so the
//    kernel relies on a work-group size of at most 128.
//  - Only local ids are used (no group id). NOTE(review): presumably launched with a single
//    work-group -- confirm at the call site.
00298         template <typename StringType>
00299         void generate_compressed_matrix_lu_forward(StringType & source, std::string const & numeric_string)
00300         {
00301 
00302           // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
00303           source.append("__kernel void lu_forward( \n");
00304           source.append("          __global const unsigned int * row_indices, \n");
00305           source.append("          __global const unsigned int * column_indices, \n");
00306           source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
00307           source.append("          __global "); source.append(numeric_string); source.append(" * vector, \n");
00308           source.append("          unsigned int size) \n");
00309           source.append("{ \n");
00310           source.append("  __local unsigned int col_index_buffer[128]; \n");
00311           source.append("  __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
00312           source.append("  __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");
00313 
00314           source.append("  unsigned int nnz = row_indices[size]; \n");
00315           source.append("  unsigned int current_row = 0; \n");
00316           source.append("  unsigned int row_at_window_start = 0; \n");
00317           source.append("  "); source.append(numeric_string); source.append(" current_vector_entry = vector[0]; \n");
00318           source.append("  "); source.append(numeric_string); source.append(" diagonal_entry; \n"); // NOTE(review): no initializer here, whereas lu_backward emits `diagonal_entry = 0`; it is read when a row completes, so a row without a stored diagonal reads an unset value
00319           source.append("  unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
00320           source.append("  unsigned int next_row = row_indices[1]; \n"); // entry index at which the current row is complete
00321 
00322           source.append("  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
00323           source.append("  { \n");
00324               //load into local (shared) memory (coalesced access):
00325           source.append("    if (i < nnz) \n");
00326           source.append("    { \n");
00327           source.append("      element_buffer[get_local_id(0)] = elements[i]; \n");
00328           source.append("      unsigned int tmp = column_indices[i]; \n");
00329           source.append("      col_index_buffer[get_local_id(0)] = tmp; \n");
00330           source.append("      vector_buffer[get_local_id(0)] = vector[tmp]; \n");
00331           source.append("    } \n");
00332 
00333           source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
00334 
00335               //now a single thread does the remaining work in shared memory:
00336           source.append("    if (get_local_id(0) == 0) \n");
00337           source.append("    { \n");
00338                 // traverse through all the loaded data:
00339           source.append("      for (unsigned int k=0; k<get_local_size(0); ++k) \n");
00340           source.append("      { \n");
00341           source.append("        if (current_row < size && i+k == next_row) \n"); //current row is finished. Write back result
00342           source.append("        { \n");
00343           source.append("          vector[current_row] = current_vector_entry / diagonal_entry; \n");
00344           source.append("          ++current_row; \n");
00345           source.append("          if (current_row < size) \n"); //load next row's data
00346           source.append("          { \n");
00347           source.append("            next_row = row_indices[current_row+1]; \n");
00348           source.append("            current_vector_entry = vector[current_row]; \n");
00349           source.append("          } \n");
00350           source.append("        } \n");
00351 
00352           source.append("        if (current_row < size && col_index_buffer[k] < current_row) \n"); //substitute
00353           source.append("        { \n");
00354           source.append("          if (col_index_buffer[k] < row_at_window_start) \n"); //column belongs to a row finished before this window was loaded: the buffered vector value is valid
00355           source.append("            current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
00356           source.append("          else if (col_index_buffer[k] < current_row) \n"); //column belongs to a row finished within this window: read the freshly written value from `vector` (condition already implied by the enclosing if)
00357           source.append("            current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
00358           source.append("        } \n");
00359           source.append("        else if (col_index_buffer[k] == current_row) \n"); //diagonal entry of the current row
00360           source.append("          diagonal_entry = element_buffer[k]; \n");
00361 
00362           source.append("      } \n"); // for k
00363 
00364           source.append("      row_at_window_start = current_row; \n");
00365           source.append("    } \n"); // if (get_local_id(0) == 0)
00366 
00367           source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
00368           source.append("  } \n"); //for i
00369           source.append("} \n");
00370 
00371         }
00372 
/**
 * Appends the OpenCL source of the kernel `row_info_extractor` to `source`.
 *
 * In the generated code each work-item handles rows row = get_global_id(0),
 * stepping by get_global_size(0), and writes one value per row to `result`.
 * The kernel argument `option` selects what is computed from the row's entries:
 *   0: maximum of fabs(elements[i])      (inf-norm)
 *   1: sum of fabs(elements[i])          (1-norm)
 *   2: sqrt of the sum of squares        (2-norm)
 *   3: the entry whose column index equals the row index (diagonal entry)
 * Any other value of `option` leaves the result at 0.
 *
 * @param source          String-like object providing `append`; receives the kernel text.
 * @param numeric_string  Spelling of the scalar type to insert into the kernel.
 */
template <typename StringType>
void generate_compressed_matrix_row_info_extractor(StringType & source, std::string const & numeric_string)
{
  // signature
  source.append(
    "__kernel void row_info_extractor( \n"
    "          __global const unsigned int * row_indices, \n"
    "          __global const unsigned int * column_indices, \n"
    "          __global const ");
  source.append(numeric_string);
  source.append(
    " * elements, \n"
    "          __global ");
  source.append(numeric_string);
  source.append(
    " * result, \n"
    "          unsigned int size, \n"
    "          unsigned int option \n"
    "          ) \n"
    "{ \n"
    "  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0)) \n"
    "  { \n"
    "    ");
  source.append(numeric_string);

  // per-row accumulator and the option dispatch
  source.append(
    " value = 0; \n"
    "    unsigned int row_end = row_indices[row+1]; \n"
    "    switch (option) \n"
    "    { \n"
    // inf-norm
    "      case 0: \n"
    "        for (unsigned int i = row_indices[row]; i < row_end; ++i) \n"
    "          value = max(value, fabs(elements[i])); \n"
    "        break; \n"
    // 1-norm
    "      case 1: \n"
    "        for (unsigned int i = row_indices[row]; i < row_end; ++i) \n"
    "          value += fabs(elements[i]); \n"
    "        break; \n"
    // 2-norm
    "      case 2: \n"
    "        for (unsigned int i = row_indices[row]; i < row_end; ++i) \n"
    "          value += elements[i] * elements[i]; \n"
    "        value = sqrt(value); \n"
    "        break; \n"
    // diagonal entry
    "      case 3: \n"
    "        for (unsigned int i = row_indices[row]; i < row_end; ++i) \n"
    "        { \n"
    "          if (column_indices[i] == row) \n"
    "          { \n"
    "            value = elements[i]; \n"
    "            break; \n"
    "          } \n"
    "        } \n"
    "        break; \n"
    "      default: \n"
    "        break; \n"
    "    } \n"
    "    result[row] = value; \n"
    "  } \n"
    "} \n");
}
00427 
// Appends the OpenCL source of the kernel `trans_lu_backward` to `source`.
//
// `numeric_string` is inserted verbatim as the scalar type of `elements`,
// `diagonal_entries`, `vector` and the kernel-local scalars.
//
// Structure of the generated kernel, as written below:
//  - The matrix entries are visited from the last entry to the first (i = nnz - i2 - 1),
//    one entry per work-item and get_local_size(0) entries per pass.
//  - Each work-item finds the row of its entry by scanning `row_index_lookahead`, which
//    holds row_indices[] for the rows at and below `row_at_window_start`.
//  - For every row covered by the pass (highest row first), the work-items whose entry
//    lies in that row and has a column index below the row subtract
//    `vector[row] / diagonal_entries[row] * matrix_entry` from `vector[col_index]`.
//  - Finally `vector` is divided element-wise by `diagonal_entries`.
//  - The two __local arrays hold 256 entries and are indexed by get_local_id(0), so the
//    kernel relies on a work-group size of at most 256.
//  - Only local ids are used (no group id). NOTE(review): presumably launched with a single
//    work-group -- confirm at the call site.
00428         template <typename StringType>
00429         void generate_compressed_matrix_trans_lu_backward(StringType & source, std::string const & numeric_string)
00430         {
00431 
00432           // backward elimination on `vector` in place, followed by a division by the diagonal entries.
00433           // NOTE(review): this comment previously read "compute y in Ly = z", which looks copied from the forward-substitution generator.
00433           source.append("__kernel void trans_lu_backward( \n");
00434           source.append("          __global const unsigned int * row_indices, \n");
00435           source.append("          __global const unsigned int * column_indices, \n");
00436           source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
00437           source.append("          __global const "); source.append(numeric_string); source.append(" * diagonal_entries, \n");
00438           source.append("          __global "); source.append(numeric_string); source.append(" * vector, \n");
00439           source.append("          unsigned int size) \n");
00440           source.append("{ \n");
00441           source.append("  __local unsigned int row_index_lookahead[256]; \n");
00442           source.append("  __local unsigned int row_index_buffer[256]; \n");
00443 
00444           source.append("  unsigned int row_index; \n");
00445           source.append("  unsigned int col_index; \n");
00446           source.append("  "); source.append(numeric_string); source.append(" matrix_entry; \n");
00447           source.append("  unsigned int nnz = row_indices[size]; \n");
00448           source.append("  unsigned int row_at_window_start = size; \n");
00449           source.append("  unsigned int row_at_window_end; \n");
00450           source.append("  unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n"); // nnz rounded up to a multiple of the work-group size
00451 
00452           source.append("  for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
00453           source.append("  { \n");
00454           source.append("    unsigned int i = (nnz - i2) - 1; \n"); // entries are visited from the back
00455           source.append("    col_index    = (i2 < nnz) ? column_indices[i] : 0; \n");
00456           source.append("    matrix_entry = (i2 < nnz) ? elements[i]       : 0; \n");
00457           source.append("    row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");
00458 
00459           source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
00460 
00461           source.append("    if (i2 < nnz) \n");
00462           source.append("    { \n");
00463           source.append("      unsigned int row_index_dec = 0; \n");
00464           source.append("      while (row_index_lookahead[row_index_dec] > i) \n"); // step down until the row start is not beyond entry i
00465           source.append("        ++row_index_dec; \n");
00466           source.append("      row_index = row_at_window_start - row_index_dec; \n");
00467           source.append("      row_index_buffer[get_local_id(0)] = row_index; \n");
00468           source.append("    } \n");
00469           source.append("    else \n"); // work-item has no entry in this pass
00470           source.append("    { \n");
00471           source.append("      row_index = size+1; \n");
00472           source.append("      row_index_buffer[get_local_id(0)] = 0; \n");
00473           source.append("    } \n");
00474 
00475           source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
00476 
00477           source.append("    row_at_window_start = row_index_buffer[0]; \n");
00478           source.append("    row_at_window_end   = row_index_buffer[get_local_size(0) - 1]; \n");
00479 
00480               //backward elimination
00481           source.append("    for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
00482           source.append("    { \n");
00483           source.append("      unsigned int row = row_at_window_start - row2; \n");
00484           source.append("      "); source.append(numeric_string); source.append(" result_entry = vector[row] / diagonal_entries[row]; \n");
00485 
00486           source.append("      if ( (row_index == row) && (col_index < row) ) \n");
00487           source.append("        vector[col_index] -= result_entry * matrix_entry; \n");
00488 
00489           source.append("      barrier(CLK_GLOBAL_MEM_FENCE); \n");
00490           source.append("    } \n");
00491 
00492           source.append("    row_at_window_start = row_at_window_end; \n");
00493           source.append("  } \n");
00494 
00495             // final step: Divide vector by diagonal entries:
00496           source.append("  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
00497           source.append("    vector[i] /= diagonal_entries[i]; \n");
00498           source.append("} \n");
00499 
00500         }
00501 
/** @brief Appends the OpenCL C source of the kernel 'trans_lu_forward' to the supplied string.
*
* The generated kernel traverses the nonzeros referenced by row_indices / column_indices / elements
* front to back in windows of get_local_size(0) entries. Each work-item derives the row of its entry
* from a look-ahead table of row start offsets, then every row covered by the window is eliminated:
* entries with col_index > row subtract (vector[row] / diagonal_entries[row]) * matrix_entry from
* vector[col_index]. A final pass divides vector by diagonal_entries.
*
* NOTE(review): the kernel only uses local ids and work-group barriers, so it presumably has to be
* launched with a single work-group of at most 256 work-items (the size of the __local tables) - confirm
* against the host-side launch code.
*
* @tparam StringType      String type providing append()
* @param source           String the kernel source is appended to
* @param numeric_string   Name of the OpenCL scalar type used for matrix and vector entries
*/
template <typename StringType>
void generate_compressed_matrix_trans_lu_forward(StringType & source, std::string const & numeric_string)
{

  // forward substitution sweep working on the nonzeros in storage order
  source.append("__kernel void trans_lu_forward( \n");
  source.append("          __global const unsigned int * row_indices, \n");
  source.append("          __global const unsigned int * column_indices, \n");
  source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append("          __global const "); source.append(numeric_string); source.append(" * diagonal_entries, \n");
  source.append("          __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append("          unsigned int size) \n");
  source.append("{ \n");
  source.append("  __local unsigned int row_index_lookahead[256]; \n");
  source.append("  __local unsigned int row_index_buffer[256]; \n");

  source.append("  unsigned int row_index; \n");
  source.append("  unsigned int col_index; \n");
  source.append("  "); source.append(numeric_string); source.append(" matrix_entry; \n");
  source.append("  unsigned int nnz = row_indices[size]; \n");
  source.append("  unsigned int row_at_window_start = 0; \n");
  source.append("  unsigned int row_at_window_end = 0; \n");
  // nnz rounded up to a multiple of the work-group size so that all work-items reach the barriers equally often.
  // The guard avoids the unsigned wrap-around of (nnz - 1) for an empty matrix.
  source.append("  unsigned int loop_end = (nnz > 0) ? ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0) : 0; \n");

  source.append("  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
  source.append("  { \n");
  source.append("    col_index    = (i < nnz) ? column_indices[i] : 0; \n");
  source.append("    matrix_entry = (i < nnz) ? elements[i]       : 0; \n");
  // look-ahead table of row start offsets; slots past the last row are padded with nnz (== row_indices[size])
  // so that the scan below terminates for entries of the last row
  source.append("    row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : nnz; \n");

  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append("    if (i < nnz) \n");
  source.append("    { \n");
  // NOTE(review): the scan reads slot row_index_inc + 1 without a bounds check - confirm that the launch
  // configuration keeps it inside the 256-entry table
  source.append("      unsigned int row_index_inc = 0; \n");
  source.append("      while (i >= row_index_lookahead[row_index_inc + 1]) \n");
  source.append("        ++row_index_inc; \n");
  source.append("      row_index = row_at_window_start + row_index_inc; \n");
  source.append("      row_index_buffer[get_local_id(0)] = row_index; \n");
  source.append("    } \n");
  source.append("    else \n");
  source.append("    { \n");
  // padding work-items: a row index no elimination step matches, and the last row as window end
  source.append("      row_index = size+1; \n");
  source.append("      row_index_buffer[get_local_id(0)] = size - 1; \n");
  source.append("    } \n");

  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append("    row_at_window_start = row_index_buffer[0]; \n");
  source.append("    row_at_window_end   = row_index_buffer[get_local_size(0) - 1]; \n");

      //forward elimination
  source.append("    for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
  source.append("    { \n");
  source.append("      "); source.append(numeric_string); source.append(" result_entry = vector[row] / diagonal_entries[row]; \n");

  source.append("      if ( (row_index == row) && (col_index > row) ) \n");
  source.append("        vector[col_index] -= result_entry * matrix_entry; \n");

  source.append("      barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append("    } \n");

  source.append("    row_at_window_start = row_at_window_end; \n");
  source.append("  } \n");

    // final step: Divide vector by diagonal entries:
  source.append("  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
  source.append("    vector[i] /= diagonal_entries[i]; \n");
  source.append("} \n");

}
00573 
/** @brief Appends the OpenCL C source of the kernel 'trans_unit_lu_backward' to the supplied string.
*
* The generated kernel traverses the nonzeros referenced by row_indices / column_indices / elements
* back to front (entry nnz - 1 first) in windows of get_local_size(0) entries. Each work-item derives
* the row of its entry from a look-ahead table of row start offsets, then the rows covered by the window
* are processed from the highest to the lowest index: entries with col_index < row subtract
* vector[row] * matrix_entry from vector[col_index]. No diagonal is involved (unit diagonal).
*
* NOTE(review): the kernel only uses local ids and work-group barriers, so it presumably has to be
* launched with a single work-group of at most 256 work-items (the size of the __local tables) - confirm
* against the host-side launch code.
*
* @tparam StringType      String type providing append()
* @param source           String the kernel source is appended to
* @param numeric_string   Name of the OpenCL scalar type used for matrix and vector entries
*/
template <typename StringType>
void generate_compressed_matrix_trans_unit_lu_backward(StringType & source, std::string const & numeric_string)
{

  // backward substitution sweep working on the nonzeros in reverse storage order (unit diagonal, no division)
  source.append("__kernel void trans_unit_lu_backward( \n");
  source.append("          __global const unsigned int * row_indices, \n");
  source.append("          __global const unsigned int * column_indices, \n");
  source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append("          __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append("          unsigned int size) \n");
  source.append("{ \n");
  source.append("  __local unsigned int row_index_lookahead[256]; \n");
  source.append("  __local unsigned int row_index_buffer[256]; \n");

  source.append("  unsigned int row_index; \n");
  source.append("  unsigned int col_index; \n");
  source.append("  "); source.append(numeric_string); source.append(" matrix_entry; \n");
  source.append("  unsigned int nnz = row_indices[size]; \n");
  source.append("  unsigned int row_at_window_start = size; \n");
  source.append("  unsigned int row_at_window_end; \n");
  // nnz rounded up to a multiple of the work-group size so that all work-items reach the barriers equally often.
  // The guard avoids the unsigned wrap-around of (nnz - 1) for an empty matrix.
  source.append("  unsigned int loop_end = (nnz > 0) ? ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0) : 0; \n");

  source.append("  for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
  source.append("  { \n");
  // i is the actual entry index, counted from the end; it is only used if i2 < nnz
  source.append("    unsigned int i = (nnz - i2) - 1; \n");
  source.append("    col_index    = (i2 < nnz) ? column_indices[i] : 0; \n");
  source.append("    matrix_entry = (i2 < nnz) ? elements[i]       : 0; \n");
  // look-ahead table of row start offsets in descending row order; slots below row 0 are padded with 0
  source.append("    row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");

  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append("    if (i2 < nnz) \n");
  source.append("    { \n");
  source.append("      unsigned int row_index_dec = 0; \n");
  source.append("      while (row_index_lookahead[row_index_dec] > i) \n");
  source.append("        ++row_index_dec; \n");
  source.append("      row_index = row_at_window_start - row_index_dec; \n");
  source.append("      row_index_buffer[get_local_id(0)] = row_index; \n");
  source.append("    } \n");
  source.append("    else \n");
  source.append("    { \n");
  // padding work-items: a row index no elimination step matches, and row 0 as window end
  source.append("      row_index = size+1; \n");
  source.append("      row_index_buffer[get_local_id(0)] = 0; \n");
  source.append("    } \n");

  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append("    row_at_window_start = row_index_buffer[0]; \n");
  source.append("    row_at_window_end   = row_index_buffer[get_local_size(0) - 1]; \n");

      //backward elimination
  source.append("    for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
  source.append("    { \n");
  source.append("      unsigned int row = row_at_window_start - row2; \n");
  source.append("      "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");

  source.append("      if ( (row_index == row) && (col_index < row) ) \n");
  source.append("        vector[col_index] -= result_entry * matrix_entry; \n");

  source.append("      barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append("    } \n");

  source.append("    row_at_window_start = row_at_window_end; \n");
  source.append("  } \n");
  source.append("} \n");

}
00642 
00643 
/** @brief Appends the OpenCL C source of the kernel 'trans_unit_lu_forward' to the supplied string.
*
* The generated kernel traverses the nonzeros referenced by row_indices / column_indices / elements
* front to back in windows of get_local_size(0) entries. Each work-item derives the row of its entry
* from a look-ahead table of row start offsets, then every row covered by the window is eliminated:
* entries with col_index > row subtract vector[row] * matrix_entry from vector[col_index].
* No diagonal is involved (unit diagonal).
*
* NOTE(review): the kernel only uses local ids and work-group barriers, so it presumably has to be
* launched with a single work-group of at most 256 work-items (the size of the __local tables) - confirm
* against the host-side launch code.
*
* @tparam StringType      String type providing append()
* @param source           String the kernel source is appended to
* @param numeric_string   Name of the OpenCL scalar type used for matrix and vector entries
*/
template <typename StringType>
void generate_compressed_matrix_trans_unit_lu_forward(StringType & source, std::string const & numeric_string)
{

  // forward substitution sweep working on the nonzeros in storage order (unit diagonal, no division)
  source.append("__kernel void trans_unit_lu_forward( \n");
  source.append("          __global const unsigned int * row_indices, \n");
  source.append("          __global const unsigned int * column_indices, \n");
  source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append("          __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append("          unsigned int size) \n");
  source.append("{ \n");
  source.append("  __local unsigned int row_index_lookahead[256]; \n");
  source.append("  __local unsigned int row_index_buffer[256]; \n");

  source.append("  unsigned int row_index; \n");
  source.append("  unsigned int col_index; \n");
  source.append("  "); source.append(numeric_string); source.append(" matrix_entry; \n");
  source.append("  unsigned int nnz = row_indices[size]; \n");
  source.append("  unsigned int row_at_window_start = 0; \n");
  source.append("  unsigned int row_at_window_end = 0; \n");
  // nnz rounded up to a multiple of the work-group size so that all work-items reach the barriers equally often.
  // The guard avoids the unsigned wrap-around of (nnz - 1) for an empty matrix.
  source.append("  unsigned int loop_end = (nnz > 0) ? ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0) : 0; \n");

  source.append("  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
  source.append("  { \n");
  source.append("    col_index    = (i < nnz) ? column_indices[i] : 0; \n");
  source.append("    matrix_entry = (i < nnz) ? elements[i]       : 0; \n");
  // look-ahead table of row start offsets; slots past the last row are padded with nnz (== row_indices[size])
  // so that the scan below terminates for entries of the last row
  source.append("    row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : nnz; \n");

  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append("    if (i < nnz) \n");
  source.append("    { \n");
  // NOTE(review): the scan reads slot row_index_inc + 1 without a bounds check - confirm that the launch
  // configuration keeps it inside the 256-entry table
  source.append("      unsigned int row_index_inc = 0; \n");
  source.append("      while (i >= row_index_lookahead[row_index_inc + 1]) \n");
  source.append("        ++row_index_inc; \n");
  source.append("      row_index = row_at_window_start + row_index_inc; \n");
  source.append("      row_index_buffer[get_local_id(0)] = row_index; \n");
  source.append("    } \n");
  source.append("    else \n");
  source.append("    { \n");
  // padding work-items: a row index no elimination step matches, and the last row as window end
  source.append("      row_index = size+1; \n");
  source.append("      row_index_buffer[get_local_id(0)] = size - 1; \n");
  source.append("    } \n");

  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append("    row_at_window_start = row_index_buffer[0]; \n");
  source.append("    row_at_window_end   = row_index_buffer[get_local_size(0) - 1]; \n");

      //forward elimination
  source.append("    for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
  source.append("    { \n");
  source.append("      "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");

  source.append("      if ( (row_index == row) && (col_index > row) ) \n");
  source.append("        vector[col_index] -= result_entry * matrix_entry; \n");

  source.append("      barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append("    } \n");

  source.append("    row_at_window_start = row_at_window_end; \n");
  source.append("  } \n");
  source.append("} \n");

}
00710 
/** @brief Appends the OpenCL C source of the kernel 'trans_unit_lu_forward_slow' to the supplied string.
*
* The generated kernel visits the rows one after another. For each row the work-items stride over the
* entries row_indices[row] ... row_indices[row+1]-1 and subtract vector[row] * elements[entry] from
* vector[col_index] for every entry with col_index > row. A barrier separates consecutive rows.
*
* NOTE(review): only local ids and a work-group barrier are used, so the kernel presumably expects to be
* launched with a single work-group - confirm against the host-side launch code.
*
* @tparam StringType      String type providing append()
* @param source           String the kernel source is appended to
* @param numeric_string   Name of the OpenCL scalar type used for matrix and vector entries
*/
template <typename StringType>
void generate_compressed_matrix_trans_unit_lu_forward_slow(StringType & source, std::string const & numeric_string)
{

  // row-by-row forward substitution sweep (unit diagonal, no division)
  source.append("__kernel void trans_unit_lu_forward_slow( \n");
  source.append("          __global const unsigned int * row_indices, \n");
  source.append("          __global const unsigned int * column_indices, \n");
  source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append("          __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append("          unsigned int size) \n");
  source.append("{ \n");
  source.append("  for (unsigned int row = 0; row < size; ++row) \n");
  source.append("  { \n");
  source.append("    "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");

  source.append("    unsigned int row_start = row_indices[row]; \n");
  source.append("    unsigned int row_stop  = row_indices[row + 1]; \n");
  // the entries of the current row are distributed over the work-items
  source.append("    for (unsigned int entry_index = row_start + get_local_id(0); entry_index < row_stop; entry_index += get_local_size(0)) \n");
  source.append("    { \n");
  source.append("      unsigned int col_index = column_indices[entry_index]; \n");
  source.append("      if (col_index > row) \n");
  source.append("        vector[col_index] -= result_entry * elements[entry_index]; \n");
  source.append("    } \n");

  // all updates of this row have to be finished before the next row reads vector[row]
  source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append("  } \n");
  source.append("} \n");

}
00741 
/** @brief Appends the OpenCL C source of the kernel 'unit_lu_backward' to the supplied string.
*
* The generated kernel walks over the nonzeros from the last window of get_local_size(0) entries to the
* first. All work-items load one entry each (value, column index and the vector value at that column)
* into __local buffers; work-item 0 then processes the window back to front, accumulating
* current_vector_entry -= element * x[col] for entries with col > current_row and writing the result to
* vector[current_row] once the first entry of that row (index row_indices[current_row]) has been handled.
* No diagonal is involved (unit diagonal).
*
* An empty matrix (nnz == 0) leaves the vector untouched; the kernel returns right away in that case.
*
* NOTE(review): the __local buffers hold 128 entries and only local ids are used, so the kernel presumably
* has to be launched with a single work-group of at most 128 work-items - confirm against the host code.
*
* @tparam StringType      String type providing append()
* @param source           String the kernel source is appended to
* @param numeric_string   Name of the OpenCL scalar type used for matrix and vector entries
*/
template <typename StringType>
void generate_compressed_matrix_unit_lu_backward(StringType & source, std::string const & numeric_string)
{

  // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format
  source.append("__kernel void unit_lu_backward( \n");
  source.append("          __global const unsigned int * row_indices, \n");
  source.append("          __global const unsigned int * column_indices, \n");
  source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append("          __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append("          unsigned int size) \n");
  source.append("{ \n");
  source.append("  __local  unsigned int col_index_buffer[128]; \n");
  source.append("  __local  "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
  source.append("  __local  "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");

  source.append("  unsigned int nnz = row_indices[size]; \n");
  // nothing to substitute without entries; also avoids the wrap-around of (nnz - 1) and of size-1 below
  source.append("  if (nnz == 0) \n");
  source.append("    return; \n");
  source.append("  unsigned int current_row = size-1; \n");
  source.append("  unsigned int row_at_window_start = size-1; \n");
  source.append("  "); source.append(numeric_string); source.append(" current_vector_entry = vector[size-1]; \n");
  // offset of the last (possibly incomplete) window
  source.append("  unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n");
  source.append("  unsigned int next_row = row_indices[size-1]; \n");

  source.append("  unsigned int i = loop_end + get_local_id(0); \n");
  source.append("  while (1) \n");
  source.append("  { \n");
      //load into shared memory (coalesced access):
  source.append("    if (i < nnz) \n");
  source.append("    { \n");
  source.append("      element_buffer[get_local_id(0)] = elements[i]; \n");
  source.append("      unsigned int tmp = column_indices[i]; \n");
  source.append("      col_index_buffer[get_local_id(0)] = tmp; \n");
  source.append("      vector_buffer[get_local_id(0)] = vector[tmp]; \n");
  source.append("    } \n");

  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

      //now a single thread does the remaining work in shared memory:
  source.append("    if (get_local_id(0) == 0) \n");
  source.append("    { \n");
      // traverse through all the loaded data from back to front:
  source.append("      for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
  source.append("      { \n");
  source.append("        unsigned int k = (get_local_size(0) - k2) - 1; \n");

  // skip the unused slots of the incomplete last window
  source.append("        if (i+k >= nnz) \n");
  source.append("          continue; \n");

  source.append("        if (col_index_buffer[k] > row_at_window_start) \n"); //row was finished before this window was loaded: use buffered data
  source.append("          current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
  source.append("        else if (col_index_buffer[k] > current_row) \n"); //row was finished within this window: use recently computed results
  source.append("          current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");

  source.append("        if (i+k == next_row) \n"); //current row is finished. Write back result
  source.append("        { \n");
  source.append("          vector[current_row] = current_vector_entry; \n");
  source.append("          if (current_row > 0) \n"); //load next row's data
  source.append("          { \n");
  source.append("            --current_row; \n");
  source.append("            next_row = row_indices[current_row]; \n");
  source.append("            current_vector_entry = vector[current_row]; \n");
  source.append("          } \n");
  source.append("        } \n");


  source.append("      } \n"); // for k

  source.append("      row_at_window_start = current_row; \n");
  source.append("    } \n"); // if (get_local_id(0) == 0)

  source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");

  // the window starting at entry 0 was the last one
  source.append("    if (i < get_local_size(0)) \n");
  source.append("      break; \n");

  source.append("    i -= get_local_size(0); \n");
  source.append("  } \n"); //for i
  source.append("} \n");

}
00822 
/** @brief Appends the OpenCL C source of the kernel 'unit_lu_forward' to the supplied string.
*
* The generated kernel walks over the nonzeros in windows of get_local_size(0) entries. All work-items
* load one entry each (value, column index and the vector value at that column) into __local buffers;
* work-item 0 then processes the window front to back, accumulating
* current_vector_entry -= element * y[col] for entries with col < current_row and writing the result to
* vector[current_row] once the entry index reaches row_indices[current_row+1].
* No diagonal is involved (unit diagonal).
*
* An empty matrix (nnz == 0) leaves the vector untouched; the kernel returns right away in that case.
*
* NOTE(review): the __local buffers hold 128 entries and only local ids are used, so the kernel presumably
* has to be launched with a single work-group of at most 128 work-items - confirm against the host code.
*
* @tparam StringType      String type providing append()
* @param source           String the kernel source is appended to
* @param numeric_string   Name of the OpenCL scalar type used for matrix and vector entries
*/
template <typename StringType>
void generate_compressed_matrix_unit_lu_forward(StringType & source, std::string const & numeric_string)
{

  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
  source.append("__kernel void unit_lu_forward( \n");
  source.append("          __global const unsigned int * row_indices, \n");
  source.append("          __global const unsigned int * column_indices, \n");
  source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append("          __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append("          unsigned int size) \n");
  source.append("{ \n");
  source.append("  __local  unsigned int col_index_buffer[128]; \n");
  source.append("  __local  "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
  source.append("  __local  "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");

  source.append("  unsigned int nnz = row_indices[size]; \n");
  // nothing to substitute without entries; also keeps the reads of vector[0] and row_indices[1] away from size == 0
  source.append("  if (nnz == 0) \n");
  source.append("    return; \n");
  source.append("  unsigned int current_row = 0; \n");
  source.append("  unsigned int row_at_window_start = 0; \n");
  source.append("  "); source.append(numeric_string); source.append(" current_vector_entry = vector[0]; \n");
  // always runs past entry nnz-1 so that index nnz (== row_indices[size]) is visited and the last row gets written back
  source.append("  unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
  source.append("  unsigned int next_row = row_indices[1]; \n");

  source.append("  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
  source.append("  { \n");
      //load into shared memory (coalesced access):
  source.append("    if (i < nnz) \n");
  source.append("    { \n");
  source.append("      element_buffer[get_local_id(0)] = elements[i]; \n");
  source.append("      unsigned int tmp = column_indices[i]; \n");
  source.append("      col_index_buffer[get_local_id(0)] = tmp; \n");
  source.append("      vector_buffer[get_local_id(0)] = vector[tmp]; \n");
  source.append("    } \n");

  source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");

      //now a single thread does the remaining work in shared memory:
  source.append("    if (get_local_id(0) == 0) \n");
  source.append("    { \n");
        // traverse through all the loaded data:
  source.append("      for (unsigned int k=0; k<get_local_size(0); ++k) \n");
  source.append("      { \n");
  source.append("        if (i+k == next_row) \n"); //current row is finished. Write back result
  source.append("        { \n");
  source.append("          vector[current_row] = current_vector_entry; \n");
  source.append("          ++current_row; \n");
  source.append("          if (current_row < size) //load next row's data \n");
  source.append("          { \n");
  source.append("            next_row = row_indices[current_row+1]; \n");
  source.append("            current_vector_entry = vector[current_row]; \n");
  source.append("          } \n");
  source.append("        } \n");

  source.append("        if (current_row < size && col_index_buffer[k] < current_row) \n"); //substitute
  source.append("        { \n");
  source.append("          if (col_index_buffer[k] < row_at_window_start) \n"); //row was finished before this window was loaded: use buffered data
  source.append("            current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
  source.append("          else if (col_index_buffer[k] < current_row) \n"); //row was finished within this window: use recently computed results
  source.append("            current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
  source.append("        } \n");

  source.append("      } \n"); // for k

  source.append("      row_at_window_start = current_row; \n");
  source.append("    } \n"); // if (get_local_id(0) == 0)

  source.append("    barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append("  } //for i \n");
  source.append("} \n");

}
00894 
/** @brief Appends the OpenCL C source of the kernel 'vec_mul' (sparse matrix-vector product) to the supplied string.
*
* In the generated kernel the rows are distributed over all work-items (start get_global_id(0), stride
* get_global_size(0)). For each row the products elements[i] * x[...] over the entries
* row_indices[row] ... row_indices[row+1]-1 are summed up and stored in result.
* The uint4 layout arguments are used as follows: .x is added as an offset, .y is the multiplier applied
* to the index, and layout_result.z is the number of rows processed.
*
* @tparam StringType      String type providing append()
* @param source           String the kernel source is appended to
* @param numeric_string   Name of the OpenCL scalar type used for matrix and vector entries
*/
template <typename StringType>
void generate_compressed_matrix_vec_mul(StringType & source, std::string const & numeric_string)
{

  source.append("__kernel void vec_mul( \n");
  source.append("          __global const unsigned int * row_indices, \n");
  source.append("          __global const unsigned int * column_indices, \n");
  source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append("          __global const "); source.append(numeric_string); source.append(" * x, \n");
  source.append("          uint4 layout_x, \n");
  source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append("          uint4 layout_result) \n");
  source.append("{ \n");
  // one row per work-item and iteration
  source.append("  for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
  source.append("  { \n");
  source.append("    "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
  source.append("    unsigned int row_end = row_indices[row+1]; \n");
  source.append("    for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
  source.append("      dot_prod += elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");
  source.append("    result[row * layout_result.y + layout_result.x] = dot_prod; \n");
  source.append("  } \n");
  source.append("} \n");

}
00919 
/** @brief Appends the OpenCL C source of the kernel 'vec_mul4' (sparse matrix-vector product, four entries at a time) to the supplied string.
*
* Same operation as 'vec_mul', but column_indices and elements are accessed as 4-component vectors:
* the row bounds row_indices[row] and row_indices[row+1] are divided by 4 and each loop iteration
* gathers four values of x and adds dot(tmp_entries, tmp_vec) to the row sum.
*
* NOTE(review): because of the integer division by 4 the kernel presumably relies on every row being
* padded to a multiple of four entries - confirm against the host-side setup.
*
* @tparam StringType      String type providing append()
* @param source           String the kernel source is appended to
* @param numeric_string   Name of the OpenCL scalar type; the suffix "4" is appended to form the vector type
*/
template <typename StringType>
void generate_compressed_matrix_vec_mul4(StringType & source, std::string const & numeric_string)
{
  source.append("__kernel void vec_mul4( \n");
  source.append("          __global const unsigned int * row_indices, \n");
  source.append("          __global const uint4 * column_indices, \n");
  source.append("          __global const "); source.append(numeric_string); source.append("4 * elements, \n");
  source.append("          __global const "); source.append(numeric_string); source.append(" * x, \n");
  source.append("          uint4 layout_x, \n");
  source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append("          uint4 layout_result) \n");
  source.append("{ \n");
  source.append("  "); source.append(numeric_string); source.append(" dot_prod; \n");
  source.append("  unsigned int start, next_stop; \n");
  source.append("  uint4 col_idx; \n");
  source.append("  "); source.append(numeric_string); source.append("4 tmp_vec; \n");
  source.append("  "); source.append(numeric_string); source.append("4 tmp_entries; \n");

  // one row per work-item and iteration
  source.append("  for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
  source.append("  { \n");
  source.append("    dot_prod = 0; \n");
  // row bounds in units of 4-component vectors
  source.append("    start = row_indices[row] / 4; \n");
  source.append("    next_stop = row_indices[row+1] / 4; \n");

  source.append("    for (unsigned int i = start; i < next_stop; ++i) \n");
  source.append("    { \n");
  source.append("      col_idx = column_indices[i]; \n");

  // gather the four entries of x addressed by the column indices
  source.append("      tmp_entries = elements[i]; \n");
  source.append("      tmp_vec.x = x[col_idx.x * layout_x.y + layout_x.x]; \n");
  source.append("      tmp_vec.y = x[col_idx.y * layout_x.y + layout_x.x]; \n");
  source.append("      tmp_vec.z = x[col_idx.z * layout_x.y + layout_x.x]; \n");
  source.append("      tmp_vec.w = x[col_idx.w * layout_x.y + layout_x.x]; \n");

  source.append("      dot_prod += dot(tmp_entries, tmp_vec); \n");
  source.append("    } \n");
  source.append("    result[row * layout_result.y + layout_result.x] = dot_prod; \n");
  source.append("  } \n");
  source.append("} \n");
}
00960 
/** @brief Appends the OpenCL C source of the kernel 'vec_mul8' (sparse matrix-vector product, eight entries at a time) to the supplied string.
*
* Same operation as 'vec_mul', but column_indices and elements are accessed as 8-component vectors:
* the row bounds row_indices[row] and row_indices[row+1] are divided by 8 and each loop iteration
* gathers eight values of x. The contribution is accumulated with two dot() calls on the .lo and .hi
* halves of the 8-component vectors.
*
* NOTE(review): because of the integer division by 8 the kernel presumably relies on every row being
* padded to a multiple of eight entries - confirm against the host-side setup.
*
* @tparam StringType      String type providing append()
* @param source           String the kernel source is appended to
* @param numeric_string   Name of the OpenCL scalar type; the suffix "8" is appended to form the vector type
*/
template <typename StringType>
void generate_compressed_matrix_vec_mul8(StringType & source, std::string const & numeric_string)
{
  source.append("__kernel void vec_mul8( \n");
  source.append("          __global const unsigned int * row_indices, \n");
  source.append("          __global const uint8 * column_indices, \n");
  source.append("          __global const "); source.append(numeric_string); source.append("8 * elements, \n");
  source.append("          __global const "); source.append(numeric_string); source.append(" * x, \n");
  source.append("          uint4 layout_x, \n");
  source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append("          uint4 layout_result) \n");
  source.append("{ \n");
  source.append("  "); source.append(numeric_string); source.append(" dot_prod; \n");
  source.append("  unsigned int start, next_stop; \n");
  source.append("  uint8 col_idx; \n");
  source.append("  "); source.append(numeric_string); source.append("8 tmp_vec; \n");
  source.append("  "); source.append(numeric_string); source.append("8 tmp_entries; \n");

  // one row per work-item and iteration
  source.append("  for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
  source.append("  { \n");
  source.append("    dot_prod = 0; \n");
  // row bounds in units of 8-component vectors
  source.append("    start = row_indices[row] / 8; \n");
  source.append("    next_stop = row_indices[row+1] / 8; \n");

  source.append("    for (unsigned int i = start; i < next_stop; ++i) \n");
  source.append("    { \n");
  source.append("      col_idx = column_indices[i]; \n");

  // gather the eight entries of x addressed by the column indices
  source.append("      tmp_entries = elements[i]; \n");
  source.append("      tmp_vec.s0 = x[col_idx.s0 * layout_x.y + layout_x.x]; \n");
  source.append("      tmp_vec.s1 = x[col_idx.s1 * layout_x.y + layout_x.x]; \n");
  source.append("      tmp_vec.s2 = x[col_idx.s2 * layout_x.y + layout_x.x]; \n");
  source.append("      tmp_vec.s3 = x[col_idx.s3 * layout_x.y + layout_x.x]; \n");
  source.append("      tmp_vec.s4 = x[col_idx.s4 * layout_x.y + layout_x.x]; \n");
  source.append("      tmp_vec.s5 = x[col_idx.s5 * layout_x.y + layout_x.x]; \n");
  source.append("      tmp_vec.s6 = x[col_idx.s6 * layout_x.y + layout_x.x]; \n");
  source.append("      tmp_vec.s7 = x[col_idx.s7 * layout_x.y + layout_x.x]; \n");

  // the 8-component vectors are split into two halves for the dot products
  source.append("      dot_prod += dot(tmp_entries.lo, tmp_vec.lo); \n");
  source.append("      dot_prod += dot(tmp_entries.hi, tmp_vec.hi); \n");
  source.append("    } \n");
  source.append("    result[row * layout_result.y + layout_result.x] = dot_prod; \n");
  source.append("  } \n");
  source.append("} \n");
}
01006 
01007         template <typename StringType>
01008         void generate_compressed_matrix_vec_mul_cpu(StringType & source, std::string const & numeric_string)
01009         {
01010           source.append("__kernel void vec_mul_cpu( \n");
01011           source.append("          __global const unsigned int * row_indices, \n");
01012           source.append("          __global const unsigned int * column_indices, \n");
01013           source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
01014           source.append("          __global const "); source.append(numeric_string); source.append(" * vector, \n");
01015           source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
01016           source.append("          unsigned int size) \n");
01017           source.append("{ \n");
01018           source.append("  unsigned int work_per_item = max((uint) (size / get_global_size(0)), (uint) 1); \n");
01019           source.append("  unsigned int row_start = get_global_id(0) * work_per_item; \n");
01020           source.append("  unsigned int row_stop  = min( (uint) ((get_global_id(0) + 1) * work_per_item), (uint) size); \n");
01021           source.append("  for (unsigned int row = row_start; row < row_stop; ++row) \n");
01022           source.append("  { \n");
01023           source.append("    "); source.append(numeric_string); source.append(" dot_prod = ("); source.append(numeric_string); source.append(")0; \n");
01024           source.append("    unsigned int row_end = row_indices[row+1]; \n");
01025           source.append("    for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
01026           source.append("      dot_prod += elements[i] * vector[column_indices[i]]; \n");
01027           source.append("    result[row] = dot_prod; \n");
01028           source.append("  } \n");
01029           source.append("} \n");
01030 
01031         }
01032 
01033 
01035 
01036         // main kernel class
01038         template <typename NumericT>
01039         struct compressed_matrix
01040         {
01041           static std::string program_name()
01042           {
01043             return viennacl::ocl::type_to_string<NumericT>::apply() + "_compressed_matrix";
01044           }
01045 
01046           static void init(viennacl::ocl::context & ctx)
01047           {
01048             viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
01049             std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
01050 
01051             static std::map<cl_context, bool> init_done;
01052             if (!init_done[ctx.handle().get()])
01053             {
01054               std::string source;
01055               source.reserve(1024);
01056 
01057               viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
01058 
01059               if (numeric_string == "float" || numeric_string == "double")
01060               {
01061                 generate_compressed_matrix_block_trans_lu_backward(source, numeric_string);
01062                 generate_compressed_matrix_block_trans_unit_lu_forward(source, numeric_string);
01063                 generate_compressed_matrix_jacobi(source, numeric_string);
01064                 generate_compressed_matrix_lu_backward(source, numeric_string);
01065                 generate_compressed_matrix_lu_forward(source, numeric_string);
01066                 generate_compressed_matrix_trans_lu_backward(source, numeric_string);
01067                 generate_compressed_matrix_trans_lu_forward(source, numeric_string);
01068                 generate_compressed_matrix_trans_unit_lu_backward(source, numeric_string);
01069                 generate_compressed_matrix_trans_unit_lu_forward(source, numeric_string);
01070                 generate_compressed_matrix_trans_unit_lu_forward_slow(source, numeric_string);
01071                 generate_compressed_matrix_unit_lu_backward(source, numeric_string);
01072                 generate_compressed_matrix_unit_lu_forward(source, numeric_string);
01073               }
01074               generate_compressed_matrix_dense_matrix_multiplication(source, numeric_string);
01075               generate_compressed_matrix_row_info_extractor(source, numeric_string);
01076               generate_compressed_matrix_vec_mul(source, numeric_string);
01077               generate_compressed_matrix_vec_mul4(source, numeric_string);
01078               generate_compressed_matrix_vec_mul8(source, numeric_string);
01079               generate_compressed_matrix_vec_mul_cpu(source, numeric_string);
01080 
01081               std::string prog_name = program_name();
01082               #ifdef VIENNACL_BUILD_INFO
01083               std::cout << "Creating program " << prog_name << std::endl;
01084               #endif
01085               ctx.add_program(source, prog_name);
01086               init_done[ctx.handle().get()] = true;
01087             } //if
01088           } //init
01089         };
01090 
01091       }  // namespace kernels
01092     }  // namespace opencl
01093   }  // namespace linalg
01094 }  // namespace viennacl
01095 #endif
01096