// ViennaCL - The Vienna Computing Library, version 1.5.0
#ifndef VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP
#define VIENNACL_LINALG_OPENCL_KERNELS_COMPRESSED_MATRIX_HPP

#include "viennacl/tools/tools.hpp"
#include "viennacl/ocl/kernel.hpp"
#include "viennacl/ocl/platform.hpp"
#include "viennacl/ocl/utils.hpp"

#include "viennacl/linalg/opencl/common.hpp"

// OpenCL kernel source generators for compressed (CSR) sparse matrices.
// Each generator appends the text of one __kernel function to 'source';
// 'numeric_string' is the scalar type spelled as an OpenCL type ("float"/"double").

namespace viennacl
{
namespace linalg
{
namespace opencl
{
namespace kernels
{

// Appends the 'block_trans_lu_backward' kernel: block-wise backward substitution
// with the upper factor U stored transposed in memory (CSR of U^T).
// Each work-group processes the column range [block_offsets[2*g], block_offsets[2*g+1])
// from back to front, then divides its slice of 'result' by the diagonal.
template <typename StringType>
void generate_compressed_matrix_block_trans_lu_backward(StringType & source, std::string const & numeric_string)
{
  source.append("__kernel void block_trans_lu_backward( \n");
  source.append(" __global const unsigned int * row_jumper_U, \n"); //U part (note that U is transposed in memory)
  source.append(" __global const unsigned int * column_indices_U, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements_U, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * diagonal_U, \n");
  source.append(" __global const unsigned int * block_offsets, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
  source.append(" unsigned int col_stop = block_offsets[2*get_group_id(0)+1]; \n");
  source.append(" unsigned int row_start; \n");
  source.append(" unsigned int row_stop; \n");
  source.append(" "); source.append(numeric_string); source.append(" result_entry = 0; \n");

  // empty block: nothing to eliminate for this work-group
  source.append(" if (col_start >= col_stop) \n");
  source.append(" return; \n");

  //backward elimination, using U and diagonal_U
  source.append(" for (unsigned int iter = 0; iter < col_stop - col_start; ++iter) \n");
  source.append(" { \n");
  source.append(" unsigned int col = (col_stop - iter) - 1; \n");
  source.append(" result_entry = result[col] / diagonal_U[col]; \n");
  source.append(" row_start = row_jumper_U[col]; \n");
  source.append(" row_stop = row_jumper_U[col + 1]; \n");
  // threads of the work-group cooperatively scatter the update for this column
  source.append(" for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n");
  source.append(" result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index]; \n");
  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } \n");

  //divide result vector by diagonal:
  source.append(" for (unsigned int col = col_start + get_local_id(0); col < col_stop; col += get_local_size(0)) \n");
  source.append(" result[col] /= diagonal_U[col]; \n");
  source.append("} \n");
}

// Appends the 'block_trans_unit_lu_forward' kernel: block-wise forward substitution
// with a unit lower factor L stored transposed in memory (CSR of L^T).
template <typename StringType>
void generate_compressed_matrix_block_trans_unit_lu_forward(StringType & source, std::string const & numeric_string)
{
  source.append("__kernel void block_trans_unit_lu_forward( \n");
  source.append(" __global const unsigned int * row_jumper_L, \n"); //L part (note that L is transposed in memory)
  source.append(" __global const unsigned int * column_indices_L, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements_L, \n");
  source.append(" __global const unsigned int * block_offsets, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" unsigned int col_start = block_offsets[2*get_group_id(0)]; \n");
  source.append(" unsigned int col_stop = block_offsets[2*get_group_id(0)+1]; \n");
  source.append(" unsigned int row_start = row_jumper_L[col_start]; \n");
source.append(" unsigned int row_stop; \n"); 00078 source.append(" "); source.append(numeric_string); source.append(" result_entry = 0; \n"); 00079 00080 source.append(" if (col_start >= col_stop) \n"); 00081 source.append(" return; \n"); 00082 00083 //forward elimination, using L: 00084 source.append(" for (unsigned int col = col_start; col < col_stop; ++col) \n"); 00085 source.append(" { \n"); 00086 source.append(" result_entry = result[col]; \n"); 00087 source.append(" row_stop = row_jumper_L[col + 1]; \n"); 00088 source.append(" for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n"); 00089 source.append(" result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index]; \n"); 00090 source.append(" row_start = row_stop; \n"); //for next iteration (avoid unnecessary loads from GPU RAM) 00091 source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n"); 00092 source.append(" } \n"); 00093 00094 source.append("}; \n"); 00095 } 00096 00097 namespace detail 00098 { 00100 template <typename StringType> 00101 void generate_compressed_matrix_dense_matrix_mult(StringType & source, std::string const & numeric_string, 00102 bool B_transposed, bool B_row_major, bool C_row_major) 00103 { 00104 source.append("__kernel void "); 00105 source.append(viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major)); 00106 source.append("( \n"); 00107 source.append(" __global const unsigned int * sp_mat_row_indices, \n"); 00108 source.append(" __global const unsigned int * sp_mat_col_indices, \n"); 00109 source.append(" __global const "); source.append(numeric_string); source.append(" * sp_mat_elements, \n"); 00110 source.append(" __global const "); source.append(numeric_string); source.append(" * d_mat, \n"); 00111 source.append(" unsigned int d_mat_row_start, \n"); 00112 source.append(" unsigned int d_mat_col_start, \n"); 00113 source.append(" unsigned int 
d_mat_row_inc, \n"); 00114 source.append(" unsigned int d_mat_col_inc, \n"); 00115 source.append(" unsigned int d_mat_row_size, \n"); 00116 source.append(" unsigned int d_mat_col_size, \n"); 00117 source.append(" unsigned int d_mat_internal_rows, \n"); 00118 source.append(" unsigned int d_mat_internal_cols, \n"); 00119 source.append(" __global "); source.append(numeric_string); source.append(" * result, \n"); 00120 source.append(" unsigned int result_row_start, \n"); 00121 source.append(" unsigned int result_col_start, \n"); 00122 source.append(" unsigned int result_row_inc, \n"); 00123 source.append(" unsigned int result_col_inc, \n"); 00124 source.append(" unsigned int result_row_size, \n"); 00125 source.append(" unsigned int result_col_size, \n"); 00126 source.append(" unsigned int result_internal_rows, \n"); 00127 source.append(" unsigned int result_internal_cols) { \n"); 00128 00129 // split work rows (sparse matrix rows) to thread groups 00130 source.append(" for (unsigned int row = get_group_id(0); row < result_row_size; row += get_num_groups(0)) { \n"); 00131 00132 source.append(" unsigned int row_start = sp_mat_row_indices[row]; \n"); 00133 source.append(" unsigned int row_end = sp_mat_row_indices[row+1]; \n"); 00134 00135 // split result cols between threads in a thread group 00136 source.append(" for ( unsigned int col = get_local_id(0); col < result_col_size; col += get_local_size(0) ) { \n"); 00137 00138 source.append(" "); source.append(numeric_string); source.append(" r = 0; \n"); 00139 00140 source.append(" for (unsigned int k = row_start; k < row_end; k ++) { \n"); 00141 00142 source.append(" unsigned int j = sp_mat_col_indices[k]; \n"); 00143 source.append(" "); source.append(numeric_string); source.append(" x = sp_mat_elements[k]; \n"); 00144 00145 source.append(" "); source.append(numeric_string); 00146 if (B_transposed && B_row_major) 00147 source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) * d_mat_internal_cols + 
d_mat_col_start + j * d_mat_col_inc ]; \n"); 00148 else if (B_transposed && !B_row_major) 00149 source.append(" y = d_mat[ (d_mat_row_start + col * d_mat_row_inc) + (d_mat_col_start + j * d_mat_col_inc) * d_mat_internal_rows ]; \n"); 00150 else if (!B_transposed && B_row_major) 00151 source.append(" y = d_mat[ (d_mat_row_start + j * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + col * d_mat_col_inc ]; \n"); 00152 else 00153 source.append(" y = d_mat[ (d_mat_row_start + j * d_mat_row_inc) + (d_mat_col_start + col * d_mat_col_inc) * d_mat_internal_rows ]; \n"); 00154 source.append(" r += x * y; \n"); 00155 source.append(" } \n"); 00156 00157 if (C_row_major) 00158 source.append(" result[ (result_row_start + row * result_row_inc) * result_internal_cols + result_col_start + col * result_col_inc ] = r; \n"); 00159 else 00160 source.append(" result[ (result_row_start + row * result_row_inc) + (result_col_start + col * result_col_inc) * result_internal_rows ] = r; \n"); 00161 source.append(" } \n"); 00162 source.append(" } \n"); 00163 00164 source.append("} \n"); 00165 00166 } 00167 } 00168 template <typename StringType> 00169 void generate_compressed_matrix_dense_matrix_multiplication(StringType & source, std::string const & numeric_string) 00170 { 00171 detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, false, false); 00172 detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, false, true); 00173 detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, true, false); 00174 detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, false, true, true); 00175 00176 detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, false, false); 00177 detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, false, true); 00178 detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, true, 
false); 00179 detail::generate_compressed_matrix_dense_matrix_mult(source, numeric_string, true, true, true); 00180 } 00181 00182 template <typename StringType> 00183 void generate_compressed_matrix_jacobi(StringType & source, std::string const & numeric_string) 00184 { 00185 00186 source.append(" __kernel void jacobi( \n"); 00187 source.append(" __global const unsigned int * row_indices, \n"); 00188 source.append(" __global const unsigned int * column_indices, \n"); 00189 source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n"); 00190 source.append(" "); source.append(numeric_string); source.append(" weight, \n"); 00191 source.append(" __global const "); source.append(numeric_string); source.append(" * old_result, \n"); 00192 source.append(" __global "); source.append(numeric_string); source.append(" * new_result, \n"); 00193 source.append(" __global const "); source.append(numeric_string); source.append(" * rhs, \n"); 00194 source.append(" unsigned int size) \n"); 00195 source.append(" { \n"); 00196 source.append(" "); source.append(numeric_string); source.append(" sum, diag=1; \n"); 00197 source.append(" int col; \n"); 00198 source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n"); 00199 source.append(" { \n"); 00200 source.append(" sum = 0; \n"); 00201 source.append(" for (unsigned int j = row_indices[i]; j<row_indices[i+1]; j++) \n"); 00202 source.append(" { \n"); 00203 source.append(" col = column_indices[j]; \n"); 00204 source.append(" if (i == col) \n"); 00205 source.append(" diag = elements[j]; \n"); 00206 source.append(" else \n"); 00207 source.append(" sum += elements[j] * old_result[col]; \n"); 00208 source.append(" } \n"); 00209 source.append(" new_result[i] = weight * (rhs[i]-sum) / diag + (1-weight) * old_result[i]; \n"); 00210 source.append(" } \n"); 00211 source.append(" } \n"); 00212 00213 } 00214 00215 template <typename StringType> 00216 void 
generate_compressed_matrix_lu_backward(StringType & source, std::string const & numeric_string) 00217 { 00218 // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format 00219 source.append("__kernel void lu_backward( \n"); 00220 source.append(" __global const unsigned int * row_indices, \n"); 00221 source.append(" __global const unsigned int * column_indices, \n"); 00222 source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n"); 00223 source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n"); 00224 source.append(" unsigned int size) \n"); 00225 source.append("{ \n"); 00226 source.append(" __local unsigned int col_index_buffer[128]; \n"); 00227 source.append(" __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n"); 00228 source.append(" __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n"); 00229 00230 source.append(" unsigned int nnz = row_indices[size]; \n"); 00231 source.append(" unsigned int current_row = size-1; \n"); 00232 source.append(" unsigned int row_at_window_start = size-1; \n"); 00233 source.append(" "); source.append(numeric_string); source.append(" current_vector_entry = vector[size-1]; \n"); 00234 source.append(" "); source.append(numeric_string); source.append(" diagonal_entry = 0; \n"); 00235 source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n"); 00236 source.append(" unsigned int next_row = row_indices[size-1]; \n"); 00237 00238 source.append(" unsigned int i = loop_end + get_local_id(0); \n"); 00239 source.append(" while (1) \n"); 00240 source.append(" { \n"); 00241 //load into shared memory (coalesced access): 00242 source.append(" if (i < nnz) \n"); 00243 source.append(" { \n"); 00244 source.append(" element_buffer[get_local_id(0)] = elements[i]; \n"); 00245 source.append(" unsigned int tmp = column_indices[i]; \n"); 00246 
source.append(" col_index_buffer[get_local_id(0)] = tmp; \n"); 00247 source.append(" vector_buffer[get_local_id(0)] = vector[tmp]; \n"); 00248 source.append(" } \n"); 00249 00250 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00251 00252 //now a single thread does the remaining work in shared memory: 00253 source.append(" if (get_local_id(0) == 0) \n"); 00254 source.append(" { \n"); 00255 // traverse through all the loaded data from back to front: 00256 source.append(" for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n"); 00257 source.append(" { \n"); 00258 source.append(" unsigned int k = (get_local_size(0) - k2) - 1; \n"); 00259 00260 source.append(" if (i+k >= nnz) \n"); 00261 source.append(" continue; \n"); 00262 00263 source.append(" if (col_index_buffer[k] > row_at_window_start) \n"); //use recently computed results 00264 source.append(" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n"); 00265 source.append(" else if (col_index_buffer[k] > current_row) \n"); //use buffered data 00266 source.append(" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n"); 00267 source.append(" else if (col_index_buffer[k] == current_row) \n"); 00268 source.append(" diagonal_entry = element_buffer[k]; \n"); 00269 00270 source.append(" if (i+k == next_row) \n"); //current row is finished. 
Write back result 00271 source.append(" { \n"); 00272 source.append(" vector[current_row] = current_vector_entry / diagonal_entry; \n"); 00273 source.append(" if (current_row > 0) //load next row's data \n"); 00274 source.append(" { \n"); 00275 source.append(" --current_row; \n"); 00276 source.append(" next_row = row_indices[current_row]; \n"); 00277 source.append(" current_vector_entry = vector[current_row]; \n"); 00278 source.append(" } \n"); 00279 source.append(" } \n"); 00280 00281 00282 source.append(" } \n"); // for k 00283 00284 source.append(" row_at_window_start = current_row; \n"); 00285 source.append(" } \n"); // if (get_local_id(0) == 0) 00286 00287 source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n"); 00288 00289 source.append(" if (i < get_local_size(0)) \n"); 00290 source.append(" break; \n"); 00291 00292 source.append(" i -= get_local_size(0); \n"); 00293 source.append(" } \n"); //for i 00294 source.append("} \n"); 00295 00296 } 00297 00298 template <typename StringType> 00299 void generate_compressed_matrix_lu_forward(StringType & source, std::string const & numeric_string) 00300 { 00301 00302 // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format 00303 source.append("__kernel void lu_forward( \n"); 00304 source.append(" __global const unsigned int * row_indices, \n"); 00305 source.append(" __global const unsigned int * column_indices, \n"); 00306 source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n"); 00307 source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n"); 00308 source.append(" unsigned int size) \n"); 00309 source.append("{ \n"); 00310 source.append(" __local unsigned int col_index_buffer[128]; \n"); 00311 source.append(" __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n"); 00312 source.append(" __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n"); 00313 00314 
  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int current_row = 0; \n");
  source.append(" unsigned int row_at_window_start = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" current_vector_entry = vector[0]; \n");
  source.append(" "); source.append(numeric_string); source.append(" diagonal_entry; \n");
  source.append(" unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
  source.append(" unsigned int next_row = row_indices[1]; \n");

  source.append(" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
  source.append(" { \n");
  //load into shared memory (coalesced access):
  source.append(" if (i < nnz) \n");
  source.append(" { \n");
  source.append(" element_buffer[get_local_id(0)] = elements[i]; \n");
  source.append(" unsigned int tmp = column_indices[i]; \n");
  source.append(" col_index_buffer[get_local_id(0)] = tmp; \n");
  source.append(" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  //now a single thread does the remaining work in shared memory:
  source.append(" if (get_local_id(0) == 0) \n");
  source.append(" { \n");
  // traverse through all the loaded data:
  source.append(" for (unsigned int k=0; k<get_local_size(0); ++k) \n");
  source.append(" { \n");
  source.append(" if (current_row < size && i+k == next_row) \n"); //current row is finished. Write back result
  source.append(" { \n");
  source.append(" vector[current_row] = current_vector_entry / diagonal_entry; \n");
  source.append(" ++current_row; \n");
  source.append(" if (current_row < size) \n"); //load next row's data
  source.append(" { \n");
  source.append(" next_row = row_indices[current_row+1]; \n");
  source.append(" current_vector_entry = vector[current_row]; \n");
  source.append(" } \n");
  source.append(" } \n");

  source.append(" if (current_row < size && col_index_buffer[k] < current_row) \n"); //substitute
  source.append(" { \n");
  source.append(" if (col_index_buffer[k] < row_at_window_start) \n"); //use recently computed results
  source.append(" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
  source.append(" else if (col_index_buffer[k] < current_row) \n"); //use buffered data
  source.append(" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
  source.append(" } \n");
  source.append(" else if (col_index_buffer[k] == current_row) \n");
  source.append(" diagonal_entry = element_buffer[k]; \n");

  source.append(" } \n"); // for k

  source.append(" row_at_window_start = current_row; \n");
  source.append(" } \n"); // if (get_local_id(0) == 0)

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } \n"); //for i
  source.append("} \n");

}

// Appends the 'row_info_extractor' kernel: computes one scalar per matrix row,
// selected at runtime by 'option': 0 = inf-norm, 1 = 1-norm, 2 = 2-norm,
// 3 = diagonal entry.  One thread strides over rows (grid-wide stride loop).
template <typename StringType>
void generate_compressed_matrix_row_info_extractor(StringType & source, std::string const & numeric_string)
{
  source.append("__kernel void row_info_extractor( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" unsigned int size, \n");
  source.append(" unsigned int option \n");
  source.append(" ) \n");
  source.append("{ \n");
  source.append(" for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0)) \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" value = 0; \n");
  source.append(" unsigned int row_end = row_indices[row+1]; \n");

  source.append(" switch (option) \n");
  source.append(" { \n");
  source.append(" case 0: \n"); //inf-norm
  source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
  source.append(" value = max(value, fabs(elements[i])); \n");
  source.append(" break; \n");

  source.append(" case 1: \n"); //1-norm
  source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
  source.append(" value += fabs(elements[i]); \n");
  source.append(" break; \n");

  source.append(" case 2: \n"); //2-norm
  source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
  source.append(" value += elements[i] * elements[i]; \n");
  source.append(" value = sqrt(value); \n");
  source.append(" break; \n");

  source.append(" case 3: \n"); //diagonal entry
  source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
  source.append(" { \n");
  source.append(" if (column_indices[i] == row) \n");
  source.append(" { \n");
  source.append(" value = elements[i]; \n");
  source.append(" break; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append(" break; \n");

  source.append(" default: \n");
  source.append(" break; \n");
  source.append(" } \n");
  source.append(" result[row] = value; \n");
  source.append(" } \n");
  source.append("} \n");

}

// Appends the 'trans_lu_backward' kernel: backward substitution with the factor
// given transposed in memory, with diagonal entries supplied separately.
// Nonzeros are streamed from the back; each thread binary-searches its row via a
// lookahead buffer of row starts, then the window is eliminated row by row.
template <typename StringType>
void generate_compressed_matrix_trans_lu_backward(StringType & source, std::string const & numeric_string)
{

  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
  source.append("__kernel void trans_lu_backward( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * diagonal_entries, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" __local unsigned int row_index_lookahead[256]; \n");
  source.append(" __local unsigned int row_index_buffer[256]; \n");

  source.append(" unsigned int row_index; \n");
  source.append(" unsigned int col_index; \n");
  source.append(" "); source.append(numeric_string); source.append(" matrix_entry; \n");
  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int row_at_window_start = size; \n");
  source.append(" unsigned int row_at_window_end; \n");
  source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");

  source.append(" for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
  source.append(" { \n");
  source.append(" unsigned int i = (nnz - i2) - 1; \n");
  source.append(" col_index = (i2 < nnz) ? column_indices[i] : 0; \n");
  source.append(" matrix_entry = (i2 < nnz) ? elements[i] : 0; \n");
  source.append(" row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  // each thread determines which row its nonzero belongs to:
  source.append(" if (i2 < nnz) \n");
  source.append(" { \n");
  source.append(" unsigned int row_index_dec = 0; \n");
  source.append(" while (row_index_lookahead[row_index_dec] > i) \n");
  source.append(" ++row_index_dec; \n");
  source.append(" row_index = row_at_window_start - row_index_dec; \n");
  source.append(" row_index_buffer[get_local_id(0)] = row_index; \n");
  source.append(" } \n");
  source.append(" else \n");
  source.append(" { \n");
  source.append(" row_index = size+1; \n");
  source.append(" row_index_buffer[get_local_id(0)] = 0; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" row_at_window_start = row_index_buffer[0]; \n");
  source.append(" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");

  //backward elimination
  source.append(" for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
  source.append(" { \n");
  source.append(" unsigned int row = row_at_window_start - row2; \n");
  source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row] / diagonal_entries[row]; \n");

  source.append(" if ( (row_index == row) && (col_index < row) ) \n");
  source.append(" vector[col_index] -= result_entry * matrix_entry; \n");

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } \n");

  source.append(" row_at_window_start = row_at_window_end; \n");
  source.append(" } \n");

  // final step: Divide vector by diagonal entries:
  source.append(" for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
  source.append(" vector[i] /= diagonal_entries[i]; \n");
  source.append("} \n");

}

// Appends the 'trans_lu_forward' kernel: forward-substitution counterpart of
// trans_lu_backward (factor transposed in memory, separate diagonal entries).
template <typename StringType>
void generate_compressed_matrix_trans_lu_forward(StringType & source, std::string const & numeric_string)
{

  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
  source.append("__kernel void trans_lu_forward( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * diagonal_entries, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" __local unsigned int row_index_lookahead[256]; \n");
  source.append(" __local unsigned int row_index_buffer[256]; \n");

  source.append(" unsigned int row_index; \n");
  source.append(" unsigned int col_index; \n");
  source.append(" "); source.append(numeric_string); source.append(" matrix_entry; \n");
  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int row_at_window_start = 0; \n");
  source.append(" unsigned int row_at_window_end = 0; \n");
  source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");

  source.append(" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
  source.append(" { \n");
  source.append(" col_index = (i < nnz) ? column_indices[i] : 0; \n");
  source.append(" matrix_entry = (i < nnz) ? elements[i] : 0; \n");
  source.append(" row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : size - 1; \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  // each thread determines which row its nonzero belongs to:
  source.append(" if (i < nnz) \n");
  source.append(" { \n");
  source.append(" unsigned int row_index_inc = 0; \n");
  source.append(" while (i >= row_index_lookahead[row_index_inc + 1]) \n");
  source.append(" ++row_index_inc; \n");
  source.append(" row_index = row_at_window_start + row_index_inc; \n");
  source.append(" row_index_buffer[get_local_id(0)] = row_index; \n");
  source.append(" } \n");
  source.append(" else \n");
  source.append(" { \n");
  source.append(" row_index = size+1; \n");
  source.append(" row_index_buffer[get_local_id(0)] = size - 1; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" row_at_window_start = row_index_buffer[0]; \n");
  source.append(" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");

  //forward elimination
  source.append(" for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row] / diagonal_entries[row]; \n");

  source.append(" if ( (row_index == row) && (col_index > row) ) \n");
  source.append(" vector[col_index] -= result_entry * matrix_entry; \n");

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } \n");

  source.append(" row_at_window_start = row_at_window_end; \n");
  source.append(" } \n");

  // final step: Divide vector by diagonal entries:
  source.append(" for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0)) \n");
  source.append(" vector[i] /= diagonal_entries[i]; \n");
  source.append("} \n");

}

// Appends the 'trans_unit_lu_backward' kernel: like trans_lu_backward but for a
// unit-diagonal factor — no division by diagonal entries anywhere.
template <typename StringType>
void generate_compressed_matrix_trans_unit_lu_backward(StringType & source, std::string const & numeric_string)
{

  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
  source.append("__kernel void trans_unit_lu_backward( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" __local unsigned int row_index_lookahead[256]; \n");
  source.append(" __local unsigned int row_index_buffer[256]; \n");

  source.append(" unsigned int row_index; \n");
  source.append(" unsigned int col_index; \n");
  source.append(" "); source.append(numeric_string); source.append(" matrix_entry; \n");
  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int row_at_window_start = size; \n");
  source.append(" unsigned int row_at_window_end; \n");
  source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");

  source.append(" for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0)) \n");
  source.append(" { \n");
  source.append(" unsigned int i = (nnz - i2) - 1; \n");
  source.append(" col_index = (i2 < nnz) ? column_indices[i] : 0; \n");
  source.append(" matrix_entry = (i2 < nnz) ? elements[i] : 0; \n");
  source.append(" row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0; \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" if (i2 < nnz) \n");
  source.append(" { \n");
  source.append(" unsigned int row_index_dec = 0; \n");
  source.append(" while (row_index_lookahead[row_index_dec] > i) \n");
  source.append(" ++row_index_dec; \n");
  source.append(" row_index = row_at_window_start - row_index_dec; \n");
  source.append(" row_index_buffer[get_local_id(0)] = row_index; \n");
  source.append(" } \n");
  source.append(" else \n");
  source.append(" { \n");
  source.append(" row_index = size+1; \n");
  source.append(" row_index_buffer[get_local_id(0)] = 0; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" row_at_window_start = row_index_buffer[0]; \n");
  source.append(" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n");

  //backward elimination
  source.append(" for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n");
  source.append(" { \n");
  source.append(" unsigned int row = row_at_window_start - row2; \n");
  source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");

  source.append(" if ( (row_index == row) && (col_index < row) ) \n");
  source.append(" vector[col_index] -= result_entry * matrix_entry; \n");

  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } \n");

  source.append(" row_at_window_start = row_at_window_end; \n");
  source.append(" } \n");
  source.append("} \n");

}


// Appends the 'trans_unit_lu_forward' kernel (unit-diagonal forward substitution,
// factor transposed in memory).
template <typename StringType>
void generate_compressed_matrix_trans_unit_lu_forward(StringType & source, std::string const & numeric_string)
{

  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
  source.append("__kernel void trans_unit_lu_forward( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  source.append(" __local unsigned int row_index_lookahead[256]; \n");
  source.append(" __local unsigned int row_index_buffer[256]; \n");

  source.append(" unsigned int row_index; \n");
  source.append(" unsigned int col_index; \n");
  source.append(" "); source.append(numeric_string); source.append(" matrix_entry; \n");
  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int row_at_window_start = 0; \n");
  source.append(" unsigned int row_at_window_end = 0; \n");
  source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0); \n");

  source.append(" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
  source.append(" { \n");
  source.append(" col_index = (i < nnz) ? column_indices[i] : 0; \n");
  source.append(" matrix_entry = (i < nnz) ? elements[i] : 0; \n");
  source.append(" row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? 
row_indices[row_at_window_start + get_local_id(0)] : size - 1; \n"); 00672 00673 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00674 00675 source.append(" if (i < nnz) \n"); 00676 source.append(" { \n"); 00677 source.append(" unsigned int row_index_inc = 0; \n"); 00678 source.append(" while (i >= row_index_lookahead[row_index_inc + 1]) \n"); 00679 source.append(" ++row_index_inc; \n"); 00680 source.append(" row_index = row_at_window_start + row_index_inc; \n"); 00681 source.append(" row_index_buffer[get_local_id(0)] = row_index; \n"); 00682 source.append(" } \n"); 00683 source.append(" else \n"); 00684 source.append(" { \n"); 00685 source.append(" row_index = size+1; \n"); 00686 source.append(" row_index_buffer[get_local_id(0)] = size - 1; \n"); 00687 source.append(" } \n"); 00688 00689 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00690 00691 source.append(" row_at_window_start = row_index_buffer[0]; \n"); 00692 source.append(" row_at_window_end = row_index_buffer[get_local_size(0) - 1]; \n"); 00693 00694 //forward elimination 00695 source.append(" for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n"); 00696 source.append(" { \n"); 00697 source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n"); 00698 00699 source.append(" if ( (row_index == row) && (col_index > row) ) \n"); 00700 source.append(" vector[col_index] -= result_entry * matrix_entry; \n"); 00701 00702 source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n"); 00703 source.append(" } \n"); 00704 00705 source.append(" row_at_window_start = row_at_window_end; \n"); 00706 source.append(" } \n"); 00707 source.append("} \n"); 00708 00709 } 00710 00711 template <typename StringType> 00712 void generate_compressed_matrix_trans_unit_lu_forward_slow(StringType & source, std::string const & numeric_string) 00713 { 00714 00715 // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format 00716 
  source.append("__kernel void trans_unit_lu_forward_slow( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  // serial sweep over rows; threads of the work-group split the nonzeros of each row
  source.append(" for (unsigned int row = 0; row < size; ++row) \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" result_entry = vector[row]; \n");

  source.append(" unsigned int row_start = row_indices[row]; \n");
  source.append(" unsigned int row_stop = row_indices[row + 1]; \n");
  source.append(" for (unsigned int entry_index = row_start + get_local_id(0); entry_index < row_stop; entry_index += get_local_size(0)) \n");
  source.append(" { \n");
  source.append(" unsigned int col_index = column_indices[entry_index]; \n");
  source.append(" if (col_index > row) \n");
  source.append(" vector[col_index] -= result_entry * elements[entry_index]; \n");
  source.append(" } \n");

  // NOTE(review): barrier() only synchronizes one work-group; kernel presumably launched
  // with a single work-group — confirm at enqueue site.
  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } \n");
  source.append("} \n");

}

/** @brief Generates the OpenCL kernel 'unit_lu_backward': backward substitution (x in Ux = y) for a
  *        unit-diagonal upper factor in CSR format, processing the nonzeros back-to-front in
  *        work-group-sized windows staged through local memory.
  *
  * @param source          String to which the kernel source code is appended.
  * @param numeric_string  Scalar type of the kernel ("float" or "double").
  */
template <typename StringType>
void generate_compressed_matrix_unit_lu_backward(StringType & source, std::string const & numeric_string)
{

  // compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format
  source.append("__kernel void unit_lu_backward( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  // staging buffers sized for the maximum supported work-group size of 128
  source.append(" __local unsigned int col_index_buffer[128]; \n");
  source.append(" __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
  source.append(" __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");

  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int current_row = size-1; \n");
  source.append(" unsigned int row_at_window_start = size-1; \n");
  source.append(" "); source.append(numeric_string); source.append(" current_vector_entry = vector[size-1]; \n");
  // start index of the last full window (rounded down to a work-group multiple)
  source.append(" unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0); \n");
  source.append(" unsigned int next_row = row_indices[size-1]; \n");

  source.append(" unsigned int i = loop_end + get_local_id(0); \n");
  source.append(" while (1) \n");
  source.append(" { \n");
  //load into shared memory (coalesced access):
  source.append(" if (i < nnz) \n");
  source.append(" { \n");
  source.append(" element_buffer[get_local_id(0)] = elements[i]; \n");
  source.append(" unsigned int tmp = column_indices[i]; \n");
  source.append(" col_index_buffer[get_local_id(0)] = tmp; \n");
  source.append(" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  //now a single thread does the remaining work in shared memory:
  source.append(" if (get_local_id(0) == 0) \n");
  source.append(" { \n");
  // traverse through all the loaded data from back to front:
  source.append(" for (unsigned int k2=0; k2<get_local_size(0); ++k2) \n");
  source.append(" { \n");
  source.append(" unsigned int k = (get_local_size(0) - k2) - 1; \n");

  source.append(" if (i+k >= nnz) \n");
  source.append(" continue; \n");

  source.append(" if (col_index_buffer[k] > row_at_window_start) \n"); //use recently computed results
  source.append(" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
  source.append(" else if (col_index_buffer[k] > current_row) \n"); //use buffered data
  source.append(" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");

  source.append(" if (i+k == next_row) \n"); //current row is finished. Write back result
  source.append(" { \n");
  source.append(" vector[current_row] = current_vector_entry; \n");
  source.append(" if (current_row > 0) \n"); //load next row's data
  source.append(" { \n");
  source.append(" --current_row; \n");
  source.append(" next_row = row_indices[current_row]; \n");
  source.append(" current_vector_entry = vector[current_row]; \n");
  source.append(" } \n");
  source.append(" } \n");


  source.append(" } \n"); // for k

  source.append(" row_at_window_start = current_row; \n");
  source.append(" } \n"); // if (get_local_id(0) == 0)

  // single-work-group assumption — confirm at enqueue site
  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");

  source.append(" if (i < get_local_size(0)) \n");
  source.append(" break; \n");

  source.append(" i -= get_local_size(0); \n");
  source.append(" } \n"); //for i
  source.append("} \n");

}

/** @brief Generates the OpenCL kernel 'unit_lu_forward': forward substitution (y in Ly = z) for a
  *        unit-diagonal lower factor in CSR format, streaming nonzeros through local memory windows.
  *
  * @param source          String to which the kernel source code is appended.
  * @param numeric_string  Scalar type of the kernel ("float" or "double").
  */
template <typename StringType>
void generate_compressed_matrix_unit_lu_forward(StringType & source, std::string const & numeric_string)
{

  // compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format
  source.append("__kernel void unit_lu_forward( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * vector, \n");
  source.append(" unsigned int size) \n");
  source.append("{ \n");
  // staging buffers sized for the maximum supported work-group size of 128
  source.append(" __local unsigned int col_index_buffer[128]; \n");
  source.append(" __local "); source.append(numeric_string); source.append(" element_buffer[128]; \n");
  source.append(" __local "); source.append(numeric_string); source.append(" vector_buffer[128]; \n");

  source.append(" unsigned int nnz = row_indices[size]; \n");
  source.append(" unsigned int current_row = 0; \n");
  source.append(" unsigned int row_at_window_start = 0; \n");
  source.append(" "); source.append(numeric_string); source.append(" current_vector_entry = vector[0]; \n");
  // round up so all threads reach every barrier
  source.append(" unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0); \n");
  source.append(" unsigned int next_row = row_indices[1]; \n");

  source.append(" for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0)) \n");
  source.append(" { \n");
  //load into shared memory (coalesced access):
  source.append(" if (i < nnz) \n");
  source.append(" { \n");
  source.append(" element_buffer[get_local_id(0)] = elements[i]; \n");
  source.append(" unsigned int tmp = column_indices[i]; \n");
  source.append(" col_index_buffer[get_local_id(0)] = tmp; \n");
  source.append(" vector_buffer[get_local_id(0)] = vector[tmp]; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  //now a single thread does the remaining work in shared memory:
  source.append(" if (get_local_id(0) == 0) \n");
  source.append(" { \n");
  // traverse through all the loaded data:
  source.append(" for (unsigned int k=0; k<get_local_size(0); ++k) \n");
  source.append(" { \n");
  source.append(" if (i+k == next_row) \n"); //current row is finished. Write back result
  source.append(" { \n");
  source.append(" vector[current_row] = current_vector_entry; \n");
  source.append(" ++current_row; \n");
  source.append(" if (current_row < size) //load next row's data \n");
  source.append(" { \n");
  source.append(" next_row = row_indices[current_row+1]; \n");
  source.append(" current_vector_entry = vector[current_row]; \n");
  source.append(" } \n");
  source.append(" } \n");

  source.append(" if (current_row < size && col_index_buffer[k] < current_row) \n"); //substitute
  source.append(" { \n");
  source.append(" if (col_index_buffer[k] < row_at_window_start) \n"); //use recently computed results
  source.append(" current_vector_entry -= element_buffer[k] * vector_buffer[k]; \n");
  source.append(" else if (col_index_buffer[k] < current_row) \n"); //use buffered data
  source.append(" current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]]; \n");
  source.append(" } \n");

  source.append(" } \n"); // for k

  source.append(" row_at_window_start = current_row; \n");
  source.append(" } \n"); // if (get_local_id(0) == 0)

  // NOTE(review): barrier() only synchronizes one work-group; kernel presumably launched
  // with a single work-group — confirm at enqueue site.
  source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n");
  source.append(" } //for i \n");
  source.append("} \n");

}

/** @brief Generates the OpenCL kernel 'vec_mul': CSR sparse-matrix times dense vector,
  *        one row per work item, grid-stride over rows.
  *
  * @param source          String to which the kernel source code is appended.
  * @param numeric_string  Scalar type of the kernel ("float" or "double").
  *
  * Layout convention (uint4): .x = start offset, .y = stride, .z = logical size — as used below.
  */
template <typename StringType>
void generate_compressed_matrix_vec_mul(StringType & source, std::string const & numeric_string)
{

  source.append("__kernel void vec_mul( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
  source.append(" uint4 layout_x, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" uint4 layout_result) \n");
  source.append("{ \n");
  source.append(" for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n");
  source.append(" unsigned int row_end = row_indices[row+1]; \n");
  source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n");
  source.append(" dot_prod += elements[i] * x[column_indices[i] * layout_x.y + layout_x.x]; \n");
  source.append(" result[row * layout_result.y + layout_result.x] = dot_prod; \n");
  source.append(" } \n");
  source.append("} \n");

}

/** @brief Generates the OpenCL kernel 'vec_mul4': CSR sparse-matrix-vector product with nonzeros
  *        packed in groups of four (uint4 column indices, 4-wide element vectors), accumulating via dot().
  *
  * @param source          String to which the kernel source code is appended.
  * @param numeric_string  Scalar type of the kernel ("float" or "double").
  *
  * NOTE(review): row pointers are divided by 4, so rows are presumably padded to multiples of
  * four nonzeros by the host — confirm against the matrix assembly code.
  */
template <typename StringType>
void generate_compressed_matrix_vec_mul4(StringType & source, std::string const & numeric_string)
{
  source.append("__kernel void vec_mul4( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const uint4 * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append("4 * elements, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
  source.append(" uint4 layout_x, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" uint4 layout_result) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" dot_prod; \n");
  source.append(" unsigned int start, next_stop; \n");
  source.append(" uint4 col_idx; \n");
  source.append(" "); source.append(numeric_string); source.append("4 tmp_vec; \n");
  source.append(" "); source.append(numeric_string); source.append("4 tmp_entries; \n");

  source.append(" for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
  source.append(" { \n");
  source.append(" dot_prod = 0; \n");
  source.append(" start = row_indices[row] / 4; \n");
  source.append(" next_stop = row_indices[row+1] / 4; \n");

  source.append(" for (unsigned int i = start; i < next_stop; ++i) \n");
  source.append(" { \n");
  source.append(" col_idx = column_indices[i]; \n");

  source.append(" tmp_entries = elements[i]; \n");
  source.append(" tmp_vec.x = x[col_idx.x * layout_x.y + layout_x.x]; \n");
  source.append(" tmp_vec.y = x[col_idx.y * layout_x.y + layout_x.x]; \n");
  source.append(" tmp_vec.z = x[col_idx.z * layout_x.y + layout_x.x]; \n");
  source.append(" tmp_vec.w = x[col_idx.w * layout_x.y + layout_x.x]; \n");

  source.append(" dot_prod += dot(tmp_entries, tmp_vec); \n");
  source.append(" } \n");
  source.append(" result[row * layout_result.y + layout_result.x] = dot_prod; \n");
  source.append(" } \n");
  source.append("} \n");
}

/** @brief Generates the OpenCL kernel 'vec_mul8': like vec_mul4 but with nonzeros packed in groups
  *        of eight (uint8 / 8-wide vectors); dot() is applied to the .lo and .hi halves since
  *        OpenCL's dot() supports at most 4 components.
  *
  * @param source          String to which the kernel source code is appended.
  * @param numeric_string  Scalar type of the kernel ("float" or "double").
  */
template <typename StringType>
void generate_compressed_matrix_vec_mul8(StringType & source, std::string const & numeric_string)
{
  source.append("__kernel void vec_mul8( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const uint8 * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append("8 * elements, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
  source.append(" uint4 layout_x, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" uint4 layout_result) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" dot_prod; \n");
  source.append(" unsigned int start, next_stop; \n");
  source.append(" uint8 col_idx; \n");
  source.append(" "); source.append(numeric_string); source.append("8 tmp_vec; \n");
  source.append(" "); source.append(numeric_string); source.append("8 tmp_entries; \n");

  source.append(" for (unsigned int row = get_global_id(0); row < layout_result.z; row += get_global_size(0)) \n");
  source.append(" { \n");
  source.append(" dot_prod = 0; \n");
  source.append(" start = row_indices[row] / 8; \n");
  source.append(" next_stop = row_indices[row+1] / 8; \n");

  source.append(" for (unsigned int i = start; i < next_stop; ++i) \n");
  source.append(" { \n");
  source.append(" col_idx = column_indices[i]; \n");

  source.append(" tmp_entries = elements[i]; \n");
  source.append(" tmp_vec.s0 = x[col_idx.s0 * layout_x.y + layout_x.x]; \n");
  source.append(" tmp_vec.s1 = x[col_idx.s1 * layout_x.y + layout_x.x]; \n");
  source.append(" tmp_vec.s2 = x[col_idx.s2 * layout_x.y + layout_x.x]; \n");
  source.append(" tmp_vec.s3 = x[col_idx.s3 * layout_x.y + layout_x.x]; \n");
  source.append(" tmp_vec.s4 = x[col_idx.s4 * layout_x.y + layout_x.x]; \n");
  source.append(" tmp_vec.s5 = x[col_idx.s5 * layout_x.y + layout_x.x]; \n");
  source.append(" tmp_vec.s6 = x[col_idx.s6 * layout_x.y + layout_x.x]; \n");
  source.append(" tmp_vec.s7 = x[col_idx.s7 * layout_x.y + layout_x.x]; \n");

  source.append(" dot_prod += dot(tmp_entries.lo, tmp_vec.lo); \n");
  source.append(" dot_prod += dot(tmp_entries.hi, tmp_vec.hi); \n");
  source.append(" } \n");
  source.append(" result[row * layout_result.y + layout_result.x] = dot_prod; \n");
  source.append(" } \n");
  source.append("} \n");
}

/** @brief Generates the OpenCL kernel 'vec_mul_cpu': CSR sparse-matrix-vector product with a
  *        contiguous block of rows per work item (CPU-friendly access pattern).
  *
  * @param source          String to which the kernel source code is appended.
  * @param numeric_string  Scalar type of the kernel ("float" or "double").
  */
template <typename StringType>
void generate_compressed_matrix_vec_mul_cpu(StringType & source, std::string const &
numeric_string) 01009 { 01010 source.append("__kernel void vec_mul_cpu( \n"); 01011 source.append(" __global const unsigned int * row_indices, \n"); 01012 source.append(" __global const unsigned int * column_indices, \n"); 01013 source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n"); 01014 source.append(" __global const "); source.append(numeric_string); source.append(" * vector, \n"); 01015 source.append(" __global "); source.append(numeric_string); source.append(" * result, \n"); 01016 source.append(" unsigned int size) \n"); 01017 source.append("{ \n"); 01018 source.append(" unsigned int work_per_item = max((uint) (size / get_global_size(0)), (uint) 1); \n"); 01019 source.append(" unsigned int row_start = get_global_id(0) * work_per_item; \n"); 01020 source.append(" unsigned int row_stop = min( (uint) ((get_global_id(0) + 1) * work_per_item), (uint) size); \n"); 01021 source.append(" for (unsigned int row = row_start; row < row_stop; ++row) \n"); 01022 source.append(" { \n"); 01023 source.append(" "); source.append(numeric_string); source.append(" dot_prod = ("); source.append(numeric_string); source.append(")0; \n"); 01024 source.append(" unsigned int row_end = row_indices[row+1]; \n"); 01025 source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i) \n"); 01026 source.append(" dot_prod += elements[i] * vector[column_indices[i]]; \n"); 01027 source.append(" result[row] = dot_prod; \n"); 01028 source.append(" } \n"); 01029 source.append("} \n"); 01030 01031 } 01032 01033 01035 01036 // main kernel class 01038 template <typename NumericT> 01039 struct compressed_matrix 01040 { 01041 static std::string program_name() 01042 { 01043 return viennacl::ocl::type_to_string<NumericT>::apply() + "_compressed_matrix"; 01044 } 01045 01046 static void init(viennacl::ocl::context & ctx) 01047 { 01048 viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx); 01049 std::string numeric_string = 
viennacl::ocl::type_to_string<NumericT>::apply(); 01050 01051 static std::map<cl_context, bool> init_done; 01052 if (!init_done[ctx.handle().get()]) 01053 { 01054 std::string source; 01055 source.reserve(1024); 01056 01057 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source); 01058 01059 if (numeric_string == "float" || numeric_string == "double") 01060 { 01061 generate_compressed_matrix_block_trans_lu_backward(source, numeric_string); 01062 generate_compressed_matrix_block_trans_unit_lu_forward(source, numeric_string); 01063 generate_compressed_matrix_jacobi(source, numeric_string); 01064 generate_compressed_matrix_lu_backward(source, numeric_string); 01065 generate_compressed_matrix_lu_forward(source, numeric_string); 01066 generate_compressed_matrix_trans_lu_backward(source, numeric_string); 01067 generate_compressed_matrix_trans_lu_forward(source, numeric_string); 01068 generate_compressed_matrix_trans_unit_lu_backward(source, numeric_string); 01069 generate_compressed_matrix_trans_unit_lu_forward(source, numeric_string); 01070 generate_compressed_matrix_trans_unit_lu_forward_slow(source, numeric_string); 01071 generate_compressed_matrix_unit_lu_backward(source, numeric_string); 01072 generate_compressed_matrix_unit_lu_forward(source, numeric_string); 01073 } 01074 generate_compressed_matrix_dense_matrix_multiplication(source, numeric_string); 01075 generate_compressed_matrix_row_info_extractor(source, numeric_string); 01076 generate_compressed_matrix_vec_mul(source, numeric_string); 01077 generate_compressed_matrix_vec_mul4(source, numeric_string); 01078 generate_compressed_matrix_vec_mul8(source, numeric_string); 01079 generate_compressed_matrix_vec_mul_cpu(source, numeric_string); 01080 01081 std::string prog_name = program_name(); 01082 #ifdef VIENNACL_BUILD_INFO 01083 std::cout << "Creating program " << prog_name << std::endl; 01084 #endif 01085 ctx.add_program(source, prog_name); 01086 init_done[ctx.handle().get()] = true; 01087 } //if 
01088 } //init 01089 }; 01090 01091 } // namespace kernels 01092 } // namespace opencl 01093 } // namespace linalg 01094 } // namespace viennacl 01095 #endif 01096