ViennaCL - The Vienna Computing Library
1.5.0
|
00001 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_SPAI_HPP 00002 #define VIENNACL_LINALG_OPENCL_KERNELS_SPAI_HPP 00003 00004 #include "viennacl/tools/tools.hpp" 00005 #include "viennacl/ocl/kernel.hpp" 00006 #include "viennacl/ocl/platform.hpp" 00007 #include "viennacl/ocl/utils.hpp" 00008 00011 namespace viennacl 00012 { 00013 namespace linalg 00014 { 00015 namespace opencl 00016 { 00017 namespace kernels 00018 { 00019 00021 00022 template <typename StringType> 00023 void generate_spai_assemble_blocks(StringType & source, std::string const & numeric_string) 00024 { 00025 source.append("float get_element(__global const unsigned int * row_indices, \n"); 00026 source.append(" __global const unsigned int * column_indices, \n"); 00027 source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n"); 00028 source.append(" unsigned int row, \n"); 00029 source.append(" unsigned int col) \n"); 00030 source.append("{ \n"); 00031 source.append(" unsigned int row_end = row_indices[row+1]; \n"); 00032 source.append(" for(unsigned int i = row_indices[row]; i < row_end; ++i){ \n"); 00033 source.append(" if(column_indices[i] == col) \n"); 00034 source.append(" return elements[i]; \n"); 00035 source.append(" if(column_indices[i] > col) \n"); 00036 source.append(" return 0; \n"); 00037 source.append(" } \n"); 00038 source.append(" return 0; \n"); 00039 source.append("} \n"); 00040 00041 source.append("void block_assembly(__global const unsigned int * row_indices, \n"); 00042 source.append(" __global const unsigned int * column_indices, \n"); 00043 source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n"); 00044 source.append(" __global const unsigned int * matrix_dimensions, \n"); 00045 source.append(" __global const unsigned int * set_I, \n"); 00046 source.append(" __global const unsigned int * set_J, \n"); 00047 source.append(" unsigned int matrix_ind, \n"); 00048 source.append(" __global "); source.append(numeric_string); source.append(" * com_A_I_J) \n"); 00049 source.append("{ \n"); 00050 source.append(" unsigned int row_n = matrix_dimensions[2*matrix_ind]; \n"); 00051 source.append(" unsigned int col_n = matrix_dimensions[2*matrix_ind + 1]; \n"); 00052 00053 source.append(" for(unsigned int i = 0; i < col_n; ++i){ \n"); 00054 //start row index 00055 source.append(" for(unsigned int j = 0; j < row_n; j++){ \n"); 00056 source.append(" com_A_I_J[ i*row_n + j] = get_element(row_indices, column_indices, elements, set_I[j], set_J[i]); \n"); 00057 source.append(" } \n"); 00058 source.append(" } \n"); 00059 source.append("} \n"); 00060 00061 source.append("__kernel void assemble_blocks( \n"); 00062 source.append(" __global const unsigned int * row_indices, \n"); 00063 source.append(" __global const unsigned int * column_indices, \n"); 00064 source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n"); 00065 source.append(" __global const unsigned int * set_I, \n"); 00066 source.append(" __global const unsigned int * set_J, \n"); 00067 source.append(" __global const unsigned int * i_ind, \n"); 00068 source.append(" __global const unsigned int * j_ind, \n"); 00069 source.append(" __global const unsigned int * block_ind, \n"); 00070 source.append(" __global const unsigned int * matrix_dimensions, \n"); 00071 source.append(" __global "); source.append(numeric_string); source.append(" * com_A_I_J, \n"); 00072 source.append(" __global unsigned int * g_is_update, \n"); 00073 source.append(" unsigned int block_elems_num) \n"); 00074 source.append("{ \n"); 00075 source.append(" for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n"); 00076 source.append(" if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n"); 00077 source.append(" block_assembly(row_indices, column_indices, elements, matrix_dimensions, set_I + i_ind[i], set_J + j_ind[i], i, com_A_I_J + block_ind[i]); \n"); 00078 source.append(" } \n"); 00079 source.append(" } \n"); 00080 source.append(" } \n"); 00081 } 00082 00083 template <typename StringType> 00084 void generate_spai_block_bv_assembly(StringType & source, std::string const & numeric_string) 00085 { 00086 source.append(" void assemble_bv(__global "); source.append(numeric_string); source.append(" * g_bv_r, __global "); source.append(numeric_string); source.append(" * g_bv, unsigned int col_n){ \n"); 00087 source.append(" for(unsigned int i = 0; i < col_n; ++i){ \n"); 00088 source.append(" g_bv_r[i] = g_bv[ i]; \n"); 00089 source.append(" } \n"); 00090 source.append(" } \n"); 00091 00092 source.append(" void assemble_bv_block(__global "); source.append(numeric_string); source.append(" * g_bv_r, __global "); source.append(numeric_string); source.append(" * g_bv, unsigned int col_n, \n"); 00093 source.append(" __global "); source.append(numeric_string); source.append(" * g_bv_u, unsigned int col_n_u) \n"); 00094 source.append(" { \n"); 00095 source.append(" assemble_bv(g_bv_r, g_bv, col_n); \n"); 00096 source.append(" assemble_bv(g_bv_r + col_n, g_bv_u, col_n_u); \n"); 00097 source.append(" } \n"); 00098 00099 source.append(" __kernel void block_bv_assembly(__global "); source.append(numeric_string); source.append(" * g_bv, \n"); 00100 source.append(" __global unsigned int * start_bv_ind, \n"); 00101 source.append(" __global unsigned int * matrix_dimensions, \n"); 00102 source.append(" __global "); source.append(numeric_string); source.append(" * g_bv_u, \n"); 00103 source.append(" __global unsigned int * start_bv_u_ind, \n"); 00104 source.append(" __global unsigned int * matrix_dimensions_u, \n"); 00105 source.append(" __global "); source.append(numeric_string); source.append(" * g_bv_r, \n"); 00106 source.append(" __global unsigned int * start_bv_r_ind, \n"); 00107 source.append(" __global unsigned int * matrix_dimensions_r, \n"); 00108 source.append(" __global unsigned int * g_is_update, \n"); 00109 source.append(" //__local "); source.append(numeric_string); source.append(" * local_gb, \n"); 00110 source.append(" unsigned int block_elems_num) \n"); 00111 source.append(" { \n"); 00112 source.append(" for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n"); 00113 source.append(" if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n"); 00114 source.append(" assemble_bv_block(g_bv_r + start_bv_r_ind[i], g_bv + start_bv_ind[i], matrix_dimensions[2*i + 1], g_bv_u + start_bv_u_ind[i], matrix_dimensions_u[2*i + 1]); \n"); 00115 source.append(" } \n"); 00116 source.append(" } \n"); 00117 source.append(" } \n"); 00118 } 00119 00120 template <typename StringType> 00121 void generate_spai_block_least_squares(StringType & source, std::string const & numeric_string) 00122 { 00123 source.append("void custom_dot_prod_ls(__global "); source.append(numeric_string); source.append(" * A, unsigned int row_n, __global "); source.append(numeric_string); source.append(" * v, unsigned int ind, "); source.append(numeric_string); source.append(" *res){ \n"); 00124 source.append(" *res = 0.0; \n"); 00125 source.append(" for(unsigned int j = ind; j < row_n; ++j){ \n"); 00126 source.append(" if(j == ind){ \n"); 00127 source.append(" *res += v[ j]; \n"); 00128 source.append(" }else{ \n"); 00129 source.append(" *res += A[ j + ind*row_n]*v[ j]; \n"); 00130 source.append(" } \n"); 00131 source.append(" } \n"); 00132 source.append("} \n"); 00133 00134 source.append("void backwardSolve(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * y, __global "); source.append(numeric_string); source.append(" * x){ \n"); 00135 source.append(" for (int i = col_n-1; i >= 0 ; i--) { \n"); 00136 source.append(" x[ i] = y[ i]; \n"); 00137 source.append(" for (int j = i+1; j < col_n; ++j) { \n"); 00138 source.append(" x[ i] -= R[ i + j*row_n]*x[ j]; \n"); 00139 source.append(" } \n"); 00140 source.append(" x[i] /= R[ i + i*row_n]; \n"); 00141 source.append(" } \n"); 00142 source.append("} \n"); 00143 00144 00145 source.append("void apply_q_trans_vec_ls(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global const "); source.append(numeric_string); source.append(" * b_v, __global "); source.append(numeric_string); source.append(" * y){ \n"); 00146 source.append(" "); source.append(numeric_string); source.append(" inn_prod = 0; \n"); 00147 source.append(" for(unsigned int i = 0; i < col_n; ++i){ \n"); 00148 source.append(" custom_dot_prod_ls(R, row_n, y, i, &inn_prod); \n"); 00149 source.append(" for(unsigned int j = i; j < row_n; ++j){ \n"); 00150 source.append(" if(i == j){ \n"); 00151 source.append(" y[ j] -= b_v[ i]*inn_prod; \n"); 00152 source.append(" } \n"); 00153 source.append(" else{ \n"); 00154 source.append(" y[j] -= b_v[ i]*inn_prod*R[ j +i*row_n]; \n"); 00155 source.append(" } \n"); 00156 source.append(" } \n"); 00157 source.append(" } \n"); 00158 source.append(" } \n"); 00159 00160 source.append("void ls(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __global "); source.append(numeric_string); source.append(" * m_v, __global "); source.append(numeric_string); source.append(" * y_v){ \n"); 00161 source.append(" apply_q_trans_vec_ls(R, row_n, col_n, b_v, y_v); \n"); 00162 source.append(" //m_new - is m_v now \n"); 00163 source.append(" backwardSolve(R, row_n, col_n, y_v, m_v); \n"); 00164 source.append("} \n"); 00165 00166 source.append("__kernel void block_least_squares( \n"); 00167 source.append(" __global "); source.append(numeric_string); source.append(" * global_R, \n"); 00168 source.append(" __global unsigned int * block_ind, \n"); 00169 source.append(" __global "); source.append(numeric_string); source.append(" * b_v, \n"); 00170 source.append(" __global unsigned int * start_bv_inds, \n"); 00171 source.append(" __global "); source.append(numeric_string); source.append(" * m_v, \n"); 00172 source.append(" __global "); source.append(numeric_string); source.append(" * y_v, \n"); 00173 source.append(" __global unsigned int * start_y_inds, \n"); 00174 source.append(" __global unsigned int * matrix_dimensions, \n"); 00175 source.append(" __global unsigned int * g_is_update, \n"); 00176 source.append(" unsigned int block_elems_num) \n"); 00177 source.append("{ \n"); 00178 source.append(" for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n"); 00179 source.append(" if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n"); 00180 source.append(" ls(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v +start_bv_inds[i], m_v + start_bv_inds[i], y_v + start_y_inds[i] ); \n"); 00181 source.append(" } \n"); 00182 source.append(" } \n"); 00183 source.append("} \n"); 00184 } 00185 00186 template <typename StringType> 00187 void generate_spai_block_q_mult(StringType & source, std::string const & numeric_string) 00188 { 00189 source.append("void custom_dot_prod(__global "); source.append(numeric_string); source.append(" * A, unsigned int row_n, __local "); source.append(numeric_string); source.append(" * v, unsigned int ind, "); source.append(numeric_string); source.append(" *res){ \n"); 00190 source.append(" *res = 0.0; \n"); 00191 source.append(" for(unsigned int j = ind; j < row_n; ++j){ \n"); 00192 source.append(" if(j == ind){ \n"); 00193 source.append(" *res += v[j]; \n"); 00194 source.append(" }else{ \n"); 00195 source.append(" *res += A[j + ind*row_n]*v[j]; \n"); 00196 source.append(" } \n"); 00197 source.append(" } \n"); 00198 source.append("} \n"); 00199 00200 source.append("void apply_q_trans_vec(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __local "); source.append(numeric_string); source.append(" * y){ \n"); 00201 source.append(" "); source.append(numeric_string); source.append(" inn_prod = 0; \n"); 00202 source.append(" for(unsigned int i = 0; i < col_n; ++i){ \n"); 00203 source.append(" custom_dot_prod(R, row_n, y, i, &inn_prod); \n"); 00204 source.append(" for(unsigned int j = i; j < row_n; ++j){ \n"); 00205 source.append(" if(i == j){ \n"); 00206 source.append(" y[j] -= b_v[ i]*inn_prod; \n"); 00207 source.append(" } \n"); 00208 source.append(" else{ \n"); 00209 source.append(" y[j] -= b_v[ i]*inn_prod*R[ j + i*row_n]; \n"); 00210 source.append(" } \n"); 00211 source.append(" } \n"); 00212 source.append(" } \n"); 00213 source.append("} \n"); 00214 00215 source.append("void q_mult(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __local "); source.append(numeric_string); source.append(" * R_u, unsigned int col_n_u){ \n"); 00216 source.append(" for(unsigned int i = get_local_id(0); i < col_n_u; i+= get_local_size(0)){ \n"); 00217 source.append(" apply_q_trans_vec(R, row_n, col_n, b_v, R_u + row_n*i); \n"); 00218 source.append(" } \n"); 00219 source.append("} \n"); 00220 00221 source.append("void matrix_from_global_to_local(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n"); 00222 source.append(" for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n"); 00223 source.append(" for(unsigned int j = 0; j < row_n; ++j){ \n"); 00224 source.append(" l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j]; \n"); 00225 source.append(" } \n"); 00226 source.append(" } \n"); 00227 source.append("} \n"); 00228 00229 source.append("void matrix_from_local_to_global(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n"); 00230 source.append(" for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n"); 00231 source.append(" for(unsigned int j = 0; j < row_n; ++j){ \n"); 00232 source.append(" g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j]; \n"); 00233 source.append(" } \n"); 00234 source.append(" } \n"); 00235 source.append("} \n"); 00236 00237 source.append("__kernel void block_q_mult(__global "); source.append(numeric_string); source.append(" * global_R, \n"); 00238 source.append(" __global unsigned int * block_ind, \n"); 00239 source.append(" __global "); source.append(numeric_string); source.append(" * global_R_u, \n"); 00240 source.append(" __global unsigned int *block_ind_u, \n"); 00241 source.append(" __global "); source.append(numeric_string); source.append(" * b_v, \n"); 00242 source.append(" __global unsigned int * start_bv_inds, \n"); 00243 source.append(" __global unsigned int * matrix_dimensions, \n"); 00244 source.append(" __global unsigned int * matrix_dimensions_u, \n"); 00245 source.append(" __global unsigned int * g_is_update, \n"); 00246 source.append(" __local "); source.append(numeric_string); source.append(" * local_R_u, \n"); 00247 source.append(" unsigned int block_elems_num){ \n"); 00248 source.append(" for(unsigned int i = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){ \n"); 00249 source.append(" if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && (g_is_update[i] > 0)){ \n"); 00250 //matrix_from_global_to_local(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n"); 00251 source.append(" matrix_from_global_to_local(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i+ 1], block_ind_u[i]); \n"); 00252 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00253 source.append(" q_mult(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v + start_bv_inds[i], local_R_u, \n"); 00254 source.append(" matrix_dimensions_u[2*i + 1]); \n"); 00255 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00256 source.append(" matrix_from_local_to_global(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], block_ind_u[i]); \n"); 00257 source.append(" } \n"); 00258 source.append(" } \n"); 00259 source.append("} \n"); 00260 } 00261 00262 template <typename StringType> 00263 void generate_spai_block_qr(StringType & source, std::string const & numeric_string) 00264 { 00265 source.append("void dot_prod(__local const "); source.append(numeric_string); source.append("* A, unsigned int n, unsigned int beg_ind, "); source.append(numeric_string); source.append("* res){ \n"); 00266 source.append(" *res = 0; \n"); 00267 source.append(" for(unsigned int i = beg_ind; i < n; ++i){ \n"); 00268 source.append(" *res += A[(beg_ind-1)*n + i]*A[(beg_ind-1)*n + i]; \n"); 00269 source.append(" } \n"); 00270 source.append("} \n"); 00271 00272 source.append("void vector_div(__global "); source.append(numeric_string); source.append("* v, unsigned int beg_ind, "); source.append(numeric_string); source.append(" b, unsigned int n){ \n"); 00273 source.append(" for(unsigned int i = beg_ind; i < n; ++i){ \n"); 00274 source.append(" v[i] /= b; \n"); 00275 source.append(" } \n"); 00276 source.append("} \n"); 00277 00278 source.append("void copy_vector(__local const "); source.append(numeric_string); source.append("* A, __global "); source.append(numeric_string); source.append("* v, const unsigned int beg_ind, const unsigned int n){ \n"); 00279 source.append(" for(unsigned int i = beg_ind; i < n; ++i){ \n"); 00280 source.append(" v[i] = A[(beg_ind-1)*n + i]; \n"); 00281 source.append(" } \n"); 00282 source.append("} \n"); 00283 00284 00285 source.append("void householder_vector(__local const "); source.append(numeric_string); source.append("* A, unsigned int j, unsigned int n, __global "); source.append(numeric_string); source.append("* v, __global "); source.append(numeric_string); source.append("* b){ \n"); 00286 source.append(" "); source.append(numeric_string); source.append(" sg; \n"); 00287 source.append(" dot_prod(A, n, j+1, &sg); \n"); 00288 source.append(" copy_vector(A, v, j+1, n); \n"); 00289 source.append(" "); source.append(numeric_string); source.append(" mu; \n"); 00290 source.append(" v[j] = 1.0; \n"); 00291 //print_contigious_vector(v, v_start_ind, n); 00292 source.append(" if(sg == 0){ \n"); 00293 source.append(" *b = 0; \n"); 00294 source.append(" } \n"); 00295 source.append(" else{ \n"); 00296 source.append(" mu = sqrt(A[j*n + j]*A[ j*n + j] + sg); \n"); 00297 source.append(" if(A[ j*n + j] <= 0){ \n"); 00298 source.append(" v[j] = A[ j*n + j] - mu; \n"); 00299 source.append(" }else{ \n"); 00300 source.append(" v[j] = -sg/(A[ j*n + j] + mu); \n"); 00301 source.append(" } \n"); 00302 source.append(" *b = 2*(v[j]*v[j])/(sg + v[j]*v[j]); \n"); 00303 //*b = (2*v[j]*v[j])/(sg + (v[j])*(v[j])); 00304 source.append(" vector_div(v, j, v[j], n); \n"); 00305 //print_contigious_vector(v, v_start_ind, n); 00306 source.append(" } \n"); 00307 source.append("} \n"); 00308 00309 source.append("void custom_inner_prod(__local const "); source.append(numeric_string); source.append("* A, __global "); source.append(numeric_string); source.append("* v, unsigned int col_ind, unsigned int row_num, unsigned int start_ind, "); source.append(numeric_string); source.append("* res){ \n"); 00310 source.append(" for(unsigned int i = start_ind; i < row_num; ++i){ \n"); 00311 source.append(" *res += A[col_ind*row_num + i]*v[i]; \n"); 00312 source.append(" } \n"); 00313 source.append("} \n"); 00314 // 00315 source.append("void apply_householder_reflection(__local "); source.append(numeric_string); source.append("* A, unsigned int row_n, unsigned int col_n, unsigned int iter_cnt, __global "); source.append(numeric_string); source.append("* v, "); source.append(numeric_string); source.append(" b){ \n"); 00316 source.append(" "); source.append(numeric_string); source.append(" in_prod_res; \n"); 00317 source.append(" for(unsigned int i= iter_cnt + get_local_id(0); i < col_n; i+=get_local_size(0)){ \n"); 00318 source.append(" in_prod_res = 0.0; \n"); 00319 source.append(" custom_inner_prod(A, v, i, row_n, iter_cnt, &in_prod_res); \n"); 00320 source.append(" for(unsigned int j = iter_cnt; j < row_n; ++j){ \n"); 00321 source.append(" A[ i*row_n + j] -= b*in_prod_res* v[j]; \n"); 00322 source.append(" } \n"); 00323 source.append(" } \n"); 00324 source.append("} \n"); 00325 00326 source.append("void store_householder_vector(__local "); source.append(numeric_string); source.append("* A, unsigned int ind, unsigned int n, __global "); source.append(numeric_string); source.append("* v){ \n"); 00327 source.append(" for(unsigned int i = ind; i < n; ++i){ \n"); 00328 source.append(" A[ (ind-1)*n + i] = v[i]; \n"); 00329 source.append(" } \n"); 00330 source.append("} \n"); 00331 00332 source.append("void single_qr( __local "); source.append(numeric_string); source.append("* R, __global unsigned int* matrix_dimensions, __global "); source.append(numeric_string); source.append("* b_v, __global "); source.append(numeric_string); source.append("* v, unsigned int matrix_ind){ \n"); 00333 //matrix_dimensions[0] - number of rows 00334 //matrix_dimensions[1] - number of columns 00335 source.append(" unsigned int col_n = matrix_dimensions[2*matrix_ind + 1]; \n"); 00336 source.append(" unsigned int row_n = matrix_dimensions[2*matrix_ind]; \n"); 00337 00338 source.append(" if((col_n == row_n)&&(row_n == 1)){ \n"); 00339 source.append(" b_v[0] = 0.0; \n"); 00340 source.append(" return; \n"); 00341 source.append(" } \n"); 00342 source.append(" for(unsigned int i = 0; i < col_n; ++i){ \n"); 00343 source.append(" if(get_local_id(0) == 0){ \n"); 00344 source.append(" householder_vector(R, i, row_n, v, b_v + i); \n"); 00345 source.append(" } \n"); 00346 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00347 source.append(" apply_householder_reflection(R, row_n, col_n, i, v, b_v[i]); \n"); 00348 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00349 source.append(" if(get_local_id(0) == 0){ \n"); 00350 source.append(" if(i < matrix_dimensions[2*matrix_ind]){ \n"); 00351 source.append(" store_householder_vector(R, i+1, row_n, v); \n"); 00352 source.append(" } \n"); 00353 source.append(" } \n"); 00354 source.append(" } \n"); 00355 source.append("} \n"); 00356 00357 source.append("void matrix_from_global_to_local_qr(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n"); 00358 source.append(" for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n"); 00359 source.append(" for(unsigned int j = 0; j < row_n; ++j){ \n"); 00360 source.append(" l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j]; \n"); 00361 source.append(" } \n"); 00362 source.append(" } \n"); 00363 source.append("} \n"); 00364 source.append("void matrix_from_local_to_global_qr(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n"); 00365 source.append(" for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n"); 00366 source.append(" for(unsigned int j = 0; j < row_n; ++j){ \n"); 00367 source.append(" g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j]; \n"); 00368 source.append(" } \n"); 00369 source.append(" } \n"); 00370 source.append("} \n"); 00371 00372 00373 source.append("__kernel void block_qr( \n"); 00374 source.append(" __global "); source.append(numeric_string); source.append("* R, \n"); 00375 source.append(" __global unsigned int* matrix_dimensions, \n"); 00376 source.append(" __global "); source.append(numeric_string); source.append("* b_v, \n"); 00377 source.append(" __global "); source.append(numeric_string); source.append("* v, \n"); 00378 source.append(" __global unsigned int* start_matrix_inds, \n"); 00379 source.append(" __global unsigned int* start_bv_inds, \n"); 00380 source.append(" __global unsigned int* start_v_inds, \n"); 00381 source.append(" __global unsigned int * g_is_update, \n"); 00382 source.append(" __local "); source.append(numeric_string); source.append("* local_buff_R, \n"); 00383 source.append(" unsigned int block_elems_num){ \n"); 00384 source.append(" for(unsigned int i = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){ \n"); 00385 source.append(" if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n"); 00386 source.append(" matrix_from_global_to_local_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n"); 00387 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00388 source.append(" single_qr(local_buff_R, matrix_dimensions, b_v + start_bv_inds[i], v + start_v_inds[i], i); \n"); 00389 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00390 source.append(" matrix_from_local_to_global_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n"); 00391 source.append(" } \n"); 00392 source.append(" } \n"); 00393 source.append("} \n"); 00394 } 00395 00396 template <typename StringType> 00397 void generate_spai_block_qr_assembly(StringType & source, std::string const & numeric_string) 00398 { 00399 source.append("void assemble_upper_part(__global "); source.append(numeric_string); source.append(" * R_q, \n"); 00400 source.append(" unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, \n"); 00401 source.append(" unsigned int row_n_u, unsigned int col_n_u, \n"); 00402 source.append(" unsigned int col_n, unsigned int diff){ \n"); 00403 source.append(" for(unsigned int i = 0; i < col_n_q; ++i){ \n"); 00404 source.append(" for(unsigned int j = 0; j < diff; ++j){ \n"); 00405 source.append(" R_q[ i*row_n_q + j] = R_u[ i*row_n_u + j + col_n ]; \n"); 00406 source.append(" } \n"); 00407 source.append(" } \n"); 00408 source.append(" } \n"); 00409 00410 source.append("void assemble_lower_part(__global "); source.append(numeric_string); source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u_u, \n"); 00411 source.append(" unsigned int row_n_u_u, unsigned int col_n_u_u, \n"); 00412 source.append(" unsigned int diff){ \n"); 00413 source.append(" for(unsigned int i = 0; i < col_n_u_u; ++i){ \n"); 00414 source.append(" for(unsigned int j = 0; j < row_n_u_u; ++j){ \n"); 00415 source.append(" R_q[i*row_n_q + j + diff] = R_u_u[i*row_n_u_u + j]; \n"); 00416 source.append(" } \n"); 00417 source.append(" } \n"); 00418 source.append("} \n"); 00419 00420 source.append("void assemble_qr_block(__global "); source.append(numeric_string); source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, \n"); 00421 source.append(" unsigned int col_n_u, __global "); source.append(numeric_string); source.append(" * R_u_u, unsigned int row_n_u_u, unsigned int col_n_u_u, unsigned int col_n){ \n"); 00422 source.append(" unsigned int diff = row_n_u - col_n; \n"); 00423 source.append(" assemble_upper_part(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff); \n"); 00424 source.append(" if(diff > 0){ \n"); 00425 source.append(" assemble_lower_part(R_q, row_n_q, col_n_q, R_u_u, row_n_u_u, col_n_u_u, diff); \n"); 00426 source.append(" } \n"); 00427 source.append("} \n"); 00428 00429 source.append("__kernel void block_qr_assembly( \n"); 00430 source.append(" __global unsigned int * matrix_dimensions, \n"); 00431 source.append(" __global "); source.append(numeric_string); source.append(" * R_u, \n"); 00432 source.append(" __global unsigned int * block_ind_u, \n"); 00433 source.append(" __global unsigned int * matrix_dimensions_u, \n"); 00434 source.append(" __global "); source.append(numeric_string); source.append(" * R_u_u, \n"); 00435 source.append(" __global unsigned int * block_ind_u_u, \n"); 00436 source.append(" __global unsigned int * matrix_dimensions_u_u, \n"); 00437 source.append(" __global "); source.append(numeric_string); source.append(" * R_q, \n"); 00438 source.append(" __global unsigned int * block_ind_q, \n"); 00439 source.append(" __global unsigned int * matrix_dimensions_q, \n"); 00440 source.append(" __global unsigned int * g_is_update, \n"); 00441 source.append(" //__local "); source.append(numeric_string); source.append(" * local_R_q, \n"); 00442 source.append(" unsigned int block_elems_num) \n"); 00443 source.append("{ \n"); 00444 source.append(" for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n"); 00445 source.append(" if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n"); 00446 source.append(" assemble_qr_block(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n"); 00447 source.append(" matrix_dimensions_u[2*i + 1], R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1], matrix_dimensions[2*i + 1]); \n"); 00448 source.append(" } \n"); 00449 source.append(" } \n"); 00450 source.append("} \n"); 00451 } 00452 00453 template <typename StringType> 00454 void generate_spai_block_qr_assembly_1(StringType & source, std::string const & numeric_string) 00455 { 00456 source.append("void assemble_upper_part_1(__global "); source.append(numeric_string); source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, \n"); 00457 source.append(" unsigned int row_n_u, unsigned int col_n_u, \n"); 00458 source.append(" unsigned int col_n, unsigned int diff){ \n"); 00459 source.append(" for(unsigned int i = 0; i < col_n_q; ++i){ \n"); 00460 source.append(" for(unsigned int j = 0; j < diff; ++j){ \n"); 00461 source.append(" R_q[ i*row_n_q + j] = R_u[i*row_n_u + j + col_n ]; \n"); 00462 source.append(" } \n"); 00463 source.append(" } \n"); 00464 source.append(" } \n"); 00465 00466 00467 source.append("void assemble_qr_block_1(__global "); source.append(numeric_string); source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, \n"); 00468 source.append(" unsigned int col_n_u, unsigned int col_n){ \n"); 00469 source.append(" unsigned int diff = row_n_u - col_n; \n"); 00470 source.append(" assemble_upper_part_1(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff); \n"); 00471 source.append("} \n"); 00472 00473 source.append("__kernel void block_qr_assembly_1( \n"); 00474 source.append(" __global unsigned int * matrix_dimensions, \n"); 00475 source.append(" __global "); source.append(numeric_string); source.append(" * R_u, \n"); 00476 source.append(" __global unsigned int * block_ind_u, \n"); 00477 source.append(" __global unsigned int * matrix_dimensions_u, \n"); 00478 source.append(" __global "); source.append(numeric_string); source.append(" * R_q, \n"); 00479 source.append(" __global unsigned int * block_ind_q, \n"); 00480 source.append(" __global unsigned int * matrix_dimensions_q, \n"); 00481 source.append(" __global unsigned int * g_is_update, \n"); 00482 source.append(" unsigned int block_elems_num) \n"); 00483 source.append("{ \n"); 00484 source.append(" for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n"); 00485 source.append(" if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n"); 00486 source.append(" assemble_qr_block_1(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n"); 00487 source.append(" matrix_dimensions_u[2*i + 1], matrix_dimensions[2*i + 1]); \n"); 00488 source.append(" } \n"); 00489 source.append(" } \n"); 00490 source.append("} \n"); 00491 } 00492 00493 template <typename StringType> 00494 void generate_spai_block_r_assembly(StringType & source, std::string const & numeric_string) 00495 { 00496 source.append("void assemble_r(__global "); source.append(numeric_string); source.append(" * gR, unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R, \n"); 00497 source.append(" unsigned int row_n, unsigned int col_n) \n"); 00498 source.append("{ \n"); 00499 source.append(" for(unsigned int i = 0; i < col_n; ++i){ \n"); 00500 source.append(" for(unsigned int j = 0; j < row_n; ++j){ \n"); 00501 source.append(" gR[i*row_n_r + j] = R[i*row_n + j ]; \n"); 00502 source.append(" } \n"); 00503 source.append(" } \n"); 00504 source.append("} \n"); 00505 00506 source.append("void assemble_r_u(__global "); source.append(numeric_string); source.append(" * gR, \n"); 00507 source.append(" unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, unsigned int col_n_u, \n"); 00508 source.append(" unsigned int col_n) \n"); 00509 source.append("{ \n"); 00510 source.append(" for(unsigned int i = 0; i < col_n_u; ++i){ \n"); 00511 source.append(" for(unsigned int j = 0; j < col_n; ++j){ \n"); 00512 source.append(" gR[ (i+col_n)*row_n_r + j] = R_u[ i*row_n_u + j]; \n"); 00513 source.append(" } \n"); 00514 source.append(" } \n"); 00515 source.append("} \n"); 00516 00517 00518 source.append("void assemble_r_u_u(__global "); source.append(numeric_string); source.append(" * gR, unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R_u_u, unsigned int row_n_u_u, \n"); 00519 source.append(" unsigned int col_n_u_u, unsigned int col_n) \n"); 00520 source.append("{ \n"); 00521 source.append(" for(unsigned int i = 0; i < col_n_u_u; ++i){ \n"); 00522 source.append(" for(unsigned int j = 0; j < row_n_u_u; ++j){ \n"); 00523 source.append(" gR[(col_n+i)*row_n_r + j + col_n] = R_u_u[i*row_n_u_u + j]; \n"); 00524 source.append(" } \n"); 00525 source.append(" } \n"); 00526 source.append("} \n"); 00527 00528 source.append("void assemble_r_block(__global "); source.append(numeric_string); source.append(" * gR, unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, \n"); 00529 source.append(" unsigned int col_n, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, unsigned int col_n_u, __global "); source.append(numeric_string); source.append(" * R_u_u, \n"); 00530 source.append(" unsigned int row_n_u_u, unsigned int col_n_u_u){ \n"); 00531 source.append(" assemble_r(gR, row_n_r, col_n_r, R, row_n, col_n); \n"); 00532 source.append(" assemble_r_u(gR, row_n_r, col_n_r, R_u, row_n_u, col_n_u, col_n); \n"); 00533 source.append(" assemble_r_u_u(gR, row_n_r, col_n_r, R_u_u, row_n_u_u, col_n_u_u, col_n); \n"); 00534 source.append("} \n"); 00535 00536 00537 source.append("__kernel void block_r_assembly( \n"); 00538 source.append(" __global "); source.append(numeric_string); source.append(" * R, \n"); 00539 source.append(" __global unsigned int * block_ind, \n"); 00540 source.append(" __global unsigned int * matrix_dimensions, \n"); 00541 source.append(" __global "); source.append(numeric_string); source.append(" * R_u, \n"); 00542 source.append(" __global unsigned int * block_ind_u, \n"); 00543 source.append(" __global unsigned int * matrix_dimensions_u, \n"); 00544 source.append(" __global "); source.append(numeric_string); source.append(" * R_u_u, \n"); 00545 source.append(" __global unsigned int * block_ind_u_u, \n"); 00546 source.append(" __global unsigned int * matrix_dimensions_u_u, \n"); 00547 source.append(" __global "); source.append(numeric_string); source.append(" * g_R, \n"); 00548 source.append(" __global unsigned int * block_ind_r, \n"); 00549 source.append(" __global unsigned int * matrix_dimensions_r, \n"); 00550 source.append(" __global unsigned int * g_is_update, \n"); 00551 source.append(" unsigned int block_elems_num) \n"); 00552 source.append("{ \n"); 00553 source.append(" for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n"); 00554 source.append(" if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n"); 00555 00556 source.append(" assemble_r_block(g_R + block_ind_r[i], matrix_dimensions_r[2*i], matrix_dimensions_r[2*i + 1], R + block_ind[i], matrix_dimensions[2*i], \n"); 00557 source.append(" matrix_dimensions[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], \n"); 00558 source.append(" R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1]); \n"); 00559 00560 source.append(" } \n"); 00561 source.append(" } \n"); 00562 source.append("} \n"); 00563 } 00564 00566 00567 // main kernel class 00569 template <typename NumericT> 00570 struct spai 00571 { 00572 static std::string program_name() 00573 { 00574 return viennacl::ocl::type_to_string<NumericT>::apply() + "_spai"; 00575 } 00576 00577 static void init(viennacl::ocl::context & ctx) 00578 { 00579 viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx); 00580 std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply(); 00581 00582 static std::map<cl_context, bool> init_done; 00583 if (!init_done[ctx.handle().get()]) 00584 { 00585 std::string source; 00586 source.reserve(1024); 00587 00588 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source); 00589 00590 generate_spai_assemble_blocks(source, numeric_string); 00591 generate_spai_block_bv_assembly(source, numeric_string); 00592 generate_spai_block_least_squares(source, numeric_string); 00593 generate_spai_block_q_mult(source, numeric_string); 00594 generate_spai_block_qr(source, numeric_string); 00595 generate_spai_block_qr_assembly(source, numeric_string); 00596 generate_spai_block_qr_assembly_1(source, numeric_string); 00597 generate_spai_block_r_assembly(source, numeric_string); 00598 00599 std::string prog_name = program_name(); 00600 #ifdef VIENNACL_BUILD_INFO 00601 std::cout << "Creating program " << prog_name << std::endl; 00602 #endif 00603 ctx.add_program(source, prog_name); 00604 init_done[ctx.handle().get()] = true; 00605 } //if 00606 } //init 00607 }; 00608 00609 } // namespace kernels 00610 } // namespace opencl 00611 } // namespace linalg 00612 } // namespace viennacl 00613 #endif 00614