ViennaCL - The Vienna Computing Library
1.5.0
|
00001 #ifndef VIENNACL_LINALG_QR_METHOD_HPP_ 00002 #define VIENNACL_LINALG_QR_METHOD_HPP_ 00003 00004 /* ========================================================================= 00005 Copyright (c) 2010-2013, Institute for Microelectronics, 00006 Institute for Analysis and Scientific Computing, 00007 TU Wien. 00008 Portions of this software are copyright by UChicago Argonne, LLC. 00009 00010 ----------------- 00011 ViennaCL - The Vienna Computing Library 00012 ----------------- 00013 00014 Project Head: Karl Rupp rupp@iue.tuwien.ac.at 00015 00016 (A list of authors and contributors can be found in the PDF manual) 00017 00018 License: MIT (X11), see file LICENSE in the base directory 00019 ============================================================================= */ 00020 00021 #include "viennacl/vector.hpp" 00022 #include "viennacl/matrix.hpp" 00023 00024 #include "viennacl/linalg/qr-method-common.hpp" 00025 #include "viennacl/linalg/prod.hpp" 00026 00027 #include <boost/numeric/ublas/vector.hpp> 00028 #include <boost/numeric/ublas/matrix.hpp> 00029 00034 namespace viennacl 00035 { 00036 namespace linalg 00037 { 00038 namespace detail 00039 { 00040 template<typename MatrixType, typename VectorType> 00041 void givens_next(MatrixType& matrix, 00042 VectorType& tmp1, 00043 VectorType& tmp2, 00044 int l, 00045 int m 00046 ) 00047 { 00048 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(matrix).context()); 00049 00050 typedef typename MatrixType::value_type ScalarType; 00051 typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type CPU_ScalarType; 00052 00053 viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<CPU_ScalarType>::program_name(), SVD_GIVENS_NEXT_KERNEL); 00054 00055 kernel.global_work_size(0, viennacl::tools::align_to_multiple<cl_uint>(cl_uint(viennacl::traits::size1(matrix)), 256)); 00056 kernel.local_work_size(0, 256); 00057 00058 viennacl::ocl::enqueue(kernel( 00059 matrix, 00060 tmp1, 00061 tmp2, 00062 static_cast<cl_uint>(matrix.size1()), 00063 static_cast<cl_uint>(matrix.internal_size2()), 00064 static_cast<cl_uint>(l), 00065 static_cast<cl_uint>(m - 1) 00066 )); 00067 } 00068 00069 00070 // Symmetric tridiagonal QL algorithm. 00071 // This is derived from the Algol procedures tql2, by Bowdler, Martin, Reinsch, and Wilkinson, 00072 // Handbook for Auto. Comp., Vol.ii-Linear Algebra, and the corresponding Fortran subroutine in EISPACK. 00073 template <typename SCALARTYPE, unsigned int ALIGNMENT> 00074 void tql2(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & Q, 00075 boost::numeric::ublas::vector<SCALARTYPE> & d, 00076 boost::numeric::ublas::vector<SCALARTYPE> & e) 00077 { 00078 int n = static_cast<int>(Q.size1()); 00079 00080 boost::numeric::ublas::vector<SCALARTYPE> cs(n), ss(n); 00081 viennacl::vector<SCALARTYPE> tmp1(n), tmp2(n); 00082 00083 for (int i = 1; i < n; i++) 00084 e(i - 1) = e(i); 00085 00086 e(n - 1) = 0; 00087 00088 SCALARTYPE f = 0; 00089 SCALARTYPE tst1 = 0; 00090 SCALARTYPE eps = 2 * static_cast<SCALARTYPE>(EPS); 00091 00092 for (int l = 0; l < n; l++) 00093 { 00094 // Find small subdiagonal element. 00095 tst1 = std::max<SCALARTYPE>(tst1, std::fabs(d(l)) + std::fabs(e(l))); 00096 int m = l; 00097 while (m < n) 00098 { 00099 if (std::fabs(e(m)) <= eps * tst1) 00100 break; 00101 m++; 00102 } 00103 00104 // If m == l, d(l) is an eigenvalue, otherwise, iterate. 00105 if (m > l) 00106 { 00107 int iter = 0; 00108 do 00109 { 00110 iter = iter + 1; // (Could check iteration count here.) 00111 00112 // Compute implicit shift 00113 SCALARTYPE g = d(l); 00114 SCALARTYPE p = (d(l + 1) - g) / (2 * e(l)); 00115 SCALARTYPE r = pythag<SCALARTYPE>(p, 1); 00116 if (p < 0) 00117 { 00118 r = -r; 00119 } 00120 00121 d(l) = e(l) / (p + r); 00122 d(l + 1) = e(l) * (p + r); 00123 SCALARTYPE dl1 = d(l + 1); 00124 SCALARTYPE h = g - d(l); 00125 for (int i = l + 2; i < n; i++) 00126 { 00127 d(i) -= h; 00128 } 00129 00130 f = f + h; 00131 00132 // Implicit QL transformation. 00133 p = d(m); 00134 SCALARTYPE c = 1; 00135 SCALARTYPE c2 = c; 00136 SCALARTYPE c3 = c; 00137 SCALARTYPE el1 = e(l + 1); 00138 SCALARTYPE s = 0; 00139 SCALARTYPE s2 = 0; 00140 for (int i = m - 1; i >= l; i--) 00141 { 00142 c3 = c2; 00143 c2 = c; 00144 s2 = s; 00145 g = c * e(i); 00146 h = c * p; 00147 r = pythag(p, e(i)); 00148 e(i + 1) = s * r; 00149 s = e(i) / r; 00150 c = p / r; 00151 p = c * d(i) - s * g; 00152 d(i + 1) = h + s * (c * g + s * d(i)); 00153 00154 cs[i] = c; 00155 ss[i] = s; 00156 } 00157 00158 p = -s * s2 * c3 * el1 * e(l) / dl1; 00159 e(l) = s * p; 00160 d(l) = c * p; 00161 00162 { 00163 viennacl::copy(cs, tmp1); 00164 viennacl::copy(ss, tmp2); 00165 00166 givens_next(Q, tmp1, tmp2, l, m); 00167 } 00168 00169 // Check for convergence. 00170 } 00171 while (std::fabs(e(l)) > eps * tst1); 00172 } 00173 d(l) = d(l) + f; 00174 e(l) = 0; 00175 } 00176 } 00177 00178 template <typename SCALARTYPE, typename MatrixT> 00179 void final_iter_update_gpu(MatrixT& A, 00180 int n, 00181 int last_n, 00182 SCALARTYPE q, 00183 SCALARTYPE p 00184 ) 00185 { 00186 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); 00187 00188 viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_FINAL_ITER_UPDATE_KERNEL); 00189 00190 viennacl::ocl::enqueue(kernel( 00191 A, 00192 static_cast<cl_uint>(A.internal_size1()), 00193 static_cast<cl_uint>(n), 00194 static_cast<cl_uint>(last_n), 00195 q, 00196 p 00197 )); 00198 } 00199 00200 template <typename SCALARTYPE, typename MatrixT> 00201 void update_float_QR_column_gpu(MatrixT& A, 00202 const std::vector<SCALARTYPE>& buf, 00203 viennacl::vector<SCALARTYPE>& buf_vcl, 00204 int m, 00205 int n, 00206 int last_n, 00207 bool //is_triangular 00208 ) 00209 { 00210 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); 00211 00212 viennacl::fast_copy(buf, buf_vcl); 00213 00214 viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_UPDATE_QR_COLUMN_KERNEL); 00215 00216 viennacl::ocl::enqueue(kernel( 00217 A, 00218 static_cast<cl_uint>(A.internal_size1()), 00219 buf_vcl, 00220 static_cast<cl_uint>(m), 00221 static_cast<cl_uint>(n), 00222 static_cast<cl_uint>(last_n) 00223 )); 00224 } 00225 00226 template <typename SCALARTYPE, typename MatrixT> 00227 void final_iter_update(MatrixT& A, 00228 int n, 00229 int last_n, 00230 SCALARTYPE q, 00231 SCALARTYPE p 00232 ) 00233 { 00234 for (int i = 0; i < last_n; i++) 00235 { 00236 SCALARTYPE v_in = A(i, n); 00237 SCALARTYPE z = A(i, n - 1); 00238 A(i, n - 1) = q * z + p * v_in; 00239 A(i, n) = q * v_in - p * z; 00240 } 00241 } 00242 00243 template <typename SCALARTYPE, typename MatrixT> 00244 void update_float_QR_column(MatrixT& A, 00245 const std::vector<SCALARTYPE>& buf, 00246 int m, 00247 int n, 00248 int last_i, 00249 bool is_triangular 00250 ) 00251 { 00252 for (int i = 0; i < last_i; i++) 00253 { 00254 int start_k = is_triangular?std::max(i + 1, m):m; 00255 00256 SCALARTYPE* a_row = A.row(i); 00257 00258 SCALARTYPE a_ik = a_row[start_k]; 00259 SCALARTYPE a_ik_1 = 0; 00260 SCALARTYPE a_ik_2 = 0; 00261 00262 if(start_k < n) 00263 a_ik_1 = a_row[start_k + 1]; 00264 00265 for(int k = start_k; k < n; k++) 00266 { 00267 bool notlast = (k != n - 1); 00268 00269 SCALARTYPE p = buf[5 * k] * a_ik + buf[5 * k + 1] * a_ik_1; 00270 00271 if (notlast) 00272 { 00273 a_ik_2 = a_row[k + 2]; 00274 p = p + buf[5 * k + 2] * a_ik_2; 00275 a_ik_2 = a_ik_2 - p * buf[5 * k + 4]; 00276 } 00277 00278 a_row[k] = a_ik - p; 00279 a_ik_1 = a_ik_1 - p * buf[5 * k + 3]; 00280 00281 a_ik = a_ik_1; 00282 a_ik_1 = a_ik_2; 00283 } 00284 00285 if(start_k < n) 00286 a_row[n] = a_ik; 00287 } 00288 } 00289 00291 template <typename SCALARTYPE> 00292 class FastMatrix 00293 { 00294 public: 00295 FastMatrix() 00296 { 00297 size_ = 0; 00298 } 00299 00300 FastMatrix(vcl_size_t sz, vcl_size_t internal_size) : size_(sz), internal_size_(internal_size) 00301 { 00302 data.resize(internal_size * internal_size); 00303 } 00304 00305 SCALARTYPE& operator()(int i, int j) 00306 { 00307 return data[i * internal_size_ + j]; 00308 } 00309 00310 SCALARTYPE* row(int i) 00311 { 00312 return &data[i * internal_size_]; 00313 } 00314 00315 SCALARTYPE* begin() 00316 { 00317 return &data[0]; 00318 } 00319 00320 SCALARTYPE* end() 00321 { 00322 return &data[0] + data.size(); 00323 } 00324 00325 std::vector<SCALARTYPE> data; 00326 private: 00327 vcl_size_t size_; 00328 vcl_size_t internal_size_; 00329 }; 00330 00331 // Nonsymmetric reduction from Hessenberg to real Schur form. 00332 // This is derived from the Algol procedure hqr2, by Martin and Wilkinson, Handbook for Auto. Comp., 00333 // Vol.ii-Linear Algebra, and the corresponding Fortran subroutine in EISPACK. 00334 template <typename SCALARTYPE, unsigned int ALIGNMENT> 00335 void hqr2(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& vcl_H, 00336 viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& V, 00337 boost::numeric::ublas::vector<SCALARTYPE>& d, 00338 boost::numeric::ublas::vector<SCALARTYPE>& e) 00339 { 00340 transpose(V); 00341 00342 int nn = static_cast<int>(vcl_H.size1()); 00343 00344 FastMatrix<SCALARTYPE> H(nn, vcl_H.internal_size2());//, V(nn); 00345 00346 std::vector<SCALARTYPE> buf(5 * nn); 00347 viennacl::vector<SCALARTYPE> buf_vcl(5 * nn); 00348 00349 viennacl::fast_copy(vcl_H, H.begin()); 00350 00351 00352 int n = nn - 1; 00353 00354 SCALARTYPE eps = 2 * static_cast<SCALARTYPE>(EPS); 00355 SCALARTYPE exshift = 0; 00356 SCALARTYPE p = 0; 00357 SCALARTYPE q = 0; 00358 SCALARTYPE r = 0; 00359 SCALARTYPE s = 0; 00360 SCALARTYPE z = 0; 00361 SCALARTYPE t; 00362 SCALARTYPE w; 00363 SCALARTYPE x; 00364 SCALARTYPE y; 00365 00366 SCALARTYPE out1, out2; 00367 00368 // compute matrix norm 00369 SCALARTYPE norm = 0; 00370 for (int i = 0; i < nn; i++) 00371 { 00372 for (int j = std::max(i - 1, 0); j < nn; j++) 00373 norm = norm + std::fabs(H(i, j)); 00374 } 00375 00376 // Outer loop over eigenvalue index 00377 int iter = 0; 00378 while (n >= 0) 00379 { 00380 // Look for single small sub-diagonal element 00381 int l = n; 00382 while (l > 0) 00383 { 00384 s = std::fabs(H(l - 1, l - 1)) + std::fabs(H(l, l)); 00385 if (s == 0) s = norm; 00386 if (std::fabs(H(l, l - 1)) < eps * s) 00387 break; 00388 00389 l--; 00390 } 00391 00392 // Check for convergence 00393 if (l == n) 00394 { 00395 // One root found 00396 H(n, n) = H(n, n) + exshift; 00397 d(n) = H(n, n); 00398 e(n) = 0; 00399 n--; 00400 iter = 0; 00401 } 00402 else if (l == n - 1) 00403 { 00404 // Two roots found 00405 w = H(n, n - 1) * H(n - 1, n); 00406 p = (H(n - 1, n - 1) - H(n, n)) / 2; 00407 q = p * p + w; 00408 z = static_cast<SCALARTYPE>(std::sqrt(std::fabs(q))); 00409 H(n, n) = H(n, n) + exshift; 00410 H(n - 1, n - 1) = H(n - 1, n - 1) + exshift; 00411 x = H(n, n); 00412 00413 if (q >= 0) 00414 { 00415 // Real pair 00416 z = (p >= 0) ? (p + z) : (p - z); 00417 d(n - 1) = x + z; 00418 d(n) = d(n - 1); 00419 if (z != 0) 00420 d(n) = x - w / z; 00421 e(n - 1) = 0; 00422 e(n) = 0; 00423 x = H(n, n - 1); 00424 s = std::fabs(x) + std::fabs(z); 00425 p = x / s; 00426 q = z / s; 00427 r = static_cast<SCALARTYPE>(std::sqrt(p * p + q * q)); 00428 p = p / r; 00429 q = q / r; 00430 00431 // Row modification 00432 for (int j = n - 1; j < nn; j++) 00433 { 00434 SCALARTYPE h_nj = H(n, j); 00435 z = H(n - 1, j); 00436 H(n - 1, j) = q * z + p * h_nj; 00437 H(n, j) = q * h_nj - p * z; 00438 } 00439 00440 final_iter_update(H, n, n + 1, q, p); 00441 final_iter_update_gpu(V, n, nn, q, p); 00442 } 00443 else 00444 { 00445 // Complex pair 00446 d(n - 1) = x + p; 00447 d(n) = x + p; 00448 e(n - 1) = z; 00449 e(n) = -z; 00450 } 00451 00452 n = n - 2; 00453 iter = 0; 00454 } 00455 else 00456 { 00457 // No convergence yet 00458 00459 // Form shift 00460 x = H(n, n); 00461 y = 0; 00462 w = 0; 00463 if (l < n) 00464 { 00465 y = H(n - 1, n - 1); 00466 w = H(n, n - 1) * H(n - 1, n); 00467 } 00468 00469 // Wilkinson's original ad hoc shift 00470 if (iter == 10) 00471 { 00472 exshift += x; 00473 for (int i = 0; i <= n; i++) 00474 H(i, i) -= x; 00475 00476 s = std::fabs(H(n, n - 1)) + std::fabs(H(n - 1, n - 2)); 00477 x = y = SCALARTYPE(0.75) * s; 00478 w = SCALARTYPE(-0.4375) * s * s; 00479 } 00480 00481 // MATLAB's new ad hoc shift 00482 if (iter == 30) 00483 { 00484 s = (y - x) / 2; 00485 s = s * s + w; 00486 if (s > 0) 00487 { 00488 s = static_cast<SCALARTYPE>(std::sqrt(s)); 00489 if (y < x) s = -s; 00490 s = x - w / ((y - x) / 2 + s); 00491 for (int i = 0; i <= n; i++) 00492 H(i, i) -= s; 00493 exshift += s; 00494 x = y = w = SCALARTYPE(0.964); 00495 } 00496 } 00497 00498 iter = iter + 1; 00499 00500 // Look for two consecutive small sub-diagonal elements 00501 int m = n - 2; 00502 while (m >= l) 00503 { 00504 SCALARTYPE h_m1_m1 = H(m + 1, m + 1); 00505 z = H(m, m); 00506 r = x - z; 00507 s = y - z; 00508 p = (r * s - w) / H(m + 1, m) + H(m, m + 1); 00509 q = h_m1_m1 - z - r - s; 00510 r = H(m + 2, m + 1); 00511 s = std::fabs(p) + std::fabs(q) + std::fabs(r); 00512 p = p / s; 00513 q = q / s; 00514 r = r / s; 00515 if (m == l) 00516 break; 00517 if (std::fabs(H(m, m - 1)) * (std::fabs(q) + std::fabs(r)) < eps * (std::fabs(p) * (std::fabs(H(m - 1, m - 1)) + std::fabs(z) + std::fabs(h_m1_m1)))) 00518 break; 00519 m--; 00520 } 00521 00522 for (int i = m + 2; i <= n; i++) 00523 { 00524 H(i, i - 2) = 0; 00525 if (i > m + 2) 00526 H(i, i - 3) = 0; 00527 } 00528 00529 // float QR step involving rows l:n and columns m:n 00530 for (int k = m; k < n; k++) 00531 { 00532 bool notlast = (k != n - 1); 00533 if (k != m) 00534 { 00535 p = H(k, k - 1); 00536 q = H(k + 1, k - 1); 00537 r = (notlast ? H(k + 2, k - 1) : 0); 00538 x = std::fabs(p) + std::fabs(q) + std::fabs(r); 00539 if (x != 0) 00540 { 00541 p = p / x; 00542 q = q / x; 00543 r = r / x; 00544 } 00545 } 00546 00547 if (x == 0) break; 00548 00549 s = static_cast<SCALARTYPE>(std::sqrt(p * p + q * q + r * r)); 00550 if (p < 0) s = -s; 00551 00552 if (s != 0) 00553 { 00554 if (k != m) 00555 H(k, k - 1) = -s * x; 00556 else 00557 if (l != m) 00558 H(k, k - 1) = -H(k, k - 1); 00559 00560 p = p + s; 00561 y = q / s; 00562 z = r / s; 00563 x = p / s; 00564 q = q / p; 00565 r = r / p; 00566 00567 buf[5 * k] = x; 00568 buf[5 * k + 1] = y; 00569 buf[5 * k + 2] = z; 00570 buf[5 * k + 3] = q; 00571 buf[5 * k + 4] = r; 00572 00573 00574 SCALARTYPE* a_row_k = H.row(k); 00575 SCALARTYPE* a_row_k_1 = H.row(k + 1); 00576 SCALARTYPE* a_row_k_2 = H.row(k + 2); 00577 // Row modification 00578 for (int j = k; j < nn; j++) 00579 { 00580 SCALARTYPE h_kj = a_row_k[j]; 00581 SCALARTYPE h_k1_j = a_row_k_1[j]; 00582 00583 p = h_kj + q * h_k1_j; 00584 if (notlast) 00585 { 00586 SCALARTYPE h_k2_j = a_row_k_2[j]; 00587 p = p + r * h_k2_j; 00588 a_row_k_2[j] = h_k2_j - p * z; 00589 } 00590 00591 a_row_k[j] = h_kj - p * x; 00592 a_row_k_1[j] = h_k1_j - p * y; 00593 } 00594 00595 //H(k + 1, nn - 1) = h_kj; 00596 00597 00598 // Column modification 00599 for (int i = k; i < std::min(nn, k + 4); i++) 00600 { 00601 p = x * H(i, k) + y * H(i, k + 1); 00602 if (notlast) 00603 { 00604 p = p + z * H(i, k + 2); 00605 H(i, k + 2) = H(i, k + 2) - p * r; 00606 } 00607 00608 H(i, k) = H(i, k) - p; 00609 H(i, k + 1) = H(i, k + 1) - p * q; 00610 } 00611 } 00612 else 00613 { 00614 buf[5 * k] = 0; 00615 buf[5 * k + 1] = 0; 00616 buf[5 * k + 2] = 0; 00617 buf[5 * k + 3] = 0; 00618 buf[5 * k + 4] = 0; 00619 } 00620 } 00621 00622 // Timer timer; 00623 // timer.start(); 00624 00625 update_float_QR_column(H, buf, m, n, n, true); 00626 update_float_QR_column_gpu(V, buf, buf_vcl, m, n, nn, false); 00627 00628 // std::cout << timer.get() << "\n"; 00629 } 00630 } 00631 00632 // Backsubstitute to find vectors of upper triangular form 00633 if (norm == 0) 00634 { 00635 return; 00636 } 00637 00638 for (n = nn - 1; n >= 0; n--) 00639 { 00640 p = d(n); 00641 q = e(n); 00642 00643 // Real vector 00644 if (q == 0) 00645 { 00646 int l = n; 00647 H(n, n) = 1; 00648 for (int i = n - 1; i >= 0; i--) 00649 { 00650 w = H(i, i) - p; 00651 r = 0; 00652 for (int j = l; j <= n; j++) 00653 r = r + H(i, j) * H(j, n); 00654 00655 if (e(i) < 0) 00656 { 00657 z = w; 00658 s = r; 00659 } 00660 else 00661 { 00662 l = i; 00663 if (e(i) == 0) 00664 { 00665 H(i, n) = (w != 0) ? (-r / w) : (-r / (eps * norm)); 00666 } 00667 else 00668 { 00669 // Solve real equations 00670 x = H(i, i + 1); 00671 y = H(i + 1, i); 00672 q = (d(i) - p) * (d(i) - p) + e(i) * e(i); 00673 t = (x * s - z * r) / q; 00674 H(i, n) = t; 00675 H(i + 1, n) = (std::fabs(x) > std::fabs(z)) ? ((-r - w * t) / x) : ((-s - y * t) / z); 00676 } 00677 00678 // Overflow control 00679 t = std::fabs(H(i, n)); 00680 if ((eps * t) * t > 1) 00681 for (int j = i; j <= n; j++) 00682 H(j, n) /= t; 00683 } 00684 } 00685 } 00686 else if (q < 0) 00687 { 00688 // Complex vector 00689 int l = n - 1; 00690 00691 // Last vector component imaginary so matrix is triangular 00692 if (std::fabs(H(n, n - 1)) > std::fabs(H(n - 1, n))) 00693 { 00694 H(n - 1, n - 1) = q / H(n, n - 1); 00695 H(n - 1, n) = -(H(n, n) - p) / H(n, n - 1); 00696 } 00697 else 00698 { 00699 cdiv<SCALARTYPE>(0, -H(n - 1, n), H(n - 1, n - 1) - p, q, out1, out2); 00700 00701 H(n - 1, n - 1) = out1; 00702 H(n - 1, n) = out2; 00703 } 00704 00705 H(n, n - 1) = 0; 00706 H(n, n) = 1; 00707 for (int i = n - 2; i >= 0; i--) 00708 { 00709 SCALARTYPE ra, sa, vr, vi; 00710 ra = 0; 00711 sa = 0; 00712 for (int j = l; j <= n; j++) 00713 { 00714 SCALARTYPE h_ij = H(i, j); 00715 ra = ra + h_ij * H(j, n - 1); 00716 sa = sa + h_ij * H(j, n); 00717 } 00718 00719 w = H(i, i) - p; 00720 00721 if (e(i) < 0) 00722 { 00723 z = w; 00724 r = ra; 00725 s = sa; 00726 } 00727 else 00728 { 00729 l = i; 00730 if (e(i) == 0) 00731 { 00732 cdiv<SCALARTYPE>(-ra, -sa, w, q, out1, out2); 00733 H(i, n - 1) = out1; 00734 H(i, n) = out2; 00735 } 00736 else 00737 { 00738 // Solve complex equations 00739 x = H(i, i + 1); 00740 y = H(i + 1, i); 00741 vr = (d(i) - p) * (d(i) - p) + e(i) * e(i) - q * q; 00742 vi = (d(i) - p) * 2 * q; 00743 if ( (vr == 0) && (vi == 0) ) 00744 vr = eps * norm * (std::fabs(w) + std::fabs(q) + std::fabs(x) + std::fabs(y) + std::fabs(z)); 00745 00746 cdiv<SCALARTYPE>(x * r - z * ra + q * sa, x * s - z * sa - q * ra, vr, vi, out1, out2); 00747 00748 H(i, n - 1) = out1; 00749 H(i, n) = out2; 00750 00751 00752 if (std::fabs(x) > (std::fabs(z) + std::fabs(q))) 00753 { 00754 H(i + 1, n - 1) = (-ra - w * H(i, n - 1) + q * H(i, n)) / x; 00755 H(i + 1, n) = (-sa - w * H(i, n) - q * H(i, n - 1)) / x; 00756 } 00757 else 00758 { 00759 cdiv<SCALARTYPE>(-r - y * H(i, n - 1), -s - y * H(i, n), z, q, out1, out2); 00760 00761 H(i + 1, n - 1) = out1; 00762 H(i + 1, n) = out2; 00763 } 00764 } 00765 00766 // Overflow control 00767 t = std::max(std::fabs(H(i, n - 1)), std::fabs(H(i, n))); 00768 if ((eps * t) * t > 1) 00769 { 00770 for (int j = i; j <= n; j++) 00771 { 00772 H(j, n - 1) /= t; 00773 H(j, n) /= t; 00774 } 00775 } 00776 } 00777 } 00778 } 00779 } 00780 00781 viennacl::fast_copy(H.begin(), H.end(), vcl_H); 00782 // viennacl::fast_copy(V.begin(), V.end(), vcl_V); 00783 00784 viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> tmp = V; 00785 00786 V = viennacl::linalg::prod(trans(tmp), vcl_H); 00787 } 00788 00789 template <typename SCALARTYPE, unsigned int ALIGNMENT> 00790 bool householder_twoside( 00791 viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& A, 00792 viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& Q, 00793 viennacl::vector<SCALARTYPE, ALIGNMENT>& D, 00794 vcl_size_t start) 00795 { 00796 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); 00797 00798 if(start + 2 >= A.size1()) 00799 return false; 00800 00801 prepare_householder_vector(A, D, A.size1(), start + 1, start, start + 1, true); 00802 00803 { 00804 viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_LEFT_KERNEL); 00805 00806 viennacl::ocl::enqueue(kernel( 00807 A, 00808 D, 00809 static_cast<cl_uint>(start + 1), 00810 static_cast<cl_uint>(start), 00811 static_cast<cl_uint>(A.size1()), 00812 static_cast<cl_uint>(A.size2()), 00813 static_cast<cl_uint>(A.internal_size2()), 00814 viennacl::ocl::local_mem(static_cast<cl_uint>(128 * 4)) 00815 )); 00816 } 00817 00818 { 00819 viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_RIGHT_KERNEL); 00820 00821 viennacl::ocl::enqueue(kernel( 00822 A, 00823 D, 00824 static_cast<cl_uint>(0), 00825 static_cast<cl_uint>(0), 00826 static_cast<cl_uint>(A.size1()), 00827 static_cast<cl_uint>(A.size2()), 00828 static_cast<cl_uint>(A.internal_size2()), 00829 viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(SCALARTYPE))) 00830 )); 00831 } 00832 00833 { 00834 viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_UPDATE_QL_KERNEL); 00835 00836 viennacl::ocl::enqueue(kernel( 00837 Q, 00838 D, 00839 static_cast<cl_uint>(A.size1()), 00840 static_cast<cl_uint>(A.size2()), 00841 static_cast<cl_uint>(Q.internal_size2()), 00842 viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(SCALARTYPE))) 00843 )); 00844 } 00845 00846 return true; 00847 } 00848 00849 template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT> 00850 void tridiagonal_reduction(viennacl::matrix<SCALARTYPE, F, ALIGNMENT>& A, 00851 viennacl::matrix<SCALARTYPE, F, ALIGNMENT>& Q) 00852 { 00853 vcl_size_t sz = A.size1(); 00854 00855 viennacl::vector<SCALARTYPE> hh_vector(sz); 00856 00857 for(vcl_size_t i = 0; i < sz; i++) 00858 { 00859 householder_twoside(A, Q, hh_vector, i); 00860 } 00861 00862 } 00863 00864 template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT> 00865 void qr_method(viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & A, 00866 viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & Q, 00867 boost::numeric::ublas::vector<SCALARTYPE> & D, 00868 boost::numeric::ublas::vector<SCALARTYPE> & E, 00869 bool is_symmetric = true) 00870 { 00871 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); 00872 00873 assert(A.size1() == A.size2() && bool("Input matrix must be square for QR method!")); 00874 00875 D.resize(A.size1()); 00876 E.resize(A.size1()); 00877 00878 viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::init(ctx); 00879 00880 Q = viennacl::identity_matrix<SCALARTYPE>(Q.size1(), ctx); 00881 00882 // reduce to tridiagonal form 00883 detail::tridiagonal_reduction(A, Q); 00884 00885 // pack diagonal and super-diagonal 00886 // ublas::vector<SCALARTYPE> D(A.size1()), E(A.size1()); 00887 00888 bidiag_pack(A, D, E); 00889 00890 // find eigenvalues 00891 if(is_symmetric) 00892 { 00893 00894 detail::tql2(Q, D, E); 00895 transpose(Q); 00896 } 00897 else 00898 { 00899 detail::hqr2(A, Q, D, E); 00900 } 00901 00902 // std::cout << A << "\n"; 00903 00904 boost::numeric::ublas::matrix<float> eigen_values(A.size1(), A.size1()); 00905 eigen_values.clear(); 00906 00907 for (vcl_size_t i = 0; i < A.size1(); i++) 00908 { 00909 if(std::fabs(E(i)) < EPS) 00910 { 00911 eigen_values(i, i) = D(i); 00912 } 00913 else 00914 { 00915 eigen_values(i, i) = D(i); 00916 eigen_values(i, i + 1) = E(i); 00917 eigen_values(i + 1, i) = -E(i); 00918 eigen_values(i + 1, i + 1) = D(i); 00919 i++; 00920 } 00921 } 00922 00923 copy(eigen_values, A); 00924 } 00925 } 00926 00927 00928 template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT> 00929 void qr_method_nsm(viennacl::matrix<SCALARTYPE, F, ALIGNMENT>& A, 00930 viennacl::matrix<SCALARTYPE, F, ALIGNMENT>& Q, 00931 boost::numeric::ublas::vector<SCALARTYPE>& D, 00932 boost::numeric::ublas::vector<SCALARTYPE>& E 00933 ) 00934 { 00935 detail::qr_method(A, Q, D, E, false); 00936 } 00937 00938 template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT> 00939 void qr_method_sym(viennacl::matrix<SCALARTYPE, F, ALIGNMENT>& A, 00940 viennacl::matrix<SCALARTYPE, F, ALIGNMENT>& Q, 00941 boost::numeric::ublas::vector<SCALARTYPE>& D 00942 ) 00943 { 00944 boost::numeric::ublas::vector<SCALARTYPE> E(A.size1()); 00945 00946 detail::qr_method(A, Q, D, E, true); 00947 } 00948 00949 } 00950 } 00951 00952 #endif