1 #ifndef VIENNACL_LINALG_OPENCL_ITERATIVE_OPERATIONS_HPP_ 
    2 #define VIENNACL_LINALG_OPENCL_ITERATIVE_OPERATIONS_HPP_ 
   50 template<
typename NumericT>
 
   77 template<
typename NumericT>
 
   91   cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.
size()) / cl_uint(3);
 
  102   if (use_nvidia_blocked)
 
  109                              buffer_size_per_vector,
 
  121                              buffer_size_per_vector,
 
  130 template<
typename NumericT>
 
  140   cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.
size()) / cl_uint(3);
 
  145   unsigned int thread_num = 256; 
 
  158                            buffer_size_per_vector,
 
  164 template<
typename NumericT>
 
  174   cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.
size()) / cl_uint(3);
 
  178   unsigned int thread_num = 128;
 
  179   unsigned int group_num = 256;
 
  191                            A.
handle().opencl_handle(),
 
  195                            viennacl::traits::opencl_handle(p),
 
  196                            viennacl::traits::opencl_handle(Ap),
 
  199                            buffer_size_per_vector,
 
  206 template<
typename NumericT>
 
  216   cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.
size()) / cl_uint(3);
 
  221   unsigned int group_num = 256;
 
  232                            A.
handle().opencl_handle(),
 
  233                            viennacl::traits::opencl_handle(p),
 
  234                            viennacl::traits::opencl_handle(Ap),
 
  238                            buffer_size_per_vector,
 
  246 template<
typename NumericT>
 
  256   cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.
size()) / cl_uint(3);
 
  260   unsigned int thread_num = 128;
 
  261   unsigned int group_num = 128;
 
  273                            A.
handle().opencl_handle(),
 
  280                            viennacl::traits::opencl_handle(p),
 
  281                            viennacl::traits::opencl_handle(Ap),
 
  284                            buffer_size_per_vector,
 
  294 template<
typename NumericT>
 
  317   cl_uint chunk_size   = cl_uint(buffer_chunk_size);
 
  318   cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
 
  320                            inner_prod_buffer, chunk_size, chunk_offset, vec_size,
 
  325 template<
typename NumericT>
 
  332   (void)buffer_chunk_size;
 
  359 template<
typename NumericT>
 
  376   cl_uint chunk_size   = cl_uint(buffer_chunk_size);
 
  377   cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
 
  388   if (use_nvidia_blocked)
 
  395                              inner_prod_buffer, chunk_size, chunk_offset,
 
  408                              inner_prod_buffer, chunk_size, chunk_offset,
 
  418 template<
typename NumericT>
 
  431   cl_uint chunk_size   = cl_uint(buffer_chunk_size);
 
  432   cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
 
  437   unsigned int thread_num = 256; 
 
  450                            inner_prod_buffer, chunk_size, chunk_offset,
 
  457 template<
typename NumericT>
 
  470   cl_uint chunk_size   = cl_uint(buffer_chunk_size);
 
  471   cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
 
  475   unsigned int thread_num = 128;
 
  476   unsigned int group_num = 128;
 
  488                            A.
handle().opencl_handle(),
 
  492                            viennacl::traits::opencl_handle(p),
 
  493                            viennacl::traits::opencl_handle(Ap),
 
  496                            inner_prod_buffer, chunk_size, chunk_offset,
 
  504 template<
typename NumericT>
 
  517   cl_uint chunk_size   = cl_uint(buffer_chunk_size);
 
  518   cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
 
  523   unsigned int group_num = 256;
 
  534                            A.
handle().opencl_handle(),
 
  535                            viennacl::traits::opencl_handle(p),
 
  536                            viennacl::traits::opencl_handle(Ap),
 
  540                            inner_prod_buffer, chunk_size, chunk_offset,
 
  549 template<
typename NumericT>
 
  562   cl_uint chunk_size   = cl_uint(buffer_chunk_size);
 
  563   cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
 
  567   unsigned int thread_num = 256;
 
  568   unsigned int group_num = 128;
 
  580                            A.
handle().opencl_handle(),
 
  587                            viennacl::traits::opencl_handle(p),
 
  588                            viennacl::traits::opencl_handle(Ap),
 
  591                            inner_prod_buffer, chunk_size, chunk_offset,
 
  608 template <
typename T>
 
  626   cl_uint size_vk      = cl_uint(v_k.
size());
 
  628   cl_uint R_offset     = cl_uint(offset_in_R);
 
  629   cl_uint chunk_size   = cl_uint(buffer_chunk_size);
 
  630   cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
 
  634                            inner_prod_buffer, chunk_size,
 
  635                            r_dot_vk_buffer, chunk_offset,
 
  641 template <
typename T>
 
  657   cl_uint size_vk          = cl_uint(v_k_size);
 
  658   cl_uint internal_size_vk = cl_uint(v_k_internal_size);
 
  659   cl_uint ocl_k            = cl_uint(param_k);
 
  660   cl_uint chunk_size = cl_uint(buffer_chunk_size);
 
  662                            vi_in_vk_buffer, chunk_size
 
  666 template <
typename T>
 
  685   cl_uint size_vk          = cl_uint(v_k_size);
 
  686   cl_uint internal_size_vk = cl_uint(v_k_internal_size);
 
  687   cl_uint ocl_k            = cl_uint(param_k);
 
  688   cl_uint chunk_size       = cl_uint(buffer_chunk_size);
 
  689   cl_uint ocl_krylov_dim   = cl_uint(krylov_dim);
 
  691                            vi_in_vk_buffer, chunk_size,
 
  692                            R_buffer, ocl_krylov_dim,
 
  698 template <
typename T>
 
  715   cl_uint size_vk          = cl_uint(v_k_size);
 
  716   cl_uint internal_size_vk = cl_uint(v_k_internal_size);
 
  717   cl_uint ocl_k            = cl_uint(param_k);
 
  720                            krylov_basis, size_vk, internal_size_vk,
 
  726 template <
typename T>
 
  740   cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.
size()) / cl_uint(3);
 
  753   if (use_nvidia_blocked)
 
  760                              buffer_size_per_vector,
 
  772                              buffer_size_per_vector,
 
  780 template <
typename T>
 
  790   cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.
size()) / cl_uint(3);
 
  795   inner_prod_buffer.
clear();
 
  798   unsigned int thread_num = 128; 
 
  811                            buffer_size_per_vector,
 
  817 template <
typename T>
 
  827   cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.
size()) / cl_uint(3);
 
  834   unsigned int group_num = 128;
 
  840                            A.
handle().opencl_handle(),
 
  844                            viennacl::traits::opencl_handle(p), start_p,
 
  845                            viennacl::traits::opencl_handle(Ap), start_Ap,
 
  848                            buffer_size_per_vector,
 
  855 template <
typename T>
 
  865   cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.
size()) / cl_uint(3);
 
  872   unsigned int group_num = 128;
 
  883                            A.
handle().opencl_handle(),
 
  884                            viennacl::traits::opencl_handle(p), start_p,
 
  885                            viennacl::traits::opencl_handle(Ap), start_Ap,
 
  889                            buffer_size_per_vector,
 
  897 template <
typename T>
 
  907   cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.
size()) / cl_uint(3);
 
  914   unsigned int group_num = 128;
 
  921                            A.
handle().opencl_handle(),
 
  928                            viennacl::traits::opencl_handle(p), start_p,
 
  929                            viennacl::traits::opencl_handle(Ap), start_Ap,
 
  932                            buffer_size_per_vector,
 
vcl_size_t internal_ellnnz() const 
Sparse matrix class using a hybrid format composed of the ELL and CSR format for storing the nonzeros.
viennacl::ocl::device const & current_device() const 
Returns the current device. 
Main kernel class for generating specialized OpenCL kernels for fast iterative solvers. 
Represents an OpenCL device within ViennaCL. 
void pipelined_bicgstab_prod(compressed_matrix< NumericT > const &A, vector_base< NumericT > const &p, vector_base< NumericT > &Ap, vector_base< NumericT > const &r0star, vector_base< NumericT > &inner_prod_buffer, vcl_size_t buffer_chunk_size, vcl_size_t buffer_chunk_offset)
Generic size and resize functionality for different vector and matrix types. 
const handle_type & handle3() const 
const vcl_size_t & size1() const 
Returns the number of rows. 
Represents an OpenCL kernel within ViennaCL. 
Extracts the underlying OpenCL start index handle from a vector, a matrix, an expression etc...
static void init(viennacl::ocl::context &ctx)
const handle_type & handle() const 
size_type local_work_size(int index=0) const 
Returns the local work size at the respective dimension. 
const handle_type & handle12() const 
Returns the OpenCL handle to the (row, column) index array. 
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
vcl_size_t internal_size1() const 
void pipelined_gmres_gram_schmidt_stage2(vector_base< T > &device_krylov_basis, vcl_size_t v_k_size, vcl_size_t v_k_internal_size, vcl_size_t param_k, vector_base< T > const &vi_in_vk_buffer, vector_base< T > &R_buffer, vcl_size_t krylov_dim, vector_base< T > &inner_prod_buffer, vcl_size_t buffer_chunk_size)
This file provides the forward declarations for the main types used within ViennaCL. 
Determines row and column increments for matrices and matrix proxies. 
const handle_type & handle4() const 
cl_uint vendor_id() const 
A unique device vendor identifier. An example of a unique device identifier could be the PCIe ID...
T max(const T &lhs, const T &rhs)
Maximum. 
vcl_size_t rows_per_block() const 
void pipelined_gmres_normalize_vk(vector_base< T > &v_k, vector_base< T > const &residual, vector_base< T > &R_buffer, vcl_size_t offset_in_R, vector_base< T > const &inner_prod_buffer, vector_base< T > &r_dot_vk_buffer, vcl_size_t buffer_chunk_size, vcl_size_t buffer_chunk_offset)
Performs a vector normalization needed for an efficient pipelined GMRES algorithm. 
const handle_type & handle() const 
Returns the OpenCL handle to the matrix entry array. 
const handle_type & handle1() const 
Returns the OpenCL handle to the row index array. 
vcl_size_t internal_size1() const 
Common implementations shared by OpenCL-based operations. 
const vcl_size_t & nnz() const 
Returns the number of nonzero entries. 
const handle_type & handle2() const 
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.) 
A class representing local (shared) OpenCL memory. Typically used as kernel argument. 
OpenCL kernel file for specialized iterative solver kernels. 
Sparse matrix class using the ELLPACK format for storing the nonzeros. 
viennacl::ocl::kernel & get_kernel(std::string const &program_name, std::string const &kernel_name)
Convenience function for retrieving the kernel of a program directly from the context. 
Sparse matrix class using the sliced ELLPACK with parameters C, $\sigma$.
Implementation of a smart-pointer-like class for handling OpenCL handles. 
void pipelined_cg_vector_update(vector_base< NumericT > &result, NumericT alpha, vector_base< NumericT > &p, vector_base< NumericT > &r, vector_base< NumericT > const &Ap, NumericT beta, vector_base< NumericT > &inner_prod_buffer)
result_of::size_type< T >::type start(T const &obj)
void pipelined_bicgstab_vector_update(vector_base< NumericT > &result, NumericT alpha, vector_base< NumericT > &p, NumericT omega, vector_base< NumericT > const &s, vector_base< NumericT > &residual, vector_base< NumericT > const &As, NumericT beta, vector_base< NumericT > const &Ap, vector_base< NumericT > const &r0star, vector_base< NumericT > &inner_prod_buffer, vcl_size_t buffer_chunk_size)
const handle_type & handle2() const 
Returns the OpenCL handle to the column index array. 
vcl_size_t maxnnz() const 
const handle_type & handle3() const 
Returns the OpenCL handle to the group start index array. 
void pipelined_gmres_gram_schmidt_stage1(vector_base< T > const &device_krylov_basis, vcl_size_t v_k_size, vcl_size_t v_k_internal_size, vcl_size_t param_k, vector_base< T > &vi_in_vk_buffer, vcl_size_t buffer_chunk_size)
All the predicates used within ViennaCL. Checks for expressions to be vectors, etc. 
void pipelined_bicgstab_update_s(vector_base< NumericT > &s, vector_base< NumericT > &r, vector_base< NumericT > const &Ap, vector_base< NumericT > &inner_prod_buffer, vcl_size_t buffer_chunk_size, vcl_size_t buffer_chunk_offset)
const handle_type & handle3() const 
Returns the OpenCL handle to the row block array. 
void clear()
Resets all entries to zero. Does not change the size of the vector. 
const handle_type & handle() const 
Returns the OpenCL handle to the matrix entry array. 
void enqueue(KernelType &k, viennacl::ocl::command_queue const &queue)
Enqueues a kernel in the provided queue. 
Representation of an OpenCL kernel in ViennaCL. 
size_type size() const 
Returns the length of the vector (cf. std::vector) 
vcl_size_t ell_nnz() const 
size_type global_work_size(int index=0) const 
Returns the global work size at the respective dimension. 
void pipelined_cg_prod(compressed_matrix< NumericT > const &A, vector_base< NumericT > const &p, vector_base< NumericT > &Ap, vector_base< NumericT > &inner_prod_buffer)
Forward declarations of the implicit_vector_base, vector_base class. 
Extracts the underlying OpenCL handle from a vector, a matrix, an expression etc. ...
const handle_type & handle5() const 
void pipelined_gmres_update_result(vector_base< T > &result, vector_base< T > const &residual, vector_base< T > const &krylov_basis, vcl_size_t v_k_size, vcl_size_t v_k_internal_size, vector_base< T > const &coefficients, vcl_size_t param_k)
const vcl_size_t & blocks1() const 
Returns the internal number of row blocks for an adaptive SpMV. 
vcl_size_t internal_maxnnz() const 
Implementation of the ViennaCL scalar class. 
void pipelined_gmres_prod(compressed_matrix< T > const &A, vector_base< T > const &p, vector_base< T > &Ap, vector_base< T > &inner_prod_buffer)
Simple enable-if variant that uses the SFINAE pattern. 
A sparse square matrix, where entries are stored as triplets (i,j, val), where i and j are the row and column indices and val denotes the entry.