ViennaCL - The Vienna Computing Library  1.2.0
vector_operations.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_VECTOR_OPERATIONS_HPP_
2 #define VIENNACL_VECTOR_OPERATIONS_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2011, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8 
9  -----------------
10  ViennaCL - The Vienna Computing Library
11  -----------------
12 
13  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
14 
15  (A list of authors and contributors can be found in the PDF manual)
16 
17  License: MIT (X11), see file LICENSE in the base directory
18 ============================================================================= */
19 
24 #include "viennacl/forwards.h"
25 #include "viennacl/ocl/device.hpp"
26 #include "viennacl/ocl/handle.hpp"
27 #include "viennacl/ocl/kernel.hpp"
28 #include "viennacl/scalar.hpp"
29 #include "viennacl/tools/tools.hpp"
33 #include "viennacl/traits/size.hpp"
36 
37 namespace viennacl
38 {
39  namespace linalg
40  {
47  template <typename V1, typename V2, typename V3>
51  >::type
52  add(const V1 & vec1,
53  const V2 & vec2,
54  V3 & result)
55  {
56  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
57 
58  //TODO: Ensure that correct alignment is chosen for the kernels.
59  const unsigned int ALIGNMENT = V1::alignment;
60 
61  assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
63  && "Incompatible vector sizes in add()!");
64 
65  //unsigned int size = std::min(viennacl::traits::internal_size(vec1),
66  // viennacl::traits::internal_size(vec2));
67 
68  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "add");
69 
71  viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)),
72  viennacl::traits::handle(result), cl_uint(viennacl::traits::start(result)), cl_uint(viennacl::traits::size(result)) )
73  );
74  }
75 
83  template <typename V1, typename V2>
86  >::type
87  inplace_add(V1 & vec1,
88  const V2 & vec2)
89  {
90  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
91 
92  //TODO: Ensure that correct alignment is chosen for the kernels.
93  const unsigned int ALIGNMENT = V1::alignment;
94 
95  assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
96  && "Incompatible vector sizes in inplace_add()!");
97 
98 
99  //unsigned int size = std::min(vec1.internal_size(), vec2.internal_size());
100 
101  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_add");
102 
104  viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)))
105  );
106  }
107 
108 
109 
118  template <typename V1, typename V2, typename V3>
122  >::type
123  sub(const V1 & vec1,
124  const V2 & vec2,
125  V3 & result)
126  {
127  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
128 
129  //TODO: Ensure that correct alignment is chosen for the kernels.
130  const unsigned int ALIGNMENT = V1::alignment;
131 
132  assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
133  && (viennacl::traits::size(vec1) == viennacl::traits::size(result))
134  && "Incompatible vector sizes in sub()!");
135 
136  //unsigned int size = std::min(vec1.internal_size(), vec2.internal_size());
137  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "sub");
138 
140  viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)),
141  viennacl::traits::handle(result), cl_uint(viennacl::traits::start(result)), cl_uint(viennacl::traits::size(result)) )
142  );
143  }
144 
152  template <typename V1, typename V2>
155  >::type
156  inplace_sub(V1 & vec1,
157  const V2 & vec2)
158  {
159  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
160 
161  //TODO: Ensure that correct alignment is chosen for the kernels.
162  const unsigned int ALIGNMENT = V1::alignment;
163 
164  assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
165  && "Incompatible vector sizes in inplace_sub()!");
166 
167  //unsigned int size = std::min(vec1.internal_size(), vec2.internal_size());
168  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_sub");
169 
171  viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)))
172  );
173  }
174 
175 
176  //result = vec * scalar
185  template <typename V1, typename S2, typename V3>
189  >::type
190  mult(const V1 & vec,
191  S2 const & alpha,
192  V3 & result)
193  {
194  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
195 
196  //TODO: Ensure that correct alignment is chosen for the kernels.
197  const unsigned int ALIGNMENT = V1::alignment;
198 
199  assert( (viennacl::traits::size(vec) == viennacl::traits::size(result))
200  && "Incompatible vector sizes in mult()!");
201 
202  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "mult");
203 
205  alpha,
206  viennacl::traits::handle(result), cl_uint(viennacl::traits::start(result)), cl_uint(viennacl::traits::size(result)))
207  );
208  }
209 
218  template <typename V1, typename SCALARTYPE, typename V3>
222  >::type
223  mult(V1 const & vec,
224  SCALARTYPE alpha,
225  V3 & result)
226  {
227  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
228 
229  //TODO: Ensure that correct alignment is chosen for the kernels.
230  const unsigned int ALIGNMENT = V1::alignment;
231 
232  assert( (viennacl::traits::size(vec) == viennacl::traits::size(result))
233  && "Incompatible vector sizes in mult()!");
234 
235  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "cpu_mult");
236 
238  static_cast<value_type>(alpha),
239  viennacl::traits::handle(result), cl_uint(viennacl::traits::start(result)), cl_uint(viennacl::traits::size(result)))
240  );
241  }
242 
250  template <typename V1, typename S2>
253  >::type
254  inplace_mult(V1 & vec,
255  S2 const & alpha)
256  {
257  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
258 
259  //TODO: Ensure that correct alignment is chosen for the kernels.
260  const unsigned int ALIGNMENT = V1::alignment;
261 
262  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_mult");
263 
265  alpha)
266  );
267  }
268 
276  template <typename V1, typename S2>
279  >::type
280  inplace_mult(V1 & vec,
281  S2 alpha)
282  {
283  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
284 
285  //TODO: Ensure that correct alignment is chosen for the kernels.
286  const unsigned int ALIGNMENT = V1::alignment;
287 
288  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "cpu_inplace_mult");
289 
291  static_cast<value_type>(alpha))
292  );
293  }
294 
295  //result = vec / scalar
304  template <typename V1, typename S2, typename V3>
308  >::type
309  divide(V1 const & vec,
310  S2 const & alpha,
311  V3 & result)
312  {
313  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
314 
315  //TODO: Ensure that correct alignment is chosen for the kernels.
316  const unsigned int ALIGNMENT = V1::alignment;
317 
318  assert( (viennacl::traits::size(vec) == viennacl::traits::size(result))
319  && "Incompatible vector sizes in divide()!");
320 
321  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "divide");
322 
324  alpha,
325  viennacl::traits::handle(result), cl_uint(viennacl::traits::start(result)), cl_uint(viennacl::traits::size(result)))
326  );
327  }
328 
336  template <typename V1, typename S2>
339  >::type
340  inplace_divide(V1 & vec,
341  S2 const & alpha)
342  {
343  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
344 
345  //TODO: Ensure that correct alignment is chosen for the kernels.
346  const unsigned int ALIGNMENT = V1::alignment;
347 
348  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_divide");
349 
351  alpha)
352  );
353  }
354 
355  //result = factor * vec1 + vec2
365  template <typename V1, typename S2, typename V3, typename V4>
370  >::type
371  mul_add(V1 const & vec1,
372  S2 const & alpha,
373  V3 const & vec2,
374  V4 & result)
375  {
376  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
377 
378  //TODO: Ensure that correct alignment is chosen for the kernels.
379  const unsigned int ALIGNMENT = V1::alignment;
380 
381  assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
382  && (viennacl::traits::size(vec1) == viennacl::traits::size(result))
383  && "Incompatible vector sizes in mul_add()!");
384 
385 
386  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "mul_add");
387  //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
388 
390  alpha,
391  viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)),
392  viennacl::traits::handle(result), cl_uint(viennacl::traits::start(result)), cl_uint(viennacl::traits::size(result)))
393  );
394  }
395 
405  template <typename V1, typename SCALARTYPE, typename V3, typename V4>
410  >::type
411  mul_add(V1 const & vec1,
412  SCALARTYPE alpha,
413  V3 const & vec2,
414  V4 & result)
415  {
416  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
417 
418  //TODO: Ensure that correct alignment is chosen for the kernels.
419  const unsigned int ALIGNMENT = V1::alignment;
420 
421  assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
422  && (viennacl::traits::size(vec1) == viennacl::traits::size(result))
423  && "Incompatible vector sizes in mul_add()!");
424 
425  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "cpu_mul_add");
426  //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
427 
429  static_cast<value_type>(alpha),
430  viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)),
431  viennacl::traits::handle(result), cl_uint(viennacl::traits::start(result)), cl_uint(viennacl::traits::size(result)))
432  );
433  }
434 
435  //vec1 += factor * vec2
444  template <typename V1, typename V2, typename S3>
448  >::type
449  inplace_mul_add(V1 & vec1,
450  V2 const & vec2,
451  S3 const & alpha)
452  {
453  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
454 
455  //TODO: Ensure that correct alignment is chosen for the kernels.
456  const unsigned int ALIGNMENT = V1::alignment;
457 
458  assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
459  && "Incompatible vector sizes in inplace_mul_add()!");
460 
461  //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
462 
463  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_mul_add");
464 
466  viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)),
467  alpha));
468  }
469 
478  template <typename V1, typename V2, typename SCALARTYPE>
482  >::type
483  inplace_mul_add(V1 & vec1,
484  V2 const & vec2,
485  SCALARTYPE alpha)
486  {
487  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
488 
489  //TODO: Ensure that correct alignment is chosen for the kernels.
490  const unsigned int ALIGNMENT = V1::alignment;
491 
492  assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
493  && "Incompatible vector sizes in inplace_mul_add()!");
494 
495  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "cpu_inplace_mul_add");
496  //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
497 
499  viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)),
500  value_type(alpha)));
501  }
502 
512  template <typename V1, typename S2, typename V3, typename V4>
517  >::type
518  mul_sub(V1 const & vec1,
519  S2 const & alpha,
520  V3 const & vec2,
521  V4 & result)
522  {
523  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
524 
525  //TODO: Ensure that correct alignment is chosen for the kernels.
526  const unsigned int ALIGNMENT = V1::alignment;
527 
528  assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
529  && (viennacl::traits::size(vec1) == viennacl::traits::size(result))
530  && "Incompatible vector sizes in mul_sub()!");
531 
532  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "mul_sub");
533  //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
534 
536  alpha,
537  viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)),
538  viennacl::traits::handle(result), cl_uint(viennacl::traits::start(result)), cl_uint(viennacl::traits::size(result)))
539  );
540  }
541 
542 
551  template <typename V1, typename V2, typename S3>
555  >::type
556  inplace_mul_sub(V1 & vec1,
557  V2 const & vec2,
558  S3 const & alpha)
559  {
560  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
561 
562  //TODO: Ensure that correct alignment is chosen for the kernels.
563  const unsigned int ALIGNMENT = V1::alignment;
564 
565  assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
566  && "Incompatible vector sizes in inplace_mul_sub()!");
567 
568  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_mul_sub");
569  //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
570 
572  viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)),
573  alpha)
574  );
575  }
576 
585  template <typename V1, typename V2, typename S3>
589  >::type
590  inplace_div_add(V1 & vec1,
591  V2 const & vec2,
592  S3 const & alpha)
593  {
594  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
595 
596  //TODO: Ensure that correct alignment is chosen for the kernels.
597  const unsigned int ALIGNMENT = V1::alignment;
598 
599  assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
600  && "Incompatible vector sizes in inplace_div_add()!");
601 
602  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_div_add");
603  //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
604 
606  viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)),
607  alpha)
608  );
609  }
610 
619  template <typename V1, typename V2, typename S3>
623  >::type
624  inplace_div_sub(V1 & vec1,
625  V2 const & vec2,
626  S3 const & alpha)
627  {
628  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
629 
630  //TODO: Ensure that correct alignment is chosen for the kernels.
631  const unsigned int ALIGNMENT = V1::alignment;
632 
633  assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
634  && "Incompatible vector sizes in inplace_div_sub()!");
635 
636  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inplace_div_sub");
637  //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
638 
640  viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)),
641  alpha)
642  );
643  }
644 
645 
647 
648 
649  //implementation of inner product:
650  //namespace {
657  template <typename V1, typename V2, typename S3>
658  void inner_prod_impl(V1 const & vec1,
659  V2 const & vec2,
660  S3 & result,
664 #ifdef _MSC_VER
665  >::type * dummy = 0)
666 #else
667  >::type * dummy)
668 #endif
669  {
670  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
671 
672  //TODO: Ensure that correct alignment is chosen for the kernels.
673  const unsigned int ALIGNMENT = V1::alignment;
674 
675  assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
676  && "Incompatible vector sizes in inner_prod_impl()!");
677 
678  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "inner_prod");
679  //cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
680  unsigned int work_groups = k.global_work_size() / k.local_work_size();
681 
682  static viennacl::vector<value_type> temp(work_groups);
683 
684  //Note: Number of work groups MUST be a power of two!
685  //std::cout << work_groups << ", " << k.local_work_size() << ", " << k.global_work_size() << std::endl;
686  assert( work_groups * k.local_work_size() == k.global_work_size() );
687  assert( (k.global_work_size() / k.local_work_size()) == 1
688  || (k.global_work_size() / k.local_work_size()) == 2
689  || (k.global_work_size() / k.local_work_size()) == 4
690  || (k.global_work_size() / k.local_work_size()) == 8
691  || (k.global_work_size() / k.local_work_size()) == 16
692  || (k.global_work_size() / k.local_work_size()) == 32
693  || (k.global_work_size() / k.local_work_size()) == 64
694  || (k.global_work_size() / k.local_work_size()) == 128
695  || (k.global_work_size() / k.local_work_size()) == 256
696  || (k.global_work_size() / k.local_work_size()) == 512 );
697 
699  viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)),
700  viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size()),
701  temp));
702 
703  viennacl::ocl::kernel & ksum = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "sum");
704 
705  ksum.local_work_size(0, work_groups);
706  ksum.global_work_size(0, work_groups);
708  result)
709  );
710  }
711 
712  //public interface of inner product
719  template <typename V1, typename V2>
722  viennacl::scalar_expression< const V1,
723  const V2,
724  viennacl::op_inner_prod >
725  >::type
726  inner_prod_impl(V1 const & vec1,
727  V2 const & vec2)
728  {
729  return viennacl::scalar_expression< const V1,
730  const V2,
731  viennacl::op_inner_prod >(vec1, vec2);
732  }
733 
734 
735 
741  template <typename V1, typename S2>
742  void norm_1_impl(V1 const & vec,
743  S2 & result,
746 #ifdef _MSC_VER
747  >::type * dummy = 0)
748 #else
749  >::type * dummy)
750 #endif
751  {
752  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
753 
754  //TODO: Ensure that correct alignment is chosen for the kernels.
755  const unsigned int ALIGNMENT = V1::alignment;
756 
757  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "norm_1");
758  //cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());
759 
760  if (k.local_work_size() != k.global_work_size())
761  {
762  //NOTE: For some reasons the kernel could not be started with several work groups on NVIDIA hardware. This forces us to use as many parallel threads within a single work group as possible
763  k.local_work_size(0, viennacl::ocl::current_device().max_work_group_size());
764  k.global_work_size(0, viennacl::ocl::current_device().max_work_group_size());
765  }
766 
767  unsigned int work_groups = k.global_work_size() / k.local_work_size();
768  viennacl::vector<value_type> temp(work_groups);
769 
770  //Note: Number of work groups MUST be a power of two!
771  //std::cout << work_groups << ", " << k.local_work_size() << ", " << k.global_work_size() << std::endl;
772  assert( work_groups * k.local_work_size() == k.global_work_size() );
773  assert( (k.global_work_size() / k.local_work_size()) == 1
774  || (k.global_work_size() / k.local_work_size()) == 2
775  || (k.global_work_size() / k.local_work_size()) == 4
776  || (k.global_work_size() / k.local_work_size()) == 8
777  || (k.global_work_size() / k.local_work_size()) == 16
778  || (k.global_work_size() / k.local_work_size()) == 32
779  || (k.global_work_size() / k.local_work_size()) == 64
780  || (k.global_work_size() / k.local_work_size()) == 128
781  || (k.global_work_size() / k.local_work_size()) == 256
782  || (k.global_work_size() / k.local_work_size()) == 512 );
783 
785  viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size()),
786  temp));
787 
788  viennacl::ocl::kernel & ksum = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "sum");
789 
790  ksum.local_work_size(0, work_groups);
791  ksum.global_work_size(0, work_groups);
793  result)
794  );
795  }
796 
802  template <typename V1, typename S2>
803  void norm_2_impl(V1 const & vec,
804  S2 & result,
807 #ifdef _MSC_VER
808  >::type * dummy = 0)
809 #else
810  >::type * dummy)
811 #endif
812  {
813  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
814 
815  //TODO: Ensure that correct alignment is chosen for the kernels.
816  const unsigned int ALIGNMENT = V1::alignment;
817 
818  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "norm_2");
819  //cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());
820 
821  if (k.local_work_size() != k.global_work_size())
822  {
823  //NOTE: For some reasons the kernel could not be started with several work groups on NVIDIA hardware. This forces us to use as many parallel threads within a single work group as possible
824  k.local_work_size(0, viennacl::ocl::current_device().max_work_group_size());
825  k.global_work_size(0, viennacl::ocl::current_device().max_work_group_size());
826  }
827 
828  unsigned int work_groups = k.global_work_size() / k.local_work_size();
829  viennacl::vector<value_type> temp(work_groups);
830 
831  //Note: Number of work groups MUST be a power of two!
832  //std::cout << work_groups << ", " << k.local_work_size() << ", " << k.global_work_size() << std::endl;
833  assert( work_groups * k.local_work_size() == k.global_work_size() );
834  assert( (k.global_work_size() / k.local_work_size()) == 1
835  || (k.global_work_size() / k.local_work_size()) == 2
836  || (k.global_work_size() / k.local_work_size()) == 4
837  || (k.global_work_size() / k.local_work_size()) == 8
838  || (k.global_work_size() / k.local_work_size()) == 16
839  || (k.global_work_size() / k.local_work_size()) == 32
840  || (k.global_work_size() / k.local_work_size()) == 64
841  || (k.global_work_size() / k.local_work_size()) == 128
842  || (k.global_work_size() / k.local_work_size()) == 256
843  || (k.global_work_size() / k.local_work_size()) == 512 );
844 
846  viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size()),
847  temp)
848  );
849 
850  viennacl::ocl::kernel & sqrt_sum = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "sqrt_sum");
851 
852  sqrt_sum.local_work_size(0, work_groups);
853  sqrt_sum.global_work_size(0, work_groups);
855  sqrt_sum(viennacl::traits::handle(temp), cl_uint(viennacl::traits::start(temp)), cl_uint(viennacl::traits::size(temp)),
856  result)
857  );
858  }
859 
865  template <typename V1, typename S2>
866  void norm_inf_impl(V1 const & vec,
867  S2 & result,
870 #ifdef _MSC_VER
871  >::type * dummy = 0)
872 #else
873  >::type * dummy)
874 #endif
875  {
876  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
877 
878  //TODO: Ensure that correct alignment is chosen for the kernels.
879  const unsigned int ALIGNMENT = V1::alignment;
880 
881  //cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());
882  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "norm_inf");
883 
884  if (k.local_work_size() != k.global_work_size())
885  {
886  //NOTE: For some reasons the kernel could not be started with several work groups on NVIDIA hardware. This forces us to use as many parallel threads within a single work group as possible
887  k.local_work_size(0, viennacl::ocl::current_device().max_work_group_size());
888  k.global_work_size(0, viennacl::ocl::current_device().max_work_group_size());
889  }
890 
891  unsigned int work_groups = k.global_work_size() / k.local_work_size();
892  viennacl::vector<value_type> temp(work_groups);
893 
894  //Note: Number of work groups MUST be a power of two!
895  //std::cout << work_groups << ", " << k.local_work_size() << ", " << k.global_work_size() << std::endl;
896  assert( work_groups * k.local_work_size() == k.global_work_size() );
897  assert( work_groups == 1
898  || work_groups == 2
899  || work_groups == 4
900  || work_groups == 8
901  || work_groups == 16
902  || work_groups == 32
903  || work_groups == 64
904  || work_groups == 128
905  || work_groups == 256
906  || work_groups == 512 );
907 
909  viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size()),
910  temp));
911  //viennacl::ocl::get_queue().finish();
912 
913  //part 2: parallel reduction of reduced kernel:
914  viennacl::ocl::kernel & max_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "vmax");
915  max_kernel.local_work_size(0, work_groups);
916  max_kernel.global_work_size(0, work_groups);
917 
919  max_kernel(viennacl::traits::handle(temp), cl_uint(viennacl::traits::start(temp)), cl_uint(viennacl::traits::size(temp)),
920  result)
921  );
922  }
923 
924  //This function should return a CPU scalar, otherwise statements like
925  // vcl_rhs[index_norm_inf(vcl_rhs)]
926  // are ambiguous
932  template <typename V1>
934  cl_uint
935  >::type
936  index_norm_inf(V1 const & vec)
937  {
938  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
939 
940  //TODO: Ensure that correct alignment is chosen for the kernels.
941  const unsigned int ALIGNMENT = V1::alignment;
942 
943  viennacl::ocl::handle<cl_mem> h = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(cl_uint));
944 
945  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<value_type, ALIGNMENT>::program_name(), "index_norm_inf");
946  //cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());
947 
950  viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size()),
951  viennacl::ocl::local_mem(sizeof(cl_uint) * k.local_work_size()), h));
952 
953  //read value:
954  cl_uint result;
955  cl_int err;
956  err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), h, CL_TRUE, 0, sizeof(cl_uint), &result, 0, NULL, NULL);
957  VIENNACL_ERR_CHECK(err);
958  return result;
959  }
960 
961  //TODO: Special case vec1 == vec2 allows improvement!!
971  template <typename V1, typename V2, typename SCALARTYPE>
975  >::type
976  plane_rotation(V1 & vec1,
977  V2 & vec2,
978  SCALARTYPE alpha,
979  SCALARTYPE beta)
980  {
981  typedef typename viennacl::result_of::cpu_value_type<V1>::type value_type;
982 
983  //TODO: Ensure that correct alignment is chosen for the kernels.
984  const unsigned int ALIGNMENT = V1::alignment;
985 
986  assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2));
987  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "plane_rotation");
988 
990  viennacl::traits::handle(vec2), cl_uint(viennacl::traits::start(vec2)), cl_uint(viennacl::traits::size(vec2)),
991  alpha,
992  beta)
993  );
994  }
995 
996  } //namespace linalg
997 } //namespace viennacl
998 
999 
1000 #endif