MagickCore  6.9.9
Convert, Edit, Or Compose Bitmap Images
accelerate-kernels-private.h
Go to the documentation of this file.
1 /*
2  Copyright 1999-2018 ImageMagick Studio LLC, a non-profit organization
3  dedicated to making software imaging solutions freely available.
4 
5  You may not use this file except in compliance with the License.
6  obtain a copy of the License at
7 
8  https://www.imagemagick.org/script/license.php
9 
10  Unless required by applicable law or agreed to in writing, software
11  distributed under the License is distributed on an "AS IS" BASIS,
12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  See the License for the specific language governing permissions and
14  limitations under the License.
15 
16  MagickCore private methods for accelerated functions.
17 */
18 
19 #ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
20 #define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
21 
22 #if defined(__cplusplus) || defined(c_plusplus)
23 extern "C" {
24 #endif
25 
26 #if defined(MAGICKCORE_OPENCL_SUPPORT)
27 
28 /*
29  Define declarations.
30 */
31 #define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n"
32 #define OPENCL_ELIF(...) "\n #""elif " #__VA_ARGS__ " \n"
33 #define OPENCL_ELSE() "\n #""else " " \n"
34 #define OPENCL_ENDIF() "\n #""endif " " \n"
35 #define OPENCL_IF(...) "\n #""if " #__VA_ARGS__ " \n"
36 #define STRINGIFY(...) #__VA_ARGS__ "\n"
37 
38 const char* accelerateKernels =
39 
40 /*
41  Define declarations.
42 */
43  OPENCL_DEFINE(GetPixelAlpha(pixel),(QuantumRange-(pixel).w))
44  OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
45  OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
46  OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
47  OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
48  OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
49  OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
50  OPENCL_DEFINE(SigmaRandom, (attenuate))
51  OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
52  OPENCL_DEFINE(MagickMax(x, y), (((x) > (y)) ? (x) : (y)))
53  OPENCL_DEFINE(MagickMin(x, y), (((x) < (y)) ? (x) : (y)))
54 
55 /*
56  Typedef declarations.
57 */
58  STRINGIFY(
59  typedef enum
60  {
62  RGBColorspace, /* Linear RGB colorspace */
63  GRAYColorspace, /* greyscale (non-linear) image (faked 1 channel) */
73  CMYKColorspace, /* negared linear RGB with black separated */
74  sRGBColorspace, /* Default: non-lienar sRGB colorspace */
83  CMYColorspace, /* negated linear RGB colorspace */
86  LCHColorspace, /* alias for LCHuv */
88  LCHabColorspace, /* Cylindrical (Polar) Lab */
89  LCHuvColorspace, /* Cylindrical (Polar) Luv */
92  HSVColorspace, /* alias for HSB */
95  LinearGRAYColorspace /* greyscale (linear) image (faked 1 channel) */
97  )
98 
99  STRINGIFY(
100  typedef enum
101  {
157  /* These are new operators, added after the above was last sorted.
158  * The list should be re-sorted only when a new library version is
159  * created.
160  */
175  )
176 
177  STRINGIFY(
178  typedef enum
179  {
185  } MagickFunction;
186  )
187 
188  STRINGIFY(
189  typedef enum
190  {
192  UniformNoise,
195  ImpulseNoise,
197  PoissonNoise,
199  } NoiseType;
200  )
201 
202  STRINGIFY(
203  typedef enum
204  {
216  )
217 
218  STRINGIFY(
219  typedef enum {
237  )
238 
239  STRINGIFY(
240  typedef enum
241  {
243  RedChannel = 0x0001,
244  GrayChannel = 0x0001,
245  CyanChannel = 0x0001,
246  GreenChannel = 0x0002,
247  MagentaChannel = 0x0002,
248  BlueChannel = 0x0004,
249  YellowChannel = 0x0004,
250  AlphaChannel = 0x0008,
251  OpacityChannel = 0x0008,
252  MatteChannel = 0x0008, /* deprecated */
253  BlackChannel = 0x0020,
254  IndexChannel = 0x0020,
255  CompositeChannels = 0x002F,
256  AllChannels = 0x7ffffff,
257  /*
258  Special purpose channel types.
259  */
260  TrueAlphaChannel = 0x0040, /* extract actual alpha channel from opacity */
261  RGBChannels = 0x0080, /* set alpha from grayscale mask in RGB */
262  GrayChannels = 0x0080,
263  SyncChannels = 0x0100, /* channels should be modified equally */
265  } ChannelType;
266  )
267 
268 /*
269  Helper functions.
270 */
271 
272 OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
273 
274  STRINGIFY(
275  inline CLQuantum ScaleCharToQuantum(const unsigned char value)
276  {
277  return((CLQuantum) value);
278  }
279  )
280 
281 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
282 
283  STRINGIFY(
284  inline CLQuantum ScaleCharToQuantum(const unsigned char value)
285  {
286  return((CLQuantum) (257.0f*value));
287  }
288  )
289 
290 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
291 
292  STRINGIFY(
293  inline CLQuantum ScaleCharToQuantum(const unsigned char value)
294  {
295  return((CLQuantum) (16843009.0*value));
296  }
297  )
298 
299 OPENCL_ENDIF()
300 
301 STRINGIFY(
302  inline int ClampToCanvas(const int offset, const int range)
303  {
304  return clamp(offset, (int)0, range - 1);
305  }
306  )
307 
308  STRINGIFY(
309  inline int ClampToCanvasWithHalo(const int offset, const int range, const int edge, const int section)
310  {
311  return clamp(offset, section ? (int)(0 - edge) : (int)0, section ? (range - 1) : (range - 1 + edge));
312  }
313  )
314 
315  STRINGIFY(
316  inline CLQuantum ClampToQuantum(const float value)
317  {
318  return (CLQuantum)(clamp(value, 0.0f, (float)QuantumRange) + 0.5f);
319  }
320  )
321 
322  STRINGIFY(
323  inline uint ScaleQuantumToMap(CLQuantum value)
324  {
325  if (value >= (CLQuantum)MaxMap)
326  return ((uint)MaxMap);
327  else
328  return ((uint)value);
329  }
330  )
331 
332  STRINGIFY(
333  inline float PerceptibleReciprocal(const float x)
334  {
335  float sign = x < (float) 0.0 ? (float)-1.0 : (float) 1.0;
336  return((sign*x) >= MagickEpsilon ? (float) 1.0 / x : sign*((float) 1.0 / MagickEpsilon));
337  }
338  )
339 
340  STRINGIFY(
341  inline float RoundToUnity(const float value)
342  {
343  return clamp(value, 0.0f, 1.0f);
344  }
345  )
346 
347  STRINGIFY(
348 
349  inline CLQuantum getBlue(CLPixelType p) { return p.x; }
350  inline void setBlue(CLPixelType* p, CLQuantum value) { (*p).x = value; }
351  inline float getBlueF4(float4 p) { return p.x; }
352  inline void setBlueF4(float4* p, float value) { (*p).x = value; }
353 
354  inline CLQuantum getGreen(CLPixelType p) { return p.y; }
355  inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
356  inline float getGreenF4(float4 p) { return p.y; }
357  inline void setGreenF4(float4* p, float value) { (*p).y = value; }
358 
359  inline CLQuantum getRed(CLPixelType p) { return p.z; }
360  inline void setRed(CLPixelType* p, CLQuantum value) { (*p).z = value; }
361  inline float getRedF4(float4 p) { return p.z; }
362  inline void setRedF4(float4* p, float value) { (*p).z = value; }
363 
364  inline CLQuantum getOpacity(CLPixelType p) { return p.w; }
365  inline void setOpacity(CLPixelType* p, CLQuantum value) { (*p).w = value; }
366  inline float getOpacityF4(float4 p) { return p.w; }
367  inline void setOpacityF4(float4* p, float value) { (*p).w = value; }
368 
369  inline void setGray(CLPixelType* p, CLQuantum value) { (*p).z = value; (*p).y = value; (*p).x = value; }
370 
371  inline float GetPixelIntensity(const int method, const int colorspace, CLPixelType p)
372  {
373  float red = getRed(p);
374  float green = getGreen(p);
375  float blue = getBlue(p);
376 
377  float intensity;
378 
379  if (colorspace == GRAYColorspace)
380  return red;
381 
382  switch (method)
383  {
385  {
386  intensity = (red + green + blue) / 3.0;
387  break;
388  }
390  {
391  intensity = MagickMax(MagickMax(red, green), blue);
392  break;
393  }
395  {
396  intensity = (MagickMin(MagickMin(red, green), blue) +
397  MagickMax(MagickMax(red, green), blue)) / 2.0;
398  break;
399  }
401  {
402  intensity = (float)(((float)red*red + green*green + blue*blue) /
403  (3.0*QuantumRange));
404  break;
405  }
407  {
408  /*
409  if (image->colorspace == RGBColorspace)
410  {
411  red=EncodePixelGamma(red);
412  green=EncodePixelGamma(green);
413  blue=EncodePixelGamma(blue);
414  }
415  */
416  intensity = 0.298839*red + 0.586811*green + 0.114350*blue;
417  break;
418  }
420  {
421  /*
422  if (image->colorspace == sRGBColorspace)
423  {
424  red=DecodePixelGamma(red);
425  green=DecodePixelGamma(green);
426  blue=DecodePixelGamma(blue);
427  }
428  */
429  intensity = 0.298839*red + 0.586811*green + 0.114350*blue;
430  break;
431  }
433  default:
434  {
435  /*
436  if (image->colorspace == RGBColorspace)
437  {
438  red=EncodePixelGamma(red);
439  green=EncodePixelGamma(green);
440  blue=EncodePixelGamma(blue);
441  }
442  */
443  intensity = 0.212656*red + 0.715158*green + 0.072186*blue;
444  break;
445  }
447  {
448  /*
449  if (image->colorspace == sRGBColorspace)
450  {
451  red=DecodePixelGamma(red);
452  green=DecodePixelGamma(green);
453  blue=DecodePixelGamma(blue);
454  }
455  */
456  intensity = 0.212656*red + 0.715158*green + 0.072186*blue;
457  break;
458  }
460  {
461  intensity = (float)(sqrt((float)red*red + green*green + blue*blue) /
462  sqrt(3.0));
463  break;
464  }
465  }
466 
467  return intensity;
468 
469  }
470  )
471 
472 /*
473 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
474 % %
475 % %
476 % %
477 % A d d N o i s e %
478 % %
479 % %
480 % %
481 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
482 */
483 
484 STRINGIFY(
485 
486 /*
487 Part of MWC64X by David Thomas, dt10@imperial.ac.uk
488 This is provided under BSD, full license is with the main package.
489 See http://www.doc.ic.ac.uk/~dt10/research
490 */
491 
492 // Pre: a<M, b<M
493 // Post: r=(a+b) mod M
494 ulong MWC_AddMod64(ulong a, ulong b, ulong M)
495 {
496  ulong v=a+b;
497  //if( (v>=M) || (v<a) )
498  if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
499  v=v-M;
500  return v;
501 }
502 
503 // Pre: a<M,b<M
504 // Post: r=(a*b) mod M
505 // This could be done more efficently, but it is portable, and should
506 // be easy to understand. It can be replaced with any of the better
507 // modular multiplication algorithms (for example if you know you have
508 // double precision available or something).
509 ulong MWC_MulMod64(ulong a, ulong b, ulong M)
510 {
511  ulong r=0;
512  while(a!=0){
513  if(a&1)
514  r=MWC_AddMod64(r,b,M);
515  b=MWC_AddMod64(b,b,M);
516  a=a>>1;
517  }
518  return r;
519 }
520 
521 
522 // Pre: a<M, e>=0
523 // Post: r=(a^b) mod M
524 // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
525 // most architectures
526 ulong MWC_PowMod64(ulong a, ulong e, ulong M)
527 {
528  ulong sqr=a, acc=1;
529  while(e!=0){
530  if(e&1)
531  acc=MWC_MulMod64(acc,sqr,M);
532  sqr=MWC_MulMod64(sqr,sqr,M);
533  e=e>>1;
534  }
535  return acc;
536 }
537 
538 uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
539 {
540  ulong m=MWC_PowMod64(A, distance, M);
541  ulong x=curr.x*(ulong)A+curr.y;
542  x=MWC_MulMod64(x, m, M);
543  return (uint2)((uint)(x/A), (uint)(x%A));
544 }
545 
546 uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
547 {
548  // This is an arbitrary constant for starting LCG jumping from. I didn't
549  // want to start from 1, as then you end up with the two or three first values
550  // being a bit poor in ones - once you've decided that, one constant is as
551  // good as any another. There is no deep mathematical reason for it, I just
552  // generated a random number.
553  enum{ MWC_BASEID = 4077358422479273989UL };
554 
555  ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
556  ulong m=MWC_PowMod64(A, dist, M);
557 
558  ulong x=MWC_MulMod64(MWC_BASEID, m, M);
559  return (uint2)((uint)(x/A), (uint)(x%A));
560 }
561 
563 typedef struct{ uint x; uint c; } mwc64x_state_t;
564 
565 enum{ MWC64X_A = 4294883355U };
566 enum{ MWC64X_M = 18446383549859758079UL };
567 
568 void MWC64X_Step(mwc64x_state_t *s)
569 {
570  uint X=s->x, C=s->c;
571 
572  uint Xn=MWC64X_A*X+C;
573  uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
574  uint Cn=mad_hi(MWC64X_A,X,carry);
575 
576  s->x=Xn;
577  s->c=Cn;
578 }
579 
580 void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
581 {
582  uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), MWC64X_A, MWC64X_M, distance);
583  s->x=tmp.x;
584  s->c=tmp.y;
585 }
586 
587 void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
588 {
589  uint2 tmp=MWC_SeedImpl_Mod64(MWC64X_A, MWC64X_M, 1, 0, baseOffset, perStreamOffset);
590  s->x=tmp.x;
591  s->c=tmp.y;
592 }
593 
595 uint MWC64X_NextUint(mwc64x_state_t *s)
596 {
597  uint res=s->x ^ s->c;
598  MWC64X_Step(s);
599  return res;
600 }
601 
602 //
603 // End of MWC64X excerpt
604 //
605 
606  float mwcReadPseudoRandomValue(mwc64x_state_t* rng) {
607  return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
608  }
609 
610 
611  float mwcGenerateDifferentialNoise(mwc64x_state_t* r, CLQuantum pixel, NoiseType noise_type, float attenuate) {
612 
613  float
614  alpha,
615  beta,
616  noise,
617  sigma;
618 
619  noise = 0.0f;
620  alpha=mwcReadPseudoRandomValue(r);
621  switch(noise_type) {
622  case UniformNoise:
623  default:
624  {
625  noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
626  break;
627  }
628  case GaussianNoise:
629  {
630  float
631  gamma,
632  tau;
633 
634  if (alpha == 0.0f)
635  alpha=1.0f;
636  beta=mwcReadPseudoRandomValue(r);
637  gamma=sqrt(-2.0f*log(alpha));
638  sigma=gamma*cospi((2.0f*beta));
639  tau=gamma*sinpi((2.0f*beta));
640  noise=(float)(pixel+sqrt((float) pixel)*SigmaGaussian*sigma+
642  break;
643  }
644 
645 
646  case ImpulseNoise:
647  {
648  if (alpha < (SigmaImpulse/2.0f))
649  noise=0.0f;
650  else
651  if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
652  noise=(float)QuantumRange;
653  else
654  noise=(float)pixel;
655  break;
656  }
657  case LaplacianNoise:
658  {
659  if (alpha <= 0.5f)
660  {
661  if (alpha <= MagickEpsilon)
662  noise=(float) (pixel-QuantumRange);
663  else
664  noise=(float) (pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
665  0.5f);
666  break;
667  }
668  beta=1.0f-alpha;
669  if (beta <= (0.5f*MagickEpsilon))
670  noise=(float) (pixel+QuantumRange);
671  else
672  noise=(float) (pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
673  break;
674  }
676  {
677  sigma=1.0f;
678  if (alpha > MagickEpsilon)
679  sigma=sqrt(-2.0f*log(alpha));
680  beta=mwcReadPseudoRandomValue(r);
681  noise=(float) (pixel+pixel*SigmaMultiplicativeGaussian*sigma*
682  cospi((float) (2.0f*beta))/2.0f);
683  break;
684  }
685  case PoissonNoise:
686  {
687  float
688  poisson;
689  unsigned int i;
690  poisson=exp(-SigmaPoisson*QuantumScale*pixel);
691  for (i=0; alpha > poisson; i++)
692  {
693  beta=mwcReadPseudoRandomValue(r);
694  alpha*=beta;
695  }
696  noise=(float) (QuantumRange*i/SigmaPoisson);
697  break;
698  }
699  case RandomNoise:
700  {
701  noise=(float) (QuantumRange*SigmaRandom*alpha);
702  break;
703  }
704 
705  };
706  return noise;
707  }
708 
709  __kernel
710  void AddNoise(const __global CLPixelType* inputImage, __global CLPixelType* filteredImage
711  ,const unsigned int inputPixelCount, const unsigned int pixelsPerWorkItem
712  ,const ChannelType channel
713  ,const NoiseType noise_type, const float attenuate
714  ,const unsigned int seed0, const unsigned int seed1
715  ,const unsigned int numRandomNumbersPerPixel) {
716 
717  mwc64x_state_t rng;
718  rng.x = seed0;
719  rng.c = seed1;
720 
721  uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use
722  uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream);
723 
724  MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams
725 
726  uint pos = get_local_size(0) * get_group_id(0) * pixelsPerWorkItem + get_local_id(0); // pixel to process
727 
728  uint count = pixelsPerWorkItem;
729 
730  while (count > 0) {
731  if (pos < inputPixelCount) {
732  CLPixelType p = inputImage[pos];
733 
734  if ((channel&RedChannel)!=0) {
735  setRed(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getRed(p),noise_type,attenuate)));
736  }
737 
738  if ((channel&GreenChannel)!=0) {
739  setGreen(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getGreen(p),noise_type,attenuate)));
740  }
741 
742  if ((channel&BlueChannel)!=0) {
743  setBlue(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getBlue(p),noise_type,attenuate)));
744  }
745 
746  if ((channel & OpacityChannel) != 0) {
747  setOpacity(&p,ClampToQuantum(mwcGenerateDifferentialNoise(&rng,getOpacity(p),noise_type,attenuate)));
748  }
749 
750  filteredImage[pos] = p;
751  //filteredImage[pos] = (CLPixelType)(MWC64X_NextUint(&rng) % 256, MWC64X_NextUint(&rng) % 256, MWC64X_NextUint(&rng) % 256, 255);
752  }
753  pos += get_local_size(0);
754  --count;
755  }
756  }
757  )
758 
759 /*
760 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
761 % %
762 % %
763 % %
764 % B l u r %
765 % %
766 % %
767 % %
768 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
769 */
770 
771  STRINGIFY(
772  /*
773  Reduce image noise and reduce detail levels by row
774  im: input pixels filtered_in filtered_im: output pixels
775  filter : convolve kernel width: convolve kernel size
776  channel : define which channel is blured
777  is_RGBA_BGRA : define the input is RGBA or BGRA
778  */
779  __kernel void BlurRow(__global CLPixelType *im, __global float4 *filtered_im,
780  const ChannelType channel, __constant float *filter,
781  const unsigned int width,
782  const unsigned int imageColumns, const unsigned int imageRows,
783  __local CLPixelType *temp)
784  {
785  const int x = get_global_id(0);
786  const int y = get_global_id(1);
787 
788  const int columns = imageColumns;
789 
790  const unsigned int radius = (width-1)/2;
791  const int wsize = get_local_size(0);
792  const unsigned int loadSize = wsize+width;
793 
794  //load chunk only for now
795  //event_t e = async_work_group_copy(temp+radius, im+x+y*columns, wsize, 0);
796  //wait_group_events(1,&e);
797 
798  //parallel load and clamp
799  /*
800  int count = 0;
801  for (int i=0; i < loadSize; i=i+wsize)
802  {
803  int currentX = x + wsize*(count++);
804 
805  int localId = get_local_id(0);
806 
807  if ((localId+i) > loadSize)
808  break;
809 
810  temp[localId+i] = im[y*columns+ClampToCanvas(currentX-radius, columns)];
811 
812  if (y==0 && get_group_id(0) == 0)
813  {
814  printf("(%d %d) temp %d load %d currentX %d\n", x, y, localId+i, ClampToCanvas(currentX-radius, columns), currentX);
815  }
816  }
817  */
818 
819  //group coordinate
820  const int groupX=get_local_size(0)*get_group_id(0);
821  const int groupY=get_local_size(1)*get_group_id(1);
822 
823  //parallel load and clamp
824  for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
825  {
826  //int cx = ClampToCanvas(groupX+i, columns);
827  temp[i] = im[y * columns + ClampToCanvas(i+groupX-radius, columns)];
828 
829  /*if (0 && y==0 && get_group_id(1) == 0)
830  {
831  printf("(%d %d) temp %d load %d groupX %d\n", x, y, i, ClampToCanvas(groupX+i, columns), groupX);
832  }*/
833  }
834 
835  // barrier
836  barrier(CLK_LOCAL_MEM_FENCE);
837 
838  // only do the work if this is not a patched item
839  if (get_global_id(0) < columns)
840  {
841  // compute
842  float4 result = (float4) 0;
843 
844  int i = 0;
845 
846  \n #ifndef UFACTOR \n
847  \n #define UFACTOR 8 \n
848  \n #endif \n
849 
850  for ( ; i+UFACTOR < width; )
851  {
852  \n #pragma unroll UFACTOR\n
853  for (int j=0; j < UFACTOR; j++, i++)
854  {
855  result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
856  }
857  }
858 
859  for ( ; i < width; i++)
860  {
861  result+=filter[i]*convert_float4(temp[i+get_local_id(0)]);
862  }
863 
864  result.x = ClampToQuantum(result.x);
865  result.y = ClampToQuantum(result.y);
866  result.z = ClampToQuantum(result.z);
867  result.w = ClampToQuantum(result.w);
868 
869  // write back to global
870  filtered_im[y*columns+x] = result;
871  }
872  }
873  )
874 
875  STRINGIFY(
876  /*
877  Reduce image noise and reduce detail levels by line
878  im: input pixels filtered_in filtered_im: output pixels
879  filter : convolve kernel width: convolve kernel size
880  channel : define which channel is blured\
881  is_RGBA_BGRA : define the input is RGBA or BGRA
882  */
883  __kernel void BlurColumn(const __global float4 *blurRowData, __global CLPixelType *filtered_im,
884  const ChannelType channel, __constant float *filter,
885  const unsigned int width,
886  const unsigned int imageColumns, const unsigned int imageRows,
887  __local float4 *temp)
888  {
889  const int x = get_global_id(0);
890  const int y = get_global_id(1);
891 
892  //const int columns = get_global_size(0);
893  //const int rows = get_global_size(1);
894  const int columns = imageColumns;
895  const int rows = imageRows;
896 
897  unsigned int radius = (width-1)/2;
898  const int wsize = get_local_size(1);
899  const unsigned int loadSize = wsize+width;
900 
901  //group coordinate
902  const int groupX=get_local_size(0)*get_group_id(0);
903  const int groupY=get_local_size(1)*get_group_id(1);
904  //notice that get_local_size(0) is 1, so
905  //groupX=get_group_id(0);
906 
907  //parallel load and clamp
908  for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
909  {
910  temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
911  }
912 
913  // barrier
914  barrier(CLK_LOCAL_MEM_FENCE);
915 
916  // only do the work if this is not a patched item
917  if (get_global_id(1) < rows)
918  {
919  // compute
920  float4 result = (float4) 0;
921 
922  int i = 0;
923 
924  \n #ifndef UFACTOR \n
925  \n #define UFACTOR 8 \n
926  \n #endif \n
927 
928  for ( ; i+UFACTOR < width; )
929  {
930  \n #pragma unroll UFACTOR \n
931  for (int j=0; j < UFACTOR; j++, i++)
932  {
933  result+=filter[i]*temp[i+get_local_id(1)];
934  }
935  }
936 
937  for ( ; i < width; i++)
938  {
939  result+=filter[i]*temp[i+get_local_id(1)];
940  }
941 
942  result.x = ClampToQuantum(result.x);
943  result.y = ClampToQuantum(result.y);
944  result.z = ClampToQuantum(result.z);
945  result.w = ClampToQuantum(result.w);
946 
947  // write back to global
948  filtered_im[y*columns+x] = (CLPixelType) (result.x,result.y,result.z,result.w);
949  }
950 
951  }
952  )
953 
954 /*
955 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
956 % %
957 % %
958 % %
959 % C o m p o s i t e %
960 % %
961 % %
962 % %
963 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
964 */
965 
966  STRINGIFY(
967  inline float ColorDodge(const float Sca,
968  const float Sa,const float Dca,const float Da)
969  {
970  /*
971  Oct 2004 SVG specification.
972  */
973  if ((Sca*Da+Dca*Sa) >= Sa*Da)
974  return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
975  return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
976 
977 
978  /*
979  New specification, March 2009 SVG specification. This specification was
980  also wrong of non-overlap cases.
981  */
982  /*
983  if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
984  return(Sca*(1.0-Da));
985  if (fabs(Sca-Sa) < MagickEpsilon)
986  return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
987  return(Sa*MagickMin(Da,Dca*Sa/(Sa-Sca)));
988  */
989 
990  /*
991  Working from first principles using the original formula:
992 
993  f(Sc,Dc) = Dc/(1-Sc)
994 
995  This works correctly! Looks like the 2004 model was right but just
996  required a extra condition for correct handling.
997  */
998 
999  /*
1000  if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
1001  return(Sca*(1.0-Da)+Dca*(1.0-Sa));
1002  if (fabs(Sca-Sa) < MagickEpsilon)
1003  return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
1004  return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
1005  */
1006  }
1007 
1008  inline void CompositeColorDodge(const float4 *p,
1009  const float4 *q,float4 *composite) {
1010 
1011  float
1012  Da,
1013  gamma,
1014  Sa;
1015 
1016  Sa=1.0f-QuantumScale*getOpacityF4(*p); /* simplify and speed up equations */
1017  Da=1.0f-QuantumScale*getOpacityF4(*q);
1018  gamma=RoundToUnity(Sa+Da-Sa*Da); /* over blend, as per SVG doc */
1019  setOpacityF4(composite, QuantumRange*(1.0-gamma));
1020  gamma=QuantumRange/(fabs(gamma) < MagickEpsilon ? MagickEpsilon : gamma);
1021  setRedF4(composite,gamma*ColorDodge(QuantumScale*getRedF4(*p)*Sa,Sa,QuantumScale*
1022  getRedF4(*q)*Da,Da));
1023  setGreenF4(composite,gamma*ColorDodge(QuantumScale*getGreenF4(*p)*Sa,Sa,QuantumScale*
1024  getGreenF4(*q)*Da,Da));
1025  setBlueF4(composite,gamma*ColorDodge(QuantumScale*getBlueF4(*p)*Sa,Sa,QuantumScale*
1026  getBlueF4(*q)*Da,Da));
1027  }
1028  )
1029 
1030  STRINGIFY(
1031  inline void MagickPixelCompositePlus(const float4 *p,
1032  const float alpha,const float4 *q,
1033  const float beta,float4 *composite)
1034  {
1035  float
1036  gamma;
1037 
1038  float
1039  Da,
1040  Sa;
1041  /*
1042  Add two pixels with the given opacities.
1043  */
1044  Sa=1.0-QuantumScale*alpha;
1045  Da=1.0-QuantumScale*beta;
1046  gamma=RoundToUnity(Sa+Da); /* 'Plus' blending -- not 'Over' blending */
1047  setOpacityF4(composite,(float) QuantumRange*(1.0-gamma));
1048  gamma=PerceptibleReciprocal(gamma);
1049  setRedF4(composite,gamma*(Sa*getRedF4(*p)+Da*getRedF4(*q)));
1050  setGreenF4(composite,gamma*(Sa*getGreenF4(*p)+Da*getGreenF4(*q)));
1051  setBlueF4(composite,gamma*(Sa*getBlueF4(*p)+Da*getBlueF4(*q)));
1052  }
1053  )
1054 
1055  STRINGIFY(
1056  inline void MagickPixelCompositeBlend(const float4 *p,
1057  const float alpha,const float4 *q,
1058  const float beta,float4 *composite)
1059  {
1060  MagickPixelCompositePlus(p,(float) (QuantumRange-alpha*
1061  (QuantumRange-getOpacityF4(*p))),q,(float) (QuantumRange-beta*
1062  (QuantumRange-getOpacityF4(*q))),composite);
1063  }
1064  )
1065 
1066  STRINGIFY(
1067  __kernel
1068  void Composite(__global CLPixelType *image,
1069  const unsigned int imageWidth,
1070  const unsigned int imageHeight,
1071  const unsigned int imageMatte,
1072  const __global CLPixelType *compositeImage,
1073  const unsigned int compositeWidth,
1074  const unsigned int compositeHeight,
1075  const unsigned int compositeMatte,
1076  const unsigned int compose,
1077  const ChannelType channel,
1078  const float destination_dissolve,
1079  const float source_dissolve) {
1080 
1081  uint2 index;
1082  index.x = get_global_id(0);
1083  index.y = get_global_id(1);
1084 
1085 
1086  if (index.x >= imageWidth
1087  || index.y >= imageHeight) {
1088  return;
1089  }
1090  const CLPixelType inputPixel = image[index.y*imageWidth+index.x];
1091  float4 destination;
1092  setRedF4(&destination,getRed(inputPixel));
1093  setGreenF4(&destination,getGreen(inputPixel));
1094  setBlueF4(&destination,getBlue(inputPixel));
1095 
1096 
1097  const CLPixelType compositePixel
1098  = compositeImage[index.y*imageWidth+index.x];
1099  float4 source;
1100  setRedF4(&source,getRed(compositePixel));
1101  setGreenF4(&source,getGreen(compositePixel));
1102  setBlueF4(&source,getBlue(compositePixel));
1103 
1104  if (imageMatte != 0) {
1105  setOpacityF4(&destination,getOpacity(inputPixel));
1106  }
1107  else {
1108  setOpacityF4(&destination,0.0f);
1109  }
1110 
1111  if (compositeMatte != 0) {
1112  setOpacityF4(&source,getOpacity(compositePixel));
1113  }
1114  else {
1115  setOpacityF4(&source,0.0f);
1116  }
1117 
1118  float4 composite=destination;
1119 
1120  CompositeOperator op = (CompositeOperator)compose;
1121  switch (op) {
1122  case ColorDodgeCompositeOp:
1123  CompositeColorDodge(&source,&destination,&composite);
1124  break;
1125  case BlendCompositeOp:
1126  MagickPixelCompositeBlend(&source,source_dissolve,&destination,
1127  destination_dissolve,&composite);
1128  break;
1129  default:
1130  // unsupported operators
1131  break;
1132  };
1133 
1134  CLPixelType outputPixel;
1135  setRed(&outputPixel, ClampToQuantum(getRedF4(composite)));
1136  setGreen(&outputPixel, ClampToQuantum(getGreenF4(composite)));
1137  setBlue(&outputPixel, ClampToQuantum(getBlueF4(composite)));
1138  setOpacity(&outputPixel, ClampToQuantum(getOpacityF4(composite)));
1139  image[index.y*imageWidth+index.x] = outputPixel;
1140  }
1141  )
1142 
1143 /*
1144 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1145 % %
1146 % %
1147 % %
1148 % C o n t r a s t %
1149 % %
1150 % %
1151 % %
1152 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1153 */
1154 
1155  STRINGIFY(
1156 
1157  inline float3 ConvertRGBToHSB(CLPixelType pixel) {
1158  float3 HueSaturationBrightness;
1159  HueSaturationBrightness.x = 0.0f; // Hue
1160  HueSaturationBrightness.y = 0.0f; // Saturation
1161  HueSaturationBrightness.z = 0.0f; // Brightness
1162 
1163  float r=(float) getRed(pixel);
1164  float g=(float) getGreen(pixel);
1165  float b=(float) getBlue(pixel);
1166 
1167  float tmin=MagickMin(MagickMin(r,g),b);
1168  float tmax= MagickMax(MagickMax(r,g),b);
1169 
1170  if (tmax!=0.0f) {
1171  float delta=tmax-tmin;
1172  HueSaturationBrightness.y=delta/tmax;
1173  HueSaturationBrightness.z=QuantumScale*tmax;
1174 
1175  if (delta != 0.0f) {
1176  HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f));
1177  HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta;
1178  HueSaturationBrightness.x/=6.0f;
1179  HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f;
1180  }
1181  }
1182  return HueSaturationBrightness;
1183  }
1184 
1185  inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) {
1186 
1187  float hue = HueSaturationBrightness.x;
1188  float brightness = HueSaturationBrightness.z;
1189  float saturation = HueSaturationBrightness.y;
1190 
1191  CLPixelType rgb;
1192 
1193  if (saturation == 0.0f) {
1194  setRed(&rgb,ClampToQuantum(QuantumRange*brightness));
1195  setGreen(&rgb,getRed(rgb));
1196  setBlue(&rgb,getRed(rgb));
1197  }
1198  else {
1199 
1200  float h=6.0f*(hue-floor(hue));
1201  float f=h-floor(h);
1202  float p=brightness*(1.0f-saturation);
1203  float q=brightness*(1.0f-saturation*f);
1204  float t=brightness*(1.0f-(saturation*(1.0f-f)));
1205 
1206  float clampedBrightness = ClampToQuantum(QuantumRange*brightness);
1207  float clamped_t = ClampToQuantum(QuantumRange*t);
1208  float clamped_p = ClampToQuantum(QuantumRange*p);
1209  float clamped_q = ClampToQuantum(QuantumRange*q);
1210  int ih = (int)h;
1211  setRed(&rgb, (ih == 1)?clamped_q:
1212  (ih == 2 || ih == 3)?clamped_p:
1213  (ih == 4)?clamped_t:
1214  clampedBrightness);
1215 
1216  setGreen(&rgb, (ih == 1 || ih == 2)?clampedBrightness:
1217  (ih == 3)?clamped_q:
1218  (ih == 4 || ih == 5)?clamped_p:
1219  clamped_t);
1220 
1221  setBlue(&rgb, (ih == 2)?clamped_t:
1222  (ih == 3 || ih == 4)?clampedBrightness:
1223  (ih == 5)?clamped_q:
1224  clamped_p);
1225  }
1226  return rgb;
1227  }
1228 
1229  __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen)
1230  {
1231 
1232  const int sign = sharpen!=0?1:-1;
1233  const int x = get_global_id(0);
1234  const int y = get_global_id(1);
1235  const int columns = get_global_size(0);
1236  const int c = x + y * columns;
1237 
1238  CLPixelType pixel = im[c];
1239  float3 HueSaturationBrightness = ConvertRGBToHSB(pixel);
1240  float brightness = HueSaturationBrightness.z;
1241  brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
1242  brightness = clamp(brightness,0.0f,1.0f);
1243  HueSaturationBrightness.z = brightness;
1244 
1245  CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness);
1246  filteredPixel.w = pixel.w;
1247  im[c] = filteredPixel;
1248  }
1249  )
1250 
1251 /*
1252 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1253 % %
1254 % %
1255 % %
1256 % C o n t r a s t S t r e t c h %
1257 % %
1258 % %
1259 % %
1260 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1261 */
1262 
1263  STRINGIFY(
1264  /*
1265  */
1266  __kernel void Histogram(__global CLPixelType * restrict im,
1267  const ChannelType channel,
1268  const int method,
1269  const int colorspace,
1270  __global uint4 * restrict histogram)
1271  {
1272  const int x = get_global_id(0);
1273  const int y = get_global_id(1);
1274  const int columns = get_global_size(0);
1275  const int c = x + y * columns;
1276  if ((channel & SyncChannels) != 0)
1277  {
1278  float intensity = GetPixelIntensity(method, colorspace,im[c]);
1279  uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
1280  atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
1281  }
1282  else
1283  {
1284  // for equalizing, we always need all channels?
1285  // otherwise something more
1286  }
1287  }
1288  )
1289 
1290  STRINGIFY(
1291  /*
1292  */
1293  __kernel void ContrastStretch(__global CLPixelType * restrict im,
1294  const ChannelType channel,
1295  __global CLPixelType * restrict stretch_map,
1296  const float4 white, const float4 black)
1297  {
1298  const int x = get_global_id(0);
1299  const int y = get_global_id(1);
1300  const int columns = get_global_size(0);
1301  const int c = x + y * columns;
1302 
1303  uint ePos;
1304  CLPixelType oValue, eValue;
1305  CLQuantum red, green, blue, opacity;
1306 
1307  //read from global
1308  oValue=im[c];
1309 
1310  if ((channel & RedChannel) != 0)
1311  {
1312  if (getRedF4(white) != getRedF4(black))
1313  {
1314  ePos = ScaleQuantumToMap(getRed(oValue));
1315  eValue = stretch_map[ePos];
1316  red = getRed(eValue);
1317  }
1318  }
1319 
1320  if ((channel & GreenChannel) != 0)
1321  {
1322  if (getGreenF4(white) != getGreenF4(black))
1323  {
1324  ePos = ScaleQuantumToMap(getGreen(oValue));
1325  eValue = stretch_map[ePos];
1326  green = getGreen(eValue);
1327  }
1328  }
1329 
1330  if ((channel & BlueChannel) != 0)
1331  {
1332  if (getBlueF4(white) != getBlueF4(black))
1333  {
1334  ePos = ScaleQuantumToMap(getBlue(oValue));
1335  eValue = stretch_map[ePos];
1336  blue = getBlue(eValue);
1337  }
1338  }
1339 
1340  if ((channel & OpacityChannel) != 0)
1341  {
1342  if (getOpacityF4(white) != getOpacityF4(black))
1343  {
1344  ePos = ScaleQuantumToMap(getOpacity(oValue));
1345  eValue = stretch_map[ePos];
1346  opacity = getOpacity(eValue);
1347  }
1348  }
1349 
1350  //write back
1351  im[c]=(CLPixelType)(blue, green, red, opacity);
1352 
1353  }
1354  )
1355 
1356 /*
1357 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1358 % %
1359 % %
1360 % %
1361 % C o n v o l v e %
1362 % %
1363 % %
1364 % %
1365 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1366 */
1367 
1368  STRINGIFY(
1369  __kernel
1370  void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
1371  const unsigned int imageWidth, const unsigned int imageHeight,
1372  __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1373  const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
1374 
1375  int2 blockID;
1376  blockID.x = get_group_id(0);
1377  blockID.y = get_group_id(1);
1378 
1379  // image area processed by this workgroup
1380  int2 imageAreaOrg;
1381  imageAreaOrg.x = blockID.x * get_local_size(0);
1382  imageAreaOrg.y = blockID.y * get_local_size(1);
1383 
1384  int2 midFilterDimen;
1385  midFilterDimen.x = (filterWidth-1)/2;
1386  midFilterDimen.y = (filterHeight-1)/2;
1387 
1388  int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
1389 
1390  // dimension of the local cache
1391  int2 cachedAreaDimen;
1392  cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
1393  cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
1394 
1395  // cache the pixels accessed by this workgroup in local memory
1396  int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
1397  int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
1398  int groupSize = get_local_size(0) * get_local_size(1);
1399  for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
1400 
1401  int2 cachedAreaIndex;
1402  cachedAreaIndex.x = i % cachedAreaDimen.x;
1403  cachedAreaIndex.y = i / cachedAreaDimen.x;
1404 
1405  int2 imagePixelIndex;
1406  imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
1407 
1408  // only support EdgeVirtualPixelMethod through ClampToCanvas
1409  // TODO: implement other virtual pixel method
1410  imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
1411  imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
1412 
1413  pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
1414  }
1415 
1416  // cache the filter
1417  for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
1418  filterCache[i] = filter[i];
1419  }
1420  barrier(CLK_LOCAL_MEM_FENCE);
1421 
1422 
1423  int2 imageIndex;
1424  imageIndex.x = imageAreaOrg.x + get_local_id(0);
1425  imageIndex.y = imageAreaOrg.y + get_local_id(1);
1426 
1427  // if out-of-range, stops here and quit
1428  if (imageIndex.x >= imageWidth
1429  || imageIndex.y >= imageHeight) {
1430  return;
1431  }
1432 
1433  int filterIndex = 0;
1434  float4 sum = (float4)0.0f;
1435  float gamma = 0.0f;
1436  if (((channel & OpacityChannel) == 0) || (matte == 0)) {
1437  int cacheIndexY = get_local_id(1);
1438  for (int j = 0; j < filterHeight; j++) {
1439  int cacheIndexX = get_local_id(0);
1440  for (int i = 0; i < filterWidth; i++) {
1441  CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1442  float f = filterCache[filterIndex];
1443 
1444  sum.x += f * p.x;
1445  sum.y += f * p.y;
1446  sum.z += f * p.z;
1447  sum.w += f * p.w;
1448 
1449  gamma += f;
1450  filterIndex++;
1451  cacheIndexX++;
1452  }
1453  cacheIndexY++;
1454  }
1455  }
1456  else {
1457  int cacheIndexY = get_local_id(1);
1458  for (int j = 0; j < filterHeight; j++) {
1459  int cacheIndexX = get_local_id(0);
1460  for (int i = 0; i < filterWidth; i++) {
1461 
1462  CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1463  float alpha = QuantumScale*(QuantumRange-p.w);
1464  float f = filterCache[filterIndex];
1465  float g = alpha * f;
1466 
1467  sum.x += g*p.x;
1468  sum.y += g*p.y;
1469  sum.z += g*p.z;
1470  sum.w += f*p.w;
1471 
1472  gamma += g;
1473  filterIndex++;
1474  cacheIndexX++;
1475  }
1476  cacheIndexY++;
1477  }
1478  gamma = PerceptibleReciprocal(gamma);
1479  sum.xyz = gamma*sum.xyz;
1480  }
1481  CLPixelType outputPixel;
1482  outputPixel.x = ClampToQuantum(sum.x);
1483  outputPixel.y = ClampToQuantum(sum.y);
1484  outputPixel.z = ClampToQuantum(sum.z);
1485  outputPixel.w = ((channel & OpacityChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1486 
1487  output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1488  }
1489  )
1490 
1491  STRINGIFY(
1492  __kernel
1493  void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
1494  const uint imageWidth, const uint imageHeight,
1495  __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1496  const uint matte, const ChannelType channel) {
1497 
1498  int2 imageIndex;
1499  imageIndex.x = get_global_id(0);
1500  imageIndex.y = get_global_id(1);
1501 
1502  /*
1503  unsigned int imageWidth = get_global_size(0);
1504  unsigned int imageHeight = get_global_size(1);
1505  */
1506  if (imageIndex.x >= imageWidth
1507  || imageIndex.y >= imageHeight)
1508  return;
1509 
1510  int2 midFilterDimen;
1511  midFilterDimen.x = (filterWidth-1)/2;
1512  midFilterDimen.y = (filterHeight-1)/2;
1513 
1514  int filterIndex = 0;
1515  float4 sum = (float4)0.0f;
1516  float gamma = 0.0f;
1517  if (((channel & OpacityChannel) == 0) || (matte == 0)) {
1518  for (int j = 0; j < filterHeight; j++) {
1519  int2 inputPixelIndex;
1520  inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1521  inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1522  for (int i = 0; i < filterWidth; i++) {
1523  inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1524  inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1525 
1526  CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1527  float f = filter[filterIndex];
1528 
1529  sum.x += f * p.x;
1530  sum.y += f * p.y;
1531  sum.z += f * p.z;
1532  sum.w += f * p.w;
1533 
1534  gamma += f;
1535 
1536  filterIndex++;
1537  }
1538  }
1539  }
1540  else {
1541 
1542  for (int j = 0; j < filterHeight; j++) {
1543  int2 inputPixelIndex;
1544  inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1545  inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1546  for (int i = 0; i < filterWidth; i++) {
1547  inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1548  inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1549 
1550  CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1551  float alpha = QuantumScale*(QuantumRange-p.w);
1552  float f = filter[filterIndex];
1553  float g = alpha * f;
1554 
1555  sum.x += g*p.x;
1556  sum.y += g*p.y;
1557  sum.z += g*p.z;
1558  sum.w += f*p.w;
1559 
1560  gamma += g;
1561 
1562 
1563  filterIndex++;
1564  }
1565  }
1566  gamma = PerceptibleReciprocal(gamma);
1567  sum.xyz = gamma*sum.xyz;
1568  }
1569 
1570  CLPixelType outputPixel;
1571  outputPixel.x = ClampToQuantum(sum.x);
1572  outputPixel.y = ClampToQuantum(sum.y);
1573  outputPixel.z = ClampToQuantum(sum.z);
1574  outputPixel.w = ((channel & OpacityChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1575 
1576  output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1577  }
1578  )
1579 
1580 /*
1581 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1582 % %
1583 % %
1584 % %
1585 % D e s p e c k l e %
1586 % %
1587 % %
1588 % %
1589 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1590 */
1591 
1592  STRINGIFY(
1593 
1594  __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1595  , const unsigned int imageWidth, const unsigned int imageHeight
1596  , const int2 offset, const int polarity, const int matte) {
1597 
1598  int x = get_global_id(0);
1599  int y = get_global_id(1);
1600 
1601  CLPixelType v = inputImage[y*imageWidth+x];
1602 
1603  int2 neighbor;
1604  neighbor.y = y + offset.y;
1605  neighbor.x = x + offset.x;
1606 
1607  int2 clampedNeighbor;
1608  clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1609  clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1610 
1611  CLPixelType r = (clampedNeighbor.x == neighbor.x
1612  && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1613  :(CLPixelType)0;
1614 
1615  int sv[4];
1616  sv[0] = (int)v.x;
1617  sv[1] = (int)v.y;
1618  sv[2] = (int)v.z;
1619  sv[3] = (int)v.w;
1620 
1621  int sr[4];
1622  sr[0] = (int)r.x;
1623  sr[1] = (int)r.y;
1624  sr[2] = (int)r.z;
1625  sr[3] = (int)r.w;
1626 
1627  if (polarity > 0) {
1628  \n #pragma unroll 4\n
1629  for (unsigned int i = 0; i < 4; i++) {
1630  sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
1631  }
1632  }
1633  else {
1634  \n #pragma unroll 4\n
1635  for (unsigned int i = 0; i < 4; i++) {
1636  sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
1637  }
1638 
1639  }
1640 
1641  v.x = (CLQuantum)sv[0];
1642  v.y = (CLQuantum)sv[1];
1643  v.z = (CLQuantum)sv[2];
1644 
1645  if (matte!=0)
1646  v.w = (CLQuantum)sv[3];
1647 
1648  outputImage[y*imageWidth+x] = v;
1649 
1650  }
1651 
1652 
1653  )
1654 
1655 
1656 
1657  STRINGIFY(
1658 
1659  __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1660  , const unsigned int imageWidth, const unsigned int imageHeight
1661  , const int2 offset, const int polarity, const int matte) {
1662 
1663  int x = get_global_id(0);
1664  int y = get_global_id(1);
1665 
1666  CLPixelType v = inputImage[y*imageWidth+x];
1667 
1668  int2 neighbor, clampedNeighbor;
1669 
1670  neighbor.y = y + offset.y;
1671  neighbor.x = x + offset.x;
1672  clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1673  clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1674 
1675  CLPixelType r = (clampedNeighbor.x == neighbor.x
1676  && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1677  :(CLPixelType)0;
1678 
1679 
1680  neighbor.y = y - offset.y;
1681  neighbor.x = x - offset.x;
1682  clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1683  clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1684 
1685  CLPixelType s = (clampedNeighbor.x == neighbor.x
1686  && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1687  :(CLPixelType)0;
1688 
1689 
1690  int sv[4];
1691  sv[0] = (int)v.x;
1692  sv[1] = (int)v.y;
1693  sv[2] = (int)v.z;
1694  sv[3] = (int)v.w;
1695 
1696  int sr[4];
1697  sr[0] = (int)r.x;
1698  sr[1] = (int)r.y;
1699  sr[2] = (int)r.z;
1700  sr[3] = (int)r.w;
1701 
1702  int ss[4];
1703  ss[0] = (int)s.x;
1704  ss[1] = (int)s.y;
1705  ss[2] = (int)s.z;
1706  ss[3] = (int)s.w;
1707 
1708  if (polarity > 0) {
1709  \n #pragma unroll 4\n
1710  for (unsigned int i = 0; i < 4; i++) {
1711  //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] ) ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
1712  //
1713  //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1714  //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1715  sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1716  }
1717  }
1718  else {
1719  \n #pragma unroll 4\n
1720  for (unsigned int i = 0; i < 4; i++) {
1721  //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] ) ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
1722  //
1723  //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1724  sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1725  }
1726  }
1727 
1728  v.x = (CLQuantum)sv[0];
1729  v.y = (CLQuantum)sv[1];
1730  v.z = (CLQuantum)sv[2];
1731 
1732  if (matte!=0)
1733  v.w = (CLQuantum)sv[3];
1734 
1735  outputImage[y*imageWidth+x] = v;
1736 
1737  }
1738 
1739 
1740  )
1741 
1742 /*
1743 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1744 % %
1745 % %
1746 % %
1747 % E q u a l i z e %
1748 % %
1749 % %
1750 % %
1751 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1752 */
1753 
1754  STRINGIFY(
1755  /*
1756  */
1757  __kernel void Equalize(__global CLPixelType * restrict im,
1758  const ChannelType channel,
1759  __global CLPixelType * restrict equalize_map,
1760  const float4 white, const float4 black)
1761  {
1762  const int x = get_global_id(0);
1763  const int y = get_global_id(1);
1764  const int columns = get_global_size(0);
1765  const int c = x + y * columns;
1766 
1767  uint ePos;
1768  CLPixelType oValue, eValue;
1769  CLQuantum red, green, blue, opacity;
1770 
1771  //read from global
1772  oValue=im[c];
1773 
1774  if ((channel & SyncChannels) != 0)
1775  {
1776  if (getRedF4(white) != getRedF4(black))
1777  {
1778  ePos = ScaleQuantumToMap(getRed(oValue));
1779  eValue = equalize_map[ePos];
1780  red = getRed(eValue);
1781  ePos = ScaleQuantumToMap(getGreen(oValue));
1782  eValue = equalize_map[ePos];
1783  green = getRed(eValue);
1784  ePos = ScaleQuantumToMap(getBlue(oValue));
1785  eValue = equalize_map[ePos];
1786  blue = getRed(eValue);
1787  ePos = ScaleQuantumToMap(getOpacity(oValue));
1788  eValue = equalize_map[ePos];
1789  opacity = getRed(eValue);
1790 
1791  //write back
1792  im[c]=(CLPixelType)(blue, green, red, opacity);
1793  }
1794 
1795  }
1796 
1797  // for equalizing, we always need all channels?
1798  // otherwise something more
1799 
1800  }
1801  )
1802 
1803 /*
1804 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1805 % %
1806 % %
1807 % %
1808 % F u n c t i o n %
1809 % %
1810 % %
1811 % %
1812 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1813 */
1814 
1815  STRINGIFY(
1816 
1817  /*
1818  apply FunctionImageChannel(braightness-contrast)
1819  */
1820  CLPixelType ApplyFunction(CLPixelType pixel,const MagickFunction function,
1821  const unsigned int number_parameters,
1822  __constant float *parameters)
1823  {
1824  float4 result = (float4) 0.0f;
1825  switch (function)
1826  {
1827  case PolynomialFunction:
1828  {
1829  for (unsigned int i=0; i < number_parameters; i++)
1830  result = result*(float4)QuantumScale*convert_float4(pixel) + parameters[i];
1831  result *= (float4)QuantumRange;
1832  break;
1833  }
1834  case SinusoidFunction:
1835  {
1836  float freq,phase,ampl,bias;
1837  freq = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1838  phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
1839  ampl = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
1840  bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1841  result.x = QuantumRange*(ampl*sin(2.0f*MagickPI*
1842  (freq*QuantumScale*(float)pixel.x + phase/360.0f)) + bias);
1843  result.y = QuantumRange*(ampl*sin(2.0f*MagickPI*
1844  (freq*QuantumScale*(float)pixel.y + phase/360.0f)) + bias);
1845  result.z = QuantumRange*(ampl*sin(2.0f*MagickPI*
1846  (freq*QuantumScale*(float)pixel.z + phase/360.0f)) + bias);
1847  result.w = QuantumRange*(ampl*sin(2.0f*MagickPI*
1848  (freq*QuantumScale*(float)pixel.w + phase/360.0f)) + bias);
1849  break;
1850  }
1851  case ArcsinFunction:
1852  {
1853  float width,range,center,bias;
1854  width = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1855  center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1856  range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1857  bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1858 
1859  result.x = 2.0f/width*(QuantumScale*(float)pixel.x - center);
1860  result.x = range/MagickPI*asin(result.x)+bias;
1861  result.x = ( result.x <= -1.0f ) ? bias - range/2.0f : result.x;
1862  result.x = ( result.x >= 1.0f ) ? bias + range/2.0f : result.x;
1863 
1864  result.y = 2.0f/width*(QuantumScale*(float)pixel.y - center);
1865  result.y = range/MagickPI*asin(result.y)+bias;
1866  result.y = ( result.y <= -1.0f ) ? bias - range/2.0f : result.y;
1867  result.y = ( result.y >= 1.0f ) ? bias + range/2.0f : result.y;
1868 
1869  result.z = 2.0f/width*(QuantumScale*(float)pixel.z - center);
1870  result.z = range/MagickPI*asin(result.z)+bias;
1871  result.z = ( result.z <= -1.0f ) ? bias - range/2.0f : result.x;
1872  result.z = ( result.z >= 1.0f ) ? bias + range/2.0f : result.x;
1873 
1874 
1875  result.w = 2.0f/width*(QuantumScale*(float)pixel.w - center);
1876  result.w = range/MagickPI*asin(result.w)+bias;
1877  result.w = ( result.w <= -1.0f ) ? bias - range/2.0f : result.w;
1878  result.w = ( result.w >= 1.0f ) ? bias + range/2.0f : result.w;
1879 
1880  result *= (float4)QuantumRange;
1881  break;
1882  }
1883  case ArctanFunction:
1884  {
1885  float slope,range,center,bias;
1886  slope = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1887  center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1888  range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1889  bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1890  result = (float4)MagickPI*(float4)slope*((float4)QuantumScale*convert_float4(pixel)-(float4)center);
1891  result = (float4)QuantumRange*((float4)range/(float4)MagickPI*atan(result) + (float4)bias);
1892  break;
1893  }
1894  case UndefinedFunction:
1895  break;
1896  }
1897  return (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
1898  ClampToQuantum(result.z), ClampToQuantum(result.w));
1899  }
1900  )
1901 
1902  STRINGIFY(
1903  /*
1904  Improve brightness / contrast of the image
1905  channel : define which channel is improved
1906  function : the function called to enchance the brightness contrast
1907  number_parameters : numbers of parameters
1908  parameters : the parameter
1909  */
1910  __kernel void ComputeFunction(__global CLPixelType *im,
1911  const ChannelType channel, const MagickFunction function,
1912  const unsigned int number_parameters, __constant float *parameters)
1913  {
1914  const int x = get_global_id(0);
1915  const int y = get_global_id(1);
1916  const int columns = get_global_size(0);
1917  const int c = x + y * columns;
1918  im[c] = ApplyFunction(im[c], function, number_parameters, parameters);
1919  }
1920  )
1921 
1922 /*
1923 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1924 % %
1925 % %
1926 % %
1927 % G r a y s c a l e %
1928 % %
1929 % %
1930 % %
1931 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1932 */
1933 
1934  STRINGIFY(
1935  __kernel void Grayscale(__global CLPixelType *im,
1936  const int method, const int colorspace)
1937  {
1938 
1939  const int x = get_global_id(0);
1940  const int y = get_global_id(1);
1941  const int columns = get_global_size(0);
1942  const int c = x + y * columns;
1943 
1944  CLPixelType pixel = im[c];
1945 
1946  float
1947  blue,
1948  green,
1949  intensity,
1950  red;
1951 
1952  red=(float)getRed(pixel);
1953  green=(float)getGreen(pixel);
1954  blue=(float)getBlue(pixel);
1955 
1956  intensity=0.0;
1957 
1958  CLPixelType filteredPixel;
1959 
1960  switch (method)
1961  {
1963  {
1964  intensity=(red+green+blue)/3.0;
1965  break;
1966  }
1968  {
1969  intensity=MagickMax(MagickMax(red,green),blue);
1970  break;
1971  }
1973  {
1974  intensity=(MagickMin(MagickMin(red,green),blue)+
1975  MagickMax(MagickMax(red,green),blue))/2.0;
1976  break;
1977  }
1979  {
1980  intensity=(float) (((float) red*red+green*green+
1981  blue*blue)/(3.0*QuantumRange));
1982  break;
1983  }
1985  {
1986  /*
1987  if (colorspace == RGBColorspace)
1988  {
1989  red=EncodePixelGamma(red);
1990  green=EncodePixelGamma(green);
1991  blue=EncodePixelGamma(blue);
1992  }
1993  */
1994  intensity=0.298839*red+0.586811*green+0.114350*blue;
1995  break;
1996  }
1998  {
1999  /*
2000  if (image->colorspace == sRGBColorspace)
2001  {
2002  red=DecodePixelGamma(red);
2003  green=DecodePixelGamma(green);
2004  blue=DecodePixelGamma(blue);
2005  }
2006  */
2007  intensity=0.298839*red+0.586811*green+0.114350*blue;
2008  break;
2009  }
2011  default:
2012  {
2013  /*
2014  if (image->colorspace == RGBColorspace)
2015  {
2016  red=EncodePixelGamma(red);
2017  green=EncodePixelGamma(green);
2018  blue=EncodePixelGamma(blue);
2019  }
2020  */
2021  intensity=0.212656*red+0.715158*green+0.072186*blue;
2022  break;
2023  }
2025  {
2026  /*
2027  if (image->colorspace == sRGBColorspace)
2028  {
2029  red=DecodePixelGamma(red);
2030  green=DecodePixelGamma(green);
2031  blue=DecodePixelGamma(blue);
2032  }
2033  */
2034  intensity=0.212656*red+0.715158*green+0.072186*blue;
2035  break;
2036  }
2038  {
2039  intensity=(float) (sqrt((float) red*red+green*green+
2040  blue*blue)/sqrt(3.0));
2041  break;
2042  }
2043 
2044  }
2045 
2046  setGray(&filteredPixel, ClampToQuantum(intensity));
2047 
2048  filteredPixel.w = pixel.w;
2049 
2050  im[c] = filteredPixel;
2051  }
2052  )
2053 
2054 /*
2055 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2056 % %
2057 % %
2058 % %
2059 % L o c a l C o n t r a s t %
2060 % %
2061 % %
2062 % %
2063 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2064 */
2065 
2066  STRINGIFY(
2067  inline int mirrorBottom(int value)
2068  {
2069  return (value < 0) ? - (value) : value;
2070  }
2071  inline int mirrorTop(int value, int width)
2072  {
2073  return (value >= width) ? (2 * width - value - 1) : value;
2074  }
2075 
2076  __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
2077  const int radius,
2078  const int imageWidth,
2079  const int imageHeight)
2080  {
2081  const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
2082 
2083  int x = get_local_id(0);
2084  int y = get_global_id(1);
2085 
2086  if ((x >= imageWidth) || (y >= imageHeight))
2087  return;
2088 
2089  global CLPixelType *src = srcImage + y * imageWidth;
2090 
2091  for (int i = x; i < imageWidth; i += get_local_size(0)) {
2092  float sum = 0.0f;
2093  float weight = 1.0f;
2094 
2095  int j = i - radius;
2096  while ((j + 7) < i) {
2097  for (int k = 0; k < 8; ++k) // Unroll 8x
2098  sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
2099  weight += 8.0f;
2100  j+=8;
2101  }
2102  while (j < i) {
2103  sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
2104  weight += 1.0f;
2105  ++j;
2106  }
2107 
2108  while ((j + 7) < radius + i) {
2109  for (int k = 0; k < 8; ++k) // Unroll 8x
2110  sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
2111  weight -= 8.0f;
2112  j+=8;
2113  }
2114  while (j < radius + i) {
2115  sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
2116  weight -= 1.0f;
2117  ++j;
2118  }
2119 
2120  tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
2121  }
2122  }
2123  )
2124 
2125  STRINGIFY(
2126  __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
2127  const int radius,
2128  const float strength,
2129  const int imageWidth,
2130  const int imageHeight)
2131  {
2132  const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
2133 
2134  int x = get_global_id(0);
2135  int y = get_global_id(1);
2136 
2137  if ((x >= imageWidth) || (y >= imageHeight))
2138  return;
2139 
2140  global float *src = blurImage + x;
2141 
2142  float sum = 0.0f;
2143  float weight = 1.0f;
2144 
2145  int j = y - radius;
2146  while ((j + 7) < y) {
2147  for (int k = 0; k < 8; ++k) // Unroll 8x
2148  sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
2149  weight += 8.0f;
2150  j+=8;
2151  }
2152  while (j < y) {
2153  sum += weight * src[mirrorBottom(j) * imageWidth];
2154  weight += 1.0f;
2155  ++j;
2156  }
2157 
2158  while ((j + 7) < radius + y) {
2159  for (int k = 0; k < 8; ++k) // Unroll 8x
2160  sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
2161  weight -= 8.0f;
2162  j+=8;
2163  }
2164  while (j < radius + y) {
2165  sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
2166  weight -= 1.0f;
2167  ++j;
2168  }
2169 
2170  CLPixelType pixel = srcImage[x + y * imageWidth];
2171  float srcVal = dot(RGB, convert_float4(pixel));
2172  float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
2173  mult = (srcVal + mult) / srcVal;
2174 
2175  pixel.x = ClampToQuantum(pixel.x * mult);
2176  pixel.y = ClampToQuantum(pixel.y * mult);
2177  pixel.z = ClampToQuantum(pixel.z * mult);
2178 
2179  dstImage[x + y * imageWidth] = pixel;
2180  }
2181  )
2182 
2183 /*
2184 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2185 % %
2186 % %
2187 % %
2188 % M o d u l a t e %
2189 % %
2190 % %
2191 % %
2192 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2193 */
2194 
2195  STRINGIFY(
2196 
2197  inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
2198  float *hue, float *saturation, float *lightness)
2199  {
2200  float
2201  c,
2202  tmax,
2203  tmin;
2204 
2205  /*
2206  Convert RGB to HSL colorspace.
2207  */
2208  tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
2209  tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
2210 
2211  c=tmax-tmin;
2212 
2213  *lightness=(tmax+tmin)/2.0;
2214  if (c <= 0.0)
2215  {
2216  *hue=0.0;
2217  *saturation=0.0;
2218  return;
2219  }
2220 
2221  if (tmax == (QuantumScale*red))
2222  {
2223  *hue=(QuantumScale*green-QuantumScale*blue)/c;
2224  if ((QuantumScale*green) < (QuantumScale*blue))
2225  *hue+=6.0;
2226  }
2227  else
2228  if (tmax == (QuantumScale*green))
2229  *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
2230  else
2231  *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
2232 
2233  *hue*=60.0/360.0;
2234  if (*lightness <= 0.5)
2235  *saturation=c/(2.0*(*lightness));
2236  else
2237  *saturation=c/(2.0-2.0*(*lightness));
2238  }
2239 
2240  inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
2241  CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2242  {
2243  float
2244  b,
2245  c,
2246  g,
2247  h,
2248  tmin,
2249  r,
2250  x;
2251 
2252  /*
2253  Convert HSL to RGB colorspace.
2254  */
2255  h=hue*360.0;
2256  if (lightness <= 0.5)
2257  c=2.0*lightness*saturation;
2258  else
2259  c=(2.0-2.0*lightness)*saturation;
2260  tmin=lightness-0.5*c;
2261  h-=360.0*floor(h/360.0);
2262  h/=60.0;
2263  x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
2264  switch ((int) floor(h) % 6)
2265  {
2266  case 0:
2267  {
2268  r=tmin+c;
2269  g=tmin+x;
2270  b=tmin;
2271  break;
2272  }
2273  case 1:
2274  {
2275  r=tmin+x;
2276  g=tmin+c;
2277  b=tmin;
2278  break;
2279  }
2280  case 2:
2281  {
2282  r=tmin;
2283  g=tmin+c;
2284  b=tmin+x;
2285  break;
2286  }
2287  case 3:
2288  {
2289  r=tmin;
2290  g=tmin+x;
2291  b=tmin+c;
2292  break;
2293  }
2294  case 4:
2295  {
2296  r=tmin+x;
2297  g=tmin;
2298  b=tmin+c;
2299  break;
2300  }
2301  case 5:
2302  {
2303  r=tmin+c;
2304  g=tmin;
2305  b=tmin+x;
2306  break;
2307  }
2308  default:
2309  {
2310  r=0.0;
2311  g=0.0;
2312  b=0.0;
2313  }
2314  }
2315  *red=ClampToQuantum(QuantumRange*r);
2316  *green=ClampToQuantum(QuantumRange*g);
2317  *blue=ClampToQuantum(QuantumRange*b);
2318  }
2319 
2320  inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness,
2321  CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2322  {
2323  float
2324  hue,
2325  lightness,
2326  saturation;
2327 
2328  /*
2329  Increase or decrease color lightness, saturation, or hue.
2330  */
2331  ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
2332  hue+=0.5*(0.01*percent_hue-1.0);
2333  while (hue < 0.0)
2334  hue+=1.0;
2335  while (hue >= 1.0)
2336  hue-=1.0;
2337  saturation*=0.01*percent_saturation;
2338  lightness*=0.01*percent_lightness;
2339  ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
2340  }
2341 
2342  __kernel void Modulate(__global CLPixelType *im,
2343  const float percent_brightness,
2344  const float percent_hue,
2345  const float percent_saturation,
2346  const int colorspace)
2347  {
2348 
2349  const int x = get_global_id(0);
2350  const int y = get_global_id(1);
2351  const int columns = get_global_size(0);
2352  const int c = x + y * columns;
2353 
2354  CLPixelType pixel = im[c];
2355 
2356  CLQuantum
2357  blue,
2358  green,
2359  red;
2360 
2361  red=getRed(pixel);
2362  green=getGreen(pixel);
2363  blue=getBlue(pixel);
2364 
2365  switch (colorspace)
2366  {
2367  case HSLColorspace:
2368  default:
2369  {
2370  ModulateHSL(percent_hue, percent_saturation, percent_brightness,
2371  &red, &green, &blue);
2372  }
2373 
2374  }
2375 
2376  CLPixelType filteredPixel;
2377 
2378  setRed(&filteredPixel, red);
2379  setGreen(&filteredPixel, green);
2380  setBlue(&filteredPixel, blue);
2381  filteredPixel.w = pixel.w;
2382 
2383  im[c] = filteredPixel;
2384  }
2385  )
2386 
2387 /*
2388 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2389 % %
2390 % %
2391 % %
2392 % M o t i o n B l u r %
2393 % %
2394 % %
2395 % %
2396 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2397 */
2398 
2399  STRINGIFY(
2400  __kernel
2401  void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
2402  const unsigned int imageWidth, const unsigned int imageHeight,
2403  const __global float *filter, const unsigned int width, const __global int2* offset,
2404  const float4 bias,
2405  const ChannelType channel, const unsigned int matte) {
2406 
2407  int2 currentPixel;
2408  currentPixel.x = get_global_id(0);
2409  currentPixel.y = get_global_id(1);
2410 
2411  if (currentPixel.x >= imageWidth
2412  || currentPixel.y >= imageHeight)
2413  return;
2414 
2415  float4 pixel;
2416  pixel.x = (float)bias.x;
2417  pixel.y = (float)bias.y;
2418  pixel.z = (float)bias.z;
2419  pixel.w = (float)bias.w;
2420 
2421  if (((channel & OpacityChannel) == 0) || (matte == 0)) {
2422 
2423  for (int i = 0; i < width; i++) {
2424  // only support EdgeVirtualPixelMethod through ClampToCanvas
2425  // TODO: implement other virtual pixel method
2426  int2 samplePixel = currentPixel + offset[i];
2427  samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2428  samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2429  CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2430 
2431  pixel.x += (filter[i] * (float)samplePixelValue.x);
2432  pixel.y += (filter[i] * (float)samplePixelValue.y);
2433  pixel.z += (filter[i] * (float)samplePixelValue.z);
2434  pixel.w += (filter[i] * (float)samplePixelValue.w);
2435  }
2436 
2437  CLPixelType outputPixel;
2438  outputPixel.x = ClampToQuantum(pixel.x);
2439  outputPixel.y = ClampToQuantum(pixel.y);
2440  outputPixel.z = ClampToQuantum(pixel.z);
2441  outputPixel.w = ClampToQuantum(pixel.w);
2442  output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2443  }
2444  else {
2445 
2446  float gamma = 0.0f;
2447  for (int i = 0; i < width; i++) {
2448  // only support EdgeVirtualPixelMethod through ClampToCanvas
2449  // TODO: implement other virtual pixel method
2450  int2 samplePixel = currentPixel + offset[i];
2451  samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2452  samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2453 
2454  CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2455 
2456  float alpha = QuantumScale*(QuantumRange-samplePixelValue.w);
2457  float k = filter[i];
2458  pixel.x = pixel.x + k * alpha * samplePixelValue.x;
2459  pixel.y = pixel.y + k * alpha * samplePixelValue.y;
2460  pixel.z = pixel.z + k * alpha * samplePixelValue.z;
2461 
2462  pixel.w += k * alpha * samplePixelValue.w;
2463 
2464  gamma+=k*alpha;
2465  }
2466  gamma = PerceptibleReciprocal(gamma);
2467  pixel.xyz = gamma*pixel.xyz;
2468 
2469  CLPixelType outputPixel;
2470  outputPixel.x = ClampToQuantum(pixel.x);
2471  outputPixel.y = ClampToQuantum(pixel.y);
2472  outputPixel.z = ClampToQuantum(pixel.z);
2473  outputPixel.w = ClampToQuantum(pixel.w);
2474  output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2475  }
2476  }
2477  )
2478 
2479 /*
2480 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2481 % %
2482 % %
2483 % %
2484 % R a d i a l B l u r %
2485 % %
2486 % %
2487 % %
2488 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2489 */
2490 
2491  STRINGIFY(
2492  __kernel void RadialBlur(const __global CLPixelType *im, __global CLPixelType *filtered_im,
2493  const float4 bias,
2494  const unsigned int channel, const unsigned int matte,
2495  const float2 blurCenter,
2496  __constant float *cos_theta, __constant float *sin_theta,
2497  const unsigned int cossin_theta_size)
2498  {
2499  const int x = get_global_id(0);
2500  const int y = get_global_id(1);
2501  const int columns = get_global_size(0);
2502  const int rows = get_global_size(1);
2503  unsigned int step = 1;
2504  float center_x = (float) x - blurCenter.x;
2505  float center_y = (float) y - blurCenter.y;
2506  float radius = hypot(center_x, center_y);
2507 
2508  //float blur_radius = hypot((float) columns/2.0f, (float) rows/2.0f);
2509  float blur_radius = hypot(blurCenter.x, blurCenter.y);
2510 
2511  if (radius > MagickEpsilon)
2512  {
2513  step = (unsigned int) (blur_radius / radius);
2514  if (step == 0)
2515  step = 1;
2516  if (step >= cossin_theta_size)
2517  step = cossin_theta_size-1;
2518  }
2519 
2520  float4 result;
2521  result.x = (float)bias.x;
2522  result.y = (float)bias.y;
2523  result.z = (float)bias.z;
2524  result.w = (float)bias.w;
2525  float normalize = 0.0f;
2526 
2527  if (((channel & OpacityChannel) == 0) || (matte == 0)) {
2528  for (unsigned int i=0; i<cossin_theta_size; i+=step)
2529  {
2530  result += convert_float4(im[
2531  ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns)+
2532  ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f, rows)*columns]);
2533  normalize += 1.0f;
2534  }
2535  normalize = PerceptibleReciprocal(normalize);
2536  result = result * normalize;
2537  }
2538  else {
2539  float gamma = 0.0f;
2540  for (unsigned int i=0; i<cossin_theta_size; i+=step)
2541  {
2542  float4 p = convert_float4(im[
2543  ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns)+
2544  ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f, rows)*columns]);
2545 
2546  float alpha = (float)(QuantumScale*(QuantumRange-p.w));
2547  result.x += alpha * p.x;
2548  result.y += alpha * p.y;
2549  result.z += alpha * p.z;
2550  result.w += p.w;
2551  gamma+=alpha;
2552  normalize += 1.0f;
2553  }
2554  gamma = PerceptibleReciprocal(gamma);
2555  normalize = PerceptibleReciprocal(normalize);
2556  result.x = gamma*result.x;
2557  result.y = gamma*result.y;
2558  result.z = gamma*result.z;
2559  result.w = normalize*result.w;
2560  }
2561  filtered_im[y * columns + x] = (CLPixelType) (ClampToQuantum(result.x), ClampToQuantum(result.y),
2562  ClampToQuantum(result.z), ClampToQuantum(result.w));
2563  }
2564  )
2565 
2566 /*
2567 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2568 % %
2569 % %
2570 % %
2571 % R e s i z e %
2572 % %
2573 % %
2574 % %
2575 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2576 */
2577 
2578  STRINGIFY(
2579  // Based on Box from resize.c
2580  float BoxResizeFilter(const float x)
2581  {
2582  return 1.0f;
2583  }
2584  )
2585 
2586  STRINGIFY(
2587  // Based on CubicBC from resize.c
2588  float CubicBC(const float x,const __global float* resizeFilterCoefficients)
2589  {
2590  /*
2591  Cubic Filters using B,C determined values:
2592  Mitchell-Netravali B = 1/3 C = 1/3 "Balanced" cubic spline filter
2593  Catmull-Rom B = 0 C = 1/2 Interpolatory and exact on linears
2594  Spline B = 1 C = 0 B-Spline Gaussian approximation
2595  Hermite B = 0 C = 0 B-Spline interpolator
2596 
2597  See paper by Mitchell and Netravali, Reconstruction Filters in Computer
2598  Graphics Computer Graphics, Volume 22, Number 4, August 1988
2599  http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
2600  Mitchell.pdf.
2601 
2602  Coefficents are determined from B,C values:
2603  P0 = ( 6 - 2*B )/6 = coeff[0]
2604  P1 = 0
2605  P2 = (-18 +12*B + 6*C )/6 = coeff[1]
2606  P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
2607  Q0 = ( 8*B +24*C )/6 = coeff[3]
2608  Q1 = ( -12*B -48*C )/6 = coeff[4]
2609  Q2 = ( 6*B +30*C )/6 = coeff[5]
2610  Q3 = ( - 1*B - 6*C )/6 = coeff[6]
2611 
2612  which are used to define the filter:
2613 
2614  P0 + P1*x + P2*x^2 + P3*x^3 0 <= x < 1
2615  Q0 + Q1*x + Q2*x^2 + Q3*x^3 1 <= x < 2
2616 
2617  which ensures function is continuous in value and derivative (slope).
2618  */
2619  if (x < 1.0)
2620  return(resizeFilterCoefficients[0]+x*(x*
2621  (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
2622  if (x < 2.0)
2623  return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
2624  (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
2625  return(0.0);
2626  }
2627  )
2628 
2629  STRINGIFY(
2630  float Sinc(const float x)
2631  {
2632  if (x != 0.0f)
2633  {
2634  const float alpha=(float) (MagickPI*x);
2635  return sinpi(x)/alpha;
2636  }
2637  return(1.0f);
2638  }
2639  )
2640 
2641  STRINGIFY(
2642  float Triangle(const float x)
2643  {
2644  /*
2645  1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
2646  a Bartlett 2D Cone filter. Also used as a Bartlett Windowing function
2647  for Sinc().
2648  */
2649  return ((x<1.0f)?(1.0f-x):0.0f);
2650  }
2651  )
2652 
2653 
2654  STRINGIFY(
2655  float Hanning(const float x)
2656  {
2657  /*
2658  Cosine window function:
2659  0.5+0.5*cos(pi*x).
2660  */
2661  const float cosine=cos((MagickPI*x));
2662  return(0.5f+0.5f*cosine);
2663  }
2664  )
2665 
2666  STRINGIFY(
2667  float Hamming(const float x)
2668  {
2669  /*
2670  Offset cosine window function:
2671  .54 + .46 cos(pi x).
2672  */
2673  const float cosine=cos((MagickPI*x));
2674  return(0.54f+0.46f*cosine);
2675  }
2676  )
2677 
2678  STRINGIFY(
2679  float Blackman(const float x)
2680  {
2681  /*
2682  Blackman: 2nd order cosine windowing function:
2683  0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
2684 
2685  Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
2686  five flops.
2687  */
2688  const float cosine=cos((MagickPI*x));
2689  return(0.34f+cosine*(0.5f+cosine*0.16f));
2690  }
2691  )
2692 
2693 
2694 
2695 
2696  STRINGIFY(
2697  inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
2698  {
2699  switch (filterType)
2700  {
2701  /* Call Sinc even for SincFast to get better precision on GPU
2702  and to avoid thread divergence. Sinc is pretty fast on GPU anyway...*/
2703  case SincWeightingFunction:
2705  return Sinc(x);
2707  return CubicBC(x,filterCoefficients);
2708  case BoxWeightingFunction:
2709  return BoxResizeFilter(x);
2711  return Triangle(x);
2713  return Hanning(x);
2715  return Hamming(x);
2717  return Blackman(x);
2718 
2719  default:
2720  return 0.0f;
2721  }
2722  }
2723  )
2724 
2725 
2726  STRINGIFY(
2727  inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
2728  , const ResizeWeightingFunctionType resizeWindowType
2729  , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
2730  {
2731  float scale;
2732  float xBlur = fabs(x/resizeFilterBlur);
2733  if (resizeWindowSupport < MagickEpsilon
2734  || resizeWindowType == BoxWeightingFunction)
2735  {
2736  scale = 1.0f;
2737  }
2738  else
2739  {
2740  scale = resizeFilterScale;
2741  scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
2742  }
2743  float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
2744  return weight;
2745  }
2746 
2747  )
2748 
2749  ;
2750  const char* accelerateKernels2 =
2751 
2752  STRINGIFY(
2753 
2754  inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2755  return (numWorkItems/pixelPerWorkgroup);
2756  }
2757 
2758  // returns the index of the pixel for the current workitem to compute.
2759  // returns -1 if this workitem doesn't need to participate in any computation
2760  inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2761  const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
2762  int pixelIndex = itemID/numWorkItemsPerPixel;
2763  pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
2764  return pixelIndex;
2765  }
2766 
2767  )
2768 
2769  STRINGIFY(
2770  __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
2771  void ResizeHorizontalFilter(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
2772  , const float xFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
2773  , const int resizeFilterType, const int resizeWindowType
2774  , const __global float* resizeFilterCubicCoefficients
2775  , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
2776  , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
2777  , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
2778 
2779 
2780  // calculate the range of resized image pixels computed by this workgroup
2781  const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
2782  const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
2783  const unsigned int actualNumPixelToCompute = stopX - startX;
2784 
2785  // calculate the range of input image pixels to cache
2786  float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
2787  const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2788  scale = PerceptibleReciprocal(scale);
2789 
2790  const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
2791  const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
2792 
2793  // cache the input pixels into local memory
2794  const unsigned int y = get_global_id(1);
2795  event_t e = async_work_group_copy(inputImageCache,inputImage+y*inputColumns+cacheRangeStartX,cacheRangeEndX-cacheRangeStartX,0);
2796  wait_group_events(1,&e);
2797 
2798  unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2799  for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2800  {
2801 
2802  const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
2803  const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
2804  const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
2805 
2806  // determine which resized pixel computed by this workitem
2807  const unsigned int itemID = get_local_id(0);
2808  const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
2809 
2810  const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
2811 
2812  float4 filteredPixel = (float4)0.0f;
2813  float density = 0.0f;
2814  float gamma = 0.0f;
2815  // -1 means this workitem doesn't participate in the computation
2816  if (pixelIndex != -1) {
2817 
2818  // x coordinated of the resized pixel computed by this workitem
2819  const int x = chunkStartX + pixelIndex;
2820 
2821  // calculate how many steps required for this pixel
2822  const float bisect = (x+0.5)/xFactor+MagickEpsilon;
2823  const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2824  const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
2825  const unsigned int n = stop - start;
2826 
2827  // calculate how many steps this workitem will contribute
2828  unsigned int numStepsPerWorkItem = n / numItems;
2829  numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2830 
2831  const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2832  if (startStep < n) {
2833  const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2834 
2835  unsigned int cacheIndex = start+startStep-cacheRangeStartX;
2836  if (matte == 0) {
2837 
2838  for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
2839  float4 cp = convert_float4(inputImageCache[cacheIndex]);
2840 
2841  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
2842  , (ResizeWeightingFunctionType)resizeWindowType
2843  , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
2844 
2845  filteredPixel += ((float4)weight)*cp;
2846  density+=weight;
2847  }
2848 
2849 
2850  }
2851  else {
2852  for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
2853  CLPixelType p = inputImageCache[cacheIndex];
2854 
2855  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
2856  , (ResizeWeightingFunctionType)resizeWindowType
2857  , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
2858 
2859  float alpha = weight * QuantumScale * GetPixelAlpha(p);
2860  float4 cp = convert_float4(p);
2861 
2862  filteredPixel.x += alpha * cp.x;
2863  filteredPixel.y += alpha * cp.y;
2864  filteredPixel.z += alpha * cp.z;
2865  filteredPixel.w += weight * cp.w;
2866 
2867  density+=weight;
2868  gamma+=alpha;
2869  }
2870  }
2871  }
2872  }
2873 
2874  // initialize the accumulators to zero
2875  if (itemID < actualNumPixelInThisChunk) {
2876  outputPixelCache[itemID] = (float4)0.0f;
2877  densityCache[itemID] = 0.0f;
2878  if (matte != 0)
2879  gammaCache[itemID] = 0.0f;
2880  }
2881  barrier(CLK_LOCAL_MEM_FENCE);
2882 
2883  // accumulatte the filtered pixel value and the density
2884  for (unsigned int i = 0; i < numItems; i++) {
2885  if (pixelIndex != -1) {
2886  if (itemID%numItems == i) {
2887  outputPixelCache[pixelIndex]+=filteredPixel;
2888  densityCache[pixelIndex]+=density;
2889  if (matte!=0) {
2890  gammaCache[pixelIndex]+=gamma;
2891  }
2892  }
2893  }
2894  barrier(CLK_LOCAL_MEM_FENCE);
2895  }
2896 
2897  if (itemID < actualNumPixelInThisChunk) {
2898  if (matte==0) {
2899  float density = densityCache[itemID];
2900  float4 filteredPixel = outputPixelCache[itemID];
2901  if (density!= 0.0f && density != 1.0)
2902  {
2903  density = PerceptibleReciprocal(density);
2904  filteredPixel *= (float4)density;
2905  }
2906  filteredImage[y*filteredColumns+chunkStartX+itemID] = (CLPixelType) (ClampToQuantum(filteredPixel.x)
2907  , ClampToQuantum(filteredPixel.y)
2908  , ClampToQuantum(filteredPixel.z)
2909  , ClampToQuantum(filteredPixel.w));
2910  }
2911  else {
2912  float density = densityCache[itemID];
2913  float gamma = gammaCache[itemID];
2914  float4 filteredPixel = outputPixelCache[itemID];
2915 
2916  if (density!= 0.0f && density != 1.0) {
2917  density = PerceptibleReciprocal(density);
2918  filteredPixel *= (float4)density;
2919  gamma *= density;
2920  }
2921  gamma = PerceptibleReciprocal(gamma);
2922 
2923  CLPixelType fp;
2924  fp = (CLPixelType) ( ClampToQuantum(gamma*filteredPixel.x)
2925  , ClampToQuantum(gamma*filteredPixel.y)
2926  , ClampToQuantum(gamma*filteredPixel.z)
2927  , ClampToQuantum(filteredPixel.w));
2928 
2929  filteredImage[y*filteredColumns+chunkStartX+itemID] = fp;
2930 
2931  }
2932  }
2933 
2934  } // end of chunking loop
2935  }
2936  )
2937 
2938 
2939  STRINGIFY(
2940  __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
2941  void ResizeVerticalFilter(const __global CLPixelType* inputImage, const unsigned int inputColumns, const unsigned int inputRows, const unsigned int matte
2942  , const float yFactor, __global CLPixelType* filteredImage, const unsigned int filteredColumns, const unsigned int filteredRows
2943  , const int resizeFilterType, const int resizeWindowType
2944  , const __global float* resizeFilterCubicCoefficients
2945  , const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, const float resizeFilterBlur
2946  , __local CLPixelType* inputImageCache, const int numCachedPixels, const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize
2947  , __local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache) {
2948 
2949 
2950  // calculate the range of resized image pixels computed by this workgroup
2951  const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
2952  const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
2953  const unsigned int actualNumPixelToCompute = stopY - startY;
2954 
2955  // calculate the range of input image pixels to cache
2956  float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
2957  const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2958  scale = PerceptibleReciprocal(scale);
2959 
2960  const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
2961  const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
2962 
2963  // cache the input pixels into local memory
2964  const unsigned int x = get_global_id(0);
2965  event_t e = async_work_group_strided_copy(inputImageCache, inputImage+cacheRangeStartY*inputColumns+x, cacheRangeEndY-cacheRangeStartY, inputColumns, 0);
2966  wait_group_events(1,&e);
2967 
2968  unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2969  for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2970  {
2971 
2972  const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
2973  const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
2974  const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
2975 
2976  // determine which resized pixel computed by this workitem
2977  const unsigned int itemID = get_local_id(1);
2978  const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
2979 
2980  const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
2981 
2982  float4 filteredPixel = (float4)0.0f;
2983  float density = 0.0f;
2984  float gamma = 0.0f;
2985  // -1 means this workitem doesn't participate in the computation
2986  if (pixelIndex != -1) {
2987 
2988  // x coordinated of the resized pixel computed by this workitem
2989  const int y = chunkStartY + pixelIndex;
2990 
2991  // calculate how many steps required for this pixel
2992  const float bisect = (y+0.5)/yFactor+MagickEpsilon;
2993  const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2994  const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
2995  const unsigned int n = stop - start;
2996 
2997  // calculate how many steps this workitem will contribute
2998  unsigned int numStepsPerWorkItem = n / numItems;
2999  numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
3000 
3001  const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
3002  if (startStep < n) {
3003  const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
3004 
3005  unsigned int cacheIndex = start+startStep-cacheRangeStartY;
3006  if (matte == 0) {
3007 
3008  for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
3009  float4 cp = convert_float4(inputImageCache[cacheIndex]);
3010 
3011  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
3012  , (ResizeWeightingFunctionType)resizeWindowType
3013  , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
3014 
3015  filteredPixel += ((float4)weight)*cp;
3016  density+=weight;
3017  }
3018 
3019 
3020  }
3021  else {
3022  for (unsigned int i = startStep; i < stopStep; i++,cacheIndex++) {
3023  CLPixelType p = inputImageCache[cacheIndex];
3024 
3025  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,(ResizeWeightingFunctionType)resizeFilterType
3026  , (ResizeWeightingFunctionType)resizeWindowType
3027  , resizeFilterScale, resizeFilterWindowSupport, resizeFilterBlur,scale*(start+i-bisect+0.5));
3028 
3029  float alpha = weight * QuantumScale * GetPixelAlpha(p);
3030  float4 cp = convert_float4(p);
3031 
3032  filteredPixel.x += alpha * cp.x;
3033  filteredPixel.y += alpha * cp.y;
3034  filteredPixel.z += alpha * cp.z;
3035  filteredPixel.w += weight * cp.w;
3036 
3037  density+=weight;
3038  gamma+=alpha;
3039  }
3040  }
3041  }
3042  }
3043 
3044  // initialize the accumulators to zero
3045  if (itemID < actualNumPixelInThisChunk) {
3046  outputPixelCache[itemID] = (float4)0.0f;
3047  densityCache[itemID] = 0.0f;
3048  if (matte != 0)
3049  gammaCache[itemID] = 0.0f;
3050  }
3051  barrier(CLK_LOCAL_MEM_FENCE);
3052 
3053  // accumulatte the filtered pixel value and the density
3054  for (unsigned int i = 0; i < numItems; i++) {
3055  if (pixelIndex != -1) {
3056  if (itemID%numItems == i) {
3057  outputPixelCache[pixelIndex]+=filteredPixel;
3058  densityCache[pixelIndex]+=density;
3059  if (matte!=0) {
3060  gammaCache[pixelIndex]+=gamma;
3061  }
3062  }
3063  }
3064  barrier(CLK_LOCAL_MEM_FENCE);
3065  }
3066 
3067  if (itemID < actualNumPixelInThisChunk) {
3068  if (matte==0) {
3069  float density = densityCache[itemID];
3070  float4 filteredPixel = outputPixelCache[itemID];
3071  if (density!= 0.0f && density != 1.0)
3072  {
3073  density = PerceptibleReciprocal(density);
3074  filteredPixel *= (float4)density;
3075  }
3076  filteredImage[(chunkStartY+itemID)*filteredColumns+x] = (CLPixelType) (ClampToQuantum(filteredPixel.x)
3077  , ClampToQuantum(filteredPixel.y)
3078  , ClampToQuantum(filteredPixel.z)
3079  , ClampToQuantum(filteredPixel.w));
3080  }
3081  else {
3082  float density = densityCache[itemID];
3083  float gamma = gammaCache[itemID];
3084  float4 filteredPixel = outputPixelCache[itemID];
3085 
3086  if (density!= 0.0f && density != 1.0) {
3087  density = PerceptibleReciprocal(density);
3088  filteredPixel *= (float4)density;
3089  gamma *= density;
3090  }
3091  gamma = PerceptibleReciprocal(gamma);
3092 
3093  CLPixelType fp;
3094  fp = (CLPixelType) ( ClampToQuantum(gamma*filteredPixel.x)
3095  , ClampToQuantum(gamma*filteredPixel.y)
3096  , ClampToQuantum(gamma*filteredPixel.z)
3097  , ClampToQuantum(filteredPixel.w));
3098 
3099  filteredImage[(chunkStartY+itemID)*filteredColumns+x] = fp;
3100 
3101  }
3102  }
3103 
3104  } // end of chunking loop
3105  }
3106  )
3107 
3108 /*
3109 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3110 % %
3111 % %
3112 % %
3113 % U n s h a r p M a s k %
3114 % %
3115 % %
3116 % %
3117 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3118 */
3119 
3120  STRINGIFY(
3121  __kernel void UnsharpMaskBlurColumn(const __global CLPixelType* inputImage,
3122  const __global float4 *blurRowData, __global CLPixelType *filtered_im,
3123  const unsigned int imageColumns, const unsigned int imageRows,
3124  __local float4* cachedData, __local float* cachedFilter,
3125  const ChannelType channel, const __global float *filter, const unsigned int width,
3126  const float gain, const float threshold)
3127  {
3128  const unsigned int radius = (width-1)/2;
3129 
3130  // cache the pixel shared by the workgroup
3131  const int groupX = get_group_id(0);
3132  const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
3133  const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
3134 
3135  if (groupStartY >= 0
3136  && groupStopY < imageRows) {
3137  event_t e = async_work_group_strided_copy(cachedData
3138  ,blurRowData+groupStartY*imageColumns+groupX
3139  ,groupStopY-groupStartY,imageColumns,0);
3140  wait_group_events(1,&e);
3141  }
3142  else {
3143  for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) {
3144  cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,imageRows)*imageColumns+ groupX];
3145  }
3146  barrier(CLK_LOCAL_MEM_FENCE);
3147  }
3148  // cache the filter as well
3149  event_t e = async_work_group_copy(cachedFilter,filter,width,0);
3150  wait_group_events(1,&e);
3151 
3152  // only do the work if this is not a patched item
3153  //const int cy = get_group_id(1)*get_local_size(1)+get_local_id(1);
3154  const int cy = get_global_id(1);
3155 
3156  if (cy < imageRows) {
3157  float4 blurredPixel = (float4) 0.0f;
3158 
3159  int i = 0;
3160 
3161  \n #ifndef UFACTOR \n
3162  \n #define UFACTOR 8 \n
3163  \n #endif \n
3164 
3165  for ( ; i+UFACTOR < width; )
3166  {
3167  \n #pragma unroll UFACTOR \n
3168  for (int j=0; j < UFACTOR; j++, i++)
3169  {
3170  blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
3171  }
3172  }
3173 
3174  for ( ; i < width; i++)
3175  {
3176  blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
3177  }
3178 
3179  blurredPixel = floor((float4)(ClampToQuantum(blurredPixel.x), ClampToQuantum(blurredPixel.y)
3180  ,ClampToQuantum(blurredPixel.z), ClampToQuantum(blurredPixel.w)));
3181 
3182  float4 inputImagePixel = convert_float4(inputImage[cy*imageColumns+groupX]);
3183  float4 outputPixel = inputImagePixel - blurredPixel;
3184 
3185  float quantumThreshold = QuantumRange*threshold;
3186 
3187  int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
3188  outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
3189 
3190  //write back
3191  filtered_im[cy*imageColumns+groupX] = (CLPixelType) (ClampToQuantum(outputPixel.x), ClampToQuantum(outputPixel.y)
3192  ,ClampToQuantum(outputPixel.z), ClampToQuantum(outputPixel.w));
3193 
3194  }
3195  }
3196  )
3197 
3198 
3199 
3200  STRINGIFY(
3201  __kernel void UnsharpMask(__global CLPixelType *im, __global CLPixelType *filtered_im,
3202  __constant float *filter,
3203  const unsigned int width,
3204  const unsigned int imageColumns, const unsigned int imageRows,
3205  __local float4 *pixels,
3206  const float gain, const float threshold, const unsigned int justBlur)
3207  {
3208  const int x = get_global_id(0);
3209  const int y = get_global_id(1);
3210 
3211  const unsigned int radius = (width - 1) / 2;
3212 
3213  int row = y - radius;
3214  int baseRow = get_group_id(1) * get_local_size(1) - radius;
3215  int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
3216 
3217  while (row < endRow) {
3218  int srcy = (row < 0) ? -row : row; // mirror pad
3219  srcy = (srcy >= imageRows) ? (2 * imageRows - srcy - 1) : srcy;
3220 
3221  float4 value = 0.0f;
3222 
3223  int ix = x - radius;
3224  int i = 0;
3225 
3226  while (i + 7 < width) {
3227  for (int j = 0; j < 8; ++j) { // unrolled
3228  int srcx = ix + j;
3229  srcx = (srcx < 0) ? -srcx : srcx;
3230  srcx = (srcx >= imageColumns) ? (2 * imageColumns - srcx - 1) : srcx;
3231  value += filter[i + j] * convert_float4(im[srcx + srcy * imageColumns]);
3232  }
3233  ix += 8;
3234  i += 8;
3235  }
3236 
3237  while (i < width) {
3238  int srcx = (ix < 0) ? -ix : ix; // mirror pad
3239  srcx = (srcx >= imageColumns) ? (2 * imageColumns - srcx - 1) : srcx;
3240  value += filter[i] * convert_float4(im[srcx + srcy * imageColumns]);
3241  ++i;
3242  ++ix;
3243  }
3244  pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
3245  row += get_local_size(1);
3246  }
3247 
3248 
3249  barrier(CLK_LOCAL_MEM_FENCE);
3250 
3251 
3252  const int px = get_local_id(0);
3253  const int py = get_local_id(1);
3254  const int prp = get_local_size(0);
3255  float4 value = (float4)(0.0f);
3256 
3257  int i = 0;
3258  while (i + 7 < width) { // unrolled
3259  value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
3260  value += (float4)(filter[i]) * pixels[px + (py + i + 1) * prp];
3261  value += (float4)(filter[i]) * pixels[px + (py + i + 2) * prp];
3262  value += (float4)(filter[i]) * pixels[px + (py + i + 3) * prp];
3263  value += (float4)(filter[i]) * pixels[px + (py + i + 4) * prp];
3264  value += (float4)(filter[i]) * pixels[px + (py + i + 5) * prp];
3265  value += (float4)(filter[i]) * pixels[px + (py + i + 6) * prp];
3266  value += (float4)(filter[i]) * pixels[px + (py + i + 7) * prp];
3267  i += 8;
3268  }
3269  while (i < width) {
3270  value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
3271  ++i;
3272  }
3273  if ((x < imageColumns) && (y < imageRows)) {
3274  if (justBlur == 0) { // apply sharpening
3275  float4 srcPixel = convert_float4(im[x + y * imageColumns]);
3276  float4 diff = srcPixel - value;
3277 
3278  float quantumThreshold = QuantumRange*threshold;
3279 
3280  int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
3281  value = select(srcPixel + diff * gain, srcPixel, mask);
3282  }
3283  filtered_im[x + y * imageColumns] = (CLPixelType)(ClampToQuantum(value.s0), ClampToQuantum(value.s1), ClampToQuantum(value.s2), ClampToQuantum(value.s3));
3284  }
3285  }
3286  )
3287 
3288  STRINGIFY(
3289  __kernel __attribute__((reqd_work_group_size(64, 4, 1))) void WaveletDenoise(__global CLPixelType *srcImage, __global CLPixelType *dstImage,
3290  const float threshold,
3291  const int passes,
3292  const int imageWidth,
3293  const int imageHeight)
3294  {
3295  const int pad = (1 << (passes - 1));
3296  const int tileSize = 64;
3297  const int tileRowPixels = 64;
3298  const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
3299 
3300  CLPixelType stage[16];
3301 
3302  local float buffer[64 * 64];
3303 
3304  int srcx = (get_group_id(0) + get_global_offset(0) / tileSize) * (tileSize - 2 * pad) - pad + get_local_id(0);
3305  int srcy = (get_group_id(1) + get_global_offset(1) / 4) * (tileSize - 2 * pad) - pad;
3306 
3307  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3308  stage[i / 4] = srcImage[mirrorTop(mirrorBottom(srcx), imageWidth) + (mirrorTop(mirrorBottom(srcy + i) , imageHeight)) * imageWidth];
3309  }
3310 
3311 
3312  for (int channel = 0; channel < 3; ++channel) {
3313  // Load LDS
3314  switch (channel) {
3315  case 0:
3316  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3317  buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s0);
3318  break;
3319  case 1:
3320  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3321  buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s1);
3322  break;
3323  case 2:
3324  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3325  buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[i / 4].s2);
3326  break;
3327  }
3328 
3329 
3330  // Process
3331 
3332  float tmp[16];
3333  float accum[16];
3334  float pixel;
3335 
3336  for (int pass = 0; pass < passes; ++pass) {
3337  const int radius = 1 << pass;
3338  const int x = get_local_id(0);
3339  const float thresh = threshold * noise[pass];
3340 
3341  if (pass == 0)
3342  accum[0] = accum[1] = accum[2] = accum[3] = accum[4] = accum[5] = accum[6] = accum[6] = accum[7] = accum[8] = accum[9] = accum[10] = accum[11] = accum[12] = accum[13] = accum[14] = accum[15] = 0.0f;
3343 
3344  // Snapshot input
3345 
3346  // Apply horizontal hat
3347  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3348  const int offset = i * tileRowPixels;
3349  if (pass == 0)
3350  tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
3351  pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
3352  barrier(CLK_LOCAL_MEM_FENCE);
3353  buffer[x + offset] = pixel;
3354  }
3355  barrier(CLK_LOCAL_MEM_FENCE);
3356  // Apply vertical hat
3357  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3358  pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
3359  float delta = tmp[i / 4] - pixel;
3360  tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
3361  if (delta < -thresh)
3362  delta += thresh;
3363  else if (delta > thresh)
3364  delta -= thresh;
3365  else
3366  delta = 0;
3367  accum[i / 4] += delta;
3368 
3369  }
3370  barrier(CLK_LOCAL_MEM_FENCE);
3371  if (pass < passes - 1)
3372  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3373  buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
3374  else // last pass
3375  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3376  accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
3377  barrier(CLK_LOCAL_MEM_FENCE);
3378  }
3379 
3380  switch (channel) {
3381  case 0:
3382  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3383  stage[i / 4].s0 = ClampToQuantum(accum[i / 4]);
3384  break;
3385  case 1:
3386  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3387  stage[i / 4].s1 = ClampToQuantum(accum[i / 4]);
3388  break;
3389  case 2:
3390  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3391  stage[i / 4].s2 = ClampToQuantum(accum[i / 4]);
3392  break;
3393  }
3394 
3395  barrier(CLK_LOCAL_MEM_FENCE);
3396  }
3397 
3398  // Write from stage to output
3399 
3400  if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
3401  //for (int i = pad + get_local_id(1); i < tileSize - pad; i += get_local_size(1)) {
3402  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3403  if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
3404  dstImage[srcx + (srcy + i) * imageWidth] = stage[i / 4];
3405  }
3406  }
3407  }
3408  }
3409  )
3410  ;
3411 
3412 #endif // MAGICKCORE_OPENCL_SUPPORT
3413 
3414 #if defined(__cplusplus) || defined(c_plusplus)
3415 }
3416 #endif
3417 
3418 #endif // MAGICKCORE_ACCELERATE_PRIVATE_H
Definition: composite.h:91
Definition: composite.h:94
Definition: composite.h:65
Definition: colorspace.h:44
Definition: resize-private.h:31
Definition: colorspace.h:36
#define SigmaPoisson
Definition: resize-private.h:37
Definition: pixel.h:75
Definition: statistic.h:116
Definition: resize-private.h:33
Definition: magick-type.h:176
Definition: composite.h:75
Definition: pixel.h:72
Definition: colorspace.h:40
static void MagickPixelCompositeBlend(const MagickPixelPacket *p, const MagickRealType alpha, const MagickPixelPacket *q, const MagickRealType beta, MagickPixelPacket *composite)
Definition: composite-private.h:138
Definition: composite.h:31
Definition: composite.h:93
Definition: colorspace.h:45
Definition: colorspace.h:33
Definition: composite.h:80
#define SigmaRandom
Definition: composite.h:33
Definition: resize-private.h:40
Definition: composite.h:90
Definition: resize-private.h:29
static MagickRealType ColorDodge(const MagickRealType Sca, const MagickRealType Sa, const MagickRealType Dca, const MagickRealType Da)
Definition: composite.c:293
Definition: fx.h:34
PixelIntensityMethod
Definition: pixel.h:67
Definition: magick-type.h:165
Definition: composite.h:95
Definition: colorspace.h:59
Definition: magick-type.h:171
Definition: composite.h:59
Definition: composite.h:89
Definition: magick-type.h:160
Definition: composite.h:27
Definition: colorspace.h:41
Definition: colorspace.h:37
static MagickRealType RoundToUnity(const MagickRealType value)
Definition: composite-private.h:33
Definition: composite.h:35
Definition: composite.h:87
#define MagickPI
Definition: image-private.h:28
Definition: colorspace.h:58
Definition: colorspace.h:50
static MagickRealType Hanning(const MagickRealType x, const ResizeFilter *magick_unused(resize_filter))
Definition: resize.c:287
Definition: colorspace.h:47
Definition: fx.h:29
Definition: statistic.h:115
Definition: colorspace.h:31
#define MAGICKCORE_QUANTUM_DEPTH
Definition: magick-type.h:28
Definition: composite.h:53
Definition: colorspace.h:35
Definition: resize-private.h:38
Definition: pixel.h:77
#define MagickEpsilon
Definition: magick-type.h:115
#define SigmaLaplacian
MagickExport void ConvertRGBToHSL(const Quantum red, const Quantum green, const Quantum blue, double *hue, double *saturation, double *lightness)
Definition: gem.c:1127
Definition: magick-type.h:166
Definition: colorspace.h:48
Definition: statistic.h:117
Definition: magick-type.h:178
NoiseType
Definition: fx.h:27
Definition: colorspace.h:52
Definition: composite.h:47
static MagickRealType Hamming(const MagickRealType x, const ResizeFilter *magick_unused(resize_filter))
Definition: resize.c:301
Definition: resize-private.h:41
Definition: composite.h:73
Definition: composite.h:29
Definition: composite.h:72
Definition: composite.h:42
Definition: colorspace.h:43
#define SigmaUniform
Definition: composite.h:97
static void ModulateHSL(const double percent_hue, const double percent_saturation, const double percent_lightness, Quantum *red, Quantum *green, Quantum *blue)
Definition: enhance.c:3569
Definition: colorspace.h:34
Definition: colorspace.h:57
Definition: resize-private.h:30
static double PerceptibleReciprocal(const double x)
Definition: pixel-accessor.h:124
Definition: composite.h:54
#define GetPixelAlpha(pixel)
Definition: pixel-accessor.h:36
Definition: composite.h:38
Definition: composite.h:68
Definition: composite.h:96
Definition: magick-type.h:162
Definition: composite.h:71
Definition: resize-private.h:32
Definition: composite.h:55
Definition: composite.h:56
Definition: fx.h:31
#define SigmaGaussian
Definition: composite.h:69
Definition: pixel.h:71
static Quantum ApplyFunction(Quantum pixel, const MagickFunction function, const size_t number_parameters, const double *parameters, ExceptionInfo *exception)
Definition: statistic.c:976
Definition: colorspace.h:38
Definition: pixel.h:70
Definition: composite.h:86
Definition: resize-private.h:36
Definition: colorspace.h:30
#define SigmaMultiplicativeGaussian
Definition: composite.h:49
Definition: composite.h:44
#define TauGaussian
MagickExport void ConvertRGBToHSB(const Quantum red, const Quantum green, const Quantum blue, double *hue, double *saturation, double *brightness)
Definition: gem.c:994
Definition: magick-type.h:164
static void Contrast(const int sign, Quantum *red, Quantum *green, Quantum *blue)
Definition: enhance.c:915
Definition: magick-type.h:179
Definition: composite.h:46
Definition: statistic.h:113
Definition: composite.h:28
Definition: magick-type.h:159
Definition: magick-type.h:168
Definition: colorspace.h:54
Definition: magick-type.h:167
Definition: resize-private.h:39
Definition: composite.h:78
Definition: resize-private.h:34
#define QuantumScale
Definition: magick-type.h:118
Definition: colorspace.h:55
Definition: fx.h:33
Definition: composite.h:62
Definition: colorspace.h:39
#define MaxMap
Definition: magick-type.h:78
Definition: magick-type.h:175
#define MagickMax(x, y)
Definition: image-private.h:26
Definition: composite.h:98
Definition: composite.h:39
static void CompositeColorDodge(const MagickPixelPacket *p, const MagickPixelPacket *q, MagickPixelPacket *composite)
Definition: composite.c:330
MagickExport void ConvertHSBToRGB(const double hue, const double saturation, const double brightness, Quantum *red, Quantum *green, Quantum *blue)
Definition: gem.c:284
Definition: composite.h:45
ChannelType
Definition: magick-type.h:155
Definition: composite.h:70
Definition: colorspace.h:46
Definition: resize-private.h:28
Definition: composite.h:81
Definition: composite.h:41
Definition: composite.h:52
Definition: pixel.h:69
Definition: colorspace.h:49
MagickExport void ConvertHSLToRGB(const double hue, const double saturation, const double lightness, Quantum *red, Quantum *green, Quantum *blue)
Definition: gem.c:460
Definition: composite.h:77
static Quantum ClampToQuantum(const MagickRealType value)
Definition: quantum.h:87
Definition: colorspace.h:53
Definition: composite.h:61
Definition: magick-type.h:161
static void MagickPixelCompositePlus(const MagickPixelPacket *p, const MagickRealType alpha, const MagickPixelPacket *q, const MagickRealType beta, MagickPixelPacket *composite)
Definition: composite-private.h:111
Definition: composite.h:76
Definition: magick-type.h:157
Definition: colorspace.h:28
Definition: resize-private.h:42
Definition: composite.h:50
Definition: composite.h:36
Definition: composite.h:43
MagickExport MagickRealType GetPixelIntensity(const Image *image, const PixelPacket *magick_restrict pixel)
Definition: pixel.c:2301
static MagickRealType Sinc(const MagickRealType, const ResizeFilter *)
Definition: composite.h:37
Definition: composite.h:60
Definition: statistic.h:114
ResizeWeightingFunctionType
Definition: resize-private.h:25
static MagickRealType Blackman(const MagickRealType x, const ResizeFilter *magick_unused(resize_filter))
Definition: resize.c:148
Definition: colorspace.h:56
#define MagickMin(x, y)
Definition: image-private.h:27
ColorspaceType
Definition: colorspace.h:25
Definition: composite.h:32
Definition: colorspace.h:29
Definition: composite.h:88
Definition: colorspace.h:42
#define SigmaImpulse
Definition: composite.h:48
Definition: composite.h:64
Definition: magick-type.h:163
Definition: colorspace.h:51
CompositeOperator
Definition: composite.h:25
Definition: composite.h:79
Definition: colorspace.h:62
Definition: magick-type.h:170
Definition: colorspace.h:32
Definition: pixel.h:78
Definition: composite.h:66
Definition: composite.h:30
Definition: colorspace.h:60
Definition: magick-type.h:158
Definition: composite.h:63
Definition: composite.h:58
Definition: composite.h:92
Definition: magick-type.h:177
Definition: composite.h:34
static MagickRealType CubicBC(const MagickRealType x, const ResizeFilter *resize_filter)
Definition: resize.c:210
Definition: resize-private.h:27
Definition: composite.h:74
Definition: colorspace.h:27
MagickFunction
Definition: statistic.h:111
Definition: fx.h:30
Definition: composite.h:40
Definition: composite.h:67
Definition: resize-private.h:35
#define QuantumRange
Definition: magick-type.h:86
static MagickRealType Triangle(const MagickRealType x, const ResizeFilter *magick_unused(resize_filter))
Definition: resize.c:514
Definition: fx.h:35
Definition: pixel.h:73
Definition: composite.h:51
Definition: fx.h:32
Definition: magick-type.h:169
Definition: fx.h:36
Definition: composite.h:57