idctdsp_mmx.c
/*
 * SIMD-optimized IDCT-related routines
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "idctdsp.h"
#include "inline_asm.h"

#if HAVE_INLINE_ASM

void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                               int line_size)
{
    const int16_t *p;
    uint8_t *pix;

    /* read the pixels */
    p   = block;
    pix = pixels;
    /* unrolled loop */
    __asm__ volatile (
        "movq      (%3), %%mm0       \n\t"
        "movq     8(%3), %%mm1       \n\t"
        "movq    16(%3), %%mm2       \n\t"
        "movq    24(%3), %%mm3       \n\t"
        "movq    32(%3), %%mm4       \n\t"
        "movq    40(%3), %%mm5       \n\t"
        "movq    48(%3), %%mm6       \n\t"
        "movq    56(%3), %%mm7       \n\t"
        "packuswb %%mm1, %%mm0       \n\t"
        "packuswb %%mm3, %%mm2       \n\t"
        "packuswb %%mm5, %%mm4       \n\t"
        "packuswb %%mm7, %%mm6       \n\t"
        "movq     %%mm0, (%0)        \n\t"
        "movq     %%mm2, (%0, %1)    \n\t"
        "movq     %%mm4, (%0, %1, 2) \n\t"
        "movq     %%mm6, (%0, %2)    \n\t"
        :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
           "r" (p)
        : "memory");
    pix += line_size * 4;
    p   += 32;

    // If this block were an exact copy of the code above, the compiler
    // would generate some very strange code, so "r" constraints are used
    // here instead.
    __asm__ volatile (
        "movq      (%3), %%mm0       \n\t"
        "movq     8(%3), %%mm1       \n\t"
        "movq    16(%3), %%mm2       \n\t"
        "movq    24(%3), %%mm3       \n\t"
        "movq    32(%3), %%mm4       \n\t"
        "movq    40(%3), %%mm5       \n\t"
        "movq    48(%3), %%mm6       \n\t"
        "movq    56(%3), %%mm7       \n\t"
        "packuswb %%mm1, %%mm0       \n\t"
        "packuswb %%mm3, %%mm2       \n\t"
        "packuswb %%mm5, %%mm4       \n\t"
        "packuswb %%mm7, %%mm6       \n\t"
        "movq     %%mm0, (%0)        \n\t"
        "movq     %%mm2, (%0, %1)    \n\t"
        "movq     %%mm4, (%0, %1, 2) \n\t"
        "movq     %%mm6, (%0, %2)    \n\t"
        :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3),
           "r" (p)
        : "memory");
}
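
/*
 * Editorial note, not part of the original file: a scalar sketch of what the
 * MMX routine above computes. Each of the 64 coefficients in block[] is
 * clamped to [0, 255] and stored as a byte, which is exactly what the
 * unsigned-saturating pack (packuswb) does one row at a time. The helper
 * name is hypothetical.
 */
static void put_pixels_clamped_c_sketch(const int16_t *block, uint8_t *pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            int v = block[i * 8 + j];
            /* saturate to the unsigned byte range, like packuswb */
            pixels[j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        pixels += line_size;
    }
}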

#define put_signed_pixels_clamped_mmx_half(off)        \
    "movq          "#off"(%2), %%mm1    \n\t"           \
    "movq     16 + "#off"(%2), %%mm2    \n\t"           \
    "movq     32 + "#off"(%2), %%mm3    \n\t"           \
    "movq     48 + "#off"(%2), %%mm4    \n\t"           \
    "packsswb  8 + "#off"(%2), %%mm1    \n\t"           \
    "packsswb 24 + "#off"(%2), %%mm2    \n\t"           \
    "packsswb 40 + "#off"(%2), %%mm3    \n\t"           \
    "packsswb 56 + "#off"(%2), %%mm4    \n\t"           \
    "paddb %%mm0, %%mm1                 \n\t"           \
    "paddb %%mm0, %%mm2                 \n\t"           \
    "paddb %%mm0, %%mm3                 \n\t"           \
    "paddb %%mm0, %%mm4                 \n\t"           \
    "movq %%mm1, (%0)                   \n\t"           \
    "movq %%mm2, (%0, %3)               \n\t"           \
    "movq %%mm3, (%0, %3, 2)            \n\t"           \
    "movq %%mm4, (%0, %1)               \n\t"

void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                      int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

    __asm__ volatile (
        "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
        "lea (%3, %3, 2), %1            \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea (%0, %3, 4), %0            \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        : "+&r" (pixels), "=&r" (line_skip3)
        : "r" (block), "r" (line_skip)
        : "memory");
}
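
/*
 * Editorial note, not part of the original file: a scalar sketch of the
 * signed variant above. packsswb clamps each coefficient to [-128, 127] and
 * the wrapping byte add of ff_pb_80 (0x80 in every byte) then shifts that
 * range to [0, 255]; the net effect is clamping block[i] + 128 to the
 * unsigned byte range. The helper name is hypothetical.
 */
static void put_signed_pixels_clamped_c_sketch(const int16_t *block,
                                               uint8_t *pixels, int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            int v = block[i * 8 + j] + 128;
            pixels[j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        pixels += line_size;
    }
}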

void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                               int line_size)
{
    const int16_t *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p   = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm__ volatile (
            "movq        (%2), %%mm0    \n\t"
            "movq       8(%2), %%mm1    \n\t"
            "movq      16(%2), %%mm2    \n\t"
            "movq      24(%2), %%mm3    \n\t"
            "movq          %0, %%mm4    \n\t"
            "movq          %1, %%mm6    \n\t"
            "movq       %%mm4, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm4    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm4, %%mm0    \n\t"
            "paddsw     %%mm5, %%mm1    \n\t"
            "movq       %%mm6, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm6    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm6, %%mm2    \n\t"
            "paddsw     %%mm5, %%mm3    \n\t"
            "packuswb   %%mm1, %%mm0    \n\t"
            "packuswb   %%mm3, %%mm2    \n\t"
            "movq       %%mm0, %0       \n\t"
            "movq       %%mm2, %1       \n\t"
            : "+m" (*pix), "+m" (*(pix + line_size))
            : "r" (p)
            : "memory");
        pix += line_size * 2;
        p   += 16;
    } while (--i);
}
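
/*
 * Editorial note, not part of the original file: a scalar sketch of the
 * add-and-clamp routine above. The destination bytes are widened to 16 bits
 * (punpcklbw/punpckhbw against the zeroed mm7), added to the coefficients
 * with signed saturation (paddsw), and packed back with unsigned saturation
 * (packuswb), so each pixel becomes pixels[i] + block[i] clamped to
 * [0, 255]. The helper name is hypothetical.
 */
static void add_pixels_clamped_c_sketch(const int16_t *block, uint8_t *pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            int v = pixels[j] + block[i * 8 + j];
            pixels[j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        pixels += line_size;
    }
}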

#endif /* HAVE_INLINE_ASM */
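
/*
 * Editorial note, not part of the original file: codecs do not call these
 * routines directly; an x86 init function (see idctdsp_init.c) installs them
 * into an IDCTDSPContext when MMX inline assembly is available. A rough
 * sketch of that wiring follows; the helper name is hypothetical, and the
 * exact guards and field types can differ between Libav versions.
 */
static void install_mmx_idct_helpers_sketch(IDCTDSPContext *c)
{
    int cpu_flags = av_get_cpu_flags();

    if (HAVE_INLINE_ASM && (cpu_flags & AV_CPU_FLAG_MMX)) {
        c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
        c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
    }
}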