26 #include "libavutil/avassert.h"
27 #include "libavutil/cpu.h"
47 "movq (%3), %%mm0 \n\t"
48 "movq 8(%3), %%mm1 \n\t"
49 "movq 16(%3), %%mm2 \n\t"
50 "movq 24(%3), %%mm3 \n\t"
51 "movq 32(%3), %%mm4 \n\t"
52 "movq 40(%3), %%mm5 \n\t"
53 "movq 48(%3), %%mm6 \n\t"
54 "movq 56(%3), %%mm7 \n\t"
55 "packuswb %%mm1, %%mm0 \n\t"
56 "packuswb %%mm3, %%mm2 \n\t"
57 "packuswb %%mm5, %%mm4 \n\t"
58 "packuswb %%mm7, %%mm6 \n\t"
59 "movq %%mm0, (%0) \n\t"
60 "movq %%mm2, (%0, %1) \n\t"
61 "movq %%mm4, (%0, %1, 2) \n\t"
62 "movq %%mm6, (%0, %2) \n\t"
73 "movq (%3), %%mm0 \n\t"
74 "movq 8(%3), %%mm1 \n\t"
75 "movq 16(%3), %%mm2 \n\t"
76 "movq 24(%3), %%mm3 \n\t"
77 "movq 32(%3), %%mm4 \n\t"
78 "movq 40(%3), %%mm5 \n\t"
79 "movq 48(%3), %%mm6 \n\t"
80 "movq 56(%3), %%mm7 \n\t"
81 "packuswb %%mm1, %%mm0 \n\t"
82 "packuswb %%mm3, %%mm2 \n\t"
83 "packuswb %%mm5, %%mm4 \n\t"
84 "packuswb %%mm7, %%mm6 \n\t"
85 "movq %%mm0, (%0) \n\t"
86 "movq %%mm2, (%0, %1) \n\t"
87 "movq %%mm4, (%0, %1, 2) \n\t"
88 "movq %%mm6, (%0, %2) \n\t"
89 ::
"r"(pix),
"r"((
x86_reg)line_size),
"r"((
x86_reg)line_size * 3),
"r"(p)
/*
 * Inline-asm fragment: turn four rows of eight 16-bit values into four rows
 * of eight bytes and store them.
 *
 * Operand numbers refer to the asm statement that expands this macro:
 *   %0 - destination pointer (pixels)
 *   %1 - scratch register, set to 3 * %3 by the "lea (%3, %3, 2), %1" that
 *        precedes the first expansion
 *   %2 - source pointer (block)
 *   %3 - destination row stride (line_skip)
 *
 * off is the byte offset into the source at which the four rows start. Each
 * row occupies 16 bytes, so rows are read at off, off + 16, off + 32 and
 * off + 48. For every row, movq loads the first four words and packsswb
 * packs them together with the following four words (at +8) into eight
 * signed-saturated bytes; paddb then adds mm0 to each byte.
 * NOTE(review): mm0 is loaded outside this macro and that load is not
 * visible here; presumably it holds a per-byte bias that maps the signed
 * result onto unsigned pixel values -- confirm at the expansion site.
 *
 * The four packed rows are written to %0, %0 + %3, %0 + 2 * %3 and %0 + %1.
 */
93 #define put_signed_pixels_clamped_mmx_half(off) \
94 "movq "#off"(%2), %%mm1 \n\t" \
95 "movq 16 + "#off"(%2), %%mm2 \n\t" \
96 "movq 32 + "#off"(%2), %%mm3 \n\t" \
97 "movq 48 + "#off"(%2), %%mm4 \n\t" \
98 "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
99 "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
100 "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
101 "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
102 "paddb %%mm0, %%mm1 \n\t" \
103 "paddb %%mm0, %%mm2 \n\t" \
104 "paddb %%mm0, %%mm3 \n\t" \
105 "paddb %%mm0, %%mm4 \n\t" \
106 "movq %%mm1, (%0) \n\t" \
107 "movq %%mm2, (%0, %3) \n\t" \
108 "movq %%mm3, (%0, %3, 2) \n\t" \
109 "movq %%mm4, (%0, %1) \n\t"
119 "lea (%3, %3, 2), %1 \n\t"
120 put_signed_pixels_clamped_mmx_half(0)
121 "lea (%0, %3, 4), %0 \n\t"
122 put_signed_pixels_clamped_mmx_half(64)
123 :
"+&r"(pixels),
"=&r"(line_skip3)
124 :
"r"(block),
"r"(line_skip)
142 "movq (%2), %%mm0 \n\t"
143 "movq 8(%2), %%mm1 \n\t"
144 "movq 16(%2), %%mm2 \n\t"
145 "movq 24(%2), %%mm3 \n\t"
146 "movq %0, %%mm4 \n\t"
147 "movq %1, %%mm6 \n\t"
148 "movq %%mm4, %%mm5 \n\t"
149 "punpcklbw %%mm7, %%mm4 \n\t"
150 "punpckhbw %%mm7, %%mm5 \n\t"
151 "paddsw %%mm4, %%mm0 \n\t"
152 "paddsw %%mm5, %%mm1 \n\t"
153 "movq %%mm6, %%mm5 \n\t"
154 "punpcklbw %%mm7, %%mm6 \n\t"
155 "punpckhbw %%mm7, %%mm5 \n\t"
156 "paddsw %%mm6, %%mm2 \n\t"
157 "paddsw %%mm5, %%mm3 \n\t"
158 "packuswb %%mm1, %%mm0 \n\t"
159 "packuswb %%mm3, %%mm2 \n\t"
160 "movq %%mm0, %0 \n\t"
161 "movq %%mm2, %1 \n\t"
162 :
"+m"(*pix),
"+m"(*(pix + line_size))
165 pix += line_size * 2;
170 #define CLEAR_BLOCKS(name, n) \
171 void name(int16_t *blocks) \
174 "pxor %%mm7, %%mm7 \n\t" \
175 "mov %1, %%"REG_a" \n\t" \
177 "movq %%mm7, (%0, %%"REG_a") \n\t" \
178 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
179 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
180 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
181 "add $32, %%"REG_a" \n\t" \
183 :: "r"(((uint8_t *)blocks) + 128 * n), \
194 "xorps %%xmm0, %%xmm0 \n"
195 "movaps %%xmm0, (%0) \n"
196 "movaps %%xmm0, 16(%0) \n"
197 "movaps %%xmm0, 32(%0) \n"
198 "movaps %%xmm0, 48(%0) \n"
199 "movaps %%xmm0, 64(%0) \n"
200 "movaps %%xmm0, 80(%0) \n"
201 "movaps %%xmm0, 96(%0) \n"
202 "movaps %%xmm0, 112(%0) \n"
211 "xorps %%xmm0, %%xmm0 \n"
212 "mov %1, %%"REG_a
" \n"
214 "movaps %%xmm0, (%0, %%"REG_a
") \n"
215 "movaps %%xmm0, 16(%0, %%"REG_a
") \n"
216 "movaps %%xmm0, 32(%0, %%"REG_a
") \n"
217 "movaps %%xmm0, 48(%0, %%"REG_a
") \n"
218 "movaps %%xmm0, 64(%0, %%"REG_a
") \n"
219 "movaps %%xmm0, 80(%0, %%"REG_a
") \n"
220 "movaps %%xmm0, 96(%0, %%"REG_a
") \n"
221 "movaps %%xmm0, 112(%0, %%"REG_a
") \n"
222 "add $128, %%"REG_a
" \n"
224 ::
"r"(((
uint8_t *)blocks) + 128 * 6),
236 "movq (%1, %0), %%mm0 \n\t"
237 "movq (%2, %0), %%mm1 \n\t"
238 "paddb %%mm0, %%mm1 \n\t"
239 "movq %%mm1, (%2, %0) \n\t"
240 "movq 8(%1, %0), %%mm0 \n\t"
241 "movq 8(%2, %0), %%mm1 \n\t"
242 "paddb %%mm0, %%mm1 \n\t"
243 "movq %%mm1, 8(%2, %0) \n\t"
249 :
"r"(src),
"r"(dst),
"r"((
x86_reg)w - 15)
252 dst[i + 0] += src[i + 0];
258 int w,
int h,
int sides)
263 last_line = buf + (height - 1) * wrap;
269 "movd (%0), %%mm0 \n\t"
270 "punpcklbw %%mm0, %%mm0 \n\t"
271 "punpcklwd %%mm0, %%mm0 \n\t"
272 "punpckldq %%mm0, %%mm0 \n\t"
273 "movq %%mm0, -8(%0) \n\t"
274 "movq -8(%0, %2), %%mm1 \n\t"
275 "punpckhbw %%mm1, %%mm1 \n\t"
276 "punpckhwd %%mm1, %%mm1 \n\t"
277 "punpckhdq %%mm1, %%mm1 \n\t"
278 "movq %%mm1, (%0, %2) \n\t"
288 "movd (%0), %%mm0 \n\t"
289 "punpcklbw %%mm0, %%mm0 \n\t"
290 "punpcklwd %%mm0, %%mm0 \n\t"
291 "punpckldq %%mm0, %%mm0 \n\t"
292 "movq %%mm0, -8(%0) \n\t"
293 "movq %%mm0, -16(%0) \n\t"
294 "movq -8(%0, %2), %%mm1 \n\t"
295 "punpckhbw %%mm1, %%mm1 \n\t"
296 "punpckhwd %%mm1, %%mm1 \n\t"
297 "punpckhdq %%mm1, %%mm1 \n\t"
298 "movq %%mm1, (%0, %2) \n\t"
299 "movq %%mm1, 8(%0, %2) \n\t"
310 "movd (%0), %%mm0 \n\t"
311 "punpcklbw %%mm0, %%mm0 \n\t"
312 "punpcklwd %%mm0, %%mm0 \n\t"
313 "movd %%mm0, -4(%0) \n\t"
314 "movd -4(%0, %2), %%mm1 \n\t"
315 "punpcklbw %%mm1, %%mm1 \n\t"
316 "punpckhwd %%mm1, %%mm1 \n\t"
317 "punpckhdq %%mm1, %%mm1 \n\t"
318 "movd %%mm1, (%0, %2) \n\t"
329 for (i = 0; i < h; i += 4) {
330 ptr = buf - (i + 1) * wrap - w;
333 "movq (%1, %0), %%mm0 \n\t"
334 "movq %%mm0, (%0) \n\t"
335 "movq %%mm0, (%0, %2) \n\t"
336 "movq %%mm0, (%0, %2, 2) \n\t"
337 "movq %%mm0, (%0, %3) \n\t"
343 "r"((
x86_reg) -wrap * 3),
"r"(ptr + width + 2 * w)
349 for (i = 0; i < h; i += 4) {
350 ptr = last_line + (i + 1) * wrap - w;
353 "movq (%1, %0), %%mm0 \n\t"
354 "movq %%mm0, (%0) \n\t"
355 "movq %%mm0, (%0, %2) \n\t"
356 "movq %%mm0, (%0, %2, 2) \n\t"
357 "movq %%mm0, (%0, %3) \n\t"
364 "r"(ptr + width + 2 * w)
/**
 * Function type of the edge-emulation callback that is passed around in this
 * file as the emu_edge_fn parameter (the wrappers pass &ff_emulated_edge_mc_8).
 *
 * The visible call site invokes it as
 *   emu_edge_fn(edge_buf, stride, src, stride, w + 1, h + 1,
 *               ix, iy, width, height);
 * so the caller supplies a scratch buffer and its stride, the source pointer
 * and its stride, a block one sample larger than the processed block in each
 * dimension, the integer source position and two further dimensions.
 *
 * NOTE(review): the parameter meanings below are read off that call site and
 * the parameter names; confirm against the implementation of the callback.
 *
 * @param dst          buffer the callback writes to
 * @param dst_stride   stride of dst
 * @param src          source samples
 * @param src_linesize stride of src
 * @param block_w      width of the block to produce
 * @param block_h      height of the block to produce
 * @param src_x        horizontal position of the block in the source
 * @param src_y        vertical position of the block in the source
 * @param w            presumably the width of the source picture
 * @param h            presumably the height of the source picture
 */
370 typedef void emulated_edge_mc_func(
uint8_t *dst, ptrdiff_t dst_stride,
371 const uint8_t *src, ptrdiff_t src_linesize,
372 int block_w,
int block_h,
373 int src_x,
int src_y,
int w,
int h);
376 int stride,
int h,
int ox,
int oy,
377 int dxx,
int dxy,
int dyx,
int dyy,
378 int shift,
int r,
int width,
int height,
379 emulated_edge_mc_func *emu_edge_fn)
382 const int ix = ox >> (16 +
shift);
383 const int iy = oy >> (16 +
shift);
384 const int oxs = ox >> 4;
385 const int oys = oy >> 4;
386 const int dxxs = dxx >> 4;
387 const int dxys = dxy >> 4;
388 const int dyxs = dyx >> 4;
389 const int dyys = dyy >> 4;
390 const uint16_t r4[4] = {
r,
r,
r, r };
391 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
392 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
394 #define MAX_STRIDE 4096U
396 uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
399 const int dxw = (dxx - (1 << (16 +
shift))) * (w - 1);
400 const int dyh = (dyy - (1 << (16 +
shift))) * (h - 1);
401 const int dxh = dxy * (h - 1);
402 const int dyw = dyx * (w - 1);
403 int need_emu = (unsigned)ix >= width - w ||
404 (
unsigned)iy >= height - h;
407 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
408 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 +
shift)
410 || (dxx | dxy | dyx | dyy) & 15
411 || (need_emu && (h > MAX_H ||
stride > MAX_STRIDE))) {
413 ff_gmc_c(dst, src,
stride, h, ox, oy, dxx, dxy, dyx, dyy,
420 emu_edge_fn(edge_buf, stride, src, stride, w + 1, h + 1, ix, iy, width, height);
425 "movd %0, %%mm6 \n\t"
426 "pxor %%mm7, %%mm7 \n\t"
427 "punpcklwd %%mm6, %%mm6 \n\t"
428 "punpcklwd %%mm6, %%mm6 \n\t"
432 for (x = 0; x < w; x += 4) {
433 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
434 oxs - dxys + dxxs * (x + 1),
435 oxs - dxys + dxxs * (x + 2),
436 oxs - dxys + dxxs * (x + 3) };
437 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
438 oys - dyys + dyxs * (x + 1),
439 oys - dyys + dyxs * (x + 2),
440 oys - dyys + dyxs * (x + 3) };
442 for (
y = 0;
y < h;
y++) {
444 "movq %0, %%mm4 \n\t"
445 "movq %1, %%mm5 \n\t"
446 "paddw %2, %%mm4 \n\t"
447 "paddw %3, %%mm5 \n\t"
448 "movq %%mm4, %0 \n\t"
449 "movq %%mm5, %1 \n\t"
450 "psrlw $12, %%mm4 \n\t"
451 "psrlw $12, %%mm5 \n\t"
452 :
"+m"(*dx4),
"+m"(*dy4)
453 :
"m"(*dxy4),
"m"(*dyy4)
457 "movq %%mm6, %%mm2 \n\t"
458 "movq %%mm6, %%mm1 \n\t"
459 "psubw %%mm4, %%mm2 \n\t"
460 "psubw %%mm5, %%mm1 \n\t"
461 "movq %%mm2, %%mm0 \n\t"
462 "movq %%mm4, %%mm3 \n\t"
463 "pmullw %%mm1, %%mm0 \n\t"
464 "pmullw %%mm5, %%mm3 \n\t"
465 "pmullw %%mm5, %%mm2 \n\t"
466 "pmullw %%mm4, %%mm1 \n\t"
468 "movd %4, %%mm5 \n\t"
469 "movd %3, %%mm4 \n\t"
470 "punpcklbw %%mm7, %%mm5 \n\t"
471 "punpcklbw %%mm7, %%mm4 \n\t"
472 "pmullw %%mm5, %%mm3 \n\t"
473 "pmullw %%mm4, %%mm2 \n\t"
475 "movd %2, %%mm5 \n\t"
476 "movd %1, %%mm4 \n\t"
477 "punpcklbw %%mm7, %%mm5 \n\t"
478 "punpcklbw %%mm7, %%mm4 \n\t"
479 "pmullw %%mm5, %%mm1 \n\t"
480 "pmullw %%mm4, %%mm0 \n\t"
481 "paddw %5, %%mm1 \n\t"
482 "paddw %%mm3, %%mm2 \n\t"
483 "paddw %%mm1, %%mm0 \n\t"
484 "paddw %%mm2, %%mm0 \n\t"
486 "psrlw %6, %%mm0 \n\t"
487 "packuswb %%mm0, %%mm0 \n\t"
488 "movd %%mm0, %0 \n\t"
491 :
"m"(src[0]),
"m"(src[1]),
492 "m"(src[stride]),
"m"(src[stride + 1]),
505 int stride,
int h,
int ox,
int oy,
506 int dxx,
int dxy,
int dyx,
int dyy,
507 int shift,
int r,
int width,
int height)
509 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
510 width, height, &ff_emulated_edge_mc_8);
514 int stride,
int h,
int ox,
int oy,
515 int dxx,
int dxy,
int dyx,
int dyy,
516 int shift,
int r,
int width,
int height)
518 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
519 width, height, &ff_emulated_edge_mc_8);
523 int stride,
int h,
int ox,
int oy,
524 int dxx,
int dxy,
int dyx,
int dyy,
525 int shift,
int r,
int width,
int height)
527 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
528 width, height, &ff_emulated_edge_mc_8);
533 #if CONFIG_DIRAC_DECODER
534 #define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
535 void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
538 ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
540 OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
542 void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
545 ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
547 OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
549 void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
552 ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
554 OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
555 OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
560 PIXELS16(
static, ff_avg, , , _mmxext)
561 DIRAC_PIXOP(put, ff_put, mmx)
562 DIRAC_PIXOP(avg, ff_avg, mmx)
566 DIRAC_PIXOP(avg, ff_avg, mmxext)
571 ff_put_dirac_pixels16_c(dst, src, stride, h);
578 ff_avg_dirac_pixels16_c(dst, src, stride, h);
585 ff_put_dirac_pixels32_c(dst, src, stride, h);
594 ff_avg_dirac_pixels32_c(dst, src, stride, h);
604 float min,
float max,
int len)
608 "movss %3, %%xmm4 \n\t"
609 "movss %4, %%xmm5 \n\t"
610 "shufps $0, %%xmm4, %%xmm4 \n\t"
611 "shufps $0, %%xmm5, %%xmm5 \n\t"
613 "movaps (%2, %0), %%xmm0 \n\t"
614 "movaps 16(%2, %0), %%xmm1 \n\t"
615 "movaps 32(%2, %0), %%xmm2 \n\t"
616 "movaps 48(%2, %0), %%xmm3 \n\t"
617 "maxps %%xmm4, %%xmm0 \n\t"
618 "maxps %%xmm4, %%xmm1 \n\t"
619 "maxps %%xmm4, %%xmm2 \n\t"
620 "maxps %%xmm4, %%xmm3 \n\t"
621 "minps %%xmm5, %%xmm0 \n\t"
622 "minps %%xmm5, %%xmm1 \n\t"
623 "minps %%xmm5, %%xmm2 \n\t"
624 "minps %%xmm5, %%xmm3 \n\t"
625 "movaps %%xmm0, (%1, %0) \n\t"
626 "movaps %%xmm1, 16(%1, %0) \n\t"
627 "movaps %%xmm2, 32(%1, %0) \n\t"
628 "movaps %%xmm3, 48(%1, %0) \n\t"
632 :
"r"(dst),
"r"(
src),
"m"(min),
"m"(max)
static int shift(int a, int b)
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
void ff_gmc_sse(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
void ff_clear_blocks_mmx(int16_t *blocks)
void ff_clear_blocks_sse(int16_t *blocks)
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size)
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size)
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
void ff_clear_block_sse(int16_t *block)
void ff_gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
typedef void(RENAME(mix_any_func_type))
BYTE int const BYTE int int int height
void ff_clear_block_mmx(int16_t *block)
#define av_assert1(cond)
assert() equivalent that does not lie in speed-critical code.
static const int shift2[6]
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Core video DSP helper functions.
void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides)
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size)
void ff_vector_clipf_sse(float *dst, const float *src, float min, float max, int len)
void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
#define PIXELS16(STATIC, PFX1, PFX2, TYPE, CPUEXT)