#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH "prefetch"
#define PAVGB    "pavgusb"
#elif COMPILE_TEMPLATE_MMXEXT
#define PREFETCH "prefetchnta"
#define PAVGB    "pavgb"
#else
#define PREFETCH " # nop"
#endif
#if COMPILE_TEMPLATE_AMD3DNOW
/* On K6 femms is faster than emms; on K7 femms maps directly to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif
#if COMPILE_TEMPLATE_MMXEXT
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
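/* The macros above pick the instruction spelling per target CPU: 3DNow! uses
 * prefetch/pavgusb/femms, MMXEXT uses prefetchnta/pavgb with movntq/sfence
 * streaming stores, and the plain-MMX fallback substitutes movq and harmless
 * nops so the same asm templates assemble everywhere. */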
#if !COMPILE_TEMPLATE_SSE2

#if !COMPILE_TEMPLATE_AMD3DNOW
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");

        "movd (%1), %%mm0 \n\t"
        "punpckldq 3(%1), %%mm0 \n\t"
        "movd 6(%1), %%mm1 \n\t"
        "punpckldq 9(%1), %%mm1 \n\t"
        "movd 12(%1), %%mm2 \n\t"
        "punpckldq 15(%1), %%mm2 \n\t"
        "movd 18(%1), %%mm3 \n\t"
        "punpckldq 21(%1), %%mm3 \n\t"
        "por %%mm7, %%mm0 \n\t"
        "por %%mm7, %%mm1 \n\t"
        "por %%mm7, %%mm2 \n\t"
        "por %%mm7, %%mm3 \n\t"

        MOVNTQ" %%mm2, 16(%0) \n\t"

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#define STORE_BGR24_MMX \
    "psrlq $8, %%mm2 \n\t" \
    "psrlq $8, %%mm3 \n\t" \
    "psrlq $8, %%mm6 \n\t" \
    "psrlq $8, %%mm7 \n\t" \
    "pand "MANGLE(mask24l)", %%mm0 \n\t" \
    "pand "MANGLE(mask24l)", %%mm1 \n\t" \
    "pand "MANGLE(mask24l)", %%mm4 \n\t" \
    "pand "MANGLE(mask24l)", %%mm5 \n\t" \
    "pand "MANGLE(mask24h)", %%mm2 \n\t" \
    "pand "MANGLE(mask24h)", %%mm3 \n\t" \
    "pand "MANGLE(mask24h)", %%mm6 \n\t" \
    "pand "MANGLE(mask24h)", %%mm7 \n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm6, %%mm4 \n\t" \
    "por %%mm7, %%mm5 \n\t" \
    \
    "movq %%mm1, %%mm2 \n\t" \
    "movq %%mm4, %%mm3 \n\t" \
    "psllq $48, %%mm2 \n\t" \
    "psllq $32, %%mm3 \n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "psrlq $16, %%mm1 \n\t" \
    "psrlq $32, %%mm4 \n\t" \
    "psllq $16, %%mm5 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm5, %%mm4 \n\t" \
    \
    MOVNTQ" %%mm0, (%0) \n\t" \
    MOVNTQ" %%mm1, 8(%0) \n\t" \
    MOVNTQ" %%mm4, 16(%0)"
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");

        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm1 \n\t"
        "movq 16(%1), %%mm4 \n\t"
        "movq 24(%1), %%mm5 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "movq %%mm4, %%mm6 \n\t"
        "movq %%mm5, %%mm7 \n\t"

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));

        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm2 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "paddw %%mm3, %%mm2 \n\t"

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    register unsigned x= *((const uint32_t *)s);
    *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);

    register unsigned short x= *((const uint16_t *)s);
    *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));

        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "psrlq $1, %%mm0 \n\t"
        "psrlq $1, %%mm2 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm3, %%mm2 \n\t"

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    register uint32_t x= *((const uint32_t*)s);
    *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);

    register uint16_t x= *((const uint16_t*)s);
    *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    uint16_t *d = (uint16_t *)dst;

        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"

        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"

        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    register int rgb = *(const uint32_t*)s; s += 4;
    *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    uint16_t *d = (uint16_t *)dst;

    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");

        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));

        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "psllq $8, %%mm0 \n\t"
        "psllq $8, %%mm3 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm7, %%mm3 \n\t"
        "psrlq $5, %%mm1 \n\t"
        "psrlq $5, %%mm4 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "psrlq $19, %%mm2 \n\t"
        "psrlq $19, %%mm5 \n\t"
        "pand %2, %%mm2 \n\t"
        "pand %2, %%mm5 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm5, %%mm3 \n\t"
        "psllq $16, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"

        ::"r"(d),"r"(s),"m"(blue_16mask):"memory");

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    register int rgb = *(const uint32_t*)s; s += 4;
    *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    uint16_t *d = (uint16_t *)dst;

        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"

        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"

        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    register int rgb = *(const uint32_t*)s; s += 4;
    *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    uint16_t *d = (uint16_t *)dst;

    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");

        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));

        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "psllq $7, %%mm0 \n\t"
        "psllq $7, %%mm3 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm7, %%mm3 \n\t"
        "psrlq $6, %%mm1 \n\t"
        "psrlq $6, %%mm4 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "psrlq $19, %%mm2 \n\t"
        "psrlq $19, %%mm5 \n\t"
        "pand %2, %%mm2 \n\t"
        "pand %2, %%mm5 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm5, %%mm3 \n\t"
        "psllq $16, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"

        ::"r"(d),"r"(s),"m"(blue_15mask):"memory");

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    register int rgb = *(const uint32_t*)s; s += 4;
    *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    uint16_t *d = (uint16_t *)dst;

    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");

        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));

        "movd (%1), %%mm0 \n\t"
        "movd 3(%1), %%mm3 \n\t"
        "punpckldq 6(%1), %%mm0 \n\t"
        "punpckldq 9(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "psrlq $3, %%mm0 \n\t"
        "psrlq $3, %%mm3 \n\t"
        "pand %2, %%mm0 \n\t"
        "pand %2, %%mm3 \n\t"
        "psrlq $5, %%mm1 \n\t"
        "psrlq $5, %%mm4 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "psrlq $8, %%mm2 \n\t"
        "psrlq $8, %%mm5 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "pand %%mm7, %%mm5 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm5, %%mm3 \n\t"
        "psllq $16, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"

        ::"r"(d),"r"(s),"m"(blue_16mask):"memory");

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    uint16_t *d = (uint16_t *)dst;

    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");

        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));

        "movd (%1), %%mm0 \n\t"
        "movd 3(%1), %%mm3 \n\t"
        "punpckldq 6(%1), %%mm0 \n\t"
        "punpckldq 9(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "psllq $8, %%mm0 \n\t"
        "psllq $8, %%mm3 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm7, %%mm3 \n\t"
        "psrlq $5, %%mm1 \n\t"
        "psrlq $5, %%mm4 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "psrlq $19, %%mm2 \n\t"
        "psrlq $19, %%mm5 \n\t"
        "pand %2, %%mm2 \n\t"
        "pand %2, %%mm5 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm5, %%mm3 \n\t"
        "psllq $16, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"

        ::"r"(d),"r"(s),"m"(blue_16mask):"memory");

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    uint16_t *d = (uint16_t *)dst;

    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");

        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));

        "movd (%1), %%mm0 \n\t"
        "movd 3(%1), %%mm3 \n\t"
        "punpckldq 6(%1), %%mm0 \n\t"
        "punpckldq 9(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "psrlq $3, %%mm0 \n\t"
        "psrlq $3, %%mm3 \n\t"
        "pand %2, %%mm0 \n\t"
        "pand %2, %%mm3 \n\t"
        "psrlq $6, %%mm1 \n\t"
        "psrlq $6, %%mm4 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "psrlq $9, %%mm2 \n\t"
        "psrlq $9, %%mm5 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "pand %%mm7, %%mm5 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm5, %%mm3 \n\t"
        "psllq $16, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"

        ::"r"(d),"r"(s),"m"(blue_15mask):"memory");

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    uint16_t *d = (uint16_t *)dst;

    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");

        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));

        "movd (%1), %%mm0 \n\t"
        "movd 3(%1), %%mm3 \n\t"
        "punpckldq 6(%1), %%mm0 \n\t"
        "punpckldq 9(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "psllq $7, %%mm0 \n\t"
        "psllq $7, %%mm3 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm7, %%mm3 \n\t"
        "psrlq $6, %%mm1 \n\t"
        "psrlq $6, %%mm4 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "psrlq $19, %%mm2 \n\t"
        "psrlq $19, %%mm5 \n\t"
        "pand %2, %%mm2 \n\t"
        "pand %2, %%mm5 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm5, %%mm3 \n\t"
        "psllq $16, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"

        ::"r"(d),"r"(s),"m"(blue_15mask):"memory");

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    const uint16_t *mm_end;

    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
725 "movq (%1), %%mm0 \n\t"
726 "movq (%1), %%mm1 \n\t"
727 "movq (%1), %%mm2 \n\t"
728 "pand %2, %%mm0 \n\t"
729 "pand %3, %%mm1 \n\t"
730 "pand %4, %%mm2 \n\t"
731 "psllq $5, %%mm0 \n\t"
732 "pmulhw "MANGLE(mul15_mid)
", %%mm0 \n\t"
733 "pmulhw "MANGLE(mul15_mid)
", %%mm1 \n\t"
734 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t"
735 "movq %%mm0, %%mm3 \n\t"
736 "movq %%mm1, %%mm4 \n\t"
737 "movq %%mm2, %%mm5 \n\t"
738 "punpcklwd %5, %%mm0 \n\t"
739 "punpcklwd %5, %%mm1 \n\t"
740 "punpcklwd %5, %%mm2 \n\t"
741 "punpckhwd %5, %%mm3 \n\t"
742 "punpckhwd %5, %%mm4 \n\t"
743 "punpckhwd %5, %%mm5 \n\t"
744 "psllq $8, %%mm1 \n\t"
745 "psllq $16, %%mm2 \n\t"
746 "por %%mm1, %%mm0 \n\t"
747 "por %%mm2, %%mm0 \n\t"
748 "psllq $8, %%mm4 \n\t"
749 "psllq $16, %%mm5 \n\t"
750 "por %%mm4, %%mm3 \n\t"
751 "por %%mm5, %%mm3 \n\t"
753 "movq %%mm0, %%mm6 \n\t"
754 "movq %%mm3, %%mm7 \n\t"
756 "movq 8(%1), %%mm0 \n\t"
757 "movq 8(%1), %%mm1 \n\t"
758 "movq 8(%1), %%mm2 \n\t"
759 "pand %2, %%mm0 \n\t"
760 "pand %3, %%mm1 \n\t"
761 "pand %4, %%mm2 \n\t"
762 "psllq $5, %%mm0 \n\t"
763 "pmulhw "MANGLE(mul15_mid)
", %%mm0 \n\t"
764 "pmulhw "MANGLE(mul15_mid)
", %%mm1 \n\t"
765 "pmulhw "MANGLE(mul15_hi)
", %%mm2 \n\t"
766 "movq %%mm0, %%mm3 \n\t"
767 "movq %%mm1, %%mm4 \n\t"
768 "movq %%mm2, %%mm5 \n\t"
769 "punpcklwd %5, %%mm0 \n\t"
770 "punpcklwd %5, %%mm1 \n\t"
771 "punpcklwd %5, %%mm2 \n\t"
772 "punpckhwd %5, %%mm3 \n\t"
773 "punpckhwd %5, %%mm4 \n\t"
774 "punpckhwd %5, %%mm5 \n\t"
775 "psllq $8, %%mm1 \n\t"
776 "psllq $16, %%mm2 \n\t"
777 "por %%mm1, %%mm0 \n\t"
778 "por %%mm2, %%mm0 \n\t"
779 "psllq $8, %%mm4 \n\t"
780 "psllq $16, %%mm5 \n\t"
781 "por %%mm4, %%mm3 \n\t"
782 "por %%mm5, %%mm3 \n\t"
785 :
"r"(s),
"m"(mask15b),
"m"(mask15g),
"m"(mask15r),
"m"(mmx_null)
789 "movq %%mm0, %%mm4 \n\t"
790 "movq %%mm3, %%mm5 \n\t"
791 "movq %%mm6, %%mm0 \n\t"
792 "movq %%mm7, %%mm1 \n\t"
794 "movq %%mm4, %%mm6 \n\t"
795 "movq %%mm5, %%mm7 \n\t"
796 "movq %%mm0, %%mm2 \n\t"
797 "movq %%mm1, %%mm3 \n\t"
806 __asm__
volatile(
SFENCE:::
"memory");
807 __asm__
volatile(
EMMS:::
"memory");
    register uint16_t bgr;
    *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
    *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
    *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
    const uint16_t *mm_end;

    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");

        "movq (%1), %%mm0 \n\t"
        "movq (%1), %%mm1 \n\t"
        "movq (%1), %%mm2 \n\t"
        "pand %2, %%mm0 \n\t"
        "pand %3, %%mm1 \n\t"
        "pand %4, %%mm2 \n\t"
        "psllq $5, %%mm0 \n\t"
        "psrlq $1, %%mm2 \n\t"
        "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
        "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
        "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
        "movq %%mm0, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm2, %%mm5 \n\t"
        "punpcklwd %5, %%mm0 \n\t"
        "punpcklwd %5, %%mm1 \n\t"
        "punpcklwd %5, %%mm2 \n\t"
        "punpckhwd %5, %%mm3 \n\t"
        "punpckhwd %5, %%mm4 \n\t"
        "punpckhwd %5, %%mm5 \n\t"
        "psllq $8, %%mm1 \n\t"
        "psllq $16, %%mm2 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "psllq $8, %%mm4 \n\t"
        "psllq $16, %%mm5 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm5, %%mm3 \n\t"

        "movq %%mm0, %%mm6 \n\t"
        "movq %%mm3, %%mm7 \n\t"

        "movq 8(%1), %%mm0 \n\t"
        "movq 8(%1), %%mm1 \n\t"
        "movq 8(%1), %%mm2 \n\t"
        "pand %2, %%mm0 \n\t"
        "pand %3, %%mm1 \n\t"
        "pand %4, %%mm2 \n\t"
        "psllq $5, %%mm0 \n\t"
        "psrlq $1, %%mm2 \n\t"
        "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
        "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
        "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
        "movq %%mm0, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm2, %%mm5 \n\t"
        "punpcklwd %5, %%mm0 \n\t"
        "punpcklwd %5, %%mm1 \n\t"
        "punpcklwd %5, %%mm2 \n\t"
        "punpckhwd %5, %%mm3 \n\t"
        "punpckhwd %5, %%mm4 \n\t"
        "punpckhwd %5, %%mm5 \n\t"
        "psllq $8, %%mm1 \n\t"
        "psllq $16, %%mm2 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "psllq $8, %%mm4 \n\t"
        "psllq $16, %%mm5 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "por %%mm5, %%mm3 \n\t"

        : "r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)

        "movq %%mm0, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "movq %%mm6, %%mm0 \n\t"
        "movq %%mm7, %%mm1 \n\t"

        "movq %%mm4, %%mm6 \n\t"
        "movq %%mm5, %%mm7 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    register uint16_t bgr;
    *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
    *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
    *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
930 "packuswb %%mm7, %%mm0 \n\t" \
931 "packuswb %%mm7, %%mm1 \n\t" \
932 "packuswb %%mm7, %%mm2 \n\t" \
933 "punpcklbw %%mm1, %%mm0 \n\t" \
934 "punpcklbw %%mm6, %%mm2 \n\t" \
935 "movq %%mm0, %%mm3 \n\t" \
936 "punpcklwd %%mm2, %%mm0 \n\t" \
937 "punpckhwd %%mm2, %%mm3 \n\t" \
938 MOVNTQ" %%mm0, (%0) \n\t" \
939 MOVNTQ" %%mm3, 8(%0) \n\t" \
    const uint16_t *mm_end;

    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");

        "movq (%1), %%mm0 \n\t"
        "movq (%1), %%mm1 \n\t"
        "movq (%1), %%mm2 \n\t"
        "pand %2, %%mm0 \n\t"
        "pand %3, %%mm1 \n\t"
        "pand %4, %%mm2 \n\t"
        "psllq $5, %%mm0 \n\t"
        "pmulhw %5, %%mm0 \n\t"
        "pmulhw %5, %%mm1 \n\t"
        "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"

        ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r),"m"(mul15_mid)

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    register uint16_t bgr;
    *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
    *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
    *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
    const uint16_t *mm_end;

    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");

        "movq (%1), %%mm0 \n\t"
        "movq (%1), %%mm1 \n\t"
        "movq (%1), %%mm2 \n\t"
        "pand %2, %%mm0 \n\t"
        "pand %3, %%mm1 \n\t"
        "pand %4, %%mm2 \n\t"
        "psllq $5, %%mm0 \n\t"
        "psrlq $1, %%mm2 \n\t"
        "pmulhw %5, %%mm0 \n\t"
        "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
        "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"

        ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    register uint16_t bgr;
    *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
    *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
    *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
1035 "movq %3, %%mm7 \n\t"
1036 "pxor %4, %%mm7 \n\t"
1037 "movq %%mm7, %%mm6 \n\t"
1038 "pxor %5, %%mm7 \n\t"
1042 "movq (%1, %0), %%mm0 \n\t"
1043 "movq 8(%1, %0), %%mm1 \n\t"
1044 # if COMPILE_TEMPLATE_MMXEXT
1045 "pshufw $177, %%mm0, %%mm3 \n\t"
1046 "pshufw $177, %%mm1, %%mm5 \n\t"
1047 "pand %%mm7, %%mm0 \n\t"
1048 "pand %%mm6, %%mm3 \n\t"
1049 "pand %%mm7, %%mm1 \n\t"
1050 "pand %%mm6, %%mm5 \n\t"
1051 "por %%mm3, %%mm0 \n\t"
1052 "por %%mm5, %%mm1 \n\t"
1054 "movq %%mm0, %%mm2 \n\t"
1055 "movq %%mm1, %%mm4 \n\t"
1056 "pand %%mm7, %%mm0 \n\t"
1057 "pand %%mm6, %%mm2 \n\t"
1058 "pand %%mm7, %%mm1 \n\t"
1059 "pand %%mm6, %%mm4 \n\t"
1060 "movq %%mm2, %%mm3 \n\t"
1061 "movq %%mm4, %%mm5 \n\t"
1062 "pslld $16, %%mm2 \n\t"
1063 "psrld $16, %%mm3 \n\t"
1064 "pslld $16, %%mm4 \n\t"
1065 "psrld $16, %%mm5 \n\t"
1066 "por %%mm2, %%mm0 \n\t"
1067 "por %%mm4, %%mm1 \n\t"
1068 "por %%mm3, %%mm0 \n\t"
1069 "por %%mm5, %%mm1 \n\t"
1071 MOVNTQ" %%mm0, (%2, %0) \n\t"
1072 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1079 :
"r" (s),
"r" (d),
"m" (mask32b),
"m" (mask32r),
"m" (mmx_one)
1081 for (; idx<15; idx+=4) {
1082 register int v = *(
const uint32_t *)&s[idx],
g = v & 0xff00ff00;
1084 *(uint32_t *)&d[idx] = (v>>16) +
g + (v<<16);
    x86_reg mmx_size= 23 - src_size;

        "test %%"REG_a", %%"REG_a" \n\t"

        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"

        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        "movq (%1, %%"REG_a"), %%mm1 \n\t"
        "movq 2(%1, %%"REG_a"), %%mm2 \n\t"
        "psllq $16, %%mm0 \n\t"
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%"REG_a"), %%mm0 \n\t"
        MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t"
        "movq 8(%1, %%"REG_a"), %%mm1 \n\t"
        "movq 10(%1, %%"REG_a"), %%mm2 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%"REG_a"), %%mm0 \n\t"
        MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t"
        "movq 16(%1, %%"REG_a"), %%mm1 \n\t"
        "movq 18(%1, %%"REG_a"), %%mm2 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
        "add $24, %%"REG_a" \n\t"

        : "r" (src-mmx_size), "r"(dst-mmx_size)

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size==23) return;

    src_size= 23-mmx_size;

    for (i=0; i<src_size; i+=3) {
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
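/* Interleaves planar Y/U/V into packed YUYV: each iteration zips 8 U and 8 V
 * bytes into UVUV... pairs, zips those against 16 Y bytes, and streams out
 * four quadwords (16 pixels) with MOVNTQ. vertLumPerChroma is the number of
 * luma rows sharing one chroma row (2 for 4:2:0 input, 1 for 4:2:2). */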
    for (y=0; y<height; y++) {

        "xor %%"REG_a", %%"REG_a" \n\t"

        PREFETCH" 32(%1, %%"REG_a", 2) \n\t"

        "movq (%2, %%"REG_a"), %%mm0 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq (%3, %%"REG_a"), %%mm1 \n\t"
        "punpcklbw %%mm1, %%mm0 \n\t"
        "punpckhbw %%mm1, %%mm2 \n\t"

        "movq (%1, %%"REG_a",2), %%mm3 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm5 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "movq %%mm5, %%mm6 \n\t"
        "punpcklbw %%mm0, %%mm3 \n\t"
        "punpckhbw %%mm0, %%mm4 \n\t"
        "punpcklbw %%mm2, %%mm5 \n\t"
        "punpckhbw %%mm2, %%mm6 \n\t"

        MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

        "add $8, %%"REG_a" \n\t"
        "cmp %4, %%"REG_a" \n\t"

        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)

        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        int lumStride, int chromStride, int dstStride)

        int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
    for (y=0; y<height; y++) {

        "xor %%"REG_a", %%"REG_a" \n\t"

        PREFETCH" 32(%1, %%"REG_a", 2) \n\t"

        "movq (%2, %%"REG_a"), %%mm0 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq (%3, %%"REG_a"), %%mm1 \n\t"
        "punpcklbw %%mm1, %%mm0 \n\t"
        "punpckhbw %%mm1, %%mm2 \n\t"

        "movq (%1, %%"REG_a",2), %%mm3 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm5 \n\t"
        "movq %%mm0, %%mm4 \n\t"
        "movq %%mm2, %%mm6 \n\t"
        "punpcklbw %%mm3, %%mm0 \n\t"
        "punpckhbw %%mm3, %%mm4 \n\t"
        "punpcklbw %%mm5, %%mm2 \n\t"
        "punpckhbw %%mm5, %%mm6 \n\t"

        MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

        "add $8, %%"REG_a" \n\t"
        "cmp %4, %%"REG_a" \n\t"

        ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)

        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        int lumStride, int chromStride, int dstStride)

        int lumStride, int chromStride, int dstStride)

        int lumStride, int chromStride, int dstStride)
        int lumStride, int chromStride, int srcStride)
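/* Splits packed YUY2 back into planes, two rows per pass: mm7 holds 0x00FF in
 * every word, so pand keeps the luma bytes while psrlw $8 exposes the chroma
 * bytes, and packuswb compacts each to 8 bytes. The second row below reuses
 * only the luma path, since chroma was already taken from the first row. */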
    for (y=0; y<height; y+=2) {

        "xor %%"REG_a", %%"REG_a" \n\t"
        "pcmpeqw %%mm7, %%mm7 \n\t"
        "psrlw $8, %%mm7 \n\t"

        PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
        "movq (%0, %%"REG_a", 4), %%mm0 \n\t"
        "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "pand %%mm7, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"

        MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"

        "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t"
        "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "movq %%mm2, %%mm4 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "pand %%mm7, %%mm3 \n\t"
        "pand %%mm7, %%mm4 \n\t"
        "packuswb %%mm2, %%mm1 \n\t"
        "packuswb %%mm4, %%mm3 \n\t"

        MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"

        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "pand %%mm7, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"

        MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
        MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"

        "add $8, %%"REG_a" \n\t"
        "cmp %4, %%"REG_a" \n\t"

        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
        : "memory", "%"REG_a

        "xor %%"REG_a", %%"REG_a" \n\t"

        PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
        "movq (%0, %%"REG_a", 4), %%mm0 \n\t"
        "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t"
        "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t"
        "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "pand %%mm7, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"

        MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
        MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"

        "add $8, %%"REG_a" \n\t"
        "cmp %4, %%"REG_a" \n\t"

        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
        : "memory", "%"REG_a

        udst += chromStride;
        vdst += chromStride;
    __asm__ volatile(EMMS"     \n\t"
                     SFENCE"   \n\t"
                     :::"memory");
#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW

    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;

    dst[2*srcWidth-1]= src[srcWidth-1];

    for (y=1; y<srcHeight; y++) {
        const x86_reg mmxSize= srcWidth&~15;
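/* 2x upscale: applying PAVGB twice, pavgb(a, pavgb(a, b)), approximates the
 * (3*a + b) >> 2 weighting of the scalar code, so every output pixel is
 * roughly a 3:1 blend of its nearest and next-nearest source pixels. */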
1427 "mov %4, %%"REG_a
" \n\t"
1428 "movq "MANGLE(mmx_ff)
", %%mm0 \n\t"
1429 "movq (%0, %%"REG_a
"), %%mm4 \n\t"
1430 "movq %%mm4, %%mm2 \n\t"
1431 "psllq $8, %%mm4 \n\t"
1432 "pand %%mm0, %%mm2 \n\t"
1433 "por %%mm2, %%mm4 \n\t"
1434 "movq (%1, %%"REG_a
"), %%mm5 \n\t"
1435 "movq %%mm5, %%mm3 \n\t"
1436 "psllq $8, %%mm5 \n\t"
1437 "pand %%mm0, %%mm3 \n\t"
1438 "por %%mm3, %%mm5 \n\t"
1440 "movq (%0, %%"REG_a
"), %%mm0 \n\t"
1441 "movq (%1, %%"REG_a
"), %%mm1 \n\t"
1442 "movq 1(%0, %%"REG_a
"), %%mm2 \n\t"
1443 "movq 1(%1, %%"REG_a
"), %%mm3 \n\t"
1444 PAVGB" %%mm0, %%mm5 \n\t"
1445 PAVGB" %%mm0, %%mm3 \n\t"
1446 PAVGB" %%mm0, %%mm5 \n\t"
1447 PAVGB" %%mm0, %%mm3 \n\t"
1448 PAVGB" %%mm1, %%mm4 \n\t"
1449 PAVGB" %%mm1, %%mm2 \n\t"
1450 PAVGB" %%mm1, %%mm4 \n\t"
1451 PAVGB" %%mm1, %%mm2 \n\t"
1452 "movq %%mm5, %%mm7 \n\t"
1453 "movq %%mm4, %%mm6 \n\t"
1454 "punpcklbw %%mm3, %%mm5 \n\t"
1455 "punpckhbw %%mm3, %%mm7 \n\t"
1456 "punpcklbw %%mm2, %%mm4 \n\t"
1457 "punpckhbw %%mm2, %%mm6 \n\t"
1458 MOVNTQ" %%mm5, (%2, %%"REG_a
", 2) \n\t"
1459 MOVNTQ" %%mm7, 8(%2, %%"REG_a
", 2) \n\t"
1460 MOVNTQ" %%mm4, (%3, %%"REG_a
", 2) \n\t"
1461 MOVNTQ" %%mm6, 8(%3, %%"REG_a
", 2) \n\t"
1462 "add $8, %%"REG_a
" \n\t"
1463 "movq -1(%0, %%"REG_a
"), %%mm4 \n\t"
1464 "movq -1(%1, %%"REG_a
"), %%mm5 \n\t"
1466 ::
"r" (src + mmxSize ),
"r" (src + srcStride + mmxSize ),
1467 "r" (
dst + mmxSize*2),
"r" (
dst + dstStride + mmxSize*2),
1472 for (x=mmxSize-1; x<srcWidth-1; x++) {
1473 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1474 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1475 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1476 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1478 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1479 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;

    dst[2*srcWidth-1]= src[srcWidth-1];
    __asm__ volatile(EMMS"     \n\t"
                     SFENCE"   \n\t"
                     :::"memory");
#if !COMPILE_TEMPLATE_AMD3DNOW
        int lumStride, int chromStride, int srcStride)

    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2) {

        "xor %%"REG_a", %%"REG_a" \n\t"
        "pcmpeqw %%mm7, %%mm7 \n\t"
        "psrlw $8, %%mm7 \n\t"

        PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
        "movq (%0, %%"REG_a", 4), %%mm0 \n\t"
        "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"

        MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"

        "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t"
        "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "movq %%mm2, %%mm4 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "psrlw $8, %%mm4 \n\t"
        "packuswb %%mm2, %%mm1 \n\t"
        "packuswb %%mm4, %%mm3 \n\t"

        MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"

        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "pand %%mm7, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"

        MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
        MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"

        "add $8, %%"REG_a" \n\t"
        "cmp %4, %%"REG_a" \n\t"

        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
        : "memory", "%"REG_a

        "xor %%"REG_a", %%"REG_a" \n\t"

        PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
        "movq (%0, %%"REG_a", 4), %%mm0 \n\t"
        "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t"
        "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t"
        "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"

        MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
        MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"

        "add $8, %%"REG_a" \n\t"
        "cmp %4, %%"REG_a" \n\t"

        ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
        : "memory", "%"REG_a

        udst += chromStride;
        vdst += chromStride;
    __asm__ volatile(EMMS"     \n\t"
                     SFENCE"   \n\t"
                     :::"memory");
        int lumStride, int chromStride, int srcStride)

    const x86_reg chromWidth= width>>1;
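/* RGB24 -> YV12: groups of three source bytes are unpacked to words and run
 * through pmaddwd with the ff_bgr2YCoeff/ff_bgr2UCoeff/ff_bgr2VCoeff vectors,
 * summed via ff_w1111 and scaled back with psraw $7 before the bias offsets
 * are added; chroma is averaged over a 2x2 block of two input rows (the
 * PAVGB path on MMXEXT/3DNow!, the paddw + psrlw $2 path otherwise). */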
    for (y=0; y<height-2; y+=2) {

        for (i=0; i<2; i++) {

        "mov %2, %%"REG_a" \n\t"
        "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
        "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"

        "movd (%0, %%"REG_d"), %%mm0 \n\t"
        "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
        "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm6, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm6, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
        "psrad $8, %%mm0 \n\t"
        "psrad $8, %%mm1 \n\t"
        "psrad $8, %%mm2 \n\t"
        "psrad $8, %%mm3 \n\t"
#endif
        "packssdw %%mm1, %%mm0 \n\t"
        "packssdw %%mm3, %%mm2 \n\t"
        "pmaddwd %%mm5, %%mm0 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "packssdw %%mm2, %%mm0 \n\t"
        "psraw $7, %%mm0 \n\t"

        "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
        "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
        "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm6, %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm6, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
        "psrad $8, %%mm4 \n\t"
        "psrad $8, %%mm1 \n\t"
        "psrad $8, %%mm2 \n\t"
        "psrad $8, %%mm3 \n\t"
#endif
        "packssdw %%mm1, %%mm4 \n\t"
        "packssdw %%mm3, %%mm2 \n\t"
        "pmaddwd %%mm5, %%mm4 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "add $24, %%"REG_d" \n\t"
        "packssdw %%mm2, %%mm4 \n\t"
        "psraw $7, %%mm4 \n\t"

        "packuswb %%mm4, %%mm0 \n\t"
        "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"

        MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"

        : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
        : "%"REG_a, "%"REG_d

        "mov %4, %%"REG_a" \n\t"
        "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
        "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
        "add %%"REG_d", %%"REG_d" \n\t"

#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
        "movq (%0, %%"REG_d"), %%mm0 \n\t"
        "movq (%1, %%"REG_d"), %%mm1 \n\t"
        "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
        "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"
        PAVGB" %%mm3, %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "psrlq $24, %%mm0 \n\t"
        "psrlq $24, %%mm2 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"
        PAVGB" %%mm3, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
#else
        "movd (%0, %%"REG_d"), %%mm0 \n\t"
        "movd (%1, %%"REG_d"), %%mm1 \n\t"
        "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
        "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
        "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
        "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
        "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "paddw %%mm1, %%mm4 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        "paddw %%mm4, %%mm2 \n\t"
        "psrlw $2, %%mm0 \n\t"
        "psrlw $2, %%mm2 \n\t"
#endif
        "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
        "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"

        "pmaddwd %%mm0, %%mm1 \n\t"
        "pmaddwd %%mm2, %%mm3 \n\t"
        "pmaddwd %%mm6, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
        "psrad $8, %%mm0 \n\t"
        "psrad $8, %%mm1 \n\t"
        "psrad $8, %%mm2 \n\t"
        "psrad $8, %%mm3 \n\t"
#endif
        "packssdw %%mm2, %%mm0 \n\t"
        "packssdw %%mm3, %%mm1 \n\t"
        "pmaddwd %%mm5, %%mm0 \n\t"
        "pmaddwd %%mm5, %%mm1 \n\t"
        "packssdw %%mm1, %%mm0 \n\t"
        "psraw $7, %%mm0 \n\t"

#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
        "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
        "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
        "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
        "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
        PAVGB" %%mm1, %%mm4 \n\t"
        PAVGB" %%mm3, %%mm2 \n\t"
        "movq %%mm4, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "psrlq $24, %%mm4 \n\t"
        "psrlq $24, %%mm2 \n\t"
        PAVGB" %%mm1, %%mm4 \n\t"
        PAVGB" %%mm3, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
#else
        "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
        "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
        "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
        "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "paddw %%mm1, %%mm4 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        "paddw %%mm2, %%mm4 \n\t"
        "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
        "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
        "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
        "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "paddw %%mm1, %%mm5 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        "paddw %%mm5, %%mm2 \n\t"
        "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
        "psrlw $2, %%mm4 \n\t"
        "psrlw $2, %%mm2 \n\t"
#endif
        "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
        "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"

        "pmaddwd %%mm4, %%mm1 \n\t"
        "pmaddwd %%mm2, %%mm3 \n\t"
        "pmaddwd %%mm6, %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
        "psrad $8, %%mm4 \n\t"
        "psrad $8, %%mm1 \n\t"
        "psrad $8, %%mm2 \n\t"
        "psrad $8, %%mm3 \n\t"
#endif
        "packssdw %%mm2, %%mm4 \n\t"
        "packssdw %%mm3, %%mm1 \n\t"
        "pmaddwd %%mm5, %%mm4 \n\t"
        "pmaddwd %%mm5, %%mm1 \n\t"
        "add $24, %%"REG_d" \n\t"
        "packssdw %%mm1, %%mm4 \n\t"
        "psraw $7, %%mm4 \n\t"

        "movq %%mm0, %%mm1 \n\t"
        "punpckldq %%mm4, %%mm0 \n\t"
        "punpckhdq %%mm4, %%mm1 \n\t"
        "packsswb %%mm1, %%mm0 \n\t"
        "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
        "movd %%mm0, (%2, %%"REG_a") \n\t"
        "punpckhdq %%mm0, %%mm0 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"

        : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
        : "%"REG_a, "%"REG_d

        udst += chromStride;
        vdst += chromStride;
    __asm__ volatile(EMMS"     \n\t"
                     SFENCE"   \n\t"
                     :::"memory");
    rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
#if !COMPILE_TEMPLATE_AMD3DNOW

        int src2Stride, int dstStride)
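/* Interleaves two byte planes (dest[2*i]=src1[i], dest[2*i+1]=src2[i]): the
 * SSE2 path zips 16 bytes per step with punpcklbw/punpckhbw and streams 32
 * bytes out via movntdq, the MMX path does the same 8 bytes at a time with
 * MOVNTQ, and the scalar loop finishes the last width & 15 elements. */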
    for (h=0; h < height; h++) {

#if COMPILE_TEMPLATE_SSE2

        "xor %%"REG_a", %%"REG_a" \n\t"

        "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
        "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
        "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
        "punpcklbw %%xmm2, %%xmm0 \n\t"
        "punpckhbw %%xmm2, %%xmm1 \n\t"
        "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
        "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
        "add $16, %%"REG_a" \n\t"
        "cmp %3, %%"REG_a" \n\t"

        ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
        : "memory", "%"REG_a

        "xor %%"REG_a", %%"REG_a" \n\t"

        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq (%2, %%"REG_a"), %%mm4 \n\t"
        "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
        "punpcklbw %%mm4, %%mm0 \n\t"
        "punpckhbw %%mm4, %%mm1 \n\t"
        "punpcklbw %%mm5, %%mm2 \n\t"
        "punpckhbw %%mm5, %%mm3 \n\t"
        MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
        MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
        MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
        MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
        "add $16, %%"REG_a" \n\t"
        "cmp %3, %%"REG_a" \n\t"

        ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
        : "memory", "%"REG_a

        for (w= (width&(~15)); w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW

        int srcStride1, int srcStride2,
        int dstStride1, int dstStride2)

    w=width/2; h=height/2;

        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");

    const uint8_t* s1=src1+srcStride1*(y>>1);

    for (;x<w-31;x+=32) {
1951 "movq (%1,%2), %%mm0 \n\t"
1952 "movq 8(%1,%2), %%mm2 \n\t"
1953 "movq 16(%1,%2), %%mm4 \n\t"
1954 "movq 24(%1,%2), %%mm6 \n\t"
1955 "movq %%mm0, %%mm1 \n\t"
1956 "movq %%mm2, %%mm3 \n\t"
1957 "movq %%mm4, %%mm5 \n\t"
1958 "movq %%mm6, %%mm7 \n\t"
1959 "punpcklbw %%mm0, %%mm0 \n\t"
1960 "punpckhbw %%mm1, %%mm1 \n\t"
1961 "punpcklbw %%mm2, %%mm2 \n\t"
1962 "punpckhbw %%mm3, %%mm3 \n\t"
1963 "punpcklbw %%mm4, %%mm4 \n\t"
1964 "punpckhbw %%mm5, %%mm5 \n\t"
1965 "punpcklbw %%mm6, %%mm6 \n\t"
1966 "punpckhbw %%mm7, %%mm7 \n\t"
1967 MOVNTQ" %%mm0, (%0,%2,2) \n\t"
1968 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
1969 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
1970 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
1971 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
1972 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
1973 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
1974 MOVNTQ" %%mm7, 56(%0,%2,2)"
1975 ::
"r"(d),
"r"(s1),
"r"(x)
1978 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    const uint8_t* s2=src2+srcStride2*(y>>1);

    for (;x<w-31;x+=32) {

        "movq (%1,%2), %%mm0 \n\t"
        "movq 8(%1,%2), %%mm2 \n\t"
        "movq 16(%1,%2), %%mm4 \n\t"
        "movq 24(%1,%2), %%mm6 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "movq %%mm6, %%mm7 \n\t"
        "punpcklbw %%mm0, %%mm0 \n\t"
        "punpckhbw %%mm1, %%mm1 \n\t"
        "punpcklbw %%mm2, %%mm2 \n\t"
        "punpckhbw %%mm3, %%mm3 \n\t"
        "punpcklbw %%mm4, %%mm4 \n\t"
        "punpckhbw %%mm5, %%mm5 \n\t"
        "punpcklbw %%mm6, %%mm6 \n\t"
        "punpckhbw %%mm7, %%mm7 \n\t"
        MOVNTQ" %%mm0, (%0,%2,2) \n\t"
        MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
        MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
        MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
        MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
        MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
        MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
        MOVNTQ" %%mm7, 56(%0,%2,2)"
        ::"r"(d),"r"(s2),"r"(x)

    for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
        int srcStride1, int srcStride2,
        int srcStride3, int dstStride)

        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
2043 "movq (%1, %0, 4), %%mm0 \n\t"
2044 "movq (%2, %0), %%mm1 \n\t"
2045 "movq (%3, %0), %%mm2 \n\t"
2046 "movq %%mm0, %%mm3 \n\t"
2047 "movq %%mm1, %%mm4 \n\t"
2048 "movq %%mm2, %%mm5 \n\t"
2049 "punpcklbw %%mm1, %%mm1 \n\t"
2050 "punpcklbw %%mm2, %%mm2 \n\t"
2051 "punpckhbw %%mm4, %%mm4 \n\t"
2052 "punpckhbw %%mm5, %%mm5 \n\t"
2054 "movq %%mm1, %%mm6 \n\t"
2055 "punpcklbw %%mm2, %%mm1 \n\t"
2056 "punpcklbw %%mm1, %%mm0 \n\t"
2057 "punpckhbw %%mm1, %%mm3 \n\t"
2058 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2059 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2061 "punpckhbw %%mm2, %%mm6 \n\t"
2062 "movq 8(%1, %0, 4), %%mm0 \n\t"
2063 "movq %%mm0, %%mm3 \n\t"
2064 "punpcklbw %%mm6, %%mm0 \n\t"
2065 "punpckhbw %%mm6, %%mm3 \n\t"
2066 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2067 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2069 "movq %%mm4, %%mm6 \n\t"
2070 "movq 16(%1, %0, 4), %%mm0 \n\t"
2071 "movq %%mm0, %%mm3 \n\t"
2072 "punpcklbw %%mm5, %%mm4 \n\t"
2073 "punpcklbw %%mm4, %%mm0 \n\t"
2074 "punpckhbw %%mm4, %%mm3 \n\t"
2075 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2076 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2078 "punpckhbw %%mm5, %%mm6 \n\t"
2079 "movq 24(%1, %0, 4), %%mm0 \n\t"
2080 "movq %%mm0, %%mm3 \n\t"
2081 "punpcklbw %%mm6, %%mm0 \n\t"
2082 "punpckhbw %%mm6, %%mm3 \n\t"
2083 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2084 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2087 :
"r"(yp),
"r" (up),
"r"(vp),
"r"(d)
2091 const int x2 = x<<2;
2094 d[8*x+2] = yp[x2+1];
2096 d[8*x+4] = yp[x2+2];
2098 d[8*x+6] = yp[x2+3];
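/* The extract_even/odd helpers below mask with mm7 = 0x00FF per word and use
 * packuswb to halve each quadword; the negative displacements (-30, -22, ...)
 * appear to index back from pointers advanced past the end, letting the loop
 * counter run upward toward zero. */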
2119 "pcmpeqw %%mm7, %%mm7 \n\t"
2120 "psrlw $8, %%mm7 \n\t"
2122 "movq -30(%1, %0, 2), %%mm0 \n\t"
2123 "movq -22(%1, %0, 2), %%mm1 \n\t"
2124 "movq -14(%1, %0, 2), %%mm2 \n\t"
2125 "movq -6(%1, %0, 2), %%mm3 \n\t"
2126 "pand %%mm7, %%mm0 \n\t"
2127 "pand %%mm7, %%mm1 \n\t"
2128 "pand %%mm7, %%mm2 \n\t"
2129 "pand %%mm7, %%mm3 \n\t"
2130 "packuswb %%mm1, %%mm0 \n\t"
2131 "packuswb %%mm3, %%mm2 \n\t"
2132 MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2133 MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2137 :
"r"(src),
"r"(
dst)
2142 dst[count]= src[2*count];
#if !COMPILE_TEMPLATE_AMD3DNOW

        "pcmpeqw %%mm7, %%mm7 \n\t"
        "psrlw $8, %%mm7 \n\t"

        "movq -28(%1, %0, 4), %%mm0 \n\t"
        "movq -20(%1, %0, 4), %%mm1 \n\t"
        "movq -12(%1, %0, 4), %%mm2 \n\t"
        "movq -4(%1, %0, 4), %%mm3 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "pand %%mm7, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm7, %%mm3 \n\t"
        "packuswb %%mm2, %%mm0 \n\t"
        "packuswb %%mm3, %%mm1 \n\t"
        MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
        MOVNTQ" %%mm1,- 7(%2, %0) \n\t"

        : "r"(src), "r"(dst0), "r"(dst1)

        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
2206 "pcmpeqw %%mm7, %%mm7 \n\t"
2207 "psrlw $8, %%mm7 \n\t"
2209 "movq -28(%1, %0, 4), %%mm0 \n\t"
2210 "movq -20(%1, %0, 4), %%mm1 \n\t"
2211 "movq -12(%1, %0, 4), %%mm2 \n\t"
2212 "movq -4(%1, %0, 4), %%mm3 \n\t"
2213 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2214 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2215 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2216 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2217 "pand %%mm7, %%mm0 \n\t"
2218 "pand %%mm7, %%mm1 \n\t"
2219 "pand %%mm7, %%mm2 \n\t"
2220 "pand %%mm7, %%mm3 \n\t"
2221 "packuswb %%mm1, %%mm0 \n\t"
2222 "packuswb %%mm3, %%mm2 \n\t"
2223 "movq %%mm0, %%mm1 \n\t"
2224 "movq %%mm2, %%mm3 \n\t"
2225 "psrlw $8, %%mm0 \n\t"
2226 "psrlw $8, %%mm2 \n\t"
2227 "pand %%mm7, %%mm1 \n\t"
2228 "pand %%mm7, %%mm3 \n\t"
2229 "packuswb %%mm2, %%mm0 \n\t"
2230 "packuswb %%mm3, %%mm1 \n\t"
2231 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2232 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2236 :
"r"(src0),
"r"(src1),
"r"(dst0),
"r"(dst1)
2242 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2243 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
#if !COMPILE_TEMPLATE_AMD3DNOW

        "pcmpeqw %%mm7, %%mm7 \n\t"
        "psrlw $8, %%mm7 \n\t"

        "movq -28(%1, %0, 4), %%mm0 \n\t"
        "movq -20(%1, %0, 4), %%mm1 \n\t"
        "movq -12(%1, %0, 4), %%mm2 \n\t"
        "movq -4(%1, %0, 4), %%mm3 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm7, %%mm3 \n\t"
        "packuswb %%mm2, %%mm0 \n\t"
        "packuswb %%mm3, %%mm1 \n\t"
        MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
        MOVNTQ" %%mm1,- 7(%2, %0) \n\t"

        : "r"(src), "r"(dst0), "r"(dst1)

        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
2308 "pcmpeqw %%mm7, %%mm7 \n\t"
2309 "psrlw $8, %%mm7 \n\t"
2311 "movq -28(%1, %0, 4), %%mm0 \n\t"
2312 "movq -20(%1, %0, 4), %%mm1 \n\t"
2313 "movq -12(%1, %0, 4), %%mm2 \n\t"
2314 "movq -4(%1, %0, 4), %%mm3 \n\t"
2315 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2316 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2317 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2318 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2319 "psrlw $8, %%mm0 \n\t"
2320 "psrlw $8, %%mm1 \n\t"
2321 "psrlw $8, %%mm2 \n\t"
2322 "psrlw $8, %%mm3 \n\t"
2323 "packuswb %%mm1, %%mm0 \n\t"
2324 "packuswb %%mm3, %%mm2 \n\t"
2325 "movq %%mm0, %%mm1 \n\t"
2326 "movq %%mm2, %%mm3 \n\t"
2327 "psrlw $8, %%mm0 \n\t"
2328 "psrlw $8, %%mm2 \n\t"
2329 "pand %%mm7, %%mm1 \n\t"
2330 "pand %%mm7, %%mm3 \n\t"
2331 "packuswb %%mm2, %%mm0 \n\t"
2332 "packuswb %%mm3, %%mm1 \n\t"
2333 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2334 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2338 :
"r"(src0),
"r"(src1),
"r"(dst0),
"r"(dst1)
2346 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2347 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        int lumStride, int chromStride, int srcStride)

    const int chromWidth= -((-width)>>1);

    for (y=0; y<height; y++) {
#if !COMPILE_TEMPLATE_AMD3DNOW

        int lumStride, int chromStride, int srcStride)

    const int chromWidth= -((-width)>>1);

    for (y=0; y<height; y++) {
        int lumStride, int chromStride, int srcStride)

    const int chromWidth= -((-width)>>1);

    for (y=0; y<height; y++) {
#if !COMPILE_TEMPLATE_AMD3DNOW

        int lumStride, int chromStride, int srcStride)

    const int chromWidth= -((-width)>>1);

    for (y=0; y<height; y++) {
#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW

#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW

#if !COMPILE_TEMPLATE_AMD3DNOW