libavcodec/x86/dsputilenc_mmx.c

/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include <assert.h> /* for the assert()s in the vsad helpers below */

#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mathops.h"
#include "dsputil_mmx.h"

static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
        : "%"REG_a
    );
}
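
/* Illustrative scalar reference (an added sketch, not part of the original
 * file): what get_pixels_mmx above and get_pixels_sse2 below compute. Each
 * of 8 rows of 8 bytes is zero-extended to 16-bit DCTELEMs and stored
 * contiguously into block[0..63]. */
static void get_pixels_c_ref(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[8 * i + j] = pixels[j]; /* widen byte to 16 bits */
        pixels += line_size;
    }
}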

static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "pxor %%xmm4,      %%xmm4         \n\t"
        "movq (%0),        %%xmm0         \n\t"
        "movq (%0, %2),    %%xmm1         \n\t"
        "movq (%0, %2,2),  %%xmm2         \n\t"
        "movq (%0, %3),    %%xmm3         \n\t"
        "lea (%0,%2,4), %0                \n\t"
        "punpcklbw %%xmm4, %%xmm0         \n\t"
        "punpcklbw %%xmm4, %%xmm1         \n\t"
        "punpcklbw %%xmm4, %%xmm2         \n\t"
        "punpcklbw %%xmm4, %%xmm3         \n\t"
        "movdqa %%xmm0,      (%1)         \n\t"
        "movdqa %%xmm1,    16(%1)         \n\t"
        "movdqa %%xmm2,    32(%1)         \n\t"
        "movdqa %%xmm3,    48(%1)         \n\t"
        "movq (%0),        %%xmm0         \n\t"
        "movq (%0, %2),    %%xmm1         \n\t"
        "movq (%0, %2,2),  %%xmm2         \n\t"
        "movq (%0, %3),    %%xmm3         \n\t"
        "punpcklbw %%xmm4, %%xmm0         \n\t"
        "punpcklbw %%xmm4, %%xmm1         \n\t"
        "punpcklbw %%xmm4, %%xmm2         \n\t"
        "punpcklbw %%xmm4, %%xmm3         \n\t"
        "movdqa %%xmm0,    64(%1)         \n\t"
        "movdqa %%xmm1,    80(%1)         \n\t"
        "movdqa %%xmm2,    96(%1)         \n\t"
        "movdqa %%xmm3,   112(%1)         \n\t"
        : "+r" (pixels)
        : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
    );
}

static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    __asm__ volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((x86_reg)stride)
        : "%"REG_a
    );
}
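
/* Illustrative scalar reference (an added sketch, not part of the original
 * file) for diff_pixels_mmx above: block[i] = s1[i] - s2[i] over an 8x8
 * area, widened to 16 bits. */
static void diff_pixels_c_ref(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[8 * i + j] = s1[j] - s2[j];
        s1 += stride;
        s2 += stride;
    }
}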

static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    x86_reg index= -line_size*h;

    __asm__ volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "pxor %%mm6, %%mm6              \n\t"
                "1:                             \n\t"
                "movq (%2, %1), %%mm0           \n\t"
                "movq (%2, %1), %%mm1           \n\t"
                "movq 8(%2, %1), %%mm2          \n\t"
                "movq 8(%2, %1), %%mm3          \n\t"
                "punpcklbw %%mm7, %%mm0         \n\t"
                "punpckhbw %%mm7, %%mm1         \n\t"
                "punpcklbw %%mm7, %%mm2         \n\t"
                "punpckhbw %%mm7, %%mm3         \n\t"
                "paddw %%mm0, %%mm1             \n\t"
                "paddw %%mm2, %%mm3             \n\t"
                "paddw %%mm1, %%mm3             \n\t"
                "paddw %%mm3, %%mm6             \n\t"
                "add %3, %1                     \n\t"
                " js 1b                         \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $32, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $16, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movd %%mm6, %0                 \n\t"
                "andl $0xFFFF, %0               \n\t"
                : "=&r" (sum), "+r" (index)
                : "r" (pix - index), "r" ((x86_reg)line_size)
    );

    return sum;
}
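
/* Illustrative scalar reference (an added sketch, not part of the original
 * file) for pix_sum16_mmx above: the sum of all 256 bytes of a 16x16 block.
 * The asm keeps 16-bit partial sums in mm6 and folds the four words with two
 * shift+add steps; the maximum 255*256 = 65280 still fits in 16 bits, hence
 * the final "andl $0xFFFF". */
static int pix_sum16_c_ref(uint8_t *pix, int line_size)
{
    int sum = 0, i, j;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            sum += pix[j];
        pix += line_size;
    }
    return sum;
}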

static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  __asm__ volatile (
      "movl $16,%%ecx\n"
      "pxor %%mm0,%%mm0\n"
      "pxor %%mm7,%%mm7\n"
      "1:\n"
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "add %2, %0\n"
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
    return tmp;
}
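
/* Illustrative scalar reference (an added sketch, not part of the original
 * file) for pix_norm1_mmx above: the sum of squared pixel values over a
 * 16x16 block (pmaddwd squares pairs of 16-bit words and accumulates
 * 32-bit dwords). */
static int pix_norm1_c_ref(uint8_t *pix, int line_size)
{
    int sum = 0, i, j;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            sum += pix[j] * pix[j];
        pix += line_size;
    }
    return sum;
}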

static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %4,%%ecx\n"
      "shr $1,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
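
/* Illustrative scalar reference (an added sketch, not part of the original
 * file) for sse8_mmx above and sse16_mmx below: the sum of squared
 * differences between two blocks. The asm obtains |a-b| per byte from two
 * saturating subtractions OR'd together (psubusb yields a-b where a > b and
 * 0 otherwise, so the OR of both directions is |a-b|), then squares and
 * accumulates with pmaddwd. */
static int sse_c_ref(uint8_t *pix1, uint8_t *pix2, int line_size, int w, int h)
{
    int sum = 0, i, j;
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j++) {
            int d = pix1[j] - pix2[j];
            sum += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}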

static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h);

static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
    return tmp;
}
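
/* Illustrative scalar reference (an added sketch, not part of the original
 * file) for hf_noise8_mmx above. The asm measures high-frequency "noise" as
 * the sum of absolute second differences: the horizontal gradient
 * pix[x] - pix[x+1] within each row (the psllq/psrlq pair aligns a row
 * against itself shifted by one byte, giving 7 valid columns), differenced
 * again between consecutive rows. */
static int hf_noise8_c_ref(uint8_t *pix1, int line_size, int h)
{
    int sum = 0, x, y;
    for (y = 0; y < h - 1; y++) {
        for (x = 0; x < 7; x++) {
            int g0 = pix1[x]             - pix1[x + 1];
            int g1 = pix1[x + line_size] - pix1[x + line_size + 1];
            sum += FFABS(g0 - g1);
        }
        pix1 += line_size;
    }
    return sum;
}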

static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
    return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

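/* The nsse ("noise preserving SSE") comparators below combine plain SSE
 * with the difference in high-frequency noise between the two blocks, so a
 * candidate that smooths real texture away is penalized:
 *     score = sse + nsse_weight * |hf_noise(pix1) - hf_noise(pix2)|
 * (a default weight of 8 is used when no encoder context is available). */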
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert((((intptr_t)pix) & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM
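
/* Illustrative scalar reference (an added sketch, not part of the original
 * file) for vsad_intra16_mmx above and its MMX2 variant below: the sum of
 * absolute differences between vertically adjacent rows of one
 * 16-pixel-wide block ("vertical SAD"). */
static int vsad_intra16_c_ref(uint8_t *pix, int line_size, int h)
{
    int sum = 0, x, y;
    for (y = 1; y < h; y++) {
        pix += line_size;                      /* pix now points at row y */
        for (x = 0; x < 16; x++)
            sum += FFABS(pix[x] - pix[x - line_size]);
    }
    return sum;
}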

static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert((((intptr_t)pix) & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert((((intptr_t)pix1) & 7) == 0);
    assert((((intptr_t)pix2) & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM
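
/* Illustrative scalar reference (an added sketch, not part of the original
 * file) for the vsad16 functions: the vertical SAD of the per-pixel
 * difference pix1 - pix2. The pxor with 0x80 bytes in the asm biases the
 * signed differences back into unsigned range so the unsigned byte SAD
 * tricks still apply. */
static int vsad16_c_ref(uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, x, y;
    for (y = 1; y < h; y++) {
        pix1 += line_size;                     /* now pointing at row y */
        pix2 += line_size;
        for (x = 0; x < 16; x++)
            sum += FFABS((pix1[x] - pix2[x]) -
                         (pix1[x - line_size] - pix2[x - line_size]));
    }
    return sum;
}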

static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert((((intptr_t)pix1) & 7) == 0);
    assert((((intptr_t)pix2) & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    x86_reg i=0;
    if(w>=16)
    __asm__ volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    x86_reg i=0;
    uint8_t l, lt;

    __asm__ volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}
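
/* Illustrative scalar reference (an added sketch, not part of the original
 * file) for sub_hfyu_median_prediction_mmx2 above, following the HuffYUV
 * "median" predictor: each output is the source minus the median of the
 * left neighbour L, the top neighbour T, and L + T - LT (mid_pred() from
 * mathops.h returns the middle of its three arguments). */
static void sub_hfyu_median_prediction_c_ref(uint8_t *dst, const uint8_t *src1,
                                             const uint8_t *src2, int w,
                                             int *left, int *left_top)
{
    int i;
    uint8_t l  = *left;     /* predictor state carried between calls */
    uint8_t lt = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);
        lt = src1[i];
        l  = src2[i];
        dst[i] = l - pred;
    }

    *left     = l;
    *left_top = lt;
}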

#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "             \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "         \n\t"

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $16, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshufw $0x01, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"               \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x01, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"

#define hadamard_func(cpu) \
int ff_hadamard8_diff_##cpu  (void *s, uint8_t *src1, uint8_t *src2, \
                              int stride, int h); \
int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
                              int stride, int h);

hadamard_func(mmx)
hadamard_func(mmx2)
hadamard_func(sse2)
hadamard_func(ssse3)

#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0                \n\t"\
    "pxor %%mm1, %%mm1                \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0             \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0              \n\t"\
    "pxor %%xmm1, %%xmm1              \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0           \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    __asm__ volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#if HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD
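
/* Illustrative scalar reference (an added sketch, not part of the original
 * file) for the sum_abs_dctelem_* functions generated above: the sum of the
 * absolute values of all 64 coefficients of a DCT block. The SIMD versions
 * accumulate with saturating paddusw, hence the 64k FIXME above. */
static int sum_abs_dctelem_c_ref(DCTELEM *block)
{
    int sum = 0, i;
    for (i = 0; i < 64; i++)
        sum += FFABS(block[i]);
    return sum;
}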

static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    x86_reg i=size;
    __asm__ volatile(
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2,%0), %%mm2 \n"
        "movq (%3,%0,2), %%mm0 \n"
        "movq 8(%3,%0,2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n" /* mm3's old low bytes are garbage, but */
        "punpcklbw %%mm2, %%mm2 \n" /* the psraw $8 below keeps only the    */
        "psraw $8, %%mm3 \n"        /* sign-extended pix1 bytes             */
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"                  /* MMX ops leave EFLAGS untouched, so this
                                       still tests the "sub $8, %0" above */
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}
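
/* Illustrative scalar reference (an added sketch, not part of the original
 * file) for ssd_int8_vs_int16_mmx above: the sum of squared differences
 * between an int8_t array and an int16_t array, as used by the QNS code. */
static int ssd_int8_vs_int16_c_ref(const int8_t *pix1, const int16_t *pix2, int size)
{
    int sum = 0, i;
    for (i = 0; i < size; i++) {
        int d = pix1[i] - pix2[i];
        sum += d * d;
    }
    return sum;
}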

#define PHADDD(a, t)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddd "#t", "#a"                 \n\t"
/*
   pmulhw:   dst[0-15] = (src[0-15] * dst[0-15])[16-31]
   pmulhrw:  dst[0-15] = (src[0-15] * dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15] = (src[0-15] * dst[0-15] + 0x4000)[15-30]
 */
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x "            \n\t"\
    "pmulhw " #s ", "#y "            \n\t"\
    "paddw " #o ", "#x "             \n\t"\
    "paddw " #o ", "#y "             \n\t"\
    "psraw $1, "#x "                 \n\t"\
    "psraw $1, "#y "                 \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x "           \n\t"\
    "pmulhrw " #s ", "#y "           \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#if HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddd "#t", "#a"                 \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x "          \n\t"\
    "pmulhrsw " #s ", "#y "          \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3

void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX) {
        const int dct_algo = avctx->dct_algo;
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & AV_CPU_FLAG_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & AV_CPU_FLAG_MMX2){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }

        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->pix_sum = pix_sum16_mmx;

        c->diff_bytes= diff_bytes_mmx;
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;

#if HAVE_YASM
        c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= ff_hadamard8_diff_mmx;
#endif

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSE2) ? ff_sse16_sse2 : sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;

        if (mm_flags & AV_CPU_FLAG_MMX2) {
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
#if HAVE_YASM
            c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx2;
            c->hadamard8_diff[1]= ff_hadamard8_diff_mmx2;
#endif
            c->vsad[4]= vsad_intra16_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->vsad[0] = vsad16_mmx2;
            }

            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
        }

        if(mm_flags & AV_CPU_FLAG_SSE2){
            c->get_pixels = get_pixels_sse2;
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
#if HAVE_YASM && HAVE_ALIGNED_STACK
            c->hadamard8_diff[0]= ff_hadamard8_diff16_sse2;
            c->hadamard8_diff[1]= ff_hadamard8_diff_sse2;
#endif
        }

#if HAVE_SSSE3
        if(mm_flags & AV_CPU_FLAG_SSSE3){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_ssse3;
            }
            c->add_8x8basis= add_8x8basis_ssse3;
            c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
#if HAVE_YASM && HAVE_ALIGNED_STACK
            c->hadamard8_diff[0]= ff_hadamard8_diff16_ssse3;
            c->hadamard8_diff[1]= ff_hadamard8_diff_ssse3;
#endif
        }
#endif

        if(mm_flags & AV_CPU_FLAG_3DNOW){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_3dnow;
            }
            c->add_8x8basis= add_8x8basis_3dnow;
        }
    }

    dsputil_init_pix_mmx(c, avctx);
}
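
/* A minimal usage sketch (not part of the original file): callers do not
 * invoke dsputilenc_init_mmx() directly; they fill a DSPContext through
 * dsputil_init(), which dispatches to the x86 init code, and then call
 * through the function pointers it installed. The 16x16 buffers and the
 * NULL context argument here are illustrative assumptions.
 *
 *     DSPContext dsp;
 *     dsputil_init(&dsp, avctx);
 *     // sum of squared differences of two 16x16 blocks with stride 16
 *     int err = dsp.sse[0](NULL, block_a, block_b, 16, 16);
 */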
