• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • Examples
  • File List
  • Globals

libavfilter/libmpcodecs/vf_fspp.c

Go to the documentation of this file.
00001 /*
00002  * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
00003  * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
00004  *
00005  * This file is part of MPlayer.
00006  *
00007  * MPlayer is free software; you can redistribute it and/or modify
00008  * it under the terms of the GNU General Public License as published by
00009  * the Free Software Foundation; either version 2 of the License, or
00010  * (at your option) any later version.
00011  *
00012  * MPlayer is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License along
00018  * with MPlayer; if not, write to the Free Software Foundation, Inc.,
00019  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
00020  */
00021 
00022 /*
00023  * This implementation is based on an algorithm described in
00024  * "Aria Nosratinia Embedded Post-Processing for
00025  * Enhancement of Compressed Images (1999)"
00026  * (http://citeseer.nj.nec.com/nosratinia99embedded.html)
00027  * Further, with splitting (i)dct into hor/ver passes, one of them can be
00028  * performed once per block, not pixel. This allows for much better speed.
00029  */
00030 
00031 /*
00032   Heavily optimized version of SPP filter by Nikolaj
00033  */
00034 
00035 #include <stdio.h>
00036 #include <stdlib.h>
00037 #include <string.h>
00038 #include <inttypes.h>
00039 #include <math.h>
00040 
00041 #include "config.h"
00042 
00043 #include "mp_msg.h"
00044 #include "cpudetect.h"
00045 #include "img_format.h"
00046 #include "mp_image.h"
00047 #include "vf.h"
00048 #include "vd_ffmpeg.h"
00049 #include "libvo/fastmemcpy.h"
00050 
00051 #include "libavutil/internal.h"
00052 #include "libavutil/intreadwrite.h"
00053 #include "libavutil/mem.h"
00054 #include "libavcodec/avcodec.h"
00055 #include "libavcodec/dsputil.h"
00056 
00057 #undef free
00058 #undef malloc
00059 
00060 //===========================================================================//
00061 #define BLOCKSZ 12
00062 
00063 static const short custom_threshold[64]=
00064 // values (296) can't be too high
00065 // -it causes too big quant dependence
00066 // or maybe overflow(check), which results in some flashing
00067 { 71, 296, 295, 237,  71,  40,  38,  19,
00068   245, 193, 185, 121, 102,  73,  53,  27,
00069   158, 129, 141, 107,  97,  73,  50,  26,
00070   102, 116, 109,  98,  82,  66,  45,  23,
00071   71,  94,  95,  81,  70,  56,  38,  20,
00072   56,  77,  74,  66,  56,  44,  30,  15,
00073   38,  53,  50,  45,  38,  30,  21,  11,
00074   20,  27,  26,  23,  20,  15,  11,   5
00075 };
00076 
00077 static const uint8_t  __attribute__((aligned(32))) dither[8][8]={
00078     {  0,  48,  12,  60,   3,  51,  15,  63, },
00079     { 32,  16,  44,  28,  35,  19,  47,  31, },
00080     {  8,  56,   4,  52,  11,  59,   7,  55, },
00081     { 40,  24,  36,  20,  43,  27,  39,  23, },
00082     {  2,  50,  14,  62,   1,  49,  13,  61, },
00083     { 34,  18,  46,  30,  33,  17,  45,  29, },
00084     { 10,  58,   6,  54,   9,  57,   5,  53, },
00085     { 42,  26,  38,  22,  41,  25,  37,  21, },
00086 };
00087 
// Private state of one fspp filter instance (allocated zeroed in vf_open()).
// The two threshold tables must stay adjacent and in this order: mul_thrmat_mmx()
// reaches threshold_mtx by adding 8*8*2 bytes to the address of threshold_mtx_noq.
00088 struct vf_priv_s { //align 16 !
00089     uint64_t threshold_mtx_noq[8*2]; // 64 16-bit thresholds packed by vf_open(), not yet multiplied by a quantizer
00090     uint64_t threshold_mtx[8*2];//used in both C & MMX (& later SSE2) versions
00091 
00092     int log2_count;   // 4 or 5 after vf_open(); filter() derives its row step as 6-log2_count
00093     int temp_stride;  // row stride of the work buffers, set by config()
00094     int qp;           // parsed from the filter arguments; non-zero makes filter() skip the per-block qp lookup
00095     int mpeg2;        // copy of mpi->qscale_type, handed to norm_qscale()
00096     int prev_q;       // quantizer threshold_mtx was last scaled with
00097     uint8_t *src;     // copy of the input plane with mirrored borders (filled by filter())
00098     int16_t *temp;    // 16-bit accumulation buffer read by the store_slice functions
00099     int bframes;      // parsed from the filter arguments; non-zero makes put_image() use mpi->qscale directly
00100     char *non_b_qp;   // copy of the qscale table of the last frame with pict_type != 3
00101 };
00102 
00103 
00104 #if !HAVE_MMX
00105 
00106 //This func reads from 1 slice, 1 and clears 0 & 1
00107 static void store_slice_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
00108 {int y, x;
00109 #define STORE(pos)                                                        \
00110     temp= (src[x + pos] + (d[pos]>>log2_scale))>>(6-log2_scale);        \
00111     src[x + pos]=src[x + pos - 8*src_stride]=0;                                \
00112     if(temp & 0x100) temp= ~(temp>>31);                                        \
00113     dst[x + pos]= temp;
00114 
00115     for(y=0; y<height; y++){
00116         const uint8_t *d= dither[y];
00117         for(x=0; x<width; x+=8){
00118             int temp;
00119             STORE(0);
00120             STORE(1);
00121             STORE(2);
00122             STORE(3);
00123             STORE(4);
00124             STORE(5);
00125             STORE(6);
00126             STORE(7);
00127         }
00128         src+=src_stride;
00129         dst+=dst_stride;
00130     }
00131 }
00132 
00133 //This func reads from 2 slices, 0 & 2  and clears 2-nd
00134 static void store_slice2_c(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int height, int log2_scale)
00135 {int y, x;
00136 #define STORE2(pos)                                                        \
00137     temp= (src[x + pos] + src[x + pos + 16*src_stride] + (d[pos]>>log2_scale))>>(6-log2_scale);        \
00138     src[x + pos + 16*src_stride]=0;                                        \
00139     if(temp & 0x100) temp= ~(temp>>31);                                        \
00140     dst[x + pos]= temp;
00141 
00142     for(y=0; y<height; y++){
00143         const uint8_t *d= dither[y];
00144         for(x=0; x<width; x+=8){
00145             int temp;
00146             STORE2(0);
00147             STORE2(1);
00148             STORE2(2);
00149             STORE2(3);
00150             STORE2(4);
00151             STORE2(5);
00152             STORE2(6);
00153             STORE2(7);
00154         }
00155         src+=src_stride;
00156         dst+=dst_stride;
00157     }
00158 }
00159 
00160 static void mul_thrmat_c(struct vf_priv_s *p,int q)
00161 {
00162     int a;
00163     for(a=0;a<64;a++)
00164         ((short*)p->threshold_mtx)[a]=q * ((short*)p->threshold_mtx_noq)[a];//ints faster in C
00165 }
00166 
// Forward declarations of the plain C transform helpers used by filter();
// column_fidct_c is defined further down in this file under !HAVE_MMX.
00167 static void column_fidct_c(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt);
00168 static void row_idct_c(DCTELEM* workspace,
00169                        int16_t* output_adr, int output_stride, int cnt);
00170 static void row_fdct_c(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt);
00171 
// Compile-time dispatch: without MMX the generic *_s names used by the rest of
// the file resolve to the C implementations.
00172 //this is rather ugly, but there is no need for function pointers
00173 #define store_slice_s store_slice_c
00174 #define store_slice2_s store_slice2_c
00175 #define mul_thrmat_s mul_thrmat_c
00176 #define column_fidct_s column_fidct_c
00177 #define row_idct_s row_idct_c
00178 #define row_fdct_s row_fdct_c
00179 
00180 #else /* HAVE_MMX */
00181 
// MMX counterpart of store_slice_c: converts one slice of 16-bit sums at src
// into 8-bit pixels at dst, 8 samples per inner iteration.
//
// Setup done in C:
//   od/end     - first row of the dither table and the row just past `height`
//                rows; the outer loop runs until the dither pointer reaches end.
//   width      - rounded up to a multiple of 8.
//   dst_stride - reduced by width so that adding it after a row reaches the
//                start of the next destination row.
// Register use inside the asm:
//   mm5   = log2_scale (right shift applied to the dither words)
//   mm2   = 6 - log2_scale (computed as ~log2_scale + 7; shift applied to the sums)
//   REG_a = -src_stride << 4, i.e. the byte offset of the row 8 lines above
//   %1    = overwritten with (src_stride - width) * 2, the bytes to skip to
//           get from the end of one source row to the start of the next
//   mm7   = zero, stored over the source samples and the ones 8 rows above
// packuswb saturates the shifted sums to 0..255 before they are written.
//
// NOTE(review): the asm writes to operand %1 although it is declared as an
// input, and lists neither the MMX registers nor "memory" as clobbers -
// confirm this is safe with the compilers in use.
00182 //This func reads from 1 slice, 1 and clears 0 & 1
00183 static void store_slice_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
00184 {
00185     const uint8_t *od=&dither[0][0];
00186     const uint8_t *end=&dither[height][0];
00187     width = (width+7)&~7;
00188     dst_stride-=width;
00189     //src_stride=(src_stride-width)*2;
00190     __asm__ volatile(
00191         "mov %5, %%"REG_d"                \n\t"
00192         "mov %6, %%"REG_S"                \n\t"
00193         "mov %7, %%"REG_D"                \n\t"
00194         "mov %1, %%"REG_a"                \n\t"
00195         "movd %%"REG_d", %%mm5             \n\t"
00196         "xor $-1, %%"REG_d"              \n\t"
00197         "mov %%"REG_a", %%"REG_c"             \n\t"
00198         "add $7, %%"REG_d"               \n\t"
00199         "neg %%"REG_a"                   \n\t"
00200         "sub %0, %%"REG_c"            \n\t"
00201         "add %%"REG_c", %%"REG_c"             \n\t"
00202         "movd %%"REG_d", %%mm2             \n\t"
00203         "mov %%"REG_c", %1       \n\t"
00204         "mov %2, %%"REG_d"               \n\t"
00205         "shl $4, %%"REG_a"               \n\t"
00206 
// outer loop (label 2): one iteration per row, loads that row's 8 dither bytes
00207         "2:                        \n\t"
00208         "movq (%%"REG_d"), %%mm3           \n\t"
00209         "movq %%mm3, %%mm4             \n\t"
00210         "pxor %%mm7, %%mm7             \n\t"
00211         "punpcklbw %%mm7, %%mm3        \n\t"
00212         "punpckhbw %%mm7, %%mm4        \n\t"
00213         "mov %0, %%"REG_c"            \n\t"
00214         "psraw %%mm5, %%mm3            \n\t"
00215         "psraw %%mm5, %%mm4            \n\t"
// inner loop (label 1): 8 samples per pass, REG_c counts the remaining width
00216         "1:                        \n\t"
00217         "movq %%mm7, (%%"REG_S",%%"REG_a",)     \n\t"
00218         "movq (%%"REG_S"), %%mm0           \n\t"
00219         "movq 8(%%"REG_S"), %%mm1          \n\t"
00220 
00221         "movq %%mm7, 8(%%"REG_S",%%"REG_a",)    \n\t"
00222         "paddw %%mm3, %%mm0            \n\t"
00223         "paddw %%mm4, %%mm1            \n\t"
00224 
00225         "movq %%mm7, (%%"REG_S")           \n\t"
00226         "psraw %%mm2, %%mm0            \n\t"
00227         "psraw %%mm2, %%mm1            \n\t"
00228 
00229         "movq %%mm7, 8(%%"REG_S")          \n\t"
00230         "packuswb %%mm1, %%mm0         \n\t"
00231         "add $16, %%"REG_S"              \n\t"
00232 
00233         "movq %%mm0, (%%"REG_D")           \n\t"
00234         "add $8, %%"REG_D"               \n\t"
00235         "sub $8, %%"REG_c"               \n\t"
00236         "jg 1b                      \n\t"
// end of row: step source, dither and destination pointers to the next row
00237         "add %1, %%"REG_S"       \n\t"
00238         "add $8, %%"REG_d"               \n\t"
00239         "add %3, %%"REG_D"       \n\t"
00240         "cmp %4, %%"REG_d"           \n\t"
00241         "jl 2b                      \n\t"
00242 
00243         :
00244         : "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
00245           "m" (log2_scale), "m" (src), "m" (dst) //input
00246         : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
00247         );
00248 }
00249 
// MMX counterpart of store_slice2_c: adds the 16-bit sums at src to the sums
// 16 rows further down, dithers, scales and writes 8-bit pixels to dst,
// 8 samples per inner iteration. Only the samples 16 rows down are zeroed.
//
// Setup done in C:
//   od/end     - first row of the dither table and the row just past `height`
//                rows; the outer loop runs until the dither pointer reaches end.
//   width      - rounded up to a multiple of 8.
//   dst_stride - reduced by width so that adding it after a row reaches the
//                start of the next destination row.
// Register use inside the asm:
//   mm5   = log2_scale (right shift applied to the dither words)
//   mm2   = 6 - log2_scale (computed as ~log2_scale + 7; shift applied to the sums)
//   REG_a = src_stride << 5, i.e. the byte offset of the row 16 lines below
//   %1    = overwritten with (src_stride - width) * 2, the bytes to skip to
//           get from the end of one source row to the start of the next
//   mm7   = zero, stored over the samples 16 rows below
// packuswb saturates the shifted sums to 0..255 before they are written.
//
// NOTE(review): the asm writes to operand %1 although it is declared as an
// input, and lists neither the MMX registers nor "memory" as clobbers -
// confirm this is safe with the compilers in use.
00250 //This func reads from 2 slices, 0 & 2  and clears 2-nd
00251 static void store_slice2_mmx(uint8_t *dst, int16_t *src, long dst_stride, long src_stride, long width, long height, long log2_scale)
00252 {
00253     const uint8_t *od=&dither[0][0];
00254     const uint8_t *end=&dither[height][0];
00255     width = (width+7)&~7;
00256     dst_stride-=width;
00257     //src_stride=(src_stride-width)*2;
00258     __asm__ volatile(
00259         "mov %5, %%"REG_d"                \n\t"
00260         "mov %6, %%"REG_S"                \n\t"
00261         "mov %7, %%"REG_D"                \n\t"
00262         "mov %1, %%"REG_a"            \n\t"
00263         "movd %%"REG_d", %%mm5             \n\t"
00264         "xor $-1, %%"REG_d"              \n\t"
00265         "mov %%"REG_a", %%"REG_c"             \n\t"
00266         "add $7, %%"REG_d"               \n\t"
00267         "sub %0, %%"REG_c"            \n\t"
00268         "add %%"REG_c", %%"REG_c"             \n\t"
00269         "movd %%"REG_d", %%mm2             \n\t"
00270         "mov %%"REG_c", %1       \n\t"
00271         "mov %2, %%"REG_d"               \n\t"
00272         "shl $5, %%"REG_a"               \n\t"
00273 
// outer loop (label 2): one iteration per row, loads that row's 8 dither bytes
00274         "2:                        \n\t"
00275         "movq (%%"REG_d"), %%mm3           \n\t"
00276         "movq %%mm3, %%mm4             \n\t"
00277         "pxor %%mm7, %%mm7             \n\t"
00278         "punpcklbw %%mm7, %%mm3        \n\t"
00279         "punpckhbw %%mm7, %%mm4        \n\t"
00280         "mov %0, %%"REG_c"            \n\t"
00281         "psraw %%mm5, %%mm3            \n\t"
00282         "psraw %%mm5, %%mm4            \n\t"
// inner loop (label 1): 8 samples per pass, REG_c counts the remaining width
00283         "1:                        \n\t"
00284         "movq (%%"REG_S"), %%mm0           \n\t"
00285         "movq 8(%%"REG_S"), %%mm1          \n\t"
00286         "paddw %%mm3, %%mm0            \n\t"
00287 
00288         "paddw (%%"REG_S",%%"REG_a",), %%mm0    \n\t"
00289         "paddw %%mm4, %%mm1            \n\t"
00290         "movq 8(%%"REG_S",%%"REG_a",), %%mm6    \n\t"
00291 
00292         "movq %%mm7, (%%"REG_S",%%"REG_a",)     \n\t"
00293         "psraw %%mm2, %%mm0            \n\t"
00294         "paddw %%mm6, %%mm1            \n\t"
00295 
00296         "movq %%mm7, 8(%%"REG_S",%%"REG_a",)    \n\t"
00297         "psraw %%mm2, %%mm1            \n\t"
00298         "packuswb %%mm1, %%mm0         \n\t"
00299 
00300         "movq %%mm0, (%%"REG_D")           \n\t"
00301         "add $16, %%"REG_S"              \n\t"
00302         "add $8, %%"REG_D"               \n\t"
00303         "sub $8, %%"REG_c"               \n\t"
00304         "jg 1b                      \n\t"
// end of row: step source, dither and destination pointers to the next row
00305         "add %1, %%"REG_S"       \n\t"
00306         "add $8, %%"REG_d"               \n\t"
00307         "add %3, %%"REG_D"       \n\t"
00308         "cmp %4, %%"REG_d"           \n\t"
00309         "jl 2b                      \n\t"
00310 
00311         :
00312         : "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
00313           "m" (log2_scale), "m" (src), "m" (dst) //input
00314         : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_D, "%"REG_S
00315         );
00316 }
00317 
// MMX counterpart of mul_thrmat_c: multiplies the 64 16-bit entries of
// p->threshold_mtx_noq by q and stores the low 16 bits of each product.
//
// Both REG_S and REG_D start at &p->threshold_mtx_noq[0]; the asm then adds
// 8*8*2 bytes to REG_D, so the results land in the 128 bytes that follow the
// source table, which is p->threshold_mtx given the layout of struct vf_priv_s.
// q is broadcast into all four 16-bit lanes of mm7 (punpcklwd + punpckldq)
// and the 16 quadwords (offsets 0*8 .. 15*8) are processed with pmullw,
// loads, multiplies and stores being interleaved.
//
// NOTE(review): no clobber list is given, although the asm uses mm0-mm7 and
// writes memory - confirm this is safe with the compilers in use.
00318 static void mul_thrmat_mmx(struct vf_priv_s *p, int q)
00319 {
00320     uint64_t *adr=&p->threshold_mtx_noq[0];
00321     __asm__ volatile(
00322         "movd %0, %%mm7                \n\t"
00323         "add $8*8*2, %%"REG_D"            \n\t"
00324         "movq 0*8(%%"REG_S"), %%mm0        \n\t"
00325         "punpcklwd %%mm7, %%mm7        \n\t"
00326         "movq 1*8(%%"REG_S"), %%mm1        \n\t"
00327         "punpckldq %%mm7, %%mm7        \n\t"
00328         "pmullw %%mm7, %%mm0           \n\t"
00329 
00330         "movq 2*8(%%"REG_S"), %%mm2        \n\t"
00331         "pmullw %%mm7, %%mm1           \n\t"
00332 
00333         "movq 3*8(%%"REG_S"), %%mm3        \n\t"
00334         "pmullw %%mm7, %%mm2           \n\t"
00335 
00336         "movq %%mm0, 0*8(%%"REG_D")        \n\t"
00337         "movq 4*8(%%"REG_S"), %%mm4        \n\t"
00338         "pmullw %%mm7, %%mm3           \n\t"
00339 
00340         "movq %%mm1, 1*8(%%"REG_D")        \n\t"
00341         "movq 5*8(%%"REG_S"), %%mm5        \n\t"
00342         "pmullw %%mm7, %%mm4           \n\t"
00343 
00344         "movq %%mm2, 2*8(%%"REG_D")        \n\t"
00345         "movq 6*8(%%"REG_S"), %%mm6        \n\t"
00346         "pmullw %%mm7, %%mm5           \n\t"
00347 
00348         "movq %%mm3, 3*8(%%"REG_D")        \n\t"
00349         "movq 7*8+0*8(%%"REG_S"), %%mm0    \n\t"
00350         "pmullw %%mm7, %%mm6           \n\t"
00351 
00352         "movq %%mm4, 4*8(%%"REG_D")        \n\t"
00353         "movq 7*8+1*8(%%"REG_S"), %%mm1    \n\t"
00354         "pmullw %%mm7, %%mm0           \n\t"
00355 
00356         "movq %%mm5, 5*8(%%"REG_D")        \n\t"
00357         "movq 7*8+2*8(%%"REG_S"), %%mm2    \n\t"
00358         "pmullw %%mm7, %%mm1           \n\t"
00359 
00360         "movq %%mm6, 6*8(%%"REG_D")        \n\t"
00361         "movq 7*8+3*8(%%"REG_S"), %%mm3    \n\t"
00362         "pmullw %%mm7, %%mm2           \n\t"
00363 
00364         "movq %%mm0, 7*8+0*8(%%"REG_D")    \n\t"
00365         "movq 7*8+4*8(%%"REG_S"), %%mm4    \n\t"
00366         "pmullw %%mm7, %%mm3           \n\t"
00367 
00368         "movq %%mm1, 7*8+1*8(%%"REG_D")    \n\t"
00369         "movq 7*8+5*8(%%"REG_S"), %%mm5    \n\t"
00370         "pmullw %%mm7, %%mm4           \n\t"
00371 
00372         "movq %%mm2, 7*8+2*8(%%"REG_D")    \n\t"
00373         "movq 7*8+6*8(%%"REG_S"), %%mm6    \n\t"
00374         "pmullw %%mm7, %%mm5           \n\t"
00375 
00376         "movq %%mm3, 7*8+3*8(%%"REG_D")    \n\t"
00377         "movq 14*8+0*8(%%"REG_S"), %%mm0   \n\t"
00378         "pmullw %%mm7, %%mm6           \n\t"
00379 
00380         "movq %%mm4, 7*8+4*8(%%"REG_D")    \n\t"
00381         "movq 14*8+1*8(%%"REG_S"), %%mm1   \n\t"
00382         "pmullw %%mm7, %%mm0           \n\t"
00383 
00384         "movq %%mm5, 7*8+5*8(%%"REG_D")    \n\t"
00385         "pmullw %%mm7, %%mm1           \n\t"
00386 
00387         "movq %%mm6, 7*8+6*8(%%"REG_D")    \n\t"
00388         "movq %%mm0, 14*8+0*8(%%"REG_D")   \n\t"
00389         "movq %%mm1, 14*8+1*8(%%"REG_D")   \n\t"
00390 
00391         : "+g" (q), "+S" (adr), "+D" (adr)
00392         :
00393         );
00394 }
00395 
// Forward declarations of the MMX transform helpers used by filter();
// their definitions are not part of this excerpt.
00396 static void column_fidct_mmx(int16_t* thr_adr,  DCTELEM *data,  DCTELEM *output,  int cnt);
00397 static void row_idct_mmx(DCTELEM* workspace,
00398                          int16_t* output_adr,  int output_stride,  int cnt);
00399 static void row_fdct_mmx(DCTELEM *data,  const uint8_t *pixels,  int line_size,  int cnt);
00400 
// Compile-time dispatch: with MMX the generic *_s names used by the rest of
// the file resolve to the MMX implementations.
00401 #define store_slice_s store_slice_mmx
00402 #define store_slice2_s store_slice2_mmx
00403 #define mul_thrmat_s mul_thrmat_mmx
00404 #define column_fidct_s column_fidct_mmx
00405 #define row_idct_s row_idct_mmx
00406 #define row_fdct_s row_fdct_mmx
00407 #endif // HAVE_MMX
00408 
// Filters one image plane from src into dst.
//
//   p                     filter state; p->src and p->temp are used as work buffers
//   dst, dst_stride       destination plane and its row stride
//   src, src_stride       source plane and its row stride
//   width, height         plane size in pixels
//   qp_store, qp_stride   quantizer table and its row stride; only read when p->qp is 0
//   is_luma               selects the work-buffer stride and the qp index shift
//
// Outline: the plane is copied into p->src with an 8-pixel mirrored border,
// then every `step` rows a row fdct / thresholded column pass / row idct
// chain adds its result into p->temp, and each time another 8 output rows
// are complete one of the store_slice functions writes them to dst.
00409 static void filter(struct vf_priv_s *p, uint8_t *dst, uint8_t *src,
00410                    int dst_stride, int src_stride,
00411                    int width, int height,
00412                    uint8_t *qp_store, int qp_stride, int is_luma)
00413 {
00414     int x, x0, y, es, qy, t;
00415     const int stride= is_luma ? p->temp_stride : (width+16);//((width+16+15)&(~15))
// row increment of the main loop; 1 or 2 for log2_count 5 or 4
00416     const int step=6-p->log2_count;
// right shift that turns a pixel coordinate into a qp_store index (4 for luma, 3 otherwise)
00417     const int qps= 3 + is_luma;
// one aligned buffer split in two halves: block (transform input) and block3 (transform output)
00418     int32_t __attribute__((aligned(32))) block_align[4*8*BLOCKSZ+ 4*8*BLOCKSZ];
00419     DCTELEM *block= (DCTELEM *)block_align;
00420     DCTELEM *block3=(DCTELEM *)(block_align+4*8*BLOCKSZ);
00421 
// NOTE(review): this clears 4*8*BLOCKSZ bytes, while block3 spans
// 4*8*BLOCKSZ int32_t elements - confirm the partial clear is intended.
00422     memset(block3, 0, 4*8*BLOCKSZ);
00423 
00424     //p->src=src-src_stride*8-8;//!
00425     if (!src || !dst) return; // HACK avoid crash for Y8 colourspace
// copy each source row into p->src at offset (8,8) and mirror 8 pixels to its left and right
00426     for(y=0; y<height; y++){
00427         int index= 8 + 8*stride + y*stride;
00428         fast_memcpy(p->src + index, src + y*src_stride, width);//this line can be avoided by using DR & user fr.buffers
00429         for(x=0; x<8; x++){
00430             p->src[index         - x - 1]= p->src[index +         x    ];
00431             p->src[index + width + x    ]= p->src[index + width - x - 1];
00432         }
00433     }
// mirror 8 full rows above the first and below the last image row
00434     for(y=0; y<8; y++){
00435         fast_memcpy(p->src + (      7-y)*stride, p->src + (      y+8)*stride, stride);
00436         fast_memcpy(p->src + (height+8+y)*stride, p->src + (height-y+7)*stride, stride);
00437     }
00438     //FIXME (try edge emu)
00439 
// zero rows 8..23 of the accumulation buffer before the first pass
00440     for(y=8; y<24; y++)
00441         memset(p->temp+ 8 +y*stride, 0,width*sizeof(int16_t));
00442 
00443     for(y=step; y<height+8; y+=step){    //step= 1,2
// qy: row y-4 clamped to the image, converted to a row offset into qp_store
00444         qy=y-4;
00445         if (qy>height-1) qy=height-1;
00446         if (qy<0) qy=0;
00447         qy=(qy>>qps)*qp_stride;
00448         row_fdct_s(block, p->src + y*stride +2-(y&1), stride, 2);
// process the row in stripes of 8*(BLOCKSZ-1) columns
00449         for(x0=0; x0<width+8-8*(BLOCKSZ-1); x0+=8*(BLOCKSZ-1)){
00450             row_fdct_s(block+8*8, p->src + y*stride+8+x0 +2-(y&1), stride, 2*(BLOCKSZ-1));
// a non-zero p->qp means the thresholds are already scaled: do the whole stripe in one call
00451             if(p->qp)
00452                 column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+0*8, block3+0*8, 8*(BLOCKSZ-1)); //yes, this is a HOTSPOT
00453             else
// otherwise look the quantizer up every 8 columns and rescale the thresholds when it changes
00454                 for (x=0; x<8*(BLOCKSZ-1); x+=8) {
00455                     t=x+x0-2; //correct t=x+x0-2-(y&1), but its the same
00456                     if (t<0) t=0;//t always < width-2
00457                     t=qp_store[qy+(t>>qps)];
00458                     t=norm_qscale(t, p->mpeg2);
00459                     if (t!=p->prev_q) p->prev_q=t, mul_thrmat_s(p, t);
00460                     column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block+x*8, block3+x*8, 8); //yes, this is a HOTSPOT
00461                 }
00462             row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, 2*(BLOCKSZ-1));
// carry the tail of both work buffers over to the start for the next stripe
00463             memmove(block, block+(BLOCKSZ-1)*64, 8*8*sizeof(DCTELEM)); //cycling
00464             memmove(block3, block3+(BLOCKSZ-1)*64, 6*8*sizeof(DCTELEM));
00465         }
00466         //
// remaining columns of this row that did not fill a whole stripe
00467         es=width+8-x0; //  8, ...
00468         if (es>8)
00469             row_fdct_s(block+8*8, p->src + y*stride+8+x0 +2-(y&1), stride, (es-4)>>2);
00470         column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block, block3, es&(~1));
00471         row_idct_s(block3+0*8, p->temp + (y&15)*stride+x0+2-(y&1), stride, es>>2);
// whenever y1 reaches a non-zero multiple of 8, write 8 rows to dst;
// bit 3 of y1 selects which of the two store functions is used
00472         {const int y1=y-8+step;//l5-7  l4-6
00473             if (!(y1&7) && y1) {
00474                 if (y1&8) store_slice_s(dst + (y1-8)*dst_stride, p->temp+ 8 +8*stride,
00475                                         dst_stride, stride, width, 8, 5-p->log2_count);
00476                 else store_slice2_s(dst + (y1-8)*dst_stride, p->temp+ 8 +0*stride,
00477                                     dst_stride, stride, width, 8, 5-p->log2_count);
00478             } }
00479     }
00480 
// write the last, partial slice of y&7 rows, if any
00481     if (y&7) {  // == height & 7
00482         if (y&8) store_slice_s(dst + ((y-8)&~7)*dst_stride, p->temp+ 8 +8*stride,
00483                                dst_stride, stride, width, y&7, 5-p->log2_count);
00484         else store_slice2_s(dst + ((y-8)&~7)*dst_stride, p->temp+ 8 +0*stride,
00485                             dst_stride, stride, width, y&7, 5-p->log2_count);
00486     }
00487 }
00488 
00489 static int config(struct vf_instance *vf,
00490                   int width, int height, int d_width, int d_height,
00491                   unsigned int flags, unsigned int outfmt)
00492 {
00493     int h= (height+16+15)&(~15);
00494 
00495     vf->priv->temp_stride= (width+16+15)&(~15);
00496     vf->priv->temp= (int16_t*)av_mallocz(vf->priv->temp_stride*3*8*sizeof(int16_t));
00497     //this can also be avoided, see above
00498     vf->priv->src = (uint8_t*)av_malloc(vf->priv->temp_stride*h*sizeof(uint8_t));
00499 
00500     return vf_next_config(vf,width,height,d_width,d_height,flags,outfmt);
00501 }
00502 
00503 static void get_image(struct vf_instance *vf, mp_image_t *mpi)
00504 {
00505     if(mpi->flags&MP_IMGFLAG_PRESERVE) return; // don't change
00506     // ok, we can do pp in-place (or pp disabled):
00507     vf->dmpi=vf_get_image(vf->next,mpi->imgfmt,
00508                           mpi->type, mpi->flags, mpi->width, mpi->height);
00509     mpi->planes[0]=vf->dmpi->planes[0];
00510     mpi->stride[0]=vf->dmpi->stride[0];
00511     mpi->width=vf->dmpi->width;
00512     if(mpi->flags&MP_IMGFLAG_PLANAR){
00513         mpi->planes[1]=vf->dmpi->planes[1];
00514         mpi->planes[2]=vf->dmpi->planes[2];
00515         mpi->stride[1]=vf->dmpi->stride[1];
00516         mpi->stride[2]=vf->dmpi->stride[2];
00517     }
00518     mpi->flags|=MP_IMGFLAG_DIRECT;
00519 }
00520 
00521 static int put_image(struct vf_instance *vf, mp_image_t *mpi, double pts)
00522 {
00523     mp_image_t *dmpi;
00524     if(!(mpi->flags&MP_IMGFLAG_DIRECT)){
00525         // no DR, so get a new image! hope we'll get DR buffer:
00526         dmpi=vf_get_image(vf->next,mpi->imgfmt,
00527                           MP_IMGTYPE_TEMP,
00528                           MP_IMGFLAG_ACCEPT_STRIDE|MP_IMGFLAG_PREFER_ALIGNED_STRIDE,
00529                           mpi->width,mpi->height);
00530         vf_clone_mpi_attributes(dmpi, mpi);
00531     }else{
00532         dmpi=vf->dmpi;
00533     }
00534 
00535     vf->priv->mpeg2= mpi->qscale_type;
00536     if(mpi->pict_type != 3 && mpi->qscale && !vf->priv->qp){
00537         int w = mpi->qstride;
00538         int h = (mpi->h + 15) >> 4;
00539         if (!w) {
00540             w = (mpi->w + 15) >> 4;
00541             h = 1;
00542         }
00543         if(!vf->priv->non_b_qp)
00544             vf->priv->non_b_qp= malloc(w*h);
00545         fast_memcpy(vf->priv->non_b_qp, mpi->qscale, w*h);
00546     }
00547     if(vf->priv->log2_count || !(mpi->flags&MP_IMGFLAG_DIRECT)){
00548         char *qp_tab= vf->priv->non_b_qp;
00549         if(vf->priv->bframes || !qp_tab)
00550             qp_tab= mpi->qscale;
00551 
00552         if(qp_tab || vf->priv->qp){
00553             filter(vf->priv, dmpi->planes[0], mpi->planes[0], dmpi->stride[0], mpi->stride[0],
00554                    mpi->w, mpi->h, qp_tab, mpi->qstride, 1);
00555             filter(vf->priv, dmpi->planes[1], mpi->planes[1], dmpi->stride[1], mpi->stride[1],
00556                    mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
00557             filter(vf->priv, dmpi->planes[2], mpi->planes[2], dmpi->stride[2], mpi->stride[2],
00558                    mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, qp_tab, mpi->qstride, 0);
00559         }else{
00560             memcpy_pic(dmpi->planes[0], mpi->planes[0], mpi->w, mpi->h, dmpi->stride[0], mpi->stride[0]);
00561             memcpy_pic(dmpi->planes[1], mpi->planes[1], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[1], mpi->stride[1]);
00562             memcpy_pic(dmpi->planes[2], mpi->planes[2], mpi->w>>mpi->chroma_x_shift, mpi->h>>mpi->chroma_y_shift, dmpi->stride[2], mpi->stride[2]);
00563         }
00564     }
00565 
00566 #if HAVE_MMX
00567     if(gCpuCaps.hasMMX) __asm__ volatile ("emms\n\t");
00568 #endif
00569 #if HAVE_MMX2
00570     if(gCpuCaps.hasMMX2) __asm__ volatile ("sfence\n\t");
00571 #endif
00572     return vf_next_put_image(vf,dmpi, pts);
00573 }
00574 
00575 static void uninit(struct vf_instance *vf)
00576 {
00577     if(!vf->priv) return;
00578 
00579     av_free(vf->priv->temp);
00580     vf->priv->temp= NULL;
00581     av_free(vf->priv->src);
00582     vf->priv->src= NULL;
00583     //free(vf->priv->avctx);
00584     //vf->priv->avctx= NULL;
00585     free(vf->priv->non_b_qp);
00586     vf->priv->non_b_qp= NULL;
00587 
00588     av_free(vf->priv);
00589     vf->priv=NULL;
00590 }
00591 
00592 //===========================================================================//
00593 
00594 static int query_format(struct vf_instance *vf, unsigned int fmt)
00595 {
00596     switch(fmt){
00597     case IMGFMT_YVU9:
00598     case IMGFMT_IF09:
00599     case IMGFMT_YV12:
00600     case IMGFMT_I420:
00601     case IMGFMT_IYUV:
00602     case IMGFMT_CLPL:
00603     case IMGFMT_Y800:
00604     case IMGFMT_Y8:
00605     case IMGFMT_444P:
00606     case IMGFMT_422P:
00607     case IMGFMT_411P:
00608         return vf_next_query_format(vf,fmt);
00609     }
00610     return 0;
00611 }
00612 
00613 static int control(struct vf_instance *vf, int request, void* data)
00614 {
00615     switch(request){
00616     case VFCTRL_QUERY_MAX_PP_LEVEL:
00617         return 5;
00618     case VFCTRL_SET_PP_LEVEL:
00619         vf->priv->log2_count= *((unsigned int*)data);
00620         if (vf->priv->log2_count < 4) vf->priv->log2_count=4;
00621         return CONTROL_TRUE;
00622     }
00623     return vf_next_control(vf,request,data);
00624 }
00625 
00626 static int vf_open(vf_instance_t *vf, char *args)
00627 {
00628     int i=0, bias;
00629     int custom_threshold_m[64];
00630     int log2c=-1;
00631 
00632     vf->config=config;
00633     vf->put_image=put_image;
00634     vf->get_image=get_image;
00635     vf->query_format=query_format;
00636     vf->uninit=uninit;
00637     vf->control= control;
00638     vf->priv=av_mallocz(sizeof(struct vf_priv_s));//assumes align 16 !
00639 
00640     init_avcodec();
00641 
00642     //vf->priv->avctx= avcodec_alloc_context();
00643     //dsputil_init(&vf->priv->dsp, vf->priv->avctx);
00644 
00645     vf->priv->log2_count= 4;
00646     vf->priv->bframes = 0;
00647 
00648     if (args) sscanf(args, "%d:%d:%d:%d", &log2c, &vf->priv->qp, &i, &vf->priv->bframes);
00649 
00650     if( log2c >=4 && log2c <=5 )
00651         vf->priv->log2_count = log2c;
00652     else if( log2c >= 6 )
00653         vf->priv->log2_count = 5;
00654 
00655     if(vf->priv->qp < 0)
00656         vf->priv->qp = 0;
00657 
00658     if (i < -15) i = -15;
00659     if (i > 32) i = 32;
00660 
00661     bias= (1<<4)+i; //regulable
00662     vf->priv->prev_q=0;
00663     //
00664     for(i=0;i<64;i++) //FIXME: tune custom_threshold[] and remove this !
00665         custom_threshold_m[i]=(int)(custom_threshold[i]*(bias/71.)+ 0.5);
00666     for(i=0;i<8;i++){
00667         vf->priv->threshold_mtx_noq[2*i]=(uint64_t)custom_threshold_m[i*8+2]
00668             |(((uint64_t)custom_threshold_m[i*8+6])<<16)
00669             |(((uint64_t)custom_threshold_m[i*8+0])<<32)
00670             |(((uint64_t)custom_threshold_m[i*8+4])<<48);
00671         vf->priv->threshold_mtx_noq[2*i+1]=(uint64_t)custom_threshold_m[i*8+5]
00672             |(((uint64_t)custom_threshold_m[i*8+3])<<16)
00673             |(((uint64_t)custom_threshold_m[i*8+1])<<32)
00674             |(((uint64_t)custom_threshold_m[i*8+7])<<48);
00675     }
00676 
00677     if (vf->priv->qp) vf->priv->prev_q=vf->priv->qp, mul_thrmat_s(vf->priv, vf->priv->qp);
00678 
00679     return 1;
00680 }
00681 
// Public descriptor of the fspp filter; vf_open is the entry point that sets
// up an instance.
// NOTE(review): the initializer is positional - the fields are presumably
// description, short name, author, comment, open function and option
// description; confirm against the vf_info_t definition in vf.h.
00682 const vf_info_t vf_info_fspp = {
00683     "fast simple postprocess",
00684     "fspp",
00685     "Michael Niedermayer, Nikolaj Poroshin",
00686     "",
00687     vf_open,
00688     NULL
00689 };
00690 
00691 //====================================================================
00692 //Specific spp's dct, idct and threshold functions
00693 //I'd prefer to have them in the separate file.
00694 
00695 //#define MANGLE(a) #a
00696 
00697 //typedef int16_t DCTELEM; //! only int16_t
00698 
//Side length of a transform block, as a number and as a string
#define DCTSIZE 8
#define DCTSIZE_S "8"

//FIX(x,s): x as a fixed-point value with s fractional bits, rounded and
//reduced to its low 16 bits (so negative values come out in two's complement).
#define FIX(x,s)  ((int) ((x) * (1<<(s)) + 0.5)&0xffff)
//C64(x): the 16-bit value x replicated into all four 16-bit lanes of a
//uint64_t. x is widened before shifting so that no signed int is shifted
//into its sign bit; x must be in 0..0xffff.
#define C64(x)    ((uint64_t)(x)<<48 | (uint64_t)(x)<<32 | (uint64_t)(x)<<16 | (uint64_t)(x))
#define FIX64(x,s)  C64(FIX(x,s))

//High part of a 16.16 product
#define MULTIPLY16H(x,k)   (((x)*(k))>>16)
//r = x if x lies outside [-t, t], else r = 0. Expands to a complete
//if/else statement including its terminating semicolon.
#define THRESHOLD(r,x,t) if(((unsigned)((x)+(t)))>(t)*2) r=(x);else r=0;
//Rounding right shift by n bits
#define DESCALE(x,n)  (((x) + (1 << ((n)-1))) >> (n))
00709 
00710 #if HAVE_MMX
00711 
// Fixed-point multipliers for the MMX transform code. FIX64(x, s) scales x by
// 1<<s, keeps the low 16 bits and replicates them into all four 16-bit lanes
// of the 64-bit constant.
00712 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_382683433)=FIX64(0.382683433, 14);
00713 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_541196100)=FIX64(0.541196100, 14);
00714 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_707106781)=FIX64(0.707106781, 14);
00715 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_306562965)=FIX64(1.306562965, 14);
00716 
// same value as MM_FIX_1_414213562 below, but with 14 instead of 13 fractional bits
00717 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562_A)=FIX64(1.414213562, 14);
00718 
00719 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_847759065)=FIX64(1.847759065, 13);
// note the sign: the stored value is the negated constant
00720 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_2_613125930)=FIX64(-2.613125930, 13); //-
00721 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562)=FIX64(1.414213562, 13);
00722 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_082392200)=FIX64(1.082392200, 13);
00723 //for t3,t5,t7 == 0 shortcut
00724 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_847759065)=FIX64(0.847759065, 14);
00725 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_566454497)=FIX64(0.566454497, 14);
00726 DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_198912367)=FIX64(0.198912367, 14);
00727 
// the values 4 and 2 replicated into all four 16-bit lanes
00728 DECLARE_ASM_CONST(8, uint64_t, MM_DESCALE_RND)=C64(4);
00729 DECLARE_ASM_CONST(8, uint64_t, MM_2)=C64(2);
00730 
00731 #else /* !HAVE_MMX */
00732 
// Working type of the C transform code: 32-bit intermediates
00733 typedef int32_t int_simd16_t;
// Fixed-point multipliers for the C transform code. FIX(x, s) scales x by
// 1<<s and keeps the low 16 bits; the second argument is the number of
// fractional bits (14 or 13).
00734 static const int16_t FIX_0_382683433=FIX(0.382683433, 14);
00735 static const int16_t FIX_0_541196100=FIX(0.541196100, 14);
00736 static const int16_t FIX_0_707106781=FIX(0.707106781, 14);
00737 static const int16_t FIX_1_306562965=FIX(1.306562965, 14);
// same value as FIX_1_414213562 below, but with 14 instead of 13 fractional bits
00738 static const int16_t FIX_1_414213562_A=FIX(1.414213562, 14);
00739 static const int16_t FIX_1_847759065=FIX(1.847759065, 13);
// note the sign: the stored value is the negated constant
00740 static const int16_t FIX_2_613125930=FIX(-2.613125930, 13); //-
00741 static const int16_t FIX_1_414213562=FIX(1.414213562, 13);
00742 static const int16_t FIX_1_082392200=FIX(1.082392200, 13);
00743 
00744 #endif
00745 
00746 #if !HAVE_MMX
00747 
00748 static void column_fidct_c(int16_t* thr_adr, DCTELEM *data, DCTELEM *output, int cnt)
00749 {
00750     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
00751     int_simd16_t tmp10, tmp11, tmp12, tmp13;
00752     int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
00753     int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
00754 
00755     DCTELEM* dataptr;
00756     DCTELEM* wsptr;
00757     int16_t *threshold;
00758     int ctr;
00759 
00760     dataptr = data;
00761     wsptr = output;
00762 
00763     for (; cnt > 0; cnt-=2) { //start positions
00764         threshold=(int16_t*)thr_adr;//threshold_mtx
00765         for (ctr = DCTSIZE; ctr > 0; ctr--) {
00766             // Process columns from input, add to output.
00767             tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
00768             tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
00769 
00770             tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
00771             tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
00772 
00773             tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
00774             tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
00775 
00776             tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
00777             tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
00778 
00779             // Even part of FDCT
00780 
00781             tmp10 = tmp0 + tmp3;
00782             tmp13 = tmp0 - tmp3;
00783             tmp11 = tmp1 + tmp2;
00784             tmp12 = tmp1 - tmp2;
00785 
00786             d0 = tmp10 + tmp11;
00787             d4 = tmp10 - tmp11;
00788 
00789             z1 = MULTIPLY16H((tmp12 + tmp13) <<2, FIX_0_707106781);
00790             d2 = tmp13 + z1;
00791             d6 = tmp13 - z1;
00792 
00793             // Even part of IDCT
00794 
00795             THRESHOLD(tmp0, d0, threshold[0*8]);
00796             THRESHOLD(tmp1, d2, threshold[2*8]);
00797             THRESHOLD(tmp2, d4, threshold[4*8]);
00798             THRESHOLD(tmp3, d6, threshold[6*8]);
00799             tmp0+=2;
00800             tmp10 = (tmp0 + tmp2)>>2;
00801             tmp11 = (tmp0 - tmp2)>>2;
00802 
00803             tmp13 = (tmp1 + tmp3)>>2; //+2 !  (psnr decides)
00804             tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
00805 
00806             tmp0 = tmp10 + tmp13; //->temps
00807             tmp3 = tmp10 - tmp13; //->temps
00808             tmp1 = tmp11 + tmp12; //->temps
00809             tmp2 = tmp11 - tmp12; //->temps
00810 
00811             // Odd part of FDCT
00812 
00813             tmp10 = tmp4 + tmp5;
00814             tmp11 = tmp5 + tmp6;
00815             tmp12 = tmp6 + tmp7;
00816 
00817             z5 = MULTIPLY16H((tmp10 - tmp12)<<2, FIX_0_382683433);
00818             z2 = MULTIPLY16H(tmp10 <<2, FIX_0_541196100) + z5;
00819             z4 = MULTIPLY16H(tmp12 <<2, FIX_1_306562965) + z5;
00820             z3 = MULTIPLY16H(tmp11 <<2, FIX_0_707106781);
00821 
00822             z11 = tmp7 + z3;
00823             z13 = tmp7 - z3;
00824 
00825             d5 = z13 + z2;
00826             d3 = z13 - z2;
00827             d1 = z11 + z4;
00828             d7 = z11 - z4;
00829 
00830             // Odd part of IDCT
00831 
00832             THRESHOLD(tmp4, d1, threshold[1*8]);
00833             THRESHOLD(tmp5, d3, threshold[3*8]);
00834             THRESHOLD(tmp6, d5, threshold[5*8]);
00835             THRESHOLD(tmp7, d7, threshold[7*8]);
00836 
00837             //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
00838             z13 = tmp6 + tmp5;
00839             z10 = (tmp6 - tmp5)<<1;
00840             z11 = tmp4 + tmp7;
00841             z12 = (tmp4 - tmp7)<<1;
00842 
00843             tmp7 = (z11 + z13)>>2; //+2 !
00844             tmp11 = MULTIPLY16H((z11 - z13)<<1, FIX_1_414213562);
00845             z5 =    MULTIPLY16H(z10 + z12, FIX_1_847759065);
00846             tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
00847             tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - !!
00848 
00849             tmp6 = tmp12 - tmp7;
00850             tmp5 = tmp11 - tmp6;
00851             tmp4 = tmp10 + tmp5;
00852 
00853             wsptr[DCTSIZE*0]+=  (tmp0 + tmp7);
00854             wsptr[DCTSIZE*1]+=  (tmp1 + tmp6);
00855             wsptr[DCTSIZE*2]+=  (tmp2 + tmp5);
00856             wsptr[DCTSIZE*3]+=  (tmp3 - tmp4);
00857             wsptr[DCTSIZE*4]+=  (tmp3 + tmp4);
00858             wsptr[DCTSIZE*5]+=  (tmp2 - tmp5);
00859             wsptr[DCTSIZE*6]=  (tmp1 - tmp6);
00860             wsptr[DCTSIZE*7]=  (tmp0 - tmp7);
00861             //
00862             dataptr++; //next column
00863             wsptr++;
00864             threshold++;
00865         }
00866         dataptr+=8; //skip each second start pos
00867         wsptr  +=8;
00868     }
00869 }
00870 
00871 #else /* HAVE_MMX */
00872 
00873 static void column_fidct_mmx(int16_t* thr_adr,  DCTELEM *data,  DCTELEM *output,  int cnt)
00874 {
00875     uint64_t __attribute__((aligned(8))) temps[4];
00876     __asm__ volatile(
00877         ASMALIGN(4)
00878         "1:                   \n\t"
00879         "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
00880         //
00881         "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t"
00882         "movq %%mm1, %%mm0             \n\t"
00883 
00884         "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t" //t0
00885         "movq %%mm7, %%mm3             \n\t"
00886 
00887         "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t" //t3
00888         "movq %%mm1, %%mm5             \n\t"
00889 
00890         "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t"
00891         "psubw %%mm7, %%mm1            \n\t" //t13
00892 
00893         "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
00894         "movq %%mm6, %%mm4             \n\t"
00895 
00896         "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t" //t1
00897         "paddw %%mm7, %%mm5            \n\t" //t10
00898 
00899         "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t2
00900         "movq %%mm6, %%mm7             \n\t"
00901 
00902         "paddw %%mm2, %%mm6            \n\t" //t11
00903         "psubw %%mm2, %%mm7            \n\t" //t12
00904 
00905         "movq %%mm5, %%mm2             \n\t"
00906         "paddw %%mm6, %%mm5            \n\t" //d0
00907         // i0 t13 t12 i3 i1 d0 - d4
00908         "psubw %%mm6, %%mm2            \n\t" //d4
00909         "paddw %%mm1, %%mm7            \n\t"
00910 
00911         "movq  4*16(%%"REG_d"), %%mm6      \n\t"
00912         "psllw $2, %%mm7              \n\t"
00913 
00914         "psubw 0*16(%%"REG_d"), %%mm5      \n\t"
00915         "psubw %%mm6, %%mm2            \n\t"
00916 
00917         "paddusw 0*16(%%"REG_d"), %%mm5    \n\t"
00918         "paddusw %%mm6, %%mm2          \n\t"
00919 
00920         "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm7 \n\t"
00921         //
00922         "paddw 0*16(%%"REG_d"), %%mm5      \n\t"
00923         "paddw %%mm6, %%mm2            \n\t"
00924 
00925         "psubusw 0*16(%%"REG_d"), %%mm5    \n\t"
00926         "psubusw %%mm6, %%mm2          \n\t"
00927 
00928 //This func is totally compute-bound,  operates at huge speed. So,  DC shortcut
00929 // at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3).
00930 //However,  typical numbers: nondc - 29%%,  dc - 46%%,  zero - 25%%. All <> 0 case is very rare.
00931         "paddw "MANGLE(MM_2)", %%mm5            \n\t"
00932         "movq %%mm2, %%mm6             \n\t"
00933 
00934         "paddw %%mm5, %%mm2            \n\t"
00935         "psubw %%mm6, %%mm5            \n\t"
00936 
00937         "movq %%mm1, %%mm6             \n\t"
00938         "paddw %%mm7, %%mm1            \n\t" //d2
00939 
00940         "psubw 2*16(%%"REG_d"), %%mm1      \n\t"
00941         "psubw %%mm7, %%mm6            \n\t" //d6
00942 
00943         "movq 6*16(%%"REG_d"), %%mm7       \n\t"
00944         "psraw $2, %%mm5              \n\t"
00945 
00946         "paddusw 2*16(%%"REG_d"), %%mm1    \n\t"
00947         "psubw %%mm7, %%mm6            \n\t"
00948         // t7 d2 /t11 t4 t6 - d6 /t10
00949 
00950         "paddw 2*16(%%"REG_d"), %%mm1      \n\t"
00951         "paddusw %%mm7, %%mm6          \n\t"
00952 
00953         "psubusw 2*16(%%"REG_d"), %%mm1    \n\t"
00954         "paddw %%mm7, %%mm6            \n\t"
00955 
00956         "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t"
00957         "psubusw %%mm7, %%mm6          \n\t"
00958 
00959         //movq [edi+"DCTSIZE_S"*2*2], mm1
00960         //movq [edi+"DCTSIZE_S"*6*2], mm6
00961         "movq %%mm1, %%mm7             \n\t"
00962         "psraw $2, %%mm2              \n\t"
00963 
00964         "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t"
00965         "psubw %%mm6, %%mm1            \n\t"
00966 
00967         "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t"
00968         "paddw %%mm7, %%mm6            \n\t" //'t13
00969 
00970         "psraw $2, %%mm6              \n\t" //paddw mm6, MM_2 !!    ---
00971         "movq %%mm2, %%mm7             \n\t"
00972 
00973         "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
00974         "paddw %%mm6, %%mm2            \n\t" //'t0
00975 
00976         "movq %%mm2, 0*8+%3            \n\t" 
00977         "psubw %%mm6, %%mm7            \n\t" //'t3
00978 
00979         "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
00980         "psubw %%mm6, %%mm1            \n\t" //'t12
00981 
00982         "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t5
00983         "movq %%mm5, %%mm6             \n\t"
00984 
00985         "movq %%mm7, 3*8+%3            \n\t"
00986         "paddw %%mm2, %%mm3            \n\t" //t10
00987 
00988         "paddw %%mm4, %%mm2            \n\t" //t11
00989         "paddw %%mm0, %%mm4            \n\t" //t12
00990 
00991         "movq %%mm3, %%mm7             \n\t"
00992         "psubw %%mm4, %%mm3            \n\t"
00993 
00994         "psllw $2, %%mm3              \n\t"
00995         "psllw $2, %%mm7              \n\t" //opt for P6
00996 
00997         "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
00998         "psllw $2, %%mm4              \n\t"
00999 
01000         "pmulhw "MANGLE(MM_FIX_0_541196100)", %%mm7 \n\t"
01001         "psllw $2, %%mm2              \n\t"
01002 
01003         "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
01004         "paddw %%mm1, %%mm5            \n\t" //'t1
01005 
01006         "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm2 \n\t"
01007         "psubw %%mm1, %%mm6            \n\t" //'t2
01008         // t7 't12 't11 t4 t6 - 't13 't10   ---
01009 
01010         "paddw %%mm3, %%mm7            \n\t" //z2
01011 
01012         "movq %%mm5, 1*8+%3            \n\t"
01013         "paddw %%mm3, %%mm4            \n\t" //z4
01014 
01015         "movq 3*16(%%"REG_d"), %%mm3       \n\t"
01016         "movq %%mm0, %%mm1             \n\t"
01017 
01018         "movq %%mm6, 2*8+%3            \n\t"
01019         "psubw %%mm2, %%mm1            \n\t" //z13
01020 
01021 //===
01022         "paddw %%mm2, %%mm0            \n\t" //z11
01023         "movq %%mm1, %%mm5             \n\t"
01024 
01025         "movq 5*16(%%"REG_d"), %%mm2       \n\t"
01026         "psubw %%mm7, %%mm1            \n\t" //d3
01027 
01028         "paddw %%mm7, %%mm5            \n\t" //d5
01029         "psubw %%mm3, %%mm1            \n\t"
01030 
01031         "movq 1*16(%%"REG_d"), %%mm7       \n\t"
01032         "psubw %%mm2, %%mm5            \n\t"
01033 
01034         "movq %%mm0, %%mm6             \n\t"
01035         "paddw %%mm4, %%mm0            \n\t" //d1
01036 
01037         "paddusw %%mm3, %%mm1          \n\t"
01038         "psubw %%mm4, %%mm6            \n\t" //d7
01039 
01040         // d1 d3 - - - d5 d7 -
01041         "movq 7*16(%%"REG_d"), %%mm4       \n\t"
01042         "psubw %%mm7, %%mm0            \n\t"
01043 
01044         "psubw %%mm4, %%mm6            \n\t"
01045         "paddusw %%mm2, %%mm5          \n\t"
01046 
01047         "paddusw %%mm4, %%mm6          \n\t"
01048         "paddw %%mm3, %%mm1            \n\t"
01049 
01050         "paddw %%mm2, %%mm5            \n\t"
01051         "paddw %%mm4, %%mm6            \n\t"
01052 
01053         "psubusw %%mm3, %%mm1          \n\t"
01054         "psubusw %%mm2, %%mm5          \n\t"
01055 
01056         "psubusw %%mm4, %%mm6          \n\t"
01057         "movq %%mm1, %%mm4             \n\t"
01058 
01059         "por %%mm5, %%mm4              \n\t"
01060         "paddusw %%mm7, %%mm0          \n\t"
01061 
01062         "por %%mm6, %%mm4              \n\t"
01063         "paddw %%mm7, %%mm0            \n\t"
01064 
01065         "packssdw %%mm4, %%mm4         \n\t"
01066         "psubusw %%mm7, %%mm0          \n\t"
01067 
01068         "movd %%mm4, %%"REG_a"             \n\t"
01069         "or %%"REG_a", %%"REG_a"              \n\t"
01070         "jnz 2f                 \n\t"
01071         //movq [edi+"DCTSIZE_S"*3*2], mm1
01072         //movq [edi+"DCTSIZE_S"*5*2], mm5
01073         //movq [edi+"DCTSIZE_S"*1*2], mm0
01074         //movq [edi+"DCTSIZE_S"*7*2], mm6
01075         // t4 t5 - - - t6 t7 -
01076         //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0
01077 //Typical numbers: nondc - 19%%,  dc - 26%%,  zero - 55%%. zero case alone isn't worthwhile
01078         "movq 0*8+%3, %%mm4            \n\t"
01079         "movq %%mm0, %%mm1             \n\t"
01080 
01081         "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6
01082         "movq %%mm1, %%mm2             \n\t"
01083 
01084         "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
01085         "movq %%mm2, %%mm3             \n\t"
01086 
01087         "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5
01088         "paddw %%mm4, %%mm5            \n\t"
01089 
01090         "movq 1*8+%3, %%mm6            \n\t"
01091         //paddw mm3, MM_2
01092         "psraw $2, %%mm3              \n\t" //tmp7
01093 
01094         "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4
01095         "psubw %%mm3, %%mm4            \n\t"
01096 
01097         "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
01098         "paddw %%mm3, %%mm5            \n\t"
01099 
01100         "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
01101         "paddw %%mm6, %%mm7            \n\t"
01102 
01103         "movq 2*8+%3, %%mm3            \n\t"
01104         "psubw %%mm0, %%mm6            \n\t"
01105 
01106         "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
01107         "paddw %%mm0, %%mm7            \n\t"
01108 
01109         "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
01110         "paddw %%mm3, %%mm4            \n\t"
01111 
01112         "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
01113         "psubw %%mm1, %%mm3            \n\t"
01114 
01115         "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
01116         "paddw %%mm1, %%mm4            \n\t"
01117 
01118         "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
01119         "paddw %%mm3, %%mm5            \n\t"
01120 
01121         "movq 3*8+%3, %%mm0            \n\t"
01122         "add $8, %%"REG_S"               \n\t"
01123 
01124         "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
01125         "paddw %%mm0, %%mm6            \n\t"
01126 
01127         "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
01128         "psubw %%mm2, %%mm0            \n\t"
01129 
01130         "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
01131         "paddw %%mm2, %%mm6            \n\t"
01132 
01133         "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
01134         "paddw %%mm0, %%mm7            \n\t"
01135 
01136         "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
01137 
01138         "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
01139         "add $8, %%"REG_D"               \n\t"
01140         "jmp 4f                  \n\t"
01141 
01142         "2:                    \n\t"
01143         //--- non DC2
01144         //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1  (actually thr1, thr1, thr1-1)
01145         //psraw mm5, 2
01146         //psraw mm0, 2
01147         //psraw mm6, 2
01148         "movq %%mm5, %%mm3             \n\t"
01149         "psubw %%mm1, %%mm5            \n\t"
01150 
01151         "psllw $1, %%mm5              \n\t" //'z10
01152         "paddw %%mm1, %%mm3            \n\t" //'z13
01153 
01154         "movq %%mm0, %%mm2             \n\t"
01155         "psubw %%mm6, %%mm0            \n\t"
01156 
01157         "movq %%mm5, %%mm1             \n\t"
01158         "psllw $1, %%mm0              \n\t" //'z12
01159 
01160         "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //-
01161         "paddw %%mm0, %%mm5            \n\t"
01162 
01163         "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5
01164         "paddw %%mm6, %%mm2            \n\t" //'z11
01165 
01166         "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
01167         "movq %%mm2, %%mm7             \n\t"
01168 
01169         //---
01170         "movq 0*8+%3, %%mm4            \n\t"
01171         "psubw %%mm3, %%mm2            \n\t"
01172 
01173         "psllw $1, %%mm2              \n\t"
01174         "paddw %%mm3, %%mm7            \n\t" //'t7
01175 
01176         "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11
01177         "movq %%mm4, %%mm6             \n\t"
01178         //paddw mm7, MM_2
01179         "psraw $2, %%mm7              \n\t"
01180 
01181         "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
01182         "psubw %%mm7, %%mm6            \n\t"
01183 
01184         "movq 1*8+%3, %%mm3            \n\t"
01185         "paddw %%mm7, %%mm4            \n\t"
01186 
01187         "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
01188         "paddw %%mm5, %%mm1            \n\t" //'t12
01189 
01190         "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
01191         "psubw %%mm7, %%mm1            \n\t" //'t6
01192 
01193         "movq 2*8+%3, %%mm7            \n\t"
01194         "psubw %%mm5, %%mm0            \n\t" //'t10
01195 
01196         "movq 3*8+%3, %%mm6            \n\t"
01197         "movq %%mm3, %%mm5             \n\t"
01198 
01199         "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
01200         "psubw %%mm1, %%mm5            \n\t"
01201 
01202         "psubw %%mm1, %%mm2            \n\t" //'t5
01203         "paddw %%mm1, %%mm3            \n\t"
01204 
01205         "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
01206         "movq %%mm7, %%mm4             \n\t"
01207 
01208         "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
01209         "psubw %%mm2, %%mm4            \n\t"
01210 
01211         "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
01212         "paddw %%mm2, %%mm7            \n\t"
01213 
01214         "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
01215         "paddw %%mm2, %%mm0            \n\t" //'t4
01216 
01217         // 't4 't6 't5 - - - - 't7
01218         "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
01219         "movq %%mm6, %%mm1             \n\t"
01220 
01221         "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
01222         "psubw %%mm0, %%mm1            \n\t"
01223 
01224         "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
01225         "paddw %%mm0, %%mm6            \n\t"
01226 
01227         "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
01228         "add $8, %%"REG_S"               \n\t"
01229 
01230         "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
01231 
01232         "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
01233         "add $8, %%"REG_D"               \n\t"
01234 
01235         "4:                     \n\t"
01236 //=part 2 (the same)===========================================================
01237         "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
01238         //
01239         "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t"
01240         "movq %%mm1, %%mm0             \n\t"
01241 
01242         "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t" //t0
01243         "movq %%mm7, %%mm3             \n\t"
01244 
01245         "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t" //t3
01246         "movq %%mm1, %%mm5             \n\t"
01247 
01248         "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t"
01249         "psubw %%mm7, %%mm1            \n\t" //t13
01250 
01251         "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
01252         "movq %%mm6, %%mm4             \n\t"
01253 
01254         "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t" //t1
01255         "paddw %%mm7, %%mm5            \n\t" //t10
01256 
01257         "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t2
01258         "movq %%mm6, %%mm7             \n\t"
01259 
01260         "paddw %%mm2, %%mm6            \n\t" //t11
01261         "psubw %%mm2, %%mm7            \n\t" //t12
01262 
01263         "movq %%mm5, %%mm2             \n\t"
01264         "paddw %%mm6, %%mm5            \n\t" //d0
01265         // i0 t13 t12 i3 i1 d0 - d4
01266         "psubw %%mm6, %%mm2            \n\t" //d4
01267         "paddw %%mm1, %%mm7            \n\t"
01268 
01269         "movq  1*8+4*16(%%"REG_d"), %%mm6  \n\t"
01270         "psllw $2, %%mm7              \n\t"
01271 
01272         "psubw 1*8+0*16(%%"REG_d"), %%mm5  \n\t"
01273         "psubw %%mm6, %%mm2            \n\t"
01274 
01275         "paddusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
01276         "paddusw %%mm6, %%mm2          \n\t"
01277 
01278         "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm7 \n\t"
01279         //
01280         "paddw 1*8+0*16(%%"REG_d"), %%mm5  \n\t"
01281         "paddw %%mm6, %%mm2            \n\t"
01282 
01283         "psubusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t"
01284         "psubusw %%mm6, %%mm2          \n\t"
01285 
01286 //This func is totally compute-bound,  operates at huge speed. So,  DC shortcut
01287 // at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3).
01288 //However,  typical numbers: nondc - 29%%,  dc - 46%%,  zero - 25%%. All <> 0 case is very rare.
01289         "paddw "MANGLE(MM_2)", %%mm5            \n\t"
01290         "movq %%mm2, %%mm6             \n\t"
01291 
01292         "paddw %%mm5, %%mm2            \n\t"
01293         "psubw %%mm6, %%mm5            \n\t"
01294 
01295         "movq %%mm1, %%mm6             \n\t"
01296         "paddw %%mm7, %%mm1            \n\t" //d2
01297 
01298         "psubw 1*8+2*16(%%"REG_d"), %%mm1  \n\t"
01299         "psubw %%mm7, %%mm6            \n\t" //d6
01300 
01301         "movq 1*8+6*16(%%"REG_d"), %%mm7   \n\t"
01302         "psraw $2, %%mm5              \n\t"
01303 
01304         "paddusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
01305         "psubw %%mm7, %%mm6            \n\t"
01306         // t7 d2 /t11 t4 t6 - d6 /t10
01307 
01308         "paddw 1*8+2*16(%%"REG_d"), %%mm1  \n\t"
01309         "paddusw %%mm7, %%mm6          \n\t"
01310 
01311         "psubusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t"
01312         "paddw %%mm7, %%mm6            \n\t"
01313 
01314         "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t"
01315         "psubusw %%mm7, %%mm6          \n\t"
01316 
01317         //movq [edi+"DCTSIZE_S"*2*2], mm1
01318         //movq [edi+"DCTSIZE_S"*6*2], mm6
01319         "movq %%mm1, %%mm7             \n\t"
01320         "psraw $2, %%mm2              \n\t"
01321 
01322         "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t"
01323         "psubw %%mm6, %%mm1            \n\t"
01324 
01325         "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t"
01326         "paddw %%mm7, %%mm6            \n\t" //'t13
01327 
01328         "psraw $2, %%mm6              \n\t" //paddw mm6, MM_2 !!    ---
01329         "movq %%mm2, %%mm7             \n\t"
01330 
01331         "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
01332         "paddw %%mm6, %%mm2            \n\t" //'t0
01333 
01334         "movq %%mm2, 0*8+%3            \n\t" 
01335         "psubw %%mm6, %%mm7            \n\t" //'t3
01336 
01337         "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
01338         "psubw %%mm6, %%mm1            \n\t" //'t12
01339 
01340         "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t5
01341         "movq %%mm5, %%mm6             \n\t"
01342 
01343         "movq %%mm7, 3*8+%3            \n\t"
01344         "paddw %%mm2, %%mm3            \n\t" //t10
01345 
01346         "paddw %%mm4, %%mm2            \n\t" //t11
01347         "paddw %%mm0, %%mm4            \n\t" //t12
01348 
01349         "movq %%mm3, %%mm7             \n\t"
01350         "psubw %%mm4, %%mm3            \n\t"
01351 
01352         "psllw $2, %%mm3              \n\t"
01353         "psllw $2, %%mm7              \n\t" //opt for P6
01354 
01355         "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
01356         "psllw $2, %%mm4              \n\t"
01357 
01358         "pmulhw "MANGLE(MM_FIX_0_541196100)", %%mm7 \n\t"
01359         "psllw $2, %%mm2              \n\t"
01360 
01361         "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
01362         "paddw %%mm1, %%mm5            \n\t" //'t1
01363 
01364         "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm2 \n\t"
01365         "psubw %%mm1, %%mm6            \n\t" //'t2
01366         // t7 't12 't11 t4 t6 - 't13 't10   ---
01367 
01368         "paddw %%mm3, %%mm7            \n\t" //z2
01369 
01370         "movq %%mm5, 1*8+%3            \n\t"
01371         "paddw %%mm3, %%mm4            \n\t" //z4
01372 
01373         "movq 1*8+3*16(%%"REG_d"), %%mm3   \n\t"
01374         "movq %%mm0, %%mm1             \n\t"
01375 
01376         "movq %%mm6, 2*8+%3            \n\t"
01377         "psubw %%mm2, %%mm1            \n\t" //z13
01378 
01379 //===
01380         "paddw %%mm2, %%mm0            \n\t" //z11
01381         "movq %%mm1, %%mm5             \n\t"
01382 
01383         "movq 1*8+5*16(%%"REG_d"), %%mm2   \n\t"
01384         "psubw %%mm7, %%mm1            \n\t" //d3
01385 
01386         "paddw %%mm7, %%mm5            \n\t" //d5
01387         "psubw %%mm3, %%mm1            \n\t"
01388 
01389         "movq 1*8+1*16(%%"REG_d"), %%mm7   \n\t"
01390         "psubw %%mm2, %%mm5            \n\t"
01391 
01392         "movq %%mm0, %%mm6             \n\t"
01393         "paddw %%mm4, %%mm0            \n\t" //d1
01394 
01395         "paddusw %%mm3, %%mm1          \n\t"
01396         "psubw %%mm4, %%mm6            \n\t" //d7
01397 
01398         // d1 d3 - - - d5 d7 -
01399         "movq 1*8+7*16(%%"REG_d"), %%mm4   \n\t"
01400         "psubw %%mm7, %%mm0            \n\t"
01401 
01402         "psubw %%mm4, %%mm6            \n\t"
01403         "paddusw %%mm2, %%mm5          \n\t"
01404 
01405         "paddusw %%mm4, %%mm6          \n\t"
01406         "paddw %%mm3, %%mm1            \n\t"
01407 
01408         "paddw %%mm2, %%mm5            \n\t"
01409         "paddw %%mm4, %%mm6            \n\t"
01410 
01411         "psubusw %%mm3, %%mm1          \n\t"
01412         "psubusw %%mm2, %%mm5          \n\t"
01413 
01414         "psubusw %%mm4, %%mm6          \n\t"
01415         "movq %%mm1, %%mm4             \n\t"
01416 
01417         "por %%mm5, %%mm4              \n\t"
01418         "paddusw %%mm7, %%mm0          \n\t"
01419 
01420         "por %%mm6, %%mm4              \n\t"
01421         "paddw %%mm7, %%mm0            \n\t"
01422 
01423         "packssdw %%mm4, %%mm4         \n\t"
01424         "psubusw %%mm7, %%mm0          \n\t"
01425 
01426         "movd %%mm4, %%"REG_a"             \n\t"
01427         "or %%"REG_a", %%"REG_a"              \n\t"
01428         "jnz 3f                 \n\t"
01429         //movq [edi+"DCTSIZE_S"*3*2], mm1
01430         //movq [edi+"DCTSIZE_S"*5*2], mm5
01431         //movq [edi+"DCTSIZE_S"*1*2], mm0
01432         //movq [edi+"DCTSIZE_S"*7*2], mm6
01433         // t4 t5 - - - t6 t7 -
01434         //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0
01435 //Typical numbers: nondc - 19%%,  dc - 26%%,  zero - 55%%. zero case alone isn't worthwhile
01436         "movq 0*8+%3, %%mm4            \n\t"
01437         "movq %%mm0, %%mm1             \n\t"
01438 
01439         "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6
01440         "movq %%mm1, %%mm2             \n\t"
01441 
01442         "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
01443         "movq %%mm2, %%mm3             \n\t"
01444 
01445         "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5
01446         "paddw %%mm4, %%mm5            \n\t"
01447 
01448         "movq 1*8+%3, %%mm6            \n\t"
01449         //paddw mm3, MM_2
01450         "psraw $2, %%mm3              \n\t" //tmp7
01451 
01452         "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4
01453         "psubw %%mm3, %%mm4            \n\t"
01454 
01455         "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
01456         "paddw %%mm3, %%mm5            \n\t"
01457 
01458         "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
01459         "paddw %%mm6, %%mm7            \n\t"
01460 
01461         "movq 2*8+%3, %%mm3            \n\t"
01462         "psubw %%mm0, %%mm6            \n\t"
01463 
01464         "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
01465         "paddw %%mm0, %%mm7            \n\t"
01466 
01467         "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
01468         "paddw %%mm3, %%mm4            \n\t"
01469 
01470         "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
01471         "psubw %%mm1, %%mm3            \n\t"
01472 
01473         "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
01474         "paddw %%mm1, %%mm4            \n\t"
01475 
01476         "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
01477         "paddw %%mm3, %%mm5            \n\t"
01478 
01479         "movq 3*8+%3, %%mm0            \n\t"
01480         "add $24, %%"REG_S"              \n\t"
01481 
01482         "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
01483         "paddw %%mm0, %%mm6            \n\t"
01484 
01485         "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
01486         "psubw %%mm2, %%mm0            \n\t"
01487 
01488         "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
01489         "paddw %%mm2, %%mm6            \n\t"
01490 
01491         "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
01492         "paddw %%mm0, %%mm7            \n\t"
01493 
01494         "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
01495 
01496         "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
01497         "add $24, %%"REG_D"              \n\t"
01498         "sub $2, %%"REG_c"               \n\t"
01499         "jnz 1b                \n\t"
01500         "jmp 5f                   \n\t"
01501 
01502         "3:                    \n\t"
01503         //--- non DC2
01504         //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1  (actually thr1, thr1, thr1-1)
01505         //psraw mm5, 2
01506         //psraw mm0, 2
01507         //psraw mm6, 2
01508         "movq %%mm5, %%mm3             \n\t"
01509         "psubw %%mm1, %%mm5            \n\t"
01510 
01511         "psllw $1, %%mm5              \n\t" //'z10
01512         "paddw %%mm1, %%mm3            \n\t" //'z13
01513 
01514         "movq %%mm0, %%mm2             \n\t"
01515         "psubw %%mm6, %%mm0            \n\t"
01516 
01517         "movq %%mm5, %%mm1             \n\t"
01518         "psllw $1, %%mm0              \n\t" //'z12
01519 
01520         "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //-
01521         "paddw %%mm0, %%mm5            \n\t"
01522 
01523         "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5
01524         "paddw %%mm6, %%mm2            \n\t" //'z11
01525 
01526         "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
01527         "movq %%mm2, %%mm7             \n\t"
01528 
01529         //---
01530         "movq 0*8+%3, %%mm4            \n\t"
01531         "psubw %%mm3, %%mm2            \n\t"
01532 
01533         "psllw $1, %%mm2              \n\t"
01534         "paddw %%mm3, %%mm7            \n\t" //'t7
01535 
01536         "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11
01537         "movq %%mm4, %%mm6             \n\t"
01538         //paddw mm7, MM_2
01539         "psraw $2, %%mm7              \n\t"
01540 
01541         "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
01542         "psubw %%mm7, %%mm6            \n\t"
01543 
01544         "movq 1*8+%3, %%mm3            \n\t"
01545         "paddw %%mm7, %%mm4            \n\t"
01546 
01547         "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
01548         "paddw %%mm5, %%mm1            \n\t" //'t12
01549 
01550         "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
01551         "psubw %%mm7, %%mm1            \n\t" //'t6
01552 
01553         "movq 2*8+%3, %%mm7            \n\t"
01554         "psubw %%mm5, %%mm0            \n\t" //'t10
01555 
01556         "movq 3*8+%3, %%mm6            \n\t"
01557         "movq %%mm3, %%mm5             \n\t"
01558 
01559         "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
01560         "psubw %%mm1, %%mm5            \n\t"
01561 
01562         "psubw %%mm1, %%mm2            \n\t" //'t5
01563         "paddw %%mm1, %%mm3            \n\t"
01564 
01565         "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
01566         "movq %%mm7, %%mm4             \n\t"
01567 
01568         "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
01569         "psubw %%mm2, %%mm4            \n\t"
01570 
01571         "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
01572         "paddw %%mm2, %%mm7            \n\t"
01573 
01574         "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
01575         "paddw %%mm2, %%mm0            \n\t" //'t4
01576 
01577         // 't4 't6 't5 - - - - 't7
01578         "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
01579         "movq %%mm6, %%mm1             \n\t"
01580 
01581         "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
01582         "psubw %%mm0, %%mm1            \n\t"
01583 
01584         "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
01585         "paddw %%mm0, %%mm6            \n\t"
01586 
01587         "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
01588         "add $24, %%"REG_S"              \n\t"
01589 
01590         "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
01591 
01592         "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
01593         "add $24, %%"REG_D"              \n\t"
01594         "sub $2, %%"REG_c"               \n\t"
01595         "jnz 1b                \n\t"
01596         "5:                      \n\t"
01597 
01598         : "+S"(data), "+D"(output), "+c"(cnt), "=o"(temps)
01599         : "d"(thr_adr)
01600         : "%"REG_a
01601         );
01602 }
01603 
01604 #endif // HAVE_MMX
01605 
01606 #if !HAVE_MMX
01607 
/**
 * Row IDCT pass, plain C version (built when !HAVE_MMX).
 *
 * Runs cnt*4 iterations. Each iteration reads the eight coefficients
 * wsptr[0..7] (indices 0..3 feed the even part, 4..7 the odd part), computes
 * eight results and ADDS DESCALE(result, 3) to outptr[k*output_stride] for
 * k = 0..7. It then moves one element forward in the output and DCTSIZE
 * elements forward in the workspace.
 *
 * @param workspace      coefficients to transform
 * @param output_adr     accumulation buffer; results are added with +=, so
 *                       the existing contents contribute to the final value
 * @param output_stride  distance between output rows, in int16_t elements
 * @param cnt            number of groups of four iterations to run
 *
 * Scaling by 4 and 8 is written as a multiplication rather than a left
 * shift: the operands can be negative, and left-shifting a negative signed
 * value is undefined behaviour in C.
 */
01608 static void row_idct_c(DCTELEM* workspace,
01609                        int16_t* output_adr, int output_stride, int cnt)
01610 {
01611     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
01612     int_simd16_t tmp10, tmp11, tmp12, tmp13;
01613     int_simd16_t z5, z10, z11, z12, z13;
01614     int16_t* outptr;
01615     DCTELEM* wsptr;
01616
01617     cnt*=4;
01618     wsptr = workspace;
01619     outptr = output_adr;
01620     for (; cnt > 0; cnt--) {
01621         // Even part
01622         //Simd version reads 4x4 block and transposes it
01623         tmp10 = ( wsptr[2] +  wsptr[3]);
01624         tmp11 = ( wsptr[2] -  wsptr[3]);
01625
01626         tmp13 = ( wsptr[0] +  wsptr[1]);
01627         tmp12 = (MULTIPLY16H( wsptr[0] - wsptr[1], FIX_1_414213562_A) * 4) - tmp13;//scale after the multiply to avoid overflow
01628
01629         tmp0 = tmp10 + tmp13; //->temps
01630         tmp3 = tmp10 - tmp13; //->temps
01631         tmp1 = tmp11 + tmp12;
01632         tmp2 = tmp11 - tmp12;
01633
01634         // Odd part
01635         //Also transpose, with previous:
01636         // ---- ----      ||||
01637         // ---- ---- idct ||||
01638         // ---- ---- ---> ||||
01639         // ---- ----      ||||
01640         z13 = wsptr[4] + wsptr[5];
01641         z10 = wsptr[4] - wsptr[5];
01642         z11 = wsptr[6] + wsptr[7];
01643         z12 = wsptr[6] - wsptr[7];
01644
01645         tmp7 = z11 + z13;
01646         tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
01647
01648         z5 =    MULTIPLY16H(z10 + z12, FIX_1_847759065);
01649         tmp10 = MULTIPLY16H(z12, FIX_1_082392200) - z5;
01650         tmp12 = MULTIPLY16H(z10, FIX_2_613125930) + z5; // - FIX_
01651
01652         tmp6 = tmp12 * 8 - tmp7;
01653         tmp5 = tmp11 * 8 - tmp6;
01654         tmp4 = tmp10 * 8 + tmp5;
01655
01656         // Final output stage: descale and accumulate into the output column
01657         outptr[0*output_stride]+= DESCALE(tmp0 + tmp7, 3);
01658         outptr[1*output_stride]+= DESCALE(tmp1 + tmp6, 3);
01659         outptr[2*output_stride]+= DESCALE(tmp2 + tmp5, 3);
01660         outptr[3*output_stride]+= DESCALE(tmp3 - tmp4, 3);
01661         outptr[4*output_stride]+= DESCALE(tmp3 + tmp4, 3);
01662         outptr[5*output_stride]+= DESCALE(tmp2 - tmp5, 3);
01663         outptr[6*output_stride]+= DESCALE(tmp1 - tmp6, 3); //no += ?
01664         outptr[7*output_stride]+= DESCALE(tmp0 - tmp7, 3); //no += ?
01665         outptr++;
01666
01667         wsptr += DCTSIZE;       // advance pointer to next row
01668     }
01669 }
01670 
01671 #else /* HAVE_MMX */
01672 
/*
 * MMX implementation of the row IDCT pass (row_idct_c() is the function
 * built instead when !HAVE_MMX).
 *
 *   workspace     - source coefficients, read through REG_S; advanced by
 *                   DCTSIZE_S*2*4 bytes per loop iteration.
 *   output_adr    - int16_t destination, accessed through REG_D. Each
 *                   iteration adds results to eight rows (four int16_t per
 *                   row, one movq each) and then advances REG_D by 8 bytes.
 *   output_stride - distance between output rows in int16_t elements; passed
 *                   to the asm in REG_a as output_stride*sizeof(short) bytes.
 *   cnt           - loop iteration count, kept in REG_c. The counter is only
 *                   tested after the body (dec/jnz), so it must be >= 1.
 *
 * temps[] is scratch memory (asm operand %3): slot 0 holds t0 and slot 1
 * holds t3 while the odd half is computed. REG_d is loaded with 3*REG_a and
 * is declared as clobbered.
 *
 * NOTE(review): no emms is issued in this function; presumably the caller
 * takes care of it - confirm.
 * NOTE(review): the asm loads and stores through REG_S/REG_D but declares no
 * "memory" clobber - confirm this is intentional.
 */
01673 static void row_idct_mmx (DCTELEM* workspace,
01674                           int16_t* output_adr,  int output_stride,  int cnt)
01675 {
01676     uint64_t __attribute__((aligned(8))) temps[4];
01677     __asm__ volatile(
01678         "lea (%%"REG_a",%%"REG_a",2), %%"REG_d"    \n\t" // REG_d = 3 * row stride in bytes
01679         "1:                     \n\t"
01680         "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm0 \n\t"
01681         //
01682
01683         "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm1 \n\t"
01684         "movq %%mm0, %%mm4             \n\t"
01685
01686         "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
01687         "punpcklwd %%mm1, %%mm0        \n\t"
01688
01689         "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm3 \n\t"
01690         "punpckhwd %%mm1, %%mm4        \n\t"
01691
01692         //transpose 4x4
01693         "movq %%mm2, %%mm7             \n\t"
01694         "punpcklwd %%mm3, %%mm2        \n\t"
01695
01696         "movq %%mm0, %%mm6             \n\t"
01697         "punpckldq %%mm2, %%mm0        \n\t" //0
01698
01699         "punpckhdq %%mm2, %%mm6        \n\t" //1
01700         "movq %%mm0, %%mm5             \n\t"
01701
01702         "punpckhwd %%mm3, %%mm7        \n\t"
01703         "psubw %%mm6, %%mm0            \n\t"
01704
01705         "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm0 \n\t"
01706         "movq %%mm4, %%mm2             \n\t"
01707
01708         "punpckldq %%mm7, %%mm4        \n\t" //2
01709         "paddw %%mm6, %%mm5            \n\t"
01710
01711         "punpckhdq %%mm7, %%mm2        \n\t" //3
01712         "movq %%mm4, %%mm1             \n\t"
01713
01714         "psllw $2, %%mm0              \n\t"
01715         "paddw %%mm2, %%mm4            \n\t" //t10
01716
01717         "movq "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_S"), %%mm3 \n\t"
01718         "psubw %%mm2, %%mm1            \n\t" //t11
01719
01720         "movq "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_S"), %%mm2 \n\t"
01721         "psubw %%mm5, %%mm0            \n\t"
01722
01723         "movq %%mm4, %%mm6             \n\t"
01724         "paddw %%mm5, %%mm4            \n\t" //t0
01725
01726         "psubw %%mm5, %%mm6            \n\t" //t3
01727         "movq %%mm1, %%mm7             \n\t"
01728
01729         "movq "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_S"), %%mm5 \n\t"
01730         "paddw %%mm0, %%mm1            \n\t" //t1
01731
01732         "movq %%mm4, 0*8+%3            \n\t" //t0 -> temps[0]
01733         "movq %%mm3, %%mm4             \n\t"
01734
01735         "movq %%mm6, 1*8+%3            \n\t" //t3 -> temps[1]
01736         "punpcklwd %%mm2, %%mm3        \n\t"
01737
01738         //transpose 4x4
01739         "movq "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_S"), %%mm6 \n\t"
01740         "punpckhwd %%mm2, %%mm4        \n\t"
01741
01742         "movq %%mm5, %%mm2             \n\t"
01743         "punpcklwd %%mm6, %%mm5        \n\t"
01744
01745         "psubw %%mm0, %%mm7            \n\t" //t2
01746         "punpckhwd %%mm6, %%mm2        \n\t"
01747
01748         "movq %%mm3, %%mm0             \n\t"
01749         "punpckldq %%mm5, %%mm3        \n\t" //4
01750
01751         "punpckhdq %%mm5, %%mm0        \n\t" //5
01752         "movq %%mm4, %%mm5             \n\t"
01753
01754         //
01755         "movq %%mm3, %%mm6             \n\t"
01756         "punpckldq %%mm2, %%mm4        \n\t" //6
01757
01758         "psubw %%mm0, %%mm3            \n\t" //z10
01759         "punpckhdq %%mm2, %%mm5        \n\t" //7
01760
01761         "paddw %%mm0, %%mm6            \n\t" //z13
01762         "movq %%mm4, %%mm2             \n\t"
01763
01764         "movq %%mm3, %%mm0             \n\t"
01765         "psubw %%mm5, %%mm4            \n\t" //z12
01766
01767         "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm0 \n\t" //-
01768         "paddw %%mm4, %%mm3            \n\t"
01769
01770         "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm3 \n\t" //z5
01771         "paddw %%mm5, %%mm2            \n\t" //z11  >
01772
01773         "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm4 \n\t"
01774         "movq %%mm2, %%mm5             \n\t"
01775
01776         "psubw %%mm6, %%mm2            \n\t"
01777         "paddw %%mm6, %%mm5            \n\t" //t7
01778
01779         "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //t11
01780         "paddw %%mm3, %%mm0            \n\t" //t12
01781
01782         "psllw $3, %%mm0              \n\t"
01783         "psubw %%mm3, %%mm4            \n\t" //t10
01784
01785         "movq 0*8+%3, %%mm6            \n\t" // reload t0 from temps[0]
01786         "movq %%mm1, %%mm3             \n\t"
01787
01788         "psllw $3, %%mm4              \n\t"
01789         "psubw %%mm5, %%mm0            \n\t" //t6
01790
01791         "psllw $3, %%mm2              \n\t"
01792         "paddw %%mm0, %%mm1            \n\t" //d1
01793
01794         "psubw %%mm0, %%mm2            \n\t" //t5
01795         "psubw %%mm0, %%mm3            \n\t" //d6
01796
01797         "paddw %%mm2, %%mm4            \n\t" //t4
01798         "movq %%mm7, %%mm0             \n\t"
01799
01800         "paddw %%mm2, %%mm7            \n\t" //d2
01801         "psubw %%mm2, %%mm0            \n\t" //d5
01802
01803         "movq "MANGLE(MM_DESCALE_RND)", %%mm2   \n\t" //4
01804         "psubw %%mm5, %%mm6            \n\t" //d7
01805
01806         "paddw 0*8+%3, %%mm5           \n\t" //d0
01807         "paddw %%mm2, %%mm1            \n\t"
01808
01809         "paddw %%mm2, %%mm5            \n\t"
01810         "psraw $3, %%mm1              \n\t"
01811
01812         "paddw %%mm2, %%mm7            \n\t"
01813         "psraw $3, %%mm5              \n\t"
01814
01815         "paddw (%%"REG_D"), %%mm5          \n\t" // accumulate onto existing output row 0
01816         "psraw $3, %%mm7              \n\t"
01817
01818         "paddw (%%"REG_D",%%"REG_a",), %%mm1    \n\t"
01819         "paddw %%mm2, %%mm0            \n\t"
01820
01821         "paddw (%%"REG_D",%%"REG_a",2), %%mm7   \n\t"
01822         "paddw %%mm2, %%mm3            \n\t"
01823
01824         "movq %%mm5, (%%"REG_D")           \n\t"
01825         "paddw %%mm2, %%mm6            \n\t"
01826
01827         "movq %%mm1, (%%"REG_D",%%"REG_a",)     \n\t"
01828         "psraw $3, %%mm0              \n\t"
01829
01830         "movq %%mm7, (%%"REG_D",%%"REG_a",2)    \n\t"
01831         "add %%"REG_d", %%"REG_D"             \n\t" //3*ls: REG_D now points at output row 3
01832
01833         "movq 1*8+%3, %%mm5           \n\t" //t3
01834         "psraw $3, %%mm3              \n\t"
01835
01836         "paddw (%%"REG_D",%%"REG_a",2), %%mm0   \n\t"
01837         "psubw %%mm4, %%mm5            \n\t" //d3
01838
01839         "paddw (%%"REG_D",%%"REG_d",), %%mm3    \n\t"
01840         "psraw $3, %%mm6              \n\t"
01841
01842         "paddw 1*8+%3, %%mm4           \n\t" //d4
01843         "paddw %%mm2, %%mm5            \n\t"
01844
01845         "paddw (%%"REG_D",%%"REG_a",4), %%mm6   \n\t"
01846         "paddw %%mm2, %%mm4            \n\t"
01847
01848         "movq %%mm0, (%%"REG_D",%%"REG_a",2)    \n\t"
01849         "psraw $3, %%mm5              \n\t"
01850
01851         "paddw (%%"REG_D"), %%mm5          \n\t"
01852         "psraw $3, %%mm4              \n\t"
01853
01854         "paddw (%%"REG_D",%%"REG_a",), %%mm4    \n\t"
01855         "add $"DCTSIZE_S"*2*4, %%"REG_S"      \n\t" //4 rows
01856
01857         "movq %%mm3, (%%"REG_D",%%"REG_d",)     \n\t"
01858         "movq %%mm6, (%%"REG_D",%%"REG_a",4)    \n\t"
01859         "movq %%mm5, (%%"REG_D")           \n\t"
01860         "movq %%mm4, (%%"REG_D",%%"REG_a",)     \n\t"
01861
01862         "sub %%"REG_d", %%"REG_D"             \n\t" // undo the +3*stride applied above
01863         "add $8, %%"REG_D"               \n\t" // step 8 bytes (four int16_t) along the output row
01864         "dec %%"REG_c"                   \n\t"
01865         "jnz 1b                  \n\t"
01866
01867         : "+S"(workspace), "+D"(output_adr), "+c"(cnt), "=o"(temps)
01868         : "a"(output_stride*sizeof(short))
01869         : "%"REG_d
01870         );
01871 }
01872 
01873 #endif // HAVE_MMX
01874 
01875 #if !HAVE_MMX
01876 
/**
 * Row FDCT pass, plain C version (built when !HAVE_MMX).
 *
 * Runs cnt*4 iterations. Each iteration reads the eight samples
 * pixels[line_size*0] .. pixels[line_size*7], writes eight values to
 * dataptr[0..7], then advances pixels by one byte and dataptr by DCTSIZE
 * elements.
 *
 * @param data       destination for the transformed values
 * @param pixels     8-bit source samples
 * @param line_size  distance between two source lines, in bytes
 * @param cnt        number of groups of four iterations to run
 *
 * Scaling by 4 is written as a multiplication rather than "<<2": the sums of
 * pixel differences can be negative, and left-shifting a negative signed
 * value is undefined behaviour in C.
 */
01877 static void row_fdct_c(DCTELEM *data, const uint8_t *pixels, int line_size, int cnt)
01878 {
01879     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
01880     int_simd16_t tmp10, tmp11, tmp12, tmp13;
01881     int_simd16_t z1, z2, z3, z4, z5, z11, z13;
01882     DCTELEM *dataptr;
01883
01884     cnt*=4;
01885     // Pass 1: process rows.
01886
01887     dataptr = data;
01888     for (; cnt > 0; cnt--) {
01889         tmp0 = pixels[line_size*0] + pixels[line_size*7];
01890         tmp7 = pixels[line_size*0] - pixels[line_size*7];
01891         tmp1 = pixels[line_size*1] + pixels[line_size*6];
01892         tmp6 = pixels[line_size*1] - pixels[line_size*6];
01893         tmp2 = pixels[line_size*2] + pixels[line_size*5];
01894         tmp5 = pixels[line_size*2] - pixels[line_size*5];
01895         tmp3 = pixels[line_size*3] + pixels[line_size*4];
01896         tmp4 = pixels[line_size*3] - pixels[line_size*4];
01897
01898         // Even part
01899
01900         tmp10 = tmp0 + tmp3;
01901         tmp13 = tmp0 - tmp3;
01902         tmp11 = tmp1 + tmp2;
01903         tmp12 = tmp1 - tmp2;
01904         //Even columns are written first, this leads to a different order of columns
01905         //in column_fidct(), but they are processed independently, so all ok.
01906         //Later in row_idct() the columns are read in the same order.
01907         dataptr[2] = tmp10 + tmp11;
01908         dataptr[3] = tmp10 - tmp11;
01909
01910         z1 = MULTIPLY16H((tmp12 + tmp13) * 4, FIX_0_707106781);
01911         dataptr[0] = tmp13 + z1;
01912         dataptr[1] = tmp13 - z1;
01913
01914         // Odd part
01915
01916         tmp10 = (tmp4 + tmp5) * 4;
01917         tmp11 = (tmp5 + tmp6) * 4;
01918         tmp12 = (tmp6 + tmp7) * 4;
01919
01920         z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
01921         z2 = MULTIPLY16H(tmp10, FIX_0_541196100) + z5;
01922         z4 = MULTIPLY16H(tmp12, FIX_1_306562965) + z5;
01923         z3 = MULTIPLY16H(tmp11, FIX_0_707106781);
01924
01925         z11 = tmp7 + z3;
01926         z13 = tmp7 - z3;
01927
01928         dataptr[4] = z13 + z2;
01929         dataptr[5] = z13 - z2;
01930         dataptr[6] = z11 + z4;
01931         dataptr[7] = z11 - z4;
01932
01933         pixels++;               // advance pointer to next column
01934         dataptr += DCTSIZE;
01935     }
01936 }
01937 
01938 #else /* HAVE_MMX */
01939 
/*
 * MMX implementation of the row FDCT pass (row_fdct_c() is the function
 * built instead when !HAVE_MMX).
 *
 *   data      - DCTELEM destination, written through REG_D; advanced by
 *               DCTSIZE_S*2*4 bytes per loop iteration.
 *   pixels    - 8-bit source samples, read through REG_S. Each iteration
 *               loads four bytes (movd) from each of eight lines and widens
 *               them to 16 bit against a zeroed mm7, then advances REG_S by
 *               4 bytes.
 *   line_size - distance between source lines in bytes, passed in REG_a.
 *   cnt       - loop iteration count, kept in REG_c. The counter is only
 *               tested after the body (dec/jnz), so it must be >= 1.
 *
 * temps[] is scratch memory (asm operand %3): slot 0 holds t7 and slot 1
 * holds t6 until the odd half needs them. REG_d is loaded with 3*REG_a and
 * is declared as clobbered.
 *
 * NOTE(review): line_size is an int bound to the "a" register while the
 * addressing uses REG_a; if REG_a names a wider register, confirm the value
 * is extended as intended.
 * NOTE(review): the asm loads and stores through REG_S/REG_D but declares no
 * "memory" clobber, and issues no emms - confirm both are intentional.
 */
01940 static void row_fdct_mmx(DCTELEM *data,  const uint8_t *pixels,  int line_size,  int cnt)
01941 {
01942     uint64_t __attribute__((aligned(8))) temps[4];
01943     __asm__ volatile(
01944         "lea (%%"REG_a",%%"REG_a",2), %%"REG_d"    \n\t" // REG_d = 3 * line_size
01945         "6:                     \n\t"
01946         "movd (%%"REG_S"), %%mm0           \n\t"
01947         "pxor %%mm7, %%mm7             \n\t" // mm7 = 0, used to widen bytes to words
01948
01949         "movd (%%"REG_S",%%"REG_a",), %%mm1     \n\t"
01950         "punpcklbw %%mm7, %%mm0        \n\t"
01951
01952         "movd (%%"REG_S",%%"REG_a",2), %%mm2    \n\t"
01953         "punpcklbw %%mm7, %%mm1        \n\t"
01954
01955         "punpcklbw %%mm7, %%mm2        \n\t"
01956         "add %%"REG_d", %%"REG_S"             \n\t" // REG_S now points at source line 3
01957
01958         "movq %%mm0, %%mm5             \n\t"
01959         //
01960
01961         "movd (%%"REG_S",%%"REG_a",4), %%mm3    \n\t" //7  ;prefetch!
01962         "movq %%mm1, %%mm6             \n\t"
01963
01964         "movd (%%"REG_S",%%"REG_d",), %%mm4     \n\t" //6
01965         "punpcklbw %%mm7, %%mm3        \n\t"
01966
01967         "psubw %%mm3, %%mm5            \n\t"
01968         "punpcklbw %%mm7, %%mm4        \n\t"
01969
01970         "paddw %%mm3, %%mm0            \n\t"
01971         "psubw %%mm4, %%mm6            \n\t"
01972
01973         "movd (%%"REG_S",%%"REG_a",2), %%mm3    \n\t" //5
01974         "paddw %%mm4, %%mm1            \n\t"
01975
01976         "movq %%mm5, 0*8+%3            \n\t" //t7 -> temps[0]
01977         "punpcklbw %%mm7, %%mm3        \n\t"
01978
01979         "movq %%mm6, 1*8+%3            \n\t" //t6 -> temps[1]
01980         "movq %%mm2, %%mm4             \n\t"
01981
01982         "movd (%%"REG_S"), %%mm5           \n\t" //3
01983         "paddw %%mm3, %%mm2            \n\t"
01984
01985         "movd (%%"REG_S",%%"REG_a",), %%mm6     \n\t" //4
01986         "punpcklbw %%mm7, %%mm5        \n\t"
01987
01988         "psubw %%mm3, %%mm4            \n\t"
01989         "punpcklbw %%mm7, %%mm6        \n\t"
01990
01991         "movq %%mm5, %%mm3             \n\t"
01992         "paddw %%mm6, %%mm5            \n\t" //t3
01993
01994         "psubw %%mm6, %%mm3            \n\t" //t4  ; t0 t1 t2 t4 t5 t3 - -
01995         "movq %%mm0, %%mm6             \n\t"
01996
01997         "movq %%mm1, %%mm7             \n\t"
01998         "psubw %%mm5, %%mm0            \n\t" //t13
01999
02000         "psubw %%mm2, %%mm1            \n\t"
02001         "paddw %%mm2, %%mm7            \n\t" //t11
02002
02003         "paddw %%mm0, %%mm1            \n\t"
02004         "movq %%mm7, %%mm2             \n\t"
02005
02006         "psllw $2, %%mm1              \n\t"
02007         "paddw %%mm5, %%mm6            \n\t" //t10
02008
02009         "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm1 \n\t"
02010         "paddw %%mm6, %%mm7            \n\t" //d2
02011
02012         "psubw %%mm2, %%mm6            \n\t" //d3
02013         "movq %%mm0, %%mm5             \n\t"
02014
02015         //transpose 4x4
02016         "movq %%mm7, %%mm2             \n\t"
02017         "punpcklwd %%mm6, %%mm7        \n\t"
02018
02019         "paddw %%mm1, %%mm0            \n\t" //d0
02020         "punpckhwd %%mm6, %%mm2        \n\t"
02021
02022         "psubw %%mm1, %%mm5            \n\t" //d1
02023         "movq %%mm0, %%mm6             \n\t"
02024
02025         "movq 1*8+%3, %%mm1            \n\t" // reload t6 from temps[1]
02026         "punpcklwd %%mm5, %%mm0        \n\t"
02027
02028         "punpckhwd %%mm5, %%mm6        \n\t"
02029         "movq %%mm0, %%mm5             \n\t"
02030
02031         "punpckldq %%mm7, %%mm0        \n\t" //0
02032         "paddw %%mm4, %%mm3            \n\t"
02033
02034         "punpckhdq %%mm7, %%mm5        \n\t" //1
02035         "movq %%mm6, %%mm7             \n\t"
02036
02037         "movq %%mm0, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
02038         "punpckldq %%mm2, %%mm6        \n\t" //2
02039
02040         "movq %%mm5, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
02041         "punpckhdq %%mm2, %%mm7        \n\t" //3
02042
02043         "movq %%mm6, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
02044         "paddw %%mm1, %%mm4            \n\t"
02045
02046         "movq %%mm7, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
02047         "psllw $2, %%mm3              \n\t" //t10
02048
02049         "movq 0*8+%3, %%mm2           \n\t" // reload t7 from temps[0]
02050         "psllw $2, %%mm4              \n\t" //t11
02051
02052         "pmulhw "MANGLE(MM_FIX_0_707106781)", %%mm4 \n\t" //z3
02053         "paddw %%mm2, %%mm1            \n\t"
02054
02055         "psllw $2, %%mm1              \n\t" //t12
02056         "movq %%mm3, %%mm0             \n\t"
02057
02058         "pmulhw "MANGLE(MM_FIX_0_541196100)", %%mm0 \n\t"
02059         "psubw %%mm1, %%mm3            \n\t"
02060
02061         "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" //z5
02062         "movq %%mm2, %%mm5             \n\t"
02063
02064         "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm1 \n\t"
02065         "psubw %%mm4, %%mm2            \n\t" //z13
02066
02067         "paddw %%mm4, %%mm5            \n\t" //z11
02068         "movq %%mm2, %%mm6             \n\t"
02069
02070         "paddw %%mm3, %%mm0            \n\t" //z2
02071         "movq %%mm5, %%mm7             \n\t"
02072
02073         "paddw %%mm0, %%mm2            \n\t" //d4
02074         "psubw %%mm0, %%mm6            \n\t" //d5
02075
02076         "movq %%mm2, %%mm4             \n\t"
02077         "paddw %%mm3, %%mm1            \n\t" //z4
02078
02079         //transpose 4x4
02080         "punpcklwd %%mm6, %%mm2        \n\t"
02081         "paddw %%mm1, %%mm5            \n\t" //d6
02082
02083         "punpckhwd %%mm6, %%mm4        \n\t"
02084         "psubw %%mm1, %%mm7            \n\t" //d7
02085
02086         "movq %%mm5, %%mm6             \n\t"
02087         "punpcklwd %%mm7, %%mm5        \n\t"
02088
02089         "punpckhwd %%mm7, %%mm6        \n\t"
02090         "movq %%mm2, %%mm7             \n\t"
02091
02092         "punpckldq %%mm5, %%mm2        \n\t" //4
02093         "sub %%"REG_d", %%"REG_S"             \n\t" // undo the +3*line_size applied above
02094
02095         "punpckhdq %%mm5, %%mm7        \n\t" //5
02096         "movq %%mm4, %%mm5             \n\t"
02097
02098         "movq %%mm2, "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_D") \n\t"
02099         "punpckldq %%mm6, %%mm4        \n\t" //6
02100
02101         "movq %%mm7, "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_D") \n\t"
02102         "punpckhdq %%mm6, %%mm5        \n\t" //7
02103
02104         "movq %%mm4, "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_D") \n\t"
02105         "add $4, %%"REG_S"               \n\t" // step to the next four source bytes
02106
02107         "movq %%mm5, "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_D") \n\t"
02108         "add $"DCTSIZE_S"*2*4, %%"REG_D"      \n\t" //4 rows
02109         "dec %%"REG_c"                   \n\t"
02110         "jnz 6b                  \n\t"
02111
02112         : "+S"(pixels), "+D"(data), "+c"(cnt), "=o"(temps)
02113         : "a"(line_size)
02114         : "%"REG_d);
02115 }
02116 
02117 #endif // HAVE_MMX

Generated on Fri Feb 22 2013 07:24:30 for FFmpeg by  doxygen 1.7.1