• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • Examples
  • File List
  • Globals

libavcodec/x86/fft_sse.c

Go to the documentation of this file.
00001 /*
00002  * FFT/MDCT transform with SSE optimizations
00003  * Copyright (c) 2008 Loren Merritt
00004  *
00005  * This file is part of FFmpeg.
00006  *
00007  * FFmpeg is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU Lesser General Public
00009  * License as published by the Free Software Foundation; either
00010  * version 2.1 of the License, or (at your option) any later version.
00011  *
00012  * FFmpeg is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015  * Lesser General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU Lesser General Public
00018  * License along with FFmpeg; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00020  */
00021 
00022 #include "libavutil/x86_cpu.h"
00023 #include "libavcodec/dsputil.h"
00024 #include "fft.h"
00025 #include "config.h"
00026 
/* Four 32-bit words, each with only bit 31 set. ff_imdct_calc_sse() loads this
 * constant into xmm7 and XORs it into a vector to flip bit 31 of every lane.
 * The first macro argument (16) is presumably the required alignment --
 * confirm against the DECLARE_ASM_CONST definition. */
00027 DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] =
00028     { 1U << 31, 1U << 31, 1U << 31, 1U << 31 };
00029 
/* Dispatch routines; only their prototypes appear in this file.
 * ff_fft_dispatch_sse() is declared but not called by the code shown here. */
00030 void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
00031 void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
00032 void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);
00033 
00034 #if HAVE_AVX
/**
 * AVX entry point for the FFT: forwards the buffer and the nbits value stored
 * in the context to ff_fft_dispatch_interleave_avx(). Only built when
 * HAVE_AVX is set (see the surrounding #if).
 *
 * @param s context; only s->nbits is read here
 * @param z buffer handed unchanged to the dispatch routine
 */
00035 void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
00036 {
00037     ff_fft_dispatch_interleave_avx(z, s->nbits);
00038 }
00039 #endif
00040 
/**
 * SSE entry point for the FFT.
 *
 * Runs ff_fft_dispatch_interleave_sse() on z and, when the transform has at
 * most 16 points, makes one extra pass over the buffer that interleaves each
 * pair of adjacent 16-byte vectors.
 *
 * @param s context; only s->nbits is read (point count n = 1 << nbits)
 * @param z buffer; the extra pass rewrites it in place using movaps, which
 *          requires 16-byte-aligned addresses
 */
00041 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
00042 {
00043     int n = 1 << s->nbits;
00044 
00045     ff_fft_dispatch_interleave_sse(z, s->nbits);
00046 
          /* Presumably the dispatch routine leaves small transforms
           * un-interleaved and this pass finishes the job -- TODO confirm
           * against its implementation, which is not in this file. */
00047     if(n <= 16) {
              /* i is a negative byte offset relative to z+n that counts up
               * towards zero. -8*n reaches back exactly to z if
               * sizeof(FFTComplex) is 8 -- assumed, TODO confirm. */
00048         x86_reg i = -8*n;
              /* Per iteration (32 bytes): with A = the 16 bytes at the current
               * address and B = the following 16 bytes, viewed as four 32-bit
               * elements each, the code stores {A0,B0,A1,B1} over A and
               * {A2,B2,A3,B3} over B. The loop body always runs at least once
               * and repeats while i is still negative after adding 32.
               *
               * NOTE(review): the asm overwrites xmm0 and xmm1, but the clobber
               * list only names "memory"; ff_imdct_calc_sse() declares its xmm
               * registers via XMM_CLOBBERS_ONLY. Consider declaring them here
               * as well. */
00049         __asm__ volatile(
00050             "1: \n"
00051             "movaps     (%0,%1), %%xmm0 \n"
00052             "movaps      %%xmm0, %%xmm1 \n"
00053             "unpcklps 16(%0,%1), %%xmm0 \n"
00054             "unpckhps 16(%0,%1), %%xmm1 \n"
00055             "movaps      %%xmm0,   (%0,%1) \n"
00056             "movaps      %%xmm1, 16(%0,%1) \n"
00057             "add $32, %0 \n"
00058             "jl 1b \n"
00059             :"+r"(i)
00060             :"r"(z+n)
00061             :"memory"
00062         );
00063     }
00064 }
00065 
/**
 * Reorder the elements of z according to s->revtab.
 *
 * Elements are handled two at a time: one 16-byte load fetches z[i] and the
 * 8 bytes after it, the low half is stored to s->tmp_buf[s->revtab[i]] and
 * the high half to s->tmp_buf[s->revtab[i+1]]. The permuted data is then
 * copied back from s->tmp_buf into z.
 *
 * @param s context; reads s->nbits and s->revtab, writes s->tmp_buf
 * @param z buffer of n = 1 << s->nbits elements, permuted in place; read
 *          with movaps, which requires 16-byte-aligned addresses
 */
00066 void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
00067 {
00068     int n = 1 << s->nbits;
00069     int i;
00070     for(i=0; i<n; i+=2) {
              /* NOTE(review): the input operand names only z[i], while movaps
               * reads 16 bytes starting there; this presumably also covers
               * z[i+1] (sizeof(FFTComplex) == 8 assumed -- confirm). */
00071         __asm__ volatile(
00072             "movaps %2, %%xmm0 \n"
00073             "movlps %%xmm0, %0 \n"
00074             "movhps %%xmm0, %1 \n"
00075             :"=m"(s->tmp_buf[s->revtab[i]]),
00076              "=m"(s->tmp_buf[s->revtab[i+1]])
00077             :"m"(z[i])
                  /* xmm0 is overwritten above; tell the compiler so it does
                   * not keep a live value in that register across the asm. */
                  XMM_CLOBBERS_ONLY("%xmm0")
00078         );
00079     }
00080     memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
00081 }
00082 
/**
 * Full inverse MDCT built on top of the context's imdct_half callback.
 *
 * s->imdct_half() is invoked with output + n4 as its destination (n4 =
 * s->mdct_size / 4); the SSE loop afterwards derives the remaining parts of
 * output from the data located at output + n4 onwards.
 *
 * @param s      context; reads s->mdct_size and calls s->imdct_half
 * @param output destination buffer; accessed with movaps, which requires
 *               16-byte-aligned addresses
 * @param input  passed through unchanged to s->imdct_half
 */
00083 void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
00084 {
00085     x86_reg j, k;
00086     long n = s->mdct_size;
00087     long n4 = n >> 2;
00088 
00089     s->imdct_half(s, output + n4, input);
00090 
          /* j and k are byte offsets: j climbs from -n to 0, k descends from
           * n-16, both in 16-byte steps. */
00091     j = -n;
00092     k = n-16;
          /* Per iteration:
           *   xmm0 = 16 bytes at (output+n4) + k, lane order reversed
           *          (shufps 0x1b), then bit 31 of every lane flipped by XOR
           *          with ff_m1m1m1m1; stored to (output+n4) + j.
           *   xmm1 = 16 bytes at (output+n4*3) + j, lane order reversed;
           *          stored to (output+n4*3) + k.
           * The loop body always runs at least once and repeats while j is
           * still negative after adding 16.
           *
           * Assuming FFTSample is a 4-byte IEEE float (TODO confirm), this
           * writes the first quarter of output as the negated mirror image of
           * the second quarter, and the last quarter as the mirror image of
           * the third quarter.
           *
           * NOTE(review): the asm stores to memory through register operands
           * but declares neither a "memory" clobber nor memory outputs --
           * confirm this is safe with the compilers in use. */
00093     __asm__ volatile(
00094         "movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n"
00095         "1: \n"
00096         "movaps       (%2,%1), %%xmm0 \n"
00097         "movaps       (%3,%0), %%xmm1 \n"
00098         "shufps $0x1b, %%xmm0, %%xmm0 \n"
00099         "shufps $0x1b, %%xmm1, %%xmm1 \n"
00100         "xorps         %%xmm7, %%xmm0 \n"
00101         "movaps        %%xmm1, (%3,%1) \n"
00102         "movaps        %%xmm0, (%2,%0) \n"
00103         "sub $16, %1 \n"
00104         "add $16, %0 \n"
00105         "jl 1b \n"
00106         :"+r"(j), "+r"(k)
00107         :"r"(output+n4), "r"(output+n4*3)
00108         XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm7")
00109     );
00110 }
00111 

Generated on Fri Feb 22 2013 07:24:30 for FFmpeg by  doxygen 1.7.1