diff --git a/liba52/srfftp_3dnow.h b/liba52/srfftp_3dnow.h index d62d3684a7..c5e6dd1c24 100644 --- a/liba52/srfftp_3dnow.h +++ b/liba52/srfftp_3dnow.h @@ -33,32 +33,18 @@ #ifndef SRFFTP_3DNOW_H__ #define SRFFTP_3DNOW_H__ -static complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 }; +typedef struct +{ + unsigned long val[2]; +}i_cmplx_t; -#ifdef HAVE_3DNOWEX #define TRANS_FILL_MM6_MM7_3DNOW()\ __asm__ __volatile__(\ - "movl $-1, %%eax\n\t"\ - "movd %%eax, %%mm7\n\t"\ - "negl %%eax\n\t"\ - "movd %%eax, %%mm6\n\t"\ - "punpckldq %%mm6, %%mm7\n\t" /* -1.0 | 1.0 */\ - "pi2fd %%mm7, %%mm7\n\t"\ - "pswapd %%mm7, %%mm6\n\t"/* 1.0 | -1.0 */\ - :::"eax","memory"); -#else -#define TRANS_FILL_MM6_MM7_3DNOW()\ - __asm__ __volatile__(\ - "movl $-1, %%eax\n\t"\ - "movd %%eax, %%mm7\n\t"\ - "negl %%eax\n\t"\ - "movd %%eax, %%mm6\n\t"\ - "punpckldq %%mm6, %%mm7\n\t" /* -1.0 | 1.0 */\ - "punpckldq %%mm7, %%mm6\n\t" /* 1.0 | -1.0 */\ - "pi2fd %%mm7, %%mm7\n\t"\ - "pi2fd %%mm6, %%mm6\n\t"\ - :::"eax","memory"); -#endif + "movq %1, %%mm7\n\t"\ + "movq %0, %%mm6\n\t"\ + ::"m"(x_plus_minus_3dnow),\ + "m"(x_minus_plus_3dnow)\ + :"memory"); #ifdef HAVE_3DNOWEX #define PSWAP_MM(mm_base,mm_hlp) "pswapd "mm_base","mm_base"\n\t" @@ -85,8 +71,8 @@ static complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, "movq %5, %%mm1\n\t" /* mm1 = wTB[k*2]*/ \ "movq %%mm0, %%mm5\n\t"/*u.re = wTB[0].re + wTB[k*2].re;*/\ "pfadd %%mm1, %%mm5\n\t"/*u.im = wTB[0].im + wTB[k*2].im; mm5 = u*/\ - "pfmul %%mm6, %%mm0\n\t"/*mm0 = wTB[0].re | -wTB[0].im */\ - "pfmul %%mm7, %%mm1\n\t"/*mm1 = -wTB[k*2].re | wTB[k*2].im */\ + "pxor %%mm6, %%mm0\n\t"/*mm0 = wTB[0].re | -wTB[0].im */\ + "pxor %%mm7, %%mm1\n\t"/*mm1 = -wTB[k*2].re | wTB[k*2].im */\ "pfadd %%mm1, %%mm0\n\t"/*v.im = wTB[0].re - wTB[k*2].re;*/\ "movq %%mm0, %%mm4\n\t"/*v.re =-wTB[0].im + wTB[k*2].im;*/\ PSWAP_MM("%%mm4","%%mm2")/* mm4 = v*/\ @@ -112,18 +98,18 @@ static complex_t HSQRT2_3DNOW 
__attribute__ ((aligned (8))) = { 0.707106781188, __asm__ __volatile__(\ "movq %4, %%mm0\n\t"/*u.re = wTB[2].im + wTB[2].re;*/\ "movq %%mm0, %%mm1\n\t"\ - "pfmul %%mm7, %%mm1\n\t"\ + "pxor %%mm7, %%mm1\n\t"\ "pfacc %%mm1, %%mm0\n\t"/*u.im = wTB[2].im - wTB[2].re; mm0 = u*/\ "movq %5, %%mm1\n\t" /*a.re = wTB[6].im - wTB[6].re; */\ "movq %%mm1, %%mm2\n\t"\ - "pfmul %%mm7, %%mm1\n\t"\ + "pxor %%mm7, %%mm1\n\t"\ "pfacc %%mm2, %%mm1\n\t"/*a.im = wTB[6].im + wTB[6].re; mm1 = a*/\ "movq %%mm1, %%mm2\n\t"\ - "pfmul %%mm7, %%mm2\n\t"/*v.im = u.re - a.re;*/\ + "pxor %%mm7, %%mm2\n\t"/*v.im = u.re - a.re;*/\ "movq %%mm0, %%mm3\n\t"/*v.re = u.im + a.im;*/\ "pfadd %%mm2, %%mm3\n\t"\ PSWAP_MM("%%mm3","%%mm2")/*mm3 = v*/\ - "pfmul %%mm6, %%mm1\n\t"/*u.re = u.re + a.re;*/\ + "pxor %%mm6, %%mm1\n\t"/*u.re = u.re + a.re;*/\ "pfadd %%mm1, %%mm0\n\t"/*u.im = u.im - a.im; mm0 = u*/\ "movq %8, %%mm2\n\t"\ "pfmul %%mm2, %%mm3\n\t" /* v *= HSQRT2_3DNOW; */\ @@ -133,9 +119,9 @@ static complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, "movq %%mm1, %%mm2\n\t"\ "movq %%mm3, %%mm4\n\t"\ "pfadd %%mm0, %%mm1\n\t" /*A2 = a1 + u;*/\ - "pfmul %%mm6, %%mm4\n\t"/*A6.re = a1.re + v.re;*/\ + "pxor %%mm6, %%mm4\n\t"/*A6.re = a1.re + v.re;*/\ "pfsub %%mm0, %%mm2\n\t" /*A2 = a1 - u;*/\ - "pfmul %%mm7, %%mm3\n\t"/*A14.re = a1.re - v.re;*/\ + "pxor %%mm7, %%mm3\n\t"/*A14.re = a1.re - v.re;*/\ "movq %%mm1, %0\n\t"\ "movq %%mm2, %1\n\t"\ "movq %%mm5, %%mm2\n\t"\ @@ -159,7 +145,7 @@ static complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, "pfmul %%mm0, %%mm4\n\t"/* mm4 =u.re | u.im */\ "pfmul %%mm0, %%mm5\n\t"/* mm5 = a.re | a.im */\ PSWAP_MM("%%mm5","%%mm3")\ - "pfmul %%mm7, %%mm5\n\t"\ + "pxor %%mm7, %%mm5\n\t"\ "pfadd %%mm5, %%mm4\n\t"/* mm4 = u*/\ "movq %3, %%mm1\n\t"\ "movq %2, %%mm0\n\t"\ @@ -171,9 +157,9 @@ static complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, "pfacc %%mm0, %%mm0\n\t"\ "movq %%mm4, %%mm5\n\t"\ "punpckldq 
%%mm0,%%mm2\n\t"/*mm2 = v.re | a.re*/\ - "pfmul %%mm6, %%mm5\n\t"\ + "pxor %%mm6, %%mm5\n\t"\ "movq %%mm2, %%mm3\n\t"\ - "pfmul %%mm7, %%mm3\n\t"\ + "pxor %%mm7, %%mm3\n\t"\ "pfadd %%mm3, %%mm5\n\t"\ PSWAP_MM("%%mm5","%%mm3")/* mm5 = v*/\ "pfadd %%mm2, %%mm4\n\t"\ diff --git a/mp3lib/dct64_k7.s b/mp3lib/dct64_k7.s index 44e2cf74ed..6a82d618c4 100644 --- a/mp3lib/dct64_k7.s +++ b/mp3lib/dct64_k7.s @@ -9,6 +9,9 @@ /// (using memory reference as operand of instructions) /// - Phase 6 is rewritten with mixing of cpu and mmx opcodes /// - change function name for support 3DNowEx! automatic detect +/// - negation of a 3DNow! register is now done with PXOR 0x80000000, MMi instead +/// of PFMUL by -1.0, as suggested by the Athlon manual. (Two adjacent PFMULs +/// cannot be paired, but PXORs can.) /// /// note: because K7 processors are an aggresive out-of-order three-way /// superscalar ones instruction order is not significand for them. @@ -21,6 +24,11 @@ /// this program. Use it at your own risk. /// +.data + .align 8 +plus_minus_3dnow: .long 0x00000000, 0x80000000 + +.text .globl dct64_3dnowex .type dct64_3dnowex,@function @@ -412,13 +420,8 @@ dct64_3dnowex: movq %mm5, 120(%esi) // 5 - movl $-1,%eax - movd %eax,%mm1 + movq plus_minus_3dnow, %mm0 /* mm0 = 0x00000000 | 0x80000000 (sign mask: keep low, negate high) */ movl $1,%eax - movd %eax,%mm0 - / L | H - punpckldq %mm1,%mm0 - pi2fd %mm0,%mm0 /* mm0 = 1.0 | -1.0 */ movd %eax,%mm1 pi2fd %mm1,%mm1 movl pnts+16,%eax @@ -433,7 +436,7 @@ dct64_3dnowex: movq 8(%esi),%mm4 /* mm4 = tmp2[2] | tmp2[3]*/ pfpnacc %mm4, %mm4 pswapd %mm4, %mm4 /* mm4 = tmp2[2]+tmp2[3]|tmp2[2]-tmp2[3]*/ - pfmul %mm0,%mm4 /* mm4 = tmp2[2]+tmp2[3]|tmp2[3]-tmp2[2]*/ + pxor %mm0,%mm4 /* mm4 = tmp2[2]+tmp2[3]|tmp2[3]-tmp2[2]*/ pfmul %mm1,%mm4 /* mm4 = tmp2[2]+tmp2[3]|(tmp2[3]-tmp2[2])*cos0*/ movq %mm4,%mm5 psrlq $32,%mm5 /* mm5 = (tmp2[3]-tmp2[2])*cos0 */ @@ -449,7 +452,7 @@ dct64_3dnowex: pfpnacc %mm4, %mm4 pswapd %mm4, %mm4 - pfmul %mm0,%mm4 + pxor %mm0,%mm4 pfmul %mm1,%mm4 movq %mm4,%mm5 psrlq $32,%mm5 
@@ -470,7 +473,7 @@ dct64_3dnowex: movq 40(%esi),%mm4 pfpnacc %mm4, %mm4 pswapd %mm4, %mm4 - pfmul %mm0,%mm4 + pxor %mm0,%mm4 pfmul %mm1,%mm4 movq %mm4,%mm5 psrlq $32,%mm5 @@ -484,7 +487,7 @@ dct64_3dnowex: movq 56(%esi),%mm4 pfpnacc %mm4, %mm4 pswapd %mm4, %mm4 - pfmul %mm0,%mm4 + pxor %mm0,%mm4 pfmul %mm1,%mm4 movq %mm4,%mm5 psrlq $32,%mm5 @@ -504,7 +507,7 @@ dct64_3dnowex: movq 72(%esi),%mm4 pfpnacc %mm4, %mm4 pswapd %mm4, %mm4 - pfmul %mm0,%mm4 + pxor %mm0,%mm4 pfmul %mm1,%mm4 movq %mm4,%mm5 psrlq $32,%mm5 @@ -518,7 +521,7 @@ dct64_3dnowex: movq 88(%esi),%mm4 pfpnacc %mm4, %mm4 pswapd %mm4, %mm4 - pfmul %mm0,%mm4 + pxor %mm0,%mm4 pfmul %mm1,%mm4 movq %mm4,%mm5 psrlq $32,%mm5 @@ -538,7 +541,7 @@ dct64_3dnowex: movq 104(%esi),%mm4 pfpnacc %mm4, %mm4 pswapd %mm4, %mm4 - pfmul %mm0,%mm4 + pxor %mm0,%mm4 pfmul %mm1,%mm4 movq %mm4,%mm5 psrlq $32,%mm5 @@ -552,7 +555,7 @@ dct64_3dnowex: movq 120(%esi),%mm4 pfpnacc %mm4, %mm4 pswapd %mm4, %mm4 - pfmul %mm0,%mm4 + pxor %mm0,%mm4 pfmul %mm1,%mm4 movq %mm4,%mm5 psrlq $32,%mm5