/*
 *  downmix_3dnow.S
 *
 *  Replacement of downmix_kni.S with AMD's 3DNow! SIMD operations support
 *
 *  Modified by Nick Kurshev <nickols_k@mail.ru>
 *
 *  Copyright (C) Yuqing Deng <Yuqing_Deng@brown.edu> - April 2000
 *
 *  downmix_3dnow.S is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *
 *  downmix_3dnow.S is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with GNU Make; see the file COPYING.  If not, write to
 *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */

/*
 * Read-only constant loaded by stream_sample_1ch_to_s16 as its scale factor.
 * NOTE: despite the label name, the stored value is 1/sqrt(2) (~0.7071068),
 * not sqrt(2).
 */
.section .rodata
	.align 4
sqrt2:	.float 0f0.7071068
	/* Pad with zero bytes up to the next 32-byte (2^5) boundary. */
	.p2align 5,0,
	
	.text

/*
 * downmix_3f_2r_to_2ch
 *
 * Mixes left, center, right, left-surround and right-surround sample
 * planes down to two planes, in place.
 *
 * Stack arguments (addressed through %ebp after the prologue):
 *    8(%ebp)  samples[]  base address of the sample buffer
 *   12(%ebp)  &dm_par    three consecutive 32-bit float coefficients:
 *                        unit at +0, clev at +4, slev at +8
 *
 * Buffer layout as addressed here (each plane is 1024 bytes, i.e. the
 * loop runs 128 times and handles 8 bytes = two packed floats per pass):
 *      +0  left          +1024  center        +2048  right
 *   +3072  left surround +4096  right surround
 *
 * Result:
 *   plane +0    = unit*left  + clev*center + slev*left surround
 *   plane +1024 = unit*right + clev*center + slev*right surround
 *
 * %eax, %ebx and %ecx are saved and restored.  %mm0-%mm7 and EFLAGS are
 * modified.  FEMMS is executed on entry and before returning.
 */
	.p2align 2			/* 4-byte boundary */
	.globl	downmix_3f_2r_to_2ch
	.type	downmix_3f_2r_to_2ch, @function
downmix_3f_2r_to_2ch:
	pushl	%ebp
	movl	%esp, %ebp

	/* Preserve the general-purpose registers used as pointers/counter. */
	pushl	%eax
	pushl	%ebx
	pushl	%ecx

	femms

	movl	8(%ebp), %eax		/* eax = samples[] */
	movl	12(%ebp), %ebx		/* ebx = &dm_par */
	movl	$128, %ecx		/* ecx = passes left, 2 floats per pass */

	/* Replicate every coefficient into both halves of its register. */
	movd	(%ebx), %mm5
	punpckldq %mm5, %mm5		/* mm5 = unit | unit */

	movd	4(%ebx), %mm6
	punpckldq %mm6, %mm6		/* mm6 = clev | clev */

	movd	8(%ebx), %mm7
	punpckldq %mm7, %mm7		/* mm7 = slev | slev */

	.p2align 4			/* loop head on a 16-byte boundary */
.Ldownmix_3f_2r_loop:
	movq	(%eax), %mm0		/* mm0 = left */
	movq	2048(%eax), %mm1	/* mm1 = right */
	movq	1024(%eax), %mm2	/* mm2 = center */
	pfmul	%mm5, %mm0		/* left   *= unit */
	pfmul	%mm5, %mm1		/* right  *= unit */

	pfmul	%mm6, %mm2		/* center *= clev */
	movq	3072(%eax), %mm3	/* mm3 = left surround */
	movq	4096(%eax), %mm4	/* mm4 = right surround */
	pfadd	%mm2, %mm0		/* left  += center */
	pfadd	%mm2, %mm1		/* right += center */

	pfmul	%mm7, %mm3		/* left surround  *= slev */
	pfmul	%mm7, %mm4		/* right surround *= slev */
	pfadd	%mm3, %mm0		/* left  += left surround */
	pfadd	%mm4, %mm1		/* right += right surround */

	/* The center slot was already consumed above, so it can be reused. */
	movq	%mm0, (%eax)		/* output plane 0 */
	movq	%mm1, 1024(%eax)	/* output plane 1 */

	addl	$8, %eax		/* advance by two floats */
	decl	%ecx
	jnz	.Ldownmix_3f_2r_loop

	/* Restore in reverse push order. */
	popl	%ecx
	popl	%ebx
	popl	%eax

	femms

	leave
	ret
	.p2align 4,,7

/*
 * downmix_2f_2r_to_2ch
 *
 * Mixes left, right, left-surround and right-surround sample planes down
 * to two planes, in place.
 *
 * Stack arguments (addressed through %ebp after the prologue):
 *    8(%ebp)  samples[]  base address of the sample buffer
 *   12(%ebp)  &dm_par    32-bit float coefficients; unit is read from +0
 *                        and slev from +8 (offset +4 is not read here)
 *
 * Buffer layout as addressed here (each plane is 1024 bytes, i.e. the
 * loop runs 128 times and handles 8 bytes = two packed floats per pass):
 *      +0  left   +1024  right   +2048  left surround
 *   +3072  right surround
 *
 * Result:
 *   plane +0    = unit*left  + slev*left surround
 *   plane +1024 = unit*right + slev*right surround
 *
 * %eax, %ebx and %ecx are saved and restored.  %mm0, %mm1, %mm3, %mm4,
 * %mm5, %mm7 and EFLAGS are modified.  FEMMS is executed on entry and
 * before returning.
 */
	.globl	downmix_2f_2r_to_2ch
	.type	downmix_2f_2r_to_2ch, @function
downmix_2f_2r_to_2ch:
	pushl	%ebp
	movl	%esp, %ebp

	/* Preserve the general-purpose registers used as pointers/counter. */
	pushl	%eax
	pushl	%ebx
	pushl	%ecx

	femms

	movl	8(%ebp), %eax		/* eax = samples[] */
	movl	12(%ebp), %ebx		/* ebx = &dm_par */
	movl	$128, %ecx		/* ecx = passes left, 2 floats per pass */

	/* Replicate every coefficient into both halves of its register. */
	movd	(%ebx), %mm5
	punpckldq %mm5, %mm5		/* mm5 = unit | unit */

	movd	8(%ebx), %mm7
	punpckldq %mm7, %mm7		/* mm7 = slev | slev */

	.p2align 4			/* loop head on a 16-byte boundary */
.Ldownmix_2f_2r_loop:
	movq	(%eax), %mm0		/* mm0 = left */
	movq	1024(%eax), %mm1	/* mm1 = right */
	movq	2048(%eax), %mm3	/* mm3 = left surround */
	pfmul	%mm5, %mm0		/* left  *= unit */
	pfmul	%mm5, %mm1		/* right *= unit */

	movq	3072(%eax), %mm4	/* mm4 = right surround */

	pfmul	%mm7, %mm3		/* left surround  *= slev */
	pfmul	%mm7, %mm4		/* right surround *= slev */
	pfadd	%mm3, %mm0		/* left  += left surround */
	pfadd	%mm4, %mm1		/* right += right surround */

	movq	%mm0, (%eax)		/* output plane 0 */
	movq	%mm1, 1024(%eax)	/* output plane 1 */

	addl	$8, %eax		/* advance by two floats */
	decl	%ecx
	jnz	.Ldownmix_2f_2r_loop

	/* Restore in reverse push order. */
	popl	%ecx
	popl	%ebx
	popl	%eax

	femms

	leave
	ret
	.p2align 4,,7

/*
 * downmix_3f_1r_to_2ch
 *
 * Mixes left, center, right and a single surround sample plane down to
 * two planes, in place.  The surround term is subtracted from the first
 * output plane and added to the second.
 *
 * Stack arguments (addressed through %ebp after the prologue):
 *    8(%ebp)  samples[]  base address of the sample buffer
 *   12(%ebp)  &dm_par    three consecutive 32-bit float coefficients:
 *                        unit at +0, clev at +4, slev at +8
 *
 * Buffer layout as addressed here (each plane is 1024 bytes, i.e. the
 * loop runs 128 times and handles 8 bytes = two packed floats per pass):
 *   +0 left   +1024 center   +2048 right   +3072 surround
 *
 * Result:
 *   plane +0    = unit*left  + clev*center - slev*surround
 *   plane +1024 = unit*right + clev*center + slev*surround
 *
 * %eax, %ebx and %ecx are saved and restored.  %mm0-%mm3, %mm5-%mm7 and
 * EFLAGS are modified.  FEMMS is executed on entry and before returning.
 */
	.globl	downmix_3f_1r_to_2ch
	.type	downmix_3f_1r_to_2ch, @function
downmix_3f_1r_to_2ch:
	pushl	%ebp
	movl	%esp, %ebp

	/* Preserve the general-purpose registers used as pointers/counter. */
	pushl	%eax
	pushl	%ebx
	pushl	%ecx

	femms

	movl	8(%ebp), %eax		/* eax = samples[] */
	movl	12(%ebp), %ebx		/* ebx = &dm_par */
	movl	$128, %ecx		/* ecx = passes left, 2 floats per pass */

	/* Replicate every coefficient into both halves of its register. */
	movd	(%ebx), %mm5
	punpckldq %mm5, %mm5		/* mm5 = unit | unit */

	movd	4(%ebx), %mm6
	punpckldq %mm6, %mm6		/* mm6 = clev | clev */

	movd	8(%ebx), %mm7
	punpckldq %mm7, %mm7		/* mm7 = slev | slev */

	.p2align 4			/* loop head on a 16-byte boundary */
.Ldownmix_3f_1r_loop:
	movq	(%eax), %mm0		/* mm0 = left */
	movq	2048(%eax), %mm1	/* mm1 = right */
	movq	1024(%eax), %mm2	/* mm2 = center */
	pfmul	%mm5, %mm0		/* left   *= unit */
	pfmul	%mm5, %mm1		/* right  *= unit */

	pfmul	%mm6, %mm2		/* center *= clev */
	movq	3072(%eax), %mm3	/* mm3 = surround */

	pfadd	%mm2, %mm0		/* left  += center */
	pfmul	%mm7, %mm3		/* surround *= slev */

	pfadd	%mm2, %mm1		/* right += center */

	pfsub	%mm3, %mm0		/* left  -= surround */
	pfadd	%mm3, %mm1		/* right += surround */

	/* The center slot was already consumed above, so it can be reused. */
	movq	%mm0, (%eax)		/* output plane 0 */
	movq	%mm1, 1024(%eax)	/* output plane 1 */

	addl	$8, %eax		/* advance by two floats */
	decl	%ecx
	jnz	.Ldownmix_3f_1r_loop

	/* Restore in reverse push order. */
	popl	%ecx
	popl	%ebx
	popl	%eax

	femms

	leave
	ret
	.p2align 4,,7

/*
 * downmix_2f_1r_to_2ch
 *
 * Mixes left, right and a single surround sample plane down to two
 * planes, in place.  The surround term is subtracted from the first
 * output plane and added to the second.
 *
 * Stack arguments (addressed through %ebp after the prologue):
 *    8(%ebp)  samples[]  base address of the sample buffer
 *   12(%ebp)  &dm_par    32-bit float coefficients; unit is read from +0
 *                        and slev from +8 (offset +4 is not read here)
 *
 * Buffer layout as addressed here (each plane is 1024 bytes, i.e. the
 * loop runs 128 times and handles 8 bytes = two packed floats per pass):
 *   +0 left   +1024 right   +2048 surround
 *
 * Result:
 *   plane +0    = unit*left  - slev*surround
 *   plane +1024 = unit*right + slev*surround
 *
 * %eax, %ebx and %ecx are saved and restored.  %mm0, %mm1, %mm3, %mm5,
 * %mm7 and EFLAGS are modified.  FEMMS is executed on entry and before
 * returning.
 */
	.globl	downmix_2f_1r_to_2ch
	.type	downmix_2f_1r_to_2ch, @function
downmix_2f_1r_to_2ch:
	pushl	%ebp
	movl	%esp, %ebp

	/* Preserve the general-purpose registers used as pointers/counter. */
	pushl	%eax
	pushl	%ebx
	pushl	%ecx

	femms

	movl	8(%ebp), %eax		/* eax = samples[] */
	movl	12(%ebp), %ebx		/* ebx = &dm_par */
	movl	$128, %ecx		/* ecx = passes left, 2 floats per pass */

	/* Replicate every coefficient into both halves of its register. */
	movd	(%ebx), %mm5
	punpckldq %mm5, %mm5		/* mm5 = unit | unit */

	movd	8(%ebx), %mm7
	punpckldq %mm7, %mm7		/* mm7 = slev | slev */

	.p2align 4			/* loop head on a 16-byte boundary */
.Ldownmix_2f_1r_loop:
	movq	(%eax), %mm0		/* mm0 = left */
	movq	1024(%eax), %mm1	/* mm1 = right */

	pfmul	%mm5, %mm0		/* left  *= unit */
	pfmul	%mm5, %mm1		/* right *= unit */

	movq	2048(%eax), %mm3	/* mm3 = surround */

	pfmul	%mm7, %mm3		/* surround *= slev */

	pfsub	%mm3, %mm0		/* left  -= surround */
	pfadd	%mm3, %mm1		/* right += surround */

	movq	%mm0, (%eax)		/* output plane 0 */
	movq	%mm1, 1024(%eax)	/* output plane 1 */

	addl	$8, %eax		/* advance by two floats */
	decl	%ecx
	jnz	.Ldownmix_2f_1r_loop

	/* Restore in reverse push order. */
	popl	%ecx
	popl	%ebx
	popl	%eax

	femms

	leave
	ret
	.p2align 4,,7

/*
 * downmix_3f_0r_to_2ch
 *
 * Mixes left, center and right sample planes down to two planes, in
 * place.
 *
 * Stack arguments (addressed through %ebp after the prologue):
 *    8(%ebp)  samples[]  base address of the sample buffer
 *   12(%ebp)  &dm_par    32-bit float coefficients; unit is read from +0
 *                        and clev from +4
 *
 * Buffer layout as addressed here (each plane is 1024 bytes, i.e. the
 * loop runs 128 times and handles 8 bytes = two packed floats per pass):
 *   +0 left   +1024 center   +2048 right
 *
 * Result:
 *   plane +0    = unit*left  + clev*center
 *   plane +1024 = unit*right + clev*center
 *
 * %eax, %ebx and %ecx are saved and restored.  %mm0-%mm2, %mm5, %mm6 and
 * EFLAGS are modified.  FEMMS is executed on entry and before returning.
 */
	.globl	downmix_3f_0r_to_2ch
	.type	downmix_3f_0r_to_2ch, @function
downmix_3f_0r_to_2ch:
	pushl	%ebp
	movl	%esp, %ebp

	/* Preserve the general-purpose registers used as pointers/counter. */
	pushl	%eax
	pushl	%ebx
	pushl	%ecx

	femms

	movl	8(%ebp), %eax		/* eax = samples[] */
	movl	12(%ebp), %ebx		/* ebx = &dm_par */
	movl	$128, %ecx		/* ecx = passes left, 2 floats per pass */

	/* Replicate every coefficient into both halves of its register. */
	movd	(%ebx), %mm5
	punpckldq %mm5, %mm5		/* mm5 = unit | unit */

	movd	4(%ebx), %mm6
	punpckldq %mm6, %mm6		/* mm6 = clev | clev */

	.p2align 4			/* loop head on a 16-byte boundary */
.Ldownmix_3f_0r_loop:
	movq	(%eax), %mm0		/* mm0 = left */
	movq	2048(%eax), %mm1	/* mm1 = right */
	movq	1024(%eax), %mm2	/* mm2 = center */
	pfmul	%mm5, %mm0		/* left   *= unit */
	pfmul	%mm5, %mm1		/* right  *= unit */

	pfmul	%mm6, %mm2		/* center *= clev */

	pfadd	%mm2, %mm0		/* left  += center */

	pfadd	%mm2, %mm1		/* right += center */

	/* The center slot was already consumed above, so it can be reused. */
	movq	%mm0, (%eax)		/* output plane 0 */
	movq	%mm1, 1024(%eax)	/* output plane 1 */

	addl	$8, %eax		/* advance by two floats */
	decl	%ecx
	jnz	.Ldownmix_3f_0r_loop

	/* Restore in reverse push order. */
	popl	%ecx
	popl	%ebx
	popl	%eax

	femms

	leave
	ret
	.p2align 4,,7

/*
 * stream_sample_2ch_to_s16
 *
 * Converts two planes of packed floats into one interleaved stream of
 * signed 16-bit integers (left, right, left, right, ...).
 *
 * Stack arguments (addressed through %ebp after the prologue):
 *    8(%ebp)  s16_samples  output buffer
 *   12(%ebp)  left         first input plane
 *   16(%ebp)  right        second input plane
 *
 * The loop runs 64 times.  Each pass reads 16 bytes (four floats) from
 * each input plane and writes 16 bytes (eight 16-bit values) of output.
 * PF2ID performs the float -> 32-bit integer conversion and PACKSSDW
 * narrows to 16 bits with signed saturation.
 *
 * %eax, %ebx, %edx and %ecx are saved and restored.  %mm0-%mm5 and
 * EFLAGS are modified.  FEMMS is executed on entry and before returning.
 */
	.globl	stream_sample_2ch_to_s16
	.type	stream_sample_2ch_to_s16, @function
stream_sample_2ch_to_s16:
	pushl	%ebp
	movl	%esp, %ebp

	/* Preserve the general-purpose registers used as pointers/counter. */
	pushl	%eax
	pushl	%ebx
	pushl	%edx
	pushl	%ecx

	femms

	movl	8(%ebp), %eax		/* eax = s16_samples (output) */
	movl	12(%ebp), %ebx		/* ebx = left plane */
	movl	16(%ebp), %edx		/* edx = right plane */
	movl	$64, %ecx		/* ecx = passes left */

	.p2align 4			/* loop head on a 16-byte boundary */
.Lstream_2ch_loop:
	movq	(%ebx), %mm0		/* mm0 = l1 | l0 */
	movq	(%edx), %mm1		/* mm1 = r1 | r0 */
	movq	8(%ebx), %mm3		/* mm3 = l3 | l2 */
	movq	8(%edx), %mm4		/* mm4 = r3 | r2 */
	movq	%mm0, %mm2		/* mm2 = l1 | l0 (copy) */
	movq	%mm3, %mm5		/* mm5 = l3 | l2 (copy) */
	punpckldq %mm1, %mm0		/* mm0 = r0 | l0 */
	punpckldq %mm4, %mm3		/* mm3 = r2 | l2 */
	punpckhdq %mm1, %mm2		/* mm2 = r1 | l1 */
	punpckhdq %mm4, %mm5		/* mm5 = r3 | l3 */

	/* Float -> 32-bit integer, one left/right pair per register. */
	pf2id	%mm0, %mm0
	pf2id	%mm3, %mm3
	pf2id	%mm2, %mm2
	pf2id	%mm5, %mm5

	/* Narrow two pairs at a time to 16 bits and store them. */
	packssdw %mm2, %mm0		/* mm0 = r1 l1 r0 l0 */
	movq	%mm0, (%eax)
	packssdw %mm5, %mm3		/* mm3 = r3 l3 r2 l2 */
	movq	%mm3, 8(%eax)

	addl	$16, %eax		/* output: eight 16-bit values */
	addl	$16, %ebx		/* left:   four floats */
	addl	$16, %edx		/* right:  four floats */

	decl	%ecx
	jnz	.Lstream_2ch_loop

	/* Restore in reverse push order. */
	popl	%ecx
	popl	%edx
	popl	%ebx
	popl	%eax

	femms

	leave
	ret
	.p2align 4,,7

/*
 * stream_sample_1ch_to_s16
 *
 * Scales one plane of packed floats by the constant stored at `sqrt2`
 * and writes the result as signed 16-bit integers, storing every sample
 * twice in a row (c0 c0 c1 c1 ...), so that both slots of each output
 * pair carry the same value.
 *
 * Stack arguments (addressed through %ebp after the prologue):
 *    8(%ebp)  s16_samples  output buffer
 *   12(%ebp)  input plane of floats
 *
 * The loop runs 64 times.  Each pass reads 16 bytes (four floats) and
 * writes 16 bytes (eight 16-bit values).  PF2ID performs the float ->
 * 32-bit integer conversion and PACKSSDW narrows to 16 bits with signed
 * saturation.
 *
 * %eax, %ebx and %ecx are saved and restored.  %mm0, %mm1, %mm7 and
 * EFLAGS are modified.  FEMMS is executed on entry and before returning.
 */
	.globl	stream_sample_1ch_to_s16
	.type	stream_sample_1ch_to_s16, @function
stream_sample_1ch_to_s16:
	pushl	%ebp
	movl	%esp, %ebp

	/* Preserve the general-purpose registers used as pointers/counter. */
	pushl	%eax
	pushl	%ebx
	pushl	%ecx

	femms

	movd	sqrt2, %mm7		/* load the scale factor */
	movl	8(%ebp), %eax		/* eax = s16_samples (output) */
	movl	12(%ebp), %ebx		/* ebx = input plane */
	punpckldq %mm7, %mm7		/* mm7 = scale | scale */
	movl	$64, %ecx		/* ecx = passes left */

	.p2align 4			/* loop head on a 16-byte boundary */
.Lstream_1ch_loop:
	movq	(%ebx), %mm0		/* mm0 = c1 | c0 */
	movq	8(%ebx), %mm1		/* mm1 = c3 | c2 */
	pfmul	%mm7, %mm0
	pfmul	%mm7, %mm1

	pf2id	%mm0, %mm0		/* c1 | c0 --> int_32 */
	pf2id	%mm1, %mm1		/* c3 | c2 --> int_32 */

	/*
	 * PACKSSDW with identical operands places the two narrowed values
	 * in the low half and repeats them in the high half, which gives
	 * c1 c0 c1 c0.  PUNPCKLWD with identical operands then interleaves
	 * the low two words with themselves, giving c1 c1 c0 c0, so each
	 * sample is duplicated into adjacent 16-bit slots.
	 */
	packssdw %mm0, %mm0		/* mm0 = c1 c0 c1 c0 (int_16) */
	packssdw %mm1, %mm1		/* mm1 = c3 c2 c3 c2 (int_16) */
	punpcklwd %mm0, %mm0		/* mm0 = c1 c1 c0 c0 */
	punpcklwd %mm1, %mm1		/* mm1 = c3 c3 c2 c2 */

	movq	%mm0, (%eax)
	movq	%mm1, 8(%eax)
	addl	$16, %eax		/* output: eight 16-bit values */
	addl	$16, %ebx		/* input:  four floats */

	decl	%ecx
	jnz	.Lstream_1ch_loop

	/* Restore in reverse push order. */
	popl	%ecx
	popl	%ebx
	popl	%eax

	femms
	leave
	ret
