;	for new GOGO-no-coda (1999/09)
;	Copyright (C) 1999 shigeo

;	99/09/15
;	Converted the original source to 3D Now! (39kclk->18kclk)
;	Optimized 15kclk->14kclk by URURI


%include "nasm.h"

	globaldef	window_filter_subband_3DN
	globaldef	window_filter_subband_E3DN
	externdef	enwindow
	externdef	idct_coefficient
%if 0
	externdef	sbd_off
%endif
	externdef	sbd_xxx


; Sizes that mirror definitions on the C side of the encoder.
HAN_SIZE	equ	512		;defined in common.h
SBLIMIT		equ	32
EXTRADELAY	equ	56		;defined in musenc.c

; Size in bytes of one sample / coefficient, and the matching NASM size keyword.
F_SIZE	equ	4
%define F_PTR	dword

	segment_data

		align	16
; 38000000h is the IEEE-754 single-precision bit pattern of 2^-15 = 1.0/32768.
; Only referenced from commented-out / %if 0 code in this file.
D_SCALER	dd	38000000h,38000000h			;1.0/32768 define in common.h
; Two-lane constant (low lane -1.0, high lane +1.0); loaded into mm2 by
; window_filter_3dn_yprime_first_type1 and multiplied into a coefficient pair.
D_PLUS_MINUS	dd	-1.0, 1.0

;		align	16
;s_ptr		dd 0
;saveOff		dd 0

	segment_code

;
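;		2000/04/02	(original Japanese note unrecoverable: encoding damage)  by kei
;					ordering inside sbd_xxx[] is the same as in sbt_shiftin_C
;		2000/04/04	changed the ordering inside sbd_xxx[] to match sbt_shiftin_SSE
;		2000/04/08	fixed a bug in the memcpy part
;		2000/04/17	groundwork for SMP optimization by K.SAKAI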
;
;void sbd_shiftin_3DN(int (*mfbuf)[1152+576+EXTRADELAY], int stereo, int mode_gr);

;-----------------------------------------------------------------------
; void sbd_shiftin_3DN(int (*mfbuf)[1152+576+EXTRADELAY], int stereo, int mode_gr);
;
; For each channel:
;   1. moves (512-32+16) dwords inside sbd_xxx towards higher addresses by
;      mode_gr*18*32 elements (copy runs from the top down);
;   2. converts mode_gr*18*32 integers from mfbuf to float with pi2fd and
;      stores them into sbd_xxx, 32 per step, walking sbd_xxx downwards and
;      permuting each group of 32 (see the lane comments below).
;
; Register roles:
;   eax = mfbuf pointer for the current channel
;   ebp = sbd_xxx + 1152*4 for the current channel
;   edx = mode_gr*18*32*F_SIZE (byte count of new samples per channel)
;   ebx/edi/ecx = source / destination / loop counter of the two loops
; ebx, edi, ebp are saved with pushd/popd; mm0-mm7 are used between the
; two femms instructions.
;
; proc/endproc/arg/sp()/pushd/popd/pmov/pmovd/fsizen come from nasm.h (not
; shown here); comments assume pmov = 64-bit move, pmovd = 32-bit move and
; fsizen(n) = n*F_SIZE -- TODO confirm against nasm.h.
; NOTE(review): mode_gr == 0 would start the second loop with ecx == 0 and
; let it wrap; callers presumably pass 1 or 2 -- confirm.
;-----------------------------------------------------------------------
proc	sbd_shiftin_3DN

%$mfbuf		arg		4
%$stereo	arg		4
%$mode_gr	arg		4
	femms

	pushd	ebx, edi, ebp

	mov		edx, [sp(%$mode_gr)]
	imul	edx, 18*32*F_SIZE		; edx = mode_gr * 576 elements, in bytes
	mov		eax, [sp(%$mfbuf)]
	mov		ebp, sbd_xxx+1152*4

.for.ch:

; ---- step 1: shift the existing history -------------------------------
; edi starts at ebp + 496 elements, ebx = edi - edx; both step down by 16
; elements per iteration, 31 iterations (496/16).
.for.memcpy.init:
	mov		ebx, ebp
	add		ebx, (512-32+16)*4
	mov		edi, ebx
	sub		ebx, edx
	mov		ecx, (512-32+16)/16
	jmp		short .for.memcpy

	align	16
.for.memcpy:
	; copy 16 elements (64 bytes) through mm0-mm7
	pmov	mm0,[ebx + 0*F_SIZE]
	pmov	mm1,[ebx + 2*F_SIZE]
	pmov	mm2,[ebx + 4*F_SIZE]
	pmov	mm3,[ebx + 6*F_SIZE]
	pmov	[edi +   0*F_SIZE],mm0
	pmov	[edi +   2*F_SIZE],mm1
	pmov	[edi +   4*F_SIZE],mm2
	pmov	[edi +   6*F_SIZE],mm3

	pmov	mm4,[ebx +   8*F_SIZE]
	pmov	mm5,[ebx +  10*F_SIZE]
	pmov	mm6,[ebx +  12*F_SIZE]
	pmov	mm7,[ebx +  14*F_SIZE]
	sub		ebx, 16*4
	pmov	[edi +   8*F_SIZE],mm4
	pmov	[edi +  10*F_SIZE],mm5
	pmov	[edi +  12*F_SIZE],mm6
	pmov	[edi +  14*F_SIZE],mm7
	sub		edi, 16*4

.for.memcpy.next:
	dec		ecx
	jnz		.for.memcpy
.for.memcpy.end:

; ---- step 2: int -> float conversion of the new samples ----------------
; ebx walks mfbuf upwards, edi walks sbd_xxx downwards from ebp, both by
; 32 elements per iteration; ecx counts the remaining bytes (starts at edx).
.for.win_buf.init:
	mov		edi, ebp
	mov		ebx, eax			;win_buf
	mov		ecx, edx
	jmp		short .for.win_buf

	align	16
.for.win_buf:

; Inputs 0..15 -> outputs 16..31. Pairs are written in input order, except
; that input 15 is rotated to the front: out[16] = in[15], out[17] = in[0],
; out[18..31] = in[1..14].
%assign i 32
%assign	j 0
	pmovd	mm0, [ebx +fsizen(j+15)] 	;= [ *:15]
	pi2fd	mm1, [ebx +fsizen(j+ 1)]	;= [ 2: 1]
	punpckldq	mm0, [ebx +fsizen(j+0)]	;= [ 0:15]
	pi2fd	mm2, [ebx +fsizen(j+ 3)]	;= [ 4: 3]
	pi2fd	mm3, [ebx +fsizen(j+ 5)]	;= [ 6: 5]
	pi2fd	mm0, mm0

	pmov	[edi+fsizen(i-14)], mm1
	pmov	[edi+fsizen(i-12)], mm2
	pmov	[edi+fsizen(i-10)], mm3
	pmov	[edi+fsizen(i-16)], mm0

	pi2fd	mm4, [ebx +fsizen(j+ 7)]	;= [ 8: 7]
	pi2fd	mm5, [ebx +fsizen(j+ 9)]	;= [10: 9]
	pi2fd	mm6, [ebx +fsizen(j+11)]	;= [12:11]
	pi2fd	mm7, [ebx +fsizen(j+13)]	;= [14:13]

	pmov	[edi+fsizen(i- 8)], mm4
	pmov	[edi+fsizen(i- 6)], mm5
	pmov	[edi+fsizen(i- 4)], mm6
	pmov	[edi+fsizen(i- 2)], mm7

; Inputs 16..31 -> outputs 15..0, i.e. fully reversed: out[15-k] = in[16+k].
; Each converted pair has its two lanes swapped with the
; punpckldq tmp,x / punpckhdq x,tmp idiom (tmp's old contents are don't-care).
%assign i 16
%assign	j 16
	pi2fd	mm0, [ebx +fsizen(j+0)]
	pi2fd	mm1, [ebx +fsizen(j+2)]
	pi2fd	mm2, [ebx +fsizen(j+4)]
	pi2fd	mm3, [ebx +fsizen(j+6)]

	punpckldq	mm4, mm0
	punpckldq	mm5, mm1
	punpckldq	mm6, mm2
	punpckldq	mm7, mm3

	punpckhdq	mm0, mm4
	punpckhdq	mm1, mm5
	punpckhdq	mm2, mm6
	punpckhdq	mm3, mm7

	pmov	[edi+fsizen(i-2)], mm0
	pmov	[edi+fsizen(i-4)], mm1
	pmov	[edi+fsizen(i-6)], mm2
	pmov	[edi+fsizen(i-8)], mm3

	pi2fd	mm4, [ebx +fsizen(j+ 8)]
	pi2fd	mm5, [ebx +fsizen(j+10)]
	pi2fd	mm6, [ebx +fsizen(j+12)]
	pi2fd	mm7, [ebx +fsizen(j+14)]

	punpckldq	mm0, mm4
	punpckldq	mm1, mm5
	punpckldq	mm2, mm6
	punpckldq	mm3, mm7

	add		ebx, fsizen(32)

	punpckhdq	mm4, mm0
	punpckhdq	mm5, mm1
	punpckhdq	mm6, mm2
	punpckhdq	mm7, mm3

	pmov	[edi+fsizen(i-10)], mm4
	pmov	[edi+fsizen(i-12)], mm5
	pmov	[edi+fsizen(i-14)], mm6
	pmov	[edi+fsizen(i-16)], mm7

	sub		edi, fsizen(32)
.for.win_buf.next:
	sub		ecx, fsizen(32)
	jnz		near .for.win_buf
.for.win_buf.end:

; ---- next channel -------------------------------------------------------
; Only the low byte of `stereo` is compared; anything other than 2 means a
; single pass. Otherwise advance both buffers by one channel and repeat
; while ebp is still below the end of the second channel of sbd_xxx.
.for.ch.next:
	cmp		byte [sp(%$stereo)], 2
	jne		.exit

	add		eax,(1152+576+EXTRADELAY)*F_SIZE
	add		ebp,(1152+HAN_SIZE)*F_SIZE
	cmp		ebp,sbd_xxx+(1152+HAN_SIZE)*F_SIZE*2
	jb		near .for.ch
.for.ch.end:

.exit:
	femms

	popd	ebx, edi, ebp
endproc
;***************************************************************************

%ifdef USE_E3DN
;
;		2000/04/03	ordering inside sbd_xxx[] is the same as in sbt_shiftin_C by kei
;		2000/04/04	changed the ordering inside sbd_xxx[] to match sbt_shiftin_SSE
;		2000/04/08	fixed a bug in the memcpy part
;
;void sbd_shiftin_E3DN(int (*mfbuf)[1152+576+EXTRADELAY], int stereo, int mode_gr);

;-----------------------------------------------------------------------
; void sbd_shiftin_E3DN(int (*mfbuf)[1152+576+EXTRADELAY], int stereo, int mode_gr);
;
; Enhanced-3DNow! variant of sbd_shiftin_3DN: same data movement, with
; prefetch/prefetchw hints added and pswapd used for the lane swaps.
;
; For each channel:
;   1. moves (512-32+16) dwords inside sbd_xxx towards higher addresses by
;      mode_gr*18*32 elements (copy runs from the top down);
;   2. converts mode_gr*18*32 integers from mfbuf to float with pi2fd and
;      stores them into sbd_xxx, 32 per step, walking sbd_xxx downwards.
;
; Register roles:
;   eax = mfbuf pointer for the current channel
;   ebp = sbd_xxx + 1152*4 for the current channel
;   edx = mode_gr*18*32*F_SIZE (byte count of new samples per channel)
;   ebx/edi/ecx = source / destination / loop counter of the two loops
; ebx, edi, ebp are saved with pushd/popd; mm0-mm7 are used between the
; two femms instructions.
;
; proc/endproc/arg/sp()/pushd/popd/pmov/pmovd/fsizen come from nasm.h (not
; shown here); comments assume pmov = 64-bit move, pmovd = 32-bit move and
; fsizen(n) = n*F_SIZE -- TODO confirm against nasm.h.
;-----------------------------------------------------------------------
proc	sbd_shiftin_E3DN

%$mfbuf		arg		4
%$stereo	arg		4
%$mode_gr	arg		4
;	extern _clkbegin
;	extern _clkend
;	call 		_clkbegin

	femms

	pushd	ebx, edi, ebp

	mov		edx, [sp(%$mode_gr)]
	imul	edx, 18*32*F_SIZE		; edx = mode_gr * 576 elements, in bytes
	mov		eax, [sp(%$mfbuf)]
	mov		ebp, sbd_xxx+1152*4

.for.ch:

; ---- step 1: shift the existing history -------------------------------
; edi starts at ebp + 496 elements, ebx = edi - edx; both step down by 16
; elements (64 bytes) per iteration, 31 iterations (496/16).
.for.memcpy.init:
	mov		ebx, ebp
	add		ebx, (512-32+16)*4
	mov		edi, ebx
	sub		ebx, edx
	mov		ecx, (512-32+16)/16
	jmp		short .for.memcpy

	align	16
.for.memcpy:
	; hint the 64 bytes the next iteration will read / write
	prefetch	[ebx-64]
	prefetchw	[edi-64]

	pmov	mm0,[ebx + 0*F_SIZE]
	pmov	mm1,[ebx + 2*F_SIZE]
	pmov	mm2,[ebx + 4*F_SIZE]
	pmov	mm3,[ebx + 6*F_SIZE]
	pmov	[edi +   0*F_SIZE],mm0
	pmov	[edi +   2*F_SIZE],mm1
	pmov	[edi +   4*F_SIZE],mm2
	pmov	[edi +   6*F_SIZE],mm3

	pmov	mm4,[ebx +   8*F_SIZE]
	pmov	mm5,[ebx +  10*F_SIZE]
	pmov	mm6,[ebx +  12*F_SIZE]
	pmov	mm7,[ebx +  14*F_SIZE]
	sub		ebx, 16*4
	pmov	[edi +   8*F_SIZE],mm4
	pmov	[edi +  10*F_SIZE],mm5
	pmov	[edi +  12*F_SIZE],mm6
	pmov	[edi +  14*F_SIZE],mm7
	sub		edi, 16*4

.for.memcpy.next:
	dec		ecx
	jnz		.for.memcpy
.for.memcpy.end:

; ---- step 2: int -> float conversion of the new samples ----------------
; ebx walks mfbuf upwards, edi walks sbd_xxx downwards from ebp, both by
; 32 elements (128 bytes) per iteration; ecx counts the remaining bytes.
.for.win_buf.init:
	mov		edi, ebp
	mov		ebx, eax			;win_buf
	mov		ecx, edx
	jmp		short .for.win_buf

	align	16
.for.win_buf:
	; Hint the next iteration's 128 input bytes (ebx+128, ebx+192) and
	; 128 output bytes (edi-64, edi-128). The second input hint was
	; [ebx+196], which is not 64 bytes past the first one and could skip
	; the line holding ebx+192 for some alignments of mfbuf.
	prefetch	[ebx+128]
	prefetch	[ebx+192]
	prefetchw	[edi-64]
	prefetchw	[edi-128]

; Inputs 0..15 -> outputs 16..31: out[16] = in[15], out[17] = in[0],
; out[18..31] = in[1..14].
%assign i 32
%assign	j 0
	pmovd	mm0, [ebx +fsizen(j+15)] 	;= [ *:15]
	pi2fd	mm1, [ebx +fsizen(j+ 1)]	;= [ 2: 1]
	punpckldq	mm0, [ebx +fsizen(j+0)]	;= [ 0:15]
	pi2fd	mm2, [ebx +fsizen(j+ 3)]	;= [ 4: 3]
	pi2fd	mm3, [ebx +fsizen(j+ 5)]	;= [ 6: 5]
	pi2fd	mm0, mm0

	pmov	[edi+fsizen(i-14)], mm1
	pmov	[edi+fsizen(i-12)], mm2
	pmov	[edi+fsizen(i-10)], mm3
	pmov	[edi+fsizen(i-16)], mm0

	pi2fd	mm4, [ebx +fsizen(j+ 7)]	;= [ 8: 7]
	pi2fd	mm5, [ebx +fsizen(j+ 9)]	;= [10: 9]
	pi2fd	mm6, [ebx +fsizen(j+11)]	;= [12:11]
	pi2fd	mm7, [ebx +fsizen(j+13)]	;= [14:13]

	pmov	[edi+fsizen(i- 8)], mm4
	pmov	[edi+fsizen(i- 6)], mm5
	pmov	[edi+fsizen(i- 4)], mm6
	pmov	[edi+fsizen(i- 2)], mm7

; Inputs 16..31 -> outputs 15..0, fully reversed: out[15-k] = in[16+k].
; pswapd exchanges the two lanes of each converted pair.
%assign i 16
%assign	j 16
	pi2fd	mm0, [ebx +fsizen(j+0)]
	pi2fd	mm1, [ebx +fsizen(j+2)]
	pi2fd	mm2, [ebx +fsizen(j+4)]
	pi2fd	mm3, [ebx +fsizen(j+6)]

	pswapd	mm0, mm0
	pswapd	mm1, mm1
	pswapd	mm2, mm2
	pswapd	mm3, mm3

	pmov	[edi+fsizen(i-2)], mm0
	pmov	[edi+fsizen(i-4)], mm1
	pmov	[edi+fsizen(i-6)], mm2
	pmov	[edi+fsizen(i-8)], mm3

	pi2fd	mm4, [ebx +fsizen(j+ 8)]
	pi2fd	mm5, [ebx +fsizen(j+10)]
	pi2fd	mm6, [ebx +fsizen(j+12)]
	pi2fd	mm7, [ebx +fsizen(j+14)]

	add		ebx, fsizen(32)

	pswapd	mm4, mm4
	pswapd	mm5, mm5
	pswapd	mm6, mm6
	pswapd	mm7, mm7

	pmov	[edi+fsizen(i-10)], mm4
	pmov	[edi+fsizen(i-12)], mm5
	pmov	[edi+fsizen(i-14)], mm6
	pmov	[edi+fsizen(i-16)], mm7

	sub		edi, fsizen(32)
.for.win_buf.next:
	sub		ecx, fsizen(32)
	jnz		near .for.win_buf
.for.win_buf.end:

; ---- next channel -------------------------------------------------------
; Only the low byte of `stereo` is compared; anything other than 2 means a
; single pass. Otherwise advance both buffers by one channel and repeat
; while ebp is still below the end of the second channel of sbd_xxx.
.for.ch.next:
	cmp		byte [sp(%$stereo)], 2
	jne		.exit

	add		eax,(1152+576+EXTRADELAY)*F_SIZE
	add		ebp,(1152+HAN_SIZE)*F_SIZE
	cmp		ebp,sbd_xxx+(1152+HAN_SIZE)*F_SIZE*2
	jb		near .for.ch
.for.ch.end:

.exit:
	femms

	popd	ebx, edi, ebp

;	call 		_clkend
endproc
%endif
;***************************************************************************

;
;		2000/04/06	(original Japanese note unrecoverable: encoding damage)  by kei
;		2000/04/07	fixed the unroll
;		2000/04/08	moved yprime[16] out of the loop
;					

; void window_filter_subband_3DN(float *win_buf, float *s, int mode_gr)
%if 0
Reference C implementation of the 3DN / E3DN versions
static void
window_filter_subband_3DN(float *win_buf, float *xout, int mode_gr)
{
	float	*c, yprime[32];
	int	i,j,k;

	c = &win_buf[1152];
	for(j = 0; j < 18*mode_gr; j++){
		yprime[ 0] = c[64*0+16]*enwindow[4][0][0]
		      + c[64*1+16]*enwindow[4][1][0]
		      + c[64*2+16]*enwindow[4][2][0]
		      + c[64*3+16]*enwindow[4][3][0]
		      + c[64*4+16]*enwindow[4][4][0]
		      + c[64*5+16]*enwindow[4][5][0]
		      + c[64*6+16]*enwindow[4][6][0]
		      + c[64*7+16]*enwindow[4][7][0];

		for(i = 0; i < 16; i++){
			register float	a, b;

			a  =c[64*0 + i   ]*enwindow[i/4  ][0][i%4];
			a -=c[64*7 + i+16]*enwindow[i/4+8][0][i%4]; // reversed
			b  =c[64*0 + i+32]*enwindow[i/4+8][0][i%4];
			b +=c[64*7 + i+48]*enwindow[i/4  ][0][i%4]; // reversed

			a +=c[64*1 + i   ]*enwindow[i/4  ][1][i%4];
			a -=c[64*6 + i+16]*enwindow[i/4+8][1][i%4]; // reversed
			b +=c[64*1 + i+32]*enwindow[i/4+8][1][i%4];
			b +=c[64*6 + i+48]*enwindow[i/4  ][1][i%4]; // reversed

			a +=c[64*2 + i   ]*enwindow[i/4  ][2][i%4];
			a -=c[64*5 + i+16]*enwindow[i/4+8][2][i%4]; // reversed
			b +=c[64*2 + i+32]*enwindow[i/4+8][2][i%4];
			b +=c[64*5 + i+48]*enwindow[i/4  ][2][i%4]; // reversed

			a +=c[64*3 + i   ]*enwindow[i/4  ][3][i%4];
			a -=c[64*4 + i+16]*enwindow[i/4+8][3][i%4]; // reversed
			b +=c[64*3 + i+32]*enwindow[i/4+8][3][i%4];
			b +=c[64*4 + i+48]*enwindow[i/4  ][3][i%4]; // reversed

			a +=c[64*4 + i   ]*enwindow[i/4  ][4][i%4];
			a -=c[64*3 + i+16]*enwindow[i/4+8][4][i%4]; // reversed
			b +=c[64*4 + i+32]*enwindow[i/4+8][4][i%4];
			b +=c[64*3 + i+48]*enwindow[i/4  ][4][i%4]; // reversed

			a +=c[64*5 + i   ]*enwindow[i/4  ][5][i%4];
			a -=c[64*2 + i+16]*enwindow[i/4+8][5][i%4]; // reversed
			b +=c[64*5 + i+32]*enwindow[i/4+8][5][i%4];
			b +=c[64*2 + i+48]*enwindow[i/4  ][5][i%4]; // reversed

			a +=c[64*6 + i   ]*enwindow[i/4  ][6][i%4];
			a -=c[64*1 + i+16]*enwindow[i/4+8][6][i%4]; // reversed
			b +=c[64*6 + i+32]*enwindow[i/4+8][6][i%4];
			b +=c[64*1 + i+48]*enwindow[i/4  ][6][i%4]; // reversed

			a +=c[64*7 + i   ]*enwindow[i/4  ][7][i%4];
			a -=c[64*0 + i+16]*enwindow[i/4+8][7][i%4]; // reversed
			b +=c[64*7 + i+32]*enwindow[i/4+8][7][i%4];
			b +=c[64*0 + i+48]*enwindow[i/4  ][7][i%4]; // reversed

			yprime[16-i] = a;
			yprime[i+16] = b;
		}
		c -= 32;

		for( i=0; i<16; i++ ){
			register float	s0, s1;

			s0 = s1 = 0.0;
			for( k=0; k<32; k+=4 ){
				s0 += idct_coefficient[i][k  ]*yprime[k+0];
				s1 += idct_coefficient[i][k+1]*yprime[k+1];
				s0 += idct_coefficient[i][k+2]*yprime[k+2];
				s1 += idct_coefficient[i][k+3]*yprime[k+3];
			}
			xout[i+ 0] = s0+s1;
			xout[31-i] = s0-s1;
		}
		xout += 32;
	}
}
%endif

; Addressing helpers for the window_filter_subband_* procedures.
;
; enw(x,y,z)  effective address of enwindow[x][y][z] (strides 8*4*4, 4*4 and
;             4 bytes). eax is expected to hold enwindow+enwoff(8,0,0) plus
;             any running offset, hence the -8*8*4*4 correction.
; c(x,y)      effective address of c[64*x + y] in 4-byte elements. edi is
;             expected to hold c + coff(4,0), hence the -4*64*4 correction.
; yprime(x)   element x of the yprime[] scratch array at the top of the stack.
; enwoff/coff the same strides as plain byte offsets (used to bias eax/edi).
; NOTE(review): the biasing presumably keeps more displacements within the
; signed 8-bit range -- not verifiable from this file.
%idefine	enw(x,y,z)	(eax+(x)*8*4*4+(y)*4*4+(z)*4-8*8*4*4)
%idefine	c(x,y)		(edi+(x)*64*4+(y)*4-4*64*4)
%idefine	yprime(x)	(esp+(x)*4)

%idefine	enwoff(x,y,z)	((x)*8*4*4+(y)*4*4+(z)*4)
%idefine	coff(x,y)		((x)*64*4+(y)*4)

;-----------------------------------------------------------------------
; void window_filter_subband_3DN(float *win_buf, float *s, int mode_gr)
;
; 3DNow! implementation of the reference C code kept in the %if 0 block
; above this procedure. Repeated 18*mode_gr times:
;   1. build yprime[0..31] on the stack from c[] (starting at
;      &win_buf[1152], moving down 32 elements per pass) and enwindow[];
;   2. multiply yprime[] with 16 rows of idct_coefficient[] and write
;      xout[i] = s0+s1, xout[31-i] = s0-s1 to s; s advances by 32 floats.
;
; Register roles:
;   edi = c + coff(4,0)        (see the c() macro), advanced inside a pass
;   eax = enwindow + enwoff(8,0,0) + running offset (see the enw() macro);
;         reused as the yprime index in .lp3
;   ebp = output pointer (s)
;   esi = remaining passes, 18*mode_gr
;   esp = yprime[32], 32-byte aligned; the caller's esp is kept at
;         [esp+32*4]. The arguments are read before esp is changed.
; ebx, esi, edi, ebp are saved with pushd/popd; mm0-mm7 are used between
; the two femms instructions.
;
; proc/endproc/arg/sp()/pushd/popd/pmov/pmovd come from nasm.h (not shown);
; comments assume pmov = 64-bit move and pmovd = 32-bit move that clears the
; upper lane on load -- TODO confirm against nasm.h.
;-----------------------------------------------------------------------
proc window_filter_subband_3DN

%$win_buf	arg		4
%$s			arg		4
%$mode_gr	arg		4

;	extern _clkbegin
;	extern _clkend
;	call 		_clkbegin

	femms
	pushd	ebx, esi, edi, ebp

	mov		edi,[sp(%$win_buf)]
	mov		ebp,[sp(%$s)]
	mov		esi,[sp(%$mode_gr)]			; 1 or 2
	add		edi,1152*F_SIZE				; = c = &win_buf[1152]
	add		esi,esi
	lea		esi,[esi+esi*8]				; = j = 18*mode_gr

	add		edi, coff(4,0)				; bias expected by the c() macro

; allocate yprime[32]
	mov		edx,esp
	sub		esp,32*4+4
	and		esp,~31					; align to 32 byte boundary
	mov		[esp+32*4],edx			; save the original ESP
	jmp		short .f1

	align	16
.lp1:
.f1:
	mov		eax, enwindow+enwoff(8,0,0)	; bias expected by the enw() macro
;		yprime[0]
;		= sum over k=0..7 of c[64*k+16] * enwindow[4][k][0]
;		(k and k+4 are packed into the two lanes, pfacc adds the lanes)
	pmovd	mm0, [c(0,16)]
	pmovd	mm1, [c(1,16)]
	pmovd	mm2, [c(2,16)]
	pmovd	mm3, [c(3,16)]
	pmovd	mm4, [enw(4,0,0)]
	pmovd	mm5, [enw(4,1,0)]
	pmovd	mm6, [enw(4,2,0)]
	pmovd	mm7, [enw(4,3,0)]

	punpckldq	mm0, [c(4,16)]
	punpckldq	mm1, [c(5,16)]
	punpckldq	mm2, [c(6,16)]
	punpckldq	mm3, [c(7,16)]
	punpckldq	mm4, [enw(4,4,0)]
	punpckldq	mm5, [enw(4,5,0)]
	punpckldq	mm6, [enw(4,6,0)]
	punpckldq	mm7, [enw(4,7,0)]

	pfmul	mm0, mm4
	pfmul	mm1, mm5
	pfmul	mm2, mm6
	pfmul	mm3, mm7

	pfadd	mm0, mm1
	pfadd	mm2, mm3
	pfadd	mm0, mm2
	pfacc	mm0, mm0
	
	pmovd	[yprime(0)], mm0

;		yprime[16]
;		= (c(1,0)+c(7,0))*enw(0,1,0) + (c(2,0)+c(6,0))*enw(0,2,0)
;		+ (c(3,0)+c(5,0))*enw(0,3,0) +  c(4,0)        *enw(0,4,0)
;		+ (c(0,32)-c(7,32))*enw(8,0,0) + (c(1,32)-c(6,32))*enw(8,1,0)
;		+ (c(2,32)-c(5,32))*enw(8,2,0) + (c(3,32)-c(4,32))*enw(8,3,0)
;		NOTE(review): this is not the literal i == 0 iteration of the
;		reference C; it presumably relies on symmetries of enwindow[] --
;		confirm against the table generator.
	pmovd	mm1, [c(3,0)]
	pmovd	mm0, [c(1,0)]
	pmovd	mm2, [c(7,0)]
	pmovd	mm3, [c(5,0)]
	punpckldq	mm1, [c(4,0)]
	punpckldq	mm0, [c(2,0)]
	punpckldq	mm2, [c(6,0)]
	pfadd	mm1, mm3
	pfadd	mm0, mm2

	pmovd	mm4, [c(0,32)]
	pmovd	mm5, [c(2,32)]
	pmovd	mm6, [c(7,32)]
	pmovd	mm7, [c(5,32)]
	punpckldq	mm4, [c(1,32)]
	punpckldq	mm5, [c(3,32)]
	punpckldq	mm6, [c(6,32)]
	punpckldq	mm7, [c(4,32)]
	pfsub	mm4, mm6
	pfsub	mm5, mm7

	pmovd	mm2, [enw(0,1,0)]
	pmovd	mm3, [enw(0,3,0)]
	pmovd	mm6, [enw(8,0,0)]
	pmovd	mm7, [enw(8,2,0)]
	punpckldq	mm2, [enw(0,2,0)]
	punpckldq	mm3, [enw(0,4,0)]
	punpckldq	mm6, [enw(8,1,0)]
	punpckldq	mm7, [enw(8,3,0)]

	pfmul	mm0, mm2
	pfmul	mm1, mm3
	pfmul	mm4, mm6
	pfmul	mm5, mm7
	pfadd	mm0, mm1
	pfadd	mm4, mm5
	pfadd	mm0, mm4
	pfacc	mm0, mm0
	
	pmovd	[yprime(16)], mm0

; --- "type1" macros: handle i = 1, 2, 3 of the reference loop -----------
; mm0 carries a[1] in its low lane and b[1] in its high lane; mm1/mm3 carry
; a[3:2] / b[3:2]. first_type1 loads D_PLUS_MINUS into mm2 and other_type1
; relies on mm2 still holding it. mm4-mm7 are scratch.
; "pfacc mmN, mmN" right after a pmovd load copies the scalar into both
; lanes (assuming pmovd clears the upper lane).
%macro	window_filter_3dn_yprime_first_type1 0
;			a  =c[64*0 + i   ]*enwindow[i/4  ][0][i%4];
;			a -=c[64*7 + i+16]*enwindow[i/4+8][0][i%4]; // reversed
;			b  =c[64*0 + i+32]*enwindow[i/4+8][0][i%4];
;			b +=c[64*7 + i+48]*enwindow[i/4  ][0][i%4]; // reversed
;	output
;		mm0		b:a[i+1:i+1]
;		mm1		a[i+3:i+2]
;		mm3		b[i+3:i+2]
	pmovd	mm0, [c(0,1)]
	pmov	mm1, [c(0,2)]
	pmovd	mm2, [enw(0,0,1)]
	punpckldq	mm0, [c(7,49)]
	pmov	mm3, [enw(0,0,2)]
	pfacc	mm2, mm2
	pmovd	mm4, [c(7,17)]
	pfmul 	mm1, mm3
	pmov	mm5, [c(7,18)]
	pfmul	mm0, mm2
	pfmul	mm3, [c(7,50)]
	pmov	mm2, [D_PLUS_MINUS]
	punpckldq	mm4, [c(0,33)]
	pmovd	mm6, [enw(8,0,1)]
	pmov	mm7, [enw(8,0,2)]
	pfacc	mm6, mm6
	pfmul	mm5, mm7
	pfmul	mm6, mm2
	pfsub	mm1, mm5
	pfmul	mm4, mm6
	pfmul	mm7, [c(0,34)]
	pfadd	mm0, mm4
	pfadd	mm3, mm7
%endmacro

; %1 = window row used for the forward terms, %2 = row used for the
; "reversed" terms (invoked below with %2 = 7 - %1).
%macro	window_filter_3dn_yprime_other_type1 2
;			a +=c[64*%1 + i   ]*enwindow[i/4  ][%1][i%4];
;			a -=c[64*%2 + i+16]*enwindow[i/4+8][%1][i%4]; // reversed
;			b +=c[64*%1 + i+32]*enwindow[i/4+8][%1][i%4];
;			b +=c[64*%2 + i+48]*enwindow[i/4  ][%1][i%4]; // reversed
;	output
;		mm0		b:a[i+1:i+1]
;		mm1		a[i+3:i+2]
;		mm3		b[i+3:i+2]
	pmovd	mm4, [c(%1, 1)]
	pmov	mm5, [c(%1, 2)]
	pmovd	mm6, [enw(0,%1,1)]
	punpckldq	mm4, [c(%2,49)]
	pmov	mm7, [enw(0,%1,2)]
	pfacc	mm6, mm6
	pfmul	mm5, mm7
	pfmul	mm4, mm6
	pfadd	mm1, mm5
	pfmul	mm7, [c(%2,50)]
	pfadd	mm0, mm4
	pmovd	mm4, [c(%2,17)]
	pfadd	mm3, mm7
	pmov	mm5, [c(%2,18)]
	punpckldq	mm4, [c(%1,33)]
	pmovd	mm6, [enw(8,%1,1)]
	pmov	mm7, [enw(8,%1,2)]
	pfacc	mm6, mm6
	pfmul	mm5, mm7
	pfmul	mm6, mm2
	pfsub	mm1, mm5
	pfmul	mm4, mm6
	pfmul	mm7, [c(%1,34)]
	pfadd	mm0, mm4
	pfadd	mm3, mm7
%endmacro

; --- "type2" macros: handle four consecutive i (i..i+3) -----------------
; mm0/mm1 accumulate a[i+1:i] / a[i+3:i+2], mm2/mm3 accumulate
; b[i+1:i] / b[i+3:i+2]; mm4-mm7 are scratch.
%macro	window_filter_3dn_yprime_first_type2 0
;			a  =c[64*0 + i   ]*enwindow[i/4  ][0][i%4];
;			a -=c[64*7 + i+16]*enwindow[i/4+8][0][i%4]; // reversed
;			b  =c[64*0 + i+32]*enwindow[i/4+8][0][i%4];
;			b +=c[64*7 + i+48]*enwindow[i/4  ][0][i%4]; // reversed
;	output
;		mm0		a[i+1:i+0]
;		mm1		a[i+3:i+2]
;		mm2		b[i+1:i+0]
;		mm3		b[i+3:i+2]
	pmov	mm0, [c(0,0)]
	pmov	mm1, [c(0,2)]

	pmov	mm2, [enw(0,0,0)]
	pmov	mm3, [enw(0,0,2)]

	pfmul	mm0, mm2
	pfmul 	mm1, mm3
	pfmul	mm2, [c(7,48)]
	pfmul	mm3, [c(7,50)]

	pmov	mm4, [c(7,16)]
	pmov	mm5, [c(7,18)]
	pmov	mm6, [enw(8,0,0)]
	pmov	mm7, [enw(8,0,2)]
	
	pfmul	mm4, mm6
	pfmul	mm5, mm7
	pfsub	mm0, mm4
	pfsub	mm1, mm5
	pfmul	mm6, [c(0,32)]
	pfmul	mm7, [c(0,34)]
	pfadd	mm2, mm6
	pfadd	mm3, mm7
%endmacro

; %1 = window row used for the forward terms, %2 = row used for the
; "reversed" terms (invoked below with %2 = 7 - %1).
%macro	window_filter_3dn_yprime_other_type2 2
;			a +=c[64*%1 + i   ]*enwindow[i/4  ][%1][i%4];
;			a -=c[64*%2 + i+16]*enwindow[i/4+8][%1][i%4]; // reversed
;			b +=c[64*%1 + i+32]*enwindow[i/4+8][%1][i%4];
;			b +=c[64*%2 + i+48]*enwindow[i/4  ][%1][i%4]; // reversed
;	output
;		mm0		a[i+1:i+0]
;		mm1		a[i+3:i+2]
;		mm2		b[i+1:i+0]
;		mm3		b[i+3:i+2]
	pmov	mm4, [c(%1,0)]
	pmov	mm5, [c(%1,2)]
	pmov	mm6, [enw(0,%1,0)]	
	pmov	mm7, [enw(0,%1,2)]	

	pfmul	mm4, mm6
	pfmul	mm5, mm7
	pfadd	mm0, mm4
	pfadd	mm1, mm5
	pfmul	mm6, [c(%2,48)]
	pfmul	mm7, [c(%2,50)]
	pfadd	mm2, mm6
	pfadd	mm3, mm7

	pmov	mm4, [c(%2,16)]
	pmov	mm5, [c(%2,18)]
	pmov	mm6, [enw(8,%1,0)]
	pmov	mm7, [enw(8,%1,2)]

	pfmul	mm4, mm6
	pfmul	mm5, mm7
	pfsub	mm0, mm4
	pfsub	mm1, mm5
	pfmul	mm6, [c(%1,32)]
	pfmul	mm7, [c(%1,34)]
	pfadd	mm2, mm6
	pfadd	mm3, mm7
%endmacro

;   i =  1, 2, 3
	window_filter_3dn_yprime_first_type1 
	window_filter_3dn_yprime_other_type1 1,6
	window_filter_3dn_yprime_other_type1 2,5
	window_filter_3dn_yprime_other_type1 3,4
	window_filter_3dn_yprime_other_type1 4,3
	window_filter_3dn_yprime_other_type1 5,2
	window_filter_3dn_yprime_other_type1 6,1
	window_filter_3dn_yprime_other_type1 7,0
	add	edi, 4*4					; next group of four c[] columns
	add	eax, 8*4*4					; next enwindow[x] slab
; store a[1] -> yprime[15], b[1] -> yprime[17], b[3:2] -> yprime[18..19],
; a[3:2] lane-swapped -> yprime[13..14] (mm5 is scratch for the swap)
	pmovd	[yprime(15)], mm0
	punpckldq	mm5, mm1
	psrlq	mm0, 32
	pmov	[yprime(18)], mm3
	punpckhdq	mm1, mm5
	pmovd	[yprime(17)], mm0
	pmov	[yprime(13)], mm1



;	i = 4..15
;	three passes of four i each: ebx walks up from yprime[20] for the b
;	values, edx walks down from yprime[11] for the lane-swapped a values
	lea		ebx, [yprime(20)]
	lea		edx, [yprime(11)]
	mov		ecx, 3
	jmp	short .lp4
.lp4:
	window_filter_3dn_yprime_first_type2
	window_filter_3dn_yprime_other_type2 1,6
	window_filter_3dn_yprime_other_type2 2,5
	window_filter_3dn_yprime_other_type2 3,4
	window_filter_3dn_yprime_other_type2 4,3
	window_filter_3dn_yprime_other_type2 5,2
	window_filter_3dn_yprime_other_type2 6,1
	window_filter_3dn_yprime_other_type2 7,0
	add	edi, 4*4
	add	eax, 8*4*4
	; swap the lanes of mm0 and mm1 (mm4/mm5 are scratch)
	punpckldq	mm4, mm0
	punpckldq	mm5, mm1
	punpckhdq	mm0, mm4
	punpckhdq	mm1, mm5

	pmov	[ebx+ 0], mm2
	pmov	[ebx+ 8], mm3
	add		ebx, 4*4

	pmov	[edx- 0], mm0
	pmov	[edx- 8], mm1
	sub		edx, 4*4

	dec		ecx
	jnz	near .lp4

	; edi was advanced by 16 elements above, so -48 gives a net -32
	sub		edi,48*4			; c -= 32

; --- yprime[] x idct_coefficient[] ---------------------------------------
; Each .lp2 pass handles four consecutive rows (row stride 32*4 bytes) and
; ecx goes 32 -> 0 in steps of 8, i.e. four passes / 16 rows.
; edx = current row + 184; every access subtracts the 184 again.
; NOTE(review): the 184 bias is presumably an encoding-size trick -- confirm.
	mov		ecx, 32
	mov		edx, idct_coefficient+184
	jmp		short .lp2

	align	16
.lp2:
;	mm0 = s1:s0[i+0]
;	mm1 = s1:s0[i+1]
;	mm2 = s1:s0[i+2]
;	mm3 = s1:s0[i+3]
;	(low lane accumulates even columns = s0, high lane odd columns = s1)
	pmov	mm3, [yprime(0)]
	pmov	mm0, [edx+32*4*0-184]
	pmov	mm1, [edx+32*4*1-184]
	pmov	mm2, [edx+32*4*2-184]
	pfmul	mm0, mm3
	pfmul	mm1, mm3
	pfmul	mm2, mm3
	pfmul	mm3, [edx+32*4*3-184]

	pmov	mm7, [yprime(2)]
	pmov	mm4, [edx+32*4*0+8-184]
	pmov	mm5, [edx+32*4*1+8-184]
	pmov	mm6, [edx+32*4*2+8-184]
	pfmul	mm4, mm7
	pfmul	mm5, mm7
	pfmul	mm6, mm7
	pfmul	mm7, [edx+32*4*3+8-184]
	pfadd	mm0, mm4
	pfadd	mm1, mm5
	pfadd	mm2, mm6
	pfadd	mm3, mm7

	; columns 4..31: eax = column index (step 4), ebx = 7 passes
	mov		eax, 4
	mov		ebx, 7
	jmp	short .lp3

	align	16
.lp3:
	pmov	mm7, [yprime(eax)]
	pmov	mm4, [edx+32*4*0+eax*4-184]
	pmov	mm5, [edx+32*4*1+eax*4-184]
	pmov	mm6, [edx+32*4*2+eax*4-184]
	pfmul	mm4, mm7
	pfmul	mm5, mm7
	pfmul	mm6, mm7
	pfmul	mm7, [edx+32*4*3+eax*4-184]
	pfadd	mm0, mm4
	pfadd	mm1, mm5
	pfadd	mm2, mm6
	pfadd	mm3, mm7

	pmov	mm7, [yprime(eax)+8]
	pmov	mm4, [edx+32*4*0+eax*4+8-184]
	pmov	mm5, [edx+32*4*1+eax*4+8-184]
	pmov	mm6, [edx+32*4*2+eax*4+8-184]
	pfmul	mm4, mm7
	pfmul	mm5, mm7
	pfmul	mm6, mm7
	pfmul	mm7, [edx+32*4*3+eax*4+8-184]
	add		eax, 4
	pfadd	mm0, mm4
	pfadd	mm1, mm5
	dec		ebx					; sets ZF for the jnz below (MMX ops leave flags alone)
	pfadd	mm2, mm6
	pfadd	mm3, mm7
	jnz		.lp3

	; high lane of mm4..mm7 := s0+s1 of rows i..i+3 (low lanes are don't-care)
	pfacc	mm4, mm0
	pfacc	mm5, mm1
	pfacc	mm6, mm2
	pfacc	mm7, mm3

	; mm4 = sums of rows i+1:i, mm6 = sums of rows i+3:i+2;
	; mm0 = s0 of rows i+1:i,   mm2 = s0 of rows i+3:i+2
	punpckhdq	mm4, mm5
	punpckhdq	mm6, mm7
	punpckldq	mm0, mm1
	punpckldq	mm2, mm3
	pmov	[ebp+4*0], mm4			; xout[i+0..i+1] = s0+s1
	pmov	[ebp+4*2], mm6			; xout[i+2..i+3] = s0+s1

	add		edx, 32*4*4				; next four rows
	add		ebp, 16
	sub		ecx, 8					; sets ZF for "jnz .lp2" below

	; (s0+s1) - s0 = s1, then s0 - s1
	pfsub	mm4, mm0
	pfsub	mm6, mm2
	pfsub	mm0, mm4
	pfsub	mm2, mm6

	; swap lanes so the differences land in descending row order
	punpckldq	mm1, mm0
	punpckldq	mm3, mm2
	punpckhdq	mm0, mm1
	punpckhdq	mm2, mm3

	; xout[31-i] = s0-s1; ebp already moved 4 floats on, ecx*4 mirrors it
	pmov	[ebp+ecx*4+ 8], mm0
	pmov	[ebp+ecx*4+ 0], mm2

	jnz		near .lp2

	add		ebp,16*4				; skip the mirrored half: 32 floats per pass in total
	dec		esi
	jnz		near .lp1

; free area for yprime[32]
	mov		esp,[esp+32*4]

.exit:
	femms
	popd	ebx, esi, edi, ebp

;	call 		_clkend

endproc
;***************************************************************************

%ifdef USE_E3DN

;		2000/04/06	stub that just jumps to window_filter_subband_3DN by kei
;		2000/04/07	proper E3DN support
;		2000/04/08	moved yprime[16] out of the loop

;-----------------------------------------------------------------------
; void window_filter_subband_E3DN(float *win_buf, float *s, int mode_gr)
;
; Enhanced-3DNow! variant of window_filter_subband_3DN. It uses the same
; window_filter_3dn_yprime_* macros and enw()/c()/yprime() helpers, but
; swaps lanes with pswapd, forms s0-s1 with pfnacc and adds
; prefetch/prefetchw hints. Repeated 18*mode_gr times:
;   1. build yprime[0..31] on the stack from c[] (starting at
;      &win_buf[1152], moving down 32 elements per pass) and enwindow[];
;   2. multiply yprime[] with 16 rows of idct_coefficient[] and write
;      xout[i] = s0+s1, xout[31-i] = s0-s1 to s; s advances by 32 floats.
;
; Register roles:
;   edi = c + coff(4,0), eax = enwindow + enwoff(8,0,0) + running offset
;         (eax is reused as the yprime index in .lp3)
;   ebp = output pointer (s), esi = remaining passes (18*mode_gr)
;   esp = yprime[32], 64-byte aligned; the caller's esp is kept at
;         [esp+32*4]. The arguments are read before esp is changed.
; ebx, esi, edi, ebp are saved with pushd/popd; mm0-mm7 are used between
; the two femms instructions.
;
; proc/endproc/arg/sp()/pushd/popd/pmov/pmovd come from nasm.h (not shown);
; comments assume pmov = 64-bit move and pmovd = 32-bit move that clears the
; upper lane on load -- TODO confirm against nasm.h.
;-----------------------------------------------------------------------
proc window_filter_subband_E3DN
%$win_buf	arg		4
%$s			arg		4
%$mode_gr	arg		4

;	extern _clkbegin
;	extern _clkend
;	call 		_clkbegin

	femms
	pushd	ebx, esi, edi, ebp

	mov		edi,[sp(%$win_buf)]
	mov		ebp,[sp(%$s)]
	mov		esi,[sp(%$mode_gr)]			; 1 or 2
	add		edi,1152*F_SIZE				; = c = &win_buf[1152]
	add		esi,esi
	lea		esi,[esi+esi*8]				; = j = 18*mode_gr

	add		edi, coff(4,0)				; bias expected by the c() macro

; allocate yprime[32]
	mov		edx,esp
	sub		esp,32*4+4
	and		esp,~63					; align to 64 byte boundary
	mov		[esp+32*4],edx			; save the original ESP
	prefetchw  [yprime(0)]
	prefetchw  [yprime(16)]
	jmp		short .f1

	align	16
.lp1:
.f1:
	mov		eax, enwindow+enwoff(8,0,0)	; bias expected by the enw() macro

;		yprime[0]
;		= sum over k=0..7 of c[64*k+16] * enwindow[4][k][0]
;		(k and k+4 are packed into the two lanes, pfacc adds the lanes)
	pmovd	mm0, [c(0,16)]
	pmovd	mm1, [c(1,16)]
	pmovd	mm2, [c(2,16)]
	pmovd	mm3, [c(3,16)]
	pmovd	mm4, [enw(4,0,0)]
	pmovd	mm5, [enw(4,1,0)]
	pmovd	mm6, [enw(4,2,0)]
	pmovd	mm7, [enw(4,3,0)]

	punpckldq	mm0, [c(4,16)]
	punpckldq	mm1, [c(5,16)]
	punpckldq	mm2, [c(6,16)]
	punpckldq	mm3, [c(7,16)]
	punpckldq	mm4, [enw(4,4,0)]
	punpckldq	mm5, [enw(4,5,0)]
	punpckldq	mm6, [enw(4,6,0)]
	punpckldq	mm7, [enw(4,7,0)]

	pfmul	mm0, mm4
	pfmul	mm1, mm5
	pfmul	mm2, mm6
	pfmul	mm3, mm7

	pfadd	mm0, mm1
	pfadd	mm2, mm3
	pfadd	mm0, mm2
	pfacc	mm0, mm0
	
	pmovd	[yprime(0)], mm0

;		yprime[16]
;		= (c(1,0)+c(7,0))*enw(0,1,0) + (c(2,0)+c(6,0))*enw(0,2,0)
;		+ (c(3,0)+c(5,0))*enw(0,3,0) +  c(4,0)        *enw(0,4,0)
;		+ (c(0,32)-c(7,32))*enw(8,0,0) + (c(1,32)-c(6,32))*enw(8,1,0)
;		+ (c(2,32)-c(5,32))*enw(8,2,0) + (c(3,32)-c(4,32))*enw(8,3,0)
;		NOTE(review): this is not the literal i == 0 iteration of the
;		reference C; it presumably relies on symmetries of enwindow[] --
;		confirm against the table generator.
	pmovd	mm1, [c(3,0)]
	pmovd	mm0, [c(1,0)]
	pmovd	mm2, [c(7,0)]
	pmovd	mm3, [c(5,0)]
	punpckldq	mm1, [c(4,0)]
	punpckldq	mm0, [c(2,0)]
	punpckldq	mm2, [c(6,0)]
	pfadd	mm1, mm3
	pfadd	mm0, mm2

	pmovd	mm4, [c(0,32)]
	pmovd	mm5, [c(2,32)]
	pmovd	mm6, [c(7,32)]
	pmovd	mm7, [c(5,32)]
	punpckldq	mm4, [c(1,32)]
	punpckldq	mm5, [c(3,32)]
	punpckldq	mm6, [c(6,32)]
	punpckldq	mm7, [c(4,32)]
	pfsub	mm4, mm6
	pfsub	mm5, mm7

	pmovd	mm2, [enw(0,1,0)]
	pmovd	mm3, [enw(0,3,0)]
	pmovd	mm6, [enw(8,0,0)]
	pmovd	mm7, [enw(8,2,0)]
	punpckldq	mm2, [enw(0,2,0)]
	punpckldq	mm3, [enw(0,4,0)]
	punpckldq	mm6, [enw(8,1,0)]
	punpckldq	mm7, [enw(8,3,0)]

	pfmul	mm0, mm2
	pfmul	mm1, mm3
	pfmul	mm4, mm6
	pfmul	mm5, mm7
	pfadd	mm0, mm1
	pfadd	mm4, mm5
	pfadd	mm0, mm4
	pfacc	mm0, mm0
	
	pmovd	[yprime(16)], mm0

;   i =  1, 2, 3
;   (macro results: mm0 = b[1]:a[1], mm1 = a[3:2], mm3 = b[3:2])
	window_filter_3dn_yprime_first_type1 
	window_filter_3dn_yprime_other_type1 1,6
	window_filter_3dn_yprime_other_type1 2,5
	window_filter_3dn_yprime_other_type1 3,4
	window_filter_3dn_yprime_other_type1 4,3
	window_filter_3dn_yprime_other_type1 5,2
	window_filter_3dn_yprime_other_type1 6,1
	window_filter_3dn_yprime_other_type1 7,0
	add	edi, 4*4					; next group of four c[] columns
	add	eax, 8*4*4					; next enwindow[x] slab
; store a[1] -> yprime[15], b[1] -> yprime[17], b[3:2] -> yprime[18..19],
; a[3:2] lane-swapped -> yprime[13..14]
	pmovd	[yprime(15)], mm0
	psrlq	mm0, 32
	pswapd		mm1, mm1
	pmov	[yprime(18)], mm3
	pmovd	[yprime(17)], mm0
	pmov	[yprime(13)], mm1

;	i = 4..15
;	three passes of four i each: ebx walks up from yprime[20] for the b
;	values, edx walks down from yprime[11] for the lane-swapped a values
	lea		ebx, [yprime(20)]
	lea		edx, [yprime(11)]
	mov		ecx, 3
	jmp	short .lp4
.lp4:
	window_filter_3dn_yprime_first_type2 
	window_filter_3dn_yprime_other_type2 1,6
	window_filter_3dn_yprime_other_type2 2,5
	window_filter_3dn_yprime_other_type2 3,4
	window_filter_3dn_yprime_other_type2 4,3
	window_filter_3dn_yprime_other_type2 5,2
	window_filter_3dn_yprime_other_type2 6,1
	window_filter_3dn_yprime_other_type2 7,0
	add	edi, 4*4
	add	eax, 8*4*4
	pswapd		mm0, mm0
	pswapd		mm1, mm1

	pmov	[ebx+ 0], mm2
	pmov	[ebx+ 8], mm3
	add		ebx, 4*4

	pmov	[edx- 0], mm0
	pmov	[edx- 8], mm1
	sub		edx, 4*4

	dec		ecx
	jnz	near .lp4

	; edi was advanced by 16 elements above, so -48 gives a net -32
	sub		edi,48*4			; c -= 32

; --- yprime[] x idct_coefficient[] ---------------------------------------
; Each .lp2 pass handles four consecutive rows (row stride 32*4 bytes) and
; ecx goes 32 -> 0 in steps of 8, i.e. four passes / 16 rows.
; edx = current row + 184; every access subtracts the 184 again.
; NOTE(review): the 184 bias is presumably an encoding-size trick -- confirm.
	prefetchw	[ebp]
	prefetchw	[ebp+64]
	mov		ecx, 32
	mov		edx, idct_coefficient+184
	jmp		short .lp2
	
	align	16
.lp2:
;	mm0 = s1:s0[i+0]
;	mm1 = s1:s0[i+1]
;	mm2 = s1:s0[i+2]
;	mm3 = s1:s0[i+3]
;	(low lane accumulates even columns = s0, high lane odd columns = s1)
	pmov	mm3, [yprime(0)]
	pmov	mm0, [edx+32*4*0-184]
	pmov	mm1, [edx+32*4*1-184]
	pmov	mm2, [edx+32*4*2-184]
	pfmul	mm0, mm3
	pfmul	mm1, mm3
	pfmul	mm2, mm3
	pfmul	mm3, [edx+32*4*3-184]

	pmov	mm7, [yprime(2)]
	pmov	mm4, [edx+32*4*0+8-184]
	pmov	mm5, [edx+32*4*1+8-184]
	pmov	mm6, [edx+32*4*2+8-184]
	pfmul	mm4, mm7
	pfmul	mm5, mm7
	pfmul	mm6, mm7
	pfmul	mm7, [edx+32*4*3+8-184]
	pfadd	mm0, mm4
	pfadd	mm1, mm5
	pfadd	mm2, mm6
	pfadd	mm3, mm7

	; columns 4..31: eax = column index (step 4), ebx = 7 passes
	mov		eax, 4
	mov		ebx, 7
	; hint c[] data for the next .lp1 pass (issued on every .lp2 pass)
	prefetch	[c(0,0)]
	prefetch	[c(1,0)]
	jmp	short .lp3

	align	16
.lp3:
	pmov	mm7, [yprime(eax)]
	pmov	mm4, [edx+32*4*0+eax*4-184]
	pmov	mm5, [edx+32*4*1+eax*4-184]
	pmov	mm6, [edx+32*4*2+eax*4-184]
	pfmul	mm4, mm7
	pfmul	mm5, mm7
	pfmul	mm6, mm7
	pfmul	mm7, [edx+32*4*3+eax*4-184]
	pfadd	mm0, mm4
	pfadd	mm1, mm5
	pfadd	mm2, mm6
	pfadd	mm3, mm7

	pmov	mm7, [yprime(eax)+8]
	pmov	mm4, [edx+32*4*0+eax*4+8-184]
	pmov	mm5, [edx+32*4*1+eax*4+8-184]
	pmov	mm6, [edx+32*4*2+eax*4+8-184]
	pfmul	mm4, mm7
	pfmul	mm5, mm7
	pfmul	mm6, mm7
	pfmul	mm7, [edx+32*4*3+eax*4+8-184]
	add		eax, 4
	pfadd	mm0, mm4
	pfadd	mm1, mm5
	dec		ebx					; sets ZF for the jnz below (MMX ops leave flags alone)
	pfadd	mm2, mm6
	pfadd	mm3, mm7
	jnz		.lp3

	; high lane of mm4..mm7 := s0+s1 of rows i..i+3 (low lanes are don't-care)
	pfacc	mm4, mm0
	pfacc	mm5, mm1
	pfacc	mm6, mm2
	pfacc	mm7, mm3

	; mm4 = sums of rows i+1:i, mm6 = sums of rows i+3:i+2
	punpckhdq	mm4, mm5
	punpckhdq	mm6, mm7
	add		edx, 32*4*4				; next four rows
	pmov	[ebp+4*0], mm4			; xout[i+0..i+1] = s0+s1
	pmov	[ebp+4*2], mm6			; xout[i+2..i+3] = s0+s1

	add		ebp, 16
	sub		ecx, 8					; sets ZF for "jnz .lp2" below

	; both lanes of each register := s0-s1 of its row
	pfnacc	mm0, mm0
	pfnacc	mm1, mm1
	pfnacc	mm2, mm2
	pfnacc	mm3, mm3

	; pack in descending row order: mm1 = rows i:i+1, mm3 = rows i+2:i+3
	punpckldq	mm1, mm0
	punpckldq	mm3, mm2

	; xout[31-i] = s0-s1; ebp already moved 4 floats on, ecx*4 mirrors it
	pmov	[ebp+ecx*4+ 8], mm1
	pmov	[ebp+ecx*4+ 0], mm3

	jnz		near .lp2

	add		ebp,16*4				; skip the mirrored half: 32 floats per pass in total
	dec		esi
	jnz		near .lp1

; free area for yprime[32]
	mov		esp,[esp+32*4]

.exit:
	femms
	popd	ebx, esi, edi, ebp

;	call 		_clkend

endproc

;***************************************************************************
%endif

%if 0
; previous implementation
; void window_filter_subband_3DN(int *win_buf,int ch ,float *s,int mode_gr)

;***************************************************************************
window_filter_subband_3DN:
		push	ebx
		push	esi
		push	edi
		push	ebp
		femms
%assign _P 4*4
		mov		esi,[esp+_P+8]		;esi=ch
		mov		edi,sbd_xxx			;edi=c=&sbd_xxx[0][0]
		mov		ebx,[esp+_P+4]		;ebx=win_buf
		mov		ebp,[sbd_off+esi*4]		;ebp=offset
		and		esi,esi
		jz		short .F00
		add		edi,HAN_SIZE*F_SIZE	;c=&sbd_xxx[1][0]

.F00:
		;	ebx=win_buf, esi=j=18*2, edi=c(fix), ebp=offset=sbd_off[ch]
		mov		[saveOff],ebx
		mov		eax,[esp+_P+12]
		mov		esi,[esp+_P+16]		;esi=mode_gr(=1 or 2)
		add		esi,esi
		lea		esi,[esi+esi*8]		;=esi *= 18
		mov		[s_ptr],eax		;ۑ

; allocate yyy[64]
%assign LOCAL_STACK 64*4

		mov		edx,esp
		sub		esp,LOCAL_STACK+4
		and		esp,~15					; align to 16 byte boundary
		mov		[esp+LOCAL_STACK],edx	; save the original ESP
		jmp		short .LOOP
%define	yyy esp

		align	16
.LOOP:
;	for(i=31;i>=0;i--) c[i+offset] = (double)(*(*win_buf)++) * SCALER;
		mov		ebx,[saveOff]

		lea		edx,[edi+ebp*4]

;		movq	mm7,[D_SCALER]			;mm7=[SCALER:SCALER]
		pi2fd	mm0,[ebx + 0*4]			;mm0=[wb1:wb0]
		pi2fd	mm1,[ebx + 0*4+8]		;mm1=[wb3:wb2]
		pi2fd	mm2,[ebx + 4*4]			;mm2=[wb5:wb4]
		pi2fd	mm3,[ebx + 4*4+8]		;mm3=[wb7:wb6]
;		pfmul	mm0,mm7
;		pfmul	mm1,mm7
;		pfmul	mm2,mm7
;		pfmul	mm3,mm7
		punpckldq	mm4,mm0
		punpckldq	mm5,mm1
		punpckldq	mm6,mm2
		punpckldq	mm7,mm3
		punpckhdq	mm0,mm4				;mm0=[wb0*R:wb1*R]
		punpckhdq	mm1,mm5				;mm1=[wb2*R:wb3*R]
		punpckhdq	mm2,mm6				;mm2=[wb4*R:wb5*R]
		punpckhdq	mm3,mm7				;mm3=[wb6*R:wb7*R]
		movq	[edx+28*4-0*4+8],mm0
		movq	[edx+28*4-0*4],mm1
		movq	[edx+28*4-4*4+8],mm2
		movq	[edx+28*4-4*4],mm3

;		movq	mm7,[D_SCALER]			;mm7=[SCALER:SCALER]
		pi2fd	mm0,[ebx + 8*4]			;mm0=[wb9:wb8]
		pi2fd	mm1,[ebx + 8*4+8]		;mm1=[wb11:wb10]
		pi2fd	mm2,[ebx + 12*4]		;mm2=[wb13:wb12]
		pi2fd	mm3,[ebx + 12*4+8]		;mm3=[wb15:wb14]
;		pfmul	mm0,mm7
;		pfmul	mm1,mm7
;		pfmul	mm2,mm7
;		pfmul	mm3,mm7
		punpckldq	mm4,mm0
		punpckldq	mm5,mm1
		punpckldq	mm6,mm2
		punpckldq	mm7,mm3
		punpckhdq	mm0,mm4				;mm0=[wb8*R:wb9*R]
		punpckhdq	mm1,mm5				;mm1=[wb10*R:wb11*R]
		punpckhdq	mm2,mm6				;mm2=[wb12*R:wb13*R]
		punpckhdq	mm3,mm7				;mm3=[wb14*R:wb15*R]
		movq	[edx+28*4-8*4+8],mm0
		movq	[edx+28*4-8*4],mm1
		movq	[edx+28*4-12*4+8],mm2
		movq	[edx+28*4-12*4],mm3

;		movq	mm7,[D_SCALER]			;mm7=[SCALER:SCALER]
		pi2fd	mm0,[ebx + 16*4]		;mm0=[wb17:wb16]
		pi2fd	mm1,[ebx + 16*4+8]		;mm1=[wb19:wb18]
		pi2fd	mm2,[ebx + 20*4]		;mm2=[wb21:wb20]
		pi2fd	mm3,[ebx + 20*4+8]		;mm3=[wb23:wb22]
;		pfmul	mm0,mm7
;		pfmul	mm1,mm7
;		pfmul	mm2,mm7
;		pfmul	mm3,mm7
		punpckldq	mm4,mm0
		punpckldq	mm5,mm1
		punpckldq	mm6,mm2
		punpckldq	mm7,mm3
		punpckhdq	mm0,mm4				;mm0=[wb16*R:wb17*R]
		punpckhdq	mm1,mm5				;mm1=[wb18*R:wb19*R]
		punpckhdq	mm2,mm6				;mm2=[wb20*R:wb21*R]
		punpckhdq	mm3,mm7				;mm3=[wb22*R:wb23*R]
		movq	[edx+28*4-16*4+8],mm0
		movq	[edx+28*4-16*4],mm1
		movq	[edx+28*4-20*4+8],mm2
		movq	[edx+28*4-20*4],mm3

;		movq	mm7,[D_SCALER]			;mm7=[SCALER:SCALER]
		pi2fd	mm0,[ebx + 24*4]		;mm0=[wb25:wb24]
		pi2fd	mm1,[ebx + 24*4+8]		;mm1=[wb27:wb26]
		pi2fd	mm2,[ebx + 28*4]		;mm2=[wb29:wb28]
		pi2fd	mm3,[ebx + 28*4+8]		;mm3=[wb31:wb30]
;		pfmul	mm0,mm7
;		pfmul	mm1,mm7
;		pfmul	mm2,mm7
;		pfmul	mm3,mm7
		punpckldq	mm4,mm0
		punpckldq	mm5,mm1
		punpckldq	mm6,mm2
		punpckldq	mm7,mm3
		punpckhdq	mm0,mm4				;mm0=[wb24*R:wb25*R]
		punpckhdq	mm1,mm5				;mm1=[wb26*R:wb27*R]
		punpckhdq	mm2,mm6				;mm2=[wb28*R:wb29R]
		punpckhdq	mm3,mm7				;mm3=[wb30*R:wb31*R]
		movq	[edx+28*4-24*4+8],mm0
		movq	[edx+28*4-24*4],mm1
		movq	[edx+28*4-28*4+8],mm2
		movq	[edx+28*4-28*4],mm3

		add		ebx,32*F_SIZE
		mov		[saveOff],ebx
;;
		xor		ecx,ecx
		mov		ebx,HAN_SIZE-1		;code size k̂
		mov		edx,enwindow
		jmp		short .lp1

		align	16
.lp1:							;{
		lea		eax,[ecx+ebp]			;eax=i+offset
		and		eax,ebx

		movq	mm0,[edx+ 0*4]
		movq	mm1,[edx+ 0*4+8]
		movq	mm2,[edx+ 4*4]
		movq	mm3,[edx+ 4*4+8]
		pfmul	mm0,[edi+eax*4]
		pfmul	mm1,[edi+eax*4+8]
		add		eax,64
		and		eax,ebx
		pfmul	mm2,[edi+eax*4]
		pfmul	mm3,[edi+eax*4+8]
		add		eax,64
		and		eax,ebx
		movq	mm4,[edx+ 8*4]
		movq	mm5,[edx+ 8*4+8]
		pfadd	mm0,mm2
		pfadd	mm1,mm3

		movq	mm6,[edx+ 12*4]
		movq	mm7,[edx+ 12*4+8]
		pfmul	mm4,[edi+eax*4]
		pfmul	mm5,[edi+eax*4+8]
		add		eax,64
		and		eax,ebx
		pfadd	mm0,mm4
		pfadd	mm1,mm5
		pfmul	mm6,[edi+eax*4]
		pfmul	mm7,[edi+eax*4+8]
		add		eax,64
		and		eax,ebx
		movq	mm2,[edx+ 16*4]
		movq	mm3,[edx+ 16*4+8]
		pfadd	mm0,mm6
		pfadd	mm1,mm7

		movq	mm4,[edx+ 20*4]
		movq	mm5,[edx+ 20*4+8]
		pfmul	mm2,[edi+eax*4]
		pfmul	mm3,[edi+eax*4+8]
		add		eax,64
		and		eax,ebx
		pfadd	mm0,mm2
		pfadd	mm1,mm3
		pfmul	mm4,[edi+eax*4]
		pfmul	mm5,[edi+eax*4+8]
		add		eax,64
		and		eax,ebx
		movq	mm6,[edx+ 24*4]
		movq	mm7,[edx+ 24*4+8]
		pfadd	mm0,mm4
		pfadd	mm1,mm5

		movq	mm2,[edx+ 28*4]
		movq	mm3,[edx+ 28*4+8]
		pfmul	mm6,[edi+eax*4]
		pfmul	mm7,[edi+eax*4+8]
		add		eax,64
		and		eax,ebx
		pfadd	mm0,mm6
		pfadd	mm1,mm7
		pfmul	mm2,[edi+eax*4]
		pfmul	mm3,[edi+eax*4+8]
		add		edx,32*F_SIZE
		pfadd	mm0,mm2
		pfadd	mm1,mm3

		add		ecx,4
		movq	[yyy+ecx*4-4*4],mm0
		movq	[yyy+ecx*4-2*4],mm1
		cmp		ecx,HAN_SIZE/8
		jb		near .lp1		;}

		add		ebp,480
		and		ebp,ebx

;	yprime[0] = yprime[16]
;       for( i=1; i<=16; i++ ) yprime[i] = y[i+16]+y[16-i];
;
;	AC~XAǎdȂȂ

		lea		eax,[yyy+0x80]	; R[hTCYk
;%define Y eax-0x80	;for my priprocessor
		
		movq	mm0,[eax-128+ 0*4]
		movq	mm1,[eax-128+ 2*4]
		movq	mm2,[eax-128+ 4*4]
		movq	mm3,[eax-128+ 6*4]
		punpckldq	mm4,mm0
		punpckldq	mm5,mm1
		punpckldq	mm6,mm2
		punpckldq	mm7,mm3
		punpckhdq	mm0,mm4			;mm0=[ 0: 1]
		punpckhdq	mm1,mm5			;mm1=[ 2: 3]
		punpckhdq	mm2,mm6			;mm2=[ 4: 5]
		punpckhdq	mm3,mm7			;mm3=[ 6: 7]
		movq	mm4,[eax-128+ 8*4]
		movq	mm5,[eax-128+10*4]
		movq	mm6,[eax-128+12*4]
		movq	mm7,[eax-128+14*4]
		mov		ebx,[eax-128+16*4]		;ebx=[16]
		pfadd	mm0,[eax-128+31*4]		;mm0+=[32:31]
		pfadd	mm1,[eax-128+29*4]		;mm1+=[30:29]
		pfadd	mm2,[eax-128+27*4]		;mm2+=[28:27]
		pfadd	mm3,[eax-128+25*4]		;mm3+=[26:25]
		movq	[eax-128+15*4],mm0
		movq	[eax-128+13*4],mm1
		movq	[eax-128+11*4],mm2
		movq	[eax-128+ 9*4],mm3

		punpckldq	mm0,mm4
		punpckldq	mm1,mm5
		punpckldq	mm2,mm6
		punpckldq	mm3,mm7
		punpckhdq	mm4,mm0			;mm4=[ 8: 9]
		punpckhdq	mm5,mm1			;mm5=[10:11]
		punpckhdq	mm6,mm2			;mm6=[12:13]
		punpckhdq	mm7,mm3			;mm7=[14:15]
		pfadd	mm4,[eax-128+23*4]		;mm4+=[24:23]
		pfadd	mm5,[eax-128+21*4]		;mm5+=[22:21]
		pfadd	mm6,[eax-128+19*4]		;mm6+=[20:19]
		pfadd	mm7,[eax-128+17*4]		;mm7+=[18:17]
		movq	[eax-128+ 7*4],mm4
		movq	[eax-128+ 5*4],mm5
		movq	[eax-128+ 3*4],mm6
		movq	[eax-128+ 1*4],mm7
		mov		[eax-128+ 0*4],ebx


		movq	mm0,[eax-128+49*4]
		movq	mm1,[eax-128+51*4]
		movq	mm2,[eax-128+53*4]
		movq	mm3,[eax-128+55*4]
		punpckldq	mm4,mm0
		punpckldq	mm5,mm1
		punpckldq	mm6,mm2
		punpckldq	mm7,mm3
		punpckhdq	mm0,mm4			;mm0=[49:50]
		punpckhdq	mm1,mm5			;mm1=[51:52]
		punpckhdq	mm2,mm6			;mm2=[53:54]
		punpckhdq	mm3,mm7			;mm3=[55:56]
		pfsubr	mm0,[eax-128+46*4]		;mm0=[47:46]-mm0
		pfsubr	mm1,[eax-128+44*4]		;mm1=[45:44]-mm1
		pfsubr	mm2,[eax-128+42*4]		;mm2=[43:42]-mm2
		pfsubr	mm3,[eax-128+40*4]		;mm3=[41:40]-mm3
		movq	mm4,[eax-128+57*4]
		movq	mm5,[eax-128+59*4]
		movq	mm6,[eax-128+61*4]
		movq	mm7,[eax-128+63*4]
		movq	[eax-128+30*4],mm0
		movq	[eax-128+28*4],mm1
		movq	[eax-128+26*4],mm2
		movq	[eax-128+24*4],mm3

		punpckldq	mm0,mm4
		punpckldq	mm1,mm5
		punpckldq	mm2,mm6
		punpckhdq	mm4,mm0			;mm4=[57:58]
		punpckhdq	mm5,mm1			;mm5=[59:60]
		punpckhdq	mm6,mm2			;mm6=[61:62]
									;mm7=[63]
		pfsubr	mm4,[eax-128+38*4]		;mm4=[39:38]-mm4
		pfsubr	mm5,[eax-128+36*4]		;mm5=[37:36]-mm5
		pfsubr	mm6,[eax-128+34*4]		;mm6=[35:34]-mm6
		pfsubr	mm7,[eax-128+33*4]		;mm7=[33]-mm7
		movq	[eax-128+22*4],mm4
		movq	[eax-128+20*4],mm5
		movq	[eax-128+18*4],mm6
		movd	[eax-128+17*4],mm7

	;-----------------------------------------------
;%undef Y

;       for( i=0; i<16; i++ ){
;               s0 = s1 = 0.0;
;               for( j=0; j<32; j+=2 ){
;                       s0 += (*m)[i+0][j  ]*yprime[j+0];
;                       s1 += (*m)[i+0][j+1]*yprime[j+1];
;               }
;               xout[i+ 0] = s0+s1;
;               xout[31-i] = s0-s1;
;       }

		mov		ebx,[s_ptr]
		mov		ecx,16
		lea		edx,[idct_coefficient+128]	;+128̓R[hTCYk̂
		lea		eax,[yyy+128]
		jmp		.lp2
		align	16
.lp2:							;{
		movq	mm0,[edx-128+ 0*4]
		movq	mm2,[eax-128+ 0*4]
		movq	mm1,[edx-128+ 2*4]
		movq	mm3,[eax-128+ 2*4]
		pfmul	mm0,mm2
		pfmul	mm1,mm3
		pfmul	mm2,[edx+ 0*4]
		pfmul	mm3,[edx+ 2*4]

		movq	mm4,[edx-128+ 4*4]
		movq	mm6,[eax-128+ 4*4]
		movq	mm5,[edx-128+ 6*4]
		movq	mm7,[eax-128+ 6*4]
		pfmul	mm4,mm6
		pfmul	mm5,mm7
		pfmul	mm6,[edx+ 4*4]
		pfmul	mm7,[edx+ 6*4]
		pfadd	mm0,mm4
		pfadd	mm1,mm5
		pfadd	mm2,mm6
		pfadd	mm3,mm7

		movq	mm4,[edx-128+ 8*4]
		movq	mm6,[eax-128+ 8*4]
		movq	mm5,[edx-128+10*4]
		movq	mm7,[eax-128+10*4]
		pfmul	mm4,mm6
		pfmul	mm5,mm7
		pfmul	mm6,[edx+ 8*4]
		pfmul	mm7,[edx+10*4]
		pfadd	mm0,mm4
		pfadd	mm1,mm5
		pfadd	mm2,mm6
		pfadd	mm3,mm7

		movq	mm4,[edx-128+12*4]
		movq	mm6,[eax-128+12*4]
		movq	mm5,[edx-128+14*4]
		movq	mm7,[eax-128+14*4]
		pfmul	mm4,mm6
		pfmul	mm5,mm7
		pfmul	mm6,[edx+12*4]
		pfmul	mm7,[edx+14*4]
		pfadd	mm0,mm4
		pfadd	mm1,mm5
		pfadd	mm2,mm6
		pfadd	mm3,mm7

		movq	mm4,[edx-128+16*4]
		movq	mm6,[eax-128+16*4]
		movq	mm5,[edx-128+18*4]
		movq	mm7,[eax-128+18*4]
		pfmul	mm4,mm6
		pfmul	mm5,mm7
		pfmul	mm6,[edx+16*4]
		pfmul	mm7,[edx+18*4]
		pfadd	mm0,mm4
		pfadd	mm1,mm5
		pfadd	mm2,mm6
		pfadd	mm3,mm7

		movq	mm4,[edx-128+20*4]
		movq	mm6,[eax-128+20*4]
		movq	mm5,[edx-128+22*4]
		movq	mm7,[eax-128+22*4]
		pfmul	mm4,mm6
		pfmul	mm5,mm7
		pfmul	mm6,[edx+20*4]
		pfmul	mm7,[edx+22*4]
		pfadd	mm0,mm4
		pfadd	mm1,mm5
		pfadd	mm2,mm6
		pfadd	mm3,mm7

		movq	mm4,[edx-128+24*4]
		movq	mm6,[eax-128+24*4]
		movq	mm5,[edx-128+26*4]
		movq	mm7,[eax-128+26*4]
		pfmul	mm4,mm6
		pfmul	mm5,mm7
		pfmul	mm6,[edx+24*4]
		pfmul	mm7,[edx+26*4]
		pfadd	mm0,mm4
		pfadd	mm1,mm5
		pfadd	mm2,mm6
		pfadd	mm3,mm7

		movq	mm4,[edx-128+28*4]
		movq	mm6,[eax-128+28*4]
		movq	mm5,[edx-128+30*4]
		movq	mm7,[eax-128+30*4]
		pfmul	mm4,mm6
		pfmul	mm5,mm7
		pfmul	mm6,[edx+28*4]
		pfmul	mm7,[edx+30*4]
		pfadd	mm0,mm4
		pfadd	mm1,mm5
		pfadd	mm2,mm6
		pfadd	mm3,mm7

		pfadd	mm0, mm1
		pfadd	mm2, mm3
		add		edx,256

;               xout[i+ 0] = s0+s1;
;               xout[31-i] = s0-s1;
		movq	mm4,mm0			;[s1:s0]
		movq	mm5,mm0			;[s1:s0]
		pfacc	mm0,mm2			;[s0+s1]
		movq	mm6,mm2
		movq	mm7,mm2
		add		ebx,8
		sub		ecx,2
		punpckhdq	mm4,mm4		;[s1]
		punpckhdq	mm6,mm6
		movq	[ebx-8],mm0		;s0+s1
		pfsub	mm5,mm4			;[s0-s1]
		pfsub	mm7,mm6
		movd	[ebx+ecx*8+4],mm5
		movd	[ebx+ecx*8],mm7
		jnz		near .lp2		;}

		add		ebx,16*4
		dec		esi
		mov		[s_ptr],ebx
		jnz		near .LOOP

; free area for yyy[64]
		mov		esp,[esp+LOCAL_STACK]
%undef yyy

;
		test	dword [esp+_P+8],1
		mov		edx,sbd_off
		jz		short .F2
		add		edx,4
.F2:
		mov		[edx],ebp

.exit:
		femms
		pop		ebp
		pop		edi
		pop		esi
		pop		ebx
		ret
%endif
