/****************************************************************************
 *                                                                          *
 * NDSAUDIO                                                                 *
 *                                                                          *
 * Copyright (c) 2009, Mukunda Johnson (mukunda@mukunda.com)                *
 *                                                                          *
 * Permission to use, copy, modify, and/or distribute this software for any *
 * purpose with or without fee is hereby granted, provided that the above   *
 * copyright notice and this permission notice appear in all copies.        *
 *                                                                          *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES *
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF         *
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR  *
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES   *
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN    *
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF  *
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.           *
 *--------------------------------------------------------------------------*
 ****************************************************************************/

	.global naCopyStreamData
	.global naZeroFillStream

//----------------------------------------------------------------------------
// Flag bits held in the SI_FORMAT byte
//----------------------------------------------------------------------------
.equ	NA_SFB_2CHN,	(1 << 0)	// not tested anywhere in this file
.equ	NA_SFB_2SRC,	(1 << 1)	// set: stereo unpack / zero-fill path is taken
.equ	NA_SFB_16BIT,	(1 << 2)	// set: 16-bit path is taken, otherwise 8-bit

//----------------------------------------------------------------------------
// Byte offsets into the stream instance passed in r0.
// The number in each comment is the offset the field resolves to.
// Fields marked (*) are the only ones this file reads or writes.
//----------------------------------------------------------------------------
.struct 0
	SI_CALLBACK:	.space 4	//  0
	SI_CHLEFT:	.space 1	//  4
	SI_CHRIGHT:	.space 1	//  5
	SI_FORMAT:	.space 1	//  6 (*) NA_SFB_* flag bits
	SI_AUTOFILL:	.space 1	//  7
	SI_CLOSING:	.space 1	//  8
	SI_BUSY:	.space 1	//  9
	SI_CLS:		.space 1	// 10
	SI_RS2:		.space 1	// 11
	SI_RS3:		.space 1	// 12
	SI_RS4:		.space 1	// 13
	SI_PREVTIMER:	.space 2	// 14
	SI_BUFFERLEN:	.space 4	// 16 (*) output buffer length, in samples
	SI_BUFFERBYTES:	.space 4	// 20
	SI_SAMPLESFREE:	.space 4	// 24 (*) reduced by every copy / zero fill
	SI_CYCLEMOD:	.space 4	// 28
	SI_PERIOD:	.space 4	// 32
	SI_OUTPOS:	.space 4	// 36 (*) output position, in samples
	SI_CLOSINGSAMP:	.space 4	// 40
	SI_LIFETIME:	.space 4	// 44
	SI_CPUSTAT1:	.space 4	// 48
	SI_CPUSTAT2:	.space 4	// 52
	SI_OUTBUFFER:	.space 4	// 56 (*) pointer to the output buffer
	SI_WORKBUFFER:	.space 4	// 60 (*) pointer to the work (source) buffer
	SI_USERDATA:	.space 4	// 64
	SI_LINK:	.space 4	// 68

// Code follows: ARM (32-bit) instructions in the text section.
.text
.arm
.align

/****************************************************************************
 * naCopyStreamData( audioStreamInstance *si, int length )
 *
 * Copy data from workbuffer into stream output.
 *
 * In:   r0 = si
 *       r1 = length in samples; must be a multiple of 4.
 *            A length <= 0 is treated as "nothing to do" and returns
 *            immediately without touching the stream.
 *
 * The copy is split into chunks so that no chunk crosses the end of the
 * output buffer; after a chunk that reaches the end, writing continues at
 * position 0. Each chunk is handed to unpack_mono or unpack_stereo,
 * selected by the NA_SFB_2SRC bit of SI_FORMAT.
 *
 * Exits through increment_position_thing, which pops the frame pushed here
 * and updates SI_OUTPOS and SI_SAMPLESFREE by the original length.
 ****************************************************************************/
naCopyStreamData:
	
	cmp	r1, #0				// catch length<=0: a negative count would
	bxle	lr				// never satisfy the unpack loops' exit test
	push	{r0,r1, r4-r11,lr}		// r0,r1 are popped first by the shared exit
//---------------------------------------------------------------------------
	ldr	r4, [r0, #SI_WORKBUFFER]	// r4 = workbuffer (advanced by unpack routines)
	ldr	r5, [r0, #SI_BUFFERLEN]		// r5 = buffer length (multiple of 16)
	ldr	r6, [r0, #SI_OUTPOS]		// r6 = output position
	mov	r7, r1				// r7 = samples still to copy
	ldrb	r8, [r0, #SI_FORMAT]		// r8 = format flags
	ldr	r9, [r0, #SI_OUTBUFFER]		// r9 = outbuffer
//---------------------------------------------------------------------------
copy_main:
	mov	r1, r6				// r1 = position this chunk is written at
	sub	r2, r5, r6			// r2 = samples until buffer end
	mov	r0, r7				// r0 = chunk length (sublength)
	cmp	r0, r2				// if sublength >= remaining:
	movge	r0, r2				//   sublength = remaining
	movge	r6, #0				//   next position = 0 (wrap)
	addlt	r6, r0				// else:
						//   next position += sublength
	sub	r7, r0				// samples left -= sublength
	
	push	{r5-r9}				// the unpack routines are free to trash these
	tst	r8, #NA_SFB_2SRC		// two source channels?
	bne	.Lcopy_stereo
	bl	unpack_mono
	b	.Lcopy_chunk_done
.Lcopy_stereo:
	bl	unpack_stereo
.Lcopy_chunk_done:
	pop	{r5-r9}
	cmp	r7, #0				// more samples to copy?
	bne	copy_main
	
	b	increment_position_thing	// increment position and exit
	
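// unpack routines (unpack_mono / unpack_stereo), register contract:
//
// r0 = length in samples (multiple of 4, greater than zero)
// r1 = output position in samples
// r4 = workbuffer read pointer (the routine advances it past what it consumed)
// r5 = buffer length in samples (offset to the second buffer for stereo)
// r8 = format flags (NA_SFB_16BIT selects the 16-bit variant)
// r9 = outbuffer
// r0-r3 and r5-r12 may be clobbered; the caller saves the ones it still needs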
	
//------------------------------------------------------------------------------
unpack_mono:
//------------------------------------------------------------------------------
// Mono data is copied to the output unchanged; pick the variant by sample
// width. In: r0 = samples, r1 = position, r4 = source, r8 = format flags,
// r9 = outbuffer.
	tst	r8, #NA_SFB_16BIT
	bne	unpack_16bit_mono		// flag set: 2 bytes per sample
						// flag clear: fall into the 8-bit copy
//------------------------------------------------------------------------------
unpack_8bit_mono:
//------------------------------------------------------------------------------
	add	r1, r9, r1			// r1 = outbuffer + position (1 byte/sample)
	subs	r0, r0, #16			// bias count by -16; negative means
	bmi	.Lu8m_tail			// there is no full 16-sample group
.Lu8m_bulk:
	ldmia	r4!, {r2,r3,r5,r6}		// 16 bytes = 16 samples per pass
	stmia	r1!, {r2,r3,r5,r6}
	subs	r0, r0, #16
	bpl	.Lu8m_bulk			// loop while a full group remained
.Lu8m_tail:
	adds	r0, r0, #16			// undo the bias: r0 = samples left (0..12)
	bxeq	lr				// nothing left
.Lu8m_word:
	ldmia	r4!, {r2}			// one word = 4 samples per pass
	stmia	r1!, {r2}
	subs	r0, r0, #4
	bne	.Lu8m_word
	bx	lr
//------------------------------------------------------------------------------
unpack_16bit_mono:
//------------------------------------------------------------------------------
	add	r1, r9, r1, lsl #1		// r1 = outbuffer + position*2 (2 bytes/sample)
	subs	r0, r0, #16			// same biased-count scheme as the 8-bit copy
	bmi	.Lu16m_tail
.Lu16m_bulk:
	ldmia	r4!, {r2,r3,r5,r6,r7,r8,r9,r10}	// 32 bytes = 16 samples per pass
	stmia	r1!, {r2,r3,r5,r6,r7,r8,r9,r10}	// (r8/r9 are no longer needed here)
	subs	r0, r0, #16
	bpl	.Lu16m_bulk
.Lu16m_tail:
	adds	r0, r0, #16			// r0 = samples left (0..12)
	bxeq	lr
.Lu16m_pair:
	ldmia	r4!, {r2,r3}			// 8 bytes = 4 samples per pass
	stmia	r1!, {r2,r3}
	subs	r0, r0, #4
	bne	.Lu16m_pair
	bx	lr
//------------------------------------------------------------------------------
unpack_stereo:
//------------------------------------------------------------------------------
// Split interleaved L/R source data into two output buffers: left samples go
// to outbuffer + position, right samples one buffer length further on.
// In: r0 = samples (multiple of 4, > 0), r1 = position, r4 = source,
// r5 = buffer length in samples, r8 = format flags, r9 = outbuffer.
	tst	r8, #NA_SFB_16BIT
	bne	unpack_16bit_stereo		// flag set: 16-bit variant
						// flag clear: fall into the 8-bit variant
//------------------------------------------------------------------------------
unpack_8bit_stereo:
//------------------------------------------------------------------------------
// Byte diagrams below show a word most-significant byte first.

	ldr	r8, =0xFF00FF00			// r8 = mask keeping bytes 3 and 1
	add	r1, r9, r1			// r1 = left output pointer
.Lu8s_next4:
	ldmia	r4!, {r2,r3}			// r2 = R1L1R0L0
						// r3 = R3L3R2L2

	// gather the four left samples into r12
	and	r9, r8, r2, lsl #8		// r9  = L1--L0--
	orr	r9, r9, r9, lsl #8		// r9  = L1L0L0--
	and	r10, r8, r3, lsl #8		// r10 = L3--L2--
	orr	r10, r10, r10, lsl #8		// r10 = L3L2L2--
	bic	r10, r10, #0xFF00		// r10 = L3L2----
	orr	r12, r10, r9, lsr #16		// r12 = L3L2L1L0

	// gather the four right samples into r11
	and	r9, r8, r2			// r9  = R1--R0--
	orr	r9, r9, r9, lsl #8		// r9  = R1R0R0--
	and	r10, r8, r3			// r10 = R3--R2--
	orr	r10, r10, r10, lsl #8		// r10 = R3R2R2--
	bic	r10, r10, #0xFF00		// r10 = R3R2----
	orr	r11, r10, r9, lsr #16		// r11 = R3R2R1R0

	str	r11, [r1, r5]			// right buffer: r5 bytes past the left slot
	str	r12, [r1], #4			// left buffer, then advance 4 bytes

	subs	r0, r0, #4			// four samples done
	bne	.Lu8s_next4
	bx	lr

//------------------------------------------------------------------------------
unpack_16bit_stereo:
//------------------------------------------------------------------------------
// Halfword diagrams below show a word most-significant halfword first.
// NOTE(review): strd is not available before ARMv5TE and may require a
// doubleword-aligned address - confirm target core and buffer alignment.

	mov	r5, r5, lsl #1			// r5 = buffer length in bytes
	add	r1, r9, r1, lsl #1		// r1 = left output pointer
	ldr	r12, =0xFFFF0000		// r12 = upper-halfword mask

.Lu16s_next4:
	ldmia	r4!, {r2,r3,r6,r7}		// r2,r3,r6,r7 = R0L0 R1L1 R2L2 R3L3
	and	r8, r3, r12			// r8  = R1--
	orr	r8, r8, r2, lsr #16		// r8  = R1R0
	and	r9, r7, r12			// r9  = R3--
	orr	r9, r9, r6, lsr #16		// r9  = R3R2
	and	r10, r2, r12, lsr #16		// r10 = --L0
	orr	r10, r10, r3, lsl #16		// r10 = L1L0
	and	r11, r6, r12, lsr #16		// r11 = --L2
	orr	r11, r11, r7, lsl #16		// r11 = L3L2
	strd	r8, [r1, r5]			// r8,r9   -> right buffer (4 samples)
	strd	r10, [r1], #8			// r10,r11 -> left buffer, advance 8 bytes

	subs	r0, r0, #4			// four samples done
	bne	.Lu16s_next4
	bx	lr

/*******************************************************************************
 * naZeroFillStream( audioStreamInstance *si, int length )
 *
 * Fill output buffer with a number of zero samples
 *
 * In:   r0 = si
 *       r1 = length in samples; must be a multiple of 4.
 *            A length <= 0 is treated as "nothing to do" and returns
 *            immediately without touching the stream.
 *
 * Writing starts at SI_OUTPOS and wraps to the start of the buffer when the
 * end is reached. When NA_SFB_2SRC is set, the second buffer that follows
 * the first one is zeroed at the same positions.
 *
 * Falls through into the shared exit (zfill_exit / increment_position_thing),
 * which pops the frame pushed here and updates SI_OUTPOS and SI_SAMPLESFREE.
 *******************************************************************************/
naZeroFillStream:
	
	cmp	r1, #0				// catch length<=0: a negative count would
	bxle	lr				// never hit the loops' "count == 0" exit
	
	push	{r0,r1,r4-r11,lr}		// same frame layout the shared exit pops
	
	ldr	r3, [r0, #SI_OUTPOS]		// r3 = position
	ldr	r4, [r0, #SI_OUTBUFFER]		// r4 = outbuffer
	ldr	r5, [r0, #SI_BUFFERLEN]		// r5 = buffer length
	
	ldrb	r2, [r0, #SI_FORMAT]		// convert sample units to byte units
	tst	r2, #NA_SFB_16BIT		// (test 16bit mode and shift)
	movne	r1, r1, lsl#1			// 
	movne	r3, r3, lsl#1			//
	movne	r5, r5, lsl#1			//
	
	add	r5, r4				// r5 = end of left buffer (= start of right)
	add	r4, r3				// r4 = left output ptr
	add	r6, r5, r3			// r6 = right output ptr
	
	mov	r7, #0				// r7 = the zero word that gets stored
	
	tst	r2, #NA_SFB_2SRC
	bne	zfill_stereo
zfill_mono:
	stmia	r4!, {r7}			// clear 4 bytes
	cmp	r4, r5				// reached the end? (addresses: unsigned)
	ldrhs	r4, [r0, #SI_OUTBUFFER]		//   wrap to buffer start
	subs	r1, #4				// 4 bytes done
	bne	zfill_mono
	b	zfill_exit
zfill_stereo:
	stmia	r4!, {r7}			// clear 4 bytes, left
	stmia	r6!, {r7}			// clear 4 bytes, right
	cmp	r4, r5				// reached the end? (addresses: unsigned)
	ldrhs	r4, [r0, #SI_OUTBUFFER]		//   wrap left ptr to buffer start
	movhs	r6, r5				//   wrap right ptr to start of right buffer
	subs	r1, #4				// 4 bytes done
	bne	zfill_stereo
	
zfill_exit:

// Shared epilogue of naCopyStreamData and naZeroFillStream.
//
// Expects the frame both entry points push: {r0,r1,r4-r11,lr}, i.e. the
// original si and length arguments on top of the saved registers.
//
//   SI_OUTPOS      = (SI_OUTPOS + length), reduced into the buffer length
//   SI_SAMPLESFREE = SI_SAMPLESFREE - length
//
// Returns to the original caller by loading the saved lr into pc.

increment_position_thing:
	
	ldmfd	sp!, {r0, r1}			// r0 = si, r1 = length as passed in
	ldr	r3, [r0, #SI_BUFFERLEN]		// r3 = buffer length
	ldr	r2, [r0, #SI_OUTPOS]		// r2 = old position
	add	r2, r2, r1			// r2 = old position + length
.Lwrap_outpos:
	cmp	r2, r3				// while position >= buffer length:
	subge	r2, r2, r3			//   position -= buffer length
	bge	.Lwrap_outpos			//   (flags still from the cmp)
	str	r2, [r0, #SI_OUTPOS]		// store wrapped position
	
	ldr	r2, [r0, #SI_SAMPLESFREE]	// samples free -= length
	sub	r2, r2, r1
	str	r2, [r0, #SI_SAMPLESFREE]
	
	ldmfd	sp!, {r4-r11, pc}		// restore saved registers and return