	SECTION .text
	align 16
global	libecc_shiftwindow_xorassign:function (libecc_shiftwindow_xorassign_end - libecc_shiftwindow_xorassign)

; libecc_shiftwindow_xorassign(uint32_t* bitset1, uint32_t const* bitset2, int32_t lsb, int32_t msb, int32_t shift)
;
; Example (with digits of only 8 bits):
; bitset1 : [00001111][11111111][10000000]
; bitset2 : [00000011][11111111][00000000]
; shifted : [00000001][11111111][10000000]	(line up least significant bit with bitset1)
; result  : [00001110][00000000][00000000]	(XOR with bitset1, written to bitset1)

; Inputs
%define Mbitset1 [esp+n4+36]	; Pointer to the first digit of bitset1.
%define Mbitset2 [esp+n4+40]	; Pointer to the first digit of bitset2.
%define Mlsb	 [esp+n4+44]	; The index of the least significant bit of the window.
%define Mmsb	 [esp+n4+48]	; The index of the most significant bit of the window.
%define Mshift	 [esp+n4+52]	; The distance that the window needs to be shifted to the right.
;
; Register assignment:
%define src esi		; Current source address.
%define dst edi		; Current destination address.
%define count edx	; Input digits counter.
%define bitshift ecx	; Bit shift.
%define digit ebx	; Current input digit.
%define store eax	; Temporary storage for (previous) input digit.
%define mask ebp	; Mask for first and last input digit.
;
; The right most digit that needs to be read is		(lsb / 32)
; The left most digit that needs to be read is		(msb / 32)
; The total number of digits that need to be read is	((msb / 32) - (lsb / 32)) + 1
; The right bit shift is				((lsb % 32) - ((lsb - shift) % 32))
; if that is negative, then we need a left shift of	(((lsb - shift) % 32) - (lsb % 32))
;
; Local variables.
%define Mxdb [esp]		;  4 * ((lsb - shift) / 32), the offset in bytes to the first byte
				;  of the least significant digit of bitset1.
%assign n 1			; Number of local variables.
%assign n4 4*n

libecc_shiftwindow_xorassign:

	pushad
	sub esp, n4
                       	        ;  abcdD
	mov eax, Mlsb		;  1
        mov ebx, Mlsb		;   1
	mov edi, Mlsb		;      1
	sub eax, Mshift		;  2
	sub ebx, Mshift		;   2
	and edi, 31		;      2
	and eax, 0xffffffe0	;  3
	and ebx, 31		;   3
	mov ecx, edi		;    1 3	; ecx = (lsb % 32)
	sar eax, 3		;  4
	mov edx, Mlsb		;     1		; edx = lsb
	sub edi, ebx		;   4  4	; edi = ((lsb % 32) - ((lsb - shift) % 32))
	mov Mxdb, eax		;  5
	js shiftwindow_left
	jmp shiftwindow_right

; Shift to the left
;
; The first digit to read is				(bitset2) + 4 * (lsb / 32)
; The first digit to write is				(bitset1) + 4 * ((lsb - shift) / 32)
; The mask for the first digit is			(0xffffffff << (lsb % 32))
; The mask for the last digit is			~(0xfffffffe << (msb % 32))

shiftwindow_left:

	; Calculate bit shift, but keep it in edi.
	neg edi			; edi = (((lsb - shift) % 32) - (lsb % 32))

	; Calculate first mask (put into eax).
	mov eax, 0xffffffff
	shl eax, cl		; eax = (0xffffffff << (lsb % 32))

	; Calculate *src*, address of first digit.
	and edx, 0xffffffe0	; edx invalidated
	mov src, Mbitset2
	sar edx, 3
	add src, edx		; esi = src = (bitset2) + 4 * (lsb / 32)

	; Calculate *digit*, first (masked) digit.
	mov digit, [src]	; ebx = digit
	and digit, eax

	; Calculate *mask*.
	mov ecx, Mmsb		; ecx = msb
	mov eax, 0xfffffffe	; eax invalidated
	mov mask, 0xffffffff
	shl eax, cl		; shl automagically does the % 32
	xor mask, eax		; ebp = mask = ~(0xfffffffe << (msb % 32))

	; Calculate *count*, the total number of input digits minus one.
	sar ecx, 5		; ecx = msb / 32
	mov eax, Mlsb
	mov count, ecx
	sar eax, 5		; eax = (lsb / 32)
	sub count, eax		; edx = count = ((msb / 32) - (lsb / 32))

	; Put the bit shift in *bitshift*.
	mov bitshift, edi	; ecx = bitshift

	; Calculate *dst*, address of first digit to write to.
	mov dst, Mbitset1
	add dst, Mxdb		; edi = dst = bitset1 + 4 * ((lsb - shift) / 32)

	; Is the first digit also the last digit?
	test count, count
	mov store, count	; Fastest way to assure that store contains zero in case we jump to .lastdigit.
	jz .lastdigit

	; Process the first digit.
	mov store, digit	; eax = store = digit
	shl digit, cl
	xor [dst], digit

.nextdigit:
	; Retrieve the next digit.
	add src, 4
	add dst, 4
	mov digit, [src]

	; Is this the last digit?
	dec count
	jz .lastdigit

	; Process the next digit.
	shld digit, store, cl
	mov store, [src]
	xor [dst], digit
	jmp .nextdigit

.lastdigit:
	; Process the last digit.
	and digit, mask
	shld digit, store, cl
	xor [dst], digit

	; There is possibly one more output digit.
	mov eax, [src]		; store invalidated
	test ecx, ecx		; Set zero flag if bitshift is zero, shld does not affect the flags then.
	shld edx, eax, cl	; Use the fact that edx = count = 0
	jz .exit		; It is possible that [dst+4] is not accessible in this case.
	xor [dst+4], edx

.exit:
	add esp, n4
	popad
	ret

; Shift to the right
;
; The first digit to read is		(bitset2) + 4 * (lsb / 32) + 4 * ((msb / 32) - (lsb / 32))
; The first digit to write is		(bitset1) + 4 * ((lsb - shift) / 32) + 4 * ((msb / 32) - (lsb / 32))
; The mask for the first digit is	~(0xfffffffe << (msb % 32))
; The mask for the last digit is	(0xffffffff << (lsb % 32))

	ALIGN 32

shiftwindow_right:
	; ecx = (lsb % 32)
	; edx = lsb
	; edi = ((lsb % 32) - ((lsb - shift) % 32))

	; Calculate *mask*.
	mov mask, 0xffffffff
	xor eax, eax		; eax = 0
	shl mask, cl		; ebp = mask = (0xffffffff << (lsb % 32))

	; Calculate first mask (put into eax).
	mov ebx, 0xfffffffe
	mov ecx, Mmsb		; ecx = msb
	dec eax			; eax = 0xffffffff
	shl ebx, cl		; shl automagically does the % 32
	xor eax, ebx		; eax = ~(0xfffffffe << (msb % 32))

	; Calculate *src*, address of first digit.
	and ecx, 0xffffffe0	; ecx = 32 * (msb / 32)
	and edx, 0xffffffe0	; edx = 32 * (lsb / 32)
	mov src, Mbitset2
	sub ecx, edx		; ecx = 32 * ((msb / 32) - (lsb / 32))
	mov ebx, edx		; ebx = 32 * (lsb / 32)
	sar ecx, 3		; ecx = 4 * ((msb / 32) - (lsb / 32))
	sar ebx, 3		; ebx = 4 * (lsb / 32)
	add src, ecx
	add src, ebx		; esi = src = (bitset2) + 4 * (lsb / 32) + 4 * ((msb / 32) - (lsb / 32))

	; Calculate *digit*, first (masked) digit.
	mov digit, [src]	; ebx = digit
	mov edx, ecx		; edx = 4 * ((msb / 32) - (lsb / 32))
	and digit, eax

	; Put the bit shift in *bitshift*.
	mov bitshift, edi	; ecx = bitshift

	; Calculate *dst*, address of first digit to write to.
	mov dst, Mbitset1
	add dst, Mxdb
	add dst, edx		; edi = dst = (bitset1) + 4 * ((lsb - shift) / 32) + 4 * ((msb / 32) - (lsb / 32))

	; Calculate *count*, the total number of input digits minus one.
	shr count, 2		; edx = count = ((msb / 32) - (lsb / 32))

	; Is the first digit also the last digit?
	mov store, count	; Fastest way to assure that store contains zero in case we jump to .lastdigit.
	jz .lastdigit

	; Process the first digit.
	mov store, digit	; eax = store = digit
	sub src, 4		; Decrementing src already here gives better performance.
	shr digit, cl
	jz .seconddigit		; If the result is zero then we might not be able to access [dst]
				; If cl is zero then the shr did not affect the flags and ZF is still unset,
				; so we will not jump.  In that case however, [dst] will always be accessible.
	xor [dst], digit

	; Retrieve the next digit.
	sub dst, 4
	mov digit, [src]

	; Is this the last digit?
	dec count
	jz .lastdigit

.nextdigit:
	; Process the next digit.
	shrd digit, store, cl
	mov store, [src]
	xor [dst], digit

	; Retrieve the next digit.
	sub src, 4
.seconddigit:
	sub dst, 4
	dec count		; Already decrement count here.
	mov digit, [src]
	jnz .nextdigit

.lastdigit:
	; Process the last digit.
	and digit, mask
	shrd digit, store, cl
	xor [dst], digit

	; There is not another possibly output digit because
	; the right most non-zero digits have been aligned.
	add esp, n4
	popad
	ret

libecc_shiftwindow_xorassign_end:


	SECTION .text
	align 16
global	libecc_shift_xorassign:function (libecc_shift_xorassign_end - libecc_shift_xorassign)

; libecc_shift_xorassign(uint32_t* bitset1, uint32_t const* bitset2, int32_t lsb, int32_t msb, int32_t shift)
;
%define Mbitset1 [esp+36]	; Pointer to the first digit of bitset1.
%define Mbitset2 [esp+40]	; Pointer to the first digit of bitset2.
%define Mlsb	 [esp+44]	; The index of the least significant bit of bitset2.
%define Mmsb	 [esp+48]	; The index of the most significant bit of bitset2.
%define Mshift	 [esp+52]	; The distance that bitset2 needs to be shifted to the right.

libecc_shift_xorassign:

	pushad
				; abcdDS
	mov eax, Mlsb		; 1
	mov ecx, Mshift		;   1
	mov edx, Mmsb		;    1
	add eax, Mshift		; 2
	mov ebx, Mlsb		;  1
	and ecx, 31		;   2
	add edx, Mshift		;    2
	sar eax, 5		; 3
	add ebx, ecx		;  23
	sar edx, 5		;    3
	mov edi, Mbitset1	;     1
	and ebx, 0xffffffe0	;  3
	sub edx, eax		; 4  4
	mov esi, Mbitset2	;      1
	sar ebx, 3		;  4
	sal eax, 2		; 5
	add esi, ebx		;  5   2
	add edi, eax		; 6   2

	mov ebx, [esi + 4 * edx]
.loop
	mov eax, [esi + 4 * edx - 4]
	shld ebx, eax, cl
	xor [edi + 4 * edx], ebx
	mov ebx, eax
	dec edx
	jns .loop

	popad
	ret

libecc_shift_xorassign_end:
