From f18cfec99ce8caf8878a2d87acc69986e08bf788 Mon Sep 17 00:00:00 2001
From: bg <bg@b1d182e4-1ff8-0310-901f-bddb46175740>
Date: Sat, 12 Dec 2009 21:05:01 +0000
Subject: [PATCH] some improvements for BMW

---
 avr-asm-macros.S      |  26 +++++------
 bmw/bmw_small-asm.S   | 100 ++++++++++++++++++++++++++++++++++++------
 bmw/bmw_small-cstub.c |  80 +++++++--------------------------
 3 files changed, 116 insertions(+), 90 deletions(-)

diff --git a/avr-asm-macros.S b/avr-asm-macros.S
index 829562b..63f9303 100644
--- a/avr-asm-macros.S
+++ b/avr-asm-macros.S
@@ -17,15 +17,15 @@
     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-/* 
+/*
  * File:        avr-asm-macros.S
  * Author:      Daniel Otte
  * Date:        2008-08-13
  * License:     GPLv3 or later
  * Description: some macros which are quite usefull
- * 
+ *
  */
- 
+
 #include <avr/io.h>
 
 /*******************************************************************************
@@ -34,14 +34,14 @@
 
 .macro push_ p1:req, p2:vararg
 	push \p1
-.ifnb \p2	
+.ifnb \p2
 	push_ \p2
 .endif
 .endm
 
 .macro pop_ p1:req, p2:vararg
 	pop \p1
-.ifnb \p2	
+.ifnb \p2
 	pop_ \p2
 .endif
 .endm
@@ -50,13 +50,13 @@
 	push \from
 .if     \to-\from
 	push_range "(\from+1)",\to
-.endif		
+.endif
 .endm
 
 .macro pop_range from:req, to:req
 	pop \to
 .if     \to-\from
-	pop_range \from,"(\to-1)"	
+	pop_range \from,"(\to-1)"
 .endif
 .endm
 
@@ -64,7 +64,7 @@
 	in r0, _SFR_IO_ADDR(SREG)
 	in \reg1, _SFR_IO_ADDR(SPL)
 	in \reg2, _SFR_IO_ADDR(SPH)
-	sbiw \reg1, \size 
+	sbiw \reg1, \size
 	cli
 	out _SFR_IO_ADDR(SPH), \reg2
 	out _SFR_IO_ADDR(SREG), r0
@@ -75,7 +75,7 @@
 	in r0, _SFR_IO_ADDR(SREG)
 	in \reg1, _SFR_IO_ADDR(SPL)
 	in \reg2, _SFR_IO_ADDR(SPH)
-	adiw \reg1, \size 
+	adiw \reg1, \size
 	cli
 	out _SFR_IO_ADDR(SPH), \reg2
 	out _SFR_IO_ADDR(SREG), r0
@@ -88,7 +88,7 @@
 	in \reg1, _SFR_IO_ADDR(SPL)
 	in \reg2, _SFR_IO_ADDR(SPH)
 	subi \reg1, lo8(\size)
-	sbci \reg2, hi8(\size)	 
+	sbci \reg2, hi8(\size)
 	cli
 	out _SFR_IO_ADDR(SPH), \reg2
 	out _SFR_IO_ADDR(SREG), r0
@@ -100,7 +100,7 @@
 	in \reg1, _SFR_IO_ADDR(SPL)
 	in \reg2, _SFR_IO_ADDR(SPH)
 	adiw \reg1, 63
-	adiw \reg1, (\size-63) 
+	adiw \reg1, (\size-63)
 	cli
 	out _SFR_IO_ADDR(SPH), \reg2
 	out _SFR_IO_ADDR(SREG), r0
@@ -113,7 +113,7 @@
 	in \reg2, _SFR_IO_ADDR(SPH)
 	adiw \reg1, 63
 	adiw \reg1, 63
-	adiw \reg1, (\size-63*2) 
+	adiw \reg1, (\size-63*2)
 	cli
 	out _SFR_IO_ADDR(SPH), \reg2
 	out _SFR_IO_ADDR(SREG), r0
@@ -131,7 +131,7 @@
 	add \reg1, r16
 	adc \reg2, r17
 	pop r17
-	pop r16 
+	pop r16
 	cli
 	out _SFR_IO_ADDR(SPH), \reg2
 	out _SFR_IO_ADDR(SREG), r0
diff --git a/bmw/bmw_small-asm.S b/bmw/bmw_small-asm.S
index 62bd166..ef05e96 100644
--- a/bmw/bmw_small-asm.S
+++ b/bmw/bmw_small-asm.S
@@ -800,8 +800,8 @@ q0 =  6
 q1 =  7
 .global bmw_small_f1
 bmw_small_f1:
-	push_range 2, 7
-	push_range 28, 29
+;	push_range 2, 7
+;	push_range 28, 29
 	push r16
 	movw q0, r24
 	movw m0, r22
@@ -843,8 +843,8 @@ bmw_small_f1:
 	cpi r16, 16
 	brne 1b
 	pop r16
-	pop_range 28, 29
-	pop_range 2, 7
+;	pop_range 28, 29
+;	pop_range 2, 7
 	ret
 
 /*******************************************************************************
@@ -960,9 +960,9 @@ f0_jumptable:
 
 .global bmw_small_f0
 bmw_small_f0:
-	push_range 28, 29
-    push_range 4, 11
-    push_range 16, 17
+;	push_range 28, 29
+;    push_range 4, 11
+;    push_range 16, 17
     /* h[i] ^= m[i]; q[i]= 0 */
 	movw r26, h0 ; h
 	movw r30, m0 ; m
@@ -1105,9 +1105,9 @@ bmw_small_f0:
 	adc acc0, acc1
 	st Z+, acc0
 
-    pop_range 16, 17
-    pop_range 4, 11
-	pop_range 28, 29
+;   pop_range 16, 17
+;   pop_range 4, 11
+;	pop_range 28, 29
     ret
 
 /*******************************************************************************
@@ -1220,8 +1220,8 @@ bmw_small_f2:
 	st X+, r0
 	dec r18
 	brne 1b
-	push_range 28, 29
-	push_range  2, 17
+;	push_range 28, 29
+;	push_range  2, 17
 	movw q0, r22
 	movw h0, r24
 	/* calc xl */
@@ -1655,8 +1655,8 @@ bmw_small_f2:
 	rcall tshiftr
 	modify_h_2 5
 bmw_small_f2_exit:
-	pop_range  2, 17
-	pop_range 28, 29
+;	pop_range  2, 17
+;	pop_range 28, 29
 	ret
 
 cli_putb:
@@ -1695,3 +1695,75 @@ cli_putchar:
 	call cli_putc
 	pop_range 18, 31
 	ret
+
+/*******************************************************************************
+* void bmw_small_nextBlock(bmw_small_ctx_t* ctx, const void* block){
+*	uint32_t q[32];
+*	dump_x(block, 16, 'M');              (debug output, not done by the asm below)
+*	bmw_small_f0(ctx->h, block, q);
+*	dump_x(q, 16, 'Q');                  (debug output, not done by the asm below)
+*	bmw_small_f1(q, block, ctx->h);
+*	dump_x(q, 32, 'Q');                  (debug output, not done by the asm below)
+*	bmw_small_f2(ctx->h, q, block);
+*	ctx->counter += 1;                   (the asm below does this before calling f0)
+*	ctx_dump(ctx);                       (debug output, not done by the asm below)
+* }
+* Saves r28-r29 and r2-r17 around the f0/f1/f2 calls; q is 32*4 bytes of stack.
+* param ctx:   r24:r25
+* param block: r22:r23
+*/
+h0 =  2 /* r3:r2 hold ctx; the same pointer is passed on as ctx->h */
+h1 =  3
+b0 =  4 /* r5:r4 hold block */
+b1 =  5
+q0 =  6 /* r7:r6 hold the address of the q buffer on the stack */
+q1 =  7
+.global bmw_small_nextBlock
+bmw_small_nextBlock:
+	push_range 28, 29 /* save r28, r29 */
+	push_range  2, 17 /* save r2..r17; restored in reverse order before ret */
+	stack_alloc_large 32*4, 30, 31 /* reserve q[32]; presumably leaves new SP in r31:r30 (macro body not shown) */
+	adiw r30, 1 /* SP names the next free byte, so the buffer starts at SP+1 */
+	movw q0, r30
+	movw h0, r24
+	movw b0, r22
+	/* increment the 32-bit counter stored at ctx+64 (ctx+60 plus displacement 4..7) */
+	movw r30, r24
+	adiw r30, 60 /* ldd/std displacements only reach 0..63, so bias Z by 60 first */
+	ldd r22, Z+4
+	ldd r23, Z+5
+	ldd r24, Z+6
+	ldd r25, Z+7
+	ldi r21, 1
+	add r22, r21 /* +1 on the low byte, carry ripples through the adc chain */
+	adc r23, r1 /* assumes r1 == 0 (avr-gcc zero register convention) */
+	adc r24, r1
+	adc r25, r1
+	std Z+4, r22
+	std Z+5, r23
+	std Z+6, r24
+	std Z+7, r25
+	/* call bmw_small_f0(ctx->h, block, q); q, block and h are also pushed (high byte first) for the later calls */
+	movw r24, h0
+	movw r22, b0
+	movw r20, q0
+	push_ q1, q0, b1, b0, h1, h0
+	rcall bmw_small_f0
+	/*	call bmw_small_f1(q, block, ctx->h): pops give r21:r20=h, r23:r22=block, r25:r24=q; then h, q, block are re-pushed for f2 */
+	pop_ 20, 21, 22, 23, 24, 25,
+	push_ 21, 20, 25, 24, 23, 22
+	rcall bmw_small_f1
+	/*	call bmw_small_f2(ctx->h, q, block): pops give r21:r20=block, r23:r22=q, r25:r24=h */
+	pop_ 20, 21, 22, 23, 24, 25,
+	rcall bmw_small_f2
+	stack_free_large3 32*4 /* release the q buffer reserved above */
+	pop_range  2, 17
+	pop_range 28, 29
+	ret
+
+
+
+
+
+
+
diff --git a/bmw/bmw_small-cstub.c b/bmw/bmw_small-cstub.c
index af26144..5b34145 100644
--- a/bmw/bmw_small-cstub.c
+++ b/bmw/bmw_small-cstub.c
@@ -77,51 +77,9 @@
 void bmw_small_f1(uint32_t* q, const void* m, const void* h);
 void bmw_small_f0(uint32_t* h, const void* m, uint32_t* q);
 void bmw_small_f2(uint32_t* h, uint32_t* q, const void* m);
+void bmw_small_nextBlock(bmw_small_ctx_t* ctx, const void* block);
 
 /*
-static
-void bmw_small_f2(uint32_t* h, const uint32_t* q, const void* m){
-	uint32_t xl=0, xh;
-	uint8_t i;
-	for(i=16;i<24;++i){
-		xl ^= q[i];
-	}
-	xh = xl;
-	for(i=24;i<32;++i){
-		xh ^= q[i];
-	}
-#if DEBUG
-	cli_putstr_P(PSTR("\r\n XL = "));
-	cli_hexdump_rev(&xl, 4);
-	cli_putstr_P(PSTR("\r\n XH = "));
-	cli_hexdump_rev(&xh, 4);
-#endif
-	memcpy(h, m, 16*4);
-	h[0] ^= SHL32(xh, 5) ^ SHR32(q[16], 5);
-	h[5] ^= SHL32(xh, 6) ^ SHR32(q[21], 6);
-	h[3] ^= SHR32(xh, 1) ^ SHL32(q[19], 5);
-	h[4] ^= SHR32(xh, 3) ^ q[20];
-	h[6] ^= SHR32(xh, 4) ^ SHL32(q[22], 6);
-	h[2] ^= SHR32(xh, 5) ^ SHL32(q[18], 5);
-	h[1] ^= SHR32(xh, 7) ^ SHL32(q[17], 8);
-	h[7] ^= SHR32(xh,11) ^ SHL32(q[23], 2);
-	for(i=0; i<8; ++i){
-		h[i] += xl ^ q[24+i] ^ q[i];
-	}
-	for(i=0; i<8; ++i){
-		h[8+i] ^= xh ^ q[24+i];
-		h[8+i] += ROTL32(h[(4+i)%8],i+9);
-	}
-	h[11] += SHL32(xl, 4) ^ q[18] ^ q[11];
-	h[10] += SHL32(xl, 6) ^ q[17] ^ q[10];
-	h[ 8] += SHL32(xl, 8) ^ q[23] ^ q[ 8];
-	h[15] += SHR32(xl, 2) ^ q[22] ^ q[15];
-	h[12] += SHR32(xl, 3) ^ q[19] ^ q[12];
-	h[13] += SHR32(xl, 4) ^ q[20] ^ q[13];
-	h[ 9] += SHR32(xl, 6) ^ q[16] ^ q[ 9];
-	h[14] += SHR32(xl, 7) ^ q[21] ^ q[14];
-}
-*/
 void bmw_small_nextBlock(bmw_small_ctx_t* ctx, const void* block){
 	uint32_t q[32];
 	dump_x(block, 16, 'M');
@@ -133,39 +91,35 @@ void bmw_small_nextBlock(bmw_small_ctx_t* ctx, const void* block){
 	ctx->counter += 1;
 	ctx_dump(ctx);
 }
+*/
 
 void bmw_small_lastBlock(bmw_small_ctx_t* ctx, const void* block, uint16_t length_b){
-	uint8_t buffer[64];
+	struct {
+		uint8_t  buffer[64];
+		uint32_t ctr;
+	} pctx;
 	while(length_b >= BMW_SMALL_BLOCKSIZE){
 		bmw_small_nextBlock(ctx, block);
 		length_b -= BMW_SMALL_BLOCKSIZE;
 		block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
 	}
-	memset(buffer, 0, 64);
-	memcpy(buffer, block, (length_b+7)/8);
-	buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
+	memset(pctx.buffer, 0, 64);
+	memcpy(pctx.buffer, block, (length_b+7)/8);
+	pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
 	if(length_b+1>64*8-64){
-		bmw_small_nextBlock(ctx, buffer);
-		memset(buffer, 0, 64-8);
+		bmw_small_nextBlock(ctx, pctx.buffer);
+		memset(pctx.buffer, 0, 64-8);
 		ctx->counter -= 1;
 	}
-	*((uint64_t*)&(buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
-	bmw_small_nextBlock(ctx, buffer);
+	*((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
+	bmw_small_nextBlock(ctx, pctx.buffer);
 	uint8_t i;
-	uint32_t q[32];
-	memset(buffer, 0xaa, 64);
+	memset(pctx.buffer, 0xaa, 64);
 	for(i=0; i<16;++i){
-		buffer[i*4] = i+0xa0;
+		pctx.buffer[i*4] = i+0xa0;
 	}
-//	dump_x(buffer, 16, 'A');
-	dump_x(ctx->h, 16, 'M');
-	bmw_small_f0((uint32_t*)buffer, ctx->h, q);
-	dump_x(buffer, 16, 'a');
-	dump_x(q, 16, 'Q');
-	bmw_small_f1(q, ctx->h, (uint32_t*)buffer);
-	dump_x(q, 32, 'Q');
-	bmw_small_f2((uint32_t*)buffer, q, ctx->h);
-	memcpy(ctx->h, buffer, 64);
+	bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
+	memcpy(ctx->h, pctx.buffer, 64);
 }
 
 void bmw224_init(bmw224_ctx_t* ctx){
-- 
2.39.5