From f0c9ba379b687a52a78fee9d3c0e8078238f51fe Mon Sep 17 00:00:00 2001
From: bg <bg@b1d182e4-1ff8-0310-901f-bddb46175740>
Date: Tue, 15 Dec 2009 17:40:26 +0000
Subject: [PATCH] even more asm fun for BMW

---
 bmw/bmw_small-asm.S   | 324 +++++++++++++++++++++++++++++++++++++++++-
 bmw/bmw_small-cstub.c |  91 ------------
 2 files changed, 317 insertions(+), 98 deletions(-)

diff --git a/bmw/bmw_small-asm.S b/bmw/bmw_small-asm.S
index ef05e96..1aab11a 100644
--- a/bmw/bmw_small-asm.S
+++ b/bmw/bmw_small-asm.S
@@ -29,7 +29,8 @@
 #include "avr-asm-macros.S"
 
 shiftcodetable:
-	.byte 0x00 ;  0
+;	.byte 0x00 ;  0
+shiftcodetable_1:
 	.byte 0x01 ;  1
 	.byte 0x02 ;  2
 	.byte 0x03 ;  3
@@ -47,7 +48,7 @@ shiftcodetable_9:
 	.byte 0x2A ; 14
 	.byte 0x29 ; 15
 	.byte 0x20 ; 16
-	.byte 0x21 ; 17 unused but necesseray for padding
+;	.byte 0x21 ; 17 unused but necessary for padding
 
 
 
@@ -163,9 +164,8 @@ rotl32p9:
 .global rotl_addel
 rotl_addel:
 	andi r20, 0x0f
-	inc r20
-	ldi r30, lo8(shiftcodetable)
-	ldi r31, hi8(shiftcodetable)
+	ldi r30, lo8(shiftcodetable_1)
+	ldi r31, hi8(shiftcodetable_1)
 	add r30, r20
 	adc r31, r1
 	lpm r20, Z
@@ -183,8 +183,7 @@ rotl_addel:
 	movw r22, r30
 2:  bst  r20, 3
 	andi r20, 0x07
-	brne 3f
-	ret
+	breq some_ret
 3:
 	brts rotr32; 4f
 	rjmp rotl32
@@ -1659,6 +1658,8 @@ bmw_small_f2_exit:
 ;	pop_range 28, 29
 	ret
 
+#if DEBUG_FUNCTIONS
+
 cli_putb:
 	push r2
 	push_range 18, 26
@@ -1696,6 +1697,8 @@ cli_putchar:
 	pop_range 18, 31
 	ret
 
+#endif
+
 /*******************************************************************************
 * void bmw_small_nextBlock(bmw_small_ctx_t* ctx, const void* block){
 *	uint32_t q[32];
@@ -1719,7 +1722,11 @@ b1 =  5
 q0 =  6
 q1 =  7
 .global bmw_small_nextBlock
+.global bmw224_nextBlock
+.global bmw256_nextBlock
 bmw_small_nextBlock:
+bmw224_nextBlock:
+bmw256_nextBlock:
 	push_range 28, 29
 	push_range  2, 17
 	stack_alloc_large 32*4, 30, 31
@@ -1762,8 +1769,311 @@ bmw_small_nextBlock:
 	ret
 
 
+/*******************************************************************************
+* void bmw224_init(bmw224_ctx_t* ctx){
+*	uint8_t i;
+*	ctx->h[0] = 0x00010203;
+*	for(i=1; i<16; ++i){
+*		ctx->h[i] = ctx->h[i-1]+ 0x04040404;
+*	}
+*	ctx->counter=0;
+* }
+*
+* param ctx:  r24:r25 (copied to X; h[0..15] and then the counter are written sequentially from there)
+*/
+.global bmw224_init
+bmw224_init:
+	movw r26, r24	; X = ctx
+	ldi r22, 0x03	; r25:r24:r23:r22 = 0x00010203, the value of h[0]
+	ldi r23, 0x02
+	ldi r24, 0x01
+	ldi r25, 0x00
+bmw_small_init:	; shared tail: expects X = ctx and r25:r24:r23:r22 = h[0]; bmw256_init jumps here
+	st X+, r22	; store h[0], least significant byte first
+	st X+, r23
+	st X+, r24
+	st X+, r25
+	ldi r18, 16-1	; loop counter for the remaining words h[1]..h[15]
+	ldi r20, 0x04	; every byte of the increment 0x04040404
+1:
+	add r22, r20	; 32-bit h[i] = h[i-1] + 0x04040404, carry propagated through adc
+	adc r23, r20
+	adc r24, r20
+	adc r25, r20
+	st X+, r22	; store h[i], least significant byte first
+	st X+, r23
+	st X+, r24
+	st X+, r25
+	dec r18
+	brne 1b
+	st X+, r1	; ctx->counter = 0 (4 bytes); relies on r1 holding zero - TODO confirm convention
+	st X+, r1
+	st X+, r1
+	st X+, r1
+	ret
 
+.global bmw256_init
+bmw256_init:	; context pointer arrives in r24:r25; same as bmw224_init but h[0] = 0x40414243
+	movw r26, r24	; X = ctx
+	ldi r22, 0x43	; r25:r24:r23:r22 = 0x40414243, the value of h[0]
+	ldi r23, 0x42
+	ldi r24, 0x41
+	ldi r25, 0x40
+	rjmp bmw_small_init	; shared tail fills h[1..15], clears the counter and returns
 
+/*******************************************************************************
+* void bmw_small_lastBlock(bmw_small_ctx_t* ctx, const void* block, uint16_t length_b){
+*	struct {
+*		uint8_t  buffer[64];
+*		uint32_t ctr;
+*	} pctx;
+*	while(length_b >= BMW_SMALL_BLOCKSIZE){
+*		bmw_small_nextBlock(ctx, block);
+*		length_b -= BMW_SMALL_BLOCKSIZE;
+*		block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
+*	}
+*	memset(pctx.buffer, 0, 64);
+*	memcpy(pctx.buffer, block, (length_b+7)/8);
+*	pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
+*	if(length_b+1>64*8-64){
+*		bmw_small_nextBlock(ctx, pctx.buffer);
+*		memset(pctx.buffer, 0, 64-8);
+*		ctx->counter -= 1;
+*	}
+*	*((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
+*	bmw_small_nextBlock(ctx, pctx.buffer);
+*	uint8_t i;
+*	memset(pctx.buffer, 0xaa, 64);
+*	for(i=0; i<16;++i){
+*		pctx.buffer[i*4] = i+0xa0;
+*	}
+*	bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
+*	memcpy(ctx->h, pctx.buffer, 64);
+* }
+*
+* param ctx:      r24:r25
+* param block:    r22:r23
+* param length_b: r20:r21
+*/
+ctx0 =  2	; r3:r2 = saved ctx pointer
+ctx1 =  3
+blc0 =  4	; r5:r4 = current position in the input block data
+blc1 =  5
+len0 = 28	; r29:r28 = remaining length in bits
+len1 = 29
+buf0 =  6	; r7:r6 = address of the 64-byte work buffer (pctx.buffer) on the stack
+buf1 =  7
+; Processes the last length_b bits at block: full blocks, padding, length field, final transform into ctx->h.
+.global bmw_small_lastBlock
+.global bmw224_lastBlock
+.global bmw256_lastBlock
+bmw_small_lastBlock:
+bmw224_lastBlock:
+bmw256_lastBlock:
+/*	while(length_b >= BMW_SMALL_BLOCKSIZE){
+		bmw_small_nextBlock(ctx, block);
+		length_b -= BMW_SMALL_BLOCKSIZE;
+		block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
+	}
+*/
+	push_range 2, 7
+	push_range 28, 29
+	movw ctx0, r24	; keep the three arguments in registers that are reused after every call below
+	movw blc0, r22
+	movw len0, r20
+1:
+	cpi len1, hi8(512)	; length_b >= 512 is decided by the high byte alone
+	brlo 2f	; less than one full block remains
+	movw r24, ctx0
+	movw r22, blc0
+	rcall bmw_small_nextBlock
+	ldi r24, 64	; block += 64 bytes
+	add blc0, r24
+	adc blc1, r1	; carry into the high byte; relies on r1 holding zero - TODO confirm convention
+	subi len1, hi8(512)	; length_b -= 512 (low byte is unaffected)
+	rjmp 1b
+2:
+/*	struct {
+		uint8_t  buffer[64];
+		uint32_t ctr;
+	} pctx;
+*/
+	stack_alloc_large 68
+	adiw r30, 1	; NOTE(review): presumably the macro leaves Z one byte below the new 68-byte area (macro not visible here) - confirm
+	movw buf0, r30	; buf = Z, used below as the address of pctx.buffer
+/*	memset(pctx.buffer, 0, 64);
+	memcpy(pctx.buffer, block, (length_b+7)/8);
+	pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
+*/	movw r24, len0	; r24 = length_b >> 3 = number of complete bytes (length_b < 512 here, so 0..63)
+	lsr r25
+	ror r24
+	lsr r24
+	lsr r24
+;	inc r24
+	ldi r23, 63	; r23 = 63 - complete bytes = zero bytes to append after the marker byte
+	sub r23, r24
+	movw r26, blc0	; X = source, Z still points at the start of the buffer
+	tst r24
+	breq 301f	; no complete byte to copy
+30: ld r20, X+	; copy the complete message bytes into the buffer
+	st Z+, r20
+	dec r24
+	brne 30b
+301:
+	clr r20	; marker byte starts empty when length_b is a multiple of 8
+	mov r21, len0
+	ldi r24, 0x80
+	andi r21, 0x07	; r21 = length_b & 7 = bits used in the last, partial byte
+	breq 305f
+	ld r20, X+	; partial byte of the message
+303:
+	lsr r24	; r24 = 0x80 >> (length_b & 7)
+	dec r21
+	brne 303b
+305:
+	or r20, r24	; set the padding marker bit right after the message bits
+	st Z+, r20
+	tst r23
+	breq 32f
+31: st Z+, r1	; zero-fill the rest of the 64-byte buffer
+	dec r23
+	brne 31b
+32:
+/*	if(length_b+1>64*8-64){ ; = 64*7-1 = 447 max(length_b)=511
+		bmw_small_nextBlock(ctx, pctx.buffer);
+		memset(pctx.buffer, 0, 64-8);
+		ctx->counter -= 1;
+	}
+*/
+	tst len1	; length_b < 256: the length field still fits
+	breq 400f
+	cpi len0, 192	; 256 + 192 = 448, so this tests length_b >= 448
+	brlo 400f
+	movw r24, ctx0	; no room for the 8-byte length: hash the padded block first
+	movw r22, buf0
+	rcall bmw_small_nextBlock
+	movw r26, buf0
+	ldi r20, 64-8	; clear the first 56 bytes of the buffer
+350:
+	st X+, r1
+	dec r20
+	brne 350b
+	movw r30, ctx0
+	adiw r30, 60	; Z = ctx + 60, so Z+4..Z+7 address ctx->counter at offset 64
+	ldd r21, Z+4	; r24:r23:r22:r21 = ctx->counter
+	ldd r22, Z+5
+	ldd r23, Z+6
+	ldd r24, Z+7
+	subi r21, 1	; counter - 1, kept in registers only; ctx->counter in memory is not written back here
+	sbc r22, r1
+	sbc r23, r1
+	sbc r24, r1
+	rjmp 410f
+/*	*((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
+	bmw_small_nextBlock(ctx, pctx.buffer);
+*/
+400:
+	movw r30, ctx0
+	adiw r30, 60	; Z = ctx + 60, so Z+4..Z+7 address ctx->counter at offset 64
+	ldd r21, Z+4	; r24:r23:r22:r21 = ctx->counter
+	ldd r22, Z+5
+	ldd r23, Z+6
+	ldd r24, Z+7
+410:
+	clr r25	; build counter*512 + length_b bytewise in r25..r20
+	lsl r21	; r25..r21 = counter << 1, i.e. bytes 1..5 of counter << 9
+	rol r22
+	rol r23
+	rol r24
+	rol r25
+	mov r20, len0	; byte 0 is the low byte of length_b (counter*512 has a zero low byte)
+	add r21, len1	; high byte of length_b is added one byte up, carry rippled through
+	adc r22, r1
+	adc r23, r1
+	adc r24, r1
+	adc r25, r1
+	movw r30, buf0
+	adiw r30, 64-8	; Z = address of the last 8 bytes of the buffer
+	st Z+, r20	; store the length field, least significant byte first
+	st Z+, r21
+	st Z+, r22
+	st Z+, r23
+	st Z+, r24
+	st Z+, r25
+	st Z+, r1	; the two most significant bytes of the 64-bit field are zero
+	st Z+, r1
+	movw r24, ctx0
+	movw r22, buf0
+	rcall bmw_small_nextBlock
+/*	memset(pctx.buffer, 0xaa, 64);
+	for(i=0; i<16;++i){
+		pctx.buffer[i*4] = i+0xa0;
+	}
+*/
+	ldi r18, 0xa0	; first byte of word i is 0xa0 + i
+	ldi r19, 0xaa	; the other three bytes of every word are 0xaa
+	movw r26, buf0
+500:
+	st X+, r18
+	st X+, r19
+	st X+, r19
+	st X+, r19
+	inc r18
+	sbrs r18, 4	; bit 4 first becomes set at 0xb0, i.e. after 16 words; then the rjmp is skipped
+	rjmp 500b
+/*	bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
+	memcpy(ctx->h, pctx.buffer, 64);
+*/
+	movw r24, buf0	; the stack buffer is passed as the context, ctx (start of ctx->h) as the message block
+	movw r22, ctx0
+	rcall bmw_small_nextBlock
+	ldi r18, 64	; copy the 64 result bytes from the buffer back into ctx->h
+	movw r26, ctx0
+	movw r30, buf0
+600:
+	ld r20, Z+
+	st X+, r20
+	dec r18
+	brne 600b
+
+	stack_free_large 68
+	pop_range 28, 29
+	pop_range 2, 7
+	ret
 
+/*******************************************************************************
+* void bmw224_ctx2hash(void* dest, const bmw224_ctx_t* ctx){
+*	memcpy(dest, &(ctx->h[9]), 224/8);
+* }
+*
+* param dest:  r24:r25
+* param ctx:   r22:r23
+*/
+.global bmw224_ctx2hash
+bmw224_ctx2hash:
+	movw r26, r24	; X = dest
+	movw r30, r22	; Z = ctx
+	adiw r30, 9*4	; Z = &ctx->h[9] (h[] starts at offset 0, 4 bytes per word)
+	ldi r22, 28	; 224/8 bytes to copy
+	rjmp 1f	; continue in the byte copy loop at the next local label 1 (inside bmw256_ctx2hash)
 
+/*******************************************************************************
+* void bmw256_ctx2hash(void* dest, const bmw256_ctx_t* ctx){
+*	memcpy(dest, &(ctx->h[8]), 256/8);
+* }
+*
+* param dest:  r24:r25
+* param ctx:   r22:r23
+*/
+.global bmw256_ctx2hash
+bmw256_ctx2hash:
+	movw r26, r24	; X = dest
+	movw r30, r22	; Z = ctx
+	adiw r30, 8*4	; Z = &ctx->h[8] (h[] starts at offset 0, 4 bytes per word)
+	ldi r22, 32	; 256/8 bytes to copy
+1:	; copy loop shared with bmw224_ctx2hash: X = destination, Z = source, r22 = byte count (non-zero)
+	ld r23, Z+
+	st X+, r23
+	dec r22
+	brne 1b
+	ret
diff --git a/bmw/bmw_small-cstub.c b/bmw/bmw_small-cstub.c
index 5b34145..3ace5e2 100644
--- a/bmw/bmw_small-cstub.c
+++ b/bmw/bmw_small-cstub.c
@@ -74,97 +74,6 @@
  #define dump_x(a,b,c)
 #endif
 
-void bmw_small_f1(uint32_t* q, const void* m, const void* h);
-void bmw_small_f0(uint32_t* h, const void* m, uint32_t* q);
-void bmw_small_f2(uint32_t* h, uint32_t* q, const void* m);
-void bmw_small_nextBlock(bmw_small_ctx_t* ctx, const void* block);
-
-/*
-void bmw_small_nextBlock(bmw_small_ctx_t* ctx, const void* block){
-	uint32_t q[32];
-	dump_x(block, 16, 'M');
-	bmw_small_f0(ctx->h, block, q);
-	dump_x(q, 16, 'Q');
-	bmw_small_f1(q, block, ctx->h);
-	dump_x(q, 32, 'Q');
-	bmw_small_f2(ctx->h, q, block);
-	ctx->counter += 1;
-	ctx_dump(ctx);
-}
-*/
-
-void bmw_small_lastBlock(bmw_small_ctx_t* ctx, const void* block, uint16_t length_b){
-	struct {
-		uint8_t  buffer[64];
-		uint32_t ctr;
-	} pctx;
-	while(length_b >= BMW_SMALL_BLOCKSIZE){
-		bmw_small_nextBlock(ctx, block);
-		length_b -= BMW_SMALL_BLOCKSIZE;
-		block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
-	}
-	memset(pctx.buffer, 0, 64);
-	memcpy(pctx.buffer, block, (length_b+7)/8);
-	pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
-	if(length_b+1>64*8-64){
-		bmw_small_nextBlock(ctx, pctx.buffer);
-		memset(pctx.buffer, 0, 64-8);
-		ctx->counter -= 1;
-	}
-	*((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
-	bmw_small_nextBlock(ctx, pctx.buffer);
-	uint8_t i;
-	memset(pctx.buffer, 0xaa, 64);
-	for(i=0; i<16;++i){
-		pctx.buffer[i*4] = i+0xa0;
-	}
-	bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
-	memcpy(ctx->h, pctx.buffer, 64);
-}
-
-void bmw224_init(bmw224_ctx_t* ctx){
-	uint8_t i;
-	ctx->h[0] = 0x00010203;
-	for(i=1; i<16; ++i){
-		ctx->h[i] = ctx->h[i-1]+ 0x04040404;
-	}
-	ctx->counter=0;
-//	ctx_dump(ctx);
-}
-
-void bmw256_init(bmw256_ctx_t* ctx){
-	uint8_t i;
-	ctx->h[0] = 0x40414243;
-	for(i=1; i<16; ++i){
-		ctx->h[i] = ctx->h[i-1]+ 0x04040404;
-	}
-	ctx->counter=0;
-//	ctx_dump(ctx);
-}
-
-void bmw224_nextBlock(bmw224_ctx_t* ctx, const void* block){
-	bmw_small_nextBlock(ctx, block);
-}
-
-void bmw256_nextBlock(bmw256_ctx_t* ctx, const void* block){
-	bmw_small_nextBlock(ctx, block);
-}
-
-void bmw224_lastBlock(bmw224_ctx_t* ctx, const void* block, uint16_t length_b){
-	bmw_small_lastBlock(ctx, block, length_b);
-}
-
-void bmw256_lastBlock(bmw256_ctx_t* ctx, const void* block, uint16_t length_b){
-	bmw_small_lastBlock(ctx, block, length_b);
-}
-
-void bmw224_ctx2hash(void* dest, const bmw224_ctx_t* ctx){
-	memcpy(dest, &(ctx->h[9]), 224/8);
-}
-
-void bmw256_ctx2hash(void* dest, const bmw256_ctx_t* ctx){
-	memcpy(dest, &(ctx->h[8]), 256/8);
-}
 
 void bmw224(void* dest, const void* msg, uint32_t length_b){
 	bmw_small_ctx_t ctx;
-- 
2.39.5