X-Git-Url: https://git.cryptolib.org/?p=avr-crypto-lib.git;a=blobdiff_plain;f=bmw%2Fbmw_small-asm.S;h=e8902e158f8e588b52118d1b415ae73c1c67ba33;hp=62bd166b3c6405f3417023448ad1ec280513a227;hb=eb0cafe05ab4cdf60878dbd81e4ff3712d5150f2;hpb=3d99e4ba447ef04801609c5459b7c0c332ae332f diff --git a/bmw/bmw_small-asm.S b/bmw/bmw_small-asm.S index 62bd166..e8902e1 100644 --- a/bmw/bmw_small-asm.S +++ b/bmw/bmw_small-asm.S @@ -29,7 +29,8 @@ #include "avr-asm-macros.S" shiftcodetable: - .byte 0x00 ; 0 +; .byte 0x00 ; 0 +shiftcodetable_1: .byte 0x01 ; 1 .byte 0x02 ; 2 .byte 0x03 ; 3 @@ -47,7 +48,7 @@ shiftcodetable_9: .byte 0x2A ; 14 .byte 0x29 ; 15 .byte 0x20 ; 16 - .byte 0x21 ; 17 unused but necesseray for padding +; .byte 0x21 ; 17 unused but necesseray for padding @@ -163,9 +164,8 @@ rotl32p9: .global rotl_addel rotl_addel: andi r20, 0x0f - inc r20 - ldi r30, lo8(shiftcodetable) - ldi r31, hi8(shiftcodetable) + ldi r30, lo8(shiftcodetable_1) + ldi r31, hi8(shiftcodetable_1) add r30, r20 adc r31, r1 lpm r20, Z @@ -183,8 +183,7 @@ rotl_addel: movw r22, r30 2: bst r20, 3 andi r20, 0x07 - brne 3f - ret + breq some_ret 3: brts rotr32; 4f rjmp rotl32 @@ -511,7 +510,7 @@ const_lut: .long 0x9555554c, 0x9aaaaaa1, 0x9ffffff6, 0xa555554b /******************************************************************************* -* uint32_t addelment(uint8_t j, const uint32_t* m, const uint32_t* h){ +* uint32_t addelment(uint8_t j, const uint32_t *m, const uint32_t *h){ * uint32_t r; * r = pgm_read_dword(k_lut+j); * r += rotl_addel(((uint32_t*)m)[j&0xf], j+0); @@ -539,12 +538,11 @@ addelement: mov j, r24 movw h0, r20 movw m0, r22 - mov r25, r24 - lsl r25 - lsl r25 + lsl r24 + lsl r24 ldi r30, lo8(const_lut) ldi r31, hi8(const_lut) - add r30, r25 + add r30, r24 adc r31, r1 lpm acc0, Z+ lpm acc1, Z+ @@ -627,7 +625,7 @@ addelement: ret /******************************************************************************* -* uint32_t bmw_small_expand1(uint8_t j, const void* m, const void* h, const uint32_t* q){ +* uint32_t bmw_small_expand1(uint8_t j, const void *m, const void *h, const uint32_t *q){ * uint32_t(*s[])(uint32_t) = {bmw_small_s1, bmw_small_s2, bmw_small_s3, bmw_small_s0}; * uint32_t r; * uint8_t i; @@ -710,7 +708,7 @@ expand1_exit: ret /******************************************************************************* -* uint32_t bmw_small_expand2(uint8_t j, const void* m, const void* h, const uint32_t* q){ +* uint32_t bmw_small_expand2(uint8_t j, const void *m, const void *h, const uint32_t *q){ * uint32_t(*rf[])(uint32_t) = {bmw_small_r1, bmw_small_r2, bmw_small_r3, * bmw_small_r4, bmw_small_r5, bmw_small_r6, * bmw_small_r7}; @@ -783,7 +781,7 @@ bmw_small_expand2: rjmp expand1_exit /******************************************************************************* -* void bmw_small_f1(uint32_t* q, const void* m, const void* h){ +* void bmw_small_f1(uint32_t *q, const void *m, const void *h){ * uint8_t i; * q[16] = bmw_small_expand1(0, m, h, q); * q[17] = bmw_small_expand1(1, m, h, q); @@ -800,8 +798,8 @@ q0 = 6 q1 = 7 .global bmw_small_f1 bmw_small_f1: - push_range 2, 7 - push_range 28, 29 +; push_range 2, 7 +; push_range 28, 29 push r16 movw q0, r24 movw m0, r22 @@ -843,15 +841,15 @@ bmw_small_f1: cpi r16, 16 brne 1b pop r16 - pop_range 28, 29 - pop_range 2, 7 +; pop_range 28, 29 +; pop_range 2, 7 ret /******************************************************************************* * uint16_t hack_table[5] PROGMEM = { 0x0311, 0xDDB3, 0x2A79, 0x07AA, 0x51C2 }; * uint8_t offset_table[5] PROGMEM = { 4+16, 6+16, 9+16, 12+16, 13+16 }; * -* void bmw_small_f0(uint32_t* h, const void* m, uint32_t* q){ +* void bmw_small_f0(uint32_t *h, const void *m, uint32_t *q){ * uint16_t hack_reg; * uint8_t c,i,j; * uint32_t(*s[])(uint32_t)={ bmw_small_s0, bmw_small_s1, bmw_small_s2, @@ -960,9 +958,9 @@ f0_jumptable: .global bmw_small_f0 bmw_small_f0: - push_range 28, 29 - push_range 4, 11 - push_range 16, 17 +; push_range 28, 29 +; push_range 4, 11 +; push_range 16, 17 /* h[i] ^= m[i]; q[i]= 0 */ movw r26, h0 ; h movw r30, m0 ; m @@ -1105,13 +1103,13 @@ bmw_small_f0: adc acc0, acc1 st Z+, acc0 - pop_range 16, 17 - pop_range 4, 11 - pop_range 28, 29 +; pop_range 16, 17 +; pop_range 4, 11 +; pop_range 28, 29 ret /******************************************************************************* -* void bmw_small_f2(uint32_t* h, const uint32_t* q, const void* m){ +* void bmw_small_f2(uint32_t *h, const uint32_t *q, const void *m){ * uint32_t xl=0, xh; * uint8_t i; * for(i=16;i<24;++i){ @@ -1220,8 +1218,8 @@ bmw_small_f2: st X+, r0 dec r18 brne 1b - push_range 28, 29 - push_range 2, 17 +; push_range 28, 29 +; push_range 2, 17 movw q0, r22 movw h0, r24 /* calc xl */ @@ -1655,10 +1653,12 @@ bmw_small_f2: rcall tshiftr modify_h_2 5 bmw_small_f2_exit: - pop_range 2, 17 - pop_range 28, 29 +; pop_range 2, 17 +; pop_range 28, 29 ret +#if DEBUG_FUNCTIONS + cli_putb: push r2 push_range 18, 26 @@ -1695,3 +1695,503 @@ cli_putchar: call cli_putc pop_range 18, 31 ret + +#endif + +/******************************************************************************* +* void bmw_small_nextBlock(bmw_small_ctx_t *ctx, const void *block){ +* uint32_t q[32]; +* dump_x(block, 16, 'M'); +* bmw_small_f0(ctx->h, block, q); +* dump_x(q, 16, 'Q'); +* bmw_small_f1(q, block, ctx->h); +* dump_x(q, 32, 'Q'); +* bmw_small_f2(ctx->h, q, block); +* ctx->counter += 1; +* ctx_dump(ctx); +* } +* +* param ctx: r24:r25 +* param block: r22:r23 +*/ +h0 = 2 +h1 = 3 +b0 = 4 +b1 = 5 +q0 = 6 +q1 = 7 +.global bmw_small_nextBlock +.global bmw224_nextBlock +.global bmw256_nextBlock +bmw_small_nextBlock: +bmw224_nextBlock: +bmw256_nextBlock: + push_range 28, 29 + push_range 2, 17 + stack_alloc_large 32*4, 30, 31 + adiw r30, 1 + movw q0, r30 + movw h0, r24 + movw b0, r22 + /* increment counter */ + movw r30, r24 + adiw r30, 60 + ldd r22, Z+4 + ldd r23, Z+5 + ldd r24, Z+6 + ldd r25, Z+7 + ldi r21, 1 + add r22, r21 + adc r23, r1 + adc r24, r1 + adc r25, r1 + std Z+4, r22 + std Z+5, r23 + std Z+6, r24 + std Z+7, r25 + /* call bmw_small_f0(ctx->h, block, q) */ + movw r24, h0 + movw r22, b0 + movw r20, q0 + push_ q1, q0, b1, b0, h1, h0 + rcall bmw_small_f0 + /* call bmw_small_f1(q, block, ctx->h) */ + pop_ 20, 21, 22, 23, 24, 25, + push_ 21, 20, 25, 24, 23, 22 + rcall bmw_small_f1 + /* call bmw_small_f2(ctx->h, q, block) */ + pop_ 20, 21, 22, 23, 24, 25, + rcall bmw_small_f2 + stack_free_large3 32*4 + pop_range 2, 17 + pop_range 28, 29 + ret + + +/******************************************************************************* +* void bmw224_init(bmw224_ctx_t *ctx){ +* uint8_t i; +* ctx->h[0] = 0x00010203; +* for(i=1; i<16; ++i){ +* ctx->h[i] = ctx->h[i-1]+ 0x04040404; +* } +* ctx->counter=0; +* } +* +* param ctx: r24:r25 +*/ +.global bmw224_init +bmw224_init: + movw r26, r24 + ldi r22, 0x03 + ldi r23, 0x02 + ldi r24, 0x01 + ldi r25, 0x00 +bmw_small_init: + st X+, r22 + st X+, r23 + st X+, r24 + st X+, r25 + ldi r18, 16-1 + ldi r20, 0x04 +1: + add r22, r20 + adc r23, r20 + adc r24, r20 + adc r25, r20 + st X+, r22 + st X+, r23 + st X+, r24 + st X+, r25 + dec r18 + brne 1b + st X+, r1 + st X+, r1 + st X+, r1 + st X+, r1 + ret + +.global bmw256_init +bmw256_init: + movw r26, r24 + ldi r22, 0x43 + ldi r23, 0x42 + ldi r24, 0x41 + ldi r25, 0x40 + rjmp bmw_small_init + +/******************************************************************************* +* void bmw_small_lastBlock(bmw_small_ctx_t *ctx, const void *block, uint16_t length_b){ +* struct { +* uint8_t buffer[64]; +* uint32_t ctr; +* } pctx; +* while(length_b >= BMW_SMALL_BLOCKSIZE){ +* bmw_small_nextBlock(ctx, block); +* length_b -= BMW_SMALL_BLOCKSIZE; +* block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B; +* } +* memset(pctx.buffer, 0, 64); +* memcpy(pctx.buffer, block, (length_b+7)/8); +* pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07); +* if(length_b+1>64*8-64){ +* bmw_small_nextBlock(ctx, pctx.buffer); +* memset(pctx.buffer, 0, 64-8); +* ctx->counter -= 1; +* } +* *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b; +* bmw_small_nextBlock(ctx, pctx.buffer); +* uint8_t i; +* memset(pctx.buffer, 0xaa, 64); +* for(i=0; i<16;++i){ +* pctx.buffer[i*4] = i+0xa0; +* } +* bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h); +* memcpy(ctx->h, pctx.buffer, 64); +* } +* +* param ctx: r24:r25 +* param block: r22:r23 +* param length_b: r20:r21 +*/ +ctx0 = 2 +ctx1 = 3 +blc0 = 4 +blc1 = 5 +len0 = 28 +len1 = 29 +buf0 = 6 +buf1 = 7 + +.global bmw_small_lastBlock +.global bmw224_lastBlock +.global bmw256_lastBlock +bmw_small_lastBlock: +bmw224_lastBlock: +bmw256_lastBlock: +/* while(length_b >= BMW_SMALL_BLOCKSIZE){ + bmw_small_nextBlock(ctx, block); + length_b -= BMW_SMALL_BLOCKSIZE; + block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B; + } +*/ + push_range 2, 7 + push_range 28, 29 + movw ctx0, r24 + movw blc0, r22 + movw len0, r20 +1: + cpi len1, hi8(512) + brlo 2f + movw r24, ctx0 + movw r22, blc0 + rcall bmw_small_nextBlock + ldi r24, 64 + add blc0, r24 + adc blc1, r1 + subi len1, hi8(512) + rjmp 1b +2: +/* struct { + uint8_t buffer[64]; + uint32_t ctr; + } pctx; +*/ + stack_alloc_large 68 + adiw r30, 1 + movw buf0, r30 +/* memset(pctx.buffer, 0, 64); + memcpy(pctx.buffer, block, (length_b+7)/8); + pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07); +*/ movw r24, len0 + lsr r25 + ror r24 + lsr r24 + lsr r24 +; inc r24 + ldi r23, 63 + sub r23, r24 + movw r26, blc0 + tst r24 + breq 301f +30: ld r20, X+ + st Z+, r20 + dec r24 + brne 30b +301: + clr r20 + mov r21, len0 + ldi r24, 0x80 + andi r21, 0x07 + breq 305f + ld r20, X+ +303: + lsr r24 + dec r21 + brne 303b +305: + or r20, r24 + st Z+, r20 + tst r23 + breq 32f +31: st Z+, r1 + dec r23 + brne 31b +32: +/* if(length_b+1>64*8-64){ ; = 64*7-1 = 447 max(length_b)=511 + bmw_small_nextBlock(ctx, pctx.buffer); + memset(pctx.buffer, 0, 64-8); + ctx->counter -= 1; + } +*/ + tst len1 + breq 400f + cpi len0, 192 + brlo 400f + movw r24, ctx0 + movw r22, buf0 + rcall bmw_small_nextBlock + movw r26, buf0 + ldi r20, 64-8 +350: + st X+, r1 + dec r20 + brne 350b + movw r30, ctx0 + adiw r30, 60 + ldd r21, Z+4 + ldd r22, Z+5 + ldd r23, Z+6 + ldd r24, Z+7 + subi r21, 1 + sbc r22, r1 + sbc r23, r1 + sbc r24, r1 + rjmp 410f +/* *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b; + bmw_small_nextBlock(ctx, pctx.buffer); +*/ +400: + movw r30, ctx0 + adiw r30, 60 + ldd r21, Z+4 + ldd r22, Z+5 + ldd r23, Z+6 + ldd r24, Z+7 +410: + clr r25 + lsl r21 + rol r22 + rol r23 + rol r24 + rol r25 + mov r20, len0 + add r21, len1 + adc r22, r1 + adc r23, r1 + adc r24, r1 + adc r25, r1 + movw r30, buf0 + adiw r30, 64-8 + st Z+, r20 + st Z+, r21 + st Z+, r22 + st Z+, r23 + st Z+, r24 + st Z+, r25 + st Z+, r1 + st Z+, r1 + movw r24, ctx0 + movw r22, buf0 + rcall bmw_small_nextBlock +/* memset(pctx.buffer, 0xaa, 64); + for(i=0; i<16;++i){ + pctx.buffer[i*4] = i+0xa0; + } +*/ + ldi r18, 0xa0 + ldi r19, 0xaa + movw r26, buf0 +500: + st X+, r18 + st X+, r19 + st X+, r19 + st X+, r19 + inc r18 + sbrs r18, 4 + rjmp 500b +/* bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h); + memcpy(ctx->h, pctx.buffer, 64); +*/ + movw r24, buf0 + movw r22, ctx0 + rcall bmw_small_nextBlock + ldi r18, 64 + movw r26, ctx0 + movw r30, buf0 +600: + ld r20, Z+ + st X+, r20 + dec r18 + brne 600b + + stack_free_large 68 + pop_range 28, 29 + pop_range 2, 7 + ret + +/******************************************************************************* +* void bmw224_ctx2hash(void *dest, const bmw224_ctx_t *ctx){ +* memcpy(dest, &(ctx->h[9]), 224/8); +* } +* +* param dest: r24:r25 +* param ctx: r22:r23 +*/ +.global bmw224_ctx2hash +bmw224_ctx2hash: + movw r26, r24 + movw r30, r22 + adiw r30, 9*4 + ldi r22, 28 + rjmp 1f + +/******************************************************************************* +* void bmw256_ctx2hash(void *dest, const bmw256_ctx_t *ctx){ +* memcpy(dest, &(ctx->h[8]), 256/8); +* } +* +* param dest: r24:r25 +* param ctx: r22:r23 +*/ +.global bmw256_ctx2hash +bmw256_ctx2hash: + movw r26, r24 + movw r30, r22 + adiw r30, 8*4 + ldi r22, 32 +1: + ld r23, Z+ + st X+, r23 + dec r22 + brne 1b + ret + +/******************************************************************************* +* void bmw256(void *dest, const void *msg, uint32_t length_b){ +* bmw_small_ctx_t ctx; +* bmw256_init(&ctx); +* while(length_b>=BMW_SMALL_BLOCKSIZE){ +* bmw_small_nextBlock(&ctx, msg); +* length_b -= BMW_SMALL_BLOCKSIZE; +* msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B; +* } +* bmw_small_lastBlock(&ctx, msg, length_b); +* bmw256_ctx2hash(dest, &ctx); +* } +* +* param dest: r24:r25 +* param msg: r22:r23 +* param length_b: r18:r21 +*/ +ctx0 = 2 +ctx1 = 3 +msg0 = 4 +msg1 = 5 +len0 = 6 +len1 = 7 +len2 = 8 +len3 = 9 +dst0 = 10 +dst1 = 11 +.global bmw256 +bmw256: + push r16 + ldi r16, 1 + rjmp bmw_small_all + +/******************************************************************************* +* void bmw224(void *dest, const void *msg, uint32_t length_b){ +* bmw_small_ctx_t ctx; +* bmw224_init(&ctx); +* while(length_b>=BMW_SMALL_BLOCKSIZE){ +* bmw_small_nextBlock(&ctx, msg); +* length_b -= BMW_SMALL_BLOCKSIZE; +* msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B; +* } +* bmw_small_lastBlock(&ctx, msg, length_b); +* bmw224_ctx2hash(dest, &ctx); +* } +* +* param dest: r24:r25 +* param msg: r22:r23 +* param length_b: r18:r21 +*/ +ctx0 = 2 +ctx1 = 3 +msg0 = 4 +msg1 = 5 +len0 = 6 +len1 = 7 +len2 = 8 +len3 = 9 +dst0 = 10 +dst1 = 11 +.global bmw224 +bmw224: + push r16 + clr r16 + +bmw_small_all: + push_range 2, 11 + stack_alloc_large 64+4 + adiw r30, 1 + movw ctx0, r30 + movw dst0, r24 + movw msg0, r22 + movw len0, r18 + movw len2, r20 + movw r24, ctx0 + ldi r30, pm_lo8(init_lut) + ldi r31, pm_hi8(init_lut) + add r30, r16 + adc r31, r1 + icall +20: + mov r18, len2 + or r18, len3 + breq 50f + movw r24, ctx0 + movw r22, msg0 + rcall bmw_small_nextBlock + ldi r20, 2 + sub len1, r20 + sbc len2, r1 + sbc len3, r1 + ldi r20, 64 + add msg0, r20 + adc msg1, r1 + rjmp 20b +50: + movw r24, ctx0 + movw r22, msg0 + movw r20, len0 + rcall bmw_small_lastBlock + movw r24, dst0 + movw r22, ctx0 + ldi r30, pm_lo8(c2h_lut) + ldi r31, pm_hi8(c2h_lut) + add r30, r16 + adc r31, r1 + icall + stack_free_large 64+4 + pop_range 2, 11 + pop r16 + ret + +init_lut: + rjmp bmw224_init + rjmp bmw256_init +c2h_lut: + rjmp bmw224_ctx2hash + rjmp bmw256_ctx2hash