git.cryptolib.org Git - avr-crypto-lib.git/commitdiff
even more asm fun for BMW
author bg <bg@b1d182e4-1ff8-0310-901f-bddb46175740>
Tue, 15 Dec 2009 17:40:26 +0000 (17:40 +0000)
committer bg <bg@b1d182e4-1ff8-0310-901f-bddb46175740>
Tue, 15 Dec 2009 17:40:26 +0000 (17:40 +0000)
bmw/bmw_small-asm.S
bmw/bmw_small-cstub.c

index ef05e96bcc4eb0094ac57334dc47a29626427963..1aab11a1cc87aa4f025b27a1fd2540823d801ab2 100644 (file)
@@ -29,7 +29,8 @@
 #include "avr-asm-macros.S"
 
 shiftcodetable:
-       .byte 0x00 ;  0
+;      .byte 0x00 ;  0
+shiftcodetable_1:
        .byte 0x01 ;  1
        .byte 0x02 ;  2
        .byte 0x03 ;  3
@@ -47,7 +48,7 @@ shiftcodetable_9:
        .byte 0x2A ; 14
        .byte 0x29 ; 15
        .byte 0x20 ; 16
-       .byte 0x21 ; 17 unused but necesseray for padding
+;      .byte 0x21 ; 17 unused but necessary for padding
 
 
 
@@ -163,9 +164,8 @@ rotl32p9:
 .global rotl_addel
 rotl_addel:
        andi r20, 0x0f
-       inc r20
-       ldi r30, lo8(shiftcodetable)
-       ldi r31, hi8(shiftcodetable)
+       ldi r30, lo8(shiftcodetable_1)
+       ldi r31, hi8(shiftcodetable_1)
        add r30, r20
        adc r31, r1
        lpm r20, Z
@@ -183,8 +183,7 @@ rotl_addel:
        movw r22, r30
 2:  bst  r20, 3
        andi r20, 0x07
-       brne 3f
-       ret
+       breq some_ret
 3:
        brts rotr32; 4f
        rjmp rotl32
@@ -1659,6 +1658,8 @@ bmw_small_f2_exit:
 ;      pop_range 28, 29
        ret
 
+#if DEBUG_FUNCTIONS
+
 cli_putb:
        push r2
        push_range 18, 26
@@ -1696,6 +1697,8 @@ cli_putchar:
        pop_range 18, 31
        ret
 
+#endif
+
 /*******************************************************************************
 * void bmw_small_nextBlock(bmw_small_ctx_t* ctx, const void* block){
 *      uint32_t q[32];
@@ -1719,7 +1722,11 @@ b1 =  5
 q0 =  6
 q1 =  7
 .global bmw_small_nextBlock
+.global bmw224_nextBlock
+.global bmw256_nextBlock
 bmw_small_nextBlock:
+bmw224_nextBlock:
+bmw256_nextBlock:
        push_range 28, 29
        push_range  2, 17
        stack_alloc_large 32*4, 30, 31
@@ -1762,8 +1769,311 @@ bmw_small_nextBlock:
        ret
 
 
+/*******************************************************************************
+* void bmw224_init(bmw224_ctx_t* ctx){
+*      uint8_t i;
+*      ctx->h[0] = 0x00010203;
+*      for(i=1; i<16; ++i){
+*              ctx->h[i] = ctx->h[i-1]+ 0x04040404;
+*      }
+*      ctx->counter=0;
+* }
+*
+* param ctx:  r24:r25
+*
+* Register use in the code below:
+*   X (r26:r27)  write pointer, starts at ctx and advances with every store
+*   r22..r25     current 32-bit word h[i], least significant byte in r22
+*   r18          loop counter for the 15 words after h[0]
+*   r20          constant 0x04 that is added to each of the four bytes
+*   r1           stored as the four counter bytes (assumed to hold zero,
+*                TODO confirm against the calling convention in use)
+*
+* bmw224_init only loads the start value and then falls through into
+* bmw_small_init. bmw_small_init expects X = destination and r25:r22 = h[0];
+* it is also the rjmp target of bmw256_init.
+*/
+.global bmw224_init
+bmw224_init:
+       movw r26, r24                   ; X = ctx
+       ldi r22, 0x03                   ; r25:r22 = 0x00010203, low byte first
+       ldi r23, 0x02
+       ldi r24, 0x01
+       ldi r25, 0x00
+bmw_small_init:
+       st X+, r22                      ; h[0], stored low byte first
+       st X+, r23
+       st X+, r24
+       st X+, r25
+       ldi r18, 16-1                   ; 15 more words follow h[0]
+       ldi r20, 0x04
+1:
+       add r22, r20                    ; 32-bit add of 0x04040404 with carry chain
+       adc r23, r20
+       adc r24, r20
+       adc r25, r20
+       st X+, r22                      ; h[i]
+       st X+, r23
+       st X+, r24
+       st X+, r25
+       dec r18
+       brne 1b
+       st X+, r1                       ; four bytes right after h[15]: counter = 0
+       st X+, r1
+       st X+, r1
+       st X+, r1
+       ret
 
+.global bmw256_init
+bmw256_init:                           ; param ctx: r24:r25; same fill as bmw224_init but h[0] = 0x40414243
+       movw r26, r24                   ; X = ctx
+       ldi r22, 0x43                   ; r25:r22 = 0x40414243, low byte first
+       ldi r23, 0x42
+       ldi r24, 0x41
+       ldi r25, 0x40
+       rjmp bmw_small_init             ; shared code stores h[0..15] and clears the counter
 
+/*******************************************************************************
+* void bmw_small_lastBlock(bmw_small_ctx_t* ctx, const void* block, uint16_t length_b){
+*      struct {
+*              uint8_t  buffer[64];
+*              uint32_t ctr;
+*      } pctx;
+*      while(length_b >= BMW_SMALL_BLOCKSIZE){
+*              bmw_small_nextBlock(ctx, block);
+*              length_b -= BMW_SMALL_BLOCKSIZE;
+*              block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
+*      }
+*      memset(pctx.buffer, 0, 64);
+*      memcpy(pctx.buffer, block, (length_b+7)/8);
+*      pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
+*      if(length_b+1>64*8-64){
+*              bmw_small_nextBlock(ctx, pctx.buffer);
+*              memset(pctx.buffer, 0, 64-8);
+*              ctx->counter -= 1;
+*      }
+*      *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
+*      bmw_small_nextBlock(ctx, pctx.buffer);
+*      uint8_t i;
+*      memset(pctx.buffer, 0xaa, 64);
+*      for(i=0; i<16;++i){
+*              pctx.buffer[i*4] = i+0xa0;
+*      }
+*      bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
+*      memcpy(ctx->h, pctx.buffer, 64);
+* }
+*
+* param ctx:      r24:r25
+* param block:    r22:r23
+* param length_b: r20:r21
+*
+* Notes on the assembly below:
+*   - ctx, block and the buffer address live in r2:r3, r4:r5 and r6:r7,
+*     length_b lives in r28:r29. push_range/pop_range (macros defined outside
+*     this file section) presumably save and restore exactly these registers.
+*   - The buffer is filled in one pass (copy, marker byte, zero fill) instead
+*     of memset followed by memcpy.
+*   - In the extra-block case the decremented counter is only held in
+*     registers to build the length field; it is not written back to ctx.
+*   - r1 is used wherever a zero byte is needed (assumed to hold zero,
+*     TODO confirm against the calling convention in use).
+*   - The same code is exported as bmw224_lastBlock and bmw256_lastBlock.
+*/
+ctx0 =  2                              ; r2:r3   ctx pointer
+ctx1 =  3
+blc0 =  4                              ; r4:r5   block pointer
+blc1 =  5
+len0 = 28                              ; r28:r29 length_b in bits
+len1 = 29
+buf0 =  6                              ; r6:r7   address of the 64-byte stack buffer
+buf1 =  7
+
+.global bmw_small_lastBlock
+.global bmw224_lastBlock
+.global bmw256_lastBlock
+bmw_small_lastBlock:
+bmw224_lastBlock:
+bmw256_lastBlock:
+/*     while(length_b >= BMW_SMALL_BLOCKSIZE){
+               bmw_small_nextBlock(ctx, block);
+               length_b -= BMW_SMALL_BLOCKSIZE;
+               block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
+       }
+*/
+       push_range 2, 7
+       push_range 28, 29
+       movw ctx0, r24                  ; keep ctx in r2:r3
+       movw blc0, r22                  ; keep block pointer in r4:r5
+       movw len0, r20                  ; keep length_b in r28:r29
+1:
+       cpi len1, hi8(512)              ; high byte below 2 means length_b < 512
+       brlo 2f
+       movw r24, ctx0
+       movw r22, blc0
+       rcall bmw_small_nextBlock       ; process one complete block
+       ldi r24, 64
+       add blc0, r24                   ; block += 64 bytes
+       adc blc1, r1
+       subi len1, hi8(512)             ; length_b -= 512, low byte is unaffected
+       rjmp 1b
+2:
+/*     struct {
+               uint8_t  buffer[64];
+               uint32_t ctr;
+       } pctx;
+*/
+       stack_alloc_large 68
+       adiw r30, 1                     ; NOTE(review): presumably the macro leaves the new SP in Z, so Z+1 is the first buffer byte; confirm in avr-asm-macros.S
+       movw buf0, r30                  ; remember the buffer address in r6:r7
+/*     memset(pctx.buffer, 0, 64);
+       memcpy(pctx.buffer, block, (length_b+7)/8);
+       pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
+*/     movw r24, len0
+       lsr r25
+       ror r24
+       lsr r24
+       lsr r24                         ; r24 = length_b >> 3, the number of complete bytes (0..63)
+;      inc r24
+       ldi r23, 63
+       sub r23, r24                    ; r23 = number of zero bytes after the marker byte
+       movw r26, blc0                  ; X = source
+       tst r24
+       breq 301f                       ; no complete byte to copy
+30: ld r20, X+
+       st Z+, r20
+       dec r24
+       brne 30b
+301:
+       clr r20                         ; byte-aligned message: marker byte starts from zero
+       mov r21, len0
+       ldi r24, 0x80
+       andi r21, 0x07                  ; r21 = length_b & 7
+       breq 305f
+       ld r20, X+                      ; partial last byte of the message
+303:
+       lsr r24                         ; ends as 0x80 >> (length_b & 7)
+       dec r21
+       brne 303b
+305:
+       or r20, r24
+       st Z+, r20                      ; buffer[length_b>>3] with the marker bit set
+       tst r23
+       breq 32f
+31: st Z+, r1                          ; zero fill up to 64 bytes
+       dec r23
+       brne 31b
+32:
+/*     if(length_b+1>64*8-64){ ; = 64*7-1 = 447 max(length_b)=511
+               bmw_small_nextBlock(ctx, pctx.buffer);
+               memset(pctx.buffer, 0, 64-8);
+               ctx->counter -= 1;
+       }
+*/
+       tst len1
+       breq 400f                       ; length_b < 256: the length field fits
+       cpi len0, 192
+       brlo 400f                       ; length_b < 448 (0x1C0): the length field fits
+       movw r24, ctx0
+       movw r22, buf0
+       rcall bmw_small_nextBlock       ; process the block holding the message tail
+       movw r26, buf0
+       ldi r20, 64-8
+350:
+       st X+, r1                       ; clear the first 56 bytes of the buffer
+       dec r20
+       brne 350b
+       movw r30, ctx0
+       adiw r30, 60                    ; Z = ctx+60, so Z+4..Z+7 addresses ctx+64..ctx+67
+       ldd r21, Z+4
+       ldd r22, Z+5
+       ldd r23, Z+6
+       ldd r24, Z+7                    ; r24:r21 = 32-bit word right after the 64 bytes of h (the counter, per bmw_small_init)
+       subi r21, 1                     ; counter - 1, kept in registers only
+       sbc r22, r1
+       sbc r23, r1
+       sbc r24, r1
+       rjmp 410f
+/*     *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
+       bmw_small_nextBlock(ctx, pctx.buffer);
+*/
+400:
+       movw r30, ctx0
+       adiw r30, 60                    ; Z = ctx+60, so Z+4..Z+7 addresses ctx+64..ctx+67
+       ldd r21, Z+4
+       ldd r22, Z+5
+       ldd r23, Z+6
+       ldd r24, Z+7                    ; r24:r21 = counter
+410:
+       clr r25
+       lsl r21
+       rol r22
+       rol r23
+       rol r24
+       rol r25                         ; r25:r21 = counter * 2 = (counter * 512) >> 8
+       mov r20, len0                   ; byte 0: counter*512 has a zero low byte, so it is just the low byte of length_b
+       add r21, len1                   ; bytes 1..5: counter*2 plus the high byte of length_b
+       adc r22, r1
+       adc r23, r1
+       adc r24, r1
+       adc r25, r1
+       movw r30, buf0
+       adiw r30, 64-8                  ; Z = buffer + 56
+       st Z+, r20
+       st Z+, r21
+       st Z+, r22
+       st Z+, r23
+       st Z+, r24
+       st Z+, r25
+       st Z+, r1                       ; top two bytes of the 64-bit length field are zero
+       st Z+, r1
+       movw r24, ctx0
+       movw r22, buf0
+       rcall bmw_small_nextBlock       ; process the block that carries the length field
+/*     memset(pctx.buffer, 0xaa, 64);
+       for(i=0; i<16;++i){
+               pctx.buffer[i*4] = i+0xa0;
+       }
+*/
+       ldi r18, 0xa0
+       ldi r19, 0xaa
+       movw r26, buf0
+500:
+       st X+, r18                      ; buffer[i*4] = 0xa0 + i
+       st X+, r19                      ; other three bytes of the word = 0xaa
+       st X+, r19
+       st X+, r19
+       inc r18
+       sbrs r18, 4                     ; bit 4 first becomes set at 0xb0, i.e. after 16 words
+       rjmp 500b
+/*     bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
+       memcpy(ctx->h, pctx.buffer, 64);
+*/
+       movw r24, buf0                  ; first argument: the stack buffer used as context
+       movw r22, ctx0                  ; second argument: ctx itself, passed where the C reference passes ctx->h
+       rcall bmw_small_nextBlock
+       ldi r18, 64
+       movw r26, ctx0
+       movw r30, buf0
+600:
+       ld r20, Z+                      ; copy the 64 result bytes from the buffer to the start of ctx
+       st X+, r20
+       dec r18
+       brne 600b

+       stack_free_large 68
+       pop_range 28, 29
+       pop_range 2, 7
+       ret
 
+/*******************************************************************************
+* void bmw224_ctx2hash(void* dest, const bmw224_ctx_t* ctx){
+*      memcpy(dest, &(ctx->h[9]), 224/8);
+* }
+*
+* param dest:  r24:r25
+* param ctx:   r22:r23
+*
+* Only sets up source, destination and byte count; the copy itself is done
+* by the loop at local label 1 inside bmw256_ctx2hash, reached via rjmp.
+* r22 and r23 are overwritten (byte count and copy temporary).
+*/
+.global bmw224_ctx2hash
+bmw224_ctx2hash:
+       movw r26, r24                   ; X = dest
+       movw r30, r22                   ; Z = ctx
+       adiw r30, 9*4                   ; Z = ctx + 36, the address of h[9]
+       ldi r22, 28                     ; 224/8 bytes to copy
+       rjmp 1f                         ; shared copy loop in bmw256_ctx2hash
 
+/*******************************************************************************
+* void bmw256_ctx2hash(void* dest, const bmw256_ctx_t* ctx){
+*      memcpy(dest, &(ctx->h[8]), 256/8);
+* }
+*
+* param dest:  r24:r25
+* param ctx:   r22:r23
+*
+* The loop at local label 1 copies r22 bytes from Z to X; bmw224_ctx2hash
+* jumps into it with its own pointer and count. r22 must be non-zero on
+* entry to the loop because it is decremented before it is tested.
+*/
+.global bmw256_ctx2hash
+bmw256_ctx2hash:
+       movw r26, r24                   ; X = dest
+       movw r30, r22                   ; Z = ctx
+       adiw r30, 8*4                   ; Z = ctx + 32, the address of h[8]
+       ldi r22, 32                     ; 256/8 bytes to copy
+1:
+       ld r23, Z+                      ; byte-wise copy, r23 is the temporary
+       st X+, r23
+       dec r22
+       brne 1b
+       ret
index 5b3414587ecd75b22b254a8df44b9ab184043323..3ace5e2a894e5aebd27adc44947c9366e4958ec8 100644 (file)
  #define dump_x(a,b,c)
 #endif
 
-void bmw_small_f1(uint32_t* q, const void* m, const void* h);
-void bmw_small_f0(uint32_t* h, const void* m, uint32_t* q);
-void bmw_small_f2(uint32_t* h, uint32_t* q, const void* m);
-void bmw_small_nextBlock(bmw_small_ctx_t* ctx, const void* block);
-
-/*
-void bmw_small_nextBlock(bmw_small_ctx_t* ctx, const void* block){
-       uint32_t q[32];
-       dump_x(block, 16, 'M');
-       bmw_small_f0(ctx->h, block, q);
-       dump_x(q, 16, 'Q');
-       bmw_small_f1(q, block, ctx->h);
-       dump_x(q, 32, 'Q');
-       bmw_small_f2(ctx->h, q, block);
-       ctx->counter += 1;
-       ctx_dump(ctx);
-}
-*/
-
-void bmw_small_lastBlock(bmw_small_ctx_t* ctx, const void* block, uint16_t length_b){
-       struct {
-               uint8_t  buffer[64];
-               uint32_t ctr;
-       } pctx;
-       while(length_b >= BMW_SMALL_BLOCKSIZE){
-               bmw_small_nextBlock(ctx, block);
-               length_b -= BMW_SMALL_BLOCKSIZE;
-               block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
-       }
-       memset(pctx.buffer, 0, 64);
-       memcpy(pctx.buffer, block, (length_b+7)/8);
-       pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
-       if(length_b+1>64*8-64){
-               bmw_small_nextBlock(ctx, pctx.buffer);
-               memset(pctx.buffer, 0, 64-8);
-               ctx->counter -= 1;
-       }
-       *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
-       bmw_small_nextBlock(ctx, pctx.buffer);
-       uint8_t i;
-       memset(pctx.buffer, 0xaa, 64);
-       for(i=0; i<16;++i){
-               pctx.buffer[i*4] = i+0xa0;
-       }
-       bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
-       memcpy(ctx->h, pctx.buffer, 64);
-}
-
-void bmw224_init(bmw224_ctx_t* ctx){
-       uint8_t i;
-       ctx->h[0] = 0x00010203;
-       for(i=1; i<16; ++i){
-               ctx->h[i] = ctx->h[i-1]+ 0x04040404;
-       }
-       ctx->counter=0;
-//     ctx_dump(ctx);
-}
-
-void bmw256_init(bmw256_ctx_t* ctx){
-       uint8_t i;
-       ctx->h[0] = 0x40414243;
-       for(i=1; i<16; ++i){
-               ctx->h[i] = ctx->h[i-1]+ 0x04040404;
-       }
-       ctx->counter=0;
-//     ctx_dump(ctx);
-}
-
-void bmw224_nextBlock(bmw224_ctx_t* ctx, const void* block){
-       bmw_small_nextBlock(ctx, block);
-}
-
-void bmw256_nextBlock(bmw256_ctx_t* ctx, const void* block){
-       bmw_small_nextBlock(ctx, block);
-}
-
-void bmw224_lastBlock(bmw224_ctx_t* ctx, const void* block, uint16_t length_b){
-       bmw_small_lastBlock(ctx, block, length_b);
-}
-
-void bmw256_lastBlock(bmw256_ctx_t* ctx, const void* block, uint16_t length_b){
-       bmw_small_lastBlock(ctx, block, length_b);
-}
-
-void bmw224_ctx2hash(void* dest, const bmw224_ctx_t* ctx){
-       memcpy(dest, &(ctx->h[9]), 224/8);
-}
-
-void bmw256_ctx2hash(void* dest, const bmw256_ctx_t* ctx){
-       memcpy(dest, &(ctx->h[8]), 256/8);
-}
 
 void bmw224(void* dest, const void* msg, uint32_t length_b){
        bmw_small_ctx_t ctx;