X-Git-Url: https://git.cryptolib.org/?a=blobdiff_plain;f=bmw%2Fbmw_small-tinyasm.S;h=50e110d82d9635c14a735ee176cc30d6e919b92f;hb=0747bb9f3d1759c0b71a0cff3387835db9833d8e;hp=7cdbadb59c4ebe93093a5bd09e594aee1c0405a2;hpb=e1455a81c4a27673cbe64d4615c63e2a591deade;p=avr-crypto-lib.git diff --git a/bmw/bmw_small-tinyasm.S b/bmw/bmw_small-tinyasm.S index 7cdbadb..50e110d 100644 --- a/bmw/bmw_small-tinyasm.S +++ b/bmw/bmw_small-tinyasm.S @@ -28,6 +28,11 @@ #include "avr-asm-macros.S" +acc2 = 8 +acc3 = 9 +acc0 = 14 +acc1 = 15 + /******************************************************************************/ /* param a: r22:r23:r24:r25 @@ -91,6 +96,7 @@ bitrotateleft_1: breq 20f 10: lsl r0 +rol32: rol r22 rol r23 rol r24 @@ -102,6 +108,18 @@ bitrotateleft_1: /******************************************************************************/ +sn_stub: + movw r22, r2 + movw r24, r4 + lpm r20, Z+ + rcall rotateleft32 +eor32_to_acc: + eor acc0, r22 + eor acc1, r23 + eor acc2, r24 + eor acc3, r25 + ret + s_table: s0: .byte 1, 3, 4,19 s1: .byte 1, 2, 8,23 @@ -109,22 +127,11 @@ s2: .byte 2, 1,12,25 s3: .byte 2, 2,15,29 s4: .byte 1, 0, 0, 0 s5: .byte 2, 0, 0, 0 -/* -s0: .byte 0x34, 19 -s1: .byte 0x28, 23 -s2: .byte 0x9C, 25 -s3: .byte 0xAF, 29 -s4: .byte 0x00, 0 -s5: .byte 0x80, 0 -*/ -acc2 = 8 -acc3 = 9 + h0 = 10 h1 = 11 m0 = 12 m1 = 13 -acc0 = 14 -acc1 = 15 /* param x: r22:r23:r24:25 @@ -146,8 +153,7 @@ sn: movw r4, r24 lpm r20, Z+ rcall shiftright32 - movw acc0, r22 - movw acc2, r24 + rcall mov32_to_acc ;--- movw r22, r2 movw r24, r4 @@ -155,25 +161,16 @@ sn: rcall shiftleft32 rcall eor32_to_acc ;--- - movw r22, r2 - movw r24, r4 - lpm r20, Z+ - rcall rotateleft32 - rcall eor32_to_acc -;--- - movw r22, r2 - movw r24, r4 - lpm r20, Z+ - rcall rotateleft32 - rcall eor32_to_acc + rcall sn_stub + rcall sn_stub + movw r22, acc0 movw r24, acc2 pop acc3 pop acc2 pop acc1 pop acc0 - pop_range 2, 5 - ret + rjmp pop5 /******************************************************************************/ /* @@ -236,13 +233,6 @@ add_X_to_32: adc r25, r0 ret -store_acc_to_dec_X: - st -X, acc3 - st -X, acc2 - st -X, acc1 - st -X, acc0 - ret - store32_to_X: st X+, r22 st X+, r23 @@ -250,6 +240,16 @@ store32_to_X: st X+, r25 ret +mov32_to_acc: + movw acc0, r22 + movw acc2, r24 + ret + +eor_acc_from_Y_add_to_Z: + rcall load32_from_Y + rcall eor32_to_acc + rjmp add_acc_to_Z + /******************************************************************************/ /* param q: r28:r29 (Y) @@ -290,20 +290,6 @@ m1 = 13 acc0 = 14 acc1 = 15 -add32_to_acc: - add acc0, r22 - adc acc1, r23 - adc acc2, r24 - adc acc3, r25 - ret - -eor32_to_acc: - eor acc0, r22 - eor acc1, r23 - eor acc2, r24 - eor acc3, r25 - ret - load_acc_from_X: ld acc0, X+ ld acc1, X+ @@ -327,6 +313,7 @@ add_acc_to_Z: ret load_rotate_add_M: + mov r20, j andi r20, 0x0f mov r0, r20 lsl r0 @@ -346,6 +333,30 @@ load_rotate_add_M: sbc acc3, r25 ret + +;--- + +/******************************************************************************/ +load_sn_add: + rcall load32_from_X + rcall sn +add32_to_acc: + add acc0, r22 + adc acc1, r23 + adc acc2, r24 + adc acc3, r25 + ret + +/* + param q: r26:r27 + param m: r22:r23 + param h: r20:r21 + param j: r24 +*/ + +expand_intro: + push_range 26, 27 + push r24 addelement: mov j, r24 movw h0, r20 @@ -361,46 +372,22 @@ addelement: rcall store_acc_to_dec_X adiw r26, 4 clt - mov r20, j rcall load_rotate_add_M - mov r20, j - subi r20, -3 + subi j, -3 rcall load_rotate_add_M - mov r20, j set - subi r20, -10 + subi j, -7 rcall load_rotate_add_M lsl j lsl j - subi j, -7*4 + subi j, -7*4+10*4 andi j, 0x3f movw r26, h0 add r26, j adc r27, r1 rcall load32_from_X rcall eor32_to_acc -;--- - ret - -/******************************************************************************/ -load_sn_add: - rcall load32_from_X - rcall sn - rjmp add32_to_acc -; ret - -/* - param q: r26:r27 - param m: r22:r23 - param h: r20:r21 - param j: r24 -*/ - -expand_intro: - - push_range 26, 27 - push r24 - rcall addelement +;-- pop r24 pop_range 26, 27 lsl r24 @@ -458,8 +445,12 @@ expand2: rcall load_sn_add expand2_exit: adiw r26, 4 - rjmp store_acc_to_dec_X -; ret +store_acc_to_dec_X: + st -X, acc3 + st -X, acc2 + st -X, acc1 + st -X, acc0 + ret /******************************************************************************/ /* @@ -507,16 +498,28 @@ h0 = 4 h1 = 5 m0 = 6 m1 = 7 +ctx0 = 2 +ctx1 = 3 +msg0 = 4 +msg1 = 5 - +restore_f1: + movw r26, r2 + movw r22, r4 + movw r20, r6 + ret +bmw_small_nextBlock_early: + movw r24, ctx0 + movw r22, msg0 .global bmw_small_nextBlock .global bmw224_nextBlock .global bmw256_nextBlock bmw_small_nextBlock: bmw224_nextBlock: bmw256_nextBlock: + push_range 2, 7 push_range 28, 29 - push_range 2, 17 + push_range 8, 17 stack_alloc_large 32*4, r28, r29 ldi r16, 0x4f push r16 @@ -654,23 +657,18 @@ f1: movw r26, r2 clr r24 rcall expand1 - movw r26, r2 - movw r22, r4 - movw r20, r6 + rcall restore_f1 ldi r24, 1 rcall expand1 ldi r17, 2 -10: movw r26, r2 - movw r22, r4 - movw r20, r6 +10: rcall restore_f1 mov r24, r17 rcall expand2 inc r17 sbrs r17, 4 rjmp 10b + rcall restore_f1 movw r24, r2 - movw r22, r4 - movw r20, r6 /* call f2 */ @@ -704,8 +702,7 @@ f2: movw h0, r20 movw r28, r22 rcall load32_from_X - movw acc0, r22 - movw acc2, r24 + rcall mov32_to_acc ldi r17, 15 10: rcall load32_from_X rcall eor32_to_acc @@ -733,10 +730,8 @@ f2: movw r26, q16_0 ldi r17, 16 10: - ld acc0, Y+ - ld acc1, Y+ - ld acc2, Y+ - ld acc3, Y+ + rcall load32_from_Y + rcall mov32_to_acc ;--- movw r22, xh0 movw r24, xh2 @@ -795,9 +790,7 @@ f2: movw acc2, xl2 rcall load32_from_X rcall eor32_to_acc - rcall load32_from_Y - rcall eor32_to_acc - rcall add_acc_to_Z + rcall eor_acc_from_Y_add_to_Z dec r17 brne 10b sbiw r26, 9*4 /* X points to q[23] */ @@ -805,9 +798,7 @@ f2: eor acc1, xl0 eor acc2, xl1 eor acc3, xl2 - rcall load32_from_Y - rcall eor32_to_acc - rcall add_acc_to_Z + rcall eor_acc_from_Y_add_to_Z ;--- sbiw r26, 8*4 /* X points to q[16] */ mov h0, r30 @@ -827,11 +818,9 @@ f2: rjmp 21f 20: rcall shiftright32 21: - rcall eor32_to_acc - rcall load32_from_Y - rcall eor32_to_acc movw r30, h0 - rcall add_acc_to_Z + rcall eor32_to_acc + rcall eor_acc_from_Y_add_to_Z movw h0, r30 dec r17 brne 10b @@ -845,8 +834,7 @@ f2: rcall load32_from_X mov r20, r18 rcall rotateleft32 - movw acc0, r22 - movw acc2, r24 + rcall mov32_to_acc rcall add_acc_to_Z inc r18 cpi r17, 5 @@ -862,8 +850,15 @@ f2: ; rcall printX ;--- END DBG stack_free_large3 32*4+4 - pop_range 2, 17 + pop_range 10, 17 +pop9: + pop_range 8, 9 +pop28: pop_range 28, 29 +pop7: + pop_range 6, 7 +pop5: + pop_range 2, 5 ret /******************************************************************************/ @@ -912,9 +907,7 @@ bmw256_lastBlock: 1: cpi len1, hi8(512) brlo 2f - movw r24, ctx0 - movw r22, blc0 - rcall bmw_small_nextBlock + rcall bmw_small_nextBlock_early ldi r24, 64 add blc0, r24 adc blc1, r1 @@ -976,9 +969,8 @@ bmw256_lastBlock: breq 400f cpi len0, 192 brlo 400f - movw r24, ctx0 - movw r22, buf0 - rcall bmw_small_nextBlock + movw blc0, buf0 + rcall bmw_small_nextBlock_early movw r26, buf0 ldi r20, 64-8 350: @@ -998,30 +990,24 @@ bmw256_lastBlock: rcall load32_from_Z_stub 410: clr r25 + ldi r20, 1 lsl r21 - rol r22 - rol r23 - rol r24 - rol r25 + rcall rol32 mov r20, len0 add r21, len1 adc r22, r1 adc r23, r1 adc r24, r1 adc r25, r1 - movw r30, buf0 - adiw r30, 64-8 - st Z+, r20 - st Z+, r21 - st Z+, r22 - st Z+, r23 - st Z+, r24 - st Z+, r25 - st Z+, r1 - st Z+, r1 - movw r24, ctx0 - movw r22, buf0 - rcall bmw_small_nextBlock + movw r26, buf0 + adiw r26, 64-8 + st X+, r20 + st X+, r21 + rcall store32_to_X + st X+, r1 + st X+, r1 + movw blc0, buf0 + rcall bmw_small_nextBlock_early /* memset(pctx.buffer, 0xaa, 64); for(i=0; i<16;++i){ pctx.buffer[i*4] = i+0xa0; @@ -1040,9 +1026,9 @@ bmw256_lastBlock: /* bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h); memcpy(ctx->h, pctx.buffer, 64); */ - movw r24, buf0 - movw r22, ctx0 - rcall bmw_small_nextBlock + movw r24, buf0 + movw r22, ctx0 + rcall bmw_small_nextBlock ldi r18, 64 movw r26, ctx0 movw r30, buf0 @@ -1053,9 +1039,7 @@ bmw256_lastBlock: brne 600b stack_free_large 68 - pop_range 28, 29 - pop_range 2, 7 - ret + rjmp pop28 /******************************************************************************* @@ -1068,10 +1052,9 @@ bmw256_lastBlock: */ .global bmw224_ctx2hash bmw224_ctx2hash: - movw r26, r24 movw r30, r22 adiw r30, 9*4 - ldi r22, 28 + ldi r18, 28 rjmp 1f /******************************************************************************* @@ -1084,14 +1067,13 @@ bmw224_ctx2hash: */ .global bmw256_ctx2hash bmw256_ctx2hash: - movw r26, r24 movw r30, r22 adiw r30, 8*4 - ldi r22, 32 -1: - ld r23, Z+ + ldi r18, 32 +1: movw r26, r24 +1: ld r23, Z+ st X+, r23 - dec r22 + dec r18 brne 1b ret @@ -1124,10 +1106,10 @@ dst0 = 10 dst1 = 11 .global bmw256 bmw256: - push r16 - ldi r16, 1 + set rjmp bmw_small_all + /******************************************************************************* * void bmw224(void* dest, const void* msg, uint32_t length_b){ * bmw_small_ctx_t ctx; @@ -1157,15 +1139,21 @@ dst0 = 6 dst1 = 7 .global bmw224 bmw224: - push r16 - clr r16 + clt + rjmp bmw_small_all + bmw_small_all: - push_range 2, 9 + push_range 2, 7 push_range 28, 29 + push_range 8, 9 + push r16 stack_alloc_large 64+4 adiw r30, 1 - movw ctx0, r30 + clr r16 + brtc 10f + inc r16 +10: movw ctx0, r30 movw dst0, r24 movw msg0, r22 movw len0, r18 @@ -1180,9 +1168,7 @@ bmw_small_all: mov r18, len2 or r18, len3 breq 50f - movw r24, ctx0 - movw r22, msg0 - rcall bmw_small_nextBlock + rcall bmw_small_nextBlock_early subi len1, 2 sbc len2, r1 sbc len3, r1 @@ -1203,10 +1189,8 @@ bmw_small_all: adc r31, r1 icall stack_free_large 64+4 - pop_range 28, 29 - pop_range 2, 9 pop r16 - ret + rjmp pop9 init_lut: rjmp bmw224_init @@ -1229,36 +1213,30 @@ c2h_lut: */ .global bmw224_init bmw224_init: - movw r26, r24 - ldi r22, 0x03 - ldi r23, 0x02 - ldi r24, 0x01 - ldi r25, 0x00 + ldi r22, 0x00 + ldi r23, 0x40 bmw_small_init: - rcall store32_to_X - ldi r18, 16-1 - ldi r20, 0x04 -1: - add r22, r20 - adc r23, r20 - adc r24, r20 - adc r25, r20 - rcall store32_to_X - dec r18 - brne 1b - st X+, r1 - st X+, r1 - st X+, r1 - st X+, r1 + movw r26, r24 + adiw r26, 4 +10: + st -X, r22 + inc r22 + mov r20, r22 + andi r20, 0x3 + brne 10b + adiw r26, 8 +20: cp r22, r23 + brne 10b + st -X, r1 + st -X, r1 + st -X, r1 + st -X, r1 ret .global bmw256_init bmw256_init: - movw r26, r24 - ldi r22, 0x43 - ldi r23, 0x42 - ldi r24, 0x41 - ldi r25, 0x40 + ldi r22, 0x40 + ldi r23, 0x80 rjmp bmw_small_init