X-Git-Url: https://git.cryptolib.org/?a=blobdiff_plain;f=bmw%2Fbmw_small-tinyasm.S;h=de7ff5a2f62360952f9700b0d7fdc3020cf5f7d0;hb=7bc75db2cff11a8a8b347e27dec3f4e019418d52;hp=764f2815edbfb2782c1843f7482ab1ef5d6771d7;hpb=c6a15ac3ba0c10bfb904be257a2fe5bda7b3dea3;p=avr-crypto-lib.git diff --git a/bmw/bmw_small-tinyasm.S b/bmw/bmw_small-tinyasm.S index 764f281..de7ff5a 100644 --- a/bmw/bmw_small-tinyasm.S +++ b/bmw/bmw_small-tinyasm.S @@ -122,41 +122,44 @@ eor_r22_in_r16: param s: r20 */ sn: - push_range 12, 20 + push_range 2, 5 + push r17 + push r19 ldi r30, lo8(s_table) ldi r31, hi8(s_table) lsl r20 lsl r20 add r30, r20 adc r31, r1 - movw r12, r22 - movw r14, r24 + movw r2, r22 + movw r4, r24 lpm r20, Z+ rcall shiftright32 movw r16, r22 movw r18, r24 ;--- - movw r22, r12 - movw r24, r14 + movw r22, r2 + movw r24, r4 lpm r20, Z+ rcall shiftleft32 rcall eor_r22_in_r16 ;--- - movw r22, r12 - movw r24, r14 + movw r22, r2 + movw r24, r4 lpm r20, Z+ rcall rotateleft32 rcall eor_r22_in_r16 ;--- - movw r22, r12 - movw r24, r14 + movw r22, r2 + movw r24, r4 lpm r20, Z+ rcall rotateleft32 - eor r22, r16 - eor r23, r17 - eor r24, r18 - eor r25, r19 - pop_range 12, 20 + rcall eor_r22_in_r16 + movw r22, r16 + movw r24, r18 + pop r19 + pop r17 + pop_range 2, 5 ret /******************************************************************************/ @@ -168,6 +171,7 @@ sn: memxor_short: ; tst r20 ; breq memxor_exit + ldi r20, 64 10: ld r21, X ld r22, Z+ eor r21, r22 @@ -185,42 +189,6 @@ h1 = 5 m0 = 6 m1 = 7 -add_hx_to_w: - movw r26, h0 - add r26, r16 - adc r27, r1 - ld r22, Y - ldd r23, Y+1 - ldd r24, Y+2 - ldd r25, Y+3 - lsl r20 - rol r21 - brcs 30f - /* addition */ - ld r0, X+ - add r22, r0 - ld r0, X+ - adc r23, r0 - ld r0, X+ - adc r24, r0 - ld r0, X+ - adc r25, r0 - rjmp 50f -30: /* substract */ - ld r0, X+ - sub r22, r0 - ld r0, X+ - sbc r23, r0 - ld r0, X+ - sbc r24, r0 - ld r0, X+ - sbc r25, r0 -50: - st Y+, r22 - st Y+, r23 - st Y+, r24 - st Y+, r25 - ret /******************************************************************************/ load32_from_X: @@ -236,6 +204,24 @@ load32_from_Y: ld r24, Y+ ld r25, Y+ ret + +store32_to_Y: + st Y+, r22 + st Y+, r23 + st Y+, r24 + st Y+, r25 + ret + +add_X_to_32: + ld r0, X+ + add r22, r0 + ld r0, X+ + adc r23, r0 + ld r0, X+ + adc r24, r0 + ld r0, X+ + adc r25, r0 + ret /******************************************************************************/ /* param q: r28:r29 (Y) @@ -258,131 +244,6 @@ f0_s_table: .byte 0,1,2,3,4 ; .byte 0 -f0: - movw h0, r26 - movw q0, r28 - movw m0, r30 -;--- DBG -; push_range 22, 25 -; movw r24, r26 -; ldi r22, 'H' -; rcall printX -; pop_range 22, 25 -;--- END DBG -;--- DBG -; push_range 22, 25 -; movw r24, r30 -; ldi r22, 'M' -; rcall printX -; pop_range 22, 25 -;--- END DBG - /* xor m into h */ - ldi r20, 64 - rcall memxor_short - movw r30, m0 - movw r26, h0 - - /* set q to zero */ - ldi r22, 64 -10: st Y+, r1 - dec r22 - brne 10b - movw r28, q0 - /* calculate W and store it in Q */ - ldi r19, 5 -30: - ldi r18, 16 - /* load initial index */ - ldi r30, lo8(f0_indextable-1) - ldi r31, hi8(f0_indextable-1) - add r30, r19 - adc r31, r1 - lpm r16, Z - /* load values from hacktable */ - ldi r30, lo8(f0_hacktable-2) - ldi r31, hi8(f0_hacktable-2) - lsl r19 - add r30, r19 - adc r31, r1 - lsr r19 - lpm r21, Z+ - lpm r20, Z -40: - call add_hx_to_w - subi r16, -4 - andi r16, 0x0f<<2 - dec r18 - brne 40b - movw r28, q0 - dec r19 - brne 30b - movw r26, h0 -;--- DBG -; push_range 22, 25 -; movw r24, r28 -; ldi r22, 'W' -; rcall printX -; pop_range 22, 25 -;--- END DBG - /* xor m into h */ - ldi r20, 64 - movw r26, h0 - movw r30, m0 - rcall memxor_short - sbiw r26, 60 -;--- - ldi r30, lo8(f0_s_table) - ldi r31, hi8(f0_s_table) - ldi r21, 15 - mov r8, r21 -50: - ldd r22, Y+0 - ldd r23, Y+1 - ldd r24, Y+2 - ldd r25, Y+3 - lpm r20, Z+ - movw r2, r30 - rcall sn - movw r30, r2 - - ld r0, X+ - add r22, r0 - ld r0, X+ - adc r23, r0 - ld r0, X+ - adc r24, r0 - ld r0, X+ - adc r25, r0 - - st Y+, r22 - st Y+, r23 - st Y+, r24 - st Y+, r25 - dec r8 - brne 50b -;--- - ldd r22, Y+0 - ldd r23, Y+1 - ldd r24, Y+2 - ldd r25, Y+3 - clr r20 - rcall sn - movw r30, r2 - movw r26, h0 - ld r0, X+ - add r22, r0 - ld r0, X+ - adc r23, r0 - ld r0, X+ - adc r24, r0 - ld r0, X+ - adc r25, r0 - sbiw r26, 4 - st Y+, r22 - st Y+, r23 - st Y+, r24 - st Y+, r25 - ret /******************************************************************************/ @@ -460,10 +321,7 @@ load_rotate_add_M: movw r26, m0 add r26, r0 adc r27, r1 - ld r22, X+ - ld r23, X+ - ld r24, X+ - ld r25, X+ + rcall load32_from_X inc r20 rcall rotateleft32 brts 10f @@ -549,13 +407,8 @@ expand1: inc r19 cpi r19, 17 brne 10b -expand1_exit: -; adiw r26, 63 - st X+, acc0 - st X+, acc1 - st X+, acc2 - st X+, acc3 - ret + rjmp expand2_exit + /******************************************************************************/ /* @@ -581,16 +434,20 @@ expand2: rcall add32_to_acc dec r19 brne 10b - rcall load32_from_X ldi r20, 4 + rcall load32_from_X rcall sn rcall add32_to_acc - rcall load32_from_X ldi r20, 5 + rcall load32_from_X rcall sn rcall add32_to_acc - - rjmp expand1_exit +expand2_exit: + st X+, acc0 + st X+, acc1 + st X+, acc2 + st X+, acc3 + ret /******************************************************************************/ /* @@ -604,13 +461,193 @@ expand2: param h: r20:r21 param j: r24 */ + +/******************************************************************************/ +/* + param q: r24:r25 + param m: r22:r23 + param h: r20:r21 +*/ +f2_1_shift_table: + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x2B, 0x64, 0x66, 0x03, 0x51, 0x55, 0x87, 0x55 +f2_2_shift_table: + .byte (2<<1), (7<<1), (4<<1), (3<<1), (4<<1)+1, (6<<1)+1, (6<<1) + .byte 0 ; just for alignment + +/******************************************************************************/ +/* + param ctx: r24:r25 + param msg: r22:r23 +*/ +/* f0 + param q: r28:r29 (Y) + param h: r26:r27 (X) + param m: r30:r31 (Z) +*/ +/* f1 + param q: r24:r25 + param m: r22:r23 + param h: r20:r21 +*/ +/* f2 + param q: r24:r25 + param m: r22:r23 + param h: r20:r21 +*/ +q0 = 2 +q1 = 3 +h0 = 4 +h1 = 5 +m0 = 6 +m1 = 7 + + +.global bmw_small_nextBlock +.global bmw224_nextBlock +.global bmw256_nextBlock +bmw_small_nextBlock: +bmw224_nextBlock: +bmw256_nextBlock: + push_range 28, 29 + push_range 2, 17 + stack_alloc_large 32*4, r28, r29 + adiw r28, 1 +; push_range 28, 29 /* push Q */ +; push_range 22, 25 /* push M & H */ + /* increment counter */ + movw r26, r24 + movw r2, r26 + adiw r26, 63 + adiw r26, 1 + rcall load_acc_from_X + ldi r19, 1 + add acc0, r19 + adc acc1, r1 + adc acc2, r1 + adc acc3, r1 + st -X, acc3 + st -X, acc2 + st -X, acc1 + st -X, acc0 + /* call f0 */ + movw r30, r22 + movw r26, r24 +f0: + movw h0, r26 + movw q0, r28 + movw m0, r30 + /* xor m into h */ +; ldi r20, 64 + rcall memxor_short + movw r30, m0 + movw r26, h0 + + /* set q to zero */ + ldi r22, 64 +10: st Y+, r1 + dec r22 + brne 10b + movw r28, q0 + /* calculate W and store it in Q */ + ldi r19, 5 +30: + ldi r18, 16 + /* load initial index */ + ldi r30, lo8(f0_indextable-1) + ldi r31, hi8(f0_indextable-1) + add r30, r19 + adc r31, r1 + lpm r16, Z + /* load values from hacktable */ + ldi r30, lo8(f0_hacktable-2) + ldi r31, hi8(f0_hacktable-2) + lsl r19 + add r30, r19 + adc r31, r1 + lsr r19 + lpm r21, Z+ + lpm r20, Z +40: + ;call add_hx_to_w +add_hx_to_w: + movw r26, h0 + add r26, r16 + adc r27, r1 + rcall load32_from_Y + sbiw r28, 4 + lsl r20 + rol r21 + brcs 300f + /* addition */ + rcall add_X_to_32 + rjmp 500f +300: /* substract */ + ld r0, X+ + sub r22, r0 + ld r0, X+ + sbc r23, r0 + ld r0, X+ + sbc r24, r0 + ld r0, X+ + sbc r25, r0 +500: + rcall store32_to_Y + subi r16, -4 + andi r16, 0x0f<<2 + dec r18 + brne 40b + movw r28, q0 + dec r19 + brne 30b + movw r26, h0 + /* xor m into h */ +; ldi r20, 64 + movw r26, h0 + movw r30, m0 + rcall memxor_short + sbiw r26, 60 +;--- + ldi r30, lo8(f0_s_table) + ldi r31, hi8(f0_s_table) + ldi r21, 15 + mov r8, r21 +50: + rcall load32_from_Y + sbiw r28, 4 + lpm r20, Z+ + movw r2, r30 + rcall sn + movw r30, r2 + + rcall add_X_to_32 + rcall store32_to_Y + + dec r8 + brne 50b +;--- + rcall load32_from_Y + clr r20 + rcall sn + movw r30, r2 + movw r26, h0 + rcall add_X_to_32 + sbiw r26, 4 + st -Y, r25 + st -Y, r24 + st -Y, r23 + st -Y, r22 + sbiw r28, 15*4 + movw r20, h0 + movw r22, m0 + + /* call f1*/ + movw r24, r28 f1: movw r2, r24 movw r4, r22 movw r6, r20 movw r26, r2 -; movw r22, r4 -; movw r20, r6 clr r24 rcall expand1 movw r26, r2 @@ -627,20 +664,17 @@ f1: inc r17 sbrs r17, 4 rjmp 10b - ret + movw r24, r2 + movw r22, r4 + movw r20, r6 -/******************************************************************************/ -/* - param q: r24:r25 - param m: r22:r23 - param h: r20:r21 -*/ -f2_1_shift_table: - .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 - .byte 0x2B, 0x64, 0x66, 0x03, 0x51, 0x55, 0x87, 0x55 -f2_2_shift_table: - .byte (2<<1), (7<<1), (4<<1), (3<<1), (4<<1)+1, (6<<1)+1, (6<<1) - .byte 0 ; just for alignment + + /* call f2 */ +; pop_range 20, 25 +; push_range 20, 25 +; rcall printQ +; push r20 +; push r21 acc2 = 8 acc3 = 9 acc0 = 14 @@ -663,35 +697,23 @@ f2: adiw r26, 63 adiw r26, 1 movw q16_0, r26 - clr xl0 - clr xl1 - clr xl2 - clr xl3 - ldi r17, 8 -10: ld r0, X+ - eor xl0, r0 - ld r0, X+ - eor xl1, r0 - ld r0, X+ - eor xl2, r0 - ld r0, X+ - eor xl3, r0 - dec r17 - brne 10b -;--- /* calc XH */ - movw xh0, xl0 - movw xh2, xl2 - ldi r17, 8 -10: ld r0, X+ - eor xh0, r0 - ld r0, X+ - eor xh1, r0 - ld r0, X+ - eor xh2, r0 - ld r0, X+ - eor xh3, r0 + movw h0, r20 + movw r28, r22 + rcall load32_from_X + movw acc0, r22 + movw acc2, r24 + ldi r17, 15 +10: rcall load32_from_X + rcall eor32_to_acc + cpi r17, 9 + brne 15f + movw xl0, acc0 + movw xl2, acc2 +15: dec r17 brne 10b + movw xh0, acc0 + movw xh2, acc2 ;--- DBG ; push_range 22, 25 ; movw r22, xl0 @@ -704,8 +726,6 @@ f2: ;--- END DBG ;--- /* calc first half of h0..h15 */ - movw h0, r20 - movw r28, r22 movw r26, q16_0 ldi r17, 16 10: @@ -822,83 +842,11 @@ f2: rcall add_acc_to_Z inc r18 cpi r17, 5 - breq 20f - dec r17 + brne 20f + sbiw r26, 8*4 +20: dec r17 brne 10b - ret -20: sbiw r26, 8*4 - dec r17 - rjmp 10b -/******************************************************************************/ -/* - param ctx: r24:r25 - param msg: r22:r23 -*/ -/* f0 - param q: r28:r29 (Y) - param h: r26:r27 (X) - param m: r30:r31 (Z) -*/ -/* f1 - param q: r24:r25 - param m: r22:r23 - param h: r20:r21 -*/ -/* f2 - param q: r24:r25 - param m: r22:r23 - param h: r20:r21 -*/ -.global bmw_small_nextBlock -.global bmw224_nextBlock -.global bmw256_nextBlock -bmw_small_nextBlock: -bmw224_nextBlock: -bmw256_nextBlock: - push_range 28, 29 - push_range 2, 17 - stack_alloc_large 32*4, r28, r29 - adiw r28, 1 - push_range 28, 29 /* push Q */ - push_range 22, 25 /* push M & H */ - /* increment counter */ - movw r26, r24 - movw r2, r26 - adiw r26, 63 - adiw r26, 1 - rcall load_acc_from_X - ldi r19, 1 - add acc0, r19 - adc acc1, r1 - adc acc2, r1 - adc acc3, r1 - st -X, acc0 - st -X, acc1 - st -X, acc2 - st -X, acc3 - /* call f0 */ - movw r30, r22 - movw r26, r24 - rcall f0 - /* call f1*/ - pop r21 - pop r20 - pop r23 - pop r22 - pop r25 - pop r24 -; rcall printQ - push_range 20, 25 - rcall f1 - /* call f2 */ -; pop_range 20, 25 -; push_range 20, 25 -; rcall printQ - pop_range 20, 25 -; push r20 -; push r21 - call f2 ;--- DBG ; pop r25 ; pop r24 @@ -911,11 +859,6 @@ bmw256_nextBlock: ret /******************************************************************************/ -/* - param ctx: r24:r25 - param msg: r22:r23 - param len: r20:r21 -*/ ctx0 = 2 ctx1 = 3 blc0 = 4 @@ -925,6 +868,22 @@ len1 = 29 buf0 = 6 buf1 = 7 +load32_from_Z_stub: + movw r30, ctx0 + adiw r30, 60 + ldd r21, Z+4 + ldd r22, Z+5 + ldd r23, Z+6 + ldd r24, Z+7 + ret + +/******************************************************************************/ +/* + param ctx: r24:r25 + param msg: r22:r23 + param len: r20:r21 +*/ + .global bmw_small_lastBlock .global bmw224_lastBlock .global bmw256_lastBlock @@ -1019,12 +978,7 @@ bmw256_lastBlock: st X+, r1 dec r20 brne 350b - movw r30, ctx0 - adiw r30, 60 - ldd r21, Z+4 - ldd r22, Z+5 - ldd r23, Z+6 - ldd r24, Z+7 + rcall load32_from_Z_stub subi r21, 1 sbc r22, r1 sbc r23, r1 @@ -1034,12 +988,7 @@ bmw256_lastBlock: bmw_small_nextBlock(ctx, pctx.buffer); */ 400: - movw r30, ctx0 - adiw r30, 60 - ldd r21, Z+4 - ldd r22, Z+5 - ldd r23, Z+6 - ldd r24, Z+7 + rcall load32_from_Z_stub 410: clr r25 lsl r21 @@ -1194,19 +1143,20 @@ ctx0 = 2 ctx1 = 3 msg0 = 4 msg1 = 5 -len0 = 6 -len1 = 7 +len0 = 28 +len1 = 29 len2 = 8 len3 = 9 -dst0 = 10 -dst1 = 11 +dst0 = 6 +dst1 = 7 .global bmw224 bmw224: push r16 clr r16 bmw_small_all: - push_range 2, 11 + push_range 2, 9 + push_range 28, 29 stack_alloc_large 64+4 adiw r30, 1 movw ctx0, r30 @@ -1227,8 +1177,7 @@ bmw_small_all: movw r24, ctx0 movw r22, msg0 rcall bmw_small_nextBlock - ldi r20, 2 - sub len1, r20 + subi len1, 2 sbc len2, r1 sbc len3, r1 ldi r20, 64 @@ -1248,7 +1197,8 @@ bmw_small_all: adc r31, r1 icall stack_free_large 64+4 - pop_range 2, 11 + pop_range 28, 29 + pop_range 2, 9 pop r16 ret