X-Git-Url: https://git.cryptolib.org/?p=avr-crypto-lib.git;a=blobdiff_plain;f=bmw%2Fbmw_small-tinyasm.S;h=ad4ee95ad1621378b484c646ed26f94df30af481;hp=d0f8530ffd160099578572974021e8aa8a591975;hb=eb0cafe05ab4cdf60878dbd81e4ff3712d5150f2;hpb=267cb0a0d12700ecfdc3f9ab005b61e48691f6a2 diff --git a/bmw/bmw_small-tinyasm.S b/bmw/bmw_small-tinyasm.S index d0f8530..ad4ee95 100644 --- a/bmw/bmw_small-tinyasm.S +++ b/bmw/bmw_small-tinyasm.S @@ -28,12 +28,24 @@ #include "avr-asm-macros.S" +acc2 = 8 +acc3 = 9 +acc0 = 14 +acc1 = 15 + +#define DEBUG 0 + /******************************************************************************/ /* param a: r22:r23:r24:r25 param s: r20 */ shiftleft32: + tst r20 + brpl 10f + neg r20 + rjmp shiftright32 +10: clr r0 cpi r20, 8 brlo bitrotateleft_1 @@ -42,7 +54,7 @@ shiftleft32: mov r23, r22 clr r22 subi r20, 8 - rjmp shiftleft32 + rjmp 10b /******************************************************************************/ /* @@ -91,6 +103,7 @@ bitrotateleft_1: breq 20f 10: lsl r0 +rol32: rol r22 rol r23 rol r24 @@ -102,6 +115,18 @@ bitrotateleft_1: /******************************************************************************/ +sn_stub: + movw r22, r2 + movw r24, r4 + lpm r20, Z+ + rcall rotateleft32 +eor32_to_acc: + eor acc0, r22 + eor acc1, r23 + eor acc2, r24 + eor acc3, r25 + ret + s_table: s0: .byte 1, 3, 4,19 s1: .byte 1, 2, 8,23 @@ -110,54 +135,49 @@ s3: .byte 2, 2,15,29 s4: .byte 1, 0, 0, 0 s5: .byte 2, 0, 0, 0 -eor_r22_in_r16: - eor r16, r22 - eor r17, r23 - eor r18, r24 - eor r19, r25 - ret +h0 = 10 +h1 = 11 +m0 = 12 +m1 = 13 /* param x: r22:r23:r24:25 param s: r20 */ sn: - push_range 12, 20 + push_range 2, 5 + push acc0 + push acc1 + push acc2 + push acc3 ldi r30, lo8(s_table) ldi r31, hi8(s_table) lsl r20 lsl r20 add r30, r20 adc r31, r1 - movw r12, r22 - movw r14, r24 + movw r2, r22 + movw r4, r24 lpm r20, Z+ rcall shiftright32 - movw r16, r22 - movw r18, r24 + rcall mov32_to_acc ;--- - movw r22, r12 - movw r24, r14 + movw r22, r2 + movw r24, r4 lpm r20, Z+ rcall shiftleft32 - rcall eor_r22_in_r16 -;--- - movw r22, r12 - movw r24, r14 - lpm r20, Z+ - rcall rotateleft32 - rcall eor_r22_in_r16 + rcall eor32_to_acc ;--- - movw r22, r12 - movw r24, r14 - lpm r20, Z+ - rcall rotateleft32 - eor r22, r16 - eor r23, r17 - eor r24, r18 - eor r25, r19 - pop_range 12, 20 - ret + rcall sn_stub + rcall sn_stub + + movw r22, acc0 + movw r24, acc2 + pop acc3 + pop acc2 + pop acc1 + pop acc0 + rjmp pop5 /******************************************************************************/ /* @@ -165,9 +185,11 @@ sn: param src: r30:r31 (Z) param len: r20 */ -memxor_short: +memxor_64: ; tst r20 ; breq memxor_exit + ldi r20, 64 +memxor: 10: ld r21, X ld r22, Z+ eor r21, r22 @@ -185,42 +207,6 @@ h1 = 5 m0 = 6 m1 = 7 -add_hx_to_w: - movw r26, h0 - add r26, r16 - adc r27, r1 - ld r22, Y - ldd r23, Y+1 - ldd r24, Y+2 - ldd r25, Y+3 - lsl r20 - rol r21 - brcs 30f - /* addition */ - ld r0, X+ - add r22, r0 - ld r0, X+ - adc r23, r0 - ld r0, X+ - adc r24, r0 - ld r0, X+ - adc r25, r0 - rjmp 50f -30: /* substract */ - ld r0, X+ - sub r22, r0 - ld r0, X+ - sbc r23, r0 - ld r0, X+ - sbc r24, r0 - ld r0, X+ - sbc r25, r0 -50: - st Y+, r22 - st Y+, r23 - st Y+, r24 - st Y+, r25 - ret /******************************************************************************/ load32_from_X: @@ -236,139 +222,15 @@ load32_from_Y: ld r24, Y+ ld r25, Y+ ret -/******************************************************************************/ -/* - param q: r28:r29 (Y) - param h: r26:r27 (X) - param m: r30:r31 (Z) -*/ - -f0_hacktable: - .byte 0x03, 0x11 - .byte 0xDD, 0xB3 - .byte 0x2A, 0x79 - .byte 0x07, 0xAA - .byte 0x51, 0xC2 -f0_indextable: - .byte 5*4,7*4,10*4,13*4,14*4 -; .byte 0 ; just for alignment -f0_s_table: - .byte 0,1,2,3,4 - .byte 0,1,2,3,4 - .byte 0,1,2,3,4 -; .byte 0 - -f0: - movw h0, r26 - movw q0, r28 - movw m0, r30 -;--- DBG -; push_range 22, 25 -; movw r24, r26 -; ldi r22, 'H' -; rcall printX -; pop_range 22, 25 -;--- END DBG -;--- DBG -; push_range 22, 25 -; movw r24, r30 -; ldi r22, 'M' -; rcall printX -; pop_range 22, 25 -;--- END DBG - /* xor m into h */ - ldi r20, 64 - rcall memxor_short - movw r30, m0 - movw r26, h0 - - /* set q to zero */ - ldi r22, 64 -10: st Y+, r1 - dec r22 - brne 10b - movw r28, q0 - /* calculate W and store it in Q */ - ldi r19, 5 -30: - ldi r18, 16 - /* load initial index */ - ldi r30, lo8(f0_indextable-1) - ldi r31, hi8(f0_indextable-1) - add r30, r19 - adc r31, r1 - lpm r16, Z - /* load values from hacktable */ - ldi r30, lo8(f0_hacktable-2) - ldi r31, hi8(f0_hacktable-2) - lsl r19 - add r30, r19 - adc r31, r1 - lsr r19 - lpm r21, Z+ - lpm r20, Z -40: - call add_hx_to_w - subi r16, -4 - andi r16, 0x0f<<2 - dec r18 - brne 40b - movw r28, q0 - dec r19 - brne 30b - movw r26, h0 -;--- DBG -; push_range 22, 25 -; movw r24, r28 -; ldi r22, 'W' -; rcall printX -; pop_range 22, 25 -;--- END DBG - /* xor m into h */ - ldi r20, 64 - movw r26, h0 - movw r30, m0 - rcall memxor_short - sbiw r26, 60 -;--- - ldi r30, lo8(f0_s_table) - ldi r31, hi8(f0_s_table) - ldi r21, 15 - mov r8, r21 -50: - ldd r22, Y+0 - ldd r23, Y+1 - ldd r24, Y+2 - ldd r25, Y+3 - lpm r20, Z+ - movw r2, r30 - rcall sn - movw r30, r2 - - ld r0, X+ - add r22, r0 - ld r0, X+ - adc r23, r0 - ld r0, X+ - adc r24, r0 - ld r0, X+ - adc r25, r0 +store32_to_Y: st Y+, r22 st Y+, r23 st Y+, r24 st Y+, r25 - dec r8 - brne 50b -;--- - ldd r22, Y+0 - ldd r23, Y+1 - ldd r24, Y+2 - ldd r25, Y+3 - clr r20 - rcall sn - movw r30, r2 - movw r26, h0 + ret + +add_X_to_32: ld r0, X+ add r22, r0 ld r0, X+ @@ -377,23 +239,47 @@ f0: adc r24, r0 ld r0, X+ adc r25, r0 - sbiw r26, 4 - st Y+, r22 - st Y+, r23 - st Y+, r24 - st Y+, r25 + ret + +store32_to_X: + st X+, r22 + st X+, r23 + st X+, r24 + st X+, r25 + ret + +mov32_to_acc: + movw acc0, r22 + movw acc2, r24 ret /******************************************************************************/ +/* + param q: r28:r29 (Y) + param h: r26:r27 (X) + param m: r30:r31 (Z) +*/ + +f2_1_shift_table: +; .byte 0x2B, 0x64, 0x66, 0x03, 0x51, 0x55, 0x87, 0x55 +; .byte 0x55, 0x87, 0x55, 0x51, 0x03, 0x66, 0x64, 0x2B + .byte 5, -5, -7, 8, -5, 5, -1, 5, -3, 0, 6, -6, -4, 6, -11, 2 +f2_2_shift_table: +; .byte (2<<1), (7<<1), (4<<1), (3<<1), (4<<1)+1, (6<<1)+1, (6<<1) + .byte 8, -6, 6, 4, -3, -4, -7, -2 +expand2_rot_table: + .byte 3,7,13,16,19,23,27 + +f0_hacktable: + .byte 0x03, 0x11, 5*4 + .byte 0xDD, 0xB3, 7*4 + .byte 0x2A, 0x79, 10*4 + .byte 0x07, 0xAA, 13*4 + .byte 0x51, 0xC2, 14*4 -const_lut: - .long 0x55555550, 0x5aaaaaa5, 0x5ffffffa, 0x6555554f - .long 0x6aaaaaa4, 0x6ffffff9, 0x7555554e, 0x7aaaaaa3 - .long 0x7ffffff8, 0x8555554d, 0x8aaaaaa2, 0x8ffffff7 - .long 0x9555554c, 0x9aaaaaa1, 0x9ffffff6, 0xa555554b /******************************************************************************* -* uint32_t addelment(uint8_t j, const uint32_t* m, const uint32_t* h){ +* uint32_t addelment(uint8_t j, const uint32_t *m, const uint32_t *h){ * uint32_t r; * r = pgm_read_dword(k_lut+j); * r += rotl_addel(((uint32_t*)m)[j&0xf], j+0); @@ -416,20 +302,6 @@ m1 = 13 acc0 = 14 acc1 = 15 -add32_to_acc: - add acc0, r22 - adc acc1, r23 - adc acc2, r24 - adc acc3, r25 - ret - -eor32_to_acc: - eor acc0, r22 - eor acc1, r23 - eor acc2, r24 - eor acc3, r25 - ret - load_acc_from_X: ld acc0, X+ ld acc1, X+ @@ -437,22 +309,23 @@ load_acc_from_X: ld acc3, X+ ret -add_acc_to_Z: - ld r0, Z +add_acc_to_X: + ld r0, X add r0, acc0 - st Z+, r0 - ld r0, Z + st X+, r0 + ld r0, X adc r0, acc1 - st Z+, r0 - ld r0, Z + st X+, r0 + ld r0, X adc r0, acc2 - st Z+, r0 - ld r0, Z + st X+, r0 + ld r0, X adc r0, acc3 - st Z+, r0 + st X+, r0 ret load_rotate_add_M: + mov r20, j andi r20, 0x0f mov r0, r20 lsl r0 @@ -460,78 +333,75 @@ load_rotate_add_M: movw r26, m0 add r26, r0 adc r27, r1 - ld r22, X+ - ld r23, X+ - ld r24, X+ - ld r25, X+ + rcall load32_from_X inc r20 rcall rotateleft32 brts 10f - rcall add32_to_acc - ret + rjmp add32_to_acc +; ret 10: sub acc0, r22 sbc acc1, r23 sbc acc2, r24 sbc acc3, r25 ret + +;--- + +/******************************************************************************/ +load_sn_add: + rcall load32_from_X + rcall sn +add32_to_acc: + add acc0, r22 + adc acc1, r23 + adc acc2, r24 + adc acc3, r25 + ret + +/* + param q: r26:r27 + param m: r22:r23 + param h: r20:r21 + param j: r24 +*/ + +expand_intro: + push_range 26, 27 + push r24 addelement: mov j, r24 movw h0, r20 movw m0, r22 - lsl r24 - lsl r24 - mov r28, r24 - ldi r30, lo8(const_lut) - ldi r31, hi8(const_lut) - add r30, r24 - adc r31, r1 - lpm acc0, Z+ - lpm acc1, Z+ - lpm acc2, Z+ - lpm acc3, Z+ + sbiw r26, 4 + rcall load_acc_from_X + ldi r24, 0x55 + add acc0, r24 + adc acc1, r24 + adc acc2, r24 + ldi r24, 5 + adc acc3, r24 + rcall store_acc_to_dec_X + adiw r26, 4 clt - mov r20, j rcall load_rotate_add_M - mov r20, j - subi r20, -3 + subi j, -3 rcall load_rotate_add_M - mov r20, j set - subi r20, -10 + subi j, -7 rcall load_rotate_add_M lsl j lsl j - subi j, -7*4 + subi j, -7*4+10*4 andi j, 0x3f movw r26, h0 add r26, j adc r27, r1 - ld r0, X+ - eor acc0, r0 - ld r0, X+ - eor acc1, r0 - ld r0, X+ - eor acc2, r0 - ld r0, X+ - eor acc3, r0 -;--- - ret - -/******************************************************************************/ -/* - param q: r26:r27 - param m: r22:r23 - param h: r20:r21 - param j: r24 -*/ - -expand_intro: - push_range 20, 27 -; push r24 - rcall addelement -; pop r24 - pop_range 20, 27 + rcall load32_from_X + rcall eor32_to_acc +;-- + pop r24 + pop_range 26, 27 lsl r24 lsl r24 add r26, r24 @@ -541,21 +411,14 @@ expand1: rcall expand_intro ldi r19, 1 10: - rcall load32_from_X mov r20, r19 andi r20, 3 - rcall sn - rcall add32_to_acc + rcall load_sn_add inc r19 cpi r19, 17 brne 10b -expand1_exit: -; adiw r26, 63 - st X+, acc0 - st X+, acc1 - st X+, acc2 - st X+, acc3 - ret + rjmp expand2_exit + /******************************************************************************/ /* @@ -565,8 +428,6 @@ expand1_exit: param j: r24 */ -expand2_rot_table: - .byte 0,3,0,7,0,13,0,16,0,19,0,23,0,27 expand2: rcall expand_intro @@ -575,22 +436,25 @@ expand2: ldi r31, hi8(expand2_rot_table) 10: rcall load32_from_X - mov r20, r19 + sbrs r19, 0 + rjmp 12f lpm r20, Z+ rcall rotateleft32 - rcall add32_to_acc +12: rcall add32_to_acc dec r19 brne 10b - rcall load32_from_X ldi r20, 4 - rcall sn - rcall add32_to_acc - rcall load32_from_X + rcall load_sn_add ldi r20, 5 - rcall sn - rcall add32_to_acc - - rjmp expand1_exit + rcall load_sn_add +expand2_exit: + adiw r26, 4 +store_acc_to_dec_X: + st -X, acc3 + st -X, acc2 + st -X, acc1 + st -X, acc0 + ret /******************************************************************************/ /* @@ -604,43 +468,219 @@ expand2: param h: r20:r21 param j: r24 */ + +/******************************************************************************/ +/* + param q: r24:r25 + param m: r22:r23 + param h: r20:r21 +*/ + +/******************************************************************************/ +/* + param ctx: r24:r25 + param msg: r22:r23 +*/ +/* f0 + param q: r28:r29 (Y) + param h: r26:r27 (X) + param m: r30:r31 (Z) +*/ +/* f1 + param q: r24:r25 + param m: r22:r23 + param h: r20:r21 +*/ +/* f2 + param q: r24:r25 + param m: r22:r23 + param h: r20:r21 +*/ +q0 = 2 +q1 = 3 +h0 = 4 +h1 = 5 +m0 = 6 +m1 = 7 +ctx0 = 2 +ctx1 = 3 +msg0 = 4 +msg1 = 5 + +restore_f1: + movw r26, r2 + movw r22, r4 + movw r20, r6 + ret +bmw_small_nextBlock_early: + movw r24, ctx0 + movw r22, msg0 +.global bmw_small_nextBlock +.global bmw224_nextBlock +.global bmw256_nextBlock +bmw_small_nextBlock: +bmw224_nextBlock: +bmw256_nextBlock: + push_range 2, 7 + push_range 28, 29 + push_range 8, 17 + stack_alloc_large 32*4, r28, r29 + ldi r16, 0x4f + push r16 + ldi r16, 0xff + push r16 + push r16 + ldi r16, 0xfb + push r16 + adiw r28, 1 +; push_range 28, 29 /* push Q */ +; push_range 22, 25 /* push M & H */ + /* increment counter */ + movw r26, r24 + movw r2, r26 + adiw r26, 63 + adiw r26, 1 + rcall load_acc_from_X + ldi r19, 1 + add acc0, r19 + adc acc1, r1 + adc acc2, r1 + adc acc3, r1 + rcall store_acc_to_dec_X + /* call f0 */ + movw r30, r22 + movw r26, r24 +f0: + movw h0, r26 + movw q0, r28 + movw m0, r30 + /* xor m into h */ +; ldi r20, 64 + rcall memxor_64 + movw r30, m0 + movw r26, h0 + + /* set q to zero */ + ldi r22, 64 +10: st Y+, r1 + dec r22 + brne 10b + movw r28, q0 + /* calculate W and store it in Q */ + ldi r19, 5 +30: + ldi r18, 16 + /* load initial index */ + + /* load values from hacktable */ + ldi r30, lo8(f0_hacktable-3) + ldi r31, hi8(f0_hacktable-3) + mov r16, r19 + lsl r16 + add r16, r19 + add r30, r16 + adc r31, r1 + lpm r21, Z+ + lpm r20, Z+ + lpm r16, Z+ +40: + ;call add_hx_to_w +add_hx_to_w: + movw r26, h0 + add r26, r16 + adc r27, r1 + rcall load32_from_Y + sbiw r28, 4 + lsl r20 + rol r21 + brcs 300f + /* addition */ + rcall add_X_to_32 + rjmp 500f +300: /* substract */ + rcall load_acc_from_X + sub r22, acc0 + sbc r23, acc1 + sbc r24, acc2 + sbc r25, acc3 + +500: + rcall store32_to_Y + subi r16, -4 + andi r16, 0x0f<<2 + dec r18 + brne 40b + movw r28, q0 + dec r19 + brne 30b + movw r26, h0 + /* xor m into h */ +; ldi r20, 64 + movw r26, h0 + movw r30, m0 + rcall memxor_64 + sbiw r26, 60 +;--- + clr r17 + ldi r21, 15 + mov r8, r21 +50: + rcall load32_from_Y + sbiw r28, 4 + mov r20, r17 + rcall sn + inc r17 + cpi r17, 5 + brne 52f + clr r17 +52: + rcall add_X_to_32 + rcall store32_to_Y + + dec r8 + brne 50b +;--- + rcall load32_from_Y + clr r20 + rcall sn + movw r26, h0 + rcall add_X_to_32 + sbiw r26, 4 + sbiw r28, 4 + rcall store32_to_Y + sbiw r28, 4 + sbiw r28, 15*4 + movw r20, h0 + movw r22, m0 + + /* call f1*/ + movw r2, r28 f1: - movw r2, r24 movw r4, r22 movw r6, r20 movw r26, r2 -; movw r22, r4 -; movw r20, r6 clr r24 rcall expand1 - movw r26, r2 - movw r22, r4 - movw r20, r6 + rcall restore_f1 ldi r24, 1 rcall expand1 ldi r17, 2 -10: movw r26, r2 - movw r22, r4 - movw r20, r6 +10: rcall restore_f1 mov r24, r17 rcall expand2 inc r17 sbrs r17, 4 rjmp 10b - ret + rcall restore_f1 + movw r24, r2 -/******************************************************************************/ -/* - param q: r24:r25 - param m: r22:r23 - param h: r20:r21 -*/ -f2_1_shift_table: - .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 - .byte 0x2B, 0x64, 0x66, 0x03, 0x51, 0x55, 0x87, 0x55 -f2_2_shift_table: - .byte (2<<1), (7<<1), (4<<1), (3<<1), (4<<1)+1, (6<<1)+1, (6<<1) - .byte 0 ; just for alignment + + /* call f2 */ +; pop_range 20, 25 +; push_range 20, 25 +; rcall printQ +; push r20 +; push r21 acc2 = 8 acc3 = 9 acc0 = 14 @@ -659,39 +699,29 @@ h0 = 18 h1 = 19 f2: movw r26, r24 - /* calc XL */ + /* calc XL & XH */ adiw r26, 63 adiw r26, 1 movw q16_0, r26 - clr xl0 - clr xl1 - clr xl2 - clr xl3 - ldi r17, 8 -10: ld r0, X+ - eor xl0, r0 - ld r0, X+ - eor xl1, r0 - ld r0, X+ - eor xl2, r0 - ld r0, X+ - eor xl3, r0 - dec r17 - brne 10b -;--- /* calc XH */ - movw xh0, xl0 - movw xh2, xl2 - ldi r17, 8 -10: ld r0, X+ - eor xh0, r0 - ld r0, X+ - eor xh1, r0 - ld r0, X+ - eor xh2, r0 - ld r0, X+ - eor xh3, r0 + movw h0, r20 +;--- +; push h0 +; push h1 +;--- + movw r28, r22 + rcall load_acc_from_X + ldi r17, 15 +10: rcall load32_from_X + rcall eor32_to_acc + cpi r17, 9 + brne 15f + movw xl0, acc0 + movw xl2, acc2 +15: dec r17 brne 10b + movw xh0, acc0 + movw xh2, acc2 ;--- DBG ; push_range 22, 25 ; movw r22, xl0 @@ -702,220 +732,114 @@ f2: ; rcall print32 ; pop_range 22, 25 ;--- END DBG - + /* copy m(Y) into h */ + movw r26, h0 + ldi r22, 64 +10: + ld r23, Y+ + st X+, r23 + dec r22 + brne 10b ;--- /* calc first half of h0..h15 */ - movw h0, r20 - movw r28, r22 - movw r26, q16_0 - ldi r17, 16 + movw r28, q16_0 + movw r26, h0 + ldi r30, lo8(f2_1_shift_table) + ldi r31, hi8(f2_1_shift_table) + ldi r17, 15 10: - ld acc0, Y+ - ld acc1, Y+ - ld acc2, Y+ - ld acc3, Y+ ;--- - ldi r30, lo8(f2_1_shift_table-1) - ldi r31, hi8(f2_1_shift_table-1) movw r22, xh0 movw r24, xh2 - add r30, r17 - adc r31, r1 - lpm r20, Z - mov r1, r20 - andi r20, 0x0f - clt - cpi r17, 16 - breq 20f - cpi r17, 11 - brne 21f -20: set -21: brts 25f - rcall shiftright32 - rjmp 26f -25: rcall shiftleft32 -26: rcall eor32_to_acc + lpm r20, Z+ + sbrc r17, 3 + rcall shiftleft32 + rcall mov32_to_acc ;--- - rcall load32_from_X - mov r20, r1 - clr r1 - swap r20 - andi r20, 0x0f - brts 27f + rcall load32_from_Y + lpm r20, Z+ + sbrc r17, 3 rcall shiftleft32 - rjmp 28f -27: rcall shiftright32 -28: rcall eor32_to_acc + rcall eor32_to_acc ;--- - movw r30, h0 - st Z+, acc0 - st Z+, acc1 - st Z+, acc2 - st Z+, acc3 - movw h0, r30 + rcall load32_from_X + rcall eor32_to_acc + rcall store_acc_to_dec_X + adiw r26, 4 ;--- dec r17 - brne 10b + brpl 10b ;----- - sbiw r26, 4*8 /* X points to q[24] */ - movw r28, r26 + sbiw r28, 4*8 /* Y points to q[24] */ + movw r30, r28 sbiw r28, 63 sbiw r28, 33 /* Y points to q[0] */ - sbiw r30, 63 - sbiw r30, 1 /* Z points to h0 */ - ldi r17, 8 -10: movw acc0, xl0 - movw acc2, xl2 - rcall load32_from_X - rcall eor32_to_acc - rcall load32_from_Y - rcall eor32_to_acc - rcall add_acc_to_Z - dec r17 - brne 10b - sbiw r26, 9*4 /* X points to q[23] */ - rcall load_acc_from_X - eor acc1, xl0 - eor acc2, xl1 - eor acc3, xl2 - rcall load32_from_Y - rcall eor32_to_acc - rcall add_acc_to_Z -;--- - sbiw r26, 8*4 /* X points to q[16] */ - mov h0, r30 - ldi r17, 7 -10: - ldi r30, lo8(f2_2_shift_table-1) - ldi r31, hi8(f2_2_shift_table-1) - add r30, r17 - adc r31, r1 - lpm r20, Z - rcall load_acc_from_X - movw r22, xl0 + movw r26, r28 + ldi r20, 8*4 + /* xor q[24..31] into q[0..7] */ + rcall memxor + /* xor q[23] into q[8] */ + sbiw r30, 9*4 + ldi r20, 4 + rcall memxor + /* xor q[16..22] into q[9..15] */ + sbiw r30, 8*4 + ldi r20, 7*4 + rcall memxor + + movw r26, h0 + ldi r17, 15 + ldi r30, lo8(f2_2_shift_table-8) + ldi r31, hi8(f2_2_shift_table-8) +10: movw r22, xl0 movw r24, xl2 - lsr r20 - brcc 20f + lpm r20, Z+ + sbrs r17, 3 rcall shiftleft32 - rjmp 21f -20: rcall shiftright32 -21: - rcall eor32_to_acc + rcall mov32_to_acc rcall load32_from_Y rcall eor32_to_acc - movw r30, h0 - rcall add_acc_to_Z - movw h0, r30 + rcall add_acc_to_X dec r17 - brne 10b + brpl 10b ;----- - sbiw r30, 8*4 /* Z points to h8 */ - movw r26, r30 - sbiw r26, 4*4 /* X points to h4 */ + sbiw r26, 8*4 /* X points to h8 */ + movw r28, r26 + sbiw r28, 4*4 /* Y points to h4 */ ldi r17, 8 ldi r18, 9 10: - rcall load32_from_X + rcall load32_from_Y mov r20, r18 rcall rotateleft32 - movw acc0, r22 - movw acc2, r24 - rcall add_acc_to_Z + rcall mov32_to_acc + rcall add_acc_to_X inc r18 cpi r17, 5 - breq 20f - dec r17 + brne 20f + sbiw r28, 8*4 +20: dec r17 brne 10b - ret -20: sbiw r26, 8*4 - dec r17 - rjmp 10b -/******************************************************************************/ -/* - param ctx: r24:r25 - param msg: r22:r23 -*/ -/* f0 - param q: r28:r29 (Y) - param h: r26:r27 (X) - param m: r30:r31 (Z) -*/ -/* f1 - param q: r24:r25 - param m: r22:r23 - param h: r20:r21 -*/ -/* f2 - param q: r24:r25 - param m: r22:r23 - param h: r20:r21 -*/ -.global bmw_small_nextBlock -.global bmw224_nextBlock -.global bmw256_nextBlock -bmw_small_nextBlock: -bmw224_nextBlock: -bmw256_nextBlock: - push_range 28, 29 - push_range 2, 17 - stack_alloc_large 32*4, r28, r29 - adiw r28, 1 - push_range 28, 29 /* push Q */ - push_range 22, 25 /* push M & H */ - /* increment counter */ - movw r26, r24 - movw r2, r26 - adiw r26, 63 - adiw r26, 1 - rcall load_acc_from_X - ldi r19, 1 - add acc0, r19 - adc acc1, r1 - adc acc2, r1 - adc acc3, r1 - st -X, acc3 - st -X, acc2 - st -X, acc1 - st -X, acc0 - /* call f0 */ - movw r30, r22 - movw r26, r24 - rcall f0 - /* call f1*/ - pop r21 - pop r20 - pop r23 - pop r22 - pop r25 - pop r24 -; rcall printQ - push_range 20, 25 - rcall f1 - /* call f2 */ -; pop_range 20, 25 -; push_range 20, 25 -; rcall printQ - pop_range 20, 25 -; push r20 -; push r21 - call f2 +exit: ;--- DBG ; pop r25 ; pop r24 ; ldi r22, 'H' ; rcall printX ;--- END DBG - stack_free_large3 32*4 - pop_range 2, 17 + stack_free_large3 32*4+4 + pop_range 10, 17 +pop9: + pop_range 8, 9 +pop28: pop_range 28, 29 +pop7: + pop_range 6, 7 +pop5: + pop_range 2, 5 ret /******************************************************************************/ -/* - param ctx: r24:r25 - param msg: r22:r23 - param len: r20:r21 -*/ ctx0 = 2 ctx1 = 3 blc0 = 4 @@ -925,6 +849,22 @@ len1 = 29 buf0 = 6 buf1 = 7 +load32_from_Z_stub: + movw r30, ctx0 + adiw r30, 60 + ldd r21, Z+4 + ldd r22, Z+5 + ldd r23, Z+6 + ldd r24, Z+7 + ret + +/******************************************************************************/ +/* + param ctx: r24:r25 + param msg: r22:r23 + param len: r20:r21 +*/ + .global bmw_small_lastBlock .global bmw224_lastBlock .global bmw256_lastBlock @@ -945,9 +885,7 @@ bmw256_lastBlock: 1: cpi len1, hi8(512) brlo 2f - movw r24, ctx0 - movw r22, blc0 - rcall bmw_small_nextBlock + rcall bmw_small_nextBlock_early ldi r24, 64 add blc0, r24 adc blc1, r1 @@ -966,15 +904,14 @@ bmw256_lastBlock: memcpy(pctx.buffer, block, (length_b+7)/8); pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07); */ movw r24, len0 + ldi r23, 63 + movw r26, blc0 lsr r25 ror r24 lsr r24 lsr r24 - ldi r23, 63 - sub r23, r24 - movw r26, blc0 - tst r24 breq 301f + sub r23, r24 /* copy (#r24) bytes to stack buffer */ 30: ld r20, X+ st Z+, r20 @@ -1010,21 +947,15 @@ bmw256_lastBlock: breq 400f cpi len0, 192 brlo 400f - movw r24, ctx0 - movw r22, buf0 - rcall bmw_small_nextBlock + movw blc0, buf0 + rcall bmw_small_nextBlock_early movw r26, buf0 ldi r20, 64-8 350: st X+, r1 dec r20 brne 350b - movw r30, ctx0 - adiw r30, 60 - ldd r21, Z+4 - ldd r22, Z+5 - ldd r23, Z+6 - ldd r24, Z+7 + rcall load32_from_Z_stub subi r21, 1 sbc r22, r1 sbc r23, r1 @@ -1034,60 +965,48 @@ bmw256_lastBlock: bmw_small_nextBlock(ctx, pctx.buffer); */ 400: - movw r30, ctx0 - adiw r30, 60 - ldd r21, Z+4 - ldd r22, Z+5 - ldd r23, Z+6 - ldd r24, Z+7 + rcall load32_from_Z_stub 410: clr r25 + ldi r20, 1 lsl r21 - rol r22 - rol r23 - rol r24 - rol r25 + rcall rol32 mov r20, len0 add r21, len1 adc r22, r1 adc r23, r1 adc r24, r1 adc r25, r1 - movw r30, buf0 - adiw r30, 64-8 - st Z+, r20 - st Z+, r21 - st Z+, r22 - st Z+, r23 - st Z+, r24 - st Z+, r25 - st Z+, r1 - st Z+, r1 - movw r24, ctx0 - movw r22, buf0 - rcall bmw_small_nextBlock + movw r26, buf0 + adiw r26, 64-8 + st X+, r20 + st X+, r21 + rcall store32_to_X + st X+, r1 + st X+, r1 + movw blc0, buf0 + rcall bmw_small_nextBlock_early /* memset(pctx.buffer, 0xaa, 64); for(i=0; i<16;++i){ pctx.buffer[i*4] = i+0xa0; } */ - ldi r18, 0xa0 - ldi r19, 0xaa + ldi r22, 0xa0 + ldi r23, 0xaa + ldi r24, 0xaa + ldi r25, 0xaa movw r26, buf0 500: - st X+, r18 - st X+, r19 - st X+, r19 - st X+, r19 - inc r18 - sbrs r18, 4 + rcall store32_to_X + inc r22 + sbrs r22, 4 rjmp 500b /* bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h); memcpy(ctx->h, pctx.buffer, 64); */ - movw r24, buf0 - movw r22, ctx0 - rcall bmw_small_nextBlock + movw r24, buf0 + movw r22, ctx0 + rcall bmw_small_nextBlock ldi r18, 64 movw r26, ctx0 movw r30, buf0 @@ -1098,13 +1017,11 @@ bmw256_lastBlock: brne 600b stack_free_large 68 - pop_range 28, 29 - pop_range 2, 7 - ret + rjmp pop28 /******************************************************************************* -* void bmw224_ctx2hash(void* dest, const bmw224_ctx_t* ctx){ +* void bmw224_ctx2hash(void *dest, const bmw224_ctx_t *ctx){ * memcpy(dest, &(ctx->h[9]), 224/8); * } * @@ -1113,14 +1030,13 @@ bmw256_lastBlock: */ .global bmw224_ctx2hash bmw224_ctx2hash: - movw r26, r24 movw r30, r22 adiw r30, 9*4 - ldi r22, 28 + ldi r18, 28 rjmp 1f /******************************************************************************* -* void bmw256_ctx2hash(void* dest, const bmw256_ctx_t* ctx){ +* void bmw256_ctx2hash(void *dest, const bmw256_ctx_t *ctx){ * memcpy(dest, &(ctx->h[8]), 256/8); * } * @@ -1129,19 +1045,18 @@ bmw224_ctx2hash: */ .global bmw256_ctx2hash bmw256_ctx2hash: - movw r26, r24 movw r30, r22 adiw r30, 8*4 - ldi r22, 32 -1: - ld r23, Z+ + ldi r18, 32 +1: movw r26, r24 +1: ld r23, Z+ st X+, r23 - dec r22 + dec r18 brne 1b ret /******************************************************************************* -* void bmw256(void* dest, const void* msg, uint32_t length_b){ +* void bmw256(void *dest, const void *msg, uint32_t length_b){ * bmw_small_ctx_t ctx; * bmw256_init(&ctx); * while(length_b>=BMW_SMALL_BLOCKSIZE){ @@ -1169,12 +1084,12 @@ dst0 = 10 dst1 = 11 .global bmw256 bmw256: - push r16 - ldi r16, 1 + set rjmp bmw_small_all + /******************************************************************************* -* void bmw224(void* dest, const void* msg, uint32_t length_b){ +* void bmw224(void *dest, const void *msg, uint32_t length_b){ * bmw_small_ctx_t ctx; * bmw224_init(&ctx); * while(length_b>=BMW_SMALL_BLOCKSIZE){ @@ -1194,22 +1109,28 @@ ctx0 = 2 ctx1 = 3 msg0 = 4 msg1 = 5 -len0 = 6 -len1 = 7 +len0 = 28 +len1 = 29 len2 = 8 len3 = 9 -dst0 = 10 -dst1 = 11 +dst0 = 6 +dst1 = 7 .global bmw224 bmw224: - push r16 - clr r16 + clt + bmw_small_all: - push_range 2, 11 + push_range 2, 7 + push_range 28, 29 + push_range 8, 9 + push r16 stack_alloc_large 64+4 adiw r30, 1 - movw ctx0, r30 + clr r16 + brtc 10f + inc r16 +10: movw ctx0, r30 movw dst0, r24 movw msg0, r22 movw len0, r18 @@ -1224,11 +1145,8 @@ bmw_small_all: mov r18, len2 or r18, len3 breq 50f - movw r24, ctx0 - movw r22, msg0 - rcall bmw_small_nextBlock - ldi r20, 2 - sub len1, r20 + rcall bmw_small_nextBlock_early + subi len1, 2 sbc len2, r1 sbc len3, r1 ldi r20, 64 @@ -1248,9 +1166,8 @@ bmw_small_all: adc r31, r1 icall stack_free_large 64+4 - pop_range 2, 11 pop r16 - ret + rjmp pop9 init_lut: rjmp bmw224_init @@ -1260,7 +1177,7 @@ c2h_lut: rjmp bmw256_ctx2hash /******************************************************************************* -* void bmw224_init(bmw224_ctx_t* ctx){ +* void bmw224_init(bmw224_ctx_t *ctx){ * uint8_t i; * ctx->h[0] = 0x00010203; * for(i=1; i<16; ++i){ @@ -1273,42 +1190,30 @@ c2h_lut: */ .global bmw224_init bmw224_init: - movw r26, r24 - ldi r22, 0x03 - ldi r23, 0x02 - ldi r24, 0x01 - ldi r25, 0x00 + ldi r22, 0x00 + ldi r23, 0x40 bmw_small_init: - st X+, r22 - st X+, r23 - st X+, r24 - st X+, r25 - ldi r18, 16-1 - ldi r20, 0x04 -1: - add r22, r20 - adc r23, r20 - adc r24, r20 - adc r25, r20 - st X+, r22 - st X+, r23 - st X+, r24 - st X+, r25 - dec r18 - brne 1b - st X+, r1 - st X+, r1 - st X+, r1 - st X+, r1 + movw r26, r24 + adiw r26, 4 +10: + st -X, r22 + inc r22 + mov r20, r22 + andi r20, 0x3 + brne 10b + adiw r26, 8 +20: cp r22, r23 + brne 10b + st -X, r1 + st -X, r1 + st -X, r1 + st -X, r1 ret .global bmw256_init bmw256_init: - movw r26, r24 - ldi r22, 0x43 - ldi r23, 0x42 - ldi r24, 0x41 - ldi r25, 0x40 + ldi r22, 0x40 + ldi r23, 0x80 rjmp bmw_small_init