X-Git-Url: https://git.cryptolib.org/?a=blobdiff_plain;f=bmw%2Fbmw_small-tinyasm.S;h=04dea66eaf9de3b725bc28ecb686b30f9cc0ec5f;hb=e9e07569721b9e005d6b602e26a03e930e796577;hp=50e110d82d9635c14a735ee176cc30d6e919b92f;hpb=0747bb9f3d1759c0b71a0cff3387835db9833d8e;p=avr-crypto-lib.git diff --git a/bmw/bmw_small-tinyasm.S b/bmw/bmw_small-tinyasm.S index 50e110d..04dea66 100644 --- a/bmw/bmw_small-tinyasm.S +++ b/bmw/bmw_small-tinyasm.S @@ -1,7 +1,7 @@ /* bmw_small-tinyasm.S */ /* This file is part of the AVR-Crypto-Lib. - Copyright (C) 2009 Daniel Otte (daniel.otte@rub.de) + Copyright (C) 2006-2015 Daniel Otte (bg@nerilex.org) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -33,12 +33,19 @@ acc3 = 9 acc0 = 14 acc1 = 15 +#define DEBUG 0 + /******************************************************************************/ /* param a: r22:r23:r24:r25 param s: r20 */ shiftleft32: + tst r20 + brpl 10f + neg r20 + rjmp shiftright32 +10: clr r0 cpi r20, 8 brlo bitrotateleft_1 @@ -47,7 +54,7 @@ shiftleft32: mov r23, r22 clr r22 subi r20, 8 - rjmp shiftleft32 + rjmp 10b /******************************************************************************/ /* @@ -178,10 +185,11 @@ sn: param src: r30:r31 (Z) param len: r20 */ -memxor_short: +memxor_64: ; tst r20 ; breq memxor_exit ldi r20, 64 +memxor: 10: ld r21, X ld r22, Z+ eor r21, r22 @@ -245,11 +253,6 @@ mov32_to_acc: movw acc2, r24 ret -eor_acc_from_Y_add_to_Z: - rcall load32_from_Y - rcall eor32_to_acc - rjmp add_acc_to_Z - /******************************************************************************/ /* param q: r28:r29 (Y) @@ -257,17 +260,26 @@ eor_acc_from_Y_add_to_Z: param m: r30:r31 (Z) */ +f2_1_shift_table: +; .byte 0x2B, 0x64, 0x66, 0x03, 0x51, 0x55, 0x87, 0x55 +; .byte 0x55, 0x87, 0x55, 0x51, 0x03, 0x66, 0x64, 0x2B + .byte 5, -5, -7, 8, -5, 5, -1, 5, -3, 0, 6, -6, -4, 6, -11, 2 +f2_2_shift_table: +; .byte (2<<1), (7<<1), (4<<1), (3<<1), (4<<1)+1, (6<<1)+1, (6<<1) + .byte 8, -6, 6, 4, -3, -4, -7, -2 +expand2_rot_table: + .byte 3,7,13,16,19,23,27 + f0_hacktable: .byte 0x03, 0x11, 5*4 .byte 0xDD, 0xB3, 7*4 .byte 0x2A, 0x79, 10*4 .byte 0x07, 0xAA, 13*4 .byte 0x51, 0xC2, 14*4 - .byte 0 ; just for alignment /******************************************************************************* -* uint32_t addelment(uint8_t j, const uint32_t* m, const uint32_t* h){ +* uint32_t addelment(uint8_t j, const uint32_t *m, const uint32_t *h){ * uint32_t r; * r = pgm_read_dword(k_lut+j); * r += rotl_addel(((uint32_t*)m)[j&0xf], j+0); @@ -297,19 +309,19 @@ load_acc_from_X: ld acc3, X+ ret -add_acc_to_Z: - ld r0, Z +add_acc_to_X: + ld r0, X add r0, acc0 - st Z+, r0 - ld r0, Z + st X+, r0 + ld r0, X adc r0, acc1 - st Z+, r0 - ld r0, Z + st X+, r0 + ld r0, X adc r0, acc2 - st Z+, r0 - ld r0, Z + st X+, r0 + ld r0, X adc r0, acc3 - st Z+, r0 + st X+, r0 ret load_rotate_add_M: @@ -416,14 +428,6 @@ expand1: param j: r24 */ -f2_1_shift_table: - .byte 0x2B, 0x64, 0x66, 0x03, 0x51, 0x55, 0x87, 0x55 -f2_2_shift_table: - .byte (2<<1), (7<<1), (4<<1), (3<<1), (4<<1)+1, (6<<1)+1, (6<<1) - -expand2_rot_table: - .byte 3,7,13,16,19,23,27 -; .byte 0 ; just for alignment expand2: rcall expand_intro @@ -552,7 +556,7 @@ f0: movw m0, r30 /* xor m into h */ ; ldi r20, 64 - rcall memxor_short + rcall memxor_64 movw r30, m0 movw r26, h0 @@ -614,7 +618,7 @@ add_hx_to_w: ; ldi r20, 64 movw r26, h0 movw r30, m0 - rcall memxor_short + rcall memxor_64 sbiw r26, 60 ;--- clr r17 @@ -695,14 +699,17 @@ h0 = 18 h1 = 19 f2: movw r26, r24 - /* calc XL */ + /* calc XL & XH */ adiw r26, 63 adiw r26, 1 movw q16_0, r26 movw h0, r20 +;--- +; push h0 +; push h1 +;--- movw r28, r22 - rcall load32_from_X - rcall mov32_to_acc + rcall load_acc_from_X ldi r17, 15 10: rcall load32_from_X rcall eor32_to_acc @@ -725,124 +732,95 @@ f2: ; rcall print32 ; pop_range 22, 25 ;--- END DBG - + /* copy m(Y) into h */ + movw r26, h0 + ldi r22, 64 +10: + ld r23, Y+ + st X+, r23 + dec r22 + brne 10b ;--- /* calc first half of h0..h15 */ - movw r26, q16_0 - ldi r17, 16 + movw r28, q16_0 + movw r26, h0 + ldi r30, lo8(f2_1_shift_table) + ldi r31, hi8(f2_1_shift_table) + ldi r17, 15 10: - rcall load32_from_Y - rcall mov32_to_acc ;--- movw r22, xh0 movw r24, xh2 - cpi r17, 9 - brge 15f - clr r1 - rjmp 26f -15: ldi r30, lo8(f2_1_shift_table-9) - ldi r31, hi8(f2_1_shift_table-9) - add r30, r17 - adc r31, r1 - lpm r20, Z - mov r1, r20 - andi r20, 0x0f - clt - cpi r17, 16 - breq 20f - cpi r17, 11 - brne 21f -20: set -21: brts 25f - rcall shiftright32 - rjmp 26f -25: rcall shiftleft32 -26: rcall eor32_to_acc + lpm r20, Z+ + sbrc r17, 3 + rcall shiftleft32 + rcall mov32_to_acc ;--- - rcall load32_from_X - mov r20, r1 - clr r1 - swap r20 - andi r20, 0x0f - brts 27f + rcall load32_from_Y + lpm r20, Z+ + sbrc r17, 3 rcall shiftleft32 - rjmp 28f -27: rcall shiftright32 -28: rcall eor32_to_acc + rcall eor32_to_acc ;--- - movw r30, h0 - st Z+, acc0 - st Z+, acc1 - st Z+, acc2 - st Z+, acc3 - movw h0, r30 + rcall load32_from_X + rcall eor32_to_acc + rcall store_acc_to_dec_X + adiw r26, 4 ;--- dec r17 - brne 10b + brpl 10b ;----- - sbiw r26, 4*8 /* X points to q[24] */ - movw r28, r26 + sbiw r28, 4*8 /* Y points to q[24] */ + movw r30, r28 sbiw r28, 63 sbiw r28, 33 /* Y points to q[0] */ - sbiw r30, 63 - sbiw r30, 1 /* Z points to h0 */ - ldi r17, 8 -10: movw acc0, xl0 - movw acc2, xl2 - rcall load32_from_X - rcall eor32_to_acc - rcall eor_acc_from_Y_add_to_Z - dec r17 - brne 10b - sbiw r26, 9*4 /* X points to q[23] */ - rcall load_acc_from_X - eor acc1, xl0 - eor acc2, xl1 - eor acc3, xl2 - rcall eor_acc_from_Y_add_to_Z -;--- - sbiw r26, 8*4 /* X points to q[16] */ - mov h0, r30 - ldi r17, 7 -10: - ldi r30, lo8(f2_2_shift_table-1) - ldi r31, hi8(f2_2_shift_table-1) - add r30, r17 - adc r31, r1 - lpm r20, Z - rcall load_acc_from_X - movw r22, xl0 + movw r26, r28 + ldi r20, 8*4 + /* xor q[24..31] into q[0..7] */ + rcall memxor + /* xor q[23] into q[8] */ + sbiw r30, 9*4 + ldi r20, 4 + rcall memxor + /* xor q[16..22] into q[9..15] */ + sbiw r30, 8*4 + ldi r20, 7*4 + rcall memxor + + movw r26, h0 + ldi r17, 15 + ldi r30, lo8(f2_2_shift_table-8) + ldi r31, hi8(f2_2_shift_table-8) +10: movw r22, xl0 movw r24, xl2 - lsr r20 - brcc 20f + lpm r20, Z+ + sbrs r17, 3 rcall shiftleft32 - rjmp 21f -20: rcall shiftright32 -21: - movw r30, h0 + rcall mov32_to_acc + rcall load32_from_Y rcall eor32_to_acc - rcall eor_acc_from_Y_add_to_Z - movw h0, r30 + rcall add_acc_to_X dec r17 - brne 10b + brpl 10b ;----- - sbiw r30, 8*4 /* Z points to h8 */ - movw r26, r30 - sbiw r26, 4*4 /* X points to h4 */ + sbiw r26, 8*4 /* X points to h8 */ + movw r28, r26 + sbiw r28, 4*4 /* Y points to h4 */ ldi r17, 8 ldi r18, 9 10: - rcall load32_from_X + rcall load32_from_Y mov r20, r18 rcall rotateleft32 rcall mov32_to_acc - rcall add_acc_to_Z + rcall add_acc_to_X inc r18 cpi r17, 5 brne 20f - sbiw r26, 8*4 + sbiw r28, 8*4 20: dec r17 brne 10b +exit: ;--- DBG ; pop r25 ; pop r24 @@ -1043,7 +1021,7 @@ bmw256_lastBlock: /******************************************************************************* -* void bmw224_ctx2hash(void* dest, const bmw224_ctx_t* ctx){ +* void bmw224_ctx2hash(void *dest, const bmw224_ctx_t *ctx){ * memcpy(dest, &(ctx->h[9]), 224/8); * } * @@ -1058,7 +1036,7 @@ bmw224_ctx2hash: rjmp 1f /******************************************************************************* -* void bmw256_ctx2hash(void* dest, const bmw256_ctx_t* ctx){ +* void bmw256_ctx2hash(void *dest, const bmw256_ctx_t *ctx){ * memcpy(dest, &(ctx->h[8]), 256/8); * } * @@ -1078,7 +1056,7 @@ bmw256_ctx2hash: ret /******************************************************************************* -* void bmw256(void* dest, const void* msg, uint32_t length_b){ +* void bmw256(void *dest, const void *msg, uint32_t length_b){ * bmw_small_ctx_t ctx; * bmw256_init(&ctx); * while(length_b>=BMW_SMALL_BLOCKSIZE){ @@ -1111,7 +1089,7 @@ bmw256: /******************************************************************************* -* void bmw224(void* dest, const void* msg, uint32_t length_b){ +* void bmw224(void *dest, const void *msg, uint32_t length_b){ * bmw_small_ctx_t ctx; * bmw224_init(&ctx); * while(length_b>=BMW_SMALL_BLOCKSIZE){ @@ -1140,7 +1118,6 @@ dst1 = 7 .global bmw224 bmw224: clt - rjmp bmw_small_all bmw_small_all: @@ -1200,7 +1177,7 @@ c2h_lut: rjmp bmw256_ctx2hash /******************************************************************************* -* void bmw224_init(bmw224_ctx_t* ctx){ +* void bmw224_init(bmw224_ctx_t *ctx){ * uint8_t i; * ctx->h[0] = 0x00010203; * for(i=1; i<16; ++i){