From 709c6948180356089680fd243927b22133ba64d0 Mon Sep 17 00:00:00 2001 From: bg Date: Wed, 14 Apr 2010 10:38:26 +0000 Subject: [PATCH] further shrinked bmw tiny to 1778 bytes --- bmw/bmw_large.c | 28 ++++- bmw/bmw_small-tinyasm.S | 265 ++++++++++++++++++++-------------------- 2 files changed, 156 insertions(+), 137 deletions(-) diff --git a/bmw/bmw_large.c b/bmw/bmw_large.c index 5b0039a..719b585 100644 --- a/bmw/bmw_large.c +++ b/bmw/bmw_large.c @@ -30,10 +30,10 @@ #include #include "bmw_large.h" -#define SHL64(a,n) ((a)<<(n)) -#define SHR64(a,n) ((a)>>(n)) -#define ROTL64(a,n) (((a)<<(n))|((a)>>(64-(n)))) -#define ROTR64(a,n) (((a)>>(n))|((a)<<(64-(n)))) +#define SHL64(a,n) shiftl64(a,n) +#define SHR64(a,n) shiftr64(a,n) +#define ROTL64(a,n) rotl64(a,n) +#define ROTR64(a,n) rotr64(a,n) #define TWEAK 1 #define BUG24 0 @@ -75,6 +75,26 @@ #define dump_x(a,b,c) #endif +static +uint64_t rotl64(uint64_t a, uint8_t r){ + return (a<<r)|(a>>(64-r)); +} + +static +uint64_t rotr64(uint64_t a, uint8_t r){ + return (a>>r)|(a<<(64-r)); +} + +static +uint64_t shiftl64(uint64_t a, uint8_t r){ + return (a<<r); +} + +static +uint64_t shiftr64(uint64_t a, uint8_t r){ + return (a>>r); +} + static uint64_t bmw_large_s0(uint64_t x){ uint64_t r; diff --git a/bmw/bmw_small-tinyasm.S b/bmw/bmw_small-tinyasm.S index 801331f..fca301a 100644 --- a/bmw/bmw_small-tinyasm.S +++ b/bmw/bmw_small-tinyasm.S @@ -109,13 +109,22 @@ s2: .byte 2, 1,12,25 s3: .byte 2, 2,15,29 s4: .byte 1, 0, 0, 0 s5: .byte 2, 0, 0, 0 - -eor_r22_in_r16: - eor r16, r22 - eor r17, r23 - eor r18, r24 - eor r19, r25 - ret +/* +s0: .byte 0x34, 19 +s1: .byte 0x28, 23 +s2: .byte 0x9C, 25 +s3: .byte 0xAF, 29 +s4: .byte 0x00, 0 +s5: .byte 0x80, 0 +*/ +acc2 = 8 +acc3 = 9 +h0 = 10 +h1 = 11 +m0 = 12 +m1 = 13 +acc0 = 14 +acc1 = 15 /* param x: r22:r23:r24:25 @@ -123,8 +132,10 @@ eor_r22_in_r16: */ sn: push_range 2, 5 - push r17 - push r19 + push acc0 + push acc1 + push acc2 + push acc3 ldi r30, lo8(s_table) ldi r31, hi8(s_table) lsl r20 @@ -135,30 +146,32 @@ sn: movw r4, r24 lpm r20, Z+ rcall shiftright32 - movw 
r16, r22 - movw r18, r24 + movw acc0, r22 + movw acc2, r24 ;--- movw r22, r2 movw r24, r4 lpm r20, Z+ rcall shiftleft32 - rcall eor_r22_in_r16 + rcall eor32_to_acc ;--- movw r22, r2 movw r24, r4 lpm r20, Z+ rcall rotateleft32 - rcall eor_r22_in_r16 + rcall eor32_to_acc ;--- movw r22, r2 movw r24, r4 lpm r20, Z+ rcall rotateleft32 - rcall eor_r22_in_r16 - movw r22, r16 - movw r24, r18 - pop r19 - pop r17 + rcall eor32_to_acc + movw r22, acc0 + movw r24, acc2 + pop acc3 + pop acc2 + pop acc1 + pop acc0 pop_range 2, 5 ret @@ -222,6 +235,21 @@ add_X_to_32: ld r0, X+ adc r25, r0 ret + +store_acc_to_dec_X: + st -X, acc3 + st -X, acc2 + st -X, acc1 + st -X, acc0 + ret + +store32_to_X: + st X+, r22 + st X+, r23 + st X+, r24 + st X+, r25 + ret + /******************************************************************************/ /* param q: r28:r29 (Y) @@ -230,28 +258,13 @@ add_X_to_32: */ f0_hacktable: - .byte 0x03, 0x11 - .byte 0xDD, 0xB3 - .byte 0x2A, 0x79 - .byte 0x07, 0xAA - .byte 0x51, 0xC2 -f0_indextable: - .byte 5*4,7*4,10*4,13*4,14*4 -; .byte 0 ; just for alignment -f0_s_table: - .byte 0,1,2,3,4 - .byte 0,1,2,3,4 - .byte 0,1,2,3,4 -; .byte 0 - - -/******************************************************************************/ + .byte 0x03, 0x11, 5*4 + .byte 0xDD, 0xB3, 7*4 + .byte 0x2A, 0x79, 10*4 + .byte 0x07, 0xAA, 13*4 + .byte 0x51, 0xC2, 14*4 + .byte 0 ; just for alignment -const_lut: - .long 0x55555550, 0x5aaaaaa5, 0x5ffffffa, 0x6555554f - .long 0x6aaaaaa4, 0x6ffffff9, 0x7555554e, 0x7aaaaaa3 - .long 0x7ffffff8, 0x8555554d, 0x8aaaaaa2, 0x8ffffff7 - .long 0x9555554c, 0x9aaaaaa1, 0x9ffffff6, 0xa555554b /******************************************************************************* * uint32_t addelment(uint8_t j, const uint32_t* m, const uint32_t* h){ @@ -337,17 +350,16 @@ addelement: mov j, r24 movw h0, r20 movw m0, r22 - lsl r24 - lsl r24 - mov r28, r24 - ldi r30, lo8(const_lut) - ldi r31, hi8(const_lut) - add r30, r24 - adc r31, r1 - lpm acc0, Z+ - lpm acc1, Z+ - 
lpm acc2, Z+ - lpm acc3, Z+ + sbiw r26, 4 + rcall load_acc_from_X + ldi r24, 0x55 + add acc0, r24 + adc acc1, r24 + adc acc2, r24 + ldi r24, 5 + adc acc3, r24 + rcall store_acc_to_dec_X + adiw r26, 4 clt mov r20, j rcall load_rotate_add_M @@ -365,18 +377,18 @@ addelement: movw r26, h0 add r26, j adc r27, r1 - ld r0, X+ - eor acc0, r0 - ld r0, X+ - eor acc1, r0 - ld r0, X+ - eor acc2, r0 - ld r0, X+ - eor acc3, r0 + rcall load32_from_X + rcall eor32_to_acc ;--- ret /******************************************************************************/ +load_sn_add: + rcall load32_from_X + rcall sn + rcall add32_to_acc + ret + /* param q: r26:r27 param m: r22:r23 @@ -399,11 +411,9 @@ expand1: rcall expand_intro ldi r19, 1 10: - rcall load32_from_X mov r20, r19 andi r20, 3 - rcall sn - rcall add32_to_acc + rcall load_sn_add inc r19 cpi r19, 17 brne 10b @@ -418,8 +428,14 @@ expand1: param j: r24 */ +f2_1_shift_table: + .byte 0x2B, 0x64, 0x66, 0x03, 0x51, 0x55, 0x87, 0x55 +f2_2_shift_table: + .byte (2<<1), (7<<1), (4<<1), (3<<1), (4<<1)+1, (6<<1)+1, (6<<1) + expand2_rot_table: - .byte 3,7,13,16,19,23,27,0 + .byte 3,7,13,16,19,23,27 +; .byte 0 ; just for alignment expand2: rcall expand_intro @@ -436,18 +452,12 @@ expand2: dec r19 brne 10b ldi r20, 4 - rcall load32_from_X - rcall sn - rcall add32_to_acc + rcall load_sn_add ldi r20, 5 - rcall load32_from_X - rcall sn - rcall add32_to_acc + rcall load_sn_add expand2_exit: - st X+, acc0 - st X+, acc1 - st X+, acc2 - st X+, acc3 + adiw r26, 4 + rcall store_acc_to_dec_X ret /******************************************************************************/ @@ -469,12 +479,6 @@ expand2_exit: param m: r22:r23 param h: r20:r21 */ -f2_1_shift_table: - .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 - .byte 0x2B, 0x64, 0x66, 0x03, 0x51, 0x55, 0x87, 0x55 -f2_2_shift_table: - .byte (2<<1), (7<<1), (4<<1), (3<<1), (4<<1)+1, (6<<1)+1, (6<<1) - .byte 0 ; just for alignment 
/******************************************************************************/ /* @@ -513,6 +517,13 @@ bmw256_nextBlock: push_range 28, 29 push_range 2, 17 stack_alloc_large 32*4, r28, r29 + ldi r16, 0x4f + push r16 + ldi r16, 0xff + push r16 + push r16 + ldi r16, 0xfb + push r16 adiw r28, 1 ; push_range 28, 29 /* push Q */ ; push_range 22, 25 /* push M & H */ @@ -527,10 +538,7 @@ bmw256_nextBlock: adc acc1, r1 adc acc2, r1 adc acc3, r1 - st -X, acc3 - st -X, acc2 - st -X, acc1 - st -X, acc0 + rcall store_acc_to_dec_X /* call f0 */ movw r30, r22 movw r26, r24 @@ -555,20 +563,18 @@ f0: 30: ldi r18, 16 /* load initial index */ - ldi r30, lo8(f0_indextable-1) - ldi r31, hi8(f0_indextable-1) - add r30, r19 - adc r31, r1 - lpm r16, Z + /* load values from hacktable */ - ldi r30, lo8(f0_hacktable-2) - ldi r31, hi8(f0_hacktable-2) - lsl r19 - add r30, r19 + ldi r30, lo8(f0_hacktable-3) + ldi r31, hi8(f0_hacktable-3) + mov r16, r19 + lsl r16 + add r16, r19 + add r30, r16 adc r31, r1 - lsr r19 lpm r21, Z+ - lpm r20, Z + lpm r20, Z+ + lpm r16, Z+ 40: ;call add_hx_to_w add_hx_to_w: @@ -584,14 +590,12 @@ add_hx_to_w: rcall add_X_to_32 rjmp 500f 300: /* substract */ - ld r0, X+ - sub r22, r0 - ld r0, X+ - sbc r23, r0 - ld r0, X+ - sbc r24, r0 - ld r0, X+ - sbc r25, r0 + rcall load_acc_from_X + sub r22, acc0 + sbc r23, acc1 + sbc r24, acc2 + sbc r25, acc3 + 500: rcall store32_to_Y subi r16, -4 @@ -609,18 +613,19 @@ add_hx_to_w: rcall memxor_short sbiw r26, 60 ;--- - ldi r30, lo8(f0_s_table) - ldi r31, hi8(f0_s_table) + clr r17 ldi r21, 15 mov r8, r21 50: rcall load32_from_Y sbiw r28, 4 - lpm r20, Z+ - movw r2, r30 + mov r20, r17 rcall sn - movw r30, r2 - + inc r17 + cpi r17, 5 + brne 52f + clr r17 +52: rcall add_X_to_32 rcall store32_to_Y @@ -630,22 +635,19 @@ add_hx_to_w: rcall load32_from_Y clr r20 rcall sn - movw r30, r2 movw r26, h0 rcall add_X_to_32 sbiw r26, 4 - st -Y, r25 - st -Y, r24 - st -Y, r23 - st -Y, r22 + sbiw r28, 4 + rcall store32_to_Y + sbiw r28, 4 sbiw r28, 
15*4 movw r20, h0 movw r22, m0 /* call f1*/ - movw r24, r28 + movw r2, r28 f1: - movw r2, r24 movw r4, r22 movw r6, r20 movw r26, r2 @@ -735,10 +737,14 @@ f2: ld acc2, Y+ ld acc3, Y+ ;--- - ldi r30, lo8(f2_1_shift_table-1) - ldi r31, hi8(f2_1_shift_table-1) movw r22, xh0 movw r24, xh2 + cpi r17, 9 + brge 15f + clr r1 + rjmp 26f +15: ldi r30, lo8(f2_1_shift_table-9) + ldi r31, hi8(f2_1_shift_table-9) add r30, r17 adc r31, r1 lpm r20, Z @@ -854,7 +860,7 @@ f2: ; ldi r22, 'H' ; rcall printX ;--- END DBG - stack_free_large3 32*4 + stack_free_large3 32*4+4 pop_range 2, 17 pop_range 28, 29 ret @@ -1021,16 +1027,15 @@ bmw256_lastBlock: pctx.buffer[i*4] = i+0xa0; } */ - ldi r18, 0xa0 - ldi r19, 0xaa + ldi r22, 0xa0 + ldi r23, 0xaa + ldi r24, 0xaa + ldi r25, 0xaa movw r26, buf0 500: - st X+, r18 - st X+, r19 - st X+, r19 - st X+, r19 - inc r18 - sbrs r18, 4 + rcall store32_to_X + inc r22 + sbrs r22, 4 rjmp 500b /* bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h); memcpy(ctx->h, pctx.buffer, 64); @@ -1230,10 +1235,7 @@ bmw224_init: ldi r24, 0x01 ldi r25, 0x00 bmw_small_init: - st X+, r22 - st X+, r23 - st X+, r24 - st X+, r25 + rcall store32_to_X ldi r18, 16-1 ldi r20, 0x04 1: @@ -1241,10 +1243,7 @@ bmw_small_init: adc r23, r20 adc r24, r20 adc r25, r20 - st X+, r22 - st X+, r23 - st X+, r24 - st X+, r25 + rcall store32_to_X dec r18 brne 1b st X+, r1 -- 2.39.5