From c6a15ac3ba0c10bfb904be257a2fe5bda7b3dea3 Mon Sep 17 00:00:00 2001 From: bg Date: Sun, 4 Apr 2010 22:52:01 +0000 Subject: [PATCH] BMW224/256 now below 2KiB\! --- bmw/bmw_small-tinyasm.S | 1439 +++++++++++++++++++++++++++++++++++++++ hfal-performance.c | 11 +- mkfiles/bmw_tiny.mk | 12 + 3 files changed, 1456 insertions(+), 6 deletions(-) create mode 100644 bmw/bmw_small-tinyasm.S create mode 100644 mkfiles/bmw_tiny.mk diff --git a/bmw/bmw_small-tinyasm.S b/bmw/bmw_small-tinyasm.S new file mode 100644 index 0000000..764f281 --- /dev/null +++ b/bmw/bmw_small-tinyasm.S @@ -0,0 +1,1439 @@ +/* bmw_small-tinyasm.S */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2009 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ + +/* + * File: bmw_small-tinyasm.S + * Author: Daniel Otte + * Date: 2010-03-28 + * License: GPLv3 or later + * Description: implementation of BlueMidnightWish + * + */ + +#include "avr-asm-macros.S" + +/******************************************************************************/ +/* + param a: r22:r23:r24:r25 + param s: r20 +*/ +shiftleft32: + clr r0 + cpi r20, 8 + brlo bitrotateleft_1 + mov r25, r24 + mov r24, r23 + mov r23, r22 + clr r22 + subi r20, 8 + rjmp shiftleft32 + +/******************************************************************************/ +/* + param a: r22:r23:r24:r25 + param s: r20 +*/ +shiftright32: + cpi r20, 8 + brlo bitshiftright + mov r22, r23 + mov r23, r24 + mov r24, r25 + clr r25 + subi r20, 8 + rjmp shiftright32 +bitshiftright: + tst r20 + breq 20f +10: lsr r25 + ror r24 + ror r23 + ror r22 + dec r20 + brne 10b +20: ret + +/******************************************************************************/ +/* + param a: r22:r23:r24:r25 + param s: r20 +*/ +rotateleft32: + cpi r20, 8 + brlo bitrotateleft + mov r0, r25 + mov r25, r24 + mov r24, r23 + mov r23, r22 + mov r22, r0 + subi r20, 8 + rjmp rotateleft32 +bitrotateleft: + mov r0, r25 +bitrotateleft_1: + tst r20 + breq 20f +10: + lsl r0 + rol r22 + rol r23 + rol r24 + rol r25 + dec r20 + brne 10b +20: ret + + +/******************************************************************************/ + +s_table: +s0: .byte 1, 3, 4,19 +s1: .byte 1, 2, 8,23 +s2: .byte 2, 1,12,25 +s3: .byte 2, 2,15,29 +s4: .byte 1, 0, 0, 0 +s5: .byte 2, 0, 0, 0 + +eor_r22_in_r16: + eor r16, r22 + eor r17, r23 + eor r18, r24 + eor r19, r25 + ret + +/* + param x: r22:r23:r24:25 + param s: r20 +*/ +sn: + push_range 12, 20 + ldi r30, lo8(s_table) + ldi r31, hi8(s_table) + lsl r20 + lsl r20 + add r30, r20 + adc r31, r1 + movw r12, r22 + movw r14, r24 + lpm r20, Z+ + rcall shiftright32 + movw r16, r22 + movw r18, r24 +;--- + movw r22, r12 + movw r24, r14 + lpm r20, Z+ + rcall shiftleft32 + rcall eor_r22_in_r16 +;--- 
+ movw r22, r12 + movw r24, r14 + lpm r20, Z+ + rcall rotateleft32 + rcall eor_r22_in_r16 +;--- + movw r22, r12 + movw r24, r14 + lpm r20, Z+ + rcall rotateleft32 + eor r22, r16 + eor r23, r17 + eor r24, r18 + eor r25, r19 + pop_range 12, 20 + ret + +/******************************************************************************/ +/* + param dest: r26:r27 (X) + param src: r30:r31 (Z) + param len: r20 +*/ +memxor_short: +; tst r20 +; breq memxor_exit +10: ld r21, X + ld r22, Z+ + eor r21, r22 + st X+, r21 + dec r20 + brne 10b +memxor_exit: + ret + +/******************************************************************************/ +q0 = 2 +q1 = 3 +h0 = 4 +h1 = 5 +m0 = 6 +m1 = 7 + +add_hx_to_w: + movw r26, h0 + add r26, r16 + adc r27, r1 + ld r22, Y + ldd r23, Y+1 + ldd r24, Y+2 + ldd r25, Y+3 + lsl r20 + rol r21 + brcs 30f + /* addition */ + ld r0, X+ + add r22, r0 + ld r0, X+ + adc r23, r0 + ld r0, X+ + adc r24, r0 + ld r0, X+ + adc r25, r0 + rjmp 50f +30: /* substract */ + ld r0, X+ + sub r22, r0 + ld r0, X+ + sbc r23, r0 + ld r0, X+ + sbc r24, r0 + ld r0, X+ + sbc r25, r0 +50: + st Y+, r22 + st Y+, r23 + st Y+, r24 + st Y+, r25 + ret + +/******************************************************************************/ +load32_from_X: + ld r22, X+ + ld r23, X+ + ld r24, X+ + ld r25, X+ + ret + +load32_from_Y: + ld r22, Y+ + ld r23, Y+ + ld r24, Y+ + ld r25, Y+ + ret +/******************************************************************************/ +/* + param q: r28:r29 (Y) + param h: r26:r27 (X) + param m: r30:r31 (Z) +*/ + +f0_hacktable: + .byte 0x03, 0x11 + .byte 0xDD, 0xB3 + .byte 0x2A, 0x79 + .byte 0x07, 0xAA + .byte 0x51, 0xC2 +f0_indextable: + .byte 5*4,7*4,10*4,13*4,14*4 +; .byte 0 ; just for alignment +f0_s_table: + .byte 0,1,2,3,4 + .byte 0,1,2,3,4 + .byte 0,1,2,3,4 +; .byte 0 + +f0: + movw h0, r26 + movw q0, r28 + movw m0, r30 +;--- DBG +; push_range 22, 25 +; movw r24, r26 +; ldi r22, 'H' +; rcall printX +; pop_range 22, 25 +;--- END DBG +;--- DBG +; 
push_range 22, 25 +; movw r24, r30 +; ldi r22, 'M' +; rcall printX +; pop_range 22, 25 +;--- END DBG + /* xor m into h */ + ldi r20, 64 + rcall memxor_short + movw r30, m0 + movw r26, h0 + + /* set q to zero */ + ldi r22, 64 +10: st Y+, r1 + dec r22 + brne 10b + movw r28, q0 + /* calculate W and store it in Q */ + ldi r19, 5 +30: + ldi r18, 16 + /* load initial index */ + ldi r30, lo8(f0_indextable-1) + ldi r31, hi8(f0_indextable-1) + add r30, r19 + adc r31, r1 + lpm r16, Z + /* load values from hacktable */ + ldi r30, lo8(f0_hacktable-2) + ldi r31, hi8(f0_hacktable-2) + lsl r19 + add r30, r19 + adc r31, r1 + lsr r19 + lpm r21, Z+ + lpm r20, Z +40: + rcall add_hx_to_w /* local routine: the one-word rcall reaches it, like every other in-file call here */ + subi r16, -4 + andi r16, 0x0f<<2 + dec r18 + brne 40b + movw r28, q0 + dec r19 + brne 30b + movw r26, h0 +;--- DBG +; push_range 22, 25 +; movw r24, r28 +; ldi r22, 'W' +; rcall printX +; pop_range 22, 25 +;--- END DBG + /* xor m into h */ + ldi r20, 64 + movw r26, h0 + movw r30, m0 + rcall memxor_short + sbiw r26, 60 /* memxor_short advanced X by 64 bytes; step back so X points to h[1] */ +;--- + ldi r30, lo8(f0_s_table) + ldi r31, hi8(f0_s_table) + ldi r21, 15 + mov r8, r21 +50: + ldd r22, Y+0 + ldd r23, Y+1 + ldd r24, Y+2 + ldd r25, Y+3 + lpm r20, Z+ + movw r2, r30 /* sn reloads Z with s_table, so park the f0_s_table pointer in r2:r3 */ + rcall sn + movw r30, r2 + + ld r0, X+ + add r22, r0 + ld r0, X+ + adc r23, r0 + ld r0, X+ + adc r24, r0 + ld r0, X+ + adc r25, r0 + + st Y+, r22 + st Y+, r23 + st Y+, r24 + st Y+, r25 + dec r8 + brne 50b +;--- + ldd r22, Y+0 + ldd r23, Y+1 + ldd r24, Y+2 + ldd r25, Y+3 + clr r20 + rcall sn + movw r30, r2 + movw r26, h0 /* the last word of q wraps around and is combined with h[0] */ + ld r0, X+ + add r22, r0 + ld r0, X+ + adc r23, r0 + ld r0, X+ + adc r24, r0 + ld r0, X+ + adc r25, r0 + sbiw r26, 4 + st Y+, r22 + st Y+, r23 + st Y+, r24 + st Y+, r25 + ret + +/******************************************************************************/ + +const_lut: + .long 0x55555550, 0x5aaaaaa5, 0x5ffffffa, 0x6555554f + .long 0x6aaaaaa4, 0x6ffffff9, 0x7555554e, 0x7aaaaaa3 + .long 0x7ffffff8, 0x8555554d, 0x8aaaaaa2, 0x8ffffff7 + .long 0x9555554c, 0x9aaaaaa1, 0x9ffffff6, 0xa555554b +
+/******************************************************************************* +* uint32_t addelement(uint8_t j, const uint32_t* m, const uint32_t* h){ +* uint32_t r; +* r = pgm_read_dword(k_lut+j); +* r += rotl_addel(((uint32_t*)m)[j&0xf], j+0); +* r += rotl_addel(((uint32_t*)m)[(j+3)&0xf], j+3); +* r -= rotl_addel(((uint32_t*)m)[(j+10)&0xf], j+10); +* r ^= ((uint32_t*)h)[(j+7)&0xf]; +* return r; +* } +* param j: r24 +* param m: r22:r23 +* param h: r20:r21 +*/ +j = 16 +acc2 = 8 +acc3 = 9 +h0 = 10 +h1 = 11 +m0 = 12 +m1 = 13 +acc0 = 14 +acc1 = 15 + +add32_to_acc: /* acc = acc plus r22:r23:r24:r25 */ + add acc0, r22 + adc acc1, r23 + adc acc2, r24 + adc acc3, r25 + ret + +eor32_to_acc: /* acc = acc xor r22:r23:r24:r25 */ + eor acc0, r22 + eor acc1, r23 + eor acc2, r24 + eor acc3, r25 + ret + +load_acc_from_X: /* reads four bytes, low byte first, and leaves X four bytes further on */ + ld acc0, X+ + ld acc1, X+ + ld acc2, X+ + ld acc3, X+ + ret + +add_acc_to_Z: /* adds acc to the 32-bit word at Z in place and advances Z by four */ + ld r0, Z + add r0, acc0 + st Z+, r0 /* ld and st leave the carry flag alone, so it chains through the four adc steps */ + ld r0, Z + adc r0, acc1 + st Z+, r0 + ld r0, Z + adc r0, acc2 + st Z+, r0 + ld r0, Z + adc r0, acc3 + st Z+, r0 + ret + +load_rotate_add_M: /* in: r20 = word index, T flag = subtract instead of add */ + andi r20, 0x0f + mov r0, r20 + lsl r0 + lsl r0 + movw r26, m0 + add r26, r0 + adc r27, r1 + ld r22, X+ + ld r23, X+ + ld r24, X+ + ld r25, X+ + inc r20 /* rotate amount is the masked index plus one */ + rcall rotateleft32 + brts 10f /* T was chosen by the caller with clt or set */ + rcall add32_to_acc + ret +10: sub acc0, r22 + sbc acc1, r23 + sbc acc2, r24 + sbc acc3, r25 + ret + +addelement: /* result is left in acc0..acc3 */ + mov j, r24 + movw h0, r20 + movw m0, r22 + lsl r24 + lsl r24 + mov r28, r24 /* NOTE(review): r28 is not read again inside this routine; confirm this copy is still needed */ + ldi r30, lo8(const_lut) + ldi r31, hi8(const_lut) + add r30, r24 + adc r31, r1 + lpm acc0, Z+ + lpm acc1, Z+ + lpm acc2, Z+ + lpm acc3, Z+ + clt + mov r20, j + rcall load_rotate_add_M + mov r20, j + subi r20, -3 + rcall load_rotate_add_M + mov r20, j + set + subi r20, -10 + rcall load_rotate_add_M + lsl j + lsl j + subi j, -7*4 + andi j, 0x3f /* byte offset of the h word selected by (j plus 7) modulo 16 */ + movw r26, h0 + add r26, j + adc r27, r1 + ld r0, X+ + eor acc0, r0 + ld r0, X+ + eor acc1, r0 + ld r0, X+ + eor acc2, r0 + ld r0, X+ + eor acc3, r0 +;--- + ret + +/******************************************************************************/ +/* + param q:
r26:r27 + param m: r22:r23 + param h: r20:r21 + param j: r24 +*/ + +expand_intro: + push_range 20, 27 +; push r24 + rcall addelement +; pop r24 + pop_range 20, 27 + lsl r24 + lsl r24 + add r26, r24 + adc r27, r1 + ret +expand1: + rcall expand_intro + ldi r19, 1 +10: + rcall load32_from_X + mov r20, r19 + andi r20, 3 + rcall sn + rcall add32_to_acc + inc r19 + cpi r19, 17 + brne 10b +expand1_exit: +; adiw r26, 63 + st X+, acc0 + st X+, acc1 + st X+, acc2 + st X+, acc3 + ret + +/******************************************************************************/ +/* + param q: r26:r27 + param m: r22:r23 + param h: r20:r21 + param j: r24 +*/ + +expand2_rot_table: + .byte 0,3,0,7,0,13,0,16,0,19,0,23,0,27 + +expand2: + rcall expand_intro + ldi r19, 14 + ldi r30, lo8(expand2_rot_table) + ldi r31, hi8(expand2_rot_table) +10: + rcall load32_from_X + mov r20, r19 + lpm r20, Z+ + rcall rotateleft32 + rcall add32_to_acc + dec r19 + brne 10b + rcall load32_from_X + ldi r20, 4 + rcall sn + rcall add32_to_acc + rcall load32_from_X + ldi r20, 5 + rcall sn + rcall add32_to_acc + + rjmp expand1_exit + +/******************************************************************************/ +/* + param q: r24:r25 + param m: r22:r23 + param h: r20:r21 +*/ +/* for calling expand1/2 + param q: r26:r27 + param m: r22:r23 + param h: r20:r21 + param j: r24 +*/ +f1: + movw r2, r24 + movw r4, r22 + movw r6, r20 + movw r26, r2 +; movw r22, r4 +; movw r20, r6 + clr r24 + rcall expand1 + movw r26, r2 + movw r22, r4 + movw r20, r6 + ldi r24, 1 + rcall expand1 + ldi r17, 2 +10: movw r26, r2 + movw r22, r4 + movw r20, r6 + mov r24, r17 + rcall expand2 + inc r17 + sbrs r17, 4 + rjmp 10b + ret + +/******************************************************************************/ +/* + param q: r24:r25 + param m: r22:r23 + param h: r20:r21 +*/ +f2_1_shift_table: + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x2B, 0x64, 0x66, 0x03, 0x51, 0x55, 0x87, 0x55 +f2_2_shift_table: + .byte (2<<1), (7<<1), 
(4<<1), (3<<1), (4<<1)+1, (6<<1)+1, (6<<1) + .byte 0 ; just for alignment +acc2 = 8 +acc3 = 9 +acc0 = 14 +acc1 = 15 +xl0 = 2 +xl1 = 3 +xl2 = 4 +xl3 = 5 +xh0 = 6 +xh1 = 7 +xh2 = 10 +xh3 = 11 +q16_0 = 12 +q16_1 = 13 +h0 = 18 +h1 = 19 +f2: + movw r26, r24 + /* calc XL */ + adiw r26, 63 + adiw r26, 1 + movw q16_0, r26 + clr xl0 + clr xl1 + clr xl2 + clr xl3 + ldi r17, 8 +10: ld r0, X+ + eor xl0, r0 + ld r0, X+ + eor xl1, r0 + ld r0, X+ + eor xl2, r0 + ld r0, X+ + eor xl3, r0 + dec r17 + brne 10b +;--- /* calc XH */ + movw xh0, xl0 + movw xh2, xl2 + ldi r17, 8 +10: ld r0, X+ + eor xh0, r0 + ld r0, X+ + eor xh1, r0 + ld r0, X+ + eor xh2, r0 + ld r0, X+ + eor xh3, r0 + dec r17 + brne 10b +;--- DBG +; push_range 22, 25 +; movw r22, xl0 +; movw r24, xl2 +; rcall print32 +; movw r22, xh0 +; movw r24, xh2 +; rcall print32 +; pop_range 22, 25 +;--- END DBG + +;--- /* calc first half of h0..h15 */ + movw h0, r20 + movw r28, r22 + movw r26, q16_0 + ldi r17, 16 +10: + ld acc0, Y+ + ld acc1, Y+ + ld acc2, Y+ + ld acc3, Y+ +;--- + ldi r30, lo8(f2_1_shift_table-1) + ldi r31, hi8(f2_1_shift_table-1) + movw r22, xh0 + movw r24, xh2 + add r30, r17 + adc r31, r1 + lpm r20, Z + mov r1, r20 + andi r20, 0x0f + clt + cpi r17, 16 + breq 20f + cpi r17, 11 + brne 21f +20: set +21: brts 25f + rcall shiftright32 + rjmp 26f +25: rcall shiftleft32 +26: rcall eor32_to_acc +;--- + rcall load32_from_X + mov r20, r1 + clr r1 + swap r20 + andi r20, 0x0f + brts 27f + rcall shiftleft32 + rjmp 28f +27: rcall shiftright32 +28: rcall eor32_to_acc +;--- + movw r30, h0 + st Z+, acc0 + st Z+, acc1 + st Z+, acc2 + st Z+, acc3 + movw h0, r30 +;--- + dec r17 + brne 10b +;----- + sbiw r26, 4*8 /* X points to q[24] */ + movw r28, r26 + sbiw r28, 63 + sbiw r28, 33 /* Y points to q[0] */ + sbiw r30, 63 + sbiw r30, 1 /* Z points to h0 */ + ldi r17, 8 +10: movw acc0, xl0 + movw acc2, xl2 + rcall load32_from_X + rcall eor32_to_acc + rcall load32_from_Y + rcall eor32_to_acc + rcall add_acc_to_Z + dec r17 + brne 10b + 
sbiw r26, 9*4 /* X points to q[23] */ + rcall load_acc_from_X + eor acc1, xl0 /* these three eors fold in XL shifted left by one byte */ + eor acc2, xl1 + eor acc3, xl2 + rcall load32_from_Y + rcall eor32_to_acc + rcall add_acc_to_Z +;--- + sbiw r26, 8*4 /* X points to q[16] */ + movw h0, r30 /* keep the full 16-bit h pointer: Z is reused for lpm below and reloaded with movw r30, h0 */ + ldi r17, 7 +10: + ldi r30, lo8(f2_2_shift_table-1) + ldi r31, hi8(f2_2_shift_table-1) + add r30, r17 + adc r31, r1 + lpm r20, Z + rcall load_acc_from_X + movw r22, xl0 + movw r24, xl2 + lsr r20 /* bit 0 of the table byte picks the direction, the remaining bits are the shift count */ + brcc 20f + rcall shiftleft32 + rjmp 21f +20: rcall shiftright32 +21: + rcall eor32_to_acc + rcall load32_from_Y + rcall eor32_to_acc + movw r30, h0 + rcall add_acc_to_Z + movw h0, r30 + dec r17 + brne 10b +;----- + sbiw r30, 8*4 /* Z points to h8 */ + movw r26, r30 + sbiw r26, 4*4 /* X points to h4 */ + ldi r17, 8 + ldi r18, 9 +10: + rcall load32_from_X + mov r20, r18 + rcall rotateleft32 + movw acc0, r22 + movw acc2, r24 + rcall add_acc_to_Z + inc r18 + cpi r17, 5 + breq 20f + dec r17 + brne 10b + ret +20: sbiw r26, 8*4 /* four words were read, so X has reached h8; wrap it back to h0 */ + dec r17 + rjmp 10b + +/******************************************************************************/ +/* + param ctx: r24:r25 + param msg: r22:r23 +*/ +/* f0 + param q: r28:r29 (Y) + param h: r26:r27 (X) + param m: r30:r31 (Z) +*/ +/* f1 + param q: r24:r25 + param m: r22:r23 + param h: r20:r21 +*/ +/* f2 + param q: r24:r25 + param m: r22:r23 + param h: r20:r21 +*/ +.global bmw_small_nextBlock +.global bmw224_nextBlock +.global bmw256_nextBlock +bmw_small_nextBlock: +bmw224_nextBlock: +bmw256_nextBlock: + push_range 28, 29 + push_range 2, 17 + stack_alloc_large 32*4, r28, r29 + adiw r28, 1 + push_range 28, 29 /* push Q */ + push_range 22, 25 /* push M & H */ + /* increment counter */ + movw r26, r24 + movw r2, r26 + adiw r26, 63 + adiw r26, 1 + rcall load_acc_from_X + ldi r19, 1 + add acc0, r19 + adc acc1, r1 + adc acc2, r1 + adc acc3, r1 + st -X, acc3 /* X sits four bytes past the counter; pre-decrement stores run downwards, so the top byte goes first */ + st -X, acc2 + st -X, acc1 + st -X, acc0 + /* call f0 */ + movw r30, r22 + movw r26, r24 + rcall f0 + /* call f1*/ + pop r21 + pop r20 + pop r23 + pop r22 + pop r25 + pop
r24 +; rcall printQ + push_range 20, 25 + rcall f1 + /* call f2 */ +; pop_range 20, 25 +; push_range 20, 25 +; rcall printQ + pop_range 20, 25 +; push r20 +; push r21 + rcall f2 /* f2 lives in this file, so the one-word rcall is enough, as for f0 and f1 */ +;--- DBG +; pop r25 +; pop r24 +; ldi r22, 'H' +; rcall printX +;--- END DBG + stack_free_large3 32*4 + pop_range 2, 17 + pop_range 28, 29 + ret + +/******************************************************************************/ +/* + param ctx: r24:r25 + param msg: r22:r23 + param len: r20:r21 +*/ +ctx0 = 2 +ctx1 = 3 +blc0 = 4 +blc1 = 5 +len0 = 28 +len1 = 29 +buf0 = 6 +buf1 = 7 + +.global bmw_small_lastBlock +.global bmw224_lastBlock +.global bmw256_lastBlock +bmw_small_lastBlock: +bmw224_lastBlock: +bmw256_lastBlock: +/* while(length_b >= BMW_SMALL_BLOCKSIZE){ + bmw_small_nextBlock(ctx, block); + length_b -= BMW_SMALL_BLOCKSIZE; + block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B; + } +*/ + push_range 2, 7 + push_range 28, 29 + movw ctx0, r24 + movw blc0, r22 + movw len0, r20 +1: + cpi len1, hi8(512) /* 512 has a zero low byte, so comparing the high byte alone is sufficient */ + brlo 2f + movw r24, ctx0 + movw r22, blc0 + rcall bmw_small_nextBlock + ldi r24, 64 + add blc0, r24 + adc blc1, r1 + subi len1, hi8(512) + rjmp 1b +2: +/* struct { + uint8_t buffer[64]; + uint32_t ctr; + } pctx; +*/ + stack_alloc_large 68 + adiw r30, 1 + movw buf0, r30 +/* memset(pctx.buffer, 0, 64); + memcpy(pctx.buffer, block, (length_b+7)/8); + pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07); +*/ movw r24, len0 + lsr r25 + ror r24 + lsr r24 + lsr r24 + ldi r23, 63 /* r23 = number of zero bytes to write after the marker byte */ + sub r23, r24 + movw r26, blc0 + tst r24 + breq 301f + /* copy (#r24) bytes to stack buffer */ +30: ld r20, X+ + st Z+, r20 + dec r24 + brne 30b +301: /* calculate the appended byte */ + clr r20 + mov r21, len0 + ldi r24, 0x80 + andi r21, 0x07 + breq 305f + ld r20, X+ +303: + lsr r24 + dec r21 + brne 303b +305: + or r20, r24 + st Z+, r20 + tst r23 + breq 32f +31: st Z+, r1 + dec r23 + brne 31b +32: +/* if(length_b+1>64*8-64){ ; = 64*7-1 = 447 max(length_b)=511 + bmw_small_nextBlock(ctx, pctx.buffer); + memset(pctx.buffer, 0,
64-8); + ctx->counter -= 1; + } +*/ + tst len1 + breq 400f + cpi len0, 192 + brlo 400f + movw r24, ctx0 + movw r22, buf0 + rcall bmw_small_nextBlock + movw r26, buf0 + ldi r20, 64-8 +350: + st X+, r1 + dec r20 + brne 350b + movw r30, ctx0 + adiw r30, 60 + ldd r21, Z+4 + ldd r22, Z+5 + ldd r23, Z+6 + ldd r24, Z+7 + subi r21, 1 + sbc r22, r1 + sbc r23, r1 + sbc r24, r1 + rjmp 410f +/* *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b; + bmw_small_nextBlock(ctx, pctx.buffer); +*/ +400: + movw r30, ctx0 + adiw r30, 60 + ldd r21, Z+4 + ldd r22, Z+5 + ldd r23, Z+6 + ldd r24, Z+7 +410: + clr r25 + lsl r21 + rol r22 + rol r23 + rol r24 + rol r25 + mov r20, len0 + add r21, len1 + adc r22, r1 + adc r23, r1 + adc r24, r1 + adc r25, r1 + movw r30, buf0 + adiw r30, 64-8 + st Z+, r20 + st Z+, r21 + st Z+, r22 + st Z+, r23 + st Z+, r24 + st Z+, r25 + st Z+, r1 + st Z+, r1 + movw r24, ctx0 + movw r22, buf0 + rcall bmw_small_nextBlock +/* memset(pctx.buffer, 0xaa, 64); + for(i=0; i<16;++i){ + pctx.buffer[i*4] = i+0xa0; + } +*/ + ldi r18, 0xa0 + ldi r19, 0xaa + movw r26, buf0 +500: + st X+, r18 + st X+, r19 + st X+, r19 + st X+, r19 + inc r18 + sbrs r18, 4 + rjmp 500b +/* bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h); + memcpy(ctx->h, pctx.buffer, 64); +*/ + movw r24, buf0 + movw r22, ctx0 + rcall bmw_small_nextBlock + ldi r18, 64 + movw r26, ctx0 + movw r30, buf0 +600: + ld r20, Z+ + st X+, r20 + dec r18 + brne 600b + + stack_free_large 68 + pop_range 28, 29 + pop_range 2, 7 + ret + + +/******************************************************************************* +* void bmw224_ctx2hash(void* dest, const bmw224_ctx_t* ctx){ +* memcpy(dest, &(ctx->h[9]), 224/8); +* } +* +* param dest: r24:r25 +* param ctx: r22:r23 +*/ +.global bmw224_ctx2hash +bmw224_ctx2hash: + movw r26, r24 + movw r30, r22 + adiw r30, 9*4 + ldi r22, 28 + rjmp 1f + +/******************************************************************************* +* void 
bmw256_ctx2hash(void* dest, const bmw256_ctx_t* ctx){ +* memcpy(dest, &(ctx->h[8]), 256/8); +* } +* +* param dest: r24:r25 +* param ctx: r22:r23 +*/ +.global bmw256_ctx2hash +bmw256_ctx2hash: + movw r26, r24 + movw r30, r22 + adiw r30, 8*4 + ldi r22, 32 +1: + ld r23, Z+ + st X+, r23 + dec r22 + brne 1b + ret + +/******************************************************************************* +* void bmw256(void* dest, const void* msg, uint32_t length_b){ +* bmw_small_ctx_t ctx; +* bmw256_init(&ctx); +* while(length_b>=BMW_SMALL_BLOCKSIZE){ +* bmw_small_nextBlock(&ctx, msg); +* length_b -= BMW_SMALL_BLOCKSIZE; +* msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B; +* } +* bmw_small_lastBlock(&ctx, msg, length_b); +* bmw256_ctx2hash(dest, &ctx); +* } +* +* param dest: r24:r25 +* param msg: r22:r23 +* param length_b: r18:r21 +*/ +ctx0 = 2 +ctx1 = 3 +msg0 = 4 +msg1 = 5 +len0 = 6 +len1 = 7 +len2 = 8 +len3 = 9 +dst0 = 10 +dst1 = 11 +.global bmw256 +bmw256: + push r16 + ldi r16, 1 + rjmp bmw_small_all + +/******************************************************************************* +* void bmw224(void* dest, const void* msg, uint32_t length_b){ +* bmw_small_ctx_t ctx; +* bmw224_init(&ctx); +* while(length_b>=BMW_SMALL_BLOCKSIZE){ +* bmw_small_nextBlock(&ctx, msg); +* length_b -= BMW_SMALL_BLOCKSIZE; +* msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B; +* } +* bmw_small_lastBlock(&ctx, msg, length_b); +* bmw224_ctx2hash(dest, &ctx); +* } +* +* param dest: r24:r25 +* param msg: r22:r23 +* param length_b: r18:r21 +*/ +ctx0 = 2 +ctx1 = 3 +msg0 = 4 +msg1 = 5 +len0 = 6 +len1 = 7 +len2 = 8 +len3 = 9 +dst0 = 10 +dst1 = 11 +.global bmw224 +bmw224: + push r16 + clr r16 + +bmw_small_all: + push_range 2, 11 + stack_alloc_large 64+4 + adiw r30, 1 + movw ctx0, r30 + movw dst0, r24 + movw msg0, r22 + movw len0, r18 + movw len2, r20 + movw r24, ctx0 + ldi r30, pm_lo8(init_lut) + ldi r31, pm_hi8(init_lut) + add r30, r16 + adc r31, r1 + icall +20: + mov r18, len2 + or r18, len3 + breq 50f + movw r24, 
ctx0 + movw r22, msg0 + rcall bmw_small_nextBlock + ldi r20, 2 + sub len1, r20 + sbc len2, r1 + sbc len3, r1 + ldi r20, 64 + add msg0, r20 + adc msg1, r1 + rjmp 20b +50: + movw r24, ctx0 + movw r22, msg0 + movw r20, len0 + rcall bmw_small_lastBlock + movw r24, dst0 + movw r22, ctx0 + ldi r30, pm_lo8(c2h_lut) + ldi r31, pm_hi8(c2h_lut) + add r30, r16 + adc r31, r1 + icall + stack_free_large 64+4 + pop_range 2, 11 + pop r16 + ret + +init_lut: + rjmp bmw224_init + rjmp bmw256_init +c2h_lut: + rjmp bmw224_ctx2hash + rjmp bmw256_ctx2hash + +/******************************************************************************* +* void bmw224_init(bmw224_ctx_t* ctx){ +* uint8_t i; +* ctx->h[0] = 0x00010203; +* for(i=1; i<16; ++i){ +* ctx->h[i] = ctx->h[i-1]+ 0x04040404; +* } +* ctx->counter=0; +* } +* +* param ctx: r24:r25 +*/ +.global bmw224_init +bmw224_init: + movw r26, r24 + ldi r22, 0x03 + ldi r23, 0x02 + ldi r24, 0x01 + ldi r25, 0x00 +bmw_small_init: + st X+, r22 + st X+, r23 + st X+, r24 + st X+, r25 + ldi r18, 16-1 + ldi r20, 0x04 +1: + add r22, r20 + adc r23, r20 + adc r24, r20 + adc r25, r20 + st X+, r22 + st X+, r23 + st X+, r24 + st X+, r25 + dec r18 + brne 1b + st X+, r1 + st X+, r1 + st X+, r1 + st X+, r1 + ret + +.global bmw256_init +bmw256_init: + movw r26, r24 + ldi r22, 0x43 + ldi r23, 0x42 + ldi r24, 0x41 + ldi r25, 0x40 + rjmp bmw_small_init + + +/******************************************************************************/ + +#if DEBUG + +printQ: + push_range 20, 25 + ldi r16, 4 + mov r9, r16 + movw r16, r24 + ldi r24, lo8(qdbg_str) + ldi r25, hi8(qdbg_str) + call cli_putstr_P + clr r8 +10: ldi r24, lo8(qdbg_str1) + ldi r25, hi8(qdbg_str1) + call cli_putstr_P + mov r24, r8 + call cli_hexdump_byte + ldi r24, lo8(qdbg_str2) + ldi r25, hi8(qdbg_str2) + call cli_putstr_P + movw r24, r16 + clr r23 + ldi r22, 4 + call cli_hexdump_rev + add r16, r9 + adc r17, r1 + inc r8 + sbrs r8, 5 + rjmp 10b + pop_range 20, 25 + ret +qdbg_str: .asciz "\r\nDBG Q: " +qdbg_str1: 
.asciz "\r\n Q[" +qdbg_str2: .asciz "] = " + + +printX: + push_range 6, 9 + push_range 16, 27 + push_range 30, 31 + ldi r16, 4 + mov r6, r22 + mov r9, r16 + movw r16, r24 + ldi r24, lo8(Xdbg_str) + ldi r25, hi8(Xdbg_str) + call cli_putstr_P + mov r24, r6 + call cli_putc + ldi r24, ':' + call cli_putc + clr r8 +10: ldi r24, lo8(Xdbg_str1) + ldi r25, hi8(Xdbg_str1) + call cli_putstr_P + mov r24, r6 + call cli_putc + ldi r24, '[' + call cli_putc + mov r24, r8 + call cli_hexdump_byte + ldi r24, lo8(Xdbg_str2) + ldi r25, hi8(Xdbg_str2) + call cli_putstr_P + movw r24, r16 + clr r23 + ldi r22, 4 + call cli_hexdump_rev + add r16, r9 + adc r17, r1 + inc r8 + sbrs r8, 4 + rjmp 10b + pop_range 30, 31 + pop_range 16, 27 + pop_range 6, 9 + ret +Xdbg_str: .asciz "\r\nDBG " +Xdbg_str1: .asciz "\r\n " +Xdbg_str2: .asciz "] = " + +print32: + push_range 6, 9 + push_range 16, 27 + push_range 30, 31 + movw r6, r22 + movw r8, r24 + ldi r24, lo8(Xdbg_str) + ldi r25, hi8(Xdbg_str) + call cli_putstr_P + mov r24, r9 + call cli_hexdump_byte + mov r24, r8 + call cli_hexdump_byte + mov r24, r7 + call cli_hexdump_byte + mov r24, r6 + call cli_hexdump_byte + pop_range 30, 31 + pop_range 16, 27 + pop_range 6, 9 + ret + + +print_acc: + push_range 16, 27 + push_range 30, 31 + ldi r24, lo8(Xdbg_str) + ldi r25, hi8(Xdbg_str) + call cli_putstr_P + mov r24, r9 + call cli_hexdump_byte + mov r24, r8 + call cli_hexdump_byte + mov r24, r15 + call cli_hexdump_byte + mov r24, r14 + call cli_hexdump_byte + pop_range 30, 31 + pop_range 16, 27 + ret + +#endif + diff --git a/hfal-performance.c b/hfal-performance.c index 6f51dc6..5371f80 100644 --- a/hfal-performance.c +++ b/hfal-performance.c @@ -139,13 +139,12 @@ void hfal_stacksize(const hfdesc_t* hd){ uint8_t data[(hf.blocksize_b+7)/8]; uint8_t digest[(hf.hashsize_b+7)/8]; uint16_t t1, t2; - uint8_t i; if(hf.type!=HFDESC_TYPE_HASHFUNCTION) return; cli_putstr_P(PSTR("\r\n\r\n === ")); cli_putstr_P(hf.name); - cli_putstr_P(PSTR(" stack-usage === " + 
cli_putstr_P(PSTR(" stack-usage === ")); cli(); stack_measure_init(&smctx, PATTERN_A); @@ -153,7 +152,7 @@ void hfal_stacksize(const hfdesc_t* hd){ t1 = stack_measure_final(&smctx); stack_measure_init(&smctx, PATTERN_B); hf.init(&ctx); - t1 = stack_measure_final(&smctx); + t2 = stack_measure_final(&smctx); sei(); t1 = (t1>t2)?t1:t2; @@ -166,7 +165,7 @@ void hfal_stacksize(const hfdesc_t* hd){ t1 = stack_measure_final(&smctx); stack_measure_init(&smctx, PATTERN_B); hf.nextBlock(&ctx, data); - t1 = stack_measure_final(&smctx); + t2 = stack_measure_final(&smctx); sei(); t1 = (t1>t2)?t1:t2; @@ -179,7 +178,7 @@ void hfal_stacksize(const hfdesc_t* hd){ t1 = stack_measure_final(&smctx); stack_measure_init(&smctx, PATTERN_B); hf.lastBlock(&ctx, data, 0); - t1 = stack_measure_final(&smctx); + t2 = stack_measure_final(&smctx); sei(); t1 = (t1>t2)?t1:t2; @@ -192,7 +191,7 @@ void hfal_stacksize(const hfdesc_t* hd){ t1 = stack_measure_final(&smctx); stack_measure_init(&smctx, PATTERN_B); hf.ctx2hash(digest, &ctx); - t1 = stack_measure_final(&smctx); + t2 = stack_measure_final(&smctx); sei(); t1 = (t1>t2)?t1:t2; diff --git a/mkfiles/bmw_tiny.mk b/mkfiles/bmw_tiny.mk new file mode 100644 index 0000000..f38cee3 --- /dev/null +++ b/mkfiles/bmw_tiny.mk @@ -0,0 +1,12 @@ +# Makefile for BlueMidnightWish +ALGO_NAME := BMW_TINY + +# comment out the following line for removement of BlueMidnightWish from the build process +HASHES += $(ALGO_NAME) + +$(ALGO_NAME)_DIR := bmw/ +$(ALGO_NAME)_OBJ := bmw_small-tinyasm.o bmw_large.o +$(ALGO_NAME)_TEST_BIN := main-bmw-test.o hfal_bmw_small.o hfal_bmw_large.o $(CLI_STD) $(HFAL_STD) +$(ALGO_NAME)_NESSIE_TEST := test nessie +$(ALGO_NAME)_PERFORMANCE_TEST := performance + -- 2.39.5