From: bg Date: Tue, 11 May 2010 15:02:32 +0000 (+0000) Subject: bmw tiny split up X-Git-Url: https://git.cryptolib.org/?p=avr-crypto-lib.git;a=commitdiff_plain;h=52ef0158a4d47da53cf4595a7f9ee867947b4390 bmw tiny split up --- diff --git a/bmw/bmw_224-tinyasm.S b/bmw/bmw_224-tinyasm.S new file mode 100644 index 0000000..e01752c --- /dev/null +++ b/bmw/bmw_224-tinyasm.S @@ -0,0 +1,1281 @@ +/* bmw_small-tinyasm.S */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2009 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +/* + * File: bmw_small-tinyasm.S + * Author: Daniel Otte + * Date: 2010-03-28 + * License: GPLv3 or later + * Description: implementation of BlueMidnightWish + * + */ + +#include "avr-asm-macros.S" + +acc2 = 8 +acc3 = 9 +acc0 = 14 +acc1 = 15 + +#define DEBUG 0 + +/******************************************************************************/ +/* + param a: r22:r23:r24:r25 + param s: r20 +*/ +shiftleft32: + clr r0 + cpi r20, 8 + brlo bitrotateleft_1 + mov r25, r24 + mov r24, r23 + mov r23, r22 + clr r22 + subi r20, 8 + rjmp shiftleft32 + +/******************************************************************************/ +/* + param a: r22:r23:r24:r25 + param s: r20 +*/ +shiftright32: + cpi r20, 8 + brlo bitshiftright + mov r22, r23 + mov r23, r24 + mov r24, r25 + clr r25 + subi r20, 8 + rjmp shiftright32 +bitshiftright: + tst r20 + breq 20f +10: lsr r25 + ror r24 + ror r23 + ror r22 + dec r20 + brne 10b +20: ret + +/******************************************************************************/ +/* + param a: r22:r23:r24:r25 + param s: r20 +*/ +rotateleft32: + cpi r20, 8 + brlo bitrotateleft + mov r0, r25 + mov r25, r24 + mov r24, r23 + mov r23, r22 + mov r22, r0 + subi r20, 8 + rjmp rotateleft32 +bitrotateleft: + mov r0, r25 +bitrotateleft_1: + tst r20 + breq 20f +10: + lsl r0 +rol32: + rol r22 + rol r23 + rol r24 + rol r25 + dec r20 + brne 10b +20: ret + + +/******************************************************************************/ + +sn_stub: + movw r22, r2 + movw r24, r4 + lpm r20, Z+ + rcall rotateleft32 +eor32_to_acc: + eor acc0, r22 + eor acc1, r23 + eor acc2, r24 + eor acc3, r25 + ret + +s_table: +s0: .byte 1, 3, 4,19 +s1: .byte 1, 2, 8,23 +s2: .byte 2, 1,12,25 +s3: .byte 2, 2,15,29 +s4: .byte 1, 0, 0, 0 +s5: .byte 2, 0, 0, 0 + +h0 = 10 +h1 = 11 +m0 = 12 +m1 = 13 + +/* + param x: r22:r23:r24:25 + param s: r20 +*/ +sn: + push_range 2, 5 + push acc0 + push acc1 + push acc2 + push acc3 + ldi r30, lo8(s_table) + ldi r31, hi8(s_table) + lsl r20 + lsl 
r20 + add r30, r20 + adc r31, r1 + movw r2, r22 + movw r4, r24 + lpm r20, Z+ + rcall shiftright32 + rcall mov32_to_acc +;--- + movw r22, r2 + movw r24, r4 + lpm r20, Z+ + rcall shiftleft32 + rcall eor32_to_acc +;--- + rcall sn_stub + rcall sn_stub + + movw r22, acc0 + movw r24, acc2 + pop acc3 + pop acc2 + pop acc1 + pop acc0 + rjmp pop5 + +/******************************************************************************/ +/* + param dest: r26:r27 (X) + param src: r30:r31 (Z) + param len: r20 +*/ +memxor_64: +; tst r20 +; breq memxor_exit + ldi r20, 64 +memxor: +10: ld r21, X + ld r22, Z+ + eor r21, r22 + st X+, r21 + dec r20 + brne 10b +memxor_exit: + ret + +/******************************************************************************/ +q0 = 2 +q1 = 3 +h0 = 4 +h1 = 5 +m0 = 6 +m1 = 7 + + +/******************************************************************************/ +load32_from_X: + ld r22, X+ + ld r23, X+ + ld r24, X+ + ld r25, X+ + ret + +load32_from_Y: + ld r22, Y+ + ld r23, Y+ + ld r24, Y+ + ld r25, Y+ + ret + +store32_to_Y: + st Y+, r22 + st Y+, r23 + st Y+, r24 + st Y+, r25 + ret + +add_X_to_32: + ld r0, X+ + add r22, r0 + ld r0, X+ + adc r23, r0 + ld r0, X+ + adc r24, r0 + ld r0, X+ + adc r25, r0 + ret + +store32_to_X: + st X+, r22 + st X+, r23 + st X+, r24 + st X+, r25 + ret + +mov32_to_acc: + movw acc0, r22 + movw acc2, r24 + ret + +/******************************************************************************/ +/* + param q: r28:r29 (Y) + param h: r26:r27 (X) + param m: r30:r31 (Z) +*/ + +f2_1_shift_table: +; .byte 0x2B, 0x64, 0x66, 0x03, 0x51, 0x55, 0x87, 0x55 + .byte 0x55, 0x87, 0x55, 0x51, 0x03, 0x66, 0x64, 0x2B +f2_2_shift_table: +; .byte (2<<1), (7<<1), (4<<1), (3<<1), (4<<1)+1, (6<<1)+1, (6<<1) + .byte (8<<1)+1, (6<<1), (6<<1)+1, (4<<1)+1, (3<<1), (4<<1), (7<<1), (2<<1) +expand2_rot_table: + .byte 3,7,13,16,19,23,27 + +f0_hacktable: + .byte 0x03, 0x11, 5*4 + .byte 0xDD, 0xB3, 7*4 + .byte 0x2A, 0x79, 10*4 + .byte 0x07, 0xAA, 13*4 + .byte 0x51, 
0xC2, 14*4 + + +/******************************************************************************* +* uint32_t addelment(uint8_t j, const uint32_t* m, const uint32_t* h){ +* uint32_t r; +* r = pgm_read_dword(k_lut+j); +* r += rotl_addel(((uint32_t*)m)[j&0xf], j+0); +* r += rotl_addel(((uint32_t*)m)[(j+3)&0xf], j+3); +* r -= rotl_addel(((uint32_t*)m)[(j+10)&0xf], j+10); +* r ^= ((uint32_t*)h)[(j+7)&0xf]; +* return r; +* } +* param j: r24 +* param m: r22:r23 +* param h: r20:r21 +*/ +j = 16 +acc2 = 8 +acc3 = 9 +h0 = 10 +h1 = 11 +m0 = 12 +m1 = 13 +acc0 = 14 +acc1 = 15 + +load_acc_from_X: + ld acc0, X+ + ld acc1, X+ + ld acc2, X+ + ld acc3, X+ + ret + +add_acc_to_X: + ld r0, X + add r0, acc0 + st X+, r0 + ld r0, X + adc r0, acc1 + st X+, r0 + ld r0, X + adc r0, acc2 + st X+, r0 + ld r0, X + adc r0, acc3 + st X+, r0 + ret + +load_rotate_add_M: + mov r20, j + andi r20, 0x0f + mov r0, r20 + lsl r0 + lsl r0 + movw r26, m0 + add r26, r0 + adc r27, r1 + rcall load32_from_X + inc r20 + rcall rotateleft32 + brts 10f + rjmp add32_to_acc +; ret +10: sub acc0, r22 + sbc acc1, r23 + sbc acc2, r24 + sbc acc3, r25 + ret + + +;--- + +/******************************************************************************/ +load_sn_add: + rcall load32_from_X + rcall sn +add32_to_acc: + add acc0, r22 + adc acc1, r23 + adc acc2, r24 + adc acc3, r25 + ret + +/* + param q: r26:r27 + param m: r22:r23 + param h: r20:r21 + param j: r24 +*/ + +expand_intro: + push_range 26, 27 + push r24 +addelement: + mov j, r24 + movw h0, r20 + movw m0, r22 + sbiw r26, 4 + rcall load_acc_from_X + ldi r24, 0x55 + add acc0, r24 + adc acc1, r24 + adc acc2, r24 + ldi r24, 5 + adc acc3, r24 + rcall store_acc_to_dec_X + adiw r26, 4 + clt + rcall load_rotate_add_M + subi j, -3 + rcall load_rotate_add_M + set + subi j, -7 + rcall load_rotate_add_M + lsl j + lsl j + subi j, -7*4+10*4 + andi j, 0x3f + movw r26, h0 + add r26, j + adc r27, r1 + rcall load32_from_X + rcall eor32_to_acc +;-- + pop r24 + pop_range 26, 27 + lsl r24 + 
lsl r24 + add r26, r24 + adc r27, r1 + ret +expand1: + rcall expand_intro + ldi r19, 1 +10: + mov r20, r19 + andi r20, 3 + rcall load_sn_add + inc r19 + cpi r19, 17 + brne 10b + rjmp expand2_exit + + +/******************************************************************************/ +/* + param q: r26:r27 + param m: r22:r23 + param h: r20:r21 + param j: r24 +*/ + + +expand2: + rcall expand_intro + ldi r19, 14 + ldi r30, lo8(expand2_rot_table) + ldi r31, hi8(expand2_rot_table) +10: + rcall load32_from_X + sbrs r19, 0 + rjmp 12f + lpm r20, Z+ + rcall rotateleft32 +12: rcall add32_to_acc + dec r19 + brne 10b + ldi r20, 4 + rcall load_sn_add + ldi r20, 5 + rcall load_sn_add +expand2_exit: + adiw r26, 4 +store_acc_to_dec_X: + st -X, acc3 + st -X, acc2 + st -X, acc1 + st -X, acc0 + ret + +/******************************************************************************/ +/* + param q: r24:r25 + param m: r22:r23 + param h: r20:r21 +*/ +/* for calling expand1/2 + param q: r26:r27 + param m: r22:r23 + param h: r20:r21 + param j: r24 +*/ + +/******************************************************************************/ +/* + param q: r24:r25 + param m: r22:r23 + param h: r20:r21 +*/ + +/******************************************************************************/ +/* + param ctx: r24:r25 + param msg: r22:r23 +*/ +/* f0 + param q: r28:r29 (Y) + param h: r26:r27 (X) + param m: r30:r31 (Z) +*/ +/* f1 + param q: r24:r25 + param m: r22:r23 + param h: r20:r21 +*/ +/* f2 + param q: r24:r25 + param m: r22:r23 + param h: r20:r21 +*/ +q0 = 2 +q1 = 3 +h0 = 4 +h1 = 5 +m0 = 6 +m1 = 7 +ctx0 = 2 +ctx1 = 3 +msg0 = 4 +msg1 = 5 + +restore_f1: + movw r26, r2 + movw r22, r4 + movw r20, r6 + ret +bmw_small_nextBlock_early: + movw r24, ctx0 + movw r22, msg0 +.global bmw224_nextBlock +bmw_small_nextBlock: +bmw224_nextBlock: +bmw256_nextBlock: + push_range 2, 7 + push_range 28, 29 + push_range 8, 17 + stack_alloc_large 32*4, r28, r29 + ldi r16, 0x4f + push r16 + ldi r16, 0xff + push r16 + push r16 + 
ldi r16, 0xfb + push r16 + adiw r28, 1 +; push_range 28, 29 /* push Q */ +; push_range 22, 25 /* push M & H */ + /* increment counter */ + movw r26, r24 + movw r2, r26 + adiw r26, 63 + adiw r26, 1 + rcall load_acc_from_X + ldi r19, 1 + add acc0, r19 + adc acc1, r1 + adc acc2, r1 + adc acc3, r1 + rcall store_acc_to_dec_X + /* call f0 */ + movw r30, r22 + movw r26, r24 +f0: + movw h0, r26 + movw q0, r28 + movw m0, r30 + /* xor m into h */ +; ldi r20, 64 + rcall memxor_64 + movw r30, m0 + movw r26, h0 + + /* set q to zero */ + ldi r22, 64 +10: st Y+, r1 + dec r22 + brne 10b + movw r28, q0 + /* calculate W and store it in Q */ + ldi r19, 5 +30: + ldi r18, 16 + /* load initial index */ + + /* load values from hacktable */ + ldi r30, lo8(f0_hacktable-3) + ldi r31, hi8(f0_hacktable-3) + mov r16, r19 + lsl r16 + add r16, r19 + add r30, r16 + adc r31, r1 + lpm r21, Z+ + lpm r20, Z+ + lpm r16, Z+ +40: + ;call add_hx_to_w +add_hx_to_w: + movw r26, h0 + add r26, r16 + adc r27, r1 + rcall load32_from_Y + sbiw r28, 4 + lsl r20 + rol r21 + brcs 300f + /* addition */ + rcall add_X_to_32 + rjmp 500f +300: /* substract */ + rcall load_acc_from_X + sub r22, acc0 + sbc r23, acc1 + sbc r24, acc2 + sbc r25, acc3 + +500: + rcall store32_to_Y + subi r16, -4 + andi r16, 0x0f<<2 + dec r18 + brne 40b + movw r28, q0 + dec r19 + brne 30b + movw r26, h0 + /* xor m into h */ +; ldi r20, 64 + movw r26, h0 + movw r30, m0 + rcall memxor_64 + sbiw r26, 60 +;--- + clr r17 + ldi r21, 15 + mov r8, r21 +50: + rcall load32_from_Y + sbiw r28, 4 + mov r20, r17 + rcall sn + inc r17 + cpi r17, 5 + brne 52f + clr r17 +52: + rcall add_X_to_32 + rcall store32_to_Y + + dec r8 + brne 50b +;--- + rcall load32_from_Y + clr r20 + rcall sn + movw r26, h0 + rcall add_X_to_32 + sbiw r26, 4 + sbiw r28, 4 + rcall store32_to_Y + sbiw r28, 4 + sbiw r28, 15*4 + movw r20, h0 + movw r22, m0 + + /* call f1*/ + movw r2, r28 +f1: + movw r4, r22 + movw r6, r20 + movw r26, r2 + clr r24 + rcall expand1 + rcall restore_f1 + ldi r24, 
1 + rcall expand1 + ldi r17, 2 +10: rcall restore_f1 + mov r24, r17 + rcall expand2 + inc r17 + sbrs r17, 4 + rjmp 10b + rcall restore_f1 + movw r24, r2 + + + /* call f2 */ +; pop_range 20, 25 +; push_range 20, 25 +; rcall printQ +; push r20 +; push r21 +acc2 = 8 +acc3 = 9 +acc0 = 14 +acc1 = 15 +xl0 = 2 +xl1 = 3 +xl2 = 4 +xl3 = 5 +xh0 = 6 +xh1 = 7 +xh2 = 10 +xh3 = 11 +q16_0 = 12 +q16_1 = 13 +h0 = 18 +h1 = 19 +f2: + movw r26, r24 + /* calc XL & XH */ + adiw r26, 63 + adiw r26, 1 + movw q16_0, r26 + movw h0, r20 +;--- +; push h0 +; push h1 +;--- + movw r28, r22 + rcall load_acc_from_X + ldi r17, 15 +10: rcall load32_from_X + rcall eor32_to_acc + cpi r17, 9 + brne 15f + movw xl0, acc0 + movw xl2, acc2 +15: + dec r17 + brne 10b + movw xh0, acc0 + movw xh2, acc2 +;--- DBG +; push_range 22, 25 +; movw r22, xl0 +; movw r24, xl2 +; rcall print32 +; movw r22, xh0 +; movw r24, xh2 +; rcall print32 +; pop_range 22, 25 +;--- END DBG + /* copy m(Y) into h */ + movw r26, h0 + ldi r22, 64 +10: + ld r23, Y+ + st X+, r23 + dec r22 + brne 10b +;--- /* calc first half of h0..h15 */ + movw r28, q16_0 + movw r26, h0 + ldi r30, lo8(f2_1_shift_table) + ldi r31, hi8(f2_1_shift_table) + ldi r17, 16 +10: +;--- + movw r22, xh0 + movw r24, xh2 + cpi r17, 9 + brge 15f + clr r1 + rjmp 26f +15: lpm r20, Z+ + mov r1, r20 + andi r20, 0x0f + clt + cpi r17, 16 + breq 20f + cpi r17, 11 + brne 21f +20: set +21: brts 25f + rcall shiftright32 + rjmp 26f +25: rcall shiftleft32 +26: rcall mov32_to_acc +;--- + rcall load32_from_Y + mov r20, r1 + clr r1 + swap r20 + andi r20, 0x0f + brts 27f + rcall shiftleft32 + rjmp 28f +27: rcall shiftright32 +28: rcall eor32_to_acc +;--- + rcall load32_from_X + rcall eor32_to_acc + rcall store_acc_to_dec_X + adiw r26, 4 +;--- + dec r17 + brne 10b +;----- + sbiw r28, 4*8 /* Y points to q[24] */ + movw r30, r28 + sbiw r28, 63 + sbiw r28, 33 /* Y points to q[0] */ + movw r26, r28 + ldi r20, 8*4 + /* xor q[24..31] into q[0..7] */ + rcall memxor + /* xor q[23] into q[8] */ + 
sbiw r30, 9*4 + ldi r20, 4 + rcall memxor + /* xor q[16..22] into q[9..15] */ + sbiw r30, 8*4 + ldi r20, 7*4 + rcall memxor + + movw r26, h0 + ldi r17, 15 + ldi r30, lo8(f2_2_shift_table) + ldi r31, hi8(f2_2_shift_table) +10: movw r22, xl0 + movw r24, xl2 + sbrc r17, 3 + rjmp 20f + lpm r20, Z+ + lsr r20 + brcs 15f + rcall shiftright32 + rjmp 20f +15: + rcall shiftleft32 +20: + rcall mov32_to_acc + rcall load32_from_Y + rcall eor32_to_acc + rcall add_acc_to_X + dec r17 + brpl 10b +;----- + sbiw r26, 8*4 /* X points to h8 */ + movw r28, r26 + sbiw r28, 4*4 /* Y points to h4 */ + ldi r17, 8 + ldi r18, 9 +10: + rcall load32_from_Y + mov r20, r18 + rcall rotateleft32 + rcall mov32_to_acc + rcall add_acc_to_X + inc r18 + cpi r17, 5 + brne 20f + sbiw r28, 8*4 +20: dec r17 + brne 10b + +exit: +;--- DBG +; pop r25 +; pop r24 +; ldi r22, 'H' +; rcall printX +;--- END DBG + stack_free_large3 32*4+4 + pop_range 10, 17 +pop9: + pop_range 8, 9 +pop28: + pop_range 28, 29 +pop7: + pop_range 6, 7 +pop5: + pop_range 2, 5 + ret + +/******************************************************************************/ +ctx0 = 2 +ctx1 = 3 +blc0 = 4 +blc1 = 5 +len0 = 28 +len1 = 29 +buf0 = 6 +buf1 = 7 + +load32_from_Z_stub: + movw r30, ctx0 + adiw r30, 60 + ldd r21, Z+4 + ldd r22, Z+5 + ldd r23, Z+6 + ldd r24, Z+7 + ret + +/******************************************************************************/ +/* + param ctx: r24:r25 + param msg: r22:r23 + param len: r20:r21 +*/ + +.global bmw224_lastBlock +bmw_small_lastBlock: +bmw224_lastBlock: +bmw256_lastBlock: +/* while(length_b >= BMW_SMALL_BLOCKSIZE){ + bmw_small_nextBlock(ctx, block); + length_b -= BMW_SMALL_BLOCKSIZE; + block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B; + } +*/ + push_range 2, 7 + push_range 28, 29 + movw ctx0, r24 + movw blc0, r22 + movw len0, r20 +1: + cpi len1, hi8(512) + brlo 2f + rcall bmw_small_nextBlock_early + ldi r24, 64 + add blc0, r24 + adc blc1, r1 + subi len1, hi8(512) + rjmp 1b +2: +/* struct { + uint8_t 
buffer[64]; + uint32_t ctr; + } pctx; +*/ + stack_alloc_large 68 + adiw r30, 1 + movw buf0, r30 +/* memset(pctx.buffer, 0, 64); + memcpy(pctx.buffer, block, (length_b+7)/8); + pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07); +*/ movw r24, len0 + ldi r23, 63 + movw r26, blc0 + lsr r25 + ror r24 + lsr r24 + lsr r24 + breq 301f + sub r23, r24 + /* copy (#r24) bytes to stack buffer */ +30: ld r20, X+ + st Z+, r20 + dec r24 + brne 30b +301: /* calculate the appended byte */ + clr r20 + mov r21, len0 + ldi r24, 0x80 + andi r21, 0x07 + breq 305f + ld r20, X+ +303: + lsr r24 + dec r21 + brne 303b +305: + or r20, r24 + st Z+, r20 + tst r23 + breq 32f +31: st Z+, r1 + dec r23 + brne 31b +32: +/* if(length_b+1>64*8-64){ ; = 64*7-1 = 447 max(length_b)=511 + bmw_small_nextBlock(ctx, pctx.buffer); + memset(pctx.buffer, 0, 64-8); + ctx->counter -= 1; + } +*/ + tst len1 + breq 400f + cpi len0, 192 + brlo 400f + movw blc0, buf0 + rcall bmw_small_nextBlock_early + movw r26, buf0 + ldi r20, 64-8 +350: + st X+, r1 + dec r20 + brne 350b + rcall load32_from_Z_stub + subi r21, 1 + sbc r22, r1 + sbc r23, r1 + sbc r24, r1 + rjmp 410f +/* *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b; + bmw_small_nextBlock(ctx, pctx.buffer); +*/ +400: + rcall load32_from_Z_stub +410: + clr r25 + ldi r20, 1 + lsl r21 + rcall rol32 + mov r20, len0 + add r21, len1 + adc r22, r1 + adc r23, r1 + adc r24, r1 + adc r25, r1 + movw r26, buf0 + adiw r26, 64-8 + st X+, r20 + st X+, r21 + rcall store32_to_X + st X+, r1 + st X+, r1 + movw blc0, buf0 + rcall bmw_small_nextBlock_early +/* memset(pctx.buffer, 0xaa, 64); + for(i=0; i<16;++i){ + pctx.buffer[i*4] = i+0xa0; + } +*/ + ldi r22, 0xa0 + ldi r23, 0xaa + ldi r24, 0xaa + ldi r25, 0xaa + movw r26, buf0 +500: + rcall store32_to_X + inc r22 + sbrs r22, 4 + rjmp 500b +/* bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h); + memcpy(ctx->h, pctx.buffer, 64); +*/ + movw r24, buf0 + movw r22, ctx0 + rcall bmw_small_nextBlock + 
ldi r18, 64 + movw r26, ctx0 + movw r30, buf0 +600: + ld r20, Z+ + st X+, r20 + dec r18 + brne 600b + + stack_free_large 68 + rjmp pop28 + + +/******************************************************************************* +* void bmw224_ctx2hash(void* dest, const bmw224_ctx_t* ctx){ +* memcpy(dest, &(ctx->h[9]), 224/8); +* } +* +* param dest: r24:r25 +* param ctx: r22:r23 +*/ +.global bmw224_ctx2hash +bmw224_ctx2hash: + movw r30, r22 + adiw r30, 9*4 + ldi r18, 28 +1: movw r26, r24 +1: ld r23, Z+ + st X+, r23 + dec r18 + brne 1b + ret + + +/******************************************************************************* +* void bmw224(void* dest, const void* msg, uint32_t length_b){ +* bmw_small_ctx_t ctx; +* bmw224_init(&ctx); +* while(length_b>=BMW_SMALL_BLOCKSIZE){ +* bmw_small_nextBlock(&ctx, msg); +* length_b -= BMW_SMALL_BLOCKSIZE; +* msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B; +* } +* bmw_small_lastBlock(&ctx, msg, length_b); +* bmw224_ctx2hash(dest, &ctx); +* } +* +* param dest: r24:r25 +* param msg: r22:r23 +* param length_b: r18:r21 +* +* The 68 byte context (64 bytes h + 32 bit counter) lives on the stack. +* Full blocks are consumed here only while the upper 16 bits of length_b +* are non-zero; the remaining 16 bit length is handed to bmw_small_lastBlock, +* which runs its own block loop. +*/ +ctx0 = 2 +ctx1 = 3 +msg0 = 4 +msg1 = 5 +len0 = 28 +len1 = 29 +len2 = 8 +len3 = 9 +dst0 = 6 +dst1 = 7 +.global bmw224 +bmw224: + push_range 2, 7 + push_range 28, 29 + push_range 8, 9 + stack_alloc_large 64+4 + /* step Z to the first byte of the reserved area, exactly as + bmw_small_lastBlock does after the same macro; NOTE(review): the macro + lives in avr-asm-macros.S - presumably it leaves Z equal to the new SP, + in which case ctx would otherwise overlap the return address pushed by + the rcalls below */ + adiw r30, 1 +10: movw ctx0, r30 + movw dst0, r24 + movw msg0, r22 + movw len0, r18 + movw len2, r20 + movw r24, ctx0 + rcall bmw224_init +20: + mov r18, len2 + or r18, len3 + breq 50f + rcall bmw_small_nextBlock_early + subi len1, 2 + sbc len2, r1 + sbc len3, r1 + ldi r20, 64 + add msg0, r20 + adc msg1, r1 + rjmp 20b +50: + movw r24, ctx0 + movw r22, msg0 + movw r20, len0 + rcall bmw_small_lastBlock + movw r24, dst0 + movw r22, ctx0 + rcall bmw224_ctx2hash + stack_free_large 64+4 + rjmp pop9 + +/******************************************************************************* +* void bmw224_init(bmw224_ctx_t* ctx){ +* uint8_t i; +* ctx->h[0] = 0x00010203; +* for(i=1; i<16; ++i){ +* ctx->h[i] = ctx->h[i-1]+ 0x04040404; +* } +* 
ctx->counter=0; +* } +* +* param ctx: r24:r25 +*/ +.global bmw224_init +bmw224_init: + ldi r22, 0x00 + ldi r23, 0x40 + movw r26, r24 + adiw r26, 4 +10: + st -X, r22 + inc r22 + mov r20, r22 + andi r20, 0x3 + brne 10b + adiw r26, 8 +20: cp r22, r23 + brne 10b + st -X, r1 + st -X, r1 + st -X, r1 + st -X, r1 + ret + + +/******************************************************************************/ + +#if DEBUG + +printQ: + push_range 20, 25 + ldi r16, 4 + mov r9, r16 + movw r16, r24 + ldi r24, lo8(qdbg_str) + ldi r25, hi8(qdbg_str) + call cli_putstr_P + clr r8 +10: ldi r24, lo8(qdbg_str1) + ldi r25, hi8(qdbg_str1) + call cli_putstr_P + mov r24, r8 + call cli_hexdump_byte + ldi r24, lo8(qdbg_str2) + ldi r25, hi8(qdbg_str2) + call cli_putstr_P + movw r24, r16 + clr r23 + ldi r22, 4 + call cli_hexdump_rev + add r16, r9 + adc r17, r1 + inc r8 + sbrs r8, 5 + rjmp 10b + pop_range 20, 25 + ret +qdbg_str: .asciz "\r\nDBG Q: " +qdbg_str1: .asciz "\r\n Q[" +qdbg_str2: .asciz "] = " + + +printX: + push_range 6, 9 + push_range 16, 27 + push_range 30, 31 + ldi r16, 4 + mov r6, r22 + mov r9, r16 + movw r16, r24 + ldi r24, lo8(Xdbg_str) + ldi r25, hi8(Xdbg_str) + call cli_putstr_P + mov r24, r6 + call cli_putc + ldi r24, ':' + call cli_putc + clr r8 +10: ldi r24, lo8(Xdbg_str1) + ldi r25, hi8(Xdbg_str1) + call cli_putstr_P + mov r24, r6 + call cli_putc + ldi r24, '[' + call cli_putc + mov r24, r8 + call cli_hexdump_byte + ldi r24, lo8(Xdbg_str2) + ldi r25, hi8(Xdbg_str2) + call cli_putstr_P + movw r24, r16 + clr r23 + ldi r22, 4 + call cli_hexdump_rev + add r16, r9 + adc r17, r1 + inc r8 + sbrs r8, 4 + rjmp 10b + pop_range 30, 31 + pop_range 16, 27 + pop_range 6, 9 + ret +Xdbg_str: .asciz "\r\nDBG " +Xdbg_str1: .asciz "\r\n " +Xdbg_str2: .asciz "] = " + +print32: + push_range 6, 9 + push_range 16, 27 + push_range 30, 31 + movw r6, r22 + movw r8, r24 + ldi r24, lo8(Xdbg_str) + ldi r25, hi8(Xdbg_str) + call cli_putstr_P + mov r24, r9 + call cli_hexdump_byte + mov r24, r8 + call 
cli_hexdump_byte + mov r24, r7 + call cli_hexdump_byte + mov r24, r6 + call cli_hexdump_byte + pop_range 30, 31 + pop_range 16, 27 + pop_range 6, 9 + ret + + +/* print_acc (only assembled inside the #if DEBUG section): + prints Xdbg_str via cli_putstr_P, then hex-dumps r9, r8, r15, r14, + i.e. acc3, acc2, acc1, acc0 - the 32 bit accumulator, most significant + byte first. r16-r27 and r30-r31 are bracketed by push_range/pop_range. */ +print_acc: + push_range 16, 27 + push_range 30, 31 + ldi r24, lo8(Xdbg_str) + ldi r25, hi8(Xdbg_str) + call cli_putstr_P + mov r24, r9 + call cli_hexdump_byte + mov r24, r8 + call cli_hexdump_byte + mov r24, r15 + call cli_hexdump_byte + mov r24, r14 + call cli_hexdump_byte + pop_range 30, 31 + pop_range 16, 27 + ret + +#endif + diff --git a/bmw/bmw_256-tinyasm.S b/bmw/bmw_256-tinyasm.S new file mode 100644 index 0000000..6327bc1 --- /dev/null +++ b/bmw/bmw_256-tinyasm.S @@ -0,0 +1,1302 @@ +/* bmw_small-tinyasm.S */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2009 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +/* + * File: bmw_small-tinyasm.S + * Author: Daniel Otte + * Date: 2010-03-28 + * License: GPLv3 or later + * Description: implementation of BlueMidnightWish + * + */ + +#include "avr-asm-macros.S" + +acc2 = 8 +acc3 = 9 +acc0 = 14 +acc1 = 15 + +#define DEBUG 0 + +/******************************************************************************/ +/* + param a: r22:r23:r24:r25 + param s: r20 +*/ +shiftleft32: + clr r0 + cpi r20, 8 + brlo bitrotateleft_1 + mov r25, r24 + mov r24, r23 + mov r23, r22 + clr r22 + subi r20, 8 + rjmp shiftleft32 + +/******************************************************************************/ +/* + param a: r22:r23:r24:r25 + param s: r20 +*/ +shiftright32: + cpi r20, 8 + brlo bitshiftright + mov r22, r23 + mov r23, r24 + mov r24, r25 + clr r25 + subi r20, 8 + rjmp shiftright32 +bitshiftright: + tst r20 + breq 20f +10: lsr r25 + ror r24 + ror r23 + ror r22 + dec r20 + brne 10b +20: ret + +/******************************************************************************/ +/* + param a: r22:r23:r24:r25 + param s: r20 +*/ +rotateleft32: + cpi r20, 8 + brlo bitrotateleft + mov r0, r25 + mov r25, r24 + mov r24, r23 + mov r23, r22 + mov r22, r0 + subi r20, 8 + rjmp rotateleft32 +bitrotateleft: + mov r0, r25 +bitrotateleft_1: + tst r20 + breq 20f +10: + lsl r0 +rol32: + rol r22 + rol r23 + rol r24 + rol r25 + dec r20 + brne 10b +20: ret + + +/******************************************************************************/ + +sn_stub: + movw r22, r2 + movw r24, r4 + lpm r20, Z+ + rcall rotateleft32 +eor32_to_acc: + eor acc0, r22 + eor acc1, r23 + eor acc2, r24 + eor acc3, r25 + ret + +s_table: +s0: .byte 1, 3, 4,19 +s1: .byte 1, 2, 8,23 +s2: .byte 2, 1,12,25 +s3: .byte 2, 2,15,29 +s4: .byte 1, 0, 0, 0 +s5: .byte 2, 0, 0, 0 + +h0 = 10 +h1 = 11 +m0 = 12 +m1 = 13 + +/* + param x: r22:r23:r24:25 + param s: r20 +*/ +sn: + push_range 2, 5 + push acc0 + push acc1 + push acc2 + push acc3 + ldi r30, lo8(s_table) + ldi r31, hi8(s_table) + lsl r20 + lsl 
r20 + add r30, r20 + adc r31, r1 + movw r2, r22 + movw r4, r24 + lpm r20, Z+ + rcall shiftright32 + rcall mov32_to_acc +;--- + movw r22, r2 + movw r24, r4 + lpm r20, Z+ + rcall shiftleft32 + rcall eor32_to_acc +;--- + rcall sn_stub + rcall sn_stub + + movw r22, acc0 + movw r24, acc2 + pop acc3 + pop acc2 + pop acc1 + pop acc0 + rjmp pop5 + +/******************************************************************************/ +/* + param dest: r26:r27 (X) + param src: r30:r31 (Z) + param len: r20 +*/ +memxor_64: +; tst r20 +; breq memxor_exit + ldi r20, 64 +memxor: +10: ld r21, X + ld r22, Z+ + eor r21, r22 + st X+, r21 + dec r20 + brne 10b +memxor_exit: + ret + +/******************************************************************************/ +q0 = 2 +q1 = 3 +h0 = 4 +h1 = 5 +m0 = 6 +m1 = 7 + + +/******************************************************************************/ +load32_from_X: + ld r22, X+ + ld r23, X+ + ld r24, X+ + ld r25, X+ + ret + +load32_from_Y: + ld r22, Y+ + ld r23, Y+ + ld r24, Y+ + ld r25, Y+ + ret + +store32_to_Y: + st Y+, r22 + st Y+, r23 + st Y+, r24 + st Y+, r25 + ret + +add_X_to_32: + ld r0, X+ + add r22, r0 + ld r0, X+ + adc r23, r0 + ld r0, X+ + adc r24, r0 + ld r0, X+ + adc r25, r0 + ret + +store32_to_X: + st X+, r22 + st X+, r23 + st X+, r24 + st X+, r25 + ret + +mov32_to_acc: + movw acc0, r22 + movw acc2, r24 + ret + +/******************************************************************************/ +/* + param q: r28:r29 (Y) + param h: r26:r27 (X) + param m: r30:r31 (Z) +*/ + +f2_1_shift_table: +; .byte 0x2B, 0x64, 0x66, 0x03, 0x51, 0x55, 0x87, 0x55 + .byte 0x55, 0x87, 0x55, 0x51, 0x03, 0x66, 0x64, 0x2B +f2_2_shift_table: +; .byte (2<<1), (7<<1), (4<<1), (3<<1), (4<<1)+1, (6<<1)+1, (6<<1) + .byte (8<<1)+1, (6<<1), (6<<1)+1, (4<<1)+1, (3<<1), (4<<1), (7<<1), (2<<1) +expand2_rot_table: + .byte 3,7,13,16,19,23,27 + +f0_hacktable: + .byte 0x03, 0x11, 5*4 + .byte 0xDD, 0xB3, 7*4 + .byte 0x2A, 0x79, 10*4 + .byte 0x07, 0xAA, 13*4 + .byte 0x51, 
0xC2, 14*4 + + +/******************************************************************************* +* uint32_t addelment(uint8_t j, const uint32_t* m, const uint32_t* h){ +* uint32_t r; +* r = pgm_read_dword(k_lut+j); +* r += rotl_addel(((uint32_t*)m)[j&0xf], j+0); +* r += rotl_addel(((uint32_t*)m)[(j+3)&0xf], j+3); +* r -= rotl_addel(((uint32_t*)m)[(j+10)&0xf], j+10); +* r ^= ((uint32_t*)h)[(j+7)&0xf]; +* return r; +* } +* param j: r24 +* param m: r22:r23 +* param h: r20:r21 +*/ +j = 16 +acc2 = 8 +acc3 = 9 +h0 = 10 +h1 = 11 +m0 = 12 +m1 = 13 +acc0 = 14 +acc1 = 15 + +load_acc_from_X: + ld acc0, X+ + ld acc1, X+ + ld acc2, X+ + ld acc3, X+ + ret + +add_acc_to_X: + ld r0, X + add r0, acc0 + st X+, r0 + ld r0, X + adc r0, acc1 + st X+, r0 + ld r0, X + adc r0, acc2 + st X+, r0 + ld r0, X + adc r0, acc3 + st X+, r0 + ret + +load_rotate_add_M: + mov r20, j + andi r20, 0x0f + mov r0, r20 + lsl r0 + lsl r0 + movw r26, m0 + add r26, r0 + adc r27, r1 + rcall load32_from_X + inc r20 + rcall rotateleft32 + brts 10f + rjmp add32_to_acc +; ret +10: sub acc0, r22 + sbc acc1, r23 + sbc acc2, r24 + sbc acc3, r25 + ret + + +;--- + +/******************************************************************************/ +load_sn_add: + rcall load32_from_X + rcall sn +add32_to_acc: + add acc0, r22 + adc acc1, r23 + adc acc2, r24 + adc acc3, r25 + ret + +/* + param q: r26:r27 + param m: r22:r23 + param h: r20:r21 + param j: r24 +*/ + +expand_intro: + push_range 26, 27 + push r24 +addelement: + mov j, r24 + movw h0, r20 + movw m0, r22 + sbiw r26, 4 + rcall load_acc_from_X + ldi r24, 0x55 + add acc0, r24 + adc acc1, r24 + adc acc2, r24 + ldi r24, 5 + adc acc3, r24 + rcall store_acc_to_dec_X + adiw r26, 4 + clt + rcall load_rotate_add_M + subi j, -3 + rcall load_rotate_add_M + set + subi j, -7 + rcall load_rotate_add_M + lsl j + lsl j + subi j, -7*4+10*4 + andi j, 0x3f + movw r26, h0 + add r26, j + adc r27, r1 + rcall load32_from_X + rcall eor32_to_acc +;-- + pop r24 + pop_range 26, 27 + lsl r24 + 
lsl r24 + add r26, r24 + adc r27, r1 + ret +expand1: + rcall expand_intro + ldi r19, 1 +10: + mov r20, r19 + andi r20, 3 + rcall load_sn_add + inc r19 + cpi r19, 17 + brne 10b + rjmp expand2_exit + + +/******************************************************************************/ +/* + param q: r26:r27 + param m: r22:r23 + param h: r20:r21 + param j: r24 +*/ + + +expand2: + rcall expand_intro + ldi r19, 14 + ldi r30, lo8(expand2_rot_table) + ldi r31, hi8(expand2_rot_table) +10: + rcall load32_from_X + sbrs r19, 0 + rjmp 12f + lpm r20, Z+ + rcall rotateleft32 +12: rcall add32_to_acc + dec r19 + brne 10b + ldi r20, 4 + rcall load_sn_add + ldi r20, 5 + rcall load_sn_add +expand2_exit: + adiw r26, 4 +store_acc_to_dec_X: + st -X, acc3 + st -X, acc2 + st -X, acc1 + st -X, acc0 + ret + +/******************************************************************************/ +/* + param q: r24:r25 + param m: r22:r23 + param h: r20:r21 +*/ +/* for calling expand1/2 + param q: r26:r27 + param m: r22:r23 + param h: r20:r21 + param j: r24 +*/ + +/******************************************************************************/ +/* + param q: r24:r25 + param m: r22:r23 + param h: r20:r21 +*/ + +/******************************************************************************/ +/* + param ctx: r24:r25 + param msg: r22:r23 +*/ +/* f0 + param q: r28:r29 (Y) + param h: r26:r27 (X) + param m: r30:r31 (Z) +*/ +/* f1 + param q: r24:r25 + param m: r22:r23 + param h: r20:r21 +*/ +/* f2 + param q: r24:r25 + param m: r22:r23 + param h: r20:r21 +*/ +q0 = 2 +q1 = 3 +h0 = 4 +h1 = 5 +m0 = 6 +m1 = 7 +ctx0 = 2 +ctx1 = 3 +msg0 = 4 +msg1 = 5 + +restore_f1: + movw r26, r2 + movw r22, r4 + movw r20, r6 + ret +bmw_small_nextBlock_early: + movw r24, ctx0 + movw r22, msg0 +.global bmw_small_nextBlock +.global bmw256_nextBlock +bmw_small_nextBlock: +bmw224_nextBlock: +bmw256_nextBlock: + push_range 2, 7 + push_range 28, 29 + push_range 8, 17 + stack_alloc_large 32*4, r28, r29 + ldi r16, 0x4f + push r16 + ldi r16, 
0xff + push r16 + push r16 + ldi r16, 0xfb + push r16 + adiw r28, 1 +; push_range 28, 29 /* push Q */ +; push_range 22, 25 /* push M & H */ + /* increment counter */ + movw r26, r24 + movw r2, r26 + adiw r26, 63 + adiw r26, 1 + rcall load_acc_from_X + ldi r19, 1 + add acc0, r19 + adc acc1, r1 + adc acc2, r1 + adc acc3, r1 + rcall store_acc_to_dec_X + /* call f0 */ + movw r30, r22 + movw r26, r24 +f0: + movw h0, r26 + movw q0, r28 + movw m0, r30 + /* xor m into h */ +; ldi r20, 64 + rcall memxor_64 + movw r30, m0 + movw r26, h0 + + /* set q to zero */ + ldi r22, 64 +10: st Y+, r1 + dec r22 + brne 10b + movw r28, q0 + /* calculate W and store it in Q */ + ldi r19, 5 +30: + ldi r18, 16 + /* load initial index */ + + /* load values from hacktable */ + ldi r30, lo8(f0_hacktable-3) + ldi r31, hi8(f0_hacktable-3) + mov r16, r19 + lsl r16 + add r16, r19 + add r30, r16 + adc r31, r1 + lpm r21, Z+ + lpm r20, Z+ + lpm r16, Z+ +40: + ;call add_hx_to_w +add_hx_to_w: + movw r26, h0 + add r26, r16 + adc r27, r1 + rcall load32_from_Y + sbiw r28, 4 + lsl r20 + rol r21 + brcs 300f + /* addition */ + rcall add_X_to_32 + rjmp 500f +300: /* substract */ + rcall load_acc_from_X + sub r22, acc0 + sbc r23, acc1 + sbc r24, acc2 + sbc r25, acc3 + +500: + rcall store32_to_Y + subi r16, -4 + andi r16, 0x0f<<2 + dec r18 + brne 40b + movw r28, q0 + dec r19 + brne 30b + movw r26, h0 + /* xor m into h */ +; ldi r20, 64 + movw r26, h0 + movw r30, m0 + rcall memxor_64 + sbiw r26, 60 +;--- + clr r17 + ldi r21, 15 + mov r8, r21 +50: + rcall load32_from_Y + sbiw r28, 4 + mov r20, r17 + rcall sn + inc r17 + cpi r17, 5 + brne 52f + clr r17 +52: + rcall add_X_to_32 + rcall store32_to_Y + + dec r8 + brne 50b +;--- + rcall load32_from_Y + clr r20 + rcall sn + movw r26, h0 + rcall add_X_to_32 + sbiw r26, 4 + sbiw r28, 4 + rcall store32_to_Y + sbiw r28, 4 + sbiw r28, 15*4 + movw r20, h0 + movw r22, m0 + + /* call f1*/ + movw r2, r28 +f1: + movw r4, r22 + movw r6, r20 + movw r26, r2 + clr r24 + rcall expand1 
+ rcall restore_f1 + ldi r24, 1 + rcall expand1 + ldi r17, 2 /* r17 = 2..15 is handed to expand2 in r24; sbrs ends the loop once bit 4 of r17 is set (r17 == 16) */ +10: rcall restore_f1 + mov r24, r17 + rcall expand2 + inc r17 + sbrs r17, 4 + rjmp 10b + rcall restore_f1 + movw r24, r2 + + + /* call f2 */ +; pop_range 20, 25 +; push_range 20, 25 +; rcall printQ +; push r20 +; push r21 /* register aliases for f2 follow; note that h0/h1 are re-assigned to r18/r19 here */ +acc2 = 8 +acc3 = 9 +acc0 = 14 +acc1 = 15 +xl0 = 2 +xl1 = 3 +xl2 = 4 +xl3 = 5 +xh0 = 6 +xh1 = 7 +xh2 = 10 +xh3 = 11 +q16_0 = 12 +q16_1 = 13 +h0 = 18 +h1 = 19 +f2: + movw r26, r24 + /* calc XL & XH */ + adiw r26, 63 + adiw r26, 1 + movw q16_0, r26 /* q16_0 = (r24:r25) + 64 bytes */ + movw h0, r20 +;--- +; push h0 +; push h1 +;--- + movw r28, r22 + rcall load_acc_from_X + ldi r17, 15 +10: rcall load32_from_X + rcall eor32_to_acc + cpi r17, 9 + brne 15f /* r17 == 9 is the 7th of 15 XOR steps: xl takes the accumulator at that point */ + movw xl0, acc0 + movw xl2, acc2 +15: + dec r17 + brne 10b + movw xh0, acc0 /* xh takes the accumulator after all 15 XOR steps */ + movw xh2, acc2 +;--- DBG +; push_range 22, 25 +; movw r22, xl0 +; movw r24, xl2 +; rcall print32 +; movw r22, xh0 +; movw r24, xh2 +; rcall print32 +; pop_range 22, 25 +;--- END DBG + /* copy m(Y) into h */ + movw r26, h0 + ldi r22, 64 +10: + ld r23, Y+ + st X+, r23 + dec r22 + brne 10b +;--- /* calc first half of h0..h15 */ + movw r28, q16_0 + movw r26, h0 + ldi r30, lo8(f2_1_shift_table) + ldi r31, hi8(f2_1_shift_table) + ldi r17, 16 +10: +;--- + movw r22, xh0 + movw r24, xh2 + cpi r17, 9 + brge 15f /* r17 <= 8: no table byte is read; r1 stays 0 so the second shift amount is 0 */ + clr r1 + rjmp 26f +15: lpm r20, Z+ + mov r1, r20 /* r1 is borrowed to keep the table byte; it is cleared again below */ + andi r20, 0x0f /* low nibble: shift amount for xh */ + clt + cpi r17, 16 + breq 20f + cpi r17, 11 + brne 21f +20: set /* T is set only for r17 == 16 and r17 == 11: xh is shifted left and the word from Y right; otherwise the directions are swapped */ +21: brts 25f + rcall shiftright32 + rjmp 26f +25: rcall shiftleft32 +26: rcall mov32_to_acc +;--- + rcall load32_from_Y + mov r20, r1 + clr r1 + swap r20 + andi r20, 0x0f /* high nibble of the table byte: shift amount for the word loaded from Y */ + brts 27f + rcall shiftleft32 + rjmp 28f +27: rcall shiftright32 +28: rcall eor32_to_acc +;--- + rcall load32_from_X + rcall eor32_to_acc + rcall store_acc_to_dec_X + adiw r26, 4 +;--- + dec r17 + brne 10b +;----- + sbiw r28, 4*8 /* Y points to q[24] */ + movw r30, r28 + sbiw r28, 63 + sbiw r28, 33 /* Y points to q[0] */ + movw r26, r28 + ldi r20, 8*4 + /* xor q[24..31] into q[0..7] */ + rcall memxor 
+ /* xor q[23] into q[8] */ + sbiw r30, 9*4 + ldi r20, 4 + rcall memxor + /* xor q[16..22] into q[9..15] */ + sbiw r30, 8*4 + ldi r20, 7*4 + rcall memxor + + movw r26, h0 + ldi r17, 15 + ldi r30, lo8(f2_2_shift_table) + ldi r31, hi8(f2_2_shift_table) /* loop below runs for r17 = 15..0 (brpl); while bit 3 of r17 is set xl is used unshifted, afterwards a table byte gives direction (bit 0: 1 = left, 0 = right) and amount (remaining bits) */ +10: movw r22, xl0 + movw r24, xl2 + sbrc r17, 3 + rjmp 20f + lpm r20, Z+ + lsr r20 + brcs 15f + rcall shiftright32 + rjmp 20f +15: + rcall shiftleft32 +20: + rcall mov32_to_acc + rcall load32_from_Y + rcall eor32_to_acc + rcall add_acc_to_X + dec r17 + brpl 10b +;----- + sbiw r26, 8*4 /* X points to h8 */ + movw r28, r26 + sbiw r28, 4*4 /* Y points to h4 */ + ldi r17, 8 + ldi r18, 9 /* 8 passes; r18 = 9..16 is the rotate amount handed to rotateleft32 in r20 */ +10: + rcall load32_from_Y + mov r20, r18 + rcall rotateleft32 + rcall mov32_to_acc + rcall add_acc_to_X + inc r18 + cpi r17, 5 + brne 20f + sbiw r28, 8*4 /* in the pass with r17 == 5 Y is moved back by 8 words */ +20: dec r17 + brne 10b + +exit: +;--- DBG +; pop r25 +; pop r24 +; ldi r22, 'H' +; rcall printX +;--- END DBG + stack_free_large3 32*4+4 + pop_range 10, 17 +pop9: /* pop9, pop28, pop7 and pop5 are shared epilogue entry points: each restores the registers from there on and returns */ + pop_range 8, 9 +pop28: + pop_range 28, 29 +pop7: + pop_range 6, 7 +pop5: + pop_range 2, 5 + ret + +/******************************************************************************/ +ctx0 = 2 +ctx1 = 3 +blc0 = 4 +blc1 = 5 +len0 = 28 +len1 = 29 +buf0 = 6 +buf1 = 7 + +load32_from_Z_stub: /* Z = ctx0 + 60; loads the four bytes at ctx0 + 64 .. ctx0 + 67 into r21..r24, lowest address into r21 */ + movw r30, ctx0 + adiw r30, 60 + ldd r21, Z+4 + ldd r22, Z+5 + ldd r23, Z+6 + ldd r24, Z+7 + ret + +/******************************************************************************/ +/* + param ctx: r24:r25 + param msg: r22:r23 + param len: r20:r21 +*/ + +.global bmw_small_lastBlock +.global bmw256_lastBlock +bmw_small_lastBlock: +bmw224_lastBlock: +bmw256_lastBlock: +/* while(length_b >= BMW_SMALL_BLOCKSIZE){ + bmw_small_nextBlock(ctx, block); + length_b -= BMW_SMALL_BLOCKSIZE; + block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B; + } +*/ + push_range 2, 7 + push_range 28, 29 + movw ctx0, r24 + movw blc0, r22 + movw len0, r20 +1: /* high byte of the 16-bit length >= 2 means length_b >= 512 bits: consume one 64-byte block */ + cpi len1, hi8(512) + brlo 2f + rcall bmw_small_nextBlock_early + ldi r24, 64 + add blc0, r24 + adc blc1, r1 + subi 
len1, hi8(512) + rjmp 1b +2: +/* struct { + uint8_t buffer[64]; + uint32_t ctr; + } pctx; +*/ + stack_alloc_large 68 + adiw r30, 1 /* Z + 1 is taken as the buffer start; presumably the macro leaves Z one byte below the reserved area - confirm in avr-asm-macros.S */ + movw buf0, r30 +/* memset(pctx.buffer, 0, 64); + memcpy(pctx.buffer, block, (length_b+7)/8); + pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07); +*/ movw r24, len0 + ldi r23, 63 + movw r26, blc0 + lsr r25 + ror r24 + lsr r24 + lsr r24 /* r24 = length_b >> 3 (whole message bytes); length_b < 512 here so it fits in 8 bits */ + breq 301f + sub r23, r24 /* r23 = 63 - r24: zero bytes to write after the padding byte */ + /* copy (#r24) bytes to stack buffer */ +30: ld r20, X+ + st Z+, r20 + dec r24 + brne 30b +301: /* calculate the appended byte */ + clr r20 + mov r21, len0 + ldi r24, 0x80 + andi r21, 0x07 + breq 305f + ld r20, X+ /* partial last byte of the message, only read when length_b & 7 != 0 */ +303: + lsr r24 + dec r21 + brne 303b +305: /* r24 = 0x80 >> (length_b & 7) */ + or r20, r24 + st Z+, r20 + tst r23 + breq 32f +31: st Z+, r1 + dec r23 + brne 31b +32: +/* if(length_b+1>64*8-64){ ; = 64*7-1 = 447 max(length_b)=511 + bmw_small_nextBlock(ctx, pctx.buffer); + memset(pctx.buffer, 0, 64-8); + ctx->counter -= 1; + } +*/ + tst len1 /* len1 != 0 and len0 >= 192 together mean length_b >= 0x1c0 = 448 */ + breq 400f + cpi len0, 192 + brlo 400f + movw blc0, buf0 + rcall bmw_small_nextBlock_early + movw r26, buf0 + ldi r20, 64-8 +350: + st X+, r1 + dec r20 + brne 350b + rcall load32_from_Z_stub + subi r21, 1 + sbc r22, r1 + sbc r23, r1 + sbc r24, r1 + rjmp 410f +/* *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b; + bmw_small_nextBlock(ctx, pctx.buffer); +*/ +400: + rcall load32_from_Z_stub +410: /* r25:r24:r23:r22:r21 = counter << 1 (one lsl/rol step); with r20 as the lowest byte below it this is counter * 512, and length_b is added into r20/r21 */ + clr r25 + ldi r20, 1 + lsl r21 + rcall rol32 + mov r20, len0 + add r21, len1 + adc r22, r1 + adc r23, r1 + adc r24, r1 + adc r25, r1 + movw r26, buf0 + adiw r26, 64-8 + st X+, r20 + st X+, r21 + rcall store32_to_X + st X+, r1 + st X+, r1 + movw blc0, buf0 + rcall bmw_small_nextBlock_early +/* memset(pctx.buffer, 0xaa, 64); + for(i=0; i<16;++i){ + pctx.buffer[i*4] = i+0xa0; + } +*/ + ldi r22, 0xa0 + ldi r23, 0xaa + ldi r24, 0xaa + ldi r25, 0xaa + movw r26, buf0 +500: /* 16 passes: r22 runs 0xa0..0xaf, the loop ends when bit 4 of r22 becomes set (0xb0) */ + rcall store32_to_X + inc r22 + sbrs r22, 4 + rjmp 500b +/* bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h); + memcpy(ctx->h, pctx.buffer, 64); +*/ + movw r24, 
buf0 + movw r22, ctx0 + rcall bmw_small_nextBlock + ldi r18, 64 /* copy the 64 buffer bytes back to the context (the memcpy of the C comment above) */ + movw r26, ctx0 + movw r30, buf0 +600: + ld r20, Z+ + st X+, r20 + dec r18 + brne 600b + + stack_free_large 68 + rjmp pop28 /* shared epilogue: restores r28:r29, r6:r7, r2..r5 and returns */ + + +/******************************************************************************* +* void bmw256_ctx2hash(void* dest, const bmw256_ctx_t* ctx){ +* memcpy(dest, &(ctx->h[8]), 256/8); +* } +* +* param dest: r24:r25 +* param ctx: r22:r23 +*/ +.global bmw256_ctx2hash +bmw256_ctx2hash: /* copies the 32 bytes at ctx + 32 .. ctx + 63 to dest; uses Z, X, r18 and r23 */ + movw r30, r22 + adiw r30, 8*4 + ldi r18, 32 + movw r26, r24 +1: ld r23, Z+ + st X+, r23 + dec r18 + brne 1b + ret + +/******************************************************************************* +* void bmw256(void* dest, const void* msg, uint32_t length_b){ +* bmw_small_ctx_t ctx; +* bmw256_init(&ctx); +* while(length_b>=BMW_SMALL_BLOCKSIZE){ +* bmw_small_nextBlock(&ctx, msg); +* length_b -= BMW_SMALL_BLOCKSIZE; +* msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B; +* } +* bmw_small_lastBlock(&ctx, msg, length_b); +* bmw256_ctx2hash(dest, &ctx); +* } +* +* param dest: r24:r25 +* param msg: r22:r23 +* param length_b: r18:r21 +*/ +ctx0 = 2 +ctx1 = 3 +msg0 = 4 +msg1 = 5 +len0 = 6 +len1 = 7 +len2 = 8 +len3 = 9 +dst0 = 10 +dst1 = 11 /* len0/len1 and dst0/dst1 are re-assigned by the second alias list below before any instruction uses them */ + + +/******************************************************************************* +* void bmw224(void* dest, const void* msg, uint32_t length_b){ +* bmw_small_ctx_t ctx; +* bmw224_init(&ctx); +* while(length_b>=BMW_SMALL_BLOCKSIZE){ +* bmw_small_nextBlock(&ctx, msg); +* length_b -= BMW_SMALL_BLOCKSIZE; +* msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B; +* } +* bmw_small_lastBlock(&ctx, msg, length_b); +* bmw224_ctx2hash(dest, &ctx); +* } +* +* param dest: r24:r25 +* param msg: r22:r23 +* param length_b: r18:r21 +*/ +ctx0 = 2 +ctx1 = 3 +msg0 = 4 +msg1 = 5 +len0 = 28 +len1 = 29 +len2 = 8 +len3 = 9 +dst0 = 6 +dst1 = 7 + + +.global bmw256 +bmw256: /* the register aliases in effect here are those of the list directly above (len0 = r28, dst0 = r6) */ + push_range 2, 7 + push_range 28, 29 + push_range 8, 9 + stack_alloc_large 64+4 + adiw r30, 1 /* step to the first reserved byte, as bmw_small_lastBlock does after its stack_alloc_large; assuming the macro leaves Z equal to the new stack pointer (confirm in avr-asm-macros.S), the context would otherwise start at the slot the next rcall writes its return address to */ +10: movw ctx0, r30 + movw dst0, r24 + movw msg0, 
r22 + movw len0, r18 + movw len2, r20 + movw r24, ctx0 + rcall bmw256_init +20: /* while the upper 16 bits of length_b are non-zero: process one 64-byte block and subtract 512 from the 32-bit length */ + mov r18, len2 + or r18, len3 + breq 50f + rcall bmw_small_nextBlock_early + subi len1, 2 + sbc len2, r1 + sbc len3, r1 + ldi r20, 64 + add msg0, r20 + adc msg1, r1 + rjmp 20b +50: /* the remaining 16-bit length goes to bmw_small_lastBlock in r20:r21 */ + movw r24, ctx0 + movw r22, msg0 + movw r20, len0 + rcall bmw_small_lastBlock + movw r24, dst0 + movw r22, ctx0 + rcall bmw256_ctx2hash + stack_free_large 64+4 + rjmp pop9 + +/******************************************************************************/ +.global bmw256_init +bmw256_init: /* param ctx: r24:r25. Writes the byte values 0x40..0x7f over ctx[0..63], each group of four stored with st -X so the lowest value of a group lands at the highest address, then stores r1 (expected to be 0) to ctx[64..67]; uses r20, r22, r23 and X */ + ldi r22, 0x40 + ldi r23, 0x80 + movw r26, r24 + adiw r26, 4 +10: + st -X, r22 + inc r22 + mov r20, r22 + andi r20, 0x3 + brne 10b + adiw r26, 8 /* X went down by 4 in the group; +8 puts it just past the next group */ +20: cp r22, r23 + brne 10b + st -X, r1 + st -X, r1 + st -X, r1 + st -X, r1 + ret + + +/******************************************************************************/ + +#if DEBUG + +printQ: /* debug helper: prints 32 entries of 4 bytes each starting at r24:r25; saves only r20..r25 while also using r8, r9, r16 and r17 */ + push_range 20, 25 + ldi r16, 4 + mov r9, r16 + movw r16, r24 + ldi r24, lo8(qdbg_str) + ldi r25, hi8(qdbg_str) + call cli_putstr_P + clr r8 +10: ldi r24, lo8(qdbg_str1) + ldi r25, hi8(qdbg_str1) + call cli_putstr_P + mov r24, r8 + call cli_hexdump_byte + ldi r24, lo8(qdbg_str2) + ldi r25, hi8(qdbg_str2) + call cli_putstr_P + movw r24, r16 + clr r23 + ldi r22, 4 + call cli_hexdump_rev + add r16, r9 + adc r17, r1 + inc r8 + sbrs r8, 5 /* leaves the loop once r8 reaches 32 */ + rjmp 10b + pop_range 20, 25 + ret +qdbg_str: .asciz "\r\nDBG Q: " +qdbg_str1: .asciz "\r\n Q[" +qdbg_str2: .asciz "] = " + + +printX: /* debug helper: r22 is kept in r6 and printed as a character, r24:r25 is the start address of the 4-byte entries that get dumped */ + push_range 6, 9 + push_range 16, 27 + push_range 30, 31 + ldi r16, 4 + mov r6, r22 + mov r9, r16 + movw r16, r24 + ldi r24, lo8(Xdbg_str) + ldi r25, hi8(Xdbg_str) + call cli_putstr_P + mov r24, r6 + call cli_putc + ldi r24, ':' + call cli_putc + clr r8 +10: ldi r24, lo8(Xdbg_str1) + ldi r25, hi8(Xdbg_str1) + call cli_putstr_P + mov r24, r6 + call cli_putc + ldi r24, '[' + call cli_putc + mov r24, r8 + call cli_hexdump_byte + ldi r24, lo8(Xdbg_str2) + ldi r25, hi8(Xdbg_str2) + call cli_putstr_P + movw r24, r16 + clr r23 + ldi 
r22, 4 + call cli_hexdump_rev + add r16, r9 + adc r17, r1 + inc r8 + sbrs r8, 4 /* leaves the loop once r8 reaches 16 */ + rjmp 10b + pop_range 30, 31 + pop_range 16, 27 + pop_range 6, 9 + ret +Xdbg_str: .asciz "\r\nDBG " +Xdbg_str1: .asciz "\r\n " +Xdbg_str2: .asciz "] = " + +print32: /* debug helper: prints Xdbg_str followed by r25, r24, r23, r22 as hex bytes (highest register first); the value is parked in r6..r9 across the calls */ + push_range 6, 9 + push_range 16, 27 + push_range 30, 31 + movw r6, r22 + movw r8, r24 + ldi r24, lo8(Xdbg_str) + ldi r25, hi8(Xdbg_str) + call cli_putstr_P + mov r24, r9 + call cli_hexdump_byte + mov r24, r8 + call cli_hexdump_byte + mov r24, r7 + call cli_hexdump_byte + mov r24, r6 + call cli_hexdump_byte + pop_range 30, 31 + pop_range 16, 27 + pop_range 6, 9 + ret + + +print_acc: /* debug helper: prints Xdbg_str followed by r9, r8, r15, r14 as hex bytes, i.e. acc3, acc2, acc1, acc0 under the aliases used in this file */ + push_range 16, 27 + push_range 30, 31 + ldi r24, lo8(Xdbg_str) + ldi r25, hi8(Xdbg_str) + call cli_putstr_P + mov r24, r9 + call cli_hexdump_byte + mov r24, r8 + call cli_hexdump_byte + mov r24, r15 + call cli_hexdump_byte + mov r24, r14 + call cli_hexdump_byte + pop_range 30, 31 + pop_range 16, 27 + ret + +#endif + diff --git a/bmw/bmw_small-tinyasm.S b/bmw/bmw_small-tinyasm.S index f3da544..2775ce5 100644 --- a/bmw/bmw_small-tinyasm.S +++ b/bmw/bmw_small-tinyasm.S @@ -1138,7 +1138,6 @@ dst1 = 7 .global bmw224 bmw224: clt - rjmp bmw_small_all bmw_small_all: diff --git a/mkfiles/bmw_tiny_sep.mk b/mkfiles/bmw_tiny_sep.mk new file mode 100644 index 0000000..a4d83ee --- /dev/null +++ b/mkfiles/bmw_tiny_sep.mk @@ -0,0 +1,12 @@ +# Makefile for BlueMidnightWish +ALGO_NAME := BMW_TINY_SEPERATE + +# comment out the following line for removement of BlueMidnightWish from the build process +HASHES += $(ALGO_NAME) + +$(ALGO_NAME)_DIR := bmw/ +$(ALGO_NAME)_OBJ := bmw_256-tinyasm.o bmw_224-tinyasm.o bmw_large.o +$(ALGO_NAME)_TEST_BIN := main-bmw-test.o hfal_bmw_small.o hfal_bmw_large.o $(CLI_STD) $(HFAL_STD) +$(ALGO_NAME)_NESSIE_TEST := test nessie +$(ALGO_NAME)_PERFORMANCE_TEST := performance +