]> git.cryptolib.org Git - avr-crypto-lib.git/commitdiff
BMW224/256 now below 2KiB\!
authorbg <bg@b1d182e4-1ff8-0310-901f-bddb46175740>
Sun, 4 Apr 2010 22:52:01 +0000 (22:52 +0000)
committerbg <bg@b1d182e4-1ff8-0310-901f-bddb46175740>
Sun, 4 Apr 2010 22:52:01 +0000 (22:52 +0000)
bmw/bmw_small-tinyasm.S [new file with mode: 0644]
hfal-performance.c
mkfiles/bmw_tiny.mk [new file with mode: 0644]

diff --git a/bmw/bmw_small-tinyasm.S b/bmw/bmw_small-tinyasm.S
new file mode 100644 (file)
index 0000000..764f281
--- /dev/null
@@ -0,0 +1,1439 @@
+/* bmw_small-tinyasm.S */
+/*
+    This file is part of the AVR-Crypto-Lib.
+    Copyright (C) 2009  Daniel Otte (daniel.otte@rub.de)
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ * File:        bmw_small-tinyasm.S
+ * Author:      Daniel Otte
+ * Date:        2010-03-28
+ * License:     GPLv3 or later
+ * Description: implementation of BlueMidnightWish
+ *
+ */
+
+#include "avr-asm-macros.S"
+
+/******************************************************************************/
+/* shiftleft32: logical left shift of the 32-bit value a by s bits, in place.
+  param a: r22:r23:r24:r25 (r22 = least significant byte); also the result
+  param s: r20 (shift count, counted down to 0); r0 is cleared as a side effect
+*/
+shiftleft32:
+       clr r0                  /* r0 = 0, so the shared loop shifts in zero bits */
+       cpi r20, 8
+       brlo bitrotateleft_1    /* s < 8: finish bit by bit in the rotate loop */
+       mov r25, r24            /* s >= 8: move the value up by one whole byte */
+       mov r24, r23
+       mov r23, r22
+       clr r22
+       subi r20, 8
+       rjmp shiftleft32
+
+/******************************************************************************/
+/* shiftright32: logical right shift of the 32-bit value a by s bits, in place.
+  param a: r22:r23:r24:r25 (r22 = least significant byte); also the result
+  param s: r20 (shift count, counted down to 0)
+*/
+shiftright32:
+       cpi r20, 8
+       brlo bitshiftright      /* fewer than 8 bits left: shift bit by bit */
+       mov r22, r23            /* move the value down by one whole byte */
+       mov r23, r24
+       mov r24, r25
+       clr r25
+       subi r20, 8
+       rjmp shiftright32
+bitshiftright:
+       tst r20
+       breq 20f                /* nothing left to shift */
+10:    lsr r25                 /* one-bit shift through carry, top byte first */
+       ror r24
+       ror r23
+       ror r22
+       dec r20
+       brne 10b
+20: ret
+
+/******************************************************************************/
+/* rotateleft32: rotate the 32-bit value a left by s bits, in place.
+  param a: r22:r23:r24:r25 (r22 = least significant byte); also the result
+  param s: r20 (rotate count, counted down to 0); r0 is used as scratch
+*/
+rotateleft32:
+       cpi r20, 8
+       brlo bitrotateleft      /* fewer than 8 bits left: rotate bit by bit */
+       mov r0, r25             /* rotate by one whole byte: top byte wraps to r22 */
+       mov r25, r24
+       mov r24, r23
+       mov r23, r22
+       mov r22, r0
+       subi r20, 8
+       rjmp rotateleft32
+bitrotateleft:
+    mov r0, r25                /* r0 = copy of the top byte; its bits re-enter at bit 0 */
+bitrotateleft_1:               /* shiftleft32 joins here with r0 = 0 */
+       tst r20
+       breq 20f
+10:
+       lsl r0                  /* carry = next bit to feed into bit 0 of r22 */
+       rol r22
+       rol r23
+       rol r24
+       rol r25
+       dec r20
+       brne 10b
+20: ret
+
+
+/******************************************************************************/
+
+s_table:                /* flash table read by sn: one 4-byte row per function */
+s0:  .byte 1, 3, 4,19   /* columns: shift right, shift left, rotate left, rotate left */
+s1:  .byte 1, 2, 8,23
+s2:  .byte 2, 1,12,25
+s3:  .byte 2, 2,15,29
+s4:  .byte 1, 0, 0, 0   /* the three zero-count terms cancel to x: (x >> 1) ^ x */
+s5:  .byte 2, 0, 0, 0   /* likewise (x >> 2) ^ x */
+
+eor_r22_in_r16:         /* 32-bit XOR: r16..r19 ^= r22..r25 */
+       eor r16, r22
+       eor r17, r23
+       eor r18, r24
+       eor r19, r25
+       ret
+
+/* sn: x = (x >> t0) ^ (x << t1) ^ rotl(x, t2) ^ rotl(x, t3), t = row s of s_table
+  param x: r22:r23:r24:r25 (also the result); Z is clobbered
+  param s: r20 (row index 0..5); r1 is expected to be 0 (carry propagation only)
+*/
+sn:
+       push_range 12, 20       /* NOTE(review): macro not visible here; presumably saves r12..r20 */
+       ldi r30, lo8(s_table)
+       ldi r31, hi8(s_table)
+       lsl r20
+       lsl r20                 /* r20 = 4 * s = byte offset of the row */
+       add r30, r20
+       adc r31, r1
+       movw r12, r22           /* r12..r15 keep the unmodified x for all four terms */
+       movw r14, r24
+       lpm r20, Z+
+       rcall shiftright32
+       movw r16, r22           /* r16..r19 = running XOR, starts as x >> t0 */
+       movw r18, r24
+;---
+       movw r22, r12
+       movw r24, r14
+       lpm r20, Z+
+       rcall shiftleft32
+       rcall eor_r22_in_r16
+;---
+       movw r22, r12
+       movw r24, r14
+       lpm r20, Z+
+       rcall rotateleft32
+       rcall eor_r22_in_r16
+;---
+       movw r22, r12
+       movw r24, r14
+       lpm r20, Z+
+       rcall rotateleft32
+       eor r22, r16            /* last term is folded straight into the result registers */
+       eor r23, r17
+       eor r24, r18
+       eor r25, r19
+       pop_range 12, 20
+       ret
+
+/******************************************************************************/
+/* memxor_short: dest[i] ^= src[i] for len bytes; X and Z end len bytes further
+  param dest: r26:r27 (X)
+  param src:  r30:r31 (Z)
+  param len:  r20 (must be non-zero; clobbers r21, r22)
+*/
+memxor_short:
+;      tst r20
+;      breq memxor_exit
+10: ld r21, X                  /* with the guard above disabled, len = 0 would loop 256 times */
+       ld r22, Z+
+       eor r21, r22
+       st X+, r21
+       dec r20
+       brne 10b
+memxor_exit:
+       ret
+
+/******************************************************************************/
+q0 = 2                         /* register numbers used by f0: r2:r3 keep q */
+q1 = 3
+h0 = 4                         /* r4:r5 keep h */
+h1 = 5
+m0 = 6                         /* r6:r7 keep m */
+m1 = 7
+
+add_hx_to_w:                   /* word at Y +/-= word at h + r16; then Y += 4 */
+       movw r26, h0
+       add r26, r16            /* X = h + byte offset in r16 */
+       adc r27, r1
+       ld r22, Y
+       ldd r23, Y+1
+       ldd r24, Y+2
+       ldd r25, Y+3
+       lsl r20                 /* shift the 16-bit mask r21:r20 left by one ... */
+       rol r21
+       brcs 30f                /* ... the bit shifted out selects subtract (1) or add (0) */
+       /* addition */
+       ld r0, X+
+       add r22, r0
+       ld r0, X+
+       adc r23, r0
+       ld r0, X+
+       adc r24, r0
+       ld r0, X+
+       adc r25, r0
+       rjmp 50f
+30: /* subtract */
+       ld r0, X+
+       sub r22, r0
+       ld r0, X+
+       sbc r23, r0
+       ld r0, X+
+       sbc r24, r0
+       ld r0, X+
+       sbc r25, r0
+50:
+       st Y+, r22
+       st Y+, r23
+       st Y+, r24
+       st Y+, r25
+       ret
+
+/******************************************************************************/
+load32_from_X:                 /* r22..r25 = 32-bit word at X (low byte first); X += 4 */
+       ld r22, X+
+       ld r23, X+
+       ld r24, X+
+       ld r25, X+
+       ret
+
+load32_from_Y:                 /* r22..r25 = 32-bit word at Y (low byte first); Y += 4 */
+       ld r22, Y+
+       ld r23, Y+
+       ld r24, Y+
+       ld r25, Y+
+       ret
+/******************************************************************************/
+/* f0 and its lookup tables (the tables are read from flash with lpm)
+  param q:  r28:r29 (Y)
+  param h:  r26:r27 (X)
+  param m:  r30:r31 (Z)
+*/
+
+f0_hacktable:                  /* one 16-bit mask per pass, loaded as r21 (first byte), r20 */
+       .byte 0x03, 0x11        /* add_hx_to_w consumes one bit per word, top bit first: */
+       .byte 0xDD, 0xB3        /* 1 = subtract, 0 = add */
+       .byte 0x2A, 0x79
+       .byte 0x07, 0xAA
+       .byte 0x51, 0xC2
+f0_indextable:                 /* starting byte offset into h for each pass */
+       .byte 5*4,7*4,10*4,13*4,14*4
+;      .byte 0 ; just for alignment
+f0_s_table:                    /* s-function index passed to sn for q[0..14] */
+       .byte 0,1,2,3,4
+       .byte 0,1,2,3,4
+       .byte 0,1,2,3,4
+;      .byte 0
+
+f0:                            /* in: Y = q, X = h, Z = m; writes q[0..15]; r1 must be 0 */
+       movw h0, r26            /* park the three pointers in r4:r5, r2:r3, r6:r7 */
+       movw q0, r28
+       movw m0, r30
+;--- DBG
+;      push_range 22, 25
+;      movw r24, r26
+;      ldi r22, 'H'
+;      rcall printX
+;      pop_range  22, 25
+;--- END DBG
+;--- DBG
+;      push_range 22, 25
+;      movw r24, r30
+;      ldi r22, 'M'
+;      rcall printX
+;      pop_range  22, 25
+;--- END DBG
+       /* xor m into h */
+       ldi r20, 64
+       rcall memxor_short
+       movw r30, m0
+       movw r26, h0
+
+       /* set q to zero */
+       ldi r22, 64
+10:    st Y+, r1
+       dec r22
+       brne 10b
+       movw r28, q0
+       /* calculate W and store it in Q */
+       ldi r19, 5              /* r19 = pass counter 5..1, also selects the table rows */
+30:
+       ldi r18, 16             /* r18 = words left in this pass */
+       /* load initial index */
+       ldi r30, lo8(f0_indextable-1)
+       ldi r31, hi8(f0_indextable-1)
+       add r30, r19
+       adc r31, r1
+       lpm r16, Z              /* r16 = byte offset into h of the first operand */
+       /* load values from hacktable */
+       ldi r30, lo8(f0_hacktable-2)
+       ldi r31, hi8(f0_hacktable-2)
+       lsl r19                 /* rows are two bytes wide: index with 2 * r19 ... */
+       add r30, r19
+       adc r31, r1
+       lsr r19                 /* ... then restore the counter */
+       lpm r21, Z+
+       lpm r20, Z              /* r21:r20 = add/subtract mask for this pass */
+40:
+       rcall add_hx_to_w       /* rcall like every other local call; target is in range */
+       subi r16, -4
+       andi r16, 0x0f<<2       /* advance to the next h word, wrapping after h[15] */
+       dec r18
+       brne 40b
+       movw r28, q0            /* rewind Y to q[0] for the next pass */
+       dec r19
+       brne 30b
+       movw r26, h0
+;--- DBG
+;      push_range 22, 25
+;      movw r24, r28
+;      ldi r22, 'W'
+;      rcall printX
+;      pop_range  22, 25
+;--- END DBG
+       /* xor m into h */
+       ldi r20, 64
+       movw r26, h0
+       movw r30, m0
+       rcall memxor_short      /* second xor with m restores the original h */
+       sbiw r26, 60            /* X was advanced by 64, so X = h + 4 = &h[1] */
+;---
+       ldi r30, lo8(f0_s_table)
+       ldi r31, hi8(f0_s_table)
+       ldi r21, 15
+       mov r8, r21             /* r8 = loop counter for q[0..14] */
+50:
+       ldd r22, Y+0
+       ldd r23, Y+1
+       ldd r24, Y+2
+       ldd r25, Y+3
+       lpm r20, Z+
+       movw r2, r30            /* sn clobbers Z; r2:r3 (q0) is free by now */
+       rcall sn
+       movw r30, r2
+
+       ld r0, X+               /* q[i] = sn(W[i]) + h[i+1] */
+       add r22, r0
+       ld r0, X+
+       adc r23, r0
+       ld r0, X+
+       adc r24, r0
+       ld r0, X+
+       adc r25, r0
+
+       st Y+, r22
+       st Y+, r23
+       st Y+, r24
+       st Y+, r25
+       dec r8
+       brne 50b
+;---
+       ldd r22, Y+0            /* last word: q[15] = s0(W[15]) + h[0] */
+       ldd r23, Y+1
+       ldd r24, Y+2
+       ldd r25, Y+3
+       clr r20
+       rcall sn
+       movw r30, r2            /* Z is not read again inside f0 */
+       movw r26, h0
+       ld r0, X+
+       add r22, r0
+       ld r0, X+
+       adc r23, r0
+       ld r0, X+
+       adc r24, r0
+       ld r0, X+
+       adc r25, r0
+       sbiw r26, 4             /* leave X pointing at h again */
+       st Y+, r22
+       st Y+, r23
+       st Y+, r24
+       st Y+, r25
+       ret
+
+/******************************************************************************/
+
+const_lut:                     /* 16 flash dwords read by addelement; entry k = 0x55555550 + k * 0x05555555 */
+       .long 0x55555550, 0x5aaaaaa5, 0x5ffffffa, 0x6555554f
+       .long 0x6aaaaaa4, 0x6ffffff9, 0x7555554e, 0x7aaaaaa3
+       .long 0x7ffffff8, 0x8555554d, 0x8aaaaaa2, 0x8ffffff7
+       .long 0x9555554c, 0x9aaaaaa1, 0x9ffffff6, 0xa555554b
+
+/*******************************************************************************
+* uint32_t addelement(uint8_t j, const uint32_t* m, const uint32_t* h){
+*      uint32_t r;
+*      r  = pgm_read_dword(k_lut+j);
+*      r += rotl_addel(((uint32_t*)m)[j&0xf], j+0);
+*      r += rotl_addel(((uint32_t*)m)[(j+3)&0xf], j+3);
+*      r -= rotl_addel(((uint32_t*)m)[(j+10)&0xf], j+10);
+*      r ^= ((uint32_t*)h)[(j+7)&0xf];
+*      return r;
+* }
+* param j: r24
+* param m: r22:r23
+* param h: r20:r21
+*/
+j    = 16                      /* register numbers: r16 holds the element index j */
+acc2 =  8                      /* 32-bit accumulator is r14 (low), r15, r8, r9 (high) */
+acc3 =  9
+h0   = 10                      /* h0/h1 and m0/m1 are redefined here: r10:r11 = h */
+h1   = 11
+m0   = 12                      /* r12:r13 = m */
+m1   = 13
+acc0 = 14
+acc1 = 15
+
+add32_to_acc:                  /* acc += r22..r25 (32-bit add with carry chain) */
+       add acc0, r22
+       adc acc1, r23
+       adc acc2, r24
+       adc acc3, r25
+       ret
+
+eor32_to_acc:                  /* acc ^= r22..r25 (32-bit XOR) */
+       eor acc0, r22
+       eor acc1, r23
+       eor acc2, r24
+       eor acc3, r25
+       ret
+
+load_acc_from_X:               /* acc = 32-bit word at X (low byte first); X += 4 */
+       ld acc0, X+
+       ld acc1, X+
+       ld acc2, X+
+       ld acc3, X+
+       ret
+
+add_acc_to_Z:                  /* 32-bit word at Z += acc; Z += 4; r0 is scratch */
+       ld r0, Z
+       add r0, acc0
+       st Z+, r0               /* ld/st leave the carry untouched for the next adc */
+       ld r0, Z
+       adc r0, acc1
+       st Z+, r0
+       ld r0, Z
+       adc r0, acc2
+       st Z+, r0
+       ld r0, Z
+       adc r0, acc3
+       st Z+, r0
+       ret
+
+load_rotate_add_M:             /* acc +/-= rotl(m[r20 & 15], (r20 & 15) + 1); T set = subtract */
+       andi r20, 0x0f
+       mov r0, r20
+       lsl r0
+       lsl r0                  /* r0 = byte offset of the selected word */
+       movw r26, m0
+       add r26, r0
+       adc r27, r1
+       ld r22, X+
+       ld r23, X+
+       ld r24, X+
+       ld r25, X+
+       inc r20                 /* rotate count is index + 1 */
+       rcall rotateleft32
+       brts 10f
+       rcall add32_to_acc
+       ret
+10:    sub acc0, r22
+       sbc acc1, r23
+       sbc acc2, r24
+       sbc acc3, r25
+       ret
+
+addelement:                    /* in: r24 = j, r22:r23 = m, r20:r21 = h; result in acc */
+       mov j, r24
+       movw h0, r20
+       movw m0, r22
+       lsl r24
+       lsl r24                 /* r24 = 4 * j = byte offset into const_lut */
+       mov r28, r24            /* r28 is not read again inside this routine */
+       ldi r30, lo8(const_lut)
+       ldi r31, hi8(const_lut)
+       add r30, r24
+       adc r31, r1
+       lpm acc0, Z+            /* acc = const_lut[j] */
+       lpm acc1, Z+
+       lpm acc2, Z+
+       lpm acc3, Z+
+       clt                     /* T clear: the next two terms are added */
+       mov r20, j
+       rcall load_rotate_add_M
+       mov r20, j
+       subi r20, -3
+       rcall load_rotate_add_M
+       mov r20, j
+       set                     /* T set: the third term is subtracted */
+       subi r20, -10
+       rcall load_rotate_add_M
+       lsl j
+       lsl j
+       subi j, -7*4
+       andi j, 0x3f            /* j = byte offset of h[(j + 7) & 15] */
+       movw r26, h0
+       add r26, j
+       adc r27, r1
+       ld r0, X+               /* acc ^= h[(j + 7) & 15] */
+       eor acc0, r0
+       ld r0, X+
+       eor acc1, r0
+       ld r0, X+
+       eor acc2, r0
+       ld r0, X+
+       eor acc3, r0
+;---
+       ret
+
+/******************************************************************************/
+/* expand_intro: shared start of expand1/expand2; acc = addelement(j, m, h), X = q + 4*j
+  param q: r26:r27
+  param m: r22:r23
+  param h: r20:r21
+  param j: r24
+*/
+
+expand_intro:
+       push_range 20, 27       /* NOTE(review): macro not visible; presumably keeps r20..r27 across the call */
+;      push r24
+       rcall addelement
+;      pop r24
+       pop_range 20, 27
+       lsl r24
+       lsl r24                 /* r24 = 4 * j */
+       add r26, r24            /* X = &q[j], the oldest of the 16 words to combine */
+       adc r27, r1
+       ret
+expand1:                       /* q[j+16] = addelement + sum over i = 1..16 of s(i & 3) applied to q[j+i-1] */
+       rcall expand_intro
+       ldi r19, 1              /* r19 = i; sn preserves it (saved via push_range there) */
+10:
+       rcall load32_from_X
+       mov r20, r19
+       andi r20, 3             /* s-function index cycles 1, 2, 3, 0 */
+       rcall sn
+       rcall add32_to_acc
+       inc r19
+       cpi r19, 17
+       brne 10b
+expand1_exit:                  /* shared tail: expand2 jumps here as well */
+;      adiw r26, 63
+       st X+, acc0             /* X has advanced over 16 words, so this writes q[j+16] */
+       st X+, acc1
+       st X+, acc2
+       st X+, acc3
+       ret
+
+/******************************************************************************/
+/* expand2: like expand1, but 14 words are rotated per the table, then s4 and s5 are used
+  param q: r26:r27
+  param m: r22:r23
+  param h: r20:r21
+  param j: r24
+*/
+
+expand2_rot_table:             /* left-rotate count for each of the first 14 words (0 = as is) */
+       .byte 0,3,0,7,0,13,0,16,0,19,0,23,0,27
+
+expand2:
+       rcall expand_intro
+       ldi r19, 14             /* r19 = table entries left */
+       ldi r30, lo8(expand2_rot_table)
+       ldi r31, hi8(expand2_rot_table)
+10:
+       rcall load32_from_X
+       mov r20, r19            /* overwritten by the lpm on the next line */
+       lpm r20, Z+
+       rcall rotateleft32
+       rcall add32_to_acc
+       dec r19
+       brne 10b
+       rcall load32_from_X     /* 15th word goes through s4 */
+       ldi r20, 4
+       rcall sn
+       rcall add32_to_acc
+       rcall load32_from_X     /* 16th word goes through s5 */
+       ldi r20, 5
+       rcall sn
+       rcall add32_to_acc
+
+       rjmp expand1_exit       /* store acc to q[j+16] and return from there */
+
+/******************************************************************************/
+/* f1: fills q[16..31] by calling expand1 for j = 0, 1 and expand2 for j = 2..15
+  param q: r24:r25
+  param m: r22:r23
+  param h: r20:r21
+*/
+/* for calling expand1/2
+  param q: r26:r27
+  param m: r22:r23
+  param h: r20:r21
+  param j: r24
+*/
+f1:
+       movw r2, r24            /* keep q, m, h in r2:r3, r4:r5, r6:r7 across the calls */
+       movw r4, r22
+       movw r6, r20
+       movw r26, r2
+;      movw r22, r4
+;   movw r20, r6
+       clr r24                 /* j = 0; m and h are still in r22:r23 / r20:r21 */
+       rcall expand1
+       movw r26, r2
+       movw r22, r4
+    movw r20, r6
+       ldi r24, 1
+       rcall expand1
+       ldi r17, 2              /* r17 = j for the expand2 rounds */
+10:    movw r26, r2
+       movw r22, r4
+       movw r20, r6
+       mov r24, r17
+       rcall expand2
+       inc r17
+       sbrs r17, 4             /* stop once r17 reaches 16 (bit 4 set) */
+       rjmp 10b
+       ret
+
+/******************************************************************************/
+/* f2 and its tables; f2 computes the new h[0..15] from q[0..31] and m
+  param q: r24:r25
+  param m: r22:r23
+  param h: r20:r21
+*/
+f2_1_shift_table:              /* indexed with r17-1, so the last byte belongs to h[0] */
+       .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+       .byte 0x2B, 0x64, 0x66, 0x03, 0x51, 0x55, 0x87, 0x55   /* low nibble: XH shift, high nibble: q shift */
+f2_2_shift_table:              /* bits 7..1: XL shift count, bit 0: 1 = shift left, 0 = shift right */
+       .byte (2<<1), (7<<1), (4<<1), (3<<1), (4<<1)+1, (6<<1)+1, (6<<1)
+       .byte 0 ; just for alignment
+acc2  =  8                     /* register numbers: accumulator as in addelement */
+acc3  =  9
+acc0  = 14
+acc1  = 15
+xl0   =  2                     /* XL lives in r2..r5 */
+xl1   =  3
+xl2   =  4
+xl3   =  5
+xh0   =  6                     /* XH lives in r6, r7, r10, r11 */
+xh1   =  7
+xh2   = 10
+xh3   = 11
+q16_0 = 12                     /* r12:r13 = &q[16] */
+q16_1 = 13
+h0   =  18                     /* h0/h1 redefined again: r18:r19 = running pointer into h */
+h1   =  19
+f2:                            /* in: r24:r25 = q, r22:r23 = m, r20:r21 = h; rewrites h[0..15] */
+       movw r26, r24
+       /* calc XL */
+       adiw r26, 63
+       adiw r26,  1            /* X = q + 64 bytes = &q[16] */
+       movw q16_0, r26
+       clr xl0
+       clr xl1
+       clr xl2
+       clr xl3
+       ldi r17, 8
+10:    ld r0, X+               /* XL = q[16] ^ ... ^ q[23] */
+       eor xl0, r0
+       ld r0, X+
+       eor xl1, r0
+       ld r0, X+
+       eor xl2, r0
+       ld r0, X+
+       eor xl3, r0
+       dec r17
+       brne 10b
+;--- /* calc XH */
+       movw xh0, xl0
+       movw xh2, xl2
+       ldi r17, 8
+10:    ld r0, X+               /* XH = XL ^ q[24] ^ ... ^ q[31] */
+       eor xh0, r0
+       ld r0, X+
+       eor xh1, r0
+       ld r0, X+
+       eor xh2, r0
+       ld r0, X+
+       eor xh3, r0
+       dec r17
+       brne 10b
+;--- DBG
+;      push_range 22, 25
+;      movw r22, xl0
+;      movw r24, xl2
+;      rcall print32
+;      movw r22, xh0
+;      movw r24, xh2
+;      rcall print32
+;      pop_range 22, 25
+;--- END DBG
+
+;--- /* calc first half of h0..h15 */
+       movw h0, r20            /* r18:r19 = h */
+       movw r28, r22           /* Y = m */
+       movw r26, q16_0         /* X = &q[16] */
+       ldi r17, 16             /* r17 counts 16..1; word index i = 16 - r17 */
+10:
+       ld acc0, Y+             /* acc = m[i] */
+       ld acc1, Y+
+       ld acc2, Y+
+       ld acc3, Y+
+;---
+       ldi r30, lo8(f2_1_shift_table-1)
+       ldi r31, hi8(f2_1_shift_table-1)
+       movw r22, xh0
+       movw r24, xh2
+       add r30, r17
+       adc r31, r1
+       lpm r20, Z
+       mov r1, r20             /* r1 borrowed as scratch; cleared again below */
+       andi r20, 0x0f          /* low nibble = shift count for XH */
+       clt
+       cpi r17, 16
+       breq 20f
+       cpi r17, 11
+       brne 21f
+20:    set                     /* T set for i = 0 and i = 5: XH goes left, q goes right */
+21:    brts 25f
+       rcall shiftright32
+       rjmp 26f
+25:    rcall shiftleft32
+26: rcall eor32_to_acc
+;---
+       rcall load32_from_X     /* q[16 + i] */
+       mov r20, r1
+       clr r1                  /* restore the zero register */
+       swap r20
+       andi r20, 0x0f          /* high nibble = shift count for q[16 + i] */
+       brts 27f
+       rcall shiftleft32
+       rjmp 28f
+27:    rcall shiftright32
+28:    rcall eor32_to_acc
+;---
+       movw r30, h0
+       st Z+, acc0             /* h[i] = first partial value */
+       st Z+, acc1
+       st Z+, acc2
+       st Z+, acc3
+       movw h0, r30
+;---
+       dec r17
+       brne 10b
+;-----
+       sbiw r26, 4*8 /* X points to q[24] */
+       movw r28, r26
+       sbiw r28, 63
+       sbiw r28, 33 /* Y points to q[0] */
+       sbiw r30, 63
+       sbiw r30,  1 /* Z points to h0 */
+       ldi r17, 8
+10:    movw acc0, xl0          /* h[i] += XL ^ q[24 + i] ^ q[i] for i = 0..7 */
+       movw acc2, xl2
+       rcall load32_from_X
+       rcall eor32_to_acc
+       rcall load32_from_Y
+       rcall eor32_to_acc
+       rcall add_acc_to_Z
+       dec r17
+       brne 10b
+       sbiw r26, 9*4 /* X points to q[23] */
+       rcall load_acc_from_X
+       eor acc1, xl0           /* acc ^= XL << 8 (byte-wise, low byte untouched) */
+       eor acc2, xl1
+       eor acc3, xl2
+       rcall load32_from_Y     /* q[8] */
+       rcall eor32_to_acc
+       rcall add_acc_to_Z      /* h[8] += (XL << 8) ^ q[23] ^ q[8] */
+;---
+       sbiw r26, 8*4 /* X points to q[16] */
+       movw h0, r30            /* save the whole pointer &h[9]: it is reloaded with movw below */
+       ldi r17, 7              /* h[9..15], table read from its last entry backwards */
+10:
+       ldi r30, lo8(f2_2_shift_table-1)
+       ldi r31, hi8(f2_2_shift_table-1)
+       add r30, r17
+       adc r31, r1
+       lpm r20, Z
+       rcall load_acc_from_X
+       movw r22, xl0
+       movw r24, xl2
+       lsr r20                 /* carry = direction bit, r20 = shift count */
+       brcc 20f
+       rcall shiftleft32
+       rjmp 21f
+20:    rcall shiftright32
+21:
+       rcall eor32_to_acc
+       rcall load32_from_Y
+       rcall eor32_to_acc
+       movw r30, h0
+       rcall add_acc_to_Z
+       movw h0, r30
+       dec r17
+       brne 10b
+;-----
+       sbiw r30, 8*4 /* Z points to h8 */
+       movw r26, r30
+       sbiw r26, 4*4 /* X points to h4 */
+       ldi r17, 8
+       ldi r18, 9              /* r18 = rotate count 9..16 */
+10:
+       rcall load32_from_X     /* h[8 + k] += rotl(h[(4 + k) & 7], 9 + k) */
+       mov r20, r18
+       rcall rotateleft32
+       movw acc0, r22
+       movw acc2, r24
+       rcall add_acc_to_Z
+       inc r18
+       cpi r17, 5
+       breq 20f
+       dec r17
+       brne 10b
+       ret
+20: sbiw r26, 8*4              /* after h[7], wrap the source pointer back to h[0] */
+       dec r17
+       rjmp 10b
+
+/******************************************************************************/
+/* bmw_small_nextBlock: process one 64-byte block; ctx is h[0..15] plus a 32-bit counter at +64
+  param ctx:  r24:r25
+  param msg:  r22:r23
+*/
+/* f0
+  param q:  r28:r29 (Y)
+  param h:  r26:r27 (X)
+  param m:  r30:r31 (Z)
+*/
+/* f1
+  param q: r24:r25
+  param m: r22:r23
+  param h: r20:r21
+*/
+/* f2
+  param q: r24:r25
+  param m: r22:r23
+  param h: r20:r21
+*/
+.global bmw_small_nextBlock
+.global bmw224_nextBlock
+.global bmw256_nextBlock
+bmw_small_nextBlock:
+bmw224_nextBlock:
+bmw256_nextBlock:
+       push_range 28, 29
+       push_range  2, 17
+       stack_alloc_large 32*4, r28, r29   /* room for q[0..31]; NOTE(review): macro not visible here */
+       adiw r28, 1
+       push_range 28, 29 /* push Q */
+       push_range 22, 25 /* push M & H */
+       /* increment counter */
+       movw r26, r24
+       movw r2, r26
+       adiw r26, 63
+       adiw r26,  1            /* X = ctx + 64 = address of the counter */
+       rcall load_acc_from_X   /* acc0 = byte at ctx+64 (lowest); X ends at ctx + 68 */
+       ldi r19, 1
+       add acc0, r19
+       adc acc1, r1
+       adc acc2, r1
+       adc acc3, r1
+       st -X, acc3             /* X walks down from ctx+68, so store the top byte first */
+       st -X, acc2
+       st -X, acc1
+       st -X, acc0             /* low byte lands at ctx+64, where it was loaded from */
+       /* call f0 */
+       movw r30, r22
+       movw r26, r24
+       rcall f0
+       /* call f1*/
+       pop r21                 /* stack holds q, m, ctx: pop them as h = ctx, m, q */
+       pop r20
+       pop r23
+       pop r22
+       pop r25
+       pop r24
+;      rcall printQ
+       push_range 20, 25
+       rcall f1
+       /* call f2 */
+;      pop_range 20, 25
+;      push_range 20, 25
+;      rcall printQ
+       pop_range 20, 25
+;      push r20
+;      push r21
+       rcall f2                /* rcall like every other local call; f2 sits directly above */
+;--- DBG
+;      pop r25
+;      pop r24
+;      ldi r22, 'H'
+;      rcall printX
+;--- END DBG
+       stack_free_large3 32*4
+       pop_range  2, 17
+       pop_range 28, 29
+       ret
+
+/******************************************************************************/
+/* bmw_small_lastBlock: hash the remaining len bits, pad, and finalize the context
+  param ctx:  r24:r25
+  param msg:  r22:r23
+  param len:  r20:r21 (length in bits)
+*/
+ctx0 =  2                      /* register numbers: r2:r3 = ctx */
+ctx1 =  3
+blc0 =  4                      /* r4:r5 = current message pointer */
+blc1 =  5
+len0 = 28                      /* r28:r29 = remaining length in bits */
+len1 = 29
+buf0 =  6                      /* r6:r7 = padding buffer on the stack */
+buf1 =  7
+
+.global bmw_small_lastBlock
+.global bmw224_lastBlock
+.global bmw256_lastBlock
+bmw_small_lastBlock:
+bmw224_lastBlock:
+bmw256_lastBlock:
+/*     while(length_b >= BMW_SMALL_BLOCKSIZE){
+               bmw_small_nextBlock(ctx, block);
+               length_b -= BMW_SMALL_BLOCKSIZE;
+               block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
+       }
+*/
+       push_range 2, 7
+       push_range 28, 29
+       movw ctx0, r24
+       movw blc0, r22
+       movw len0, r20
+1:
+       cpi len1, hi8(512)      /* length >= 512 exactly when the high byte is >= 2 */
+       brlo 2f
+       movw r24, ctx0
+       movw r22, blc0
+       rcall bmw_small_nextBlock
+       ldi r24, 64
+       add blc0, r24
+       adc blc1, r1
+       subi len1, hi8(512)
+       rjmp 1b
+2:
+/*     struct {
+               uint8_t  buffer[64];
+               uint32_t ctr;
+       } pctx;
+*/
+       stack_alloc_large 68    /* NOTE(review): macro not visible; the code below expects the result in Z */
+       adiw r30, 1
+       movw buf0, r30
+/*     memset(pctx.buffer, 0, 64);
+       memcpy(pctx.buffer, block, (length_b+7)/8);
+       pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
+*/     movw r24, len0
+       lsr r25
+       ror r24
+       lsr r24
+       lsr r24                 /* r24 = length_b >> 3 = number of complete bytes (0..63) */
+       ldi r23, 63
+       sub r23, r24            /* r23 = zero bytes needed after the marker byte */
+       movw r26, blc0
+       tst r24
+       breq 301f
+       /* copy (#r24) bytes to stack buffer */
+30: ld r20, X+
+       st Z+, r20
+       dec r24
+       brne 30b
+301: /* calculate the appended byte */
+       clr r20
+       mov r21, len0
+       ldi r24, 0x80
+       andi r21, 0x07          /* r21 = number of leftover bits */
+       breq 305f
+       ld r20, X+              /* partial byte: keep its bits, add the marker after them */
+303:
+       lsr r24
+       dec r21
+       brne 303b
+305:
+       or r20, r24
+       st Z+, r20
+       tst r23
+       breq 32f
+31: st Z+, r1                  /* zero-fill the rest of the 64-byte buffer */
+       dec r23
+       brne 31b
+32:
+/*     if(length_b+1>64*8-64){ ; = 64*7-1 = 447 max(length_b)=511
+               bmw_small_nextBlock(ctx, pctx.buffer);
+               memset(pctx.buffer, 0, 64-8);
+               ctx->counter -= 1;
+       }
+*/
+       tst len1
+       breq 400f
+       cpi len0, 192           /* high byte 1 and low byte >= 0xC0 means length_b >= 448 */
+       brlo 400f
+       movw r24, ctx0
+       movw r22, buf0
+       rcall bmw_small_nextBlock
+       movw r26, buf0
+       ldi r20, 64-8
+350:
+       st X+, r1
+       dec r20
+       brne 350b
+       movw r30, ctx0
+       adiw r30, 60
+       ldd r21, Z+4            /* r21..r24 = counter at ctx+64, low byte first */
+       ldd r22, Z+5
+       ldd r23, Z+6
+       ldd r24, Z+7
+       subi r21, 1             /* the extra block must not count; only the local copy is adjusted */
+       sbc r22, r1
+       sbc r23, r1
+       sbc r24, r1
+       rjmp 410f
+/*     *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
+       bmw_small_nextBlock(ctx, pctx.buffer);
+*/
+400:
+       movw r30, ctx0
+       adiw r30, 60
+       ldd r21, Z+4
+       ldd r22, Z+5
+       ldd r23, Z+6
+       ldd r24, Z+7
+410:
+       clr r25
+       lsl r21                 /* counter * 2 held in bytes 1..5 equals counter * 512 */
+       rol r22
+       rol r23
+       rol r24
+       rol r25
+       mov r20, len0           /* byte 0 of the total is the low byte of length_b */
+       add r21, len1
+       adc r22, r1
+       adc r23, r1
+       adc r24, r1
+       adc r25, r1
+       movw r30, buf0
+       adiw r30, 64-8
+       st Z+, r20              /* 64-bit bit count, low byte first, in the last 8 bytes */
+       st Z+, r21
+       st Z+, r22
+       st Z+, r23
+       st Z+, r24
+       st Z+, r25
+       st Z+, r1
+       st Z+, r1
+       movw r24, ctx0
+       movw r22, buf0
+       rcall bmw_small_nextBlock
+/*     memset(pctx.buffer, 0xaa, 64);
+       for(i=0; i<16;++i){
+               pctx.buffer[i*4] = i+0xa0;
+       }
+*/
+       ldi r18, 0xa0
+       ldi r19, 0xaa
+       movw r26, buf0
+500:
+       st X+, r18
+       st X+, r19
+       st X+, r19
+       st X+, r19
+       inc r18
+       sbrs r18, 4             /* bit 4 becomes set at 0xb0, i.e. after 16 words */
+       rjmp 500b
+/*     bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
+       memcpy(ctx->h, pctx.buffer, 64);
+*/
+       movw r24, buf0          /* roles swapped: the buffer is the context, ctx is the message */
+       movw r22, ctx0
+       rcall bmw_small_nextBlock
+       ldi r18, 64
+       movw r26, ctx0
+       movw r30, buf0
+600:
+       ld r20, Z+
+       st X+, r20
+       dec r18
+       brne 600b
+
+       stack_free_large 68
+       pop_range 28, 29
+       pop_range 2, 7
+       ret
+
+
+/*******************************************************************************
+* void bmw224_ctx2hash(void* dest, const bmw224_ctx_t* ctx){
+*      memcpy(dest, &(ctx->h[9]), 224/8);
+* }
+*
+* param dest:  r24:r25
+* param ctx:   r22:r23
+*/
+.global bmw224_ctx2hash
+bmw224_ctx2hash:
+       movw r26, r24           /* X = dest */
+       movw r30, r22
+       adiw r30, 9*4           /* Z = ctx + 36 bytes, i.e. word 9 */
+       ldi r22, 28             /* 28 bytes = 224 bits */
+       rjmp 1f                 /* continue in the copy loop of bmw256_ctx2hash */
+
+/*******************************************************************************
+* void bmw256_ctx2hash(void* dest, const bmw256_ctx_t* ctx){
+*      memcpy(dest, &(ctx->h[8]), 256/8);
+* }
+*
+* param dest:  r24:r25
+* param ctx:   r22:r23
+*/
+.global bmw256_ctx2hash
+bmw256_ctx2hash:
+       movw r26, r24           /* X = dest */
+       movw r30, r22
+       adiw r30, 8*4           /* Z = ctx + 32 bytes, i.e. word 8 */
+       ldi r22, 32             /* 32 bytes = 256 bits */
+1:                             /* shared copy loop: r22 bytes from Z to X */
+       ld r23, Z+
+       st X+, r23
+       dec r22
+       brne 1b
+       ret
+
+/*******************************************************************************
+* void bmw256(void* dest, const void* msg, uint32_t length_b){
+*      bmw_small_ctx_t ctx;
+*      bmw256_init(&ctx);
+*      while(length_b>=BMW_SMALL_BLOCKSIZE){
+*              bmw_small_nextBlock(&ctx, msg);
+*              length_b -= BMW_SMALL_BLOCKSIZE;
+*              msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B;
+*      }
+*      bmw_small_lastBlock(&ctx, msg, length_b);
+*      bmw256_ctx2hash(dest, &ctx);
+* }
+*
+* param dest:     r24:r25
+* param msg:      r22:r23
+* param length_b: r18:r21
+*/
+ctx0 =   2                     /* register numbers used by bmw_small_all: r2:r3 = ctx */
+ctx1 =   3
+msg0 =   4                     /* r4:r5 = msg */
+msg1 =   5
+len0 =   6                     /* r6..r9 = length_b, low byte first */
+len1 =   7
+len2 =   8
+len3 =   9
+dst0 =  10                     /* r10:r11 = dest */
+dst1 =  11
+.global bmw256
+bmw256:
+       push r16
+       ldi r16, 1              /* r16 = 1 selects the 256-bit entries of init_lut / c2h_lut */
+       rjmp bmw_small_all
+
+/*******************************************************************************
+* void bmw224(void* dest, const void* msg, uint32_t length_b){
+*      bmw_small_ctx_t ctx;
+*      bmw224_init(&ctx);
+*      while(length_b>=BMW_SMALL_BLOCKSIZE){
+*              bmw_small_nextBlock(&ctx, msg);
+*              length_b -= BMW_SMALL_BLOCKSIZE;
+*              msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B;
+*      }
+*      bmw_small_lastBlock(&ctx, msg, length_b);
+*      bmw224_ctx2hash(dest, &ctx);
+* }
+*
+* param dest:     r24:r25
+* param msg:      r22:r23
+* param length_b: r18:r21
+*/
+ctx0 =   2                     /* same register assignment as for bmw256 */
+ctx1 =   3
+msg0 =   4
+msg1 =   5
+len0 =   6
+len1 =   7
+len2 =   8
+len3 =   9
+dst0 =  10
+dst1 =  11
+.global bmw224
+bmw224:
+       push r16
+       clr r16                 /* r16 = 0 selects the 224-bit table entries; falls through */
+
+bmw_small_all:                 /* in: r24:r25 = dest, r22:r23 = msg, r18..r21 = length_b, r16 = variant */
+       push_range 2, 11
+       stack_alloc_large 64+4  /* context on the stack; NOTE(review): macro not visible, result expected in Z */
+       adiw r30, 1
+       movw ctx0, r30
+       movw dst0, r24
+       movw msg0, r22
+       movw len0, r18
+       movw len2, r20
+       movw r24, ctx0
+       ldi r30, pm_lo8(init_lut)
+       ldi r31, pm_hi8(init_lut)
+       add r30, r16            /* each table entry is a single rjmp, so the index is r16 */
+       adc r31, r1
+       icall
+20:
+       mov r18, len2
+       or  r18, len3
+       breq 50f                /* loop only while the length exceeds 16 bits; lastBlock does the rest */
+       movw r24, ctx0
+       movw r22, msg0
+       rcall bmw_small_nextBlock
+       ldi r20, 2
+       sub len1, r20           /* length_b -= 512 */
+       sbc len2, r1
+       sbc len3, r1
+       ldi r20, 64
+       add msg0, r20
+       adc msg1, r1
+       rjmp 20b
+50:
+       movw r24, ctx0
+       movw r22, msg0
+       movw r20, len0
+       rcall bmw_small_lastBlock
+       movw r24, dst0
+       movw r22, ctx0
+       ldi r30, pm_lo8(c2h_lut)
+       ldi r31, pm_hi8(c2h_lut)
+       add r30, r16
+       adc r31, r1
+       icall
+       stack_free_large 64+4
+       pop_range 2, 11
+       pop r16                 /* matches the push in bmw224 / bmw256 */
+       ret
+
+init_lut:                      /* jump table reached with icall: index 0 = 224-bit, 1 = 256-bit */
+       rjmp bmw224_init
+       rjmp bmw256_init
+c2h_lut:                       /* same indexing, for the ctx2hash step */
+       rjmp bmw224_ctx2hash
+       rjmp bmw256_ctx2hash
+
+/*******************************************************************************
+* void bmw224_init(bmw224_ctx_t* ctx){
+*      uint8_t i;
+*      ctx->h[0] = 0x00010203;
+*      for(i=1; i<16; ++i){
+*              ctx->h[i] = ctx->h[i-1]+ 0x04040404;
+*      }
+*      ctx->counter=0;
+* }
+*
+* param ctx:  r24:r25
+*/
+.global bmw224_init
+bmw224_init:
+       movw r26, r24           /* X = ctx */
+       ldi r22, 0x03           /* r22..r25 = 0x00010203, low byte first */
+       ldi r23, 0x02
+       ldi r24, 0x01
+       ldi r25, 0x00
+bmw_small_init:                /* in: X = ctx, r22..r25 = h[0]; bmw256_init jumps here */
+       st X+, r22
+       st X+, r23
+       st X+, r24
+       st X+, r25
+       ldi r18, 16-1           /* 15 more words to generate */
+       ldi r20, 0x04
+1:
+       add r22, r20            /* next word = previous word + 0x04040404 */
+       adc r23, r20
+       adc r24, r20
+       adc r25, r20
+       st X+, r22
+       st X+, r23
+       st X+, r24
+       st X+, r25
+       dec r18
+       brne 1b
+       st X+, r1               /* counter = 0 (four bytes after the 16 words) */
+       st X+, r1
+       st X+, r1
+       st X+, r1
+       ret
+
+.global bmw256_init
+bmw256_init:
+       movw r26, r24           /* X = ctx */
+       ldi r22, 0x43           /* r22..r25 = 0x40414243, low byte first */
+       ldi r23, 0x42
+       ldi r24, 0x41
+       ldi r25, 0x40
+       rjmp bmw_small_init     /* shared fill loop */
+
+
+/******************************************************************************/
+
+#if DEBUG
+
+printQ:                        /* debug only: dump 32 four-byte words starting at r24:r25 */
+       push_range 20, 25       /* r8, r9, r16, r17 are used below without being saved */
+       ldi r16, 4
+       mov r9, r16             /* r9 = 4 = pointer step per word */
+       movw r16, r24           /* r16:r17 = current word pointer */
+       ldi r24, lo8(qdbg_str)
+       ldi r25, hi8(qdbg_str)
+       call cli_putstr_P
+       clr r8                  /* r8 = word index */
+10:    ldi r24, lo8(qdbg_str1)
+       ldi r25, hi8(qdbg_str1)
+       call cli_putstr_P
+       mov r24, r8
+       call cli_hexdump_byte
+       ldi r24, lo8(qdbg_str2)
+       ldi r25, hi8(qdbg_str2)
+       call cli_putstr_P
+       movw r24, r16
+       clr r23
+       ldi r22, 4              /* NOTE(review): presumably a byte count for cli_hexdump_rev; helper not visible */
+       call cli_hexdump_rev
+       add r16, r9
+       adc r17, r1
+       inc r8
+       sbrs r8, 5              /* stop once the index reaches 32 (bit 5 set) */
+       rjmp 10b
+       pop_range 20, 25
+       ret
+qdbg_str:  .asciz "\r\nDBG Q: "
+qdbg_str1: .asciz "\r\n Q["
+qdbg_str2: .asciz "] =  "
+
+
+printX:                        /* debug only: dump 16 words at r24:r25, labelled with the character in r22 */
+       push_range 6, 9
+       push_range 16, 27
+       push_range 30, 31
+       ldi r16, 4
+       mov r6, r22             /* r6 = label character */
+       mov r9, r16             /* r9 = 4 = pointer step per word */
+       movw r16, r24           /* r16:r17 = current word pointer */
+       ldi r24, lo8(Xdbg_str)
+       ldi r25, hi8(Xdbg_str)
+       call cli_putstr_P
+       mov r24, r6
+       call cli_putc
+       ldi r24, ':'
+       call cli_putc
+       clr r8                  /* r8 = word index */
+10:    ldi r24, lo8(Xdbg_str1)
+       ldi r25, hi8(Xdbg_str1)
+       call cli_putstr_P
+       mov r24, r6
+       call cli_putc
+       ldi r24, '['
+       call cli_putc
+       mov r24, r8
+       call cli_hexdump_byte
+       ldi r24, lo8(Xdbg_str2)
+       ldi r25, hi8(Xdbg_str2)
+       call cli_putstr_P
+       movw r24, r16
+       clr r23
+       ldi r22, 4              /* NOTE(review): presumably a byte count for cli_hexdump_rev; helper not visible */
+       call cli_hexdump_rev
+       add r16, r9
+       adc r17, r1
+       inc r8
+       sbrs r8, 4              /* stop once the index reaches 16 (bit 4 set) */
+       rjmp 10b
+       pop_range 30, 31
+       pop_range 16, 27
+       pop_range 6, 9
+       ret
+Xdbg_str:  .asciz "\r\nDBG "
+Xdbg_str1: .asciz "\r\n "
+Xdbg_str2: .asciz "] = "
+
+print32:                       /* debug only: print the 32-bit value in r22..r25, highest byte first */
+       push_range 6, 9
+       push_range 16, 27
+       push_range 30, 31
+       movw r6, r22            /* keep the value in r6..r9 across the output calls */
+       movw r8, r24
+       ldi r24, lo8(Xdbg_str)
+       ldi r25, hi8(Xdbg_str)
+       call cli_putstr_P
+       mov r24, r9
+       call cli_hexdump_byte
+       mov r24, r8
+       call cli_hexdump_byte
+       mov r24, r7
+       call cli_hexdump_byte
+       mov r24, r6
+       call cli_hexdump_byte
+       pop_range 30, 31
+       pop_range 16, 27
+       pop_range 6, 9
+       ret
+
+
+print_acc:                     /* debug only: print the accumulator r9, r8, r15, r14 (highest byte first) */
+       push_range 16, 27
+       push_range 30, 31
+       ldi r24, lo8(Xdbg_str)
+       ldi r25, hi8(Xdbg_str)
+       call cli_putstr_P
+       mov r24, r9             /* acc3 */
+       call cli_hexdump_byte
+       mov r24, r8             /* acc2 */
+       call cli_hexdump_byte
+       mov r24, r15            /* acc1 */
+       call cli_hexdump_byte
+       mov r24, r14            /* acc0 */
+       call cli_hexdump_byte
+       pop_range 30, 31
+       pop_range 16, 27
+       ret
+
+#endif
+
index 6f51dc68bb2d21dcd93a78a776571cef9bd4afe0..5371f80d9f255594524fee8023a3eb37e3b9a9ad 100644 (file)
@@ -139,13 +139,12 @@ void hfal_stacksize(const hfdesc_t* hd){
        uint8_t data[(hf.blocksize_b+7)/8];
        uint8_t digest[(hf.hashsize_b+7)/8];
        uint16_t t1, t2;
-       uint8_t i;
 
        if(hf.type!=HFDESC_TYPE_HASHFUNCTION)
                return;
        cli_putstr_P(PSTR("\r\n\r\n === "));
        cli_putstr_P(hf.name);
-       cli_putstr_P(PSTR(" stack-usage === "
+       cli_putstr_P(PSTR(" stack-usage === "));
 
        cli();
        stack_measure_init(&smctx, PATTERN_A);
@@ -153,7 +152,7 @@ void hfal_stacksize(const hfdesc_t* hd){
        t1 = stack_measure_final(&smctx);
        stack_measure_init(&smctx, PATTERN_B);
        hf.init(&ctx);
-       t1 = stack_measure_final(&smctx);
+       t2 = stack_measure_final(&smctx);
        sei();
 
        t1 = (t1>t2)?t1:t2;
@@ -166,7 +165,7 @@ void hfal_stacksize(const hfdesc_t* hd){
        t1 = stack_measure_final(&smctx);
        stack_measure_init(&smctx, PATTERN_B);
        hf.nextBlock(&ctx, data);
-       t1 = stack_measure_final(&smctx);
+       t2 = stack_measure_final(&smctx);
        sei();
 
        t1 = (t1>t2)?t1:t2;
@@ -179,7 +178,7 @@ void hfal_stacksize(const hfdesc_t* hd){
        t1 = stack_measure_final(&smctx);
        stack_measure_init(&smctx, PATTERN_B);
        hf.lastBlock(&ctx, data, 0);
-       t1 = stack_measure_final(&smctx);
+       t2 = stack_measure_final(&smctx);
        sei();
 
        t1 = (t1>t2)?t1:t2;
@@ -192,7 +191,7 @@ void hfal_stacksize(const hfdesc_t* hd){
        t1 = stack_measure_final(&smctx);
        stack_measure_init(&smctx, PATTERN_B);
        hf.ctx2hash(digest, &ctx);
-       t1 = stack_measure_final(&smctx);
+       t2 = stack_measure_final(&smctx);
        sei();
 
        t1 = (t1>t2)?t1:t2;
diff --git a/mkfiles/bmw_tiny.mk b/mkfiles/bmw_tiny.mk
new file mode 100644 (file)
index 0000000..f38cee3
--- /dev/null
@@ -0,0 +1,12 @@
+# Makefile for BlueMidnightWish
+ALGO_NAME := BMW_TINY
+
+# comment out the following line to remove BlueMidnightWish from the build process
+HASHES += $(ALGO_NAME)
+
+$(ALGO_NAME)_DIR      := bmw/
+$(ALGO_NAME)_OBJ      := bmw_small-tinyasm.o bmw_large.o 
+$(ALGO_NAME)_TEST_BIN := main-bmw-test.o hfal_bmw_small.o hfal_bmw_large.o $(CLI_STD) $(HFAL_STD)
+$(ALGO_NAME)_NESSIE_TEST      := test nessie
+$(ALGO_NAME)_PERFORMANCE_TEST := performance
+