]> git.cryptolib.org Git - avr-crypto-lib.git/commitdiff
bmw tiny split up
authorbg <bg@b1d182e4-1ff8-0310-901f-bddb46175740>
Tue, 11 May 2010 15:02:32 +0000 (15:02 +0000)
committerbg <bg@b1d182e4-1ff8-0310-901f-bddb46175740>
Tue, 11 May 2010 15:02:32 +0000 (15:02 +0000)
bmw/bmw_224-tinyasm.S [new file with mode: 0644]
bmw/bmw_256-tinyasm.S [new file with mode: 0644]
bmw/bmw_small-tinyasm.S
mkfiles/bmw_tiny_sep.mk [new file with mode: 0644]

diff --git a/bmw/bmw_224-tinyasm.S b/bmw/bmw_224-tinyasm.S
new file mode 100644 (file)
index 0000000..e01752c
--- /dev/null
@@ -0,0 +1,1281 @@
+/* bmw_small-tinyasm.S */
+/*
+    This file is part of the AVR-Crypto-Lib.
+    Copyright (C) 2009  Daniel Otte (daniel.otte@rub.de)
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ * File:        bmw_small-tinyasm.S
+ * Author:      Daniel Otte
+ * Date:        2010-03-28
+ * License:     GPLv3 or later
+ * Description: implementation of BlueMidnightWish
+ *
+ */
+
+#include "avr-asm-macros.S"
+
+acc2 =  8
+acc3 =  9
+acc0 = 14
+acc1 = 15
+
+#define DEBUG 0
+
+/******************************************************************************/
+/*
+  param a: r22:r23:r24:r25
+  param s: r20
+*/
+shiftleft32:
+       clr r0
+       cpi r20, 8
+       brlo bitrotateleft_1
+       mov r25, r24
+       mov r24, r23
+       mov r23, r22
+       clr r22
+       subi r20, 8
+       rjmp shiftleft32
+
+/******************************************************************************/
+/*
+  param a: r22:r23:r24:r25
+  param s: r20
+*/
+shiftright32:
+       cpi r20, 8
+       brlo bitshiftright
+       mov r22, r23
+       mov r23, r24
+       mov r24, r25
+       clr r25
+       subi r20, 8
+       rjmp shiftright32
+bitshiftright:
+       tst r20
+       breq 20f
+10:    lsr r25
+       ror r24
+       ror r23
+       ror r22
+       dec r20
+       brne 10b
+20: ret
+
+/******************************************************************************/
+/*
+  param a: r22:r23:r24:r25
+  param s: r20
+*/
+rotateleft32:
+       cpi r20, 8
+       brlo bitrotateleft
+       mov r0, r25
+       mov r25, r24
+       mov r24, r23
+       mov r23, r22
+       mov r22, r0
+       subi r20, 8
+       rjmp rotateleft32
+bitrotateleft:
+    mov r0, r25
+bitrotateleft_1:
+       tst r20
+       breq 20f
+10:
+       lsl r0
+rol32:
+       rol r22
+       rol r23
+       rol r24
+       rol r25
+       dec r20
+       brne 10b
+20: ret
+
+
+/******************************************************************************/
+
+sn_stub:
+       movw r22, r2
+       movw r24, r4
+       lpm r20, Z+
+       rcall rotateleft32
+eor32_to_acc:
+       eor acc0, r22
+       eor acc1, r23
+       eor acc2, r24
+       eor acc3, r25
+       ret
+
+s_table:
+s0:  .byte 1, 3, 4,19
+s1:  .byte 1, 2, 8,23
+s2:  .byte 2, 1,12,25
+s3:  .byte 2, 2,15,29
+s4:  .byte 1, 0, 0, 0
+s5:  .byte 2, 0, 0, 0
+
+h0   = 10
+h1   = 11
+m0   = 12
+m1   = 13
+
+/*
+  param x: r22:r23:r24:25
+  param s: r20
+*/
+sn:
+       push_range 2, 5
+       push acc0
+       push acc1
+       push acc2
+       push acc3
+       ldi r30, lo8(s_table)
+       ldi r31, hi8(s_table)
+       lsl r20
+       lsl r20
+       add r30, r20
+       adc r31, r1
+       movw r2, r22
+       movw r4, r24
+       lpm r20, Z+
+       rcall shiftright32
+       rcall mov32_to_acc
+;---
+       movw r22, r2
+       movw r24, r4
+       lpm r20, Z+
+       rcall shiftleft32
+       rcall eor32_to_acc
+;---
+       rcall sn_stub
+       rcall sn_stub
+
+       movw r22, acc0
+       movw r24, acc2
+       pop acc3
+       pop acc2
+       pop acc1
+       pop acc0
+       rjmp pop5
+
+/******************************************************************************/
+/*
+  param dest: r26:r27 (X)
+  param src:  r30:r31 (Z)
+  param len:  r20
+*/
+memxor_64:
+;      tst r20
+;      breq memxor_exit
+       ldi r20, 64
+memxor:
+10: ld r21, X
+       ld r22, Z+
+       eor r21, r22
+       st X+, r21
+       dec r20
+       brne 10b
+memxor_exit:
+       ret
+
+/******************************************************************************/
+q0 = 2
+q1 = 3
+h0 = 4
+h1 = 5
+m0 = 6
+m1 = 7
+
+
+/******************************************************************************/
+load32_from_X:
+       ld r22, X+
+       ld r23, X+
+       ld r24, X+
+       ld r25, X+
+       ret
+
+load32_from_Y:
+       ld r22, Y+
+       ld r23, Y+
+       ld r24, Y+
+       ld r25, Y+
+       ret
+
+store32_to_Y:
+       st Y+, r22
+       st Y+, r23
+       st Y+, r24
+       st Y+, r25
+       ret
+
+add_X_to_32:
+       ld r0, X+
+       add r22, r0
+       ld r0, X+
+       adc r23, r0
+       ld r0, X+
+       adc r24, r0
+       ld r0, X+
+       adc r25, r0
+       ret
+
+store32_to_X:
+       st X+, r22
+       st X+, r23
+       st X+, r24
+       st X+, r25
+       ret
+
+mov32_to_acc:
+       movw acc0, r22
+       movw acc2, r24
+       ret
+
+/******************************************************************************/
+/*
+  param q:  r28:r29 (Y)
+  param h:  r26:r27 (X)
+  param m:  r30:r31 (Z)
+*/
+
+f2_1_shift_table:
+;      .byte 0x2B, 0x64, 0x66, 0x03, 0x51, 0x55, 0x87, 0x55
+       .byte 0x55, 0x87, 0x55, 0x51, 0x03, 0x66, 0x64, 0x2B
+f2_2_shift_table:
+;      .byte (2<<1), (7<<1), (4<<1), (3<<1), (4<<1)+1, (6<<1)+1, (6<<1)
+       .byte (8<<1)+1, (6<<1), (6<<1)+1, (4<<1)+1, (3<<1), (4<<1), (7<<1), (2<<1)
+expand2_rot_table:
+       .byte 3,7,13,16,19,23,27
+
+f0_hacktable:
+       .byte 0x03, 0x11, 5*4
+       .byte 0xDD, 0xB3, 7*4
+       .byte 0x2A, 0x79, 10*4
+       .byte 0x07, 0xAA, 13*4
+       .byte 0x51, 0xC2, 14*4
+
+
+/*******************************************************************************
+* uint32_t addelment(uint8_t j, const uint32_t* m, const uint32_t* h){
+*      uint32_t r;
+*      r  = pgm_read_dword(k_lut+j);
+*      r += rotl_addel(((uint32_t*)m)[j&0xf], j+0);
+*      r += rotl_addel(((uint32_t*)m)[(j+3)&0xf], j+3);
+*      r -= rotl_addel(((uint32_t*)m)[(j+10)&0xf], j+10);
+*      r ^= ((uint32_t*)h)[(j+7)&0xf];
+*      return r;
+* }
+* param j: r24
+* param m: r22:r23
+* param h: r20:r21
+*/
+j    = 16
+acc2 =  8
+acc3 =  9
+h0   = 10
+h1   = 11
+m0   = 12
+m1   = 13
+acc0 = 14
+acc1 = 15
+
+load_acc_from_X:
+       ld acc0, X+
+       ld acc1, X+
+       ld acc2, X+
+       ld acc3, X+
+       ret
+
+add_acc_to_X:
+       ld r0, X
+       add r0, acc0
+       st X+, r0
+       ld r0, X
+       adc r0, acc1
+       st X+, r0
+       ld r0, X
+       adc r0, acc2
+       st X+, r0
+       ld r0, X
+       adc r0, acc3
+       st X+, r0
+       ret
+
+load_rotate_add_M:
+       mov r20, j
+       andi r20, 0x0f
+       mov r0, r20
+       lsl r0
+       lsl r0
+       movw r26, m0
+       add r26, r0
+       adc r27, r1
+       rcall load32_from_X
+       inc r20
+       rcall rotateleft32
+       brts 10f
+       rjmp add32_to_acc
+;      ret
+10:    sub acc0, r22
+       sbc acc1, r23
+       sbc acc2, r24
+       sbc acc3, r25
+       ret
+
+
+;---
+
+/******************************************************************************/
+load_sn_add:
+       rcall load32_from_X
+       rcall sn
+add32_to_acc:
+       add acc0, r22
+       adc acc1, r23
+       adc acc2, r24
+       adc acc3, r25
+       ret
+
+/*
+  param q: r26:r27
+  param m: r22:r23
+  param h: r20:r21
+  param j: r24
+*/
+
+expand_intro:
+       push_range 26, 27
+       push r24
+addelement:
+       mov j, r24
+       movw h0, r20
+       movw m0, r22
+       sbiw r26, 4
+       rcall load_acc_from_X
+       ldi r24, 0x55
+       add acc0, r24
+       adc acc1, r24
+       adc acc2, r24
+       ldi r24, 5
+       adc acc3, r24
+       rcall store_acc_to_dec_X
+       adiw r26, 4
+       clt
+       rcall load_rotate_add_M
+       subi j, -3
+       rcall load_rotate_add_M
+       set
+       subi j, -7
+       rcall load_rotate_add_M
+       lsl j
+       lsl j
+       subi j, -7*4+10*4
+       andi j, 0x3f
+       movw r26, h0
+       add r26, j
+       adc r27, r1
+       rcall load32_from_X
+       rcall eor32_to_acc
+;--
+       pop r24
+       pop_range 26, 27
+       lsl r24
+       lsl r24
+       add r26, r24
+       adc r27, r1
+       ret
+expand1:
+       rcall expand_intro
+       ldi r19, 1
+10:
+       mov r20, r19
+       andi r20, 3
+       rcall load_sn_add
+       inc r19
+       cpi r19, 17
+       brne 10b
+       rjmp expand2_exit
+
+
+/******************************************************************************/
+/*
+  param q: r26:r27
+  param m: r22:r23
+  param h: r20:r21
+  param j: r24
+*/
+
+
+expand2:
+       rcall expand_intro
+       ldi r19, 14
+       ldi r30, lo8(expand2_rot_table)
+       ldi r31, hi8(expand2_rot_table)
+10:
+       rcall load32_from_X
+       sbrs r19, 0
+       rjmp 12f
+       lpm r20, Z+
+       rcall rotateleft32
+12:    rcall add32_to_acc
+       dec r19
+       brne 10b
+       ldi r20, 4
+       rcall load_sn_add
+       ldi r20, 5
+       rcall load_sn_add
+expand2_exit:
+       adiw r26, 4
+store_acc_to_dec_X:
+       st -X, acc3
+       st -X, acc2
+       st -X, acc1
+       st -X, acc0
+       ret
+
+/******************************************************************************/
+/*
+  param q: r24:r25
+  param m: r22:r23
+  param h: r20:r21
+*/
+/* for calling expand1/2
+  param q: r26:r27
+  param m: r22:r23
+  param h: r20:r21
+  param j: r24
+*/
+
+/******************************************************************************/
+/*
+  param q: r24:r25
+  param m: r22:r23
+  param h: r20:r21
+*/
+
+/******************************************************************************/
+/*
+  param ctx:  r24:r25
+  param msg:  r22:r23
+*/
+/* f0
+  param q:  r28:r29 (Y)
+  param h:  r26:r27 (X)
+  param m:  r30:r31 (Z)
+*/
+/* f1
+  param q: r24:r25
+  param m: r22:r23
+  param h: r20:r21
+*/
+/* f2
+  param q: r24:r25
+  param m: r22:r23
+  param h: r20:r21
+*/
+q0 = 2
+q1 = 3
+h0 = 4
+h1 = 5
+m0 = 6
+m1 = 7
+ctx0 =   2
+ctx1 =   3
+msg0 =   4
+msg1 =   5
+
+restore_f1:
+       movw r26, r2
+       movw r22, r4
+    movw r20, r6
+       ret
+bmw_small_nextBlock_early:
+       movw r24, ctx0
+       movw r22, msg0
+.global bmw224_nextBlock
+bmw_small_nextBlock:
+bmw224_nextBlock:
+bmw256_nextBlock:
+       push_range  2, 7
+       push_range 28, 29
+       push_range  8, 17
+       stack_alloc_large 32*4, r28, r29
+       ldi r16, 0x4f
+       push r16
+       ldi r16, 0xff
+       push r16
+       push r16
+       ldi r16, 0xfb
+       push r16
+       adiw r28, 1
+;      push_range 28, 29 /* push Q */
+;      push_range 22, 25 /* push M & H */
+       /* increment counter */
+       movw r26, r24
+       movw r2, r26
+       adiw r26, 63
+       adiw r26,  1
+       rcall load_acc_from_X
+       ldi r19, 1
+       add acc0, r19
+       adc acc1, r1
+       adc acc2, r1
+       adc acc3, r1
+       rcall store_acc_to_dec_X
+       /* call f0 */
+       movw r30, r22
+       movw r26, r24
+f0:
+       movw h0, r26
+       movw q0, r28
+       movw m0, r30
+       /* xor m into h */
+;      ldi r20, 64
+       rcall memxor_64
+       movw r30, m0
+       movw r26, h0
+
+       /* set q to zero */
+       ldi r22, 64
+10:    st Y+, r1
+       dec r22
+       brne 10b
+       movw r28, q0
+       /* calculate W and store it in Q */
+       ldi r19, 5
+30:
+       ldi r18, 16
+       /* load initial index */
+
+       /* load values from hacktable */
+       ldi r30, lo8(f0_hacktable-3)
+       ldi r31, hi8(f0_hacktable-3)
+       mov r16, r19
+       lsl r16
+       add r16, r19
+       add r30, r16
+       adc r31, r1
+       lpm r21, Z+
+       lpm r20, Z+
+       lpm r16, Z+
+40:
+       ;call add_hx_to_w
+add_hx_to_w:
+       movw r26, h0
+       add r26, r16
+       adc r27, r1
+       rcall load32_from_Y
+       sbiw r28, 4
+       lsl r20
+       rol r21
+       brcs 300f
+       /* addition */
+       rcall add_X_to_32
+       rjmp 500f
+300: /* substract */
+       rcall load_acc_from_X
+       sub r22, acc0
+       sbc r23, acc1
+       sbc r24, acc2
+       sbc r25, acc3
+
+500:
+       rcall store32_to_Y
+       subi r16, -4
+       andi r16, 0x0f<<2
+       dec r18
+       brne 40b
+       movw r28, q0
+       dec r19
+       brne 30b
+       movw r26, h0
+       /* xor m into h */
+;      ldi r20, 64
+       movw r26, h0
+       movw r30, m0
+       rcall memxor_64
+       sbiw r26, 60
+;---
+       clr r17
+       ldi r21, 15
+       mov r8, r21
+50:
+       rcall load32_from_Y
+       sbiw r28, 4
+       mov r20, r17
+       rcall sn
+       inc r17
+       cpi r17, 5
+       brne 52f
+       clr r17
+52:
+       rcall add_X_to_32
+       rcall store32_to_Y
+
+       dec r8
+       brne 50b
+;---
+       rcall load32_from_Y
+       clr r20
+       rcall sn
+       movw r26, h0
+       rcall add_X_to_32
+       sbiw r26, 4
+       sbiw r28, 4
+       rcall store32_to_Y
+       sbiw r28, 4
+       sbiw r28, 15*4
+       movw r20, h0
+       movw r22, m0
+
+       /* call f1*/
+       movw r2, r28
+f1:
+       movw r4, r22
+       movw r6, r20
+       movw r26, r2
+       clr r24
+       rcall expand1
+       rcall restore_f1
+       ldi r24, 1
+       rcall expand1
+       ldi r17, 2
+10: rcall restore_f1
+       mov r24, r17
+       rcall expand2
+       inc r17
+       sbrs r17, 4
+       rjmp 10b
+       rcall restore_f1
+       movw r24, r2
+
+
+       /* call f2 */
+;      pop_range 20, 25
+;      push_range 20, 25
+;      rcall printQ
+;      push r20
+;      push r21
+acc2  =  8
+acc3  =  9
+acc0  = 14
+acc1  = 15
+xl0   =  2
+xl1   =  3
+xl2   =  4
+xl3   =  5
+xh0   =  6
+xh1   =  7
+xh2   = 10
+xh3   = 11
+q16_0 = 12
+q16_1 = 13
+h0   =  18
+h1   =  19
+f2:
+       movw r26, r24
+       /* calc XL & XH */
+       adiw r26, 63
+       adiw r26,  1
+       movw q16_0, r26
+       movw h0, r20
+;---
+;      push h0
+;      push h1
+;---
+       movw r28, r22
+       rcall load_acc_from_X
+       ldi r17, 15
+10:    rcall load32_from_X
+       rcall eor32_to_acc
+       cpi r17, 9
+       brne 15f
+       movw xl0, acc0
+       movw xl2, acc2
+15:
+       dec r17
+       brne 10b
+       movw xh0, acc0
+       movw xh2, acc2
+;--- DBG
+;      push_range 22, 25
+;      movw r22, xl0
+;      movw r24, xl2
+;      rcall print32
+;      movw r22, xh0
+;      movw r24, xh2
+;      rcall print32
+;      pop_range 22, 25
+;--- END DBG
+        /* copy m(Y) into h */
+       movw r26, h0
+       ldi r22, 64
+10:
+       ld r23, Y+
+       st X+, r23
+       dec r22
+       brne 10b
+;--- /* calc first half of h0..h15 */
+       movw r28, q16_0
+       movw r26, h0
+       ldi r30, lo8(f2_1_shift_table)
+       ldi r31, hi8(f2_1_shift_table)
+       ldi r17, 16
+10:
+;---
+       movw r22, xh0
+       movw r24, xh2
+       cpi r17, 9
+       brge 15f
+       clr r1
+       rjmp 26f
+15:    lpm r20, Z+
+       mov r1, r20
+       andi r20, 0x0f
+       clt
+       cpi r17, 16
+       breq 20f
+       cpi r17, 11
+       brne 21f
+20:    set
+21:    brts 25f
+       rcall shiftright32
+       rjmp 26f
+25:    rcall shiftleft32
+26: rcall mov32_to_acc
+;---
+       rcall load32_from_Y
+       mov r20, r1
+       clr r1
+       swap r20
+       andi r20, 0x0f
+       brts 27f
+       rcall shiftleft32
+       rjmp 28f
+27:    rcall shiftright32
+28:    rcall eor32_to_acc
+;---
+       rcall load32_from_X
+       rcall eor32_to_acc
+       rcall store_acc_to_dec_X
+       adiw r26, 4
+;---
+       dec r17
+       brne 10b
+;-----
+       sbiw r28, 4*8 /* Y points to q[24] */
+       movw r30, r28
+       sbiw r28, 63
+       sbiw r28, 33 /* Y points to q[0] */
+       movw r26, r28
+       ldi r20, 8*4
+       /* xor q[24..31] into q[0..7] */
+       rcall memxor
+       /* xor q[23] into q[8] */
+       sbiw r30, 9*4
+       ldi r20, 4
+       rcall memxor
+       /* xor q[16..22] into q[9..15] */
+       sbiw r30, 8*4
+       ldi r20, 7*4
+       rcall memxor
+
+       movw r26, h0
+       ldi r17, 15
+       ldi r30, lo8(f2_2_shift_table)
+       ldi r31, hi8(f2_2_shift_table)
+10:    movw r22, xl0
+       movw r24, xl2
+       sbrc r17, 3
+       rjmp 20f
+       lpm r20, Z+
+       lsr r20
+       brcs 15f
+       rcall shiftright32
+       rjmp 20f
+15:
+       rcall shiftleft32
+20:
+       rcall mov32_to_acc
+       rcall load32_from_Y
+       rcall eor32_to_acc
+       rcall add_acc_to_X
+       dec r17
+       brpl 10b
+;-----
+       sbiw r26, 8*4 /* X points to h8 */
+       movw r28, r26
+       sbiw r28, 4*4 /* Y points to h4 */
+       ldi r17, 8
+       ldi r18, 9
+10:
+       rcall load32_from_Y
+       mov r20, r18
+       rcall rotateleft32
+       rcall mov32_to_acc
+       rcall add_acc_to_X
+       inc r18
+       cpi r17, 5
+       brne 20f
+       sbiw r28, 8*4
+20:    dec r17
+       brne 10b
+
+exit:
+;--- DBG
+;      pop r25
+;      pop r24
+;      ldi r22, 'H'
+;      rcall printX
+;--- END DBG
+       stack_free_large3 32*4+4
+       pop_range 10, 17
+pop9:
+       pop_range 8, 9
+pop28:
+       pop_range 28, 29
+pop7:
+       pop_range 6, 7
+pop5:
+       pop_range 2, 5
+       ret
+
+/******************************************************************************/
+ctx0 =  2
+ctx1 =  3
+blc0 =  4
+blc1 =  5
+len0 = 28
+len1 = 29
+buf0 =  6
+buf1 =  7
+
+load32_from_Z_stub:
+       movw r30, ctx0
+       adiw r30, 60
+       ldd r21, Z+4
+       ldd r22, Z+5
+       ldd r23, Z+6
+       ldd r24, Z+7
+       ret
+
+/******************************************************************************/
+/*
+  param ctx:  r24:r25
+  param msg:  r22:r23
+  param len:  r20:r21
+*/
+
+.global bmw224_lastBlock
+bmw_small_lastBlock:
+bmw224_lastBlock:
+bmw256_lastBlock:
+/*     while(length_b >= BMW_SMALL_BLOCKSIZE){
+               bmw_small_nextBlock(ctx, block);
+               length_b -= BMW_SMALL_BLOCKSIZE;
+               block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
+       }
+*/
+       push_range 2, 7
+       push_range 28, 29
+       movw ctx0, r24
+       movw blc0, r22
+       movw len0, r20
+1:
+       cpi len1, hi8(512)
+       brlo 2f
+       rcall bmw_small_nextBlock_early
+       ldi r24, 64
+       add blc0, r24
+       adc blc1, r1
+       subi len1, hi8(512)
+       rjmp 1b
+2:
+/*     struct {
+               uint8_t  buffer[64];
+               uint32_t ctr;
+       } pctx;
+*/
+       stack_alloc_large 68
+       adiw r30, 1
+       movw buf0, r30
+/*     memset(pctx.buffer, 0, 64);
+       memcpy(pctx.buffer, block, (length_b+7)/8);
+       pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
+*/     movw r24, len0
+       ldi r23, 63
+       movw r26, blc0
+       lsr r25
+       ror r24
+       lsr r24
+       lsr r24
+       breq 301f
+       sub r23, r24
+       /* copy (#r24) bytes to stack buffer */
+30: ld r20, X+
+       st Z+, r20
+       dec r24
+       brne 30b
+301: /* calculate the appended byte */
+       clr r20
+       mov r21, len0
+       ldi r24, 0x80
+       andi r21, 0x07
+       breq 305f
+       ld r20, X+
+303:
+       lsr r24
+       dec r21
+       brne 303b
+305:
+       or r20, r24
+       st Z+, r20
+       tst r23
+       breq 32f
+31: st Z+, r1
+       dec r23
+       brne 31b
+32:
+/*     if(length_b+1>64*8-64){ ; = 64*7-1 = 447 max(length_b)=511
+               bmw_small_nextBlock(ctx, pctx.buffer);
+               memset(pctx.buffer, 0, 64-8);
+               ctx->counter -= 1;
+       }
+*/
+       tst len1
+       breq 400f
+       cpi len0, 192
+       brlo 400f
+       movw blc0, buf0
+       rcall bmw_small_nextBlock_early
+       movw r26, buf0
+       ldi r20, 64-8
+350:
+       st X+, r1
+       dec r20
+       brne 350b
+       rcall load32_from_Z_stub
+       subi r21, 1
+       sbc r22, r1
+       sbc r23, r1
+       sbc r24, r1
+       rjmp 410f
+/*     *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
+       bmw_small_nextBlock(ctx, pctx.buffer);
+*/
+400:
+       rcall load32_from_Z_stub
+410:
+       clr r25
+       ldi r20, 1
+       lsl r21
+       rcall rol32
+       mov r20, len0
+       add r21, len1
+       adc r22, r1
+       adc r23, r1
+       adc r24, r1
+       adc r25, r1
+       movw r26, buf0
+       adiw r26, 64-8
+       st X+, r20
+       st X+, r21
+       rcall store32_to_X
+       st X+, r1
+       st X+, r1
+       movw blc0, buf0
+       rcall bmw_small_nextBlock_early
+/*     memset(pctx.buffer, 0xaa, 64);
+       for(i=0; i<16;++i){
+               pctx.buffer[i*4] = i+0xa0;
+       }
+*/
+       ldi r22, 0xa0
+       ldi r23, 0xaa
+       ldi r24, 0xaa
+       ldi r25, 0xaa
+       movw r26, buf0
+500:
+       rcall store32_to_X
+       inc r22
+       sbrs r22, 4
+       rjmp 500b
+/*     bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
+       memcpy(ctx->h, pctx.buffer, 64);
+*/
+    movw r24, buf0
+    movw r22, ctx0
+    rcall bmw_small_nextBlock
+       ldi r18, 64
+       movw r26, ctx0
+       movw r30, buf0
+600:
+       ld r20, Z+
+       st X+, r20
+       dec r18
+       brne 600b
+
+       stack_free_large 68
+       rjmp pop28
+
+
+/*******************************************************************************
+* void bmw224_ctx2hash(void* dest, const bmw224_ctx_t* ctx){
+*      memcpy(dest, &(ctx->h[9]), 224/8);
+* }
+*
+* param dest:  r24:r25
+* param ctx:   r22:r23
+*/
+.global bmw224_ctx2hash
+bmw224_ctx2hash:
+       movw r30, r22
+       adiw r30, 9*4
+       ldi r18, 28
+1:     movw r26, r24
+1:  ld r23, Z+
+       st X+, r23
+       dec r18
+       brne 1b
+       ret
+
+
+/*******************************************************************************
+* void bmw224(void* dest, const void* msg, uint32_t length_b){
+*      bmw_small_ctx_t ctx;
+*      bmw224_init(&ctx);
+*      while(length_b>=BMW_SMALL_BLOCKSIZE){
+*              bmw_small_nextBlock(&ctx, msg);
+*              length_b -= BMW_SMALL_BLOCKSIZE;
+*              msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B;
+*      }
+*      bmw_small_lastBlock(&ctx, msg, length_b);
+*      bmw224_ctx2hash(dest, &ctx);
+* }
+*
+* param dest:     r24:r25
+* param msg:      r22:r23
+* param length_b: r18:r21
+*/
+ctx0 =   2
+ctx1 =   3
+msg0 =   4
+msg1 =   5
+len0 =  28
+len1 =  29
+len2 =   8
+len3 =   9
+dst0 =   6
+dst1 =   7
+.global bmw224
+bmw224:
+       push_range 2, 7
+       push_range 28, 29
+       push_range 8, 9
+       stack_alloc_large 64+4
+10:    movw ctx0, r30
+       movw dst0, r24
+       movw msg0, r22
+       movw len0, r18
+       movw len2, r20
+       movw r24, ctx0
+       rcall bmw224_init
+20:
+       mov r18, len2
+       or  r18, len3
+       breq 50f
+       rcall bmw_small_nextBlock_early
+       subi len1, 2
+       sbc len2, r1
+       sbc len3, r1
+       ldi r20, 64
+       add msg0, r20
+       adc msg1, r1
+       rjmp 20b
+50:
+       movw r24, ctx0
+       movw r22, msg0
+       movw r20, len0
+       rcall bmw_small_lastBlock
+       movw r24, dst0
+       movw r22, ctx0
+       rcall bmw224_ctx2hash
+       stack_free_large 64+4
+       rjmp pop9
+
+/*******************************************************************************
+* void bmw224_init(bmw224_ctx_t* ctx){
+*      uint8_t i;
+*      ctx->h[0] = 0x00010203;
+*      for(i=1; i<16; ++i){
+*              ctx->h[i] = ctx->h[i-1]+ 0x04040404;
+*      }
+*      ctx->counter=0;
+* }
+*
+* param ctx:  r24:r25
+*/
+.global bmw224_init
+bmw224_init:
+       ldi r22, 0x00
+       ldi r23, 0x40
+       movw r26, r24
+       adiw r26, 4
+10:
+       st -X, r22
+       inc r22
+       mov r20, r22
+       andi r20, 0x3
+       brne 10b
+       adiw r26, 8
+20: cp r22, r23
+       brne 10b
+       st -X, r1
+       st -X, r1
+       st -X, r1
+       st -X, r1
+       ret
+
+
+/******************************************************************************/
+
+#if DEBUG
+
+printQ:
+       push_range 20, 25
+       ldi r16, 4
+       mov r9, r16
+       movw r16, r24
+       ldi r24, lo8(qdbg_str)
+       ldi r25, hi8(qdbg_str)
+       call cli_putstr_P
+       clr r8
+10:    ldi r24, lo8(qdbg_str1)
+       ldi r25, hi8(qdbg_str1)
+       call cli_putstr_P
+       mov r24, r8
+       call cli_hexdump_byte
+       ldi r24, lo8(qdbg_str2)
+       ldi r25, hi8(qdbg_str2)
+       call cli_putstr_P
+       movw r24, r16
+       clr r23
+       ldi r22, 4
+       call cli_hexdump_rev
+       add r16, r9
+       adc r17, r1
+       inc r8
+       sbrs r8, 5
+       rjmp 10b
+       pop_range 20, 25
+       ret
+qdbg_str:  .asciz "\r\nDBG Q: "
+qdbg_str1: .asciz "\r\n Q["
+qdbg_str2: .asciz "] =  "
+
+
+printX:
+       push_range 6, 9
+       push_range 16, 27
+       push_range 30, 31
+       ldi r16, 4
+       mov r6, r22
+       mov r9, r16
+       movw r16, r24
+       ldi r24, lo8(Xdbg_str)
+       ldi r25, hi8(Xdbg_str)
+       call cli_putstr_P
+       mov r24, r6
+       call cli_putc
+       ldi r24, ':'
+       call cli_putc
+       clr r8
+10:    ldi r24, lo8(Xdbg_str1)
+       ldi r25, hi8(Xdbg_str1)
+       call cli_putstr_P
+       mov r24, r6
+       call cli_putc
+       ldi r24, '['
+       call cli_putc
+       mov r24, r8
+       call cli_hexdump_byte
+       ldi r24, lo8(Xdbg_str2)
+       ldi r25, hi8(Xdbg_str2)
+       call cli_putstr_P
+       movw r24, r16
+       clr r23
+       ldi r22, 4
+       call cli_hexdump_rev
+       add r16, r9
+       adc r17, r1
+       inc r8
+       sbrs r8, 4
+       rjmp 10b
+       pop_range 30, 31
+       pop_range 16, 27
+       pop_range 6, 9
+       ret
+Xdbg_str:  .asciz "\r\nDBG "
+Xdbg_str1: .asciz "\r\n "
+Xdbg_str2: .asciz "] = "
+
+print32:
+       push_range 6, 9
+       push_range 16, 27
+       push_range 30, 31
+       movw r6, r22
+       movw r8, r24
+       ldi r24, lo8(Xdbg_str)
+       ldi r25, hi8(Xdbg_str)
+       call cli_putstr_P
+       mov r24, r9
+       call cli_hexdump_byte
+       mov r24, r8
+       call cli_hexdump_byte
+       mov r24, r7
+       call cli_hexdump_byte
+       mov r24, r6
+       call cli_hexdump_byte
+       pop_range 30, 31
+       pop_range 16, 27
+       pop_range 6, 9
+       ret
+
+
+print_acc:
+       push_range 16, 27
+       push_range 30, 31
+       ldi r24, lo8(Xdbg_str)
+       ldi r25, hi8(Xdbg_str)
+       call cli_putstr_P
+       mov r24, r9
+       call cli_hexdump_byte
+       mov r24, r8
+       call cli_hexdump_byte
+       mov r24, r15
+       call cli_hexdump_byte
+       mov r24, r14
+       call cli_hexdump_byte
+       pop_range 30, 31
+       pop_range 16, 27
+       ret
+
+#endif
+
diff --git a/bmw/bmw_256-tinyasm.S b/bmw/bmw_256-tinyasm.S
new file mode 100644 (file)
index 0000000..6327bc1
--- /dev/null
@@ -0,0 +1,1302 @@
+/* bmw_small-tinyasm.S */
+/*
+    This file is part of the AVR-Crypto-Lib.
+    Copyright (C) 2009  Daniel Otte (daniel.otte@rub.de)
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ * File:        bmw_small-tinyasm.S
+ * Author:      Daniel Otte
+ * Date:        2010-03-28
+ * License:     GPLv3 or later
+ * Description: implementation of BlueMidnightWish
+ *
+ */
+
+#include "avr-asm-macros.S"
+
+acc2 =  8
+acc3 =  9
+acc0 = 14
+acc1 = 15
+
+#define DEBUG 0
+
+/******************************************************************************/
+/*
+  param a: r22:r23:r24:r25
+  param s: r20
+*/
+shiftleft32:
+       clr r0
+       cpi r20, 8
+       brlo bitrotateleft_1
+       mov r25, r24
+       mov r24, r23
+       mov r23, r22
+       clr r22
+       subi r20, 8
+       rjmp shiftleft32
+
+/******************************************************************************/
+/*
+  param a: r22:r23:r24:r25
+  param s: r20
+*/
+shiftright32:
+       cpi r20, 8
+       brlo bitshiftright
+       mov r22, r23
+       mov r23, r24
+       mov r24, r25
+       clr r25
+       subi r20, 8
+       rjmp shiftright32
+bitshiftright:
+       tst r20
+       breq 20f
+10:    lsr r25
+       ror r24
+       ror r23
+       ror r22
+       dec r20
+       brne 10b
+20: ret
+
+/******************************************************************************/
+/*
+  param a: r22:r23:r24:r25
+  param s: r20
+*/
+rotateleft32:
+       cpi r20, 8
+       brlo bitrotateleft
+       mov r0, r25
+       mov r25, r24
+       mov r24, r23
+       mov r23, r22
+       mov r22, r0
+       subi r20, 8
+       rjmp rotateleft32
+bitrotateleft:
+    mov r0, r25
+bitrotateleft_1:
+       tst r20
+       breq 20f
+10:
+       lsl r0
+rol32:
+       rol r22
+       rol r23
+       rol r24
+       rol r25
+       dec r20
+       brne 10b
+20: ret
+
+
+/******************************************************************************/
+
+sn_stub:
+       movw r22, r2
+       movw r24, r4
+       lpm r20, Z+
+       rcall rotateleft32
+eor32_to_acc:
+       eor acc0, r22
+       eor acc1, r23
+       eor acc2, r24
+       eor acc3, r25
+       ret
+
+s_table:
+s0:  .byte 1, 3, 4,19
+s1:  .byte 1, 2, 8,23
+s2:  .byte 2, 1,12,25
+s3:  .byte 2, 2,15,29
+s4:  .byte 1, 0, 0, 0
+s5:  .byte 2, 0, 0, 0
+
+h0   = 10
+h1   = 11
+m0   = 12
+m1   = 13
+
+/*
+  param x: r22:r23:r24:25
+  param s: r20
+*/
+sn:
+       push_range 2, 5
+       push acc0
+       push acc1
+       push acc2
+       push acc3
+       ldi r30, lo8(s_table)
+       ldi r31, hi8(s_table)
+       lsl r20
+       lsl r20
+       add r30, r20
+       adc r31, r1
+       movw r2, r22
+       movw r4, r24
+       lpm r20, Z+
+       rcall shiftright32
+       rcall mov32_to_acc
+;---
+       movw r22, r2
+       movw r24, r4
+       lpm r20, Z+
+       rcall shiftleft32
+       rcall eor32_to_acc
+;---
+       rcall sn_stub
+       rcall sn_stub
+
+       movw r22, acc0
+       movw r24, acc2
+       pop acc3
+       pop acc2
+       pop acc1
+       pop acc0
+       rjmp pop5
+
+/******************************************************************************/
+/*
+  param dest: r26:r27 (X)
+  param src:  r30:r31 (Z)
+  param len:  r20
+*/
+memxor_64:
+;      tst r20
+;      breq memxor_exit
+       ldi r20, 64
+memxor:
+10: ld r21, X
+       ld r22, Z+
+       eor r21, r22
+       st X+, r21
+       dec r20
+       brne 10b
+memxor_exit:
+       ret
+
+/******************************************************************************/
+q0 = 2
+q1 = 3
+h0 = 4
+h1 = 5
+m0 = 6
+m1 = 7
+
+
+/******************************************************************************/
+load32_from_X:
+       ld r22, X+
+       ld r23, X+
+       ld r24, X+
+       ld r25, X+
+       ret
+
+load32_from_Y:
+       ld r22, Y+
+       ld r23, Y+
+       ld r24, Y+
+       ld r25, Y+
+       ret
+
+store32_to_Y:
+       st Y+, r22
+       st Y+, r23
+       st Y+, r24
+       st Y+, r25
+       ret
+
+add_X_to_32:
+       ld r0, X+
+       add r22, r0
+       ld r0, X+
+       adc r23, r0
+       ld r0, X+
+       adc r24, r0
+       ld r0, X+
+       adc r25, r0
+       ret
+
+store32_to_X:
+       st X+, r22
+       st X+, r23
+       st X+, r24
+       st X+, r25
+       ret
+
+mov32_to_acc:
+       movw acc0, r22
+       movw acc2, r24
+       ret
+
+/******************************************************************************/
+/*
+  param q:  r28:r29 (Y)
+  param h:  r26:r27 (X)
+  param m:  r30:r31 (Z)
+*/
+
+f2_1_shift_table:
+;      .byte 0x2B, 0x64, 0x66, 0x03, 0x51, 0x55, 0x87, 0x55
+       .byte 0x55, 0x87, 0x55, 0x51, 0x03, 0x66, 0x64, 0x2B
+f2_2_shift_table:
+;      .byte (2<<1), (7<<1), (4<<1), (3<<1), (4<<1)+1, (6<<1)+1, (6<<1)
+       .byte (8<<1)+1, (6<<1), (6<<1)+1, (4<<1)+1, (3<<1), (4<<1), (7<<1), (2<<1)
+expand2_rot_table:
+       .byte 3,7,13,16,19,23,27
+
+f0_hacktable:
+       .byte 0x03, 0x11, 5*4
+       .byte 0xDD, 0xB3, 7*4
+       .byte 0x2A, 0x79, 10*4
+       .byte 0x07, 0xAA, 13*4
+       .byte 0x51, 0xC2, 14*4
+
+
+/*******************************************************************************
+* uint32_t addelment(uint8_t j, const uint32_t* m, const uint32_t* h){
+*      uint32_t r;
+*      r  = pgm_read_dword(k_lut+j);
+*      r += rotl_addel(((uint32_t*)m)[j&0xf], j+0);
+*      r += rotl_addel(((uint32_t*)m)[(j+3)&0xf], j+3);
+*      r -= rotl_addel(((uint32_t*)m)[(j+10)&0xf], j+10);
+*      r ^= ((uint32_t*)h)[(j+7)&0xf];
+*      return r;
+* }
+* param j: r24
+* param m: r22:r23
+* param h: r20:r21
+*/
+j    = 16
+acc2 =  8
+acc3 =  9
+h0   = 10
+h1   = 11
+m0   = 12
+m1   = 13
+acc0 = 14
+acc1 = 15
+
+load_acc_from_X:
+       ld acc0, X+
+       ld acc1, X+
+       ld acc2, X+
+       ld acc3, X+
+       ret
+
+add_acc_to_X:
+       ld r0, X
+       add r0, acc0
+       st X+, r0
+       ld r0, X
+       adc r0, acc1
+       st X+, r0
+       ld r0, X
+       adc r0, acc2
+       st X+, r0
+       ld r0, X
+       adc r0, acc3
+       st X+, r0
+       ret
+
+load_rotate_add_M:
+       mov r20, j
+       andi r20, 0x0f
+       mov r0, r20
+       lsl r0
+       lsl r0
+       movw r26, m0
+       add r26, r0
+       adc r27, r1
+       rcall load32_from_X
+       inc r20
+       rcall rotateleft32
+       brts 10f
+       rjmp add32_to_acc
+;      ret
+10:    sub acc0, r22
+       sbc acc1, r23
+       sbc acc2, r24
+       sbc acc3, r25
+       ret
+
+
+;---
+
+/******************************************************************************/
+load_sn_add:
+       rcall load32_from_X
+       rcall sn
+add32_to_acc:
+       add acc0, r22
+       adc acc1, r23
+       adc acc2, r24
+       adc acc3, r25
+       ret
+
+/*
+  param q: r26:r27
+  param m: r22:r23
+  param h: r20:r21
+  param j: r24
+*/
+
+expand_intro:
+       push_range 26, 27
+       push r24
+addelement:
+       mov j, r24
+       movw h0, r20
+       movw m0, r22
+       sbiw r26, 4
+       rcall load_acc_from_X
+       ldi r24, 0x55
+       add acc0, r24
+       adc acc1, r24
+       adc acc2, r24
+       ldi r24, 5
+       adc acc3, r24
+       rcall store_acc_to_dec_X
+       adiw r26, 4
+       clt
+       rcall load_rotate_add_M
+       subi j, -3
+       rcall load_rotate_add_M
+       set
+       subi j, -7
+       rcall load_rotate_add_M
+       lsl j
+       lsl j
+       subi j, -7*4+10*4
+       andi j, 0x3f
+       movw r26, h0
+       add r26, j
+       adc r27, r1
+       rcall load32_from_X
+       rcall eor32_to_acc
+;--
+       pop r24
+       pop_range 26, 27
+       lsl r24
+       lsl r24
+       add r26, r24
+       adc r27, r1
+       ret
+expand1:
+       rcall expand_intro
+       ldi r19, 1
+10:
+       mov r20, r19
+       andi r20, 3
+       rcall load_sn_add
+       inc r19
+       cpi r19, 17
+       brne 10b
+       rjmp expand2_exit
+
+
+/******************************************************************************/
+/*
+  param q: r26:r27
+  param m: r22:r23
+  param h: r20:r21
+  param j: r24
+*/
+
+
+expand2:
+       rcall expand_intro
+       ldi r19, 14
+       ldi r30, lo8(expand2_rot_table)
+       ldi r31, hi8(expand2_rot_table)
+10:
+       rcall load32_from_X
+       sbrs r19, 0
+       rjmp 12f
+       lpm r20, Z+
+       rcall rotateleft32
+12:    rcall add32_to_acc
+       dec r19
+       brne 10b
+       ldi r20, 4
+       rcall load_sn_add
+       ldi r20, 5
+       rcall load_sn_add
+expand2_exit:
+       adiw r26, 4
+store_acc_to_dec_X:
+       st -X, acc3
+       st -X, acc2
+       st -X, acc1
+       st -X, acc0
+       ret
+
+/******************************************************************************/
+/*
+  param q: r24:r25
+  param m: r22:r23
+  param h: r20:r21
+*/
+/* for calling expand1/2
+  param q: r26:r27
+  param m: r22:r23
+  param h: r20:r21
+  param j: r24
+*/
+
+/******************************************************************************/
+/*
+  param q: r24:r25
+  param m: r22:r23
+  param h: r20:r21
+*/
+
+/******************************************************************************/
+/*
+  param ctx:  r24:r25
+  param msg:  r22:r23
+*/
+/* f0
+  param q:  r28:r29 (Y)
+  param h:  r26:r27 (X)
+  param m:  r30:r31 (Z)
+*/
+/* f1
+  param q: r24:r25
+  param m: r22:r23
+  param h: r20:r21
+*/
+/* f2
+  param q: r24:r25
+  param m: r22:r23
+  param h: r20:r21
+*/
+q0 = 2
+q1 = 3
+h0 = 4
+h1 = 5
+m0 = 6
+m1 = 7
+ctx0 =   2
+ctx1 =   3
+msg0 =   4
+msg1 =   5
+
+restore_f1:
+       movw r26, r2
+       movw r22, r4
+    movw r20, r6
+       ret
+bmw_small_nextBlock_early:
+       movw r24, ctx0
+       movw r22, msg0
+.global bmw_small_nextBlock
+.global bmw256_nextBlock
+bmw_small_nextBlock:
+bmw224_nextBlock:
+bmw256_nextBlock:
+       push_range  2, 7
+       push_range 28, 29
+       push_range  8, 17
+       stack_alloc_large 32*4, r28, r29
+       ldi r16, 0x4f
+       push r16
+       ldi r16, 0xff
+       push r16
+       push r16
+       ldi r16, 0xfb
+       push r16
+       adiw r28, 1
+;      push_range 28, 29 /* push Q */
+;      push_range 22, 25 /* push M & H */
+       /* increment counter */
+       movw r26, r24
+       movw r2, r26
+       adiw r26, 63
+       adiw r26,  1
+       rcall load_acc_from_X
+       ldi r19, 1
+       add acc0, r19
+       adc acc1, r1
+       adc acc2, r1
+       adc acc3, r1
+       rcall store_acc_to_dec_X
+       /* call f0 */
+       movw r30, r22
+       movw r26, r24
+f0:
+       movw h0, r26
+       movw q0, r28
+       movw m0, r30
+       /* xor m into h */
+;      ldi r20, 64
+       rcall memxor_64
+       movw r30, m0
+       movw r26, h0
+
+       /* set q to zero */
+       ldi r22, 64
+10:    st Y+, r1
+       dec r22
+       brne 10b
+       movw r28, q0
+       /* calculate W and store it in Q */
+       ldi r19, 5
+30:
+       ldi r18, 16
+       /* load initial index */
+
+       /* load values from hacktable */
+       ldi r30, lo8(f0_hacktable-3)
+       ldi r31, hi8(f0_hacktable-3)
+       mov r16, r19
+       lsl r16
+       add r16, r19
+       add r30, r16
+       adc r31, r1
+       lpm r21, Z+
+       lpm r20, Z+
+       lpm r16, Z+
+40:
+       ;call add_hx_to_w
+add_hx_to_w:
+       movw r26, h0
+       add r26, r16
+       adc r27, r1
+       rcall load32_from_Y
+       sbiw r28, 4
+       lsl r20
+       rol r21
+       brcs 300f
+       /* addition */
+       rcall add_X_to_32
+       rjmp 500f
+300: /* substract */
+       rcall load_acc_from_X
+       sub r22, acc0
+       sbc r23, acc1
+       sbc r24, acc2
+       sbc r25, acc3
+
+500:
+       rcall store32_to_Y
+       subi r16, -4
+       andi r16, 0x0f<<2
+       dec r18
+       brne 40b
+       movw r28, q0
+       dec r19
+       brne 30b
+       movw r26, h0
+       /* xor m into h */
+;      ldi r20, 64
+       movw r26, h0
+       movw r30, m0
+       rcall memxor_64
+       sbiw r26, 60
+;---
+       clr r17
+       ldi r21, 15
+       mov r8, r21
+50:
+       rcall load32_from_Y
+       sbiw r28, 4
+       mov r20, r17
+       rcall sn
+       inc r17
+       cpi r17, 5
+       brne 52f
+       clr r17
+52:
+       rcall add_X_to_32
+       rcall store32_to_Y
+
+       dec r8
+       brne 50b
+;---
+       rcall load32_from_Y
+       clr r20
+       rcall sn
+       movw r26, h0
+       rcall add_X_to_32
+       sbiw r26, 4
+       sbiw r28, 4
+       rcall store32_to_Y
+       sbiw r28, 4
+       sbiw r28, 15*4
+       movw r20, h0
+       movw r22, m0
+
+       /* call f1*/
+       movw r2, r28
+f1:
+       movw r4, r22
+       movw r6, r20
+       movw r26, r2
+       clr r24
+       rcall expand1
+       rcall restore_f1
+       ldi r24, 1
+       rcall expand1
+       ldi r17, 2
+10: rcall restore_f1
+       mov r24, r17
+       rcall expand2
+       inc r17
+       sbrs r17, 4
+       rjmp 10b
+       rcall restore_f1
+       movw r24, r2
+
+
+       /* call f2 */
+;      pop_range 20, 25
+;      push_range 20, 25
+;      rcall printQ
+;      push r20
+;      push r21
+acc2  =  8
+acc3  =  9
+acc0  = 14
+acc1  = 15
+xl0   =  2
+xl1   =  3
+xl2   =  4
+xl3   =  5
+xh0   =  6
+xh1   =  7
+xh2   = 10
+xh3   = 11
+q16_0 = 12
+q16_1 = 13
+h0   =  18
+h1   =  19
+f2:
+       movw r26, r24
+       /* calc XL & XH */
+       adiw r26, 63
+       adiw r26,  1
+       movw q16_0, r26
+       movw h0, r20
+;---
+;      push h0
+;      push h1
+;---
+       movw r28, r22
+       rcall load_acc_from_X
+       ldi r17, 15
+10:    rcall load32_from_X
+       rcall eor32_to_acc
+       cpi r17, 9
+       brne 15f
+       movw xl0, acc0
+       movw xl2, acc2
+15:
+       dec r17
+       brne 10b
+       movw xh0, acc0
+       movw xh2, acc2
+;--- DBG
+;      push_range 22, 25
+;      movw r22, xl0
+;      movw r24, xl2
+;      rcall print32
+;      movw r22, xh0
+;      movw r24, xh2
+;      rcall print32
+;      pop_range 22, 25
+;--- END DBG
+        /* copy m(Y) into h */
+       movw r26, h0
+       ldi r22, 64
+10:
+       ld r23, Y+
+       st X+, r23
+       dec r22
+       brne 10b
+;--- /* calc first half of h0..h15 */
+       movw r28, q16_0
+       movw r26, h0
+       ldi r30, lo8(f2_1_shift_table)
+       ldi r31, hi8(f2_1_shift_table)
+       ldi r17, 16
+10:
+;---
+       movw r22, xh0
+       movw r24, xh2
+       cpi r17, 9
+       brge 15f
+       clr r1
+       rjmp 26f
+15:    lpm r20, Z+
+       mov r1, r20
+       andi r20, 0x0f
+       clt
+       cpi r17, 16
+       breq 20f
+       cpi r17, 11
+       brne 21f
+20:    set
+21:    brts 25f
+       rcall shiftright32
+       rjmp 26f
+25:    rcall shiftleft32
+26: rcall mov32_to_acc
+;---
+       rcall load32_from_Y
+       mov r20, r1
+       clr r1
+       swap r20
+       andi r20, 0x0f
+       brts 27f
+       rcall shiftleft32
+       rjmp 28f
+27:    rcall shiftright32
+28:    rcall eor32_to_acc
+;---
+       rcall load32_from_X
+       rcall eor32_to_acc
+       rcall store_acc_to_dec_X
+       adiw r26, 4
+;---
+       dec r17
+       brne 10b
+;-----
+       sbiw r28, 4*8 /* Y points to q[24] */
+       movw r30, r28
+       sbiw r28, 63
+       sbiw r28, 33 /* Y points to q[0] */
+       movw r26, r28
+       ldi r20, 8*4
+       /* xor q[24..31] into q[0..7] */
+       rcall memxor
+       /* xor q[23] into q[8] */
+       sbiw r30, 9*4
+       ldi r20, 4
+       rcall memxor
+       /* xor q[16..22] into q[9..15] */
+       sbiw r30, 8*4
+       ldi r20, 7*4
+       rcall memxor
+
+       movw r26, h0
+       ldi r17, 15
+       ldi r30, lo8(f2_2_shift_table)
+       ldi r31, hi8(f2_2_shift_table)
+10:    movw r22, xl0
+       movw r24, xl2
+       sbrc r17, 3
+       rjmp 20f
+       lpm r20, Z+
+       lsr r20
+       brcs 15f
+       rcall shiftright32
+       rjmp 20f
+15:
+       rcall shiftleft32
+20:
+       rcall mov32_to_acc
+       rcall load32_from_Y
+       rcall eor32_to_acc
+       rcall add_acc_to_X
+       dec r17
+       brpl 10b
+;-----
+       sbiw r26, 8*4 /* X points to h8 */
+       movw r28, r26
+       sbiw r28, 4*4 /* Y points to h4 */
+       ldi r17, 8
+       ldi r18, 9
+10:
+       rcall load32_from_Y
+       mov r20, r18
+       rcall rotateleft32
+       rcall mov32_to_acc
+       rcall add_acc_to_X
+       inc r18
+       cpi r17, 5
+       brne 20f
+       sbiw r28, 8*4
+20:    dec r17
+       brne 10b
+
+exit:
+;--- DBG
+;      pop r25
+;      pop r24
+;      ldi r22, 'H'
+;      rcall printX
+;--- END DBG
+       stack_free_large3 32*4+4
+       pop_range 10, 17
+pop9:
+       pop_range 8, 9
+pop28:
+       pop_range 28, 29
+pop7:
+       pop_range 6, 7
+pop5:
+       pop_range 2, 5
+       ret
+
+/******************************************************************************/
+ctx0 =  2
+ctx1 =  3
+blc0 =  4
+blc1 =  5
+len0 = 28
+len1 = 29
+buf0 =  6
+buf1 =  7
+
+load32_from_Z_stub:
+       movw r30, ctx0
+       adiw r30, 60
+       ldd r21, Z+4
+       ldd r22, Z+5
+       ldd r23, Z+6
+       ldd r24, Z+7
+       ret
+
+/******************************************************************************/
+/*
+  param ctx:  r24:r25
+  param msg:  r22:r23
+  param len:  r20:r21
+*/
+
+.global bmw_small_lastBlock
+.global bmw256_lastBlock
+bmw_small_lastBlock:
+bmw224_lastBlock:
+bmw256_lastBlock:
+/*     while(length_b >= BMW_SMALL_BLOCKSIZE){
+               bmw_small_nextBlock(ctx, block);
+               length_b -= BMW_SMALL_BLOCKSIZE;
+               block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
+       }
+*/
+       push_range 2, 7
+       push_range 28, 29
+       movw ctx0, r24
+       movw blc0, r22
+       movw len0, r20
+1:
+       cpi len1, hi8(512)
+       brlo 2f
+       rcall bmw_small_nextBlock_early
+       ldi r24, 64
+       add blc0, r24
+       adc blc1, r1
+       subi len1, hi8(512)
+       rjmp 1b
+2:
+/*     struct {
+               uint8_t  buffer[64];
+               uint32_t ctr;
+       } pctx;
+*/
+       stack_alloc_large 68
+       adiw r30, 1
+       movw buf0, r30
+/*     memset(pctx.buffer, 0, 64);
+       memcpy(pctx.buffer, block, (length_b+7)/8);
+       pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
+*/     movw r24, len0
+       ldi r23, 63
+       movw r26, blc0
+       lsr r25
+       ror r24
+       lsr r24
+       lsr r24
+       breq 301f
+       sub r23, r24
+       /* copy (#r24) bytes to stack buffer */
+30: ld r20, X+
+       st Z+, r20
+       dec r24
+       brne 30b
+301: /* calculate the appended byte */
+       clr r20
+       mov r21, len0
+       ldi r24, 0x80
+       andi r21, 0x07
+       breq 305f
+       ld r20, X+
+303:
+       lsr r24
+       dec r21
+       brne 303b
+305:
+       or r20, r24
+       st Z+, r20
+       tst r23
+       breq 32f
+31: st Z+, r1
+       dec r23
+       brne 31b
+32:
+/*     if(length_b+1>64*8-64){ ; = 64*7-1 = 447 max(length_b)=511
+               bmw_small_nextBlock(ctx, pctx.buffer);
+               memset(pctx.buffer, 0, 64-8);
+               ctx->counter -= 1;
+       }
+*/
+       tst len1
+       breq 400f
+       cpi len0, 192
+       brlo 400f
+       movw blc0, buf0
+       rcall bmw_small_nextBlock_early
+       movw r26, buf0
+       ldi r20, 64-8
+350:
+       st X+, r1
+       dec r20
+       brne 350b
+       rcall load32_from_Z_stub
+       subi r21, 1
+       sbc r22, r1
+       sbc r23, r1
+       sbc r24, r1
+       rjmp 410f
+/*     *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
+       bmw_small_nextBlock(ctx, pctx.buffer);
+*/
+400:
+       rcall load32_from_Z_stub
+410:
+       clr r25
+       ldi r20, 1
+       lsl r21
+       rcall rol32
+       mov r20, len0
+       add r21, len1
+       adc r22, r1
+       adc r23, r1
+       adc r24, r1
+       adc r25, r1
+       movw r26, buf0
+       adiw r26, 64-8
+       st X+, r20
+       st X+, r21
+       rcall store32_to_X
+       st X+, r1
+       st X+, r1
+       movw blc0, buf0
+       rcall bmw_small_nextBlock_early
+/*     memset(pctx.buffer, 0xaa, 64);
+       for(i=0; i<16;++i){
+               pctx.buffer[i*4] = i+0xa0;
+       }
+*/
+       ldi r22, 0xa0
+       ldi r23, 0xaa
+       ldi r24, 0xaa
+       ldi r25, 0xaa
+       movw r26, buf0
+500:
+       rcall store32_to_X
+       inc r22
+       sbrs r22, 4
+       rjmp 500b
+/*     bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
+       memcpy(ctx->h, pctx.buffer, 64);
+*/
+    movw r24, buf0
+    movw r22, ctx0
+    rcall bmw_small_nextBlock
+       ldi r18, 64
+       movw r26, ctx0
+       movw r30, buf0
+600:
+       ld r20, Z+
+       st X+, r20
+       dec r18
+       brne 600b
+
+       stack_free_large 68
+       rjmp pop28
+
+
+/*******************************************************************************
+* void bmw256_ctx2hash(void* dest, const bmw256_ctx_t* ctx){
+*      memcpy(dest, &(ctx->h[8]), 256/8);
+* }
+*
+* param dest:  r24:r25
+* param ctx:   r22:r23
+*/
+.global bmw256_ctx2hash
+bmw256_ctx2hash:
+       movw r30, r22
+       adiw r30, 8*4
+       ldi r18, 32
+1:     movw r26, r24
+1:  ld r23, Z+
+       st X+, r23
+       dec r18
+       brne 1b
+       ret
+
+/*******************************************************************************
+* void bmw256(void* dest, const void* msg, uint32_t length_b){
+*      bmw_small_ctx_t ctx;
+*      bmw256_init(&ctx);
+*      while(length_b>=BMW_SMALL_BLOCKSIZE){
+*              bmw_small_nextBlock(&ctx, msg);
+*              length_b -= BMW_SMALL_BLOCKSIZE;
+*              msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B;
+*      }
+*      bmw_small_lastBlock(&ctx, msg, length_b);
+*      bmw256_ctx2hash(dest, &ctx);
+* }
+*
+* param dest:     r24:r25
+* param msg:      r22:r23
+* param length_b: r18:r21
+*/
+ctx0 =   2
+ctx1 =   3
+msg0 =   4
+msg1 =   5
+len0 =   6
+len1 =   7
+len2 =   8
+len3 =   9
+dst0 =  10
+dst1 =  11
+
+
+/*******************************************************************************
+* void bmw224(void* dest, const void* msg, uint32_t length_b){
+*      bmw_small_ctx_t ctx;
+*      bmw224_init(&ctx);
+*      while(length_b>=BMW_SMALL_BLOCKSIZE){
+*              bmw_small_nextBlock(&ctx, msg);
+*              length_b -= BMW_SMALL_BLOCKSIZE;
+*              msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B;
+*      }
+*      bmw_small_lastBlock(&ctx, msg, length_b);
+*      bmw224_ctx2hash(dest, &ctx);
+* }
+*
+* param dest:     r24:r25
+* param msg:      r22:r23
+* param length_b: r18:r21
+*/
+ctx0 =   2
+ctx1 =   3
+msg0 =   4
+msg1 =   5
+len0 =  28
+len1 =  29
+len2 =   8
+len3 =   9
+dst0 =   6
+dst1 =   7
+
+
+.global bmw256
+bmw256:
+       push_range 2, 7
+       push_range 28, 29
+       push_range 8, 9
+       stack_alloc_large 64+4
+10:    movw ctx0, r30
+       movw dst0, r24
+       movw msg0, r22
+       movw len0, r18
+       movw len2, r20
+       movw r24, ctx0
+       rcall bmw256_init
+20:
+       mov r18, len2
+       or  r18, len3
+       breq 50f
+       rcall bmw_small_nextBlock_early
+       subi len1, 2
+       sbc len2, r1
+       sbc len3, r1
+       ldi r20, 64
+       add msg0, r20
+       adc msg1, r1
+       rjmp 20b
+50:
+       movw r24, ctx0
+       movw r22, msg0
+       movw r20, len0
+       rcall bmw_small_lastBlock
+       movw r24, dst0
+       movw r22, ctx0
+       rcall bmw256_ctx2hash
+       stack_free_large 64+4
+       rjmp pop9
+
+/******************************************************************************/
+.global bmw256_init
+bmw256_init:
+       ldi r22, 0x40
+       ldi r23, 0x80
+       movw r26, r24
+       adiw r26, 4
+10:
+       st -X, r22
+       inc r22
+       mov r20, r22
+       andi r20, 0x3
+       brne 10b
+       adiw r26, 8
+20: cp r22, r23
+       brne 10b
+       st -X, r1
+       st -X, r1
+       st -X, r1
+       st -X, r1
+       ret
+
+
+/******************************************************************************/
+
+#if DEBUG
+
+printQ:
+       push_range 20, 25
+       ldi r16, 4
+       mov r9, r16
+       movw r16, r24
+       ldi r24, lo8(qdbg_str)
+       ldi r25, hi8(qdbg_str)
+       call cli_putstr_P
+       clr r8
+10:    ldi r24, lo8(qdbg_str1)
+       ldi r25, hi8(qdbg_str1)
+       call cli_putstr_P
+       mov r24, r8
+       call cli_hexdump_byte
+       ldi r24, lo8(qdbg_str2)
+       ldi r25, hi8(qdbg_str2)
+       call cli_putstr_P
+       movw r24, r16
+       clr r23
+       ldi r22, 4
+       call cli_hexdump_rev
+       add r16, r9
+       adc r17, r1
+       inc r8
+       sbrs r8, 5
+       rjmp 10b
+       pop_range 20, 25
+       ret
+qdbg_str:  .asciz "\r\nDBG Q: "
+qdbg_str1: .asciz "\r\n Q["
+qdbg_str2: .asciz "] =  "
+
+
+printX:
+       push_range 6, 9
+       push_range 16, 27
+       push_range 30, 31
+       ldi r16, 4
+       mov r6, r22
+       mov r9, r16
+       movw r16, r24
+       ldi r24, lo8(Xdbg_str)
+       ldi r25, hi8(Xdbg_str)
+       call cli_putstr_P
+       mov r24, r6
+       call cli_putc
+       ldi r24, ':'
+       call cli_putc
+       clr r8
+10:    ldi r24, lo8(Xdbg_str1)
+       ldi r25, hi8(Xdbg_str1)
+       call cli_putstr_P
+       mov r24, r6
+       call cli_putc
+       ldi r24, '['
+       call cli_putc
+       mov r24, r8
+       call cli_hexdump_byte
+       ldi r24, lo8(Xdbg_str2)
+       ldi r25, hi8(Xdbg_str2)
+       call cli_putstr_P
+       movw r24, r16
+       clr r23
+       ldi r22, 4
+       call cli_hexdump_rev
+       add r16, r9
+       adc r17, r1
+       inc r8
+       sbrs r8, 4
+       rjmp 10b
+       pop_range 30, 31
+       pop_range 16, 27
+       pop_range 6, 9
+       ret
+Xdbg_str:  .asciz "\r\nDBG "
+Xdbg_str1: .asciz "\r\n "
+Xdbg_str2: .asciz "] = "
+
+print32:
+       push_range 6, 9
+       push_range 16, 27
+       push_range 30, 31
+       movw r6, r22
+       movw r8, r24
+       ldi r24, lo8(Xdbg_str)
+       ldi r25, hi8(Xdbg_str)
+       call cli_putstr_P
+       mov r24, r9
+       call cli_hexdump_byte
+       mov r24, r8
+       call cli_hexdump_byte
+       mov r24, r7
+       call cli_hexdump_byte
+       mov r24, r6
+       call cli_hexdump_byte
+       pop_range 30, 31
+       pop_range 16, 27
+       pop_range 6, 9
+       ret
+
+
+print_acc:
+       push_range 16, 27
+       push_range 30, 31
+       ldi r24, lo8(Xdbg_str)
+       ldi r25, hi8(Xdbg_str)
+       call cli_putstr_P
+       mov r24, r9
+       call cli_hexdump_byte
+       mov r24, r8
+       call cli_hexdump_byte
+       mov r24, r15
+       call cli_hexdump_byte
+       mov r24, r14
+       call cli_hexdump_byte
+       pop_range 30, 31
+       pop_range 16, 27
+       ret
+
+#endif
+
index f3da54432c17be630ea7af86182021a50ade088a..2775ce5b75a4fe7ef13c70e5a4899ba7f6968530 100644 (file)
@@ -1138,7 +1138,6 @@ dst1 =   7
 .global bmw224
 bmw224:
        clt
-       rjmp bmw_small_all
 
 
 bmw_small_all:
diff --git a/mkfiles/bmw_tiny_sep.mk b/mkfiles/bmw_tiny_sep.mk
new file mode 100644 (file)
index 0000000..a4d83ee
--- /dev/null
@@ -0,0 +1,12 @@
+# Makefile for BlueMidnightWish
+ALGO_NAME := BMW_TINY_SEPERATE
+
+# comment out the following line for removement of BlueMidnightWish from the build process
+HASHES += $(ALGO_NAME)
+
+$(ALGO_NAME)_DIR      := bmw/
+$(ALGO_NAME)_OBJ      := bmw_256-tinyasm.o bmw_224-tinyasm.o bmw_large.o 
+$(ALGO_NAME)_TEST_BIN := main-bmw-test.o hfal_bmw_small.o hfal_bmw_large.o $(CLI_STD) $(HFAL_STD)
+$(ALGO_NAME)_NESSIE_TEST      := test nessie
+$(ALGO_NAME)_PERFORMANCE_TEST := performance
+