--- /dev/null
+/* bmw_small-tinyasm.S */
+/*
+ This file is part of the AVR-Crypto-Lib.
+ Copyright (C) 2009 Daniel Otte (daniel.otte@rub.de)
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ * File: bmw_small-tinyasm.S
+ * Author: Daniel Otte
+ * Date: 2010-03-28
+ * License: GPLv3 or later
+ * Description: implementation of BlueMidnightWish
+ *
+ */
+
+#include "avr-asm-macros.S"
+
+/******************************************************************************/
+/*
+ shiftleft32: logical left shift of a 32-bit value (a <<= s).
+ param a: r22:r23:r24:r25 (r22 = least significant byte), shifted in place
+ param s: r20 (number of bit positions; is zero on return)
+ Whole bytes are moved while s >= 8; the remaining 0..7 bit positions are
+ handled by the shared bit loop at bitrotateleft_1 (inside rotateleft32),
+ which also executes the ret.
+ clobbers: r0, r20, SREG flags
+*/
+shiftleft32:
+ clr r0 ; r0 is the carry source of the bit loop: zero bits get shifted in
+ cpi r20, 8
+ brlo bitrotateleft_1 ; fewer than 8 bits left: finish in the bit loop
+ mov r25, r24 ; move everything up by one byte ...
+ mov r24, r23
+ mov r23, r22
+ clr r22 ; ... and fill the lowest byte with zero
+ subi r20, 8
+ rjmp shiftleft32
+
+/******************************************************************************/
+/*
+ shiftright32: logical right shift of a 32-bit value (a >>= s).
+ param a: r22:r23:r24:r25 (r22 = least significant byte), shifted in place
+ param s: r20 (number of bit positions; is zero on return)
+ clobbers: r20, SREG flags
+*/
+shiftright32:
+ cpi r20, 8
+ brlo bitshiftright ; fewer than 8 bits left: shift bit by bit
+ mov r22, r23 ; move everything down by one byte ...
+ mov r23, r24
+ mov r24, r25
+ clr r25 ; ... and fill the highest byte with zero
+ subi r20, 8
+ rjmp shiftright32
+bitshiftright:
+ tst r20
+ breq 20f ; nothing left to shift
+10: lsr r25 ; one bit per pass, the carry ripples from r25 down to r22
+ ror r24
+ ror r23
+ ror r22
+ dec r20
+ brne 10b
+20: ret
+
+/******************************************************************************/
+/*
+ rotateleft32: rotates a 32-bit value to the left by s bit positions.
+ param a: r22:r23:r24:r25 (r22 = least significant byte), rotated in place
+ param s: r20 (number of bit positions; is zero on return)
+ The label bitrotateleft_1 is also the tail of shiftleft32, which enters
+ with r0 = 0 so that the same loop performs a plain shift.
+ clobbers: r0, r20, SREG flags
+*/
+rotateleft32:
+ cpi r20, 8
+ brlo bitrotateleft ; fewer than 8 bits left: rotate bit by bit
+ mov r0, r25 ; rotate by one whole byte, r0 carries the top byte around
+ mov r25, r24
+ mov r24, r23
+ mov r23, r22
+ mov r22, r0
+ subi r20, 8
+ rjmp rotateleft32
+bitrotateleft:
+ mov r0, r25 ; copy of the top byte, its bits re-enter at the bottom
+bitrotateleft_1:
+ tst r20
+ breq 20f ; nothing left to do
+10:
+ lsl r0 ; next bit of r0 goes into the carry ...
+ rol r22 ; ... and from there into bit 0 of the value
+ rol r23
+ rol r24
+ rol r25
+ dec r20
+ brne 10b
+20: ret
+
+
+/******************************************************************************/
+
+/*
+ Shift/rotate amounts for sn, four bytes per function s0..s5, read from
+ program memory with lpm. The bytes of a row are used in this order:
+ shiftright32 count, shiftleft32 count, rotateleft32 count, rotateleft32 count.
+ A count of 0 leaves the operand unchanged.
+*/
+s_table:
+s0: .byte 1, 3, 4,19
+s1: .byte 1, 2, 8,23
+s2: .byte 2, 1,12,25
+s3: .byte 2, 2,15,29
+s4: .byte 1, 0, 0, 0
+s5: .byte 2, 0, 0, 0
+
+/*
+ eor_r22_in_r16: r16:r17:r18:r19 ^= r22:r23:r24:r25
+ (helper of sn, r16..r19 hold the intermediate result there)
+*/
+eor_r22_in_r16:
+ eor r16, r22
+ eor r17, r23
+ eor r18, r24
+ eor r19, r25
+ ret
+
+/*
+ sn: applies function number s of s_table to x, with t = s_table + 4*s:
+   x = shiftright32(x, t[0]) ^ shiftleft32(x, t[1])
+       ^ rotateleft32(x, t[2]) ^ rotateleft32(x, t[3])
+ param x: r22:r23:r24:r25 (the result is returned in the same registers)
+ param s: r20 (index of the s_table row)
+ r12..r20 are wrapped in push_range/pop_range (macros from
+ avr-asm-macros.S, presumably saving and restoring that register range).
+ clobbers: r0, r30:r31 (Z), SREG flags
+ expects r1 to be zero
+*/
+sn:
+ push_range 12, 20
+ ldi r30, lo8(s_table)
+ ldi r31, hi8(s_table)
+ lsl r20 ; Z = s_table + 4*s
+ lsl r20
+ add r30, r20
+ adc r31, r1
+ movw r12, r22 ; r12..r15 keep the unmodified x for all four terms
+ movw r14, r24
+ lpm r20, Z+
+ rcall shiftright32
+ movw r16, r22 ; r16..r19 collect the xor of the terms
+ movw r18, r24
+;---
+ movw r22, r12
+ movw r24, r14
+ lpm r20, Z+
+ rcall shiftleft32
+ rcall eor_r22_in_r16
+;---
+ movw r22, r12
+ movw r24, r14
+ lpm r20, Z+
+ rcall rotateleft32
+ rcall eor_r22_in_r16
+;---
+ movw r22, r12
+ movw r24, r14
+ lpm r20, Z+
+ rcall rotateleft32
+ eor r22, r16 ; last term: xor the other way round so the result ends up in r22..r25
+ eor r23, r17
+ eor r24, r18
+ eor r25, r19
+ pop_range 12, 20
+ ret
+
+/******************************************************************************/
+/*
+ memxor_short: dest[i] ^= src[i] for len bytes.
+ param dest: r26:r27 (X), points behind the last processed byte on return
+ param src: r30:r31 (Z), points behind the last processed byte on return
+ param len: r20 (the check for zero is commented out, so a value of 0
+            makes the loop run 256 times)
+ clobbers: r20, r21, r22, SREG flags
+*/
+memxor_short:
+; tst r20
+; breq memxor_exit
+10: ld r21, X
+ ld r22, Z+
+ eor r21, r22
+ st X+, r21
+ dec r20
+ brne 10b
+memxor_exit:
+ ret
+
+/******************************************************************************/
+/* register aliases used by f0 and add_hx_to_w (numbers of the low registers
+   of the pairs that hold the pointers to q, h and m) */
+q0 = 2
+q1 = 3
+h0 = 4
+h1 = 5
+m0 = 6
+m1 = 7
+
+/*
+ add_hx_to_w: adds or subtracts one 32-bit word of h to/from the word at Y.
+   word at Y = word at Y +/- word at (h + r16);  Y += 4
+ in: h0:h1 (r4:r5) base address of h
+     r16 byte offset of the word inside h
+     r21:r20 16-bit sign mask; it is shifted left by one bit per call and
+             the bit that drops out selects the operation
+             (0 = add, 1 = subtract)
+     Y address of the 32-bit accumulator word
+ clobbers: r0, r22..r25, r26:r27 (X), SREG flags
+ expects r1 to be zero
+*/
+add_hx_to_w:
+ movw r26, h0 ; X = h + r16
+ add r26, r16
+ adc r27, r1
+ ld r22, Y
+ ldd r23, Y+1
+ ldd r24, Y+2
+ ldd r25, Y+3
+ lsl r20 ; move the next sign bit into the carry
+ rol r21
+ brcs 30f
+ /* addition */
+ ld r0, X+
+ add r22, r0
+ ld r0, X+
+ adc r23, r0
+ ld r0, X+
+ adc r24, r0
+ ld r0, X+
+ adc r25, r0
+ rjmp 50f
+30: /* subtract */
+ ld r0, X+
+ sub r22, r0
+ ld r0, X+
+ sbc r23, r0
+ ld r0, X+
+ sbc r24, r0
+ ld r0, X+
+ sbc r25, r0
+50:
+ st Y+, r22 ; write the result back and advance Y to the next word
+ st Y+, r23
+ st Y+, r24
+ st Y+, r25
+ ret
+
+/******************************************************************************/
+/* load32_from_X: r22:r23:r24:r25 = 32-bit word at X (r22 from the lowest
+   address); X += 4 */
+load32_from_X:
+ ld r22, X+
+ ld r23, X+
+ ld r24, X+
+ ld r25, X+
+ ret
+
+/* load32_from_Y: r22:r23:r24:r25 = 32-bit word at Y (r22 from the lowest
+   address); Y += 4 */
+load32_from_Y:
+ ld r22, Y+
+ ld r23, Y+
+ ld r24, Y+
+ ld r25, Y+
+ ret
+/******************************************************************************/
+/*
+ f0: fills q[0..15] from h and m.
+ param q: r28:r29 (Y)
+ param h: r26:r27 (X)
+ param m: r30:r31 (Z)
+
+ Steps:
+  1. h ^= m (64 bytes)
+  2. q[0..15] = 0
+  3. five passes of add_hx_to_w over q[0..15]; every pass adds or subtracts
+     one word of h (selected through f0_indextable, sign from f0_hacktable)
+     to each of the 16 words
+  4. h ^= m once more, which undoes step 1
+  5. q[i] = sn(q[i], f0_s_table[i]) + h[i+1] for i = 0..14
+     q[15] = sn(q[15], 0) + h[0]
+
+ On return X points to h and Y to q+64.
+ Uses r2..r7 for the saved pointers (q0, h0, m0) and r8 as loop counter;
+ r2:r3 is reused as scratch for Z in step 5.
+ clobbers (at least): r0, r2..r8, r16, r18..r27, r30, r31, SREG flags
+ expects r1 to be zero
+*/
+
+/* sign masks for the five passes, two bytes each, high byte first;
+   consumed most significant bit first by add_hx_to_w (1 = subtract) */
+f0_hacktable:
+ .byte 0x03, 0x11
+ .byte 0xDD, 0xB3
+ .byte 0x2A, 0x79
+ .byte 0x07, 0xAA
+ .byte 0x51, 0xC2
+/* byte offset into h of the first word used in each pass */
+f0_indextable:
+ .byte 5*4,7*4,10*4,13*4,14*4
+; .byte 0 ; just for alignment
+/* s_table row used for q[0..14] in step 5 */
+f0_s_table:
+ .byte 0,1,2,3,4
+ .byte 0,1,2,3,4
+ .byte 0,1,2,3,4
+; .byte 0
+
+f0:
+ movw h0, r26
+ movw q0, r28
+ movw m0, r30
+;--- DBG
+; push_range 22, 25
+; movw r24, r26
+; ldi r22, 'H'
+; rcall printX
+; pop_range 22, 25
+;--- END DBG
+;--- DBG
+; push_range 22, 25
+; movw r24, r30
+; ldi r22, 'M'
+; rcall printX
+; pop_range 22, 25
+;--- END DBG
+ /* xor m into h */
+ ldi r20, 64
+ rcall memxor_short
+ movw r30, m0 ; NOTE(review): Z and X are reloaded again before their next use
+ movw r26, h0
+
+ /* set q to zero */
+ ldi r22, 64
+10: st Y+, r1
+ dec r22
+ brne 10b
+ movw r28, q0
+ /* calculate W and store it in Q */
+ ldi r19, 5 ; pass counter, counts 5..1 and indexes both tables (entry r19-1)
+30:
+ ldi r18, 16 ; words per pass
+ /* load initial index */
+ ldi r30, lo8(f0_indextable-1)
+ ldi r31, hi8(f0_indextable-1)
+ add r30, r19
+ adc r31, r1
+ lpm r16, Z
+ /* load values from hacktable */
+ ldi r30, lo8(f0_hacktable-2)
+ ldi r31, hi8(f0_hacktable-2)
+ lsl r19 ; two bytes per table entry
+ add r30, r19
+ adc r31, r1
+ lsr r19
+ lpm r21, Z+
+ lpm r20, Z
+40:
+ call add_hx_to_w ; NOTE(review): the rest of the file uses rcall for local routines
+ subi r16, -4 ; next word of h, wrapping around after h[15]
+ andi r16, 0x0f<<2
+ dec r18
+ brne 40b
+ movw r28, q0 ; back to q[0] for the next pass
+ dec r19
+ brne 30b
+ movw r26, h0
+;--- DBG
+; push_range 22, 25
+; movw r24, r28
+; ldi r22, 'W'
+; rcall printX
+; pop_range 22, 25
+;--- END DBG
+ /* xor m into h */
+ ldi r20, 64
+ movw r26, h0
+ movw r30, m0
+ rcall memxor_short
+ sbiw r26, 60 ; X was at h+64, now points to h[1]
+;---
+ ldi r30, lo8(f0_s_table)
+ ldi r31, hi8(f0_s_table)
+ ldi r21, 15
+ mov r8, r21 ; r8 = loop counter for q[0..14]
+50:
+ ldd r22, Y+0
+ ldd r23, Y+1
+ ldd r24, Y+2
+ ldd r25, Y+3
+ lpm r20, Z+
+ movw r2, r30 ; sn clobbers Z, keep the table pointer in r2:r3
+ rcall sn
+ movw r30, r2
+
+ ld r0, X+
+ add r22, r0
+ ld r0, X+
+ adc r23, r0
+ ld r0, X+
+ adc r24, r0
+ ld r0, X+
+ adc r25, r0
+
+ st Y+, r22
+ st Y+, r23
+ st Y+, r24
+ st Y+, r25
+ dec r8
+ brne 50b
+;---
+ /* q[15] uses s_table row 0 and wraps around to h[0] */
+ ldd r22, Y+0
+ ldd r23, Y+1
+ ldd r24, Y+2
+ ldd r25, Y+3
+ clr r20
+ rcall sn
+ movw r30, r2
+ movw r26, h0
+ ld r0, X+
+ add r22, r0
+ ld r0, X+
+ adc r23, r0
+ ld r0, X+
+ adc r24, r0
+ ld r0, X+
+ adc r25, r0
+ sbiw r26, 4 ; leave X pointing to h[0]
+ st Y+, r22
+ st Y+, r23
+ st Y+, r24
+ st Y+, r25
+ ret
+
+/******************************************************************************/
+
+/* 16 additive constants for addelement, read from program memory;
+   entry j is used for index j, consecutive entries differ by 0x05555555 */
+const_lut:
+ .long 0x55555550, 0x5aaaaaa5, 0x5ffffffa, 0x6555554f
+ .long 0x6aaaaaa4, 0x6ffffff9, 0x7555554e, 0x7aaaaaa3
+ .long 0x7ffffff8, 0x8555554d, 0x8aaaaaa2, 0x8ffffff7
+ .long 0x9555554c, 0x9aaaaaa1, 0x9ffffff6, 0xa555554b
+
+/*******************************************************************************
+* uint32_t addelment(uint8_t j, const uint32_t* m, const uint32_t* h){
+* uint32_t r;
+* r = pgm_read_dword(k_lut+j);
+* r += rotl_addel(((uint32_t*)m)[j&0xf], j+0);
+* r += rotl_addel(((uint32_t*)m)[(j+3)&0xf], j+3);
+* r -= rotl_addel(((uint32_t*)m)[(j+10)&0xf], j+10);
+* r ^= ((uint32_t*)h)[(j+7)&0xf];
+* return r;
+* }
+* param j: r24
+* param m: r22:r23
+* param h: r20:r21
+*/
+/* register aliases for addelement, expand1 and expand2;
+   the 32-bit accumulator acc is r14:r15:r8:r9 (acc0 = least significant
+   byte), h0/m0 are redefined here and now name r10:r11 and r12:r13 */
+j = 16
+acc2 = 8
+acc3 = 9
+h0 = 10
+h1 = 11
+m0 = 12
+m1 = 13
+acc0 = 14
+acc1 = 15
+
+/* add32_to_acc: acc += r22:r23:r24:r25 */
+add32_to_acc:
+ add acc0, r22
+ adc acc1, r23
+ adc acc2, r24
+ adc acc3, r25
+ ret
+
+/* eor32_to_acc: acc ^= r22:r23:r24:r25 */
+eor32_to_acc:
+ eor acc0, r22
+ eor acc1, r23
+ eor acc2, r24
+ eor acc3, r25
+ ret
+
+/* load_acc_from_X: acc = 32-bit word at X (acc0 from the lowest address);
+   X += 4 */
+load_acc_from_X:
+ ld acc0, X+
+ ld acc1, X+
+ ld acc2, X+
+ ld acc3, X+
+ ret
+
+/* add_acc_to_Z: 32-bit word at Z += acc; Z += 4
+   clobbers: r0, SREG flags (ld and st leave the carry untouched, so the
+   carry chain survives the memory accesses between the adc instructions) */
+add_acc_to_Z:
+ ld r0, Z
+ add r0, acc0
+ st Z+, r0
+ ld r0, Z
+ adc r0, acc1
+ st Z+, r0
+ ld r0, Z
+ adc r0, acc2
+ st Z+, r0
+ ld r0, Z
+ adc r0, acc3
+ st Z+, r0
+ ret
+
+/*
+ load_rotate_add_M: helper of addelement.
+   k = r20 & 15
+   acc += rotateleft32(m[k], k+1)   if the T flag is clear
+   acc -= rotateleft32(m[k], k+1)   if the T flag is set
+ in: r20 word index (only the low four bits are used)
+     m0:m1 (r12:r13) base address of m
+     T flag selects addition or subtraction
+ clobbers: r0, r20, r22..r25, r26:r27 (X), SREG flags (T is preserved)
+ expects r1 to be zero
+*/
+load_rotate_add_M:
+ andi r20, 0x0f
+ mov r0, r20
+ lsl r0 ; byte offset = 4 * k
+ lsl r0
+ movw r26, m0
+ add r26, r0
+ adc r27, r1
+ ld r22, X+
+ ld r23, X+
+ ld r24, X+
+ ld r25, X+
+ inc r20 ; rotation amount is k+1
+ rcall rotateleft32
+ brts 10f
+ rcall add32_to_acc
+ ret
+10: sub acc0, r22
+ sbc acc1, r23
+ sbc acc2, r24
+ sbc acc3, r25
+ ret
+
+/*
+ addelement: computes into acc (r14:r15:r8:r9)
+   acc  = const_lut[j]
+   acc += rotl(m[j & 15], (j & 15) + 1)
+   acc += rotl(m[(j+3) & 15], ((j+3) & 15) + 1)
+   acc -= rotl(m[(j+10) & 15], ((j+10) & 15) + 1)
+   acc ^= h[(j+7) & 15]
+ param j: r24
+ param m: r22:r23
+ param h: r20:r21
+ The result stays in acc, nothing is returned in r22..r25.
+ clobbers: r0, r10..r13, r16, r20..r28, r30, r31, T flag, SREG flags
+ expects r1 to be zero
+*/
+addelement:
+ mov j, r24
+ movw h0, r20
+ movw m0, r22
+ lsl r24 ; r24 = 4*j, byte offset into const_lut
+ lsl r24
+ mov r28, r24 ; NOTE(review): r28 is not read again in this routine - looks like a leftover
+ ldi r30, lo8(const_lut)
+ ldi r31, hi8(const_lut)
+ add r30, r24
+ adc r31, r1
+ lpm acc0, Z+
+ lpm acc1, Z+
+ lpm acc2, Z+
+ lpm acc3, Z+
+ clt ; T clear: the next two terms are added
+ mov r20, j
+ rcall load_rotate_add_M
+ mov r20, j
+ subi r20, -3
+ rcall load_rotate_add_M
+ mov r20, j
+ set ; T set: the third term is subtracted
+ subi r20, -10
+ rcall load_rotate_add_M
+ lsl j ; j = byte offset of h[(j+7) & 15]
+ lsl j
+ subi j, -7*4
+ andi j, 0x3f
+ movw r26, h0
+ add r26, j
+ adc r27, r1
+ ld r0, X+
+ eor acc0, r0
+ ld r0, X+
+ eor acc1, r0
+ ld r0, X+
+ eor acc2, r0
+ ld r0, X+
+ eor acc3, r0
+;---
+ ret
+
+/******************************************************************************/
+/*
+ expand_intro: common start of expand1 and expand2.
+ Calls addelement(j, m, h), which leaves its result in acc, and then sets
+ X = q + 4*j.
+ param q: r26:r27
+ param m: r22:r23
+ param h: r20:r21
+ param j: r24 (holds 4*j on return)
+ r20..r27 are wrapped in push_range/pop_range around the call (macros from
+ avr-asm-macros.S, presumably saving and restoring that register range).
+ Everything else that addelement clobbers is clobbered here as well.
+*/
+
+expand_intro:
+ push_range 20, 27
+; push r24
+ rcall addelement
+; pop r24
+ pop_range 20, 27
+ lsl r24
+ lsl r24
+ add r26, r24
+ adc r27, r1
+ ret
+/*
+ expand1: q[j+16] = addelement(j, m, h) + sum over i = 1..16 of
+                    sn(q[j+i-1], i & 3)
+ Parameters are those of expand_intro (q: r26:r27, m: r22:r23,
+ h: r20:r21, j: r24).
+ expand1_exit stores acc at X and is shared with expand2.
+ On return X points behind the word that was written.
+*/
+expand1:
+ rcall expand_intro
+ ldi r19, 1 ; r19 counts 1..16, its two low bits select the s_table row
+10:
+ rcall load32_from_X
+ mov r20, r19
+ andi r20, 3
+ rcall sn
+ rcall add32_to_acc
+ inc r19
+ cpi r19, 17
+ brne 10b
+expand1_exit:
+; adiw r26, 63
+ st X+, acc0 ; X has advanced over 16 words, so this is q[j+16]
+ st X+, acc1
+ st X+, acc2
+ st X+, acc3
+ ret
+
+/******************************************************************************/
+/*
+ expand2: q[j+16] = addelement(j, m, h)
+                    + sum over i = 0..13 of
+                      rotateleft32(q[j+i], expand2_rot_table[i])
+                    + sn(q[j+14], 4) + sn(q[j+15], 5)
+ param q: r26:r27
+ param m: r22:r23
+ param h: r20:r21
+ param j: r24
+ The result is stored through expand1_exit, which also returns.
+*/
+
+/* rotation amounts for q[j+0] .. q[j+13]; 0 means the word is added as is */
+expand2_rot_table:
+ .byte 0,3,0,7,0,13,0,16,0,19,0,23,0,27
+
+expand2:
+ rcall expand_intro
+ ldi r19, 14 ; number of table driven terms
+ ldi r30, lo8(expand2_rot_table)
+ ldi r31, hi8(expand2_rot_table)
+10:
+ rcall load32_from_X
+ lpm r20, Z+ ; rotation amount for this word
+ rcall rotateleft32
+ rcall add32_to_acc
+ dec r19
+ brne 10b
+ rcall load32_from_X
+ ldi r20, 4
+ rcall sn
+ rcall add32_to_acc
+ rcall load32_from_X
+ ldi r20, 5
+ rcall sn
+ rcall add32_to_acc
+
+ rjmp expand1_exit
+
+/******************************************************************************/
+/*
+ f1: computes q[16..31] from q[0..15], m and h.
+ param q: r24:r25
+ param m: r22:r23
+ param h: r20:r21
+ q[16] and q[17] are produced by expand1 (j = 0, 1), q[18..31] by expand2
+ (j = 2..15).
+ r2..r7 keep q, m and h across the calls, r17 is the loop counter.
+*/
+/* for calling expand1/2
+ param q: r26:r27
+ param m: r22:r23
+ param h: r20:r21
+ param j: r24
+*/
+f1:
+ movw r2, r24
+ movw r4, r22
+ movw r6, r20
+ movw r26, r2
+; movw r22, r4
+; movw r20, r6
+ clr r24 ; r22:r23 and r20:r21 still hold m and h for the first call
+ rcall expand1
+ movw r26, r2
+ movw r22, r4
+ movw r20, r6
+ ldi r24, 1
+ rcall expand1
+ ldi r17, 2
+10: movw r26, r2
+ movw r22, r4
+ movw r20, r6
+ mov r24, r17
+ rcall expand2
+ inc r17
+ sbrs r17, 4 ; leave the loop once r17 has reached 16
+ rjmp 10b
+ ret
+
+/******************************************************************************/
+/*
+ f2: folds q[0..31] and m into h[0..15] (h is overwritten).
+ param q: r24:r25
+ param m: r22:r23
+ param h: r20:r21
+
+ Steps:
+  1. XL = q[16] ^ ... ^ q[23],  XH = XL ^ q[24] ^ ... ^ q[31]
+  2. h[i] = shift(XH) ^ shift(q[16+i]) ^ m[i] for i = 0..15, shift amounts
+     and directions come from f2_1_shift_table
+  3. h[i] += XL ^ q[24+i] ^ q[i] for i = 0..7
+  4. h[8] += (XL << 8) ^ q[23] ^ q[8]
+  5. h[9+i] += shift(XL) ^ q[16+i] ^ q[9+i] for i = 0..6, shift amounts and
+     directions come from f2_2_shift_table
+  6. h[8+i] += rotateleft32(h[(4+i) & 7], 9+i) for i = 0..7
+
+ r1 is used as scratch register inside step 2 and cleared again before the
+ end of every loop pass.
+ clobbers (at least): r0, r2..r15, r17..r31, T flag, SREG flags
+*/
+/* step 2, one byte per word, read backwards (entry 15 belongs to h[0]):
+   low nibble = shift amount for XH, high nibble = shift amount for q[16+i] */
+f2_1_shift_table:
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ .byte 0x2B, 0x64, 0x66, 0x03, 0x51, 0x55, 0x87, 0x55
+/* step 5, read backwards (entry 6 belongs to h[9]):
+   bits 7..1 = shift amount for XL, bit 0 = direction (1 = left, 0 = right) */
+f2_2_shift_table:
+ .byte (2<<1), (7<<1), (4<<1), (3<<1), (4<<1)+1, (6<<1)+1, (6<<1)
+ .byte 0 ; just for alignment
+acc2 = 8
+acc3 = 9
+acc0 = 14
+acc1 = 15
+xl0 = 2
+xl1 = 3
+xl2 = 4
+xl3 = 5
+xh0 = 6
+xh1 = 7
+xh2 = 10
+xh3 = 11
+q16_0 = 12
+q16_1 = 13
+h0 = 18
+h1 = 19
+f2:
+ movw r26, r24
+ /* calc XL */
+ adiw r26, 63
+ adiw r26, 1
+ movw q16_0, r26 ; remember the address of q[16]
+ clr xl0
+ clr xl1
+ clr xl2
+ clr xl3
+ ldi r17, 8
+10: ld r0, X+
+ eor xl0, r0
+ ld r0, X+
+ eor xl1, r0
+ ld r0, X+
+ eor xl2, r0
+ ld r0, X+
+ eor xl3, r0
+ dec r17
+ brne 10b
+;--- /* calc XH */
+ movw xh0, xl0
+ movw xh2, xl2
+ ldi r17, 8
+10: ld r0, X+
+ eor xh0, r0
+ ld r0, X+
+ eor xh1, r0
+ ld r0, X+
+ eor xh2, r0
+ ld r0, X+
+ eor xh3, r0
+ dec r17
+ brne 10b
+;--- DBG
+; push_range 22, 25
+; movw r22, xl0
+; movw r24, xl2
+; rcall print32
+; movw r22, xh0
+; movw r24, xh2
+; rcall print32
+; pop_range 22, 25
+;--- END DBG
+
+;--- /* calc first half of h0..h15 */
+ movw h0, r20 ; h0:h1 = write pointer into h
+ movw r28, r22 ; Y = m
+ movw r26, q16_0 ; X = q[16]
+ ldi r17, 16 ; r17 counts 16..1, word index is 16 - r17
+10:
+ ld acc0, Y+
+ ld acc1, Y+
+ ld acc2, Y+
+ ld acc3, Y+
+;---
+ ldi r30, lo8(f2_1_shift_table-1)
+ ldi r31, hi8(f2_1_shift_table-1)
+ movw r22, xh0
+ movw r24, xh2
+ add r30, r17
+ adc r31, r1
+ lpm r20, Z
+ mov r1, r20 ; keep the table byte, r1 is restored to zero below
+ andi r20, 0x0f
+ clt
+ cpi r17, 16 ; words 0 and 5 shift XH to the left (T set),
+ breq 20f ; all other words shift it to the right
+ cpi r17, 11
+ brne 21f
+20: set
+21: brts 25f
+ rcall shiftright32
+ rjmp 26f
+25: rcall shiftleft32
+26: rcall eor32_to_acc
+;---
+ rcall load32_from_X
+ mov r20, r1
+ clr r1
+ swap r20
+ andi r20, 0x0f
+ brts 27f ; q[16+i] is always shifted in the opposite direction of XH
+ rcall shiftleft32
+ rjmp 28f
+27: rcall shiftright32
+28: rcall eor32_to_acc
+;---
+ movw r30, h0
+ st Z+, acc0
+ st Z+, acc1
+ st Z+, acc2
+ st Z+, acc3
+ movw h0, r30
+;---
+ dec r17
+ brne 10b
+;-----
+ sbiw r26, 4*8 /* X points to q[24] */
+ movw r28, r26
+ sbiw r28, 63
+ sbiw r28, 33 /* Y points to q[0] */
+ sbiw r30, 63
+ sbiw r30, 1 /* Z points to h0 */
+ ldi r17, 8
+10: movw acc0, xl0
+ movw acc2, xl2
+ rcall load32_from_X
+ rcall eor32_to_acc
+ rcall load32_from_Y
+ rcall eor32_to_acc
+ rcall add_acc_to_Z
+ dec r17
+ brne 10b
+ sbiw r26, 9*4 /* X points to q[23] */
+ rcall load_acc_from_X
+ eor acc1, xl0 ; acc ^= XL << 8 (XL taken one byte higher)
+ eor acc2, xl1
+ eor acc3, xl2
+ rcall load32_from_Y
+ rcall eor32_to_acc
+ rcall add_acc_to_Z
+;---
+ sbiw r26, 8*4 /* X points to q[16] */
+ /* Save the complete pointer to h[9]. Copying only the low byte would
+    leave the high byte of h+64 in h1, which is wrong whenever a 256 byte
+    boundary lies between h+36 and h+64. */
+ movw h0, r30
+ ldi r17, 7
+10:
+ ldi r30, lo8(f2_2_shift_table-1)
+ ldi r31, hi8(f2_2_shift_table-1)
+ add r30, r17
+ adc r31, r1
+ lpm r20, Z
+ rcall load_acc_from_X
+ movw r22, xl0
+ movw r24, xl2
+ lsr r20 ; carry = direction bit, r20 = shift amount
+ brcc 20f
+ rcall shiftleft32
+ rjmp 21f
+20: rcall shiftright32
+21:
+ rcall eor32_to_acc
+ rcall load32_from_Y
+ rcall eor32_to_acc
+ movw r30, h0
+ rcall add_acc_to_Z
+ movw h0, r30
+ dec r17
+ brne 10b
+;-----
+ sbiw r30, 8*4 /* Z points to h8 */
+ movw r26, r30
+ sbiw r26, 4*4 /* X points to h4 */
+ ldi r17, 8
+ ldi r18, 9 ; rotation amount, 9..16
+10:
+ rcall load32_from_X
+ mov r20, r18
+ rcall rotateleft32
+ movw acc0, r22
+ movw acc2, r24
+ rcall add_acc_to_Z
+ inc r18
+ cpi r17, 5
+ breq 20f
+ dec r17
+ brne 10b
+ ret
+20: sbiw r26, 8*4 ; after h7 the source wraps around to h0
+ dec r17
+ rjmp 10b
+
+/******************************************************************************/
+/*
+ param ctx: r24:r25
+ param msg: r22:r23
+*/
+/* f0
+ param q: r28:r29 (Y)
+ param h: r26:r27 (X)
+ param m: r30:r31 (Z)
+*/
+/* f1
+ param q: r24:r25
+ param m: r22:r23
+ param h: r20:r21
+*/
+/* f2
+ param q: r24:r25
+ param m: r22:r23
+ param h: r20:r21
+*/
+/*
+ void bmw_small_nextBlock(ctx, msg)
+ Processes one 64 byte message block: increments the 32-bit block counter
+ stored behind the 64 bytes of h in ctx and runs f0, f1 and f2 with a
+ 32 word buffer q on the stack.
+ param ctx: r24:r25
+ param msg: r22:r23
+ r2..r17 and r28:r29 are saved and restored through push_range/pop_range.
+ stack_alloc_large, stack_free_large3, push_range and pop_range are macros
+ from avr-asm-macros.S; the pop sequence below assumes that push_range
+ pushes in ascending register order.
+*/
+.global bmw_small_nextBlock
+.global bmw224_nextBlock
+.global bmw256_nextBlock
+bmw_small_nextBlock:
+bmw224_nextBlock:
+bmw256_nextBlock:
+ push_range 28, 29
+ push_range 2, 17
+ stack_alloc_large 32*4, r28, r29
+ adiw r28, 1
+ push_range 28, 29 /* push Q */
+ push_range 22, 25 /* push M & H */
+ /* increment counter */
+ movw r26, r24
+ movw r2, r26
+ adiw r26, 63
+ adiw r26, 1
+ rcall load_acc_from_X
+ ldi r19, 1
+ add acc0, r19
+ adc acc1, r1
+ adc acc2, r1
+ adc acc3, r1
+ /* X points behind the counter now; store with pre-decrement from the most
+    significant byte down so that acc0 ends up at the lowest address again
+    (the counter is read as a little endian value) */
+ st -X, acc3
+ st -X, acc2
+ st -X, acc1
+ st -X, acc0
+ /* call f0 */
+ movw r30, r22
+ movw r26, r24
+ rcall f0
+ /* call f1*/
+ /* pop ctx into r20:r21 (h), msg into r22:r23 (m) and q into r24:r25 */
+ pop r21
+ pop r20
+ pop r23
+ pop r22
+ pop r25
+ pop r24
+; rcall printQ
+ push_range 20, 25
+ rcall f1
+ /* call f2 */
+; pop_range 20, 25
+; push_range 20, 25
+; rcall printQ
+ pop_range 20, 25
+; push r20
+; push r21
+ call f2
+;--- DBG
+; pop r25
+; pop r24
+; ldi r22, 'H'
+; rcall printX
+;--- END DBG
+ stack_free_large3 32*4
+ pop_range 2, 17
+ pop_range 28, 29
+ ret
+
+/******************************************************************************/
+/*
+ param ctx: r24:r25
+ param msg: r22:r23
+ param len: r20:r21
+*/
+/* register aliases of bmw_small_lastBlock */
+ctx0 = 2
+ctx1 = 3
+blc0 = 4
+blc1 = 5
+len0 = 28
+len1 = 29
+buf0 = 6
+buf1 = 7
+
+/*
+ bmw_small_lastBlock(ctx, msg, len)
+ Processes the rest of the message (len is given in bits, 16-bit value),
+ appends the padding and the message length and runs the final
+ transformation. The result is left in the first 64 bytes of ctx.
+ A 68 byte buffer is taken from the stack through stack_alloc_large and
+ released with stack_free_large (macros from avr-asm-macros.S; the code
+ below relies on Z pointing one byte below the buffer afterwards).
+ r2..r7 and r28:r29 are saved and restored.
+*/
+.global bmw_small_lastBlock
+.global bmw224_lastBlock
+.global bmw256_lastBlock
+bmw_small_lastBlock:
+bmw224_lastBlock:
+bmw256_lastBlock:
+/* while(length_b >= BMW_SMALL_BLOCKSIZE){
+ bmw_small_nextBlock(ctx, block);
+ length_b -= BMW_SMALL_BLOCKSIZE;
+ block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
+ }
+*/
+ push_range 2, 7
+ push_range 28, 29
+ movw ctx0, r24
+ movw blc0, r22
+ movw len0, r20
+1:
+ cpi len1, hi8(512) ; only the high byte matters for len >= 512
+ brlo 2f
+ movw r24, ctx0
+ movw r22, blc0
+ rcall bmw_small_nextBlock
+ ldi r24, 64
+ add blc0, r24
+ adc blc1, r1
+ subi len1, hi8(512)
+ rjmp 1b
+2:
+/* struct {
+ uint8_t buffer[64];
+ uint32_t ctr;
+ } pctx;
+*/
+ stack_alloc_large 68
+ adiw r30, 1
+ movw buf0, r30
+/* memset(pctx.buffer, 0, 64);
+ memcpy(pctx.buffer, block, (length_b+7)/8);
+ pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
+*/ movw r24, len0
+ lsr r25 ; r24 = number of complete bytes (len >> 3, len < 512 here)
+ ror r24
+ lsr r24
+ lsr r24
+ ldi r23, 63 ; r23 = number of zero bytes behind the padding byte
+ sub r23, r24
+ movw r26, blc0
+ tst r24
+ breq 301f
+ /* copy (#r24) bytes to stack buffer */
+30: ld r20, X+
+ st Z+, r20
+ dec r24
+ brne 30b
+301: /* calculate the appended byte */
+ clr r20
+ mov r21, len0
+ ldi r24, 0x80
+ andi r21, 0x07
+ breq 305f ; no partial byte: the padding byte is just 0x80
+ ld r20, X+ ; partial byte of the message
+303:
+ lsr r24 ; r24 = 0x80 >> (len & 7)
+ dec r21
+ brne 303b
+305:
+ or r20, r24
+ st Z+, r20
+ tst r23
+ breq 32f
+31: st Z+, r1
+ dec r23
+ brne 31b
+32:
+/* if(length_b+1>64*8-64){ ; = 64*7-1 = 447 max(length_b)=511
+ bmw_small_nextBlock(ctx, pctx.buffer);
+ memset(pctx.buffer, 0, 64-8);
+ ctx->counter -= 1;
+ }
+*/
+ tst len1 ; the test below is len >= 0x1C0 (448), split into high and low byte
+ breq 400f
+ cpi len0, 192
+ brlo 400f
+ movw r24, ctx0
+ movw r22, buf0
+ rcall bmw_small_nextBlock
+ movw r26, buf0
+ ldi r20, 64-8
+350:
+ st X+, r1
+ dec r20
+ brne 350b
+ movw r30, ctx0
+ adiw r30, 60
+ ldd r21, Z+4 ; r21..r24 = counter (at ctx+64), low byte first
+ ldd r22, Z+5
+ ldd r23, Z+6
+ ldd r24, Z+7
+ subi r21, 1 ; the decremented value is only used for the length below, it is not written back
+ sbc r22, r1
+ sbc r23, r1
+ sbc r24, r1
+ rjmp 410f
+/* *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
+ bmw_small_nextBlock(ctx, pctx.buffer);
+*/
+400:
+ movw r30, ctx0
+ adiw r30, 60
+ ldd r21, Z+4
+ ldd r22, Z+5
+ ldd r23, Z+6
+ ldd r24, Z+7
+410:
+ clr r25
+ lsl r21 ; counter * 2, stored one byte higher below, gives counter * 512
+ rol r22
+ rol r23
+ rol r24
+ rol r25
+ mov r20, len0
+ add r21, len1
+ adc r22, r1
+ adc r23, r1
+ adc r24, r1
+ adc r25, r1
+ movw r30, buf0
+ adiw r30, 64-8
+ st Z+, r20
+ st Z+, r21
+ st Z+, r22
+ st Z+, r23
+ st Z+, r24
+ st Z+, r25
+ st Z+, r1
+ st Z+, r1
+ movw r24, ctx0
+ movw r22, buf0
+ rcall bmw_small_nextBlock
+/* memset(pctx.buffer, 0xaa, 64);
+ for(i=0; i<16;++i){
+ pctx.buffer[i*4] = i+0xa0;
+ }
+*/
+ ldi r18, 0xa0
+ ldi r19, 0xaa
+ movw r26, buf0
+500:
+ st X+, r18
+ st X+, r19
+ st X+, r19
+ st X+, r19
+ inc r18
+ sbrs r18, 4 ; stop when r18 has reached 0xb0, that is after 16 words
+ rjmp 500b
+/* bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
+ memcpy(ctx->h, pctx.buffer, 64);
+*/
+ movw r24, buf0
+ movw r22, ctx0
+ rcall bmw_small_nextBlock
+ ldi r18, 64
+ movw r26, ctx0
+ movw r30, buf0
+600:
+ ld r20, Z+
+ st X+, r20
+ dec r18
+ brne 600b
+
+ stack_free_large 68
+ pop_range 28, 29
+ pop_range 2, 7
+ ret
+
+
+/*******************************************************************************
+* void bmw224_ctx2hash(void* dest, const bmw224_ctx_t* ctx){
+* memcpy(dest, &(ctx->h[9]), 224/8);
+* }
+*
+* param dest: r24:r25
+* param ctx: r22:r23
+*/
+.global bmw224_ctx2hash
+bmw224_ctx2hash:
+ movw r30, r22 ; Z = ctx
+ adiw r30, 9*4 ; skip h[0..8], the digest starts at h[9]
+ movw r26, r24 ; X = dest
+ ldi r22, 28 ; digest length in bytes
+ rjmp 1f ; continue in the copy loop of bmw256_ctx2hash
+
+/*******************************************************************************
+* void bmw256_ctx2hash(void* dest, const bmw256_ctx_t* ctx){
+* memcpy(dest, &(ctx->h[8]), 256/8);
+* }
+*
+* param dest: r24:r25
+* param ctx: r22:r23
+*/
+.global bmw256_ctx2hash
+bmw256_ctx2hash:
+ movw r26, r24 ; X = dest
+ movw r30, r22 ; Z = ctx
+ adiw r30, 8*4 ; the digest starts at h[8]
+ ldi r22, 32 ; digest length in bytes
+1: ; copy loop, also entered from bmw224_ctx2hash with X, Z and r22 set up
+ ld r23, Z+
+ st X+, r23
+ dec r22
+ brne 1b
+ ret
+
+/*******************************************************************************
+* void bmw256(void* dest, const void* msg, uint32_t length_b){
+* bmw_small_ctx_t ctx;
+* bmw256_init(&ctx);
+* while(length_b>=BMW_SMALL_BLOCKSIZE){
+* bmw_small_nextBlock(&ctx, msg);
+* length_b -= BMW_SMALL_BLOCKSIZE;
+* msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B;
+* }
+* bmw_small_lastBlock(&ctx, msg, length_b);
+* bmw256_ctx2hash(dest, &ctx);
+* }
+*
+* param dest: r24:r25
+* param msg: r22:r23
+* param length_b: r18:r21
+*/
+/* register aliases used by bmw_small_all */
+ctx0 = 2
+ctx1 = 3
+msg0 = 4
+msg1 = 5
+len0 = 6
+len1 = 7
+len2 = 8
+len3 = 9
+dst0 = 10
+dst1 = 11
+.global bmw256
+bmw256:
+ push r16 ; popped again at the end of bmw_small_all
+ ldi r16, 1 ; r16 = 1 selects the 256 bit entries of init_lut and c2h_lut
+ rjmp bmw_small_all
+
+/*******************************************************************************
+* void bmw224(void* dest, const void* msg, uint32_t length_b){
+* bmw_small_ctx_t ctx;
+* bmw224_init(&ctx);
+* while(length_b>=BMW_SMALL_BLOCKSIZE){
+* bmw_small_nextBlock(&ctx, msg);
+* length_b -= BMW_SMALL_BLOCKSIZE;
+* msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B;
+* }
+* bmw_small_lastBlock(&ctx, msg, length_b);
+* bmw224_ctx2hash(dest, &ctx);
+* }
+*
+* param dest: r24:r25
+* param msg: r22:r23
+* param length_b: r18:r21
+*/
+/* register aliases used by bmw_small_all */
+ctx0 = 2
+ctx1 = 3
+msg0 = 4
+msg1 = 5
+len0 = 6
+len1 = 7
+len2 = 8
+len3 = 9
+dst0 = 10
+dst1 = 11
+.global bmw224
+bmw224:
+ push r16 ; popped again at the end of bmw_small_all
+ clr r16 ; r16 = 0 selects the 224 bit entries of init_lut and c2h_lut
+
+/*
+ bmw_small_all: common body of bmw224 and bmw256.
+ in: r24:r25 dest, r22:r23 msg, r18..r21 length in bits (r18 = low byte),
+     r16 variant (0 = 224 bit, 1 = 256 bit), the caller has pushed r16
+ A 68 byte context is taken from the stack through stack_alloc_large and
+ released with stack_free_large (macros from avr-asm-macros.S; the code
+ below relies on Z pointing one byte below the context afterwards).
+*/
+bmw_small_all:
+ push_range 2, 11
+ stack_alloc_large 64+4
+ adiw r30, 1
+ movw ctx0, r30
+ movw dst0, r24
+ movw msg0, r22
+ movw len0, r18
+ movw len2, r20
+ movw r24, ctx0
+ ldi r30, pm_lo8(init_lut) ; word address of the jump table for icall
+ ldi r31, pm_hi8(init_lut)
+ add r30, r16
+ adc r31, r1
+ icall
+20:
+ /* process blocks here only while the length does not fit into 16 bits;
+    bmw_small_lastBlock handles all remaining complete blocks itself */
+ mov r18, len2
+ or r18, len3
+ breq 50f
+ movw r24, ctx0
+ movw r22, msg0
+ rcall bmw_small_nextBlock
+ ldi r20, 2 ; length -= 512 (0x200), the low byte is not affected
+ sub len1, r20
+ sbc len2, r1
+ sbc len3, r1
+ ldi r20, 64
+ add msg0, r20
+ adc msg1, r1
+ rjmp 20b
+50:
+ movw r24, ctx0
+ movw r22, msg0
+ movw r20, len0
+ rcall bmw_small_lastBlock
+ movw r24, dst0
+ movw r22, ctx0
+ ldi r30, pm_lo8(c2h_lut)
+ ldi r31, pm_hi8(c2h_lut)
+ add r30, r16
+ adc r31, r1
+ icall
+ stack_free_large 64+4
+ pop_range 2, 11
+ pop r16
+ ret
+
+/* jump tables for bmw_small_all, entered with icall and indexed by r16
+   (0 = 224 bit variant, 1 = 256 bit variant); the target returns directly
+   to the code behind the icall */
+init_lut:
+ rjmp bmw224_init
+ rjmp bmw256_init
+c2h_lut:
+ rjmp bmw224_ctx2hash
+ rjmp bmw256_ctx2hash
+
+/*******************************************************************************
+* void bmw224_init(bmw224_ctx_t* ctx){
+* uint8_t i;
+* ctx->h[0] = 0x00010203;
+* for(i=1; i<16; ++i){
+* ctx->h[i] = ctx->h[i-1]+ 0x04040404;
+* }
+* ctx->counter=0;
+* }
+*
+* param ctx: r24:r25
+*/
+.global bmw224_init
+bmw224_init:
+ movw r26, r24 ; X = ctx
+ ldi r22, 0x03 ; r22..r25 = 0x00010203, low byte first
+ ldi r23, 0x02
+ ldi r24, 0x01
+ ldi r25, 0x00
+/*
+ bmw_small_init: shared tail of bmw224_init and bmw256_init.
+ in: X = ctx, r22:r23:r24:r25 = value for h[0]
+ Writes h[0], then h[i] = h[i-1] + 0x04040404 for i = 1..15 and finally
+ four zero bytes for the counter.
+ clobbers: r18, r20, r22..r27, SREG flags
+*/
+bmw_small_init:
+ st X+, r22
+ st X+, r23
+ st X+, r24
+ st X+, r25
+ ldi r18, 16-1
+ ldi r20, 0x04
+1:
+ add r22, r20
+ adc r23, r20
+ adc r24, r20
+ adc r25, r20
+ st X+, r22
+ st X+, r23
+ st X+, r24
+ st X+, r25
+ dec r18
+ brne 1b
+ st X+, r1 ; counter = 0
+ st X+, r1
+ st X+, r1
+ st X+, r1
+ ret
+
+/*
+ bmw256_init: initialises ctx for the 256 bit variant.
+ Sets up X = ctx and 0x40414243 as value for h[0], the rest is done by
+ bmw_small_init.
+ param ctx: r24:r25
+*/
+.global bmw256_init
+bmw256_init:
+ ldi r22, 0x43 ; least significant byte of h[0]
+ ldi r23, 0x42
+ movw r26, r24 ; X = ctx, must be copied before r24:r25 are overwritten
+ ldi r25, 0x40 ; most significant byte of h[0]
+ ldi r24, 0x41
+ rjmp bmw_small_init
+
+
+/******************************************************************************/
+
+#if DEBUG
+
+/*
+ printQ (debug only): prints 32 words of 4 bytes each, starting at the
+ address in r24:r25, one per line, prefixed with the word index.
+ Output goes through cli_putstr_P, cli_hexdump_byte and cli_hexdump_rev,
+ which are defined outside of this file.
+ r20..r25 are wrapped in push_range/pop_range.
+ NOTE(review): r8, r9, r16 and r17 are modified here but not saved.
+*/
+printQ:
+ push_range 20, 25
+ ldi r16, 4
+ mov r9, r16 ; r9 = size of one word in bytes
+ movw r16, r24 ; r16:r17 = address of the current word
+ ldi r24, lo8(qdbg_str)
+ ldi r25, hi8(qdbg_str)
+ call cli_putstr_P
+ clr r8 ; r8 = word index
+10: ldi r24, lo8(qdbg_str1)
+ ldi r25, hi8(qdbg_str1)
+ call cli_putstr_P
+ mov r24, r8
+ call cli_hexdump_byte
+ ldi r24, lo8(qdbg_str2)
+ ldi r25, hi8(qdbg_str2)
+ call cli_putstr_P
+ movw r24, r16
+ clr r23
+ ldi r22, 4
+ call cli_hexdump_rev
+ add r16, r9
+ adc r17, r1
+ inc r8
+ sbrs r8, 5 ; stop when the index has reached 32
+ rjmp 10b
+ pop_range 20, 25
+ ret
+qdbg_str: .asciz "\r\nDBG Q: "
+qdbg_str1: .asciz "\r\n Q["
+qdbg_str2: .asciz "] = "
+
+
+/*
+ printX (debug only): prints 16 words of 4 bytes each, starting at the
+ address in r24:r25, one per line; the character in r22 is used as name
+ of the array in the output.
+ Output goes through cli_putstr_P, cli_putc, cli_hexdump_byte and
+ cli_hexdump_rev, which are defined outside of this file.
+ r6..r9, r16..r27 and r30:r31 are wrapped in push_range/pop_range.
+*/
+printX:
+ push_range 6, 9
+ push_range 16, 27
+ push_range 30, 31
+ ldi r16, 4
+ mov r6, r22 ; r6 = name character
+ mov r9, r16 ; r9 = size of one word in bytes
+ movw r16, r24 ; r16:r17 = address of the current word
+ ldi r24, lo8(Xdbg_str)
+ ldi r25, hi8(Xdbg_str)
+ call cli_putstr_P
+ mov r24, r6
+ call cli_putc
+ ldi r24, ':'
+ call cli_putc
+ clr r8 ; r8 = word index
+10: ldi r24, lo8(Xdbg_str1)
+ ldi r25, hi8(Xdbg_str1)
+ call cli_putstr_P
+ mov r24, r6
+ call cli_putc
+ ldi r24, '['
+ call cli_putc
+ mov r24, r8
+ call cli_hexdump_byte
+ ldi r24, lo8(Xdbg_str2)
+ ldi r25, hi8(Xdbg_str2)
+ call cli_putstr_P
+ movw r24, r16
+ clr r23
+ ldi r22, 4
+ call cli_hexdump_rev
+ add r16, r9
+ adc r17, r1
+ inc r8
+ sbrs r8, 4 ; stop when the index has reached 16
+ rjmp 10b
+ pop_range 30, 31
+ pop_range 16, 27
+ pop_range 6, 9
+ ret
+Xdbg_str: .asciz "\r\nDBG "
+Xdbg_str1: .asciz "\r\n "
+Xdbg_str2: .asciz "] = "
+
+/*
+ print32 (debug only): prints the 32-bit value in r22:r23:r24:r25 as four
+ hex bytes, starting with r25, after the Xdbg_str prefix.
+ Output goes through cli_putstr_P and cli_hexdump_byte, which are defined
+ outside of this file.
+ r6..r9, r16..r27 and r30:r31 are wrapped in push_range/pop_range.
+*/
+print32:
+ push_range 6, 9
+ push_range 16, 27
+ push_range 30, 31
+ movw r6, r22 ; keep the value in r6..r9 across the calls
+ movw r8, r24
+ ldi r24, lo8(Xdbg_str)
+ ldi r25, hi8(Xdbg_str)
+ call cli_putstr_P
+ mov r24, r9
+ call cli_hexdump_byte
+ mov r24, r8
+ call cli_hexdump_byte
+ mov r24, r7
+ call cli_hexdump_byte
+ mov r24, r6
+ call cli_hexdump_byte
+ pop_range 30, 31
+ pop_range 16, 27
+ pop_range 6, 9
+ ret
+
+
+/*
+ print_acc (debug only): prints the accumulator as four hex bytes in the
+ order r9, r8, r15, r14 (acc3 down to acc0) after the Xdbg_str prefix.
+ Output goes through cli_putstr_P and cli_hexdump_byte, which are defined
+ outside of this file.
+ r16..r27 and r30:r31 are wrapped in push_range/pop_range.
+*/
+print_acc:
+ push_range 16, 27
+ push_range 30, 31
+ ldi r24, lo8(Xdbg_str)
+ ldi r25, hi8(Xdbg_str)
+ call cli_putstr_P
+ mov r24, r9
+ call cli_hexdump_byte
+ mov r24, r8
+ call cli_hexdump_byte
+ mov r24, r15
+ call cli_hexdump_byte
+ mov r24, r14
+ call cli_hexdump_byte
+ pop_range 30, 31
+ pop_range 16, 27
+ ret
+
+#endif
+