--- /dev/null
+/* bmw_small-tinyasm.S */
+/*
+ This file is part of the AVR-Crypto-Lib.
+ Copyright (C) 2009 Daniel Otte (daniel.otte@rub.de)
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ * File: bmw_small-tinyasm.S
+ * Author: Daniel Otte
+ * Date: 2010-03-28
+ * License: GPLv3 or later
+ * Description: implementation of BlueMidnightWish
+ *
+ */
+
+#include "avr-asm-macros.S"
+
+acc2 = 8 /* accumulator byte 2 -> r8 */
+acc3 = 9 /* accumulator byte 3 (most significant) -> r9 */
+acc0 = 14 /* accumulator byte 0 (least significant) -> r14 */
+acc1 = 15 /* accumulator byte 1 -> r15 */
+
+#define DEBUG 0 /* set non-zero to assemble the print helpers guarded by "#if DEBUG" */
+
+/******************************************************************************/
+/*
+ shiftleft32: logical left shift of a 32-bit value.
+ param a: r22:r23:r24:r25 (r22 = least significant byte), shifted in place
+ param s: r20 (shift count; consumed)
+ Whole-byte steps are done here by register moves; the remaining s < 8 bits
+ are shifted in the loop at bitrotateleft_1 with r0 = 0, so zero bits enter
+ at the bottom. Clobbers r0 and r20.
+*/
+shiftleft32:
+ clr r0 /* r0 supplies the bits shifted in: all zero */
+ cpi r20, 8
+ brlo bitrotateleft_1 /* fewer than 8 bits left: finish in the bit loop */
+ mov r25, r24 /* shift by one whole byte */
+ mov r24, r23
+ mov r23, r22
+ clr r22
+ subi r20, 8
+ rjmp shiftleft32
+
+/******************************************************************************/
+/*
+ shiftright32: logical right shift of a 32-bit value.
+ param a: r22:r23:r24:r25 (r22 = least significant byte), shifted in place
+ param s: r20 (shift count; consumed, 0 on return)
+*/
+shiftright32:
+ cpi r20, 8
+ brlo bitshiftright /* fewer than 8 bits left */
+ mov r22, r23 /* shift by one whole byte */
+ mov r23, r24
+ mov r24, r25
+ clr r25
+ subi r20, 8
+ rjmp shiftright32
+bitshiftright:
+ tst r20
+ breq 20f /* nothing left to shift */
+10: lsr r25 /* one bit per pass, a zero enters at the top */
+ ror r24
+ ror r23
+ ror r22
+ dec r20
+ brne 10b
+20: ret
+
+/******************************************************************************/
+/*
+ rotateleft32: rotate a 32-bit value left.
+ param a: r22:r23:r24:r25 (r22 = least significant byte), rotated in place
+ param s: r20 (rotate count; consumed)
+ Clobbers r0.
+ Additional entry points:
+  bitrotateleft_1 - bit loop with r0 preset by the caller (shiftleft32
+                    enters here with r0 = 0, turning the rotate into a shift);
+  rol32           - shifts r22..r25 left by one bit, taking the current carry
+                    flag as the incoming bit, then decrements r20 and loops
+                    back to 10: while r20 is non-zero.
+*/
+rotateleft32:
+ cpi r20, 8
+ brlo bitrotateleft
+ mov r0, r25 /* rotate by one whole byte; r0 carries the top byte around */
+ mov r25, r24
+ mov r24, r23
+ mov r23, r22
+ mov r22, r0
+ subi r20, 8
+ rjmp rotateleft32
+bitrotateleft:
+ mov r0, r25 /* copy of the top byte; its bits re-enter at bit 0 */
+bitrotateleft_1:
+ tst r20
+ breq 20f
+10:
+ lsl r0 /* carry = next bit to shift in */
+rol32:
+ rol r22
+ rol r23
+ rol r24
+ rol r25
+ dec r20
+ brne 10b
+20: ret
+
+
+/******************************************************************************/
+
+sn_stub: /* acc ^= rotl32(r2:r3:r4:r5, n), n being the next program-memory
+ byte at Z; Z advances by one. Falls through into eor32_to_acc. */
+ movw r22, r2 /* fetch the saved input value */
+ movw r24, r4
+ lpm r20, Z+ /* rotate count */
+ rcall rotateleft32
+eor32_to_acc: /* acc0..acc3 ^= r22:r23:r24:r25 */
+ eor acc0, r22
+ eor acc1, r23
+ eor acc2, r24
+ eor acc3, r25
+ ret
+
+s_table: /* parameters read by sn, four bytes per entry: shiftright32 count,
+ shiftleft32 count, then two rotateleft32 counts */
+s0: .byte 1, 3, 4,19
+s1: .byte 1, 2, 8,23
+s2: .byte 2, 1,12,25
+s3: .byte 2, 2,15,29
+s4: .byte 1, 0, 0, 0 /* zero counts leave x unchanged, so this is (x>>1) ^ x ^ x ^ x */
+s5: .byte 2, 0, 0, 0
+
+h0 = 10 /* register aliases: h0/h1 -> r10/r11 */
+h1 = 11
+m0 = 12 /* m0/m1 -> r12/r13 */
+m1 = 13
+
+/*
+ sn: applies entry s of s_table (bytes t[0..3]) to x:
+   result = (x >> t[0]) ^ (x << t[1]) ^ rotl(x, t[2]) ^ rotl(x, t[3])
+ param x: r22:r23:r24:25
+ param s: r20 (table entry index)
+ result: r22:r23:r24:r25
+ r2..r5 and the acc registers are saved and restored; r0, r20, r30 and r31
+ are clobbered. Relies on r1 being zero.
+*/
+sn:
+ push_range 2, 5 /* r2..r5 hold the unmodified input below; restored via pop5 */
+ push acc0
+ push acc1
+ push acc2
+ push acc3
+ ldi r30, lo8(s_table)
+ ldi r31, hi8(s_table)
+ lsl r20 /* entry offset = s * 4 */
+ lsl r20
+ add r30, r20
+ adc r31, r1 /* r1 is expected to be zero */
+ movw r2, r22 /* keep a copy of x */
+ movw r4, r24
+ lpm r20, Z+ /* right shift count */
+ rcall shiftright32
+ rcall mov32_to_acc /* acc = x >> t[0] */
+;---
+ movw r22, r2
+ movw r24, r4
+ lpm r20, Z+ /* left shift count */
+ rcall shiftleft32
+ rcall eor32_to_acc /* acc ^= x << t[1] */
+;---
+ rcall sn_stub /* acc ^= rotl(x, t[2]) */
+ rcall sn_stub /* acc ^= rotl(x, t[3]) */
+
+ movw r22, acc0 /* result goes back to r22..r25 */
+ movw r24, acc2
+ pop acc3
+ pop acc2
+ pop acc1
+ pop acc0
+ rjmp pop5 /* shared epilogue: restores r2..r5 and returns */
+
+/******************************************************************************/
+/*
+ memxor_64: XOR the 64 bytes at Z into the 64 bytes at X.
+ memxor:    XOR r20 bytes at Z into the bytes at X. The count is decremented
+            before it is tested, so r20 = 0 processes 256 bytes.
+ param dest: r26:r27 (X)
+ param src: r30:r31 (Z)
+ param len: r20 (memxor only; memxor_64 overwrites it with 64)
+ On return X and Z point past the processed bytes and r20 is 0.
+ Clobbers r21 and r22.
+*/
+memxor_64:
+; tst r20
+; breq memxor_exit
+ ldi r20, 64
+memxor:
+10: ld r21, X
+ ld r22, Z+
+ eor r21, r22
+ st X+, r21 /* write back and advance the destination */
+ dec r20
+ brne 10b
+memxor_exit:
+ ret
+
+/******************************************************************************/
+q0 = 2 /* register aliases: q0/q1 -> r2/r3 */
+q1 = 3
+h0 = 4 /* h0/h1 -> r4/r5 (replaces the earlier h0 = 10 for code below) */
+h1 = 5
+m0 = 6 /* m0/m1 -> r6/r7 */
+m1 = 7
+
+
+/******************************************************************************/
+load32_from_X: /* r22:r23:r24:r25 = four bytes at X (lowest address into r22); X += 4 */
+ ld r22, X+
+ ld r23, X+
+ ld r24, X+
+ ld r25, X+
+ ret
+
+load32_from_Y: /* r22:r23:r24:r25 = four bytes at Y (lowest address into r22); Y += 4 */
+ ld r22, Y+
+ ld r23, Y+
+ ld r24, Y+
+ ld r25, Y+
+ ret
+
+store32_to_Y: /* store r22:r23:r24:r25 to the four bytes at Y (r22 first); Y += 4 */
+ st Y+, r22
+ st Y+, r23
+ st Y+, r24
+ st Y+, r25
+ ret
+
+add_X_to_32: /* r22:r23:r24:r25 += 32-bit word at X; X += 4; clobbers r0 */
+ ld r0, X+
+ add r22, r0
+ ld r0, X+ /* ld leaves the carry flag untouched, so the carry chain stays intact */
+ adc r23, r0
+ ld r0, X+
+ adc r24, r0
+ ld r0, X+
+ adc r25, r0
+ ret
+
+store32_to_X: /* store r22:r23:r24:r25 to the four bytes at X (r22 first); X += 4 */
+ st X+, r22
+ st X+, r23
+ st X+, r24
+ st X+, r25
+ ret
+
+mov32_to_acc: /* acc0..acc3 = r22:r23:r24:r25 */
+ movw acc0, r22
+ movw acc2, r24
+ ret
+
+/******************************************************************************/
+/*
+ param q: r28:r29 (Y)
+ param h: r26:r27 (X)
+ param m: r30:r31 (Z)
+
+ Constant tables kept in program memory and read with lpm.
+ f2_1_shift_table: one byte per pass of the first f2 loop while its counter
+   r17 runs from 16 down to 9. The low nibble is the shift count applied to
+   XH, the high nibble the shift count applied to the word loaded from Y.
+   XH is shifted left and the loaded word right when r17 is 16 or 11;
+   otherwise XH is shifted right and the loaded word left.
+ f2_2_shift_table: one byte per pass of the second f2 loop while r17 runs
+   from 7 down to 0. Bit 0 selects the direction (1 = left, 0 = right) and
+   the remaining bits (value >> 1) are the shift count applied to XL.
+ expand2_rot_table: rotate-left counts consumed by expand2 on every pass
+   where its counter r19 is odd.
+ f0_hacktable: five 3-byte records, read by f0 from the last record to the
+   first. Bytes 0 and 1 form a 16-bit mask (byte 0 is the high half) that is
+   consumed MSB first, one bit per word: 1 = subtract the word, 0 = add it.
+   Byte 2 is the byte offset (word index * 4) of the first word used.
+*/
+
+f2_1_shift_table:
+; .byte 0x2B, 0x64, 0x66, 0x03, 0x51, 0x55, 0x87, 0x55
+ .byte 0x55, 0x87, 0x55, 0x51, 0x03, 0x66, 0x64, 0x2B
+f2_2_shift_table:
+; .byte (2<<1), (7<<1), (4<<1), (3<<1), (4<<1)+1, (6<<1)+1, (6<<1)
+ .byte (8<<1)+1, (6<<1), (6<<1)+1, (4<<1)+1, (3<<1), (4<<1), (7<<1), (2<<1)
+expand2_rot_table:
+ .byte 3,7,13,16,19,23,27
+
+f0_hacktable:
+ .byte 0x03, 0x11, 5*4
+ .byte 0xDD, 0xB3, 7*4
+ .byte 0x2A, 0x79, 10*4
+ .byte 0x07, 0xAA, 13*4
+ .byte 0x51, 0xC2, 14*4
+
+
+/*******************************************************************************
+* uint32_t addelment(uint8_t j, const uint32_t* m, const uint32_t* h){
+* uint32_t r;
+* r = pgm_read_dword(k_lut+j);
+* r += rotl_addel(((uint32_t*)m)[j&0xf], j+0);
+* r += rotl_addel(((uint32_t*)m)[(j+3)&0xf], j+3);
+* r -= rotl_addel(((uint32_t*)m)[(j+10)&0xf], j+10);
+* r ^= ((uint32_t*)h)[(j+7)&0xf];
+* return r;
+* }
+* param j: r24
+* param m: r22:r23
+* param h: r20:r21
+*/
+j = 16 /* r16: element index used by expand_intro and load_rotate_add_M */
+acc2 = 8
+acc3 = 9
+h0 = 10 /* r10:r11: pointer to h, set in expand_intro */
+h1 = 11
+m0 = 12 /* r12:r13: pointer to m, set in expand_intro */
+m1 = 13
+acc0 = 14
+acc1 = 15
+
+load_acc_from_X: /* acc0..acc3 = four bytes at X (lowest address into acc0); X += 4 */
+ ld acc0, X+
+ ld acc1, X+
+ ld acc2, X+
+ ld acc3, X+
+ ret
+
+add_acc_to_X: /* 32-bit word at X += acc0..acc3 (in memory); X += 4; clobbers r0 */
+ ld r0, X
+ add r0, acc0
+ st X+, r0
+ ld r0, X /* ld/st leave the carry flag untouched between the adc steps */
+ adc r0, acc1
+ st X+, r0
+ ld r0, X
+ adc r0, acc2
+ st X+, r0
+ ld r0, X
+ adc r0, acc3
+ st X+, r0
+ ret
+
+load_rotate_add_M: /* Adds (T flag clear) or subtracts (T flag set)
+ rotl32(m[j & 15], (j & 15) + 1) to/from acc, where m is the word array at
+ m0 and j the index register. Clobbers r0, r20, r22..r25 and X; expects
+ r1 to be zero. */
+ mov r20, j
+ andi r20, 0x0f /* word index modulo 16 */
+ mov r0, r20
+ lsl r0 /* byte offset = index * 4 */
+ lsl r0
+ movw r26, m0
+ add r26, r0
+ adc r27, r1
+ rcall load32_from_X
+ inc r20 /* rotate count = index + 1 */
+ rcall rotateleft32
+ brts 10f /* T set: subtract instead of add */
+ rjmp add32_to_acc /* tail call; its ret returns to our caller */
+; ret
+10: sub acc0, r22
+ sbc acc1, r23
+ sbc acc2, r24
+ sbc acc3, r25
+ ret
+
+
+;---
+
+/******************************************************************************/
+load_sn_add: /* acc += sn(word at X) using s_table entry r20; X += 4.
+ Falls through into add32_to_acc. */
+ rcall load32_from_X
+ rcall sn
+add32_to_acc: /* acc0..acc3 += r22:r23:r24:r25 */
+ add acc0, r22
+ adc acc1, r23
+ adc acc2, r24
+ adc acc3, r25
+ ret
+
+/*
+ expand_intro / addelement: start a new expansion value in acc.
+ param q: r26:r27
+ param m: r22:r23
+ param h: r20:r21
+ param j: r24
+ The 32-bit word stored directly below q (at q-4) serves as a running
+ constant: every call adds 0x05555555 to it and writes it back. acc becomes
+   (constant + rotl(m[j&15], (j&15)+1) + rotl(m[(j+3)&15], ((j+3)&15)+1)
+             - rotl(m[(j+10)&15], ((j+10)&15)+1)) ^ h[(j+7)&15]
+ On return X = q + 4*j (assuming pop_range restores what push_range saved).
+ Clobbers r0, r16 (j), r20, r22..r25 and r10..r13 (h0/h1, m0/m1);
+ expects r1 to be zero.
+*/
+
+expand_intro:
+ push_range 26, 27
+ push r24
+addelement:
+ mov j, r24
+ movw h0, r20
+ movw m0, r22
+ sbiw r26, 4 /* X = q - 4: the running constant */
+ rcall load_acc_from_X
+ ldi r24, 0x55 /* acc += 0x05555555 */
+ add acc0, r24
+ adc acc1, r24
+ adc acc2, r24
+ ldi r24, 5
+ adc acc3, r24
+ rcall store_acc_to_dec_X /* write the updated constant back to q-4 */
+ adiw r26, 4
+ clt /* T = 0: the next two terms are added */
+ rcall load_rotate_add_M
+ subi j, -3 /* j + 3 */
+ rcall load_rotate_add_M
+ set /* T = 1: the third term is subtracted */
+ subi j, -7 /* j + 10 */
+ rcall load_rotate_add_M
+ lsl j /* turn the index into a byte offset ... */
+ lsl j
+ subi j, -7*4+10*4 /* ... and step back from (j+10)*4 to (j+7)*4 */
+ andi j, 0x3f /* wrap inside the 64 bytes of h */
+ movw r26, h0
+ add r26, j
+ adc r27, r1
+ rcall load32_from_X
+ rcall eor32_to_acc /* acc ^= h[(j+7) & 15] */
+;--
+ pop r24
+ pop_range 26, 27
+ lsl r24 /* X = q + 4*j */
+ lsl r24
+ add r26, r24
+ adc r27, r1
+ ret
+expand1: /* q[j+16] = expand_intro value + sum for i = 0..15 of sn(q[j+i])
+ with s_table entry (i+1) & 3. Takes the same parameters as expand_intro. */
+ rcall expand_intro
+ ldi r19, 1
+10:
+ mov r20, r19
+ andi r20, 3 /* s_table entry for this word: 1, 2, 3, 0, 1, ... */
+ rcall load_sn_add
+ inc r19
+ cpi r19, 17 /* 16 words in total */
+ brne 10b
+ rjmp expand2_exit /* store acc right behind the 16 words just read */
+
+
+/******************************************************************************/
+/*
+ expand2: second expansion variant; the result is stored at q[j+16].
+ param q: r26:r27
+ param m: r22:r23
+ param h: r20:r21
+ param j: r24
+ Adds the 14 words q[j]..q[j+13] to the expand_intro value, rotating every
+ second one (q[j+1], q[j+3], ...) left by the next expand2_rot_table byte,
+ then adds sn(q[j+14]) with s_table entry 4 and sn(q[j+15]) with entry 5.
+ expand2_exit advances X by 4 and falls into store_acc_to_dec_X, which
+ stores acc0..acc3 into the four bytes below X and leaves X at that word.
+*/
+
+
+expand2:
+ rcall expand_intro
+ ldi r19, 14
+ ldi r30, lo8(expand2_rot_table)
+ ldi r31, hi8(expand2_rot_table)
+10:
+ rcall load32_from_X
+ sbrs r19, 0 /* rotate only on passes where the counter is odd */
+ rjmp 12f
+ lpm r20, Z+
+ rcall rotateleft32
+12: rcall add32_to_acc
+ dec r19
+ brne 10b
+ ldi r20, 4
+ rcall load_sn_add
+ ldi r20, 5
+ rcall load_sn_add
+expand2_exit:
+ adiw r26, 4 /* X now points just past q[j+16] */
+store_acc_to_dec_X:
+ st -X, acc3
+ st -X, acc2
+ st -X, acc1
+ st -X, acc0
+ ret
+
+/******************************************************************************/
+/*
+ param q: r24:r25
+ param m: r22:r23
+ param h: r20:r21
+*/
+/* for calling expand1/2
+ param q: r26:r27
+ param m: r22:r23
+ param h: r20:r21
+ param j: r24
+*/
+
+/******************************************************************************/
+/*
+ param q: r24:r25
+ param m: r22:r23
+ param h: r20:r21
+*/
+
+/******************************************************************************/
+/*
+ param ctx: r24:r25
+ param msg: r22:r23
+*/
+/* f0
+ param q: r28:r29 (Y)
+ param h: r26:r27 (X)
+ param m: r30:r31 (Z)
+*/
+/* f1
+ param q: r24:r25
+ param m: r22:r23
+ param h: r20:r21
+*/
+/* f2
+ param q: r24:r25
+ param m: r22:r23
+ param h: r20:r21
+*/
+q0 = 2 /* r2:r3 - pointer to q inside nextBlock */
+q1 = 3
+h0 = 4 /* r4:r5 - pointer to h inside f0 */
+h1 = 5
+m0 = 6 /* r6:r7 - pointer to m inside f0 */
+m1 = 7
+ctx0 = 2 /* r2:r3 - ctx argument of bmw_small_nextBlock_early */
+ctx1 = 3
+msg0 = 4 /* r4:r5 - block argument of bmw_small_nextBlock_early */
+msg1 = 5
+
+restore_f1: /* reload the expand1/expand2 arguments saved by f1:
+ X = r2:r3 (q), r22:r23 = r4:r5 (m), r20:r21 = r6:r7 (h) */
+ movw r26, r2
+ movw r22, r4
+ movw r20, r6
+ ret
+/*
+ bmw_small_nextBlock_early: internal entry that takes ctx from r2:r3 (ctx0)
+ and the block pointer from r4:r5 (msg0), then falls into the public entry.
+*/
+bmw_small_nextBlock_early:
+ movw r24, ctx0
+ movw r22, msg0
+/*
+ bmw_small_nextBlock / bmw224_nextBlock / bmw256_nextBlock
+ param ctx: r24:r25 (64 bytes of h followed by a 32-bit block counter)
+ param msg: r22:r23 (64-byte message block)
+ The three labels name the same code; all of them are exported so that any
+ of the aliases can be linked against.
+ This part saves r2..r17 and r28:r29, reserves q[0..31] on the stack,
+ increments the block counter and sets up X = h, Y = q, Z = m for f0.
+*/
+.global bmw_small_nextBlock
+.global bmw224_nextBlock
+.global bmw256_nextBlock
+bmw_small_nextBlock:
+bmw224_nextBlock:
+bmw256_nextBlock:
+ push_range 2, 7
+ push_range 28, 29
+ push_range 8, 17
+ stack_alloc_large 32*4, r28, r29 /* room for q[0..31]; q starts at Y+1 (see adiw below) */
+ /* Push the bytes 4f ff ff fb so that the little-endian word 0x4ffffffb
+    sits directly below q[0]; expand_intro adds 0x05555555 to it per call. */
+ ldi r16, 0x4f
+ push r16
+ ldi r16, 0xff
+ push r16
+ push r16
+ ldi r16, 0xfb
+ push r16
+ adiw r28, 1 /* Y = &q[0] */
+; push_range 28, 29 /* push Q */
+; push_range 22, 25 /* push M & H */
+ /* increment counter */
+ movw r26, r24
+ adiw r26, 63
+ adiw r26, 1 /* X = ctx + 64: the counter behind h[0..15] */
+ rcall load_acc_from_X
+ ldi r19, 1
+ add acc0, r19
+ adc acc1, r1
+ adc acc2, r1
+ adc acc3, r1
+ rcall store_acc_to_dec_X /* X is ctx + 68 here, so this rewrites the counter */
+ /* call f0 */
+ movw r30, r22
+ movw r26, r24
+/*
+ f0 stage of the compression function.
+ In:  X = h (ctx), Y = q (stack buffer), Z = m (message block).
+ Steps performed below:
+  1. h ^= m (64 bytes);
+  2. q[0..15] = 0;
+  3. for each f0_hacktable record (last to first) and i = 0..15:
+     q[i] += or -= the word at h + ((start + 4*i) & 0x3f), the sign being
+     taken from the record's 16-bit mask, MSB first (1 = subtract);
+  4. h ^= m again, which restores h;
+  5. q[i] = sn(q[i], i mod 5) + h[i+1] for i = 0..14 and
+     q[15] = sn(q[15], 0) + h[0].
+ Out: r20:r21 = h, r22:r23 = m, r2:r3 = Y = q, ready for f1.
+ Expects r1 to be zero.
+*/
+f0:
+ movw h0, r26
+ movw q0, r28
+ movw m0, r30
+ /* xor m into h */
+; ldi r20, 64
+ rcall memxor_64
+
+ /* set q to zero */
+ ldi r22, 64
+10: st Y+, r1
+ dec r22
+ brne 10b
+ movw r28, q0
+ /* calculate W and store it in Q */
+ ldi r19, 5 /* record counter: 5 down to 1 */
+30:
+ ldi r18, 16 /* words per record */
+ /* load initial index */
+
+ /* load values from hacktable */
+ ldi r30, lo8(f0_hacktable-3) /* -3 because r19 is one-based */
+ ldi r31, hi8(f0_hacktable-3)
+ mov r16, r19
+ lsl r16
+ add r16, r19 /* r16 = 3 * r19 */
+ add r30, r16
+ adc r31, r1
+ lpm r21, Z+ /* sign mask, high half */
+ lpm r20, Z+ /* sign mask, low half */
+ lpm r16, Z+ /* byte offset of the first word */
+40:
+ ;call add_hx_to_w
+add_hx_to_w:
+ movw r26, h0
+ add r26, r16
+ adc r27, r1
+ rcall load32_from_Y
+ sbiw r28, 4 /* stay on the same q word for the store below */
+ lsl r20 /* next mask bit into carry */
+ rol r21
+ brcs 300f
+ /* addition */
+ rcall add_X_to_32
+ rjmp 500f
+300: /* subtract */
+ rcall load_acc_from_X
+ sub r22, acc0
+ sbc r23, acc1
+ sbc r24, acc2
+ sbc r25, acc3
+
+500:
+ rcall store32_to_Y
+ subi r16, -4 /* next word, wrapping inside the 64 bytes */
+ andi r16, 0x0f<<2
+ dec r18
+ brne 40b
+ movw r28, q0
+ dec r19
+ brne 30b
+ /* xor m into h */
+; ldi r20, 64
+ movw r26, h0
+ movw r30, m0
+ rcall memxor_64
+ sbiw r26, 60 /* X = h + 4, i.e. h[1] */
+;---
+ clr r17 /* s_table entry, cycles 0..4 */
+ ldi r21, 15
+ mov r8, r21 /* loop counter for q[0..14] */
+50:
+ rcall load32_from_Y
+ sbiw r28, 4
+ mov r20, r17
+ rcall sn
+ inc r17
+ cpi r17, 5
+ brne 52f
+ clr r17
+52:
+ rcall add_X_to_32
+ rcall store32_to_Y
+
+ dec r8
+ brne 50b
+;---
+ rcall load32_from_Y /* q[15] */
+ clr r20
+ rcall sn
+ movw r26, h0
+ rcall add_X_to_32 /* + h[0] */
+ sbiw r28, 4
+ rcall store32_to_Y
+ sbiw r28, 4 /* Y back from q + 64 ... */
+ sbiw r28, 15*4 /* ... to q */
+ movw r20, h0
+ movw r22, m0
+
+ /* call f1*/
+ movw r2, r28
+f1: /* f1 stage. In: r2:r3 = q, r22:r23 = m, r20:r21 = h. Runs expand1 for
+ j = 0 and 1 and expand2 for j = 2..15. Out: r24:r25 = q, with r20:r21 = h
+ and r22:r23 = m reloaded by restore_f1. */
+ movw r4, r22 /* keep m and h where restore_f1 expects them */
+ movw r6, r20
+ movw r26, r2
+ clr r24
+ rcall expand1
+ rcall restore_f1
+ ldi r24, 1
+ rcall expand1
+ ldi r17, 2
+10: rcall restore_f1
+ mov r24, r17
+ rcall expand2
+ inc r17
+ sbrs r17, 4 /* leave the loop once r17 reaches 16 */
+ rjmp 10b
+ rcall restore_f1
+ movw r24, r2
+
+
+ /* call f2 */
+; pop_range 20, 25
+; push_range 20, 25
+; rcall printQ
+; push r20
+; push r21
+acc2 = 8
+acc3 = 9
+acc0 = 14
+acc1 = 15
+xl0 = 2 /* XL lives in r2:r3:r4:r5 */
+xl1 = 3
+xl2 = 4
+xl3 = 5
+xh0 = 6 /* XH lives in r6:r7:r10:r11 */
+xh1 = 7
+xh2 = 10
+xh3 = 11
+q16_0 = 12 /* r12:r13 - pointer to q[16] */
+q16_1 = 13
+h0 = 18 /* r18:r19 - pointer to h inside f2 */
+h1 = 19
+f2: /* f2 stage. In: r24:r25 = q, r22:r23 = m, r20:r21 = h.
+ Computes XL = q[16]^..^q[23] and XH = q[16]^..^q[31], copies m over h and
+ then folds q, XL and XH into h in the three loops below. r1 is used as a
+ scratch register inside the first loop and cleared again afterwards. */
+ movw r26, r24
+ /* calc XL & XH */
+ adiw r26, 63
+ adiw r26, 1 /* X = q + 64 = &q[16] */
+ movw q16_0, r26
+ movw h0, r20
+;---
+; push h0
+; push h1
+;---
+ movw r28, r22 /* Y = m */
+ rcall load_acc_from_X /* acc = q[16] */
+ ldi r17, 15
+10: rcall load32_from_X
+ rcall eor32_to_acc
+ cpi r17, 9 /* seventh pass: q[16]..q[23] are folded in */
+ brne 15f
+ movw xl0, acc0
+ movw xl2, acc2
+15:
+ dec r17
+ brne 10b
+ movw xh0, acc0 /* all of q[16]..q[31] folded in */
+ movw xh2, acc2
+;--- DBG
+; push_range 22, 25
+; movw r22, xl0
+; movw r24, xl2
+; rcall print32
+; movw r22, xh0
+; movw r24, xh2
+; rcall print32
+; pop_range 22, 25
+;--- END DBG
+ /* copy m(Y) into h */
+ movw r26, h0
+ ldi r22, 64
+10:
+ ld r23, Y+
+ st X+, r23
+ dec r22
+ brne 10b
+;--- /* calc first half of h0..h15 */
+ movw r28, q16_0 /* Y walks q[16..31] */
+ movw r26, h0 /* X walks h[0..15], which currently holds m */
+ ldi r30, lo8(f2_1_shift_table)
+ ldi r31, hi8(f2_1_shift_table)
+ ldi r17, 16
+10:
+;---
+ movw r22, xh0
+ movw r24, xh2
+ cpi r17, 9
+ brge 15f
+ clr r1 /* last eight passes: no table byte, both shift counts are 0 */
+ rjmp 26f
+15: lpm r20, Z+
+ mov r1, r20 /* keep the table byte; its high nibble is used below */
+ andi r20, 0x0f /* low nibble: shift count for XH */
+ clt
+ cpi r17, 16
+ breq 20f
+ cpi r17, 11
+ brne 21f
+20: set /* T = 1: XH goes left, the q word goes right */
+21: brts 25f
+ rcall shiftright32
+ rjmp 26f
+25: rcall shiftleft32
+26: rcall mov32_to_acc
+;---
+ rcall load32_from_Y
+ mov r20, r1
+ clr r1 /* r1 is zero again from here on */
+ swap r20
+ andi r20, 0x0f /* high nibble: shift count for the q word */
+ brts 27f
+ rcall shiftleft32
+ rjmp 28f
+27: rcall shiftright32
+28: rcall eor32_to_acc
+;---
+ rcall load32_from_X /* the copied m word */
+ rcall eor32_to_acc
+ rcall store_acc_to_dec_X /* overwrite that word in h */
+ adiw r26, 4
+;---
+ dec r17
+ brne 10b
+;-----
+ sbiw r28, 4*8 /* Y points to q[24] */
+ movw r30, r28
+ sbiw r28, 63
+ sbiw r28, 33 /* Y points to q[0] */
+ movw r26, r28
+ ldi r20, 8*4
+ /* xor q[24..31] into q[0..7] */
+ rcall memxor
+ /* xor q[23] into q[8] */
+ sbiw r30, 9*4
+ ldi r20, 4
+ rcall memxor
+ /* xor q[16..22] into q[9..15] */
+ sbiw r30, 8*4
+ ldi r20, 7*4
+ rcall memxor
+
+ movw r26, h0
+ ldi r17, 15
+ ldi r30, lo8(f2_2_shift_table)
+ ldi r31, hi8(f2_2_shift_table)
+10: movw r22, xl0
+ movw r24, xl2
+ sbrc r17, 3 /* r17 = 15..8: XL is used unshifted */
+ rjmp 20f
+ lpm r20, Z+
+ lsr r20 /* carry = direction bit, r20 = shift count */
+ brcs 15f
+ rcall shiftright32
+ rjmp 20f
+15:
+ rcall shiftleft32
+20:
+ rcall mov32_to_acc
+ rcall load32_from_Y /* next word of the folded q[0..15] */
+ rcall eor32_to_acc
+ rcall add_acc_to_X /* h[i] += acc */
+ dec r17
+ brpl 10b /* 16 passes: r17 = 15 .. 0 */
+;-----
+ sbiw r26, 8*4 /* X points to h8 */
+ movw r28, r26
+ sbiw r28, 4*4 /* Y points to h4 */
+ ldi r17, 8
+ ldi r18, 9 /* rotate count, 9..16; this overwrites the h0 pointer alias */
+10:
+ rcall load32_from_Y
+ mov r20, r18
+ rcall rotateleft32
+ rcall mov32_to_acc
+ rcall add_acc_to_X /* h[8+k] += rotl(source word, 9+k) */
+ inc r18
+ cpi r17, 5
+ brne 20f
+ sbiw r28, 8*4 /* after h4..h7 continue with h0..h3 as source */
+20: dec r17
+ brne 10b
+exit: /* common epilogue of the nextBlock routine; the popN labels below are
+ also entered directly by other routines as shared epilogues */
+;--- DBG
+; pop r25
+; pop r24
+; ldi r22, 'H'
+; rcall printX
+;--- END DBG
+ stack_free_large3 32*4+4 /* release q[0..31] plus the four pushed constant bytes */
+ pop_range 10, 17
+pop9: /* entered from bmw224 */
+ pop_range 8, 9
+pop28: /* entered from bmw_small_lastBlock */
+ pop_range 28, 29
+pop7:
+ pop_range 6, 7
+pop5: /* entered from sn */
+ pop_range 2, 5
+ ret
+
+/******************************************************************************/
+ctx0 = 2 /* r2:r3 - context pointer */
+ctx1 = 3
+blc0 = 4 /* r4:r5 - current block pointer */
+blc1 = 5
+len0 = 28 /* r28:r29 - remaining length in bits */
+len1 = 29
+buf0 = 6 /* r6:r7 - pointer to the padding buffer on the stack */
+buf1 = 7
+
+load32_from_Z_stub: /* r21:r22:r23:r24 = bytes 64..67 of the context at ctx0
+ (the block counter); leaves Z = ctx0 + 60 */
+ movw r30, ctx0
+ adiw r30, 60
+ ldd r21, Z+4
+ ldd r22, Z+5
+ ldd r23, Z+6
+ ldd r24, Z+7
+ ret
+
+/******************************************************************************/
+/*
+ bmw_small_lastBlock: hashes the remaining message, appends the padding bit
+ and the message length, and runs the final compression described by the C
+ fragments quoted below.
+ param ctx: r24:r25
+ param msg: r22:r23
+ param len: r20:r21 (length in bits)
+ Only bmw224_lastBlock is exported by the .global directive below.
+ Expects r1 to be zero.
+*/
+
+.global bmw224_lastBlock
+bmw_small_lastBlock:
+bmw224_lastBlock:
+bmw256_lastBlock:
+/* while(length_b >= BMW_SMALL_BLOCKSIZE){
+ bmw_small_nextBlock(ctx, block);
+ length_b -= BMW_SMALL_BLOCKSIZE;
+ block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
+ }
+*/
+ push_range 2, 7
+ push_range 28, 29
+ movw ctx0, r24
+ movw blc0, r22
+ movw len0, r20
+1:
+ cpi len1, hi8(512) /* at least 512 bits left? */
+ brlo 2f
+ rcall bmw_small_nextBlock_early
+ ldi r24, 64
+ add blc0, r24 /* advance the block pointer by 64 bytes */
+ adc blc1, r1
+ subi len1, hi8(512) /* length -= 512 */
+ rjmp 1b
+2:
+/* struct {
+ uint8_t buffer[64];
+ uint32_t ctr;
+ } pctx;
+*/
+ stack_alloc_large 68
+ adiw r30, 1 /* the buffer starts one byte above the pointer left in Z */
+ movw buf0, r30
+/* memset(pctx.buffer, 0, 64);
+ memcpy(pctx.buffer, block, (length_b+7)/8);
+ pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
+*/ movw r24, len0
+ ldi r23, 63 /* r23 will count the zero bytes written after the marker byte */
+ movw r26, blc0
+ lsr r25 /* r24 = length_b >> 3 (number of whole bytes, 0..63) */
+ ror r24
+ lsr r24
+ lsr r24
+ breq 301f /* no whole bytes to copy */
+ sub r23, r24
+ /* copy (#r24) bytes to stack buffer */
+30: ld r20, X+
+ st Z+, r20
+ dec r24
+ brne 30b
+301: /* calculate the appended byte */
+ clr r20
+ mov r21, len0
+ ldi r24, 0x80
+ andi r21, 0x07
+ breq 305f /* length is a whole number of bytes: marker byte is 0x80 */
+ ld r20, X+ /* partial last byte of the message */
+303:
+ lsr r24 /* r24 = 0x80 >> (length_b & 7) */
+ dec r21
+ brne 303b
+305:
+ or r20, r24
+ st Z+, r20
+ tst r23
+ breq 32f
+31: st Z+, r1 /* zero-fill the rest of the 64-byte buffer */
+ dec r23
+ brne 31b
+32:
+/* if(length_b+1>64*8-64){ ; = 64*7-1 = 447 max(length_b)=511
+ bmw_small_nextBlock(ctx, pctx.buffer);
+ memset(pctx.buffer, 0, 64-8);
+ ctx->counter -= 1;
+ }
+*/
+ tst len1
+ breq 400f /* length < 256 */
+ cpi len0, 192
+ brlo 400f /* length < 448: the length field still fits */
+ movw blc0, buf0
+ rcall bmw_small_nextBlock_early
+ movw r26, buf0
+ ldi r20, 64-8
+350:
+ st X+, r1
+ dec r20
+ brne 350b
+ rcall load32_from_Z_stub
+ subi r21, 1 /* use counter - 1 for the length field; the decremented */
+ sbc r22, r1 /* value is kept in registers only and is not written back */
+ sbc r23, r1
+ sbc r24, r1
+ rjmp 410f
+/* *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
+ bmw_small_nextBlock(ctx, pctx.buffer);
+*/
+400:
+ rcall load32_from_Z_stub
+410:
+ clr r25
+ ldi r20, 1 /* rol32 loop count: a single pass */
+ lsl r21 /* r21..r25 = counter * 2; with a low byte in front this is counter * 512 */
+ rcall rol32
+ mov r20, len0 /* add length_b to the low 16 bits */
+ add r21, len1
+ adc r22, r1
+ adc r23, r1
+ adc r24, r1
+ adc r25, r1
+ movw r26, buf0
+ adiw r26, 64-8
+ st X+, r20 /* 64-bit little-endian length field, top two bytes zero */
+ st X+, r21
+ rcall store32_to_X
+ st X+, r1
+ st X+, r1
+ movw blc0, buf0
+ rcall bmw_small_nextBlock_early
+/* memset(pctx.buffer, 0xaa, 64);
+ for(i=0; i<16;++i){
+ pctx.buffer[i*4] = i+0xa0;
+ }
+*/
+ ldi r22, 0xa0
+ ldi r23, 0xaa
+ ldi r24, 0xaa
+ ldi r25, 0xaa
+ movw r26, buf0
+500:
+ rcall store32_to_X
+ inc r22
+ sbrs r22, 4 /* stop when r22 reaches 0xb0, i.e. after 16 words */
+ rjmp 500b
+/* bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
+ memcpy(ctx->h, pctx.buffer, 64);
+*/
+ movw r24, buf0 /* the stack buffer acts as the context ... */
+ movw r22, ctx0 /* ... and ctx->h as the message block */
+ rcall bmw_small_nextBlock
+ ldi r18, 64
+ movw r26, ctx0
+ movw r30, buf0
+600:
+ ld r20, Z+
+ st X+, r20
+ dec r18
+ brne 600b
+
+ stack_free_large 68
+ rjmp pop28 /* shared epilogue: restores r28:r29 and r2..r7, then returns */
+
+
+/*******************************************************************************
+* void bmw224_ctx2hash(void* dest, const bmw224_ctx_t* ctx){
+* memcpy(dest, &(ctx->h[9]), 224/8);
+* }
+*
+* param dest: r24:r25
+* param ctx: r22:r23
+*
+* Copies the 28 bytes starting at ctx + 36 to dest.
+* Clobbers r18, r23, X and Z.
+*/
+.global bmw224_ctx2hash
+bmw224_ctx2hash:
+ movw r30, r22
+ adiw r30, 9*4 /* Z = &ctx->h[9] */
+ ldi r18, 28 /* 224 / 8 bytes */
+ movw r26, r24 /* X = dest */
+1: ld r23, Z+
+ st X+, r23
+ dec r18
+ brne 1b
+ ret
+
+
+/*******************************************************************************
+* void bmw224(void* dest, const void* msg, uint32_t length_b){
+* bmw_small_ctx_t ctx;
+* bmw224_init(&ctx);
+* while(length_b>=BMW_SMALL_BLOCKSIZE){
+* bmw_small_nextBlock(&ctx, msg);
+* length_b -= BMW_SMALL_BLOCKSIZE;
+* msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B;
+* }
+* bmw_small_lastBlock(&ctx, msg, length_b);
+* bmw224_ctx2hash(dest, &ctx);
+* }
+*
+* param dest: r24:r25
+* param msg: r22:r23
+* param length_b: r18:r21
+*
+* The loop below only consumes blocks while the upper 16 bits of length_b
+* are non-zero; the remaining 16-bit length is handed to
+* bmw_small_lastBlock, which processes any further full blocks itself.
+*/
+ctx0 = 2
+ctx1 = 3
+msg0 = 4
+msg1 = 5
+len0 = 28
+len1 = 29
+len2 = 8
+len3 = 9
+dst0 = 6
+dst1 = 7
+.global bmw224
+bmw224:
+ push_range 2, 7
+ push_range 28, 29
+ push_range 8, 9
+ stack_alloc_large 64+4
+ /* The allocated area starts one byte above the pointer left in Z, as at
+    the other stack_alloc_large call sites in this file. Without this
+    adjustment ctx[0] would overlap the next free stack byte and be
+    overwritten by the return address of the following rcall. */
+ adiw r30, 1
+10: movw ctx0, r30
+ movw dst0, r24
+ movw msg0, r22
+ movw len0, r18 /* low 16 bits of length_b */
+ movw len2, r20 /* high 16 bits of length_b */
+ movw r24, ctx0
+ rcall bmw224_init
+20:
+ mov r18, len2
+ or r18, len3
+ breq 50f /* fewer than 65536 bits left */
+ rcall bmw_small_nextBlock_early
+ subi len1, 2 /* length_b -= 512 */
+ sbc len2, r1
+ sbc len3, r1
+ ldi r20, 64
+ add msg0, r20 /* msg += 64 */
+ adc msg1, r1
+ rjmp 20b
+50:
+ movw r24, ctx0
+ movw r22, msg0
+ movw r20, len0
+ rcall bmw_small_lastBlock
+ movw r24, dst0
+ movw r22, ctx0
+ rcall bmw224_ctx2hash
+ stack_free_large 64+4
+ rjmp pop9 /* shared epilogue: restores r8:r9, r28:r29 and r2..r7, then returns */
+
+/*******************************************************************************
+* void bmw224_init(bmw224_ctx_t* ctx){
+* uint8_t i;
+* ctx->h[0] = 0x00010203;
+* for(i=1; i<16; ++i){
+* ctx->h[i] = ctx->h[i-1]+ 0x04040404;
+* }
+* ctx->counter=0;
+* }
+*
+* param ctx: r24:r25
+*
+* Writes the byte values 0x00..0x3f into the 64 bytes of h, each 32-bit word
+* being filled from its highest address downwards, then clears the four
+* counter bytes behind h. Clobbers r20, r22, r23 and X; expects r1 = 0.
+*/
+.global bmw224_init
+bmw224_init:
+ ldi r22, 0x00 /* running byte value */
+ ldi r23, 0x40 /* end value: 64 bytes written */
+ movw r26, r24
+ adiw r26, 4 /* X = end of the first word */
+10:
+ st -X, r22
+ inc r22
+ mov r20, r22
+ andi r20, 0x3
+ brne 10b /* four bytes per word */
+ adiw r26, 8 /* from the start of this word to the end of the next one */
+20: cp r22, r23
+ brne 10b
+ st -X, r1 /* X is ctx + 68 here: clear the counter at ctx + 64..67 */
+ st -X, r1
+ st -X, r1
+ st -X, r1
+ ret
+
+
+/******************************************************************************/
+
+#if DEBUG
+
+printQ: /* DEBUG only: prints qdbg_str and then 32 numbered entries, one per
+ 4-byte word starting at r24:r25, through the external cli_* routines.
+ NOTE(review): r8, r9, r16 and r17 are overwritten without being saved. */
+ push_range 20, 25
+ ldi r16, 4
+ mov r9, r16 /* stride between words */
+ movw r16, r24 /* r16:r17 = current word address */
+ ldi r24, lo8(qdbg_str)
+ ldi r25, hi8(qdbg_str)
+ call cli_putstr_P
+ clr r8 /* word index */
+10: ldi r24, lo8(qdbg_str1)
+ ldi r25, hi8(qdbg_str1)
+ call cli_putstr_P
+ mov r24, r8
+ call cli_hexdump_byte
+ ldi r24, lo8(qdbg_str2)
+ ldi r25, hi8(qdbg_str2)
+ call cli_putstr_P
+ movw r24, r16
+ clr r23
+ ldi r22, 4 /* presumably the byte count for cli_hexdump_rev - confirm against its prototype */
+ call cli_hexdump_rev
+ add r16, r9
+ adc r17, r1
+ inc r8
+ sbrs r8, 5 /* stop after 32 entries */
+ rjmp 10b
+ pop_range 20, 25
+ ret
+qdbg_str: .asciz "\r\nDBG Q: "
+qdbg_str1: .asciz "\r\n Q["
+qdbg_str2: .asciz "] = "
+
+
+printX: /* DEBUG only: prints 16 numbered entries, one per 4-byte word starting
+ at r24:r25, each labelled with the character passed in r22. Saves and
+ restores r6..r9, r16..r27 and r30:r31. */
+ push_range 6, 9
+ push_range 16, 27
+ push_range 30, 31
+ ldi r16, 4
+ mov r6, r22 /* label character */
+ mov r9, r16 /* stride between words */
+ movw r16, r24 /* r16:r17 = current word address */
+ ldi r24, lo8(Xdbg_str)
+ ldi r25, hi8(Xdbg_str)
+ call cli_putstr_P
+ mov r24, r6
+ call cli_putc
+ ldi r24, ':'
+ call cli_putc
+ clr r8 /* word index */
+10: ldi r24, lo8(Xdbg_str1)
+ ldi r25, hi8(Xdbg_str1)
+ call cli_putstr_P
+ mov r24, r6
+ call cli_putc
+ ldi r24, '['
+ call cli_putc
+ mov r24, r8
+ call cli_hexdump_byte
+ ldi r24, lo8(Xdbg_str2)
+ ldi r25, hi8(Xdbg_str2)
+ call cli_putstr_P
+ movw r24, r16
+ clr r23
+ ldi r22, 4 /* presumably the byte count for cli_hexdump_rev - confirm against its prototype */
+ call cli_hexdump_rev
+ add r16, r9
+ adc r17, r1
+ inc r8
+ sbrs r8, 4 /* stop after 16 entries */
+ rjmp 10b
+ pop_range 30, 31
+ pop_range 16, 27
+ pop_range 6, 9
+ ret
+Xdbg_str: .asciz "\r\nDBG "
+Xdbg_str1: .asciz "\r\n "
+Xdbg_str2: .asciz "] = "
+
+print32: /* DEBUG only: prints Xdbg_str followed by r25, r24, r23, r22 (in
+ that order) via cli_hexdump_byte. Saves and restores r6..r9, r16..r27 and
+ r30:r31. */
+ push_range 6, 9
+ push_range 16, 27
+ push_range 30, 31
+ movw r6, r22 /* keep the value while r24:r25 are used for arguments */
+ movw r8, r24
+ ldi r24, lo8(Xdbg_str)
+ ldi r25, hi8(Xdbg_str)
+ call cli_putstr_P
+ mov r24, r9
+ call cli_hexdump_byte
+ mov r24, r8
+ call cli_hexdump_byte
+ mov r24, r7
+ call cli_hexdump_byte
+ mov r24, r6
+ call cli_hexdump_byte
+ pop_range 30, 31
+ pop_range 16, 27
+ pop_range 6, 9
+ ret
+
+
+print_acc: /* DEBUG only: prints Xdbg_str followed by r9, r8, r15, r14 (acc3
+ down to acc0) via cli_hexdump_byte. Saves and restores r16..r27 and
+ r30:r31. */
+ push_range 16, 27
+ push_range 30, 31
+ ldi r24, lo8(Xdbg_str)
+ ldi r25, hi8(Xdbg_str)
+ call cli_putstr_P
+ mov r24, r9
+ call cli_hexdump_byte
+ mov r24, r8
+ call cli_hexdump_byte
+ mov r24, r15
+ call cli_hexdump_byte
+ mov r24, r14
+ call cli_hexdump_byte
+ pop_range 30, 31
+ pop_range 16, 27
+ ret
+
+#endif
+
--- /dev/null
+/* bmw_small-tinyasm.S */
+/*
+ This file is part of the AVR-Crypto-Lib.
+ Copyright (C) 2009 Daniel Otte (daniel.otte@rub.de)
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ * File: bmw_small-tinyasm.S
+ * Author: Daniel Otte
+ * Date: 2010-03-28
+ * License: GPLv3 or later
+ * Description: implementation of BlueMidnightWish
+ *
+ */
+
+#include "avr-asm-macros.S"
+
+acc2 = 8 /* accumulator byte 2 -> r8 */
+acc3 = 9 /* accumulator byte 3 (most significant) -> r9 */
+acc0 = 14 /* accumulator byte 0 (least significant) -> r14 */
+acc1 = 15 /* accumulator byte 1 -> r15 */
+
+#define DEBUG 0 /* set non-zero to assemble any code guarded by "#if DEBUG" */
+
+/******************************************************************************/
+/*
+ shiftleft32: logical left shift of a 32-bit value.
+ param a: r22:r23:r24:r25 (r22 = least significant byte), shifted in place
+ param s: r20 (shift count; consumed)
+ Whole-byte steps are done here by register moves; the remaining s < 8 bits
+ are shifted in the loop at bitrotateleft_1 with r0 = 0, so zero bits enter
+ at the bottom. Clobbers r0 and r20.
+*/
+shiftleft32:
+ clr r0 /* r0 supplies the bits shifted in: all zero */
+ cpi r20, 8
+ brlo bitrotateleft_1 /* fewer than 8 bits left: finish in the bit loop */
+ mov r25, r24 /* shift by one whole byte */
+ mov r24, r23
+ mov r23, r22
+ clr r22
+ subi r20, 8
+ rjmp shiftleft32
+
+/******************************************************************************/
+/*
+ shiftright32: logical right shift of a 32-bit value.
+ param a: r22:r23:r24:r25 (r22 = least significant byte), shifted in place
+ param s: r20 (shift count; consumed, 0 on return)
+*/
+shiftright32:
+ cpi r20, 8
+ brlo bitshiftright /* fewer than 8 bits left */
+ mov r22, r23 /* shift by one whole byte */
+ mov r23, r24
+ mov r24, r25
+ clr r25
+ subi r20, 8
+ rjmp shiftright32
+bitshiftright:
+ tst r20
+ breq 20f /* nothing left to shift */
+10: lsr r25 /* one bit per pass, a zero enters at the top */
+ ror r24
+ ror r23
+ ror r22
+ dec r20
+ brne 10b
+20: ret
+
+/******************************************************************************/
+/*
+ rotateleft32: rotate a 32-bit value left.
+ param a: r22:r23:r24:r25 (r22 = least significant byte), rotated in place
+ param s: r20 (rotate count; consumed)
+ Clobbers r0.
+ Additional entry points:
+  bitrotateleft_1 - bit loop with r0 preset by the caller (shiftleft32
+                    enters here with r0 = 0, turning the rotate into a shift);
+  rol32           - shifts r22..r25 left by one bit, taking the current carry
+                    flag as the incoming bit, then decrements r20 and loops
+                    back to 10: while r20 is non-zero.
+*/
+rotateleft32:
+ cpi r20, 8
+ brlo bitrotateleft
+ mov r0, r25 /* rotate by one whole byte; r0 carries the top byte around */
+ mov r25, r24
+ mov r24, r23
+ mov r23, r22
+ mov r22, r0
+ subi r20, 8
+ rjmp rotateleft32
+bitrotateleft:
+ mov r0, r25 /* copy of the top byte; its bits re-enter at bit 0 */
+bitrotateleft_1:
+ tst r20
+ breq 20f
+10:
+ lsl r0 /* carry = next bit to shift in */
+rol32:
+ rol r22
+ rol r23
+ rol r24
+ rol r25
+ dec r20
+ brne 10b
+20: ret
+
+
+/******************************************************************************/
+
+sn_stub: /* acc ^= rotl32(r2:r3:r4:r5, n), n being the next program-memory
+ byte at Z; Z advances by one. Falls through into eor32_to_acc. */
+ movw r22, r2 /* fetch the saved input value */
+ movw r24, r4
+ lpm r20, Z+ /* rotate count */
+ rcall rotateleft32
+eor32_to_acc: /* acc0..acc3 ^= r22:r23:r24:r25 */
+ eor acc0, r22
+ eor acc1, r23
+ eor acc2, r24
+ eor acc3, r25
+ ret
+
+s_table: /* parameters read by sn, four bytes per entry: shiftright32 count,
+ shiftleft32 count, then two rotateleft32 counts */
+s0: .byte 1, 3, 4,19
+s1: .byte 1, 2, 8,23
+s2: .byte 2, 1,12,25
+s3: .byte 2, 2,15,29
+s4: .byte 1, 0, 0, 0 /* zero counts leave x unchanged, so this is (x>>1) ^ x ^ x ^ x */
+s5: .byte 2, 0, 0, 0
+
+h0 = 10 /* register aliases: h0/h1 -> r10/r11 */
+h1 = 11
+m0 = 12 /* m0/m1 -> r12/r13 */
+m1 = 13
+
+/*
+ sn: applies entry s of s_table (bytes t[0..3]) to x:
+   result = (x >> t[0]) ^ (x << t[1]) ^ rotl(x, t[2]) ^ rotl(x, t[3])
+ param x: r22:r23:r24:25
+ param s: r20 (table entry index)
+ result: r22:r23:r24:r25
+ r2..r5 and the acc registers are saved and restored (r2..r5 through the
+ shared pop5 epilogue, which is not shown in this part of the file);
+ r0, r20, r30 and r31 are clobbered. Relies on r1 being zero.
+*/
+sn:
+ push_range 2, 5 /* r2..r5 hold the unmodified input below */
+ push acc0
+ push acc1
+ push acc2
+ push acc3
+ ldi r30, lo8(s_table)
+ ldi r31, hi8(s_table)
+ lsl r20 /* entry offset = s * 4 */
+ lsl r20
+ add r30, r20
+ adc r31, r1 /* r1 is expected to be zero */
+ movw r2, r22 /* keep a copy of x */
+ movw r4, r24
+ lpm r20, Z+ /* right shift count */
+ rcall shiftright32
+ rcall mov32_to_acc /* acc = x >> t[0] */
+;---
+ movw r22, r2
+ movw r24, r4
+ lpm r20, Z+ /* left shift count */
+ rcall shiftleft32
+ rcall eor32_to_acc /* acc ^= x << t[1] */
+;---
+ rcall sn_stub /* acc ^= rotl(x, t[2]) */
+ rcall sn_stub /* acc ^= rotl(x, t[3]) */
+
+ movw r22, acc0 /* result goes back to r22..r25 */
+ movw r24, acc2
+ pop acc3
+ pop acc2
+ pop acc1
+ pop acc0
+ rjmp pop5 /* shared epilogue; presumably pops r2..r5 and returns - defined outside this view */
+
+/******************************************************************************/
+/*
+ memxor_64: XOR the 64 bytes at Z into the 64 bytes at X.
+ memxor:    XOR r20 bytes at Z into the bytes at X. The count is decremented
+            before it is tested, so r20 = 0 processes 256 bytes.
+ param dest: r26:r27 (X)
+ param src: r30:r31 (Z)
+ param len: r20 (memxor only; memxor_64 overwrites it with 64)
+ On return X and Z point past the processed bytes and r20 is 0.
+ Clobbers r21 and r22.
+*/
+memxor_64:
+; tst r20
+; breq memxor_exit
+ ldi r20, 64
+memxor:
+10: ld r21, X
+ ld r22, Z+
+ eor r21, r22
+ st X+, r21 /* write back and advance the destination */
+ dec r20
+ brne 10b
+memxor_exit:
+ ret
+
+/******************************************************************************/
+q0 = 2 /* register aliases: q0/q1 -> r2/r3 */
+q1 = 3
+h0 = 4 /* h0/h1 -> r4/r5 (replaces the earlier h0 = 10 for code below) */
+h1 = 5
+m0 = 6 /* m0/m1 -> r6/r7 */
+m1 = 7
+
+
+/******************************************************************************/
+load32_from_X: /* r22:r23:r24:r25 = four bytes at X (lowest address into r22); X += 4 */
+ ld r22, X+
+ ld r23, X+
+ ld r24, X+
+ ld r25, X+
+ ret
+
+load32_from_Y: /* r22:r23:r24:r25 = four bytes at Y (lowest address into r22); Y += 4 */
+ ld r22, Y+
+ ld r23, Y+
+ ld r24, Y+
+ ld r25, Y+
+ ret
+
+store32_to_Y: /* store r22:r23:r24:r25 to the four bytes at Y (r22 first); Y += 4 */
+ st Y+, r22
+ st Y+, r23
+ st Y+, r24
+ st Y+, r25
+ ret
+
+add_X_to_32: /* r22:r23:r24:r25 += 32-bit word at X; X += 4; clobbers r0 */
+ ld r0, X+
+ add r22, r0
+ ld r0, X+ /* ld leaves the carry flag untouched, so the carry chain stays intact */
+ adc r23, r0
+ ld r0, X+
+ adc r24, r0
+ ld r0, X+
+ adc r25, r0
+ ret
+
+store32_to_X: /* store r22:r23:r24:r25 to the four bytes at X (r22 first); X += 4 */
+ st X+, r22
+ st X+, r23
+ st X+, r24
+ st X+, r25
+ ret
+
+mov32_to_acc: /* acc0..acc3 = r22:r23:r24:r25 */
+ movw acc0, r22
+ movw acc2, r24
+ ret
+
+/******************************************************************************/
+/*
+ param q: r28:r29 (Y)
+ param h: r26:r27 (X)
+ param m: r30:r31 (Z)
+
+ Constant tables kept in program memory and read with lpm.
+ f2_1_shift_table: one byte per pass of the first f2 loop while its counter
+   r17 runs from 16 down to 9. The low nibble is the shift count applied to
+   XH, the high nibble the shift count applied to the word loaded from Y.
+   XH is shifted left and the loaded word right when r17 is 16 or 11;
+   otherwise XH is shifted right and the loaded word left.
+ f2_2_shift_table: read by f2 after the part of the file visible here;
+   judging by how the entries are written, bit 0 looks like a direction
+   flag and the remaining bits a shift count - TODO confirm.
+ expand2_rot_table: rotate-left counts consumed by expand2 on every pass
+   where its counter r19 is odd.
+ f0_hacktable: five 3-byte records, read by f0 from the last record to the
+   first. Bytes 0 and 1 form a 16-bit mask (byte 0 is the high half) that is
+   consumed MSB first, one bit per word: 1 = subtract the word, 0 = add it.
+   Byte 2 is the byte offset (word index * 4) of the first word used.
+*/
+
+f2_1_shift_table:
+; .byte 0x2B, 0x64, 0x66, 0x03, 0x51, 0x55, 0x87, 0x55
+ .byte 0x55, 0x87, 0x55, 0x51, 0x03, 0x66, 0x64, 0x2B
+f2_2_shift_table:
+; .byte (2<<1), (7<<1), (4<<1), (3<<1), (4<<1)+1, (6<<1)+1, (6<<1)
+ .byte (8<<1)+1, (6<<1), (6<<1)+1, (4<<1)+1, (3<<1), (4<<1), (7<<1), (2<<1)
+expand2_rot_table:
+ .byte 3,7,13,16,19,23,27
+
+f0_hacktable:
+ .byte 0x03, 0x11, 5*4
+ .byte 0xDD, 0xB3, 7*4
+ .byte 0x2A, 0x79, 10*4
+ .byte 0x07, 0xAA, 13*4
+ .byte 0x51, 0xC2, 14*4
+
+
+/*******************************************************************************
+* uint32_t addelment(uint8_t j, const uint32_t* m, const uint32_t* h){
+* uint32_t r;
+* r = pgm_read_dword(k_lut+j);
+* r += rotl_addel(((uint32_t*)m)[j&0xf], j+0);
+* r += rotl_addel(((uint32_t*)m)[(j+3)&0xf], j+3);
+* r -= rotl_addel(((uint32_t*)m)[(j+10)&0xf], j+10);
+* r ^= ((uint32_t*)h)[(j+7)&0xf];
+* return r;
+* }
+* param j: r24
+* param m: r22:r23
+* param h: r20:r21
+*/
+j = 16 /* r16: element index used by expand_intro and load_rotate_add_M */
+acc2 = 8
+acc3 = 9
+h0 = 10 /* r10:r11: pointer to h, set in expand_intro */
+h1 = 11
+m0 = 12 /* r12:r13: pointer to m, set in expand_intro */
+m1 = 13
+acc0 = 14
+acc1 = 15
+
+load_acc_from_X: /* acc0..acc3 = four bytes at X (lowest address into acc0); X += 4 */
+ ld acc0, X+
+ ld acc1, X+
+ ld acc2, X+
+ ld acc3, X+
+ ret
+
+add_acc_to_X: /* 32-bit word at X += acc0..acc3 (in memory); X += 4; clobbers r0 */
+ ld r0, X
+ add r0, acc0
+ st X+, r0
+ ld r0, X /* ld/st leave the carry flag untouched between the adc steps */
+ adc r0, acc1
+ st X+, r0
+ ld r0, X
+ adc r0, acc2
+ st X+, r0
+ ld r0, X
+ adc r0, acc3
+ st X+, r0
+ ret
+
+load_rotate_add_M: /* Adds (T flag clear) or subtracts (T flag set)
+ rotl32(m[j & 15], (j & 15) + 1) to/from acc, where m is the word array at
+ m0 and j the index register. Clobbers r0, r20, r22..r25 and X; expects
+ r1 to be zero. */
+ mov r20, j
+ andi r20, 0x0f /* word index modulo 16 */
+ mov r0, r20
+ lsl r0 /* byte offset = index * 4 */
+ lsl r0
+ movw r26, m0
+ add r26, r0
+ adc r27, r1
+ rcall load32_from_X
+ inc r20 /* rotate count = index + 1 */
+ rcall rotateleft32
+ brts 10f /* T set: subtract instead of add */
+ rjmp add32_to_acc /* tail call; its ret returns to our caller */
+; ret
+10: sub acc0, r22
+ sbc acc1, r23
+ sbc acc2, r24
+ sbc acc3, r25
+ ret
+
+
+;---
+
+/******************************************************************************/
+load_sn_add: /* acc += sn(word at X) using s_table entry r20; X += 4.
+ Falls through into add32_to_acc. */
+ rcall load32_from_X
+ rcall sn
+add32_to_acc: /* acc0..acc3 += r22:r23:r24:r25 */
+ add acc0, r22
+ adc acc1, r23
+ adc acc2, r24
+ adc acc3, r25
+ ret
+
+/*
+ expand_intro / addelement: start a new expansion value in acc.
+ param q: r26:r27
+ param m: r22:r23
+ param h: r20:r21
+ param j: r24
+ The 32-bit word stored directly below q (at q-4) serves as a running
+ constant: every call adds 0x05555555 to it and writes it back. acc becomes
+   (constant + rotl(m[j&15], (j&15)+1) + rotl(m[(j+3)&15], ((j+3)&15)+1)
+             - rotl(m[(j+10)&15], ((j+10)&15)+1)) ^ h[(j+7)&15]
+ On return X = q + 4*j (assuming pop_range restores what push_range saved).
+ Clobbers r0, r16 (j), r20, r22..r25 and r10..r13 (h0/h1, m0/m1);
+ expects r1 to be zero.
+*/
+
+expand_intro:
+ push_range 26, 27
+ push r24
+addelement:
+ mov j, r24
+ movw h0, r20
+ movw m0, r22
+ sbiw r26, 4 /* X = q - 4: the running constant */
+ rcall load_acc_from_X
+ ldi r24, 0x55 /* acc += 0x05555555 */
+ add acc0, r24
+ adc acc1, r24
+ adc acc2, r24
+ ldi r24, 5
+ adc acc3, r24
+ rcall store_acc_to_dec_X /* write the updated constant back to q-4 */
+ adiw r26, 4
+ clt /* T = 0: the next two terms are added */
+ rcall load_rotate_add_M
+ subi j, -3 /* j + 3 */
+ rcall load_rotate_add_M
+ set /* T = 1: the third term is subtracted */
+ subi j, -7 /* j + 10 */
+ rcall load_rotate_add_M
+ lsl j /* turn the index into a byte offset ... */
+ lsl j
+ subi j, -7*4+10*4 /* ... and step back from (j+10)*4 to (j+7)*4 */
+ andi j, 0x3f /* wrap inside the 64 bytes of h */
+ movw r26, h0
+ add r26, j
+ adc r27, r1
+ rcall load32_from_X
+ rcall eor32_to_acc /* acc ^= h[(j+7) & 15] */
+;--
+ pop r24
+ pop_range 26, 27
+ lsl r24 /* X = q + 4*j */
+ lsl r24
+ add r26, r24
+ adc r27, r1
+ ret
+expand1: /* q[j+16] = expand_intro value + sum for i = 0..15 of sn(q[j+i])
+ with s_table entry (i+1) & 3. Takes the same parameters as expand_intro. */
+ rcall expand_intro
+ ldi r19, 1
+10:
+ mov r20, r19
+ andi r20, 3 /* s_table entry for this word: 1, 2, 3, 0, 1, ... */
+ rcall load_sn_add
+ inc r19
+ cpi r19, 17 /* 16 words in total */
+ brne 10b
+ rjmp expand2_exit /* store acc right behind the 16 words just read */
+
+
+/******************************************************************************/
+/*
+ expand2: second expansion variant; the result is stored at q[j+16].
+ param q: r26:r27
+ param m: r22:r23
+ param h: r20:r21
+ param j: r24
+ Adds the 14 words q[j]..q[j+13] to the expand_intro value, rotating every
+ second one (q[j+1], q[j+3], ...) left by the next expand2_rot_table byte,
+ then adds sn(q[j+14]) with s_table entry 4 and sn(q[j+15]) with entry 5.
+ expand2_exit advances X by 4 and falls into store_acc_to_dec_X, which
+ stores acc0..acc3 into the four bytes below X and leaves X at that word.
+*/
+
+
+expand2:
+ rcall expand_intro
+ ldi r19, 14
+ ldi r30, lo8(expand2_rot_table)
+ ldi r31, hi8(expand2_rot_table)
+10:
+ rcall load32_from_X
+ sbrs r19, 0 /* rotate only on passes where the counter is odd */
+ rjmp 12f
+ lpm r20, Z+
+ rcall rotateleft32
+12: rcall add32_to_acc
+ dec r19
+ brne 10b
+ ldi r20, 4
+ rcall load_sn_add
+ ldi r20, 5
+ rcall load_sn_add
+expand2_exit:
+ adiw r26, 4 /* X now points just past q[j+16] */
+store_acc_to_dec_X:
+ st -X, acc3
+ st -X, acc2
+ st -X, acc1
+ st -X, acc0
+ ret
+
+/******************************************************************************/
+/*
+ param q: r24:r25
+ param m: r22:r23
+ param h: r20:r21
+*/
+/* for calling expand1/2
+ param q: r26:r27
+ param m: r22:r23
+ param h: r20:r21
+ param j: r24
+*/
+
+/******************************************************************************/
+/*
+ param q: r24:r25
+ param m: r22:r23
+ param h: r20:r21
+*/
+
+/******************************************************************************/
+/*
+ param ctx: r24:r25
+ param msg: r22:r23
+*/
+/* f0
+ param q: r28:r29 (Y)
+ param h: r26:r27 (X)
+ param m: r30:r31 (Z)
+*/
+/* f1
+ param q: r24:r25
+ param m: r22:r23
+ param h: r20:r21
+*/
+/* f2
+ param q: r24:r25
+ param m: r22:r23
+ param h: r20:r21
+*/
+q0 = 2 /* r2:r3 - pointer to q inside nextBlock */
+q1 = 3
+h0 = 4 /* r4:r5 - pointer to h inside f0 */
+h1 = 5
+m0 = 6 /* r6:r7 - pointer to m inside f0 */
+m1 = 7
+ctx0 = 2 /* r2:r3 - ctx argument of bmw_small_nextBlock_early */
+ctx1 = 3
+msg0 = 4 /* r4:r5 - block argument of bmw_small_nextBlock_early */
+msg1 = 5
+
+restore_f1: /* reload the expand1/expand2 arguments saved by f1:
+ X = r2:r3 (q), r22:r23 = r4:r5 (m), r20:r21 = r6:r7 (h) */
+ movw r26, r2
+ movw r22, r4
+ movw r20, r6
+ ret
+/*
+ bmw_small_nextBlock_early: internal entry that takes ctx from r2:r3 (ctx0)
+ and the block pointer from r4:r5 (msg0), then falls into the public entry.
+*/
+bmw_small_nextBlock_early:
+ movw r24, ctx0
+ movw r22, msg0
+/*
+ bmw_small_nextBlock / bmw224_nextBlock / bmw256_nextBlock
+ param ctx: r24:r25 (64 bytes of h followed by a 32-bit block counter)
+ param msg: r22:r23 (64-byte message block)
+ The three labels name the same code; all of them are exported so that any
+ of the aliases can be linked against.
+ This part saves r2..r17 and r28:r29, reserves q[0..31] on the stack,
+ increments the block counter and sets up X = h, Y = q, Z = m for f0.
+*/
+.global bmw_small_nextBlock
+.global bmw224_nextBlock
+.global bmw256_nextBlock
+bmw_small_nextBlock:
+bmw224_nextBlock:
+bmw256_nextBlock:
+ push_range 2, 7
+ push_range 28, 29
+ push_range 8, 17
+ stack_alloc_large 32*4, r28, r29 /* room for q[0..31]; q starts at Y+1 (see adiw below) */
+ /* Push the bytes 4f ff ff fb so that the little-endian word 0x4ffffffb
+    sits directly below q[0]; expand_intro adds 0x05555555 to it per call. */
+ ldi r16, 0x4f
+ push r16
+ ldi r16, 0xff
+ push r16
+ push r16
+ ldi r16, 0xfb
+ push r16
+ adiw r28, 1 /* Y = &q[0] */
+; push_range 28, 29 /* push Q */
+; push_range 22, 25 /* push M & H */
+ /* increment counter */
+ movw r26, r24
+ adiw r26, 63
+ adiw r26, 1 /* X = ctx + 64: the counter behind h[0..15] */
+ rcall load_acc_from_X
+ ldi r19, 1
+ add acc0, r19
+ adc acc1, r1
+ adc acc2, r1
+ adc acc3, r1
+ rcall store_acc_to_dec_X /* X is ctx + 68 here, so this rewrites the counter */
+ /* call f0 */
+ movw r30, r22
+ movw r26, r24
+/*
+ f0 stage of the compression function.
+ In:  X = h (ctx), Y = q (stack buffer), Z = m (message block).
+ Steps performed below:
+  1. h ^= m (64 bytes);
+  2. q[0..15] = 0;
+  3. for each f0_hacktable record (last to first) and i = 0..15:
+     q[i] += or -= the word at h + ((start + 4*i) & 0x3f), the sign being
+     taken from the record's 16-bit mask, MSB first (1 = subtract);
+  4. h ^= m again, which restores h;
+  5. q[i] = sn(q[i], i mod 5) + h[i+1] for i = 0..14 and
+     q[15] = sn(q[15], 0) + h[0].
+ Out: r20:r21 = h, r22:r23 = m, r2:r3 = Y = q, ready for f1.
+ Expects r1 to be zero.
+*/
+f0:
+ movw h0, r26
+ movw q0, r28
+ movw m0, r30
+ /* xor m into h */
+; ldi r20, 64
+ rcall memxor_64
+
+ /* set q to zero */
+ ldi r22, 64
+10: st Y+, r1
+ dec r22
+ brne 10b
+ movw r28, q0
+ /* calculate W and store it in Q */
+ ldi r19, 5 /* record counter: 5 down to 1 */
+30:
+ ldi r18, 16 /* words per record */
+ /* load initial index */
+
+ /* load values from hacktable */
+ ldi r30, lo8(f0_hacktable-3) /* -3 because r19 is one-based */
+ ldi r31, hi8(f0_hacktable-3)
+ mov r16, r19
+ lsl r16
+ add r16, r19 /* r16 = 3 * r19 */
+ add r30, r16
+ adc r31, r1
+ lpm r21, Z+ /* sign mask, high half */
+ lpm r20, Z+ /* sign mask, low half */
+ lpm r16, Z+ /* byte offset of the first word */
+40:
+ ;call add_hx_to_w
+add_hx_to_w:
+ movw r26, h0
+ add r26, r16
+ adc r27, r1
+ rcall load32_from_Y
+ sbiw r28, 4 /* stay on the same q word for the store below */
+ lsl r20 /* next mask bit into carry */
+ rol r21
+ brcs 300f
+ /* addition */
+ rcall add_X_to_32
+ rjmp 500f
+300: /* subtract */
+ rcall load_acc_from_X
+ sub r22, acc0
+ sbc r23, acc1
+ sbc r24, acc2
+ sbc r25, acc3
+
+500:
+ rcall store32_to_Y
+ subi r16, -4 /* next word, wrapping inside the 64 bytes */
+ andi r16, 0x0f<<2
+ dec r18
+ brne 40b
+ movw r28, q0
+ dec r19
+ brne 30b
+ /* xor m into h */
+; ldi r20, 64
+ movw r26, h0
+ movw r30, m0
+ rcall memxor_64
+ sbiw r26, 60 /* X = h + 4, i.e. h[1] */
+;---
+ clr r17 /* s_table entry, cycles 0..4 */
+ ldi r21, 15
+ mov r8, r21 /* loop counter for q[0..14] */
+50:
+ rcall load32_from_Y
+ sbiw r28, 4
+ mov r20, r17
+ rcall sn
+ inc r17
+ cpi r17, 5
+ brne 52f
+ clr r17
+52:
+ rcall add_X_to_32
+ rcall store32_to_Y
+
+ dec r8
+ brne 50b
+;---
+ rcall load32_from_Y /* q[15] */
+ clr r20
+ rcall sn
+ movw r26, h0
+ rcall add_X_to_32 /* + h[0] */
+ sbiw r28, 4
+ rcall store32_to_Y
+ sbiw r28, 4 /* Y back from q + 64 ... */
+ sbiw r28, 15*4 /* ... to q */
+ movw r20, h0
+ movw r22, m0
+
+ /* call f1*/
+ movw r2, r28
+f1: /* f1 stage. In: r2:r3 = q, r22:r23 = m, r20:r21 = h. Runs expand1 for
+ j = 0 and 1 and expand2 for j = 2..15. Out: r24:r25 = q, with r20:r21 = h
+ and r22:r23 = m reloaded by restore_f1. */
+ movw r4, r22 /* keep m and h where restore_f1 expects them */
+ movw r6, r20
+ movw r26, r2
+ clr r24
+ rcall expand1
+ rcall restore_f1
+ ldi r24, 1
+ rcall expand1
+ ldi r17, 2
+10: rcall restore_f1
+ mov r24, r17
+ rcall expand2
+ inc r17
+ sbrs r17, 4 /* leave the loop once r17 reaches 16 */
+ rjmp 10b
+ rcall restore_f1
+ movw r24, r2
+
+
+ /* call f2 */
+; pop_range 20, 25
+; push_range 20, 25
+; rcall printQ
+; push r20
+; push r21
+acc2 = 8
+acc3 = 9
+acc0 = 14
+acc1 = 15
+xl0 = 2
+xl1 = 3
+xl2 = 4
+xl3 = 5
+xh0 = 6
+xh1 = 7
+xh2 = 10
+xh3 = 11
+q16_0 = 12
+q16_1 = 13
+h0 = 18
+h1 = 19
+f2:
+ movw r26, r24
+ /* calc XL & XH */
+ adiw r26, 63
+ adiw r26, 1
+ movw q16_0, r26
+ movw h0, r20
+;---
+; push h0
+; push h1
+;---
+ movw r28, r22
+ rcall load_acc_from_X
+ ldi r17, 15
+10: rcall load32_from_X
+ rcall eor32_to_acc
+ cpi r17, 9
+ brne 15f
+ movw xl0, acc0
+ movw xl2, acc2
+15:
+ dec r17
+ brne 10b
+ movw xh0, acc0
+ movw xh2, acc2
+;--- DBG
+; push_range 22, 25
+; movw r22, xl0
+; movw r24, xl2
+; rcall print32
+; movw r22, xh0
+; movw r24, xh2
+; rcall print32
+; pop_range 22, 25
+;--- END DBG
+ /* copy m(Y) into h */
+ movw r26, h0
+ ldi r22, 64
+10:
+ ld r23, Y+
+ st X+, r23
+ dec r22
+ brne 10b
+;--- /* calc first half of h0..h15 */
+ movw r28, q16_0
+ movw r26, h0
+ ldi r30, lo8(f2_1_shift_table)
+ ldi r31, hi8(f2_1_shift_table)
+ ldi r17, 16
+10:
+;---
+ movw r22, xh0
+ movw r24, xh2
+ cpi r17, 9
+ brge 15f
+ clr r1
+ rjmp 26f
+15: lpm r20, Z+
+ mov r1, r20
+ andi r20, 0x0f
+ clt
+ cpi r17, 16
+ breq 20f
+ cpi r17, 11
+ brne 21f
+20: set
+21: brts 25f
+ rcall shiftright32
+ rjmp 26f
+25: rcall shiftleft32
+26: rcall mov32_to_acc
+;---
+ rcall load32_from_Y
+ mov r20, r1
+ clr r1
+ swap r20
+ andi r20, 0x0f
+ brts 27f
+ rcall shiftleft32
+ rjmp 28f
+27: rcall shiftright32
+28: rcall eor32_to_acc
+;---
+ rcall load32_from_X
+ rcall eor32_to_acc
+ rcall store_acc_to_dec_X
+ adiw r26, 4
+;---
+ dec r17
+ brne 10b
+;-----
+ sbiw r28, 4*8 /* Y points to q[24] */
+ movw r30, r28
+ sbiw r28, 63
+ sbiw r28, 33 /* Y points to q[0] */
+ movw r26, r28
+ ldi r20, 8*4
+ /* xor q[24..31] into q[0..7] */
+ rcall memxor
+ /* xor q[23] into q[8] */
+ sbiw r30, 9*4
+ ldi r20, 4
+ rcall memxor
+ /* xor q[16..22] into q[9..15] */
+ sbiw r30, 8*4
+ ldi r20, 7*4
+ rcall memxor
+
+ movw r26, h0
+ ldi r17, 15
+ ldi r30, lo8(f2_2_shift_table)
+ ldi r31, hi8(f2_2_shift_table)
+10: movw r22, xl0
+ movw r24, xl2
+ sbrc r17, 3
+ rjmp 20f
+ lpm r20, Z+
+ lsr r20
+ brcs 15f
+ rcall shiftright32
+ rjmp 20f
+15:
+ rcall shiftleft32
+20:
+ rcall mov32_to_acc
+ rcall load32_from_Y
+ rcall eor32_to_acc
+ rcall add_acc_to_X
+ dec r17
+ brpl 10b
+;-----
+ sbiw r26, 8*4 /* X points to h8 */
+ movw r28, r26
+ sbiw r28, 4*4 /* Y points to h4 */
+ ldi r17, 8
+ ldi r18, 9
+10:
+ rcall load32_from_Y
+ mov r20, r18
+ rcall rotateleft32
+ rcall mov32_to_acc
+ rcall add_acc_to_X
+ inc r18
+ cpi r17, 5
+ brne 20f
+ sbiw r28, 8*4
+20: dec r17
+ brne 10b
+
+exit:
+;--- DBG
+; pop r25
+; pop r24
+; ldi r22, 'H'
+; rcall printX
+;--- END DBG
+ stack_free_large3 32*4+4
+ pop_range 10, 17
+pop9:
+ pop_range 8, 9
+pop28:
+ pop_range 28, 29
+pop7:
+ pop_range 6, 7
+pop5:
+ pop_range 2, 5
+ ret
+
+/******************************************************************************/
+/* Register aliases used by load32_from_Z_stub and bmw_small_lastBlock below. */
+/* ctx0/ctx1: register pair r2:r3, loaded from the ctx argument in bmw_small_lastBlock */
+ctx0 = 2
+ctx1 = 3
+/* blc0/blc1: register pair r4:r5, pointer to the message block being processed */
+blc0 = 4
+blc1 = 5
+/* len0/len1: register pair r28:r29, remaining message length in bits */
+len0 = 28
+len1 = 29
+/* buf0/buf1: register pair r6:r7, pointer to the padding buffer on the stack */
+buf0 = 6
+buf1 = 7
+
+/*
+ load32_from_Z_stub
+ Fetches the four bytes stored at ctx+64 .. ctx+67 (the value the C
+ reference fragments in bmw_small_lastBlock call ctx->counter).
+ in: ctx0/ctx1 (r2:r3) = context pointer
+ out: r21 = byte at ctx+64, r22 = ctx+65, r23 = ctx+66, r24 = ctx+67
+ Z (r30:r31) = ctx + 60
+*/
+load32_from_Z_stub:
+ movw r30, ctx0
+ adiw r30, 60 /* Z = ctx + 60, so Z+4 .. Z+7 address ctx+64 .. ctx+67 */
+ ldd r24, Z+7
+ ldd r23, Z+6
+ ldd r22, Z+5
+ ldd r21, Z+4
+ ret
+
+/******************************************************************************/
+/*
+ bmw_small_lastBlock / bmw224_lastBlock / bmw256_lastBlock
+
+ Finishes a hash computation. Whole 512-bit blocks still contained in msg
+ are compressed directly; the remaining bits are copied into a 68-byte
+ buffer on the stack, padded with a single 1 bit and zeros, and completed
+ with the 64-bit message bit count before being compressed. A last call
+ to bmw_small_nextBlock uses the constant-filled stack buffer as context
+ and ctx as message; the first 64 bytes of the result are copied back
+ into ctx. The C fragments quoted below are the reference for each step.
+
+ param ctx: r24:r25
+ param msg: r22:r23
+ param len: r20:r21
+
+ r1 is used as a constant zero throughout (avr-gcc convention, assumed to
+ hold on entry). The routine leaves through pop28, which restores r28:r29
+ and r2..r7 and returns.
+ Only bmw_small_lastBlock and bmw256_lastBlock get a .global directive
+ in this section; bmw224_lastBlock is a plain label here.
+*/
+
+.global bmw_small_lastBlock
+.global bmw256_lastBlock
+bmw_small_lastBlock:
+bmw224_lastBlock:
+bmw256_lastBlock:
+/* while(length_b >= BMW_SMALL_BLOCKSIZE){
+ bmw_small_nextBlock(ctx, block);
+ length_b -= BMW_SMALL_BLOCKSIZE;
+ block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
+ }
+*/
+ push_range 2, 7
+ push_range 28, 29
+ movw ctx0, r24
+ movw blc0, r22
+ movw len0, r20
+1:
+ /* len >= 512 exactly when the high byte of len is >= 2 */
+ cpi len1, hi8(512)
+ brlo 2f
+ /* NOTE(review): bmw_small_nextBlock_early is defined outside this section;
+ it presumably takes the context in ctx0 and the block in blc0 - confirm */
+ rcall bmw_small_nextBlock_early
+ /* advance the block pointer by 64 bytes and the length by 512 bits */
+ ldi r24, 64
+ add blc0, r24
+ adc blc1, r1
+ subi len1, hi8(512)
+ rjmp 1b
+2:
+/* struct {
+ uint8_t buffer[64];
+ uint32_t ctr;
+ } pctx;
+*/
+ stack_alloc_large 68
+ /* NOTE(review): the macro is defined in avr-asm-macros.S; it presumably
+ leaves Z at the new stack pointer, so Z+1 is the first reserved byte */
+ adiw r30, 1
+ movw buf0, r30
+/* memset(pctx.buffer, 0, 64);
+ memcpy(pctx.buffer, block, (length_b+7)/8);
+ pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
+*/ movw r24, len0
+ /* r23 = number of zero bytes to write after the marker byte (63 - full bytes) */
+ ldi r23, 63
+ movw r26, blc0
+ /* r24 = len >> 3 = number of complete message bytes (len < 512 here) */
+ lsr r25
+ ror r24
+ lsr r24
+ lsr r24
+ /* Z flag of the last shift: nothing to copy when there is no complete byte */
+ breq 301f
+ sub r23, r24
+ /* copy (#r24) bytes to stack buffer */
+30: ld r20, X+
+ st Z+, r20
+ dec r24
+ brne 30b
+301: /* calculate the appended byte */
+ clr r20
+ mov r21, len0
+ /* r24 becomes 0x80 >> (len & 7), the position of the padding bit */
+ ldi r24, 0x80
+ andi r21, 0x07
+ breq 305f
+ /* a partial byte exists: fetch it so the padding bit is OR-ed into it */
+ ld r20, X+
+303:
+ lsr r24
+ dec r21
+ brne 303b
+305:
+ or r20, r24
+ st Z+, r20
+ /* fill the rest of the 64-byte buffer with zeros */
+ tst r23
+ breq 32f
+31: st Z+, r1
+ dec r23
+ brne 31b
+32:
+/* if(length_b+1>64*8-64){ ; = 64*7-1 = 447 max(length_b)=511
+ bmw_small_nextBlock(ctx, pctx.buffer);
+ memset(pctx.buffer, 0, 64-8);
+ ctx->counter -= 1;
+ }
+*/
+ /* len < 512 here, so len >= 448 exactly when len1 != 0 and len0 >= 192 */
+ tst len1
+ breq 400f
+ cpi len0, 192
+ brlo 400f
+ /* no room for the length field: compress the padded block first */
+ movw blc0, buf0
+ rcall bmw_small_nextBlock_early
+ /* clear the first 56 bytes of the buffer for the extra block */
+ movw r26, buf0
+ ldi r20, 64-8
+350:
+ st X+, r1
+ dec r20
+ brne 350b
+ /* r21..r24 = counter - 1; only this register copy is decremented,
+ nothing is written back to ctx here */
+ rcall load32_from_Z_stub
+ subi r21, 1
+ sbc r22, r1
+ sbc r23, r1
+ sbc r24, r1
+ rjmp 410f
+/* *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
+ bmw_small_nextBlock(ctx, pctx.buffer);
+*/
+400:
+ rcall load32_from_Z_stub
+410:
+ /* Build counter*512 + len in r20..r25 (r20 = lowest byte): the counter is
+ shifted left by one bit into r21..r25, which together with the byte
+ offset of r21 gives the factor 512.
+ NOTE(review): rol32 is defined outside this section; it presumably
+ rotates r22..r25 left through carry r20 times - confirm */
+ clr r25
+ ldi r20, 1
+ lsl r21
+ rcall rol32
+ /* add the 16-bit remaining length: low byte goes to r20, high byte is added to r21 */
+ mov r20, len0
+ add r21, len1
+ adc r22, r1
+ adc r23, r1
+ adc r24, r1
+ adc r25, r1
+ /* store the 64-bit value into the last 8 bytes of the buffer; the two
+ top bytes are always zero */
+ movw r26, buf0
+ adiw r26, 64-8
+ st X+, r20
+ st X+, r21
+ rcall store32_to_X
+ st X+, r1
+ st X+, r1
+ movw blc0, buf0
+ rcall bmw_small_nextBlock_early
+/* memset(pctx.buffer, 0xaa, 64);
+ for(i=0; i<16;++i){
+ pctx.buffer[i*4] = i+0xa0;
+ }
+*/
+ ldi r22, 0xa0
+ ldi r23, 0xaa
+ ldi r24, 0xaa
+ ldi r25, 0xaa
+ movw r26, buf0
+500:
+ /* NOTE(review): store32_to_X is defined outside this section; it presumably
+ stores r22..r25 at X and advances X by 4 - confirm */
+ rcall store32_to_X
+ /* r22 runs from 0xa0 to 0xaf; bit 4 becomes set at 0xb0, after 16 words */
+ inc r22
+ sbrs r22, 4
+ rjmp 500b
+/* bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
+ memcpy(ctx->h, pctx.buffer, 64);
+*/
+ /* roles are swapped for the final call: the stack buffer is the context, ctx is the message */
+ movw r24, buf0
+ movw r22, ctx0
+ rcall bmw_small_nextBlock
+ /* copy the 64 result bytes from the stack buffer back into ctx */
+ ldi r18, 64
+ movw r26, ctx0
+ movw r30, buf0
+600:
+ ld r20, Z+
+ st X+, r20
+ dec r18
+ brne 600b
+
+ stack_free_large 68
+ /* shared epilogue: pop28 restores r28:r29, r6:r7, r2..r5 and returns */
+ rjmp pop28
+
+
+/*******************************************************************************
+* void bmw256_ctx2hash(void* dest, const bmw256_ctx_t* ctx)
+*
+* Copies the 32 bytes that start at ctx+32 to dest, which corresponds to
+* memcpy(dest, &(ctx->h[8]), 256/8);
+*
+* param dest: r24:r25
+* param ctx: r22:r23
+*
+* Overwrites r18, r23, X (r26:r27), Z (r30:r31) and the flags.
+*/
+.global bmw256_ctx2hash
+bmw256_ctx2hash:
+ movw r26, r24 /* X = dest */
+ movw r30, r22 /* Z = ctx */
+ adiw r30, 8*4 /* Z = ctx + 32 */
+ ldi r18, 32 /* number of bytes left to copy */
+1: ld r23, Z+
+ st X+, r23
+ dec r18
+ brne 1b
+ ret
+
+/*******************************************************************************
+* void bmw256(void* dest, const void* msg, uint32_t length_b){
+* bmw_small_ctx_t ctx;
+* bmw256_init(&ctx);
+* while(length_b>=BMW_SMALL_BLOCKSIZE){
+* bmw_small_nextBlock(&ctx, msg);
+* length_b -= BMW_SMALL_BLOCKSIZE;
+* msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B;
+* }
+* bmw_small_lastBlock(&ctx, msg, length_b);
+* bmw256_ctx2hash(dest, &ctx);
+* }
+*
+* param dest: r24:r25
+* param msg: r22:r23
+* param length_b: r18:r21
+*/
+/* NOTE: every alias in this group is assigned again further down, before any
+ instruction uses it, so the values given here never take effect. */
+ctx0 = 2
+ctx1 = 3
+msg0 = 4
+msg1 = 5
+len0 = 6
+len1 = 7
+len2 = 8
+len3 = 9
+dst0 = 10
+dst1 = 11
+
+
+/*******************************************************************************
+* void bmw224(void* dest, const void* msg, uint32_t length_b){
+* bmw_small_ctx_t ctx;
+* bmw224_init(&ctx);
+* while(length_b>=BMW_SMALL_BLOCKSIZE){
+* bmw_small_nextBlock(&ctx, msg);
+* length_b -= BMW_SMALL_BLOCKSIZE;
+* msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B;
+* }
+* bmw_small_lastBlock(&ctx, msg, length_b);
+* bmw224_ctx2hash(dest, &ctx);
+* }
+*
+* param dest: r24:r25
+* param msg: r22:r23
+* param length_b: r18:r21
+*/
+/* Register aliases in effect for bmw256 below. */
+/* ctx0/ctx1: r2:r3, pointer to the hash context on the stack */
+ctx0 = 2
+ctx1 = 3
+/* msg0/msg1: r4:r5, pointer to the next message block */
+msg0 = 4
+msg1 = 5
+/* len0..len3: r28, r29, r8, r9 = length_b, lowest byte first */
+len0 = 28
+len1 = 29
+len2 = 8
+len3 = 9
+/* dst0/dst1: r6:r7, destination pointer for the digest */
+dst0 = 6
+dst1 = 7
+
+
+/*
+ void bmw256(void* dest, const void* msg, uint32_t length_b)
+
+ One-shot hash: reserves a 68-byte context on the stack, initialises it
+ with bmw256_init, feeds the message and copies the digest to dest with
+ bmw256_ctx2hash.
+
+ param dest: r24:r25
+ param msg: r22:r23
+ param length_b: r18:r21
+
+ r1 is used as a constant zero (avr-gcc convention, assumed to hold on
+ entry). The routine leaves through pop9, which restores r8:r9, r28:r29
+ and r2..r7 and returns.
+*/
+.global bmw256
+bmw256:
+ push_range 2, 7
+ push_range 28, 29
+ push_range 8, 9
+ stack_alloc_large 64+4
+ /* Z is assumed to hold the new stack pointer here. The AVR stack pointer
+ addresses the next free byte, so the reserved area starts one byte
+ above it (same adjustment as in bmw_small_lastBlock). Without it the
+ first context byte would be overwritten by the return address of the
+ next rcall. */
+ adiw r30, 1
+10: movw ctx0, r30
+ movw dst0, r24
+ movw msg0, r22
+ movw len0, r18
+ movw len2, r20
+ movw r24, ctx0
+ rcall bmw256_init
+20:
+ /* bmw_small_lastBlock only takes a 16-bit length, so whole blocks are
+ consumed here for as long as the upper 16 bits are non-zero */
+ mov r18, len2
+ or r18, len3
+ breq 50f
+ /* NOTE(review): bmw_small_nextBlock_early is defined outside this section;
+ it presumably takes the context in r2:r3 and the block in r4:r5 - confirm */
+ rcall bmw_small_nextBlock_early
+ /* length_b -= 512: subtract 2 from byte 1 and propagate the borrow */
+ subi len1, 2
+ sbc len2, r1
+ sbc len3, r1
+ /* msg += 64 */
+ ldi r20, 64
+ add msg0, r20
+ adc msg1, r1
+ rjmp 20b
+50:
+ /* the remaining length fits in 16 bits: hand over to bmw_small_lastBlock */
+ movw r24, ctx0
+ movw r22, msg0
+ movw r20, len0
+ rcall bmw_small_lastBlock
+ /* copy the digest out of the context before the stack area is released */
+ movw r24, dst0
+ movw r22, ctx0
+ rcall bmw256_ctx2hash
+ stack_free_large 64+4
+ rjmp pop9
+
+/******************************************************************************/
+/*
+ bmw256_init
+ Fills the first 64 bytes of the context with the byte values 0x40..0x7f.
+ Each group of four consecutive values is written from the highest
+ address of a 4-byte word downwards, so word i holds 0x43+4i, 0x42+4i,
+ 0x41+4i, 0x40+4i at ascending addresses. The four bytes at ctx+64 ..
+ ctx+67 are then cleared.
+
+ param ctx: r24:r25
+
+ r1 is used as a constant zero. Overwrites r20, r22, r23, X (r26:r27) and
+ the flags.
+*/
+.global bmw256_init
+bmw256_init:
+ movw r26, r24
+ adiw r26, 4 /* X = one past the first word */
+ ldi r22, 0x40 /* next value to store */
+ ldi r23, 0x80 /* value at which filling stops */
+1:
+ st -X, r22
+ inc r22
+ mov r20, r22
+ andi r20, 0x3 /* word complete when the low two bits wrap to 0 */
+ brne 1b
+ adiw r26, 8 /* X = one past the next word */
+ cp r22, r23
+ brne 1b
+ /* X is now ctx + 68: clear ctx+67 down to ctx+64 */
+ st -X, r1
+ st -X, r1
+ st -X, r1
+ st -X, r1
+ ret
+
+
+/******************************************************************************/
+
+#if DEBUG
+
+/*
+ printQ (debug helper, only assembled when DEBUG is non-zero)
+ Prints qdbg_str, then for each index 0..31 the strings qdbg_str1, the
+ index via cli_hexdump_byte, qdbg_str2 and a cli_hexdump_rev call for the
+ 4 bytes at (r24:r25 + 4*index).
+ param: r24:r25 = address of the first 32-bit word
+ Saves and restores r20..r25 only; r8, r9, r16 and r17 are overwritten.
+ NOTE(review): the cli_* routines are external; their argument
+ conventions and clobbers are not visible here.
+*/
+printQ:
+ push_range 20, 25
+ /* r9 = 4, the address step per word */
+ ldi r16, 4
+ mov r9, r16
+ /* r16:r17 = running word address */
+ movw r16, r24
+ ldi r24, lo8(qdbg_str)
+ ldi r25, hi8(qdbg_str)
+ call cli_putstr_P
+ /* r8 = word index */
+ clr r8
+10: ldi r24, lo8(qdbg_str1)
+ ldi r25, hi8(qdbg_str1)
+ call cli_putstr_P
+ mov r24, r8
+ call cli_hexdump_byte
+ ldi r24, lo8(qdbg_str2)
+ ldi r25, hi8(qdbg_str2)
+ call cli_putstr_P
+ /* dump 4 bytes: pointer in r24:r25, count 4 in r22:r23 */
+ movw r24, r16
+ clr r23
+ ldi r22, 4
+ call cli_hexdump_rev
+ add r16, r9
+ adc r17, r1
+ /* loop ends when the index reaches 32 (bit 5 set) */
+ inc r8
+ sbrs r8, 5
+ rjmp 10b
+ pop_range 20, 25
+ ret
+qdbg_str: .asciz "\r\nDBG Q: "
+qdbg_str1: .asciz "\r\n Q["
+qdbg_str2: .asciz "] = "
+
+
+/*
+ printX (debug helper, only assembled when DEBUG is non-zero)
+ Prints Xdbg_str, the character passed in r22 and a colon, then for each
+ index 0..15 the strings Xdbg_str1, the character, an opening bracket,
+ the index via cli_hexdump_byte, Xdbg_str2 and a cli_hexdump_rev call for
+ the 4 bytes at (r24:r25 + 4*index).
+ param: r24:r25 = address of the first 32-bit word
+ param: r22 = character used as the name of the array
+ Saves and restores r6..r9, r16..r27 and r30:r31.
+ NOTE(review): the cli_* routines are external; their argument
+ conventions and clobbers are not visible here.
+*/
+printX:
+ push_range 6, 9
+ push_range 16, 27
+ push_range 30, 31
+ /* r6 = name character, r9 = 4 (address step), r16:r17 = running word address */
+ ldi r16, 4
+ mov r6, r22
+ mov r9, r16
+ movw r16, r24
+ ldi r24, lo8(Xdbg_str)
+ ldi r25, hi8(Xdbg_str)
+ call cli_putstr_P
+ mov r24, r6
+ call cli_putc
+ ldi r24, ':'
+ call cli_putc
+ /* r8 = word index */
+ clr r8
+10: ldi r24, lo8(Xdbg_str1)
+ ldi r25, hi8(Xdbg_str1)
+ call cli_putstr_P
+ mov r24, r6
+ call cli_putc
+ ldi r24, '['
+ call cli_putc
+ mov r24, r8
+ call cli_hexdump_byte
+ ldi r24, lo8(Xdbg_str2)
+ ldi r25, hi8(Xdbg_str2)
+ call cli_putstr_P
+ /* dump 4 bytes: pointer in r24:r25, count 4 in r22:r23 */
+ movw r24, r16
+ clr r23
+ ldi r22, 4
+ call cli_hexdump_rev
+ add r16, r9
+ adc r17, r1
+ /* loop ends when the index reaches 16 (bit 4 set) */
+ inc r8
+ sbrs r8, 4
+ rjmp 10b
+ pop_range 30, 31
+ pop_range 16, 27
+ pop_range 6, 9
+ ret
+Xdbg_str: .asciz "\r\nDBG "
+Xdbg_str1: .asciz "\r\n "
+Xdbg_str2: .asciz "] = "
+
+/*
+ print32 (debug helper, only assembled when DEBUG is non-zero)
+ Prints Xdbg_str followed by the four bytes of the value in r22..r25,
+ passing r25 first and r22 last to cli_hexdump_byte.
+ param: r22:r23:r24:r25 = 32-bit value
+ Saves and restores r6..r9, r16..r27 and r30:r31.
+ NOTE(review): the cli_* routines are external; their argument
+ conventions and clobbers are not visible here.
+*/
+print32:
+ push_range 6, 9
+ push_range 16, 27
+ push_range 30, 31
+ /* keep the value in r6..r9 while r24:r25 carry call arguments */
+ movw r6, r22
+ movw r8, r24
+ ldi r24, lo8(Xdbg_str)
+ ldi r25, hi8(Xdbg_str)
+ call cli_putstr_P
+ mov r24, r9
+ call cli_hexdump_byte
+ mov r24, r8
+ call cli_hexdump_byte
+ mov r24, r7
+ call cli_hexdump_byte
+ mov r24, r6
+ call cli_hexdump_byte
+ pop_range 30, 31
+ pop_range 16, 27
+ pop_range 6, 9
+ ret
+
+
+/*
+ print_acc (debug helper, only assembled when DEBUG is non-zero)
+ Prints Xdbg_str followed by the registers r9, r8, r15 and r14, in that
+ order, via cli_hexdump_byte. These are the registers the file names
+ acc3, acc2, acc1 and acc0.
+ Saves and restores r16..r27 and r30:r31.
+ NOTE(review): the cli_* routines are external; this code relies on them
+ leaving r8, r9, r14 and r15 unchanged - confirm.
+*/
+print_acc:
+ push_range 16, 27
+ push_range 30, 31
+ ldi r24, lo8(Xdbg_str)
+ ldi r25, hi8(Xdbg_str)
+ call cli_putstr_P
+ mov r24, r9
+ call cli_hexdump_byte
+ mov r24, r8
+ call cli_hexdump_byte
+ mov r24, r15
+ call cli_hexdump_byte
+ mov r24, r14
+ call cli_hexdump_byte
+ pop_range 30, 31
+ pop_range 16, 27
+ ret
+
+#endif
+