+
+/******************************************************************************/
+/*
+ param q: r24:r25
+ param m: r22:r23
+ param h: r20:r21
+*/
+
+/******************************************************************************/
+/* bmw_small_nextBlock (aliases: bmw224_nextBlock, bmw256_nextBlock)
+ param ctx: r24:r25
+ param msg: r22:r23
+*/
+/* f0
+ param q: r28:r29 (Y)
+ param h: r26:r27 (X)
+ param m: r30:r31 (Z)
+*/
+/* f1
+ param q: r24:r25
+ param m: r22:r23
+ param h: r20:r21
+*/
+/* f2
+ param q: r24:r25
+ param m: r22:r23
+ param h: r20:r21
+*/
+q0 = 2 /* r2: low register of pair r2:r3; f0 saves the Q pointer (Y) here with "movw q0, r28" */
+q1 = 3 /* r3: high register of the r2:r3 pair */
+h0 = 4 /* r4: low register of pair r4:r5; f0 saves the H pointer (X) here with "movw h0, r26" */
+h1 = 5 /* r5: high register of the r4:r5 pair */
+m0 = 6 /* r6: low register of pair r6:r7; f0 saves the M pointer (Z) here with "movw m0, r30" */
+m1 = 7 /* r7: high register of the r6:r7 pair */
+
+
+.global bmw_small_nextBlock
+.global bmw224_nextBlock
+.global bmw256_nextBlock
+bmw_small_nextBlock:
+bmw224_nextBlock:
+bmw256_nextBlock:
+ push_range 28, 29
+ push_range 2, 17
+ stack_alloc_large 32*4, r28, r29
+ ldi r16, 0x4f
+ push r16
+ ldi r16, 0xff
+ push r16
+ push r16
+ ldi r16, 0xfb
+ push r16
+ adiw r28, 1
+; push_range 28, 29 /* push Q */
+; push_range 22, 25 /* push M & H */
+ /* increment counter */
+ movw r26, r24
+ movw r2, r26
+ adiw r26, 63
+ adiw r26, 1
+ rcall load_acc_from_X
+ ldi r19, 1
+ add acc0, r19
+ adc acc1, r1
+ adc acc2, r1
+ adc acc3, r1
+ rcall store_acc_to_dec_X
+ /* call f0 */
+ movw r30, r22
+ movw r26, r24
+f0:
+ movw h0, r26
+ movw q0, r28
+ movw m0, r30
+ /* xor m into h */
+; ldi r20, 64
+ rcall memxor_short
+ movw r30, m0
+ movw r26, h0
+
+ /* set q to zero */
+ ldi r22, 64
+10: st Y+, r1
+ dec r22
+ brne 10b
+ movw r28, q0
+ /* calculate W and store it in Q */
+ ldi r19, 5
+30:
+ ldi r18, 16
+ /* load initial index */
+
+ /* load values from hacktable */
+ ldi r30, lo8(f0_hacktable-3)
+ ldi r31, hi8(f0_hacktable-3)
+ mov r16, r19
+ lsl r16
+ add r16, r19
+ add r30, r16
+ adc r31, r1
+ lpm r21, Z+
+ lpm r20, Z+
+ lpm r16, Z+
+40:
+ ;call add_hx_to_w
+add_hx_to_w:
+ movw r26, h0
+ add r26, r16
+ adc r27, r1
+ rcall load32_from_Y
+ sbiw r28, 4
+ lsl r20
+ rol r21
+ brcs 300f
+ /* addition */
+ rcall add_X_to_32
+ rjmp 500f
+300: /* substract */
+ rcall load_acc_from_X
+ sub r22, acc0
+ sbc r23, acc1
+ sbc r24, acc2
+ sbc r25, acc3
+
+500:
+ rcall store32_to_Y
+ subi r16, -4
+ andi r16, 0x0f<<2
+ dec r18
+ brne 40b
+ movw r28, q0
+ dec r19
+ brne 30b
+ movw r26, h0
+ /* xor m into h */
+; ldi r20, 64
+ movw r26, h0
+ movw r30, m0
+ rcall memxor_short
+ sbiw r26, 60
+;---
+ clr r17
+ ldi r21, 15
+ mov r8, r21
+50:
+ rcall load32_from_Y
+ sbiw r28, 4
+ mov r20, r17
+ rcall sn
+ inc r17
+ cpi r17, 5
+ brne 52f
+ clr r17
+52:
+ rcall add_X_to_32
+ rcall store32_to_Y
+
+ dec r8
+ brne 50b
+;---
+ rcall load32_from_Y
+ clr r20
+ rcall sn
+ movw r26, h0
+ rcall add_X_to_32
+ sbiw r26, 4
+ sbiw r28, 4
+ rcall store32_to_Y
+ sbiw r28, 4
+ sbiw r28, 15*4
+ movw r20, h0
+ movw r22, m0
+
+ /* call f1*/
+ movw r2, r28