--- /dev/null
+/* bmw_small-asm.S */
+/*
+ This file is part of the AVR-Crypto-Lib.
+ Copyright (C) 2009 Daniel Otte (daniel.otte@rub.de)
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ * File: bmw_small-asm.S
+ * Author: Daniel Otte
+ * Date: 2009-11-13
+ * License: GPLv3 or later
+ * Description: implementation of BlueMidnightWish
+ *
+ */
+
+#include "avr-asm-macros.S"
+
+/*
+ * Shift codes read with lpm by rotl_addel (base shiftcodetable) and by
+ * rotl32p9 (base shiftcodetable_9). Entry n describes how to build a left
+ * rotation by n positions out of byte moves plus at most 4 bit rotations:
+ *   bit 4     : rotate left by 8 first (byte move)
+ *   bit 5     : rotate left by 16 first (word swap)
+ *   bit 3     : direction of the remaining bit rotations, 1 = right, 0 = left
+ *   bits 2..0 : number of remaining single-bit rotations (0 = none)
+ * Example: entry 5 is 0x1B = rotate left by 8, then right by 3.
+ */
+shiftcodetable:
+ .byte 0x00 ; 0
+ .byte 0x01 ; 1
+ .byte 0x02 ; 2
+ .byte 0x03 ; 3
+ .byte 0x04 ; 4
+ .byte 0x1B ; 5
+ .byte 0x1A ; 6
+ .byte 0x19 ; 7
+ .byte 0x10 ; 8
+shiftcodetable_9: ; same table, entered at index 9 (rotl32p9 adds 0..7)
+ .byte 0x11 ; 9
+ .byte 0x12 ; 10
+ .byte 0x13 ; 11
+ .byte 0x2C ; 12
+ .byte 0x2B ; 13
+ .byte 0x2A ; 14
+ .byte 0x29 ; 15
+ .byte 0x20 ; 16
+ .byte 0x21 ; 17 unused but necessary for padding
+
+
+
+/*******************************************************************************
+ * shiftl32
+ * value: r25:r22
+ * shift: r20
+ */
+shiftl32:
+ /* One pass shifts r25:r22 left by one bit; r20 counts the passes that are
+    left. r20 must be non-zero on entry: dec would wrap 0 to 255 and the
+    loop would then run 256 times. r20 is 0 on return. */
+ lsl r22 ; bit 0 becomes 0, old bit 7 goes to carry
+ rol r23
+ rol r24
+ rol r25 ; old bit 31 is dropped
+ dec r20
+ brne shiftl32
+ ret
+
+/*******************************************************************************
+ * shiftr32
+ * value: r25:r22
+ * shift: r20
+ */
+shiftr32:
+ /* One pass shifts r25:r22 right (logically) by one bit; r20 counts the
+    passes that are left. r20 must be non-zero on entry: dec would wrap 0 to
+    255 and the loop would then run 256 times. r20 is 0 on return. */
+ lsr r25 ; bit 31 becomes 0, old bit 24 goes to carry
+ ror r24
+ ror r23
+ ror r22 ; old bit 0 is dropped
+ dec r20
+ brne shiftr32
+ ret
+
+/*******************************************************************************
+ * rotl32
+ * value: r25:r22
+ * shift: r20
+ */
+rotl32:
+ /* Rotates r25:r22 left by r20 bits. r21 is a scratch copy of the top byte;
+    each pass shifts its top bit into carry so that it can re-enter at bit 0.
+    Because r21 holds only 8 bits this is a rotation for r20 = 1..8 only.
+    Clobbers r20 (0 on return) and r21. */
+ mov r21, r25
+.Lrotl32_bit:
+ lsl r21 ; carry = bit that is about to leave at the top
+ rol r22 ; ... and comes back in at the bottom
+ rol r23
+ rol r24
+ rol r25
+ dec r20
+ brne .Lrotl32_bit
+ ret
+
+/*******************************************************************************
+ * rotr32
+ * value: r25:r22
+ * shift: r20
+ */
+rotr32:
+ /* Rotates r25:r22 right by r20 bits. r21 is a scratch copy of the low byte;
+    each pass shifts its bottom bit into carry so that it can re-enter at
+    bit 31. Because r21 holds only 8 bits this is a rotation for r20 = 1..8
+    only. Clobbers r20 (0 on return) and r21. */
+ mov r21, r22
+.Lrotr32_bit:
+ lsr r21 ; carry = bit that is about to leave at the bottom
+ ror r25 ; ... and comes back in at the top
+ ror r24
+ ror r23
+ ror r22
+ dec r20
+ brne .Lrotr32_bit
+some_ret: ; bare ret, also used as branch target by rotl32p9
+ ret
+
+/*******************************************************************************
+ * rotl32p9
+ * value: r25:r22
+ * shift: r20
+ */
+rotl32p9:
+ /* Rotates r25:r22 left by the amount stored at shiftcodetable_9[r20].
+    Expects r1 = 0 on entry (used as carry-only addend). Z is preserved;
+    r0, r20, r21 and the T flag are clobbered, r1 is 0 again on return. */
+ push_range 30, 31
+ ldi r30, lo8(shiftcodetable_9)
+ ldi r31, hi8(shiftcodetable_9)
+ add r30, r20
+ adc r31, r1
+ lpm r20, Z ; r20 = shift code
+ pop_range 30, 31
+ sbrs r20, 4 ; code bit 4: rotate left by one byte
+ rjmp .Lp9_skip_rot8
+ mov r0, r25
+ mov r25, r24
+ mov r24, r23
+ mov r23, r22
+ mov r22, r0
+.Lp9_skip_rot8:
+ sbrs r20, 5 ; code bit 5: swap the two 16-bit halves
+ rjmp .Lp9_skip_rot16
+ movw r0, r24
+ movw r24, r22
+ movw r22, r0
+ clr r1 ; r1 was used as scratch by the movw above
+.Lp9_skip_rot16:
+ bst r20, 3 ; T = direction of the remaining bit rotations
+ andi r20, 0x07 ; r20 = number of remaining bit rotations
+ breq some_ret ; nothing left: plain ret
+ brts rotr32 ; tail call, rotate right
+ rjmp rotl32 ; tail call, rotate left
+
+
+/*******************************************************************************
+* uint32_t rotl_addel(uint32_t x, uint8_t v){
+* uint32_t r;
+* r = ROTL32(x, (v&0xf)+1);
+* return r;
+* }
+* param x: r25:r22
+* param v: r20
+*/
+.global rotl_addel
+rotl_addel:
+ /* Expects r1 = 0. Clobbers r20, r21, r30, r31 and the T flag. */
+ andi r20, 0x0f
+ inc r20 ; r20 = (v & 0xf) + 1, i.e. 1..16
+ ldi r30, lo8(shiftcodetable)
+ ldi r31, hi8(shiftcodetable)
+ add r30, r20
+ adc r31, r1
+ lpm r20, Z ; r20 = shift code for that rotation amount
+ sbrs r20, 4 ; code bit 4: rotate left by one byte
+ rjmp .Laddel_skip_rot8
+ mov r21, r25
+ mov r25, r24
+ mov r24, r23
+ mov r23, r22
+ mov r22, r21
+.Laddel_skip_rot8:
+ sbrs r20, 5 ; code bit 5: swap the two 16-bit halves
+ rjmp .Laddel_skip_rot16
+ movw r30, r24
+ movw r24, r22
+ movw r22, r30
+.Laddel_skip_rot16:
+ bst r20, 3 ; T = direction of the remaining bit rotations
+ andi r20, 0x07 ; r20 = number of remaining bit rotations
+ brne .Laddel_bit_rotate
+ ret
+.Laddel_bit_rotate:
+ brts rotr32 ; tail call, rotate right
+ rjmp rotl32 ; tail call, rotate left
+
+/******************************************************************************/
+
+/* Register aliases shared by bmw_small_s0 .. bmw_small_s5.
+ * preg is the 32-bit argument/result register group r25:r22.
+ * breg overlaps X (r27:r26) and r19:r18.
+ * areg overlaps r1:r0 and Z (r31:r30); since r1 is used as data here, the
+ * common exit path (outro_2) clears it again before returning. */
+preg0 = 22 /* preg for processing register */
+preg1 = 23
+preg2 = 24
+preg3 = 25
+breg0 = 26 /* breg for backup register */
+breg1 = 27
+breg2 = 18
+breg3 = 19
+areg0 = 0 /* areg for accumulator register */
+areg1 = 1
+areg2 = 30
+areg3 = 31
+
+/*******************************************************************************
+* uint32_t bmw_small_s0(uint32_t x){
+* uint32_t r;
+* r = SHR32(x, 1)
+* ^ SHL32(x, 3)
+* ^ ROTL32(x, 4)
+* ^ ROTR32(x, 13);
+* return r;
+* }
+*/
+.global bmw_small_s0
+bmw_small_s0:
+ /* in/out: preg (r25:r22). Clobbers breg (r27:r26, r19:r18), areg (r31:r30,
+    r1:r0), r20 and, through rotl32/rotr32, r21. r1 is 0 again on return. */
+ movw breg0, preg0 ; breg = x (backup)
+ movw breg2, preg2
+ ldi r20, 1
+ rcall shiftr32
+ movw areg2, preg2 ; areg = SHR32(x, 1)
+ movw areg0, preg0
+ movw preg2, breg2 ; preg = x
+ movw preg0, breg0
+ ldi r20, 3
+ rcall shiftl32
+ eor areg0, preg0 ; areg ^= SHL32(x, 3)
+ eor areg1, preg1
+ eor areg2, preg2
+ eor areg3, preg3
+ movw preg2, breg2 ; preg = x
+ movw preg0, breg0
+ ldi r20, 4
+ rcall rotl32
+ eor areg0, preg0 ; areg ^= ROTL32(x, 4)
+ eor areg1, preg1
+ eor areg2, preg2
+ eor areg3, preg3
+ /* now the trick, we simply can rotate the old value to the right by 17:
+    preg still holds ROTL32(x, 4), and 4 - 17 = -13, i.e. ROTR32(x, 13) */
+ movw breg0, preg0 /* first rotate by 16 */
+ movw preg0, preg2
+ movw preg2, breg0
+outro_1: ; shared tail, also entered from bmw_small_s1: preg = ROTR32(preg, 1)
+ ldi r20, 1
+ rcall rotr32
+outro_2: ; shared tail of s0..s5: result = preg ^ areg, restore r1 = 0
+ eor preg0, areg0
+ eor preg1, areg1
+ eor preg2, areg2
+ eor preg3, areg3
+ clr r1
+ ret
+
+/*******************************************************************************
+* uint32_t bmw_small_s1(uint32_t x){
+* uint32_t r;
+* r = SHR32(x, 1)
+* ^ SHL32(x, 2)
+* ^ ROTL32(x, 8)
+* ^ ROTR32(x, 9);
+* return r;
+* }
+*/
+.global bmw_small_s1
+bmw_small_s1:
+ /* in/out: preg (r25:r22). Clobbers breg, areg, r20, r21; the shared tail
+    at outro_1/outro_2 finishes the computation and clears r1. */
+ movw breg0, preg0 ; breg = x (backup)
+ movw breg2, preg2
+ ldi r20, 1
+ rcall shiftr32
+ movw areg2, preg2 ; areg = SHR32(x, 1)
+ movw areg0, preg0
+ movw preg2, breg2 ; preg = x
+ movw preg0, breg0
+ ldi r20, 2
+ rcall shiftl32
+ eor areg0, preg0 ; areg ^= SHL32(x, 2)
+ eor areg1, preg1
+ eor areg2, preg2
+ eor areg3, preg3
+ eor areg0, breg3 ; areg ^= ROTL32(x, 8), done as a byte permutation of x
+ eor areg1, breg0
+ eor areg2, breg1
+ eor areg3, breg2
+ mov preg0, breg1 ; preg = ROTR32(x, 8), byte permutation again
+ mov preg1, breg2
+ mov preg2, breg3
+ mov preg3, breg0
+ rjmp outro_1 ; one more bit to the right gives ROTR32(x, 9), then xor areg
+
+/*******************************************************************************
+* uint32_t bmw_small_s2(uint32_t x){
+* uint32_t r;
+* r = SHR32(x, 2)
+* ^ SHL32(x, 1)
+* ^ ROTL32(x, 12)
+* ^ ROTR32(x, 7);
+* return r;
+* }
+*/
+.global bmw_small_s2
+bmw_small_s2:
+ /* in/out: preg (r25:r22). Clobbers breg, areg, r20, r21; outro_2 does the
+    final xor and clears r1. */
+ movw breg0, preg0 ; breg = x (backup)
+ movw breg2, preg2
+ ldi r20, 2
+ rcall shiftr32
+ movw areg2, preg2 ; areg = SHR32(x, 2)
+ movw areg0, preg0
+ movw preg2, breg2 ; preg = x
+ movw preg0, breg0
+ ldi r20, 1
+ rcall shiftl32
+ eor areg0, preg0 ; areg ^= SHL32(x, 1)
+ eor areg1, preg1
+ eor areg2, preg2
+ eor areg3, preg3
+ movw preg0, breg2 ; preg = x with halves swapped (rotation by 16)
+ movw preg2, breg0
+ ldi r20, 4
+ rcall rotr32 ; 16 + 4 to the right = ROTL32(x, 12)
+ eor areg0, preg0 ; areg ^= ROTL32(x, 12)
+ eor areg1, preg1
+ eor areg2, preg2
+ eor areg3, preg3
+ mov preg0, breg1 ; preg = ROTR32(x, 8) as a byte permutation
+ mov preg1, breg2
+ mov preg2, breg3
+ mov preg3, breg0
+ ldi r20, 1
+ rcall rotl32 ; 8 right, 1 left = ROTR32(x, 7)
+ rjmp outro_2
+
+/*******************************************************************************
+* uint32_t bmw_small_s3(uint32_t x){
+* uint32_t r;
+* r = SHR32(x, 2)
+* ^ SHL32(x, 2)
+* ^ ROTL32(x, 15)
+* ^ ROTR32(x, 3);
+* return r;
+* }
+*/
+.global bmw_small_s3
+bmw_small_s3:
+ /* in/out: preg (r25:r22). Clobbers breg, areg, r20, r21; outro_2 does the
+    final xor and clears r1. */
+ movw breg0, preg0 ; breg = x (backup)
+ movw breg2, preg2
+ ldi r20, 2
+ rcall shiftr32
+ movw areg2, preg2 ; areg = SHR32(x, 2)
+ movw areg0, preg0
+ movw preg2, breg2 ; preg = x
+ movw preg0, breg0
+ ldi r20, 2
+ rcall shiftl32
+ eor areg0, preg0 ; areg ^= SHL32(x, 2)
+ eor areg1, preg1
+ eor areg2, preg2
+ eor areg3, preg3
+ movw preg0, breg2 ; preg = x with halves swapped (rotation by 16)
+ movw preg2, breg0
+ ldi r20, 1
+ rcall rotr32 ; 16 + 1 to the right = ROTL32(x, 15)
+ eor areg0, preg0 ; areg ^= ROTL32(x, 15)
+ eor areg1, preg1
+ eor areg2, preg2
+ eor areg3, preg3
+ movw preg0, breg0 ; preg = x
+ movw preg2, breg2
+ ldi r20, 3
+ rcall rotr32 ; preg = ROTR32(x, 3)
+ rjmp outro_2
+
+/*******************************************************************************
+* uint32_t bmw_small_s4(uint32_t x){
+* uint32_t r;
+* r = SHR32(x, 1)
+* ^ x;
+* return r;
+* }
+*/
+.global bmw_small_s4
+bmw_small_s4:
+ /* in/out: preg (r25:r22). Clobbers areg (r31:r30, r1:r0) and r20; outro_2
+    does the final xor and clears r1. */
+ movw areg0, preg0 ; areg = x
+ movw areg2, preg2
+ ldi r20, 1
+ rcall shiftr32 ; preg = SHR32(x, 1)
+ rjmp outro_2
+
+/*******************************************************************************
+* uint32_t bmw_small_s5(uint32_t x){
+* uint32_t r;
+* r = SHR32(x, 2)
+* ^ x;
+* return r;
+* }
+*/
+.global bmw_small_s5
+bmw_small_s5:
+ /* in/out: preg (r25:r22). Clobbers areg (r31:r30, r1:r0) and r20; outro_2
+    does the final xor and clears r1. */
+ movw areg0, preg0 ; areg = x
+ movw areg2, preg2
+ ldi r20, 2
+ rcall shiftr32 ; preg = SHR32(x, 2)
+ rjmp outro_2
+
+/*******************************************************************************
+* uint32_t bmw_small_r1(uint32_t x){
+* uint32_t r;
+* r = ROTL32(x, 3);
+* return r;
+* }
+*/
+.global bmw_small_r1
+bmw_small_r1:
+ /* in/out: r25:r22; clobbers r20, r21 (tail call into rotl32) */
+ ldi r20, 3
+ rjmp rotl32
+
+/*******************************************************************************
+* uint32_t bmw_small_r2(uint32_t x){
+* uint32_t r;
+* r = ROTL32(x, 7);
+* return r;
+* }
+*/
+.global bmw_small_r2
+bmw_small_r2:
+ /* in/out: r25:r22; clobbers r20, r21 (tail call into rotl32) */
+ ldi r20, 7
+ rjmp rotl32
+
+/*******************************************************************************
+* uint32_t bmw_small_r3(uint32_t x){
+* uint32_t r;
+* r = ROTL32(x, 13);
+* return r;
+* }
+*/
+.global bmw_small_r3
+bmw_small_r3:
+ /* in/out: r25:r22; clobbers r18, r19, r20, r21 */
+ movw r18, r24 ; swap the 16-bit halves = rotation by 16
+ movw r24, r22
+ movw r22, r18
+ ldi r20, 3
+ rjmp rotr32 ; 16 left, 3 right = ROTL32(x, 13)
+
+
+/*******************************************************************************
+* uint32_t bmw_small_r4(uint32_t x){
+* uint32_t r;
+* r = ROTL32(x, 16);
+* return r;
+* }
+*/
+.global bmw_small_r4
+bmw_small_r4:
+ /* in/out: r25:r22; clobbers r18, r19 */
+ movw r18, r24 ; swap the 16-bit halves = rotation by 16
+ movw r24, r22
+ movw r22, r18
+ ret
+
+/*******************************************************************************
+* uint32_t bmw_small_r5(uint32_t x){
+* uint32_t r;
+* r = ROTR32(x, 13);
+* return r;
+* }
+*/
+.global bmw_small_r5
+bmw_small_r5:
+ /* in/out: r25:r22; clobbers r18, r19, r20, r21 */
+ movw r18, r24 ; swap the 16-bit halves = rotation by 16
+ movw r24, r22
+ movw r22, r18
+ ldi r20, 3
+ rjmp rotl32 ; 16 + 3 to the left = 13 to the right
+
+/*******************************************************************************
+* uint32_t bmw_small_r6(uint32_t x){
+* uint32_t r;
+* r = ROTR32(x, 9);
+* return r;
+* }
+*/
+.global bmw_small_r6
+bmw_small_r6:
+ /* in/out: r25:r22; clobbers r18, r20, r21 */
+ mov r18, r22 ; move every byte one position down = ROTR32(x, 8)
+ mov r22, r23
+ mov r23, r24
+ mov r24, r25
+ mov r25, r18
+ ldi r20, 1
+ rjmp rotr32 ; one more bit to the right = ROTR32(x, 9)
+
+/*******************************************************************************
+* uint32_t bmw_small_r7(uint32_t x){
+* uint32_t r;
+* r = ROTR32(x, 5);
+* return r;
+* }
+*/
+.global bmw_small_r7
+bmw_small_r7:
+ /* in/out: r25:r22; clobbers r20, r21 (tail call into rotr32) */
+ ldi r20, 5
+ rjmp rotr32
+
+/******************************************************************************/
+
+/* 16 additive constants read with lpm by addelement, indexed by j (0..15).
+ * Consecutive entries differ by 0x05555555; entry j equals
+ * (16 + j) * 0x05555555. */
+const_lut:
+ .long 0x55555550, 0x5aaaaaa5, 0x5ffffffa, 0x6555554f
+ .long 0x6aaaaaa4, 0x6ffffff9, 0x7555554e, 0x7aaaaaa3
+ .long 0x7ffffff8, 0x8555554d, 0x8aaaaaa2, 0x8ffffff7
+ .long 0x9555554c, 0x9aaaaaa1, 0x9ffffff6, 0xa555554b
+
+/*******************************************************************************
+* uint32_t addelement(uint8_t j, const uint32_t* m, const uint32_t* h){
+* uint32_t r;
+* r = pgm_read_dword(k_lut+j);
+* r += rotl_addel(((uint32_t*)m)[j&0xf], j+0);
+* r += rotl_addel(((uint32_t*)m)[(j+3)&0xf], j+3);
+* r -= rotl_addel(((uint32_t*)m)[(j+10)&0xf], j+10);
+* r ^= ((uint32_t*)h)[(j+7)&0xf];
+* return r;
+* }
+* param j: r24
+* param m: r22:r23
+* param h: r20:r21
+*/
+/* register aliases local to addelement; acc is split over r15:r14 (low
+ * half) and r9:r8 (high half) */
+j = 16
+acc2 = 8
+acc3 = 9
+h0 = 10
+h1 = 11
+m0 = 12
+m1 = 13
+acc0 = 14
+acc1 = 15
+.global addelement
+addelement:
+ /* Result is returned in r25:r22. Expects r1 = 0. r8..r16 are saved and
+    restored; r20, r21, r26, r27, r30, r31 and the T flag are clobbered
+    (partly through rotl_addel). */
+ push_range 8, 16
+ mov j, r24
+ movw h0, r20
+ movw m0, r22
+ mov r25, r24
+ lsl r25
+ lsl r25 ; r25 = 4*j, byte offset into const_lut
+ ldi r30, lo8(const_lut)
+ ldi r31, hi8(const_lut)
+ add r30, r25
+ adc r31, r1
+ lpm acc0, Z+ ; acc = const_lut[j]
+ lpm acc1, Z+
+ lpm acc2, Z+
+ lpm acc3, Z+
+
+ /* acc += rotl_addel(m[j & 0xf], j) */
+ mov r20, j
+ andi r20, 0x0f
+ lsl r20
+ lsl r20 ; byte offset of the selected word
+ movw r26, m0
+ add r26, r20
+ adc r27, r1
+ ld r22, X+
+ ld r23, X+
+ ld r24, X+
+ ld r25, X+
+ mov r20, j
+ rcall rotl_addel
+ add acc0, r22
+ adc acc1, r23
+ adc acc2, r24
+ adc acc3, r25
+
+ /* acc += rotl_addel(m[(j+3) & 0xf], j+3) */
+ subi j, -3 ; j is now j+3
+ mov r20, j
+ andi r20, 0x0f
+ lsl r20
+ lsl r20
+ movw r26, m0
+ add r26, r20
+ adc r27, r1
+ ld r22, X+
+ ld r23, X+
+ ld r24, X+
+ ld r25, X+
+ mov r20, j
+ rcall rotl_addel
+ add acc0, r22
+ adc acc1, r23
+ adc acc2, r24
+ adc acc3, r25
+
+ /* acc -= rotl_addel(m[(j+10) & 0xf], j+10) */
+ subi j, -7 ; j is now j+10
+ mov r20, j
+ andi r20, 0x0f
+ lsl r20
+ lsl r20
+ movw r26, m0
+ add r26, r20
+ adc r27, r1
+ ld r22, X+
+ ld r23, X+
+ ld r24, X+
+ ld r25, X+
+ mov r20, j
+ rcall rotl_addel
+ sub acc0, r22
+ sbc acc1, r23
+ sbc acc2, r24
+ sbc acc3, r25
+
+ /* result = acc ^ h[(j+7) & 0xf] */
+ subi j, 3 ; j is now j+7
+ mov r20, j
+ andi r20, 0x0f
+ lsl r20
+ lsl r20
+ movw r26, h0
+ add r26, r20
+ adc r27, r1
+ ld r22, X+
+ ld r23, X+
+ ld r24, X+
+ ld r25, X+
+ eor r22, acc0
+ eor r23, acc1
+ eor r24, acc2
+ eor r25, acc3
+ pop_range 8, 16
+ ret
+
+/*******************************************************************************
+* uint32_t bmw_small_expand1(uint8_t j, const void* m, const void* h, const uint32_t* q){
+* uint32_t(*s[])(uint32_t) = {bmw_small_s1, bmw_small_s2, bmw_small_s3, bmw_small_s0};
+* uint32_t r;
+* uint8_t i;
+* r = addelement(j, m, h);
+* i=15;
+* do{
+* r += s[i%4](q[j+i]);
+* }while(i--!=0);
+* return r;
+*
+* param j: r24
+* param m: r22:r23
+* param h: r20:r21
+* param q: r18:r19
+*/
+/* accumulator aliases for bmw_small_expand1 and bmw_small_expand2 */
+acc0 = 2
+acc1 = 3
+acc2 = 4
+acc3 = 5
+.global bmw_small_expand1
+bmw_small_expand1:
+ /* Result is returned in r25:r22. Y, r2..r5 and r16 are saved and restored.
+    Expects r1 = 0. */
+ push_range 28, 29
+ movw r28, r18
+ mov r18, r24
+ lsl r18
+ lsl r18
+ add r28, r18
+ adc r29, r1 ; Y = &q[j]
+ rcall addelement ; j, m, h are still in r24, r23:r22, r21:r20
+ push_range 2, 5
+ push r16
+ ldi r16, 4 ; 4 passes of 4 words = q[j] .. q[j+15]
+ movw acc0, r22 ; acc = addelement(j, m, h)
+ movw acc2, r24
+1:
+ ld r22, Y+
+ ld r23, Y+
+ ld r24, Y+
+ ld r25, Y+
+ rcall bmw_small_s1
+ add acc0, r22
+ adc acc1, r23
+ adc acc2, r24
+ adc acc3, r25
+ ld r22, Y+
+ ld r23, Y+
+ ld r24, Y+
+ ld r25, Y+
+ rcall bmw_small_s2
+ add acc0, r22
+ adc acc1, r23
+ adc acc2, r24
+ adc acc3, r25
+ ld r22, Y+
+ ld r23, Y+
+ ld r24, Y+
+ ld r25, Y+
+ rcall bmw_small_s3
+ add acc0, r22
+ adc acc1, r23
+ adc acc2, r24
+ adc acc3, r25
+ ld r22, Y+
+ ld r23, Y+
+ ld r24, Y+
+ ld r25, Y+
+ rcall bmw_small_s0
+ add acc0, r22
+ adc acc1, r23
+ adc acc2, r24
+ adc acc3, r25
+ dec r16
+ brne 1b
+expand1_exit: ; common epilogue, bmw_small_expand2 jumps here as well
+ movw r22, acc0
+ movw r24, acc2
+ pop r16
+ pop_range 2, 5
+ pop_range 28, 29
+ ret
+
+/*******************************************************************************
+* uint32_t bmw_small_expand2(uint8_t j, const void* m, const void* h, const uint32_t* q){
+* uint32_t(*rf[])(uint32_t) = {bmw_small_r1, bmw_small_r2, bmw_small_r3,
+* bmw_small_r4, bmw_small_r5, bmw_small_r6,
+* bmw_small_r7};
+* uint32_t r;
+* uint8_t i;
+* r = addelement(j, m, h);
+* for(i=0; i<14; i+=2){
+* r += q[j+i];
+* }
+* for(i=0; i<14; i+=2){
+* r += rf[i/2](q[j+i+1]);
+* }
+* r += bmw_small_s4(q[j+14]);
+* r += bmw_small_s5(q[j+15]);
+* return r;
+* }
+*/
+/* One entry per word q[j+i], i = 0..15, called by bmw_small_expand2 through
+ * icall with the word in r25:r22. A bare ret leaves the word unchanged, the
+ * rjmp entries tail-call the matching transform. */
+expand2_jumptable:
+ ret
+ rjmp bmw_small_r1
+ ret
+ rjmp bmw_small_r2
+ ret
+ rjmp bmw_small_r3
+ ret
+ rjmp bmw_small_r4
+ ret
+ rjmp bmw_small_r5
+ ret
+ rjmp bmw_small_r6
+ ret
+ rjmp bmw_small_r7
+ rjmp bmw_small_s4
+ rjmp bmw_small_s5
+
+.global bmw_small_expand2
+bmw_small_expand2:
+ /* param j: r24, m: r23:r22, h: r21:r20, q: r19:r18; result in r25:r22.
+    Uses the acc0..acc3 aliases (r2..r5) and the epilogue of
+    bmw_small_expand1. Expects r1 = 0. */
+ push_range 28, 29
+ movw r28, r18
+ mov r18, r24
+ lsl r18
+ lsl r18
+ add r28, r18
+ adc r29, r1 ; Y = &q[j]
+ rcall addelement ; j, m, h are still in r24, r23:r22, r21:r20
+ push_range 2, 5
+ push r16
+ ldi r16, 16 ; one pass per word q[j] .. q[j+15]
+ movw acc0, r22 ; acc = addelement(j, m, h)
+ movw acc2, r24
+ ldi r30, pm_lo8(expand2_jumptable) ; Z = word address of the first entry
+ ldi r31, pm_hi8(expand2_jumptable)
+1:
+ ld r22, Y+
+ ld r23, Y+
+ ld r24, Y+
+ ld r25, Y+
+ push r30 ; the called transform may clobber Z
+ push r31
+ icall
+ pop r31
+ pop r30
+ adiw r30, 1 ; next table entry
+ add acc0, r22
+ adc acc1, r23
+ adc acc2, r24
+ adc acc3, r25
+ dec r16
+ brne 1b
+ rjmp expand1_exit
+
+/*******************************************************************************
+* void bmw_small_f1(uint32_t* q, const void* m, const void* h){
+* uint8_t i;
+* q[16] = bmw_small_expand1(0, m, h, q);
+* q[17] = bmw_small_expand1(1, m, h, q);
+* for(i=2; i<16; ++i){
+* q[16+i] = bmw_small_expand2(i, m, h, q);
+* }
+* }
+*/
+/* register aliases local to bmw_small_f1 (the pointers live in r2..r7) */
+m0 = 2
+m1 = 3
+h0 = 4
+h1 = 5
+q0 = 6
+q1 = 7
+.global bmw_small_f1
+bmw_small_f1:
+ /* param q: r25:r24, m: r23:r22, h: r21:r20.
+    Writes q[16] .. q[31]; r2..r7, Y and r16 are saved and restored. */
+ push_range 2, 7
+ push_range 28, 29
+ push r16
+ movw h0, r20
+ movw m0, r22
+ movw q0, r24
+ movw r28, q0
+ adiw r28, 63
+ adiw r28, 1 ; Y = &q[16], the output cursor
+ /* q[16] = bmw_small_expand1(0, m, h, q); m and h are still in place */
+ movw r18, q0
+ clr r24
+ clr r25 /* not required */
+ rcall bmw_small_expand1
+ st Y+, r22
+ st Y+, r23
+ st Y+, r24
+ st Y+, r25
+ /* q[17] = bmw_small_expand1(1, m, h, q) */
+ ldi r16, 1
+ movw r18, q0
+ movw r20, h0
+ movw r22, m0
+ mov r24, r16
+ clr r25 /* not required */
+ rcall bmw_small_expand1
+ st Y+, r22
+ st Y+, r23
+ st Y+, r24
+ st Y+, r25
+ inc r16
+ /* q[16+i] = bmw_small_expand2(i, m, h, q) for i = r16 = 2..15 */
+.Lf1_expand2_pass:
+ movw r18, q0
+ movw r20, h0
+ movw r22, m0
+ mov r24, r16
+ rcall bmw_small_expand2
+ st Y+, r22
+ st Y+, r23
+ st Y+, r24
+ st Y+, r25
+ inc r16
+ cpi r16, 16
+ brne .Lf1_expand2_pass
+ pop r16
+ pop_range 28, 29
+ pop_range 2, 7
+ ret
+
+/*******************************************************************************
+* uint16_t hack_table[5] PROGMEM = { 0x0311, 0xDDB3, 0x2A79, 0x07AA, 0x51C2 };
+* uint8_t offset_table[5] PROGMEM = { 4+16, 6+16, 9+16, 12+16, 13+16 };
+*
+* void bmw_small_f0(uint32_t* h, const void* m, uint32_t* q){
+* uint16_t hack_reg;
+* uint8_t c,i,j;
+* uint32_t(*s[])(uint32_t)={ bmw_small_s0, bmw_small_s1, bmw_small_s2,
+* bmw_small_s3, bmw_small_s4 };
+* for(i=0; i<16; ++i){
+* ((uint32_t*)h)[i] ^= ((uint32_t*)m)[i];
+* }
+* dump_x(h, 16, 'T');
+* memset(q, 0, 4*16);
+* c=4;
+* do{
+* i=15;
+* j=pgm_read_byte(offset_table+c);
+* hack_reg=pgm_read_word(&(hack_table[c]));
+* do{
+* if(hack_reg&1){
+* q[i]-= h[j&15];
+* }else{
+* q[i]+= h[j&15];
+* }
+* --j;
+* hack_reg>>= 1;
+* }while(i--!=0);
+* }while(c--!=0);
+* dump_x(q, 16, 'W');
+* for(i=0; i<16; ++i){
+* q[i] = s[i%5](q[i]);
+* }
+* for(i=0; i<16; ++i){
+* ((uint32_t*)h)[i] ^= ((uint32_t*)m)[i];
+* }
+* for(i=0; i<16; ++i){
+* q[i] += h[(i+1)&0xf];
+* }
+* }
+*
+* param h: r24:r25
+* param m: r22:r23
+* param q: r20:r21
+*/
+/* Register aliases for f0_helper and bmw_small_f0. h0/m0/q0 name the low
+ * byte of the incoming pointer arguments; acc and bcc are 32-bit scratch
+ * groups in r7:r4 and r11:r8. */
+h0 = 24
+h1 = 25
+m0 = 22
+m1 = 23
+q0 = 20
+q1 = 21
+acc0 = 4
+acc1 = 5
+acc2 = 6
+acc3 = 7
+bcc0 = 8
+bcc1 = 9
+bcc2 = 10
+bcc3 = 11
+hack = 16 /* the code below addresses this sign-bit word as r17:r16 directly */
+
+/*
+ * f0_helper: for r18 consecutive 32-bit words does *Z += *X or *Z -= *X.
+ *   Z       : current destination word, updated in place, advanced by 4
+ *   X       : current source word, advanced by 4
+ *   r17:r16 : sign bits, consumed from bit 0 upwards; 1 = subtract, 0 = add
+ *   r18     : word count, must be non-zero (0 on return)
+ * Clobbers acc0..acc3 (r4..r7) and bcc0..bcc3 (r8..r11).
+ */
+f0_helper:
+20:
+ ldd acc0, Z+0
+ ldd acc1, Z+1
+ ldd acc2, Z+2
+ ldd acc3, Z+3
+ ld bcc0, X+
+ ld bcc1, X+
+ ld bcc2, X+
+ ld bcc3, X+
+ lsr r17 ; next sign bit ends up in carry
+ ror r16
+ brcs l20_sub
+ add acc0, bcc0
+ adc acc1, bcc1
+ adc acc2, bcc2
+ adc acc3, bcc3
+ rjmp l20_post
+l20_sub:
+ sub acc0, bcc0
+ sbc acc1, bcc1
+ sbc acc2, bcc2
+ sbc acc3, bcc3
+l20_post:
+ st Z+, acc0
+ st Z+, acc1
+ st Z+, acc2
+ st Z+, acc3
+ dec r18
+ brne 20b
+ ret
+
+/* One entry per word q[i], i = 0..15, called by bmw_small_f0 through icall;
+ * entry i jumps to s[i % 5]. */
+f0_jumptable:
+ rjmp bmw_small_s0
+ rjmp bmw_small_s1
+ rjmp bmw_small_s2
+ rjmp bmw_small_s3
+ rjmp bmw_small_s4
+ rjmp bmw_small_s0
+ rjmp bmw_small_s1
+ rjmp bmw_small_s2
+ rjmp bmw_small_s3
+ rjmp bmw_small_s4
+ rjmp bmw_small_s0
+ rjmp bmw_small_s1
+ rjmp bmw_small_s2
+ rjmp bmw_small_s3
+ rjmp bmw_small_s4
+ rjmp bmw_small_s0
+
+.global bmw_small_f0
+bmw_small_f0:
+ /* Y, r4..r11 and r16..r17 are saved and restored. Expects r1 = 0.
+    Each of the five groups below walks q[0..15] upwards while the h index
+    starts at a group specific offset and wraps once; that is why every
+    group calls f0_helper twice. The sign words in r17:r16 are consumed from
+    bit 0 upwards, so they are the bit-reversed values of hack_table in the
+    C reference above, which walks q downwards. */
+ push_range 28, 29
+ push_range 4, 11
+ push_range 16, 17
+ /* h[i] ^= m[i]; q[i]= 0 */
+ movw r26, h0 ; h
+ movw r30, m0 ; m
+ movw r28, q0 ; q
+ ldi r18, 64 ; byte count of 16 words
+1: ld r0, X
+ ld r19, Z+
+ eor r0, r19
+ st X+, r0
+ st Y+, r1 ; r1 = 0
+ dec r18
+ brne 1b
+;------ q[i] +-= h[(i+5) & 15]
+ ldi r17, 0x88
+ ldi r16, 0xC0
+ movw r26, h0 ; X = h
+ adiw r26, 5*4
+ ldi r18, 16-5
+ movw r30, q0 ; Z = q
+ rcall f0_helper
+ movw r26, h0 ; X = h
+ ldi r18, 5
+ rcall f0_helper
+;--- q[i] +-= h[(i+7) & 15]
+ ldi r17, 0xCD
+ ldi r16, 0xBB
+ movw r26, h0 ; X = h
+ adiw r26, 7*4
+ ldi r18, 16-7
+ movw r30, q0 ; Z = q
+ rcall f0_helper
+ movw r26, h0 ; X = h
+ ldi r18, 7
+ rcall f0_helper
+;--- q[i] +-= h[(i+10) & 15]
+ ldi r17, 0x9E
+ ldi r16, 0x54
+ movw r26, h0 ; X = h
+ adiw r26, 10*4
+ ldi r18, 16-10
+ movw r30, q0 ; Z = q
+ rcall f0_helper
+ movw r26, h0 ; X = h
+ ldi r18, 10
+ rcall f0_helper
+;--- q[i] +-= h[(i+13) & 15]
+ ldi r17, 0x55
+ ldi r16, 0xE0
+ movw r26, h0 ; X = h
+ adiw r26, 13*4
+ ldi r18, 16-13
+ movw r30, q0 ; Z = q
+ rcall f0_helper
+ movw r26, h0 ; X = h
+ ldi r18, 13
+ rcall f0_helper
+;--- q[i] +-= h[(i+14) & 15]
+ ldi r17, 0x43
+ ldi r16, 0x8A
+ movw r26, h0 ; X = h
+ adiw r26, 14*4
+ ldi r18, 16-14
+ movw r30, q0 ; Z = q
+ rcall f0_helper
+ movw r26, h0 ; X = h
+ ldi r18, 14
+ rcall f0_helper
+;--------------- h[i] ^= m[i]
+ movw r26, h0 ; h
+ movw r30, m0 ; m
+ ldi r18, 64
+25: ld r0, X
+ ld r19, Z+
+ eor r0, r19
+ st X+, r0
+ dec r18
+ brne 25b
+;--------------- q[i] = s[i%5](q[i])
+ ldi r16, 16
+ ldi r30, pm_lo8(f0_jumptable)
+ ldi r31, pm_hi8(f0_jumptable)
+ /* the s functions overwrite r18..r27, r30 and r31, so the table cursor
+    and the h and q pointers are parked in bcc0/1, bcc2/3 and acc0/1 */
+ movw bcc0, r30
+ movw bcc2, h0 ; h
+ movw acc0, q0 ; q
+ movw r28, q0 ; Y = q
+30:
+ ldd r22, Y+0
+ ldd r23, Y+1
+ ldd r24, Y+2
+ ldd r25, Y+3
+ icall
+ st Y+, r22
+ st Y+, r23
+ st Y+, r24
+ st Y+, r25
+ movw r30, bcc0
+ adiw r30, 1 ; next table entry
+ movw bcc0, r30
+ dec r16
+ brne 30b
+;--------------- q[i] += h[(i+1)%16]
+ movw r30, acc0 ; q
+ movw r26, bcc2 ; h
+ adiw r26, 4 ; X = &h[1]
+ ldi r18, 15 ; q[0..14] here, q[15] += h[0] is done after the loop
+40:
+ /* acc0/acc1 serve as plain byte temporaries from here on; ld, st and dec
+    leave the carry alone, so the add/adc chain spans the four bytes */
+ ld acc0, Z
+ ld acc1, X+
+ add acc0, acc1
+ st Z+, acc0
+ ld acc0, Z
+ ld acc1, X+
+ adc acc0, acc1
+ st Z+, acc0
+ ld acc0, Z
+ ld acc1, X+
+ adc acc0, acc1
+ st Z+, acc0
+ ld acc0, Z
+ ld acc1, X+
+ adc acc0, acc1
+ st Z+, acc0
+ dec r18
+ brne 40b
+ movw r26, bcc2 ; h, wrap around: q[15] += h[0]
+ ld acc0, Z
+ ld acc1, X+
+ add acc0, acc1
+ st Z+, acc0
+ ld acc0, Z
+ ld acc1, X+
+ adc acc0, acc1
+ st Z+, acc0
+ ld acc0, Z
+ ld acc1, X+
+ adc acc0, acc1
+ st Z+, acc0
+ ld acc0, Z
+ ld acc1, X+
+ adc acc0, acc1
+ st Z+, acc0
+
+ pop_range 16, 17
+ pop_range 4, 11
+ pop_range 28, 29
+ ret
+
+/*******************************************************************************
+* void bmw_small_f2(uint32_t* h, const uint32_t* q, const void* m){
+* uint32_t xl=0, xh;
+* uint8_t i;
+* for(i=16;i<24;++i){
+* xl ^= q[i];
+* }
+* xh = xl;
+* for(i=24;i<32;++i){
+* xh ^= q[i];
+* }
+* memcpy(h, m, 16*4);
+* h[0] ^= SHL32(xh, 5) ^ SHR32(q[16], 5);
+* h[5] ^= SHL32(xh, 6) ^ SHR32(q[21], 6);
+* h[3] ^= SHR32(xh, 1) ^ SHL32(q[19], 5);
+* h[4] ^= SHR32(xh, 3) ^ q[20];
+* h[6] ^= SHR32(xh, 4) ^ SHL32(q[22], 6);
+* h[2] ^= SHR32(xh, 5) ^ SHL32(q[18], 5);
+* h[1] ^= SHR32(xh, 7) ^ SHL32(q[17], 8);
+* h[7] ^= SHR32(xh,11) ^ SHL32(q[23], 2);
+* for(i=0; i<8; ++i){
+* h[i] += xl ^ q[24+i] ^ q[i];
+* }
+* for(i=0; i<8; ++i){
+* h[8+i] ^= xh ^ q[24+i];
+* h[8+i] += ROTL32(h[(4+i)%8],i+9);
+* }
+* h[11] += SHL32(xl, 4) ^ q[18] ^ q[11];
+* h[10] += SHL32(xl, 6) ^ q[17] ^ q[10];
+* h[ 8] += SHL32(xl, 8) ^ q[23] ^ q[ 8];
+* h[15] += SHR32(xl, 2) ^ q[22] ^ q[15];
+* h[12] += SHR32(xl, 3) ^ q[19] ^ q[12];
+* h[13] += SHR32(xl, 4) ^ q[20] ^ q[13];
+* h[ 9] += SHR32(xl, 6) ^ q[16] ^ q[ 9];
+* h[14] += SHR32(xl, 7) ^ q[21] ^ q[14];
+* }
+*
+* param h: r24:r25
+* param q: r22:r23
+* param m: r20:r21
+*/
+/* Register aliases for modify_h_2, tshiftr, tshiftl and bmw_small_f2:
+ * xl = r5:r2, xh = r9:r6, q pointer = r11:r10, h pointer = r13:r12,
+ * t = r17:r14 (32-bit temporary) */
+xl0 = 2
+xl1 = 3
+xl2 = 4
+xl3 = 5
+xh0 = 6
+xh1 = 7
+xh2 = 8
+xh3 = 9
+q0 = 10
+q1 = 11
+h0 = 12
+h1 = 13
+t0 = 14
+t1 = 15
+t2 = 16
+t3 = 17
+
+
+/* modify_h_2 addr: (32-bit word at Z + 4*addr) += (word at Y + 4*addr) ^ t.
+ * Clobbers r0 and r25:r22; Y, Z and t are left unchanged. The add/adc chain
+ * relies on ldd/std not touching the carry flag. */
+.macro modify_h_2 addr:req
+ ldd r22, Y+\addr*4+0
+ ldd r23, Y+\addr*4+1
+ ldd r24, Y+\addr*4+2
+ ldd r25, Y+\addr*4+3
+ eor r22, t0
+ eor r23, t1
+ eor r24, t2
+ eor r25, t3
+ ldd r0, Z+\addr*4+0
+ add r0, r22
+ std Z+\addr*4+0, r0
+ ldd r0, Z+\addr*4+1
+ adc r0, r23
+ std Z+\addr*4+1, r0
+ ldd r0, Z+\addr*4+2
+ adc r0, r24
+ std Z+\addr*4+2, r0
+ ldd r0, Z+\addr*4+3
+ adc r0, r25
+ std Z+\addr*4+3, r0
+.endm
+
+/* tshiftr: logical right shift of t (r17:r14) by r20 bits; r20 must be
+ * non-zero and is 0 on return */
+tshiftr:
+ lsr t3
+ ror t2
+ ror t1
+ ror t0
+ dec r20
+ brne tshiftr
+ ret
+
+/* tshiftl: left shift of t (r17:r14) by r20 bits; r20 must be non-zero and
+ * is 0 on return */
+tshiftl:
+ lsl t0
+ rol t1
+ rol t2
+ rol t3
+ dec r20
+ brne tshiftl
+ ret
+
+.global bmw_small_f2
+bmw_small_f2:
+ /* param h: r25:r24, q: r23:r22, m: r21:r20.
+    Y and r2..r17 are saved and restored. Expects r1 = 0.
+    NOTE: q is not left untouched; q[9] .. q[15] are overwritten with
+    q[9+k] ^ q[16+k] (see the memxor loop further down). */
+ /* memcpy(h, m, 64) */
+ movw r26, r24
+ movw r30, r20
+ ldi r18, 64
+1: ld r0, Z+
+ st X+, r0
+ dec r18
+ brne 1b
+ push_range 28, 29
+ push_range 2, 17
+ movw q0, r22
+ movw h0, r24
+ /* calc xl */
+/* for(i=16;i<24;++i){
+ xl ^= q[i];
+ }
+*/
+ movw r26, q0
+ adiw r26, 63
+ adiw r26, 1 ; X points at q[16]
+ ld xl0, X+ ; xl = q[16]
+ ld xl1, X+
+ ld xl2, X+
+ ld xl3, X+
+ ldi r18, 8-1 ; xor in q[17] .. q[23]
+20: ld r0, X+
+ eor xl0, r0
+ ld r0, X+
+ eor xl1, r0
+ ld r0, X+
+ eor xl2, r0
+ ld r0, X+
+ eor xl3, r0
+ dec r18
+ brne 20b
+ /* calc xh */
+/* xh = xl
+ for(i=24;i<32;++i){
+ xh ^= q[i];
+ }
+*/
+ movw xh0, xl0
+ movw xh2, xl2
+ ldi r18, 8
+25: ld r0, X+
+ eor xh0, r0
+ ld r0, X+
+ eor xh1, r0
+ ld r0, X+
+ eor xh2, r0
+ ld r0, X+
+ eor xh3, r0
+ dec r18
+ brne 25b
+/* h[0]..h[7] */
+ /* t carries a shifted copy of xh from one step to the next, so the steps
+    are ordered by shift amount and only the difference is shifted each time.
+    With Y at q[15], q[n] is found at Y + 4*(n-15). */
+ movw r30, h0
+ movw r28, q0
+ adiw r28, 60 ; Y points at q[15]
+/* h[0] ^= SHL32(xh, 5) ^ SHR32(q[16], 5); */
+ movw t0, xh0
+ movw t2, xh2
+ ldi r20, 5
+ rcall tshiftl
+ ldd r22, Y+4
+ ldd r23, Y+5
+ ldd r24, Y+6
+ ldd r25, Y+7
+ ldi r20, 5
+ rcall shiftr32
+ eor r22, t0
+ eor r23, t1
+ eor r24, t2
+ eor r25, t3
+ ldd r0, Z+0
+ eor r22, r0
+ ldd r0, Z+1
+ eor r23, r0
+ ldd r0, Z+2
+ eor r24, r0
+ ldd r0, Z+3
+ eor r25, r0
+ std Z+0, r22
+ std Z+1, r23
+ std Z+2, r24
+ std Z+3, r25
+/* h[5] ^= SHL32(xh, 6) ^ SHR32(q[21], 6); */
+ lsl t0 ; t = xh << 6 (one more bit than before)
+ rol t1
+ rol t2
+ rol t3
+ ldd r22, Y+24
+ ldd r23, Y+25
+ ldd r24, Y+26
+ ldd r25, Y+27
+ ldi r20, 6
+ rcall shiftr32
+ eor r22, t0
+ eor r23, t1
+ eor r24, t2
+ eor r25, t3
+ ldd r0, Z+20
+ eor r22, r0
+ ldd r0, Z+21
+ eor r23, r0
+ ldd r0, Z+22
+ eor r24, r0
+ ldd r0, Z+23
+ eor r25, r0
+ std Z+20, r22
+ std Z+21, r23
+ std Z+22, r24
+ std Z+23, r25
+/* h[3] ^= SHR32(xh, 1) ^ SHL32(q[19], 5); */
+ movw t0, xh0
+ movw t2, xh2
+ lsr t3 ; t = xh >> 1
+ ror t2
+ ror t1
+ ror t0
+ ldd r22, Y+16
+ ldd r23, Y+17
+ ldd r24, Y+18
+ ldd r25, Y+19
+ ldi r20, 5
+ rcall shiftl32
+ eor r22, t0
+ eor r23, t1
+ eor r24, t2
+ eor r25, t3
+ ldd r0, Z+12
+ eor r22, r0
+ ldd r0, Z+13
+ eor r23, r0
+ ldd r0, Z+14
+ eor r24, r0
+ ldd r0, Z+15
+ eor r25, r0
+ std Z+12, r22
+ std Z+13, r23
+ std Z+14, r24
+ std Z+15, r25
+/* h[4] ^= SHR32(xh, 3) ^ q[20]; */
+ ldi r20, 2 ; t = xh >> 3
+ rcall tshiftr
+ ldd r22, Y+20
+ ldd r23, Y+21
+ ldd r24, Y+22
+ ldd r25, Y+23
+ eor r22, t0
+ eor r23, t1
+ eor r24, t2
+ eor r25, t3
+ ldd r0, Z+16
+ eor r22, r0
+ ldd r0, Z+17
+ eor r23, r0
+ ldd r0, Z+18
+ eor r24, r0
+ ldd r0, Z+19
+ eor r25, r0
+ std Z+16, r22
+ std Z+17, r23
+ std Z+18, r24
+ std Z+19, r25
+/* h[6] ^= SHR32(xh, 4) ^ SHL32(q[22], 6); */
+ lsr t3 ; t = xh >> 4
+ ror t2
+ ror t1
+ ror t0
+ ldd r22, Y+28
+ ldd r23, Y+29
+ ldd r24, Y+30
+ ldd r25, Y+31
+ ldi r20, 6
+ rcall shiftl32
+ eor r22, t0
+ eor r23, t1
+ eor r24, t2
+ eor r25, t3
+ ldd r0, Z+24
+ eor r22, r0
+ ldd r0, Z+25
+ eor r23, r0
+ ldd r0, Z+26
+ eor r24, r0
+ ldd r0, Z+27
+ eor r25, r0
+ std Z+24, r22
+ std Z+25, r23
+ std Z+26, r24
+ std Z+27, r25
+/* h[2] ^= SHR32(xh, 5) ^ SHL32(q[18], 5); */
+ lsr t3 ; t = xh >> 5
+ ror t2
+ ror t1
+ ror t0
+ ldd r22, Y+12
+ ldd r23, Y+13
+ ldd r24, Y+14
+ ldd r25, Y+15
+ ldi r20, 5
+ rcall shiftl32
+ eor r22, t0
+ eor r23, t1
+ eor r24, t2
+ eor r25, t3
+ ldd r0, Z+8
+ eor r22, r0
+ ldd r0, Z+9
+ eor r23, r0
+ ldd r0, Z+10
+ eor r24, r0
+ ldd r0, Z+11
+ eor r25, r0
+ std Z+8 , r22
+ std Z+9 , r23
+ std Z+10, r24
+ std Z+11, r25
+/* h[1] ^= SHR32(xh, 7) ^ SHL32(q[17], 8); */
+ ldi r20, 2 ; t = xh >> 7
+ rcall tshiftr
+ /* the shift by 8 is done by loading bytes 0..2 of q[17] one register
+    higher; the low result byte is 0 ^ t0 = t0 */
+ ldd r23, Y+8
+ ldd r24, Y+9
+ ldd r25, Y+10
+ mov r22, t0
+ eor r23, t1
+ eor r24, t2
+ eor r25, t3
+ ldd r0, Z+4
+ eor r22, r0
+ ldd r0, Z+5
+ eor r23, r0
+ ldd r0, Z+6
+ eor r24, r0
+ ldd r0, Z+7
+ eor r25, r0
+ std Z+4 , r22
+ std Z+5 , r23
+ std Z+6 , r24
+ std Z+7 , r25
+/* h[7] ^= SHR32(xh,11) ^ SHL32(q[23], 2); */
+ ldi r20, 4 ; t = xh >> 11
+ rcall tshiftr
+ ldd r22, Y+32
+ ldd r23, Y+33
+ ldd r24, Y+34
+ ldd r25, Y+35
+ ldi r20, 2
+ rcall shiftl32
+ eor r22, t0
+ eor r23, t1
+ eor r24, t2
+ eor r25, t3
+ ldd r0, Z+28
+ eor r22, r0
+ ldd r0, Z+29
+ eor r23, r0
+ ldd r0, Z+30
+ eor r24, r0
+ ldd r0, Z+31
+ eor r25, r0
+ std Z+28, r22
+ std Z+29, r23
+ std Z+30, r24
+ std Z+31, r25
+/* for(i=0; i<8; ++i){
+* h[i] += xl ^ q[24+i] ^ q[i];
+* }
+*/
+ movw r26, q0 ; X points at q[0]
+ movw r28, q0
+ adiw r28, 63
+ adiw r28, 24*4-63 ; Y points at q[24]
+ ldi r18, 8
+10:
+ movw t0, xl0
+ movw t2, xl2
+ ld r0, X+
+ eor t0, r0
+ ld r0, X+
+ eor t1, r0
+ ld r0, X+
+ eor t2, r0
+ ld r0, X+
+ eor t3, r0
+ ld r0, Y+
+ eor t0, r0
+ ld r0, Y+
+ eor t1, r0
+ ld r0, Y+
+ eor t2, r0
+ ld r0, Y+
+ eor t3, r0
+ ldd r22, Z+0
+ ldd r23, Z+1
+ ldd r24, Z+2
+ ldd r25, Z+3
+ add r22, t0
+ adc r23, t1
+ adc r24, t2
+ adc r25, t3
+ st Z+, r22
+ st Z+, r23
+ st Z+, r24
+ st Z+, r25
+ dec r18
+ brne 10b
+ ; Z points to h[8]
+/* for(i=0; i<8; ++i){
+ h[8+i] ^= xh ^ q[24+i];
+ h[8+i] += ROTL32(h[(4+i)%8],i+9);
+ }
+*/
+ ; Z points at h[8]
+; clr r18 (not needed: r18 was counted down to 0 by the loop above)
+ sbiw r28, 8*4 ; Y points at q[24]
+ movw r26, r30
+ sbiw r26, 4*4 ; X points at h[4]
+15:
+ ldd t0, Z+0
+ ldd t1, Z+1
+ ldd t2, Z+2
+ ldd t3, Z+3
+ eor t0, xh0
+ eor t1, xh1
+ eor t2, xh2
+ eor t3, xh3
+ ld r0, Y+
+ eor t0, r0
+ ld r0, Y+
+ eor t1, r0
+ ld r0, Y+
+ eor t2, r0
+ ld r0, Y+
+ eor t3, r0
+ ld r22, X+
+ ld r23, X+
+ ld r24, X+
+ ld r25, X+
+ mov r20, r18 ; r18 = i, rotl32p9 rotates left by i+9
+ rcall rotl32p9
+ add t0, r22
+ adc t1, r23
+ adc t2, r24
+ adc t3, r25
+ st Z+, t0
+ st Z+, t1
+ st Z+, t2
+ st Z+, t3
+ inc r18
+ cpi r18, 4
+ brne 16f
+ movw r26, h0 ; after h[7] the source index wraps to h[0]
+16:
+ sbrs r18, 3 ; leave once r18 has reached 8
+ rjmp 15b
+ sbiw r30, 4*8 ; adjust Z to point at h[8]
+ sbiw r28, 16*4-1
+ sbiw r28, 1 ; adjust Y to point at q[16]
+ movw r26, r28
+ sbiw r26, 7*4 ; adjust X to point at q[9]
+ ldi r18, 7*4
+20: /* now we do the memxor stuff: q[9+k] ^= q[16+k] for k = 0..6, done
+ bytewise; this overwrites q[9] .. q[15] */
+ ld t0, X
+ ld t1, Y+
+ eor t0, t1
+ st X+, t0
+ dec r18
+ brne 20b
+ ; X points at q[16]
+ ; Y points at q[23]
+ sbiw r26, 4*8 ; X points at q[8]
+
+ clr t0 ; t = xl << 8, built by moving bytes
+ mov t1, xl0
+ mov t2, xl1
+ mov t3, xl2
+/* h[ 8] += SHL32(xl, 8) ^ q[23] ^ q[ 8]; */
+ ld r22, X+
+ ld r23, X+
+ ld r24, X+
+ ld r25, X+
+ ld r0, Y+
+ eor r22, r0
+ ld r0, Y+
+ eor r23, r0
+ ld r0, Y+
+ eor r24, r0
+ ld r0, Y+
+ eor r25, r0
+ eor r22, t0
+ eor r23, t1
+ eor r24, t2
+ eor r25, t3
+ ld r0, Z
+ add r0, r22
+ st Z+, r0
+ ld r0, Z
+ adc r0, r23
+ st Z+, r0
+ ld r0, Z
+ adc r0, r24
+ st Z+, r0
+ ld r0, Z
+ adc r0, r25
+ st Z+, r0
+ movw r28, r26
+ ; Z points at h[9]
+ ; X points at q[9] but we will not need it anymore
+ ; Y points at q[9]
+ /* In the steps below the second q term of the reference formula is
+    already folded into q[9..15] by the memxor loop, so only one q word is
+    read. modify_h_2 k works on h[9+k] and q[9+k]. */
+/* h[11] += SHL32(xl, 4) ^ q[11]; (q[11] already holds q[11] ^ q[18]) */
+ movw t0, xl0
+ movw t2, xl2
+ ldi r20, 4
+ rcall tshiftl
+ modify_h_2 2
+/* h[10] += SHL32(xl, 6) ^ q[10]; (q[10] already holds q[10] ^ q[17]) */
+ ldi r20, 2
+ rcall tshiftl
+ modify_h_2 1
+/* h[15] += SHR32(xl, 2) ^ q[15]; (q[15] already holds q[15] ^ q[22]) */
+ movw t0, xl0
+ movw t2, xl2
+ ldi r20, 2
+ rcall tshiftr
+ modify_h_2 6
+/* h[12] += SHR32(xl, 3) ^ q[12]; (q[12] already holds q[12] ^ q[19]) */
+ ldi r20, 1
+ rcall tshiftr
+ modify_h_2 3
+/* h[13] += SHR32(xl, 4) ^ q[13]; (q[13] already holds q[13] ^ q[20]) */
+ ldi r20, 1
+ rcall tshiftr
+ modify_h_2 4
+/* h[ 9] += SHR32(xl, 6) ^ q[ 9]; (q[9] already holds q[9] ^ q[16]) */
+ ldi r20, 2
+ rcall tshiftr
+ modify_h_2 0
+/* h[14] += SHR32(xl, 7) ^ q[14]; (q[14] already holds q[14] ^ q[21]) */
+ ldi r20, 1
+ rcall tshiftr
+ modify_h_2 5
+bmw_small_f2_exit:
+ pop_range 2, 17
+ pop_range 28, 29
+ ret
+
+/*
+ * cli_putb: debug helper, prints the byte in r24 as two upper-case hex
+ * digits (high nibble first) through cli_putc, which is defined elsewhere.
+ *
+ * r2, r18..r27, r30 and r31 are saved and restored, so X and Z survive the
+ * two calls as a whole. The previous version saved only r18..r26 and left
+ * r27, the high byte of X, unprotected.
+ * Expects r1 = 0 (used as carry-only addend for the table lookup).
+ * NOTE(review): r0, the T flag and SREG are not saved here; whether cli_putc
+ * changes them cannot be seen from this file.
+ */
+cli_putb:
+ push r2
+ push_range 18, 27
+ push_range 30, 31
+ mov r2, r24 ; keep the byte, r24 is reused as argument register
+ swap r24
+ andi r24, 0xf ; high nibble
+ ldi r30, lo8(hextable)
+ ldi r31, hi8(hextable)
+ add r30, r24
+ adc r31, r1
+ lpm r24, Z
+ clr r25
+ call cli_putc
+ mov r24, r2
+ andi r24, 0xf ; low nibble
+ ldi r30, lo8(hextable) ; Z has to be rebuilt, the call may have changed it
+ ldi r31, hi8(hextable)
+ add r30, r24
+ adc r31, r1
+ lpm r24, Z
+ clr r25
+ call cli_putc
+ pop_range 30, 31
+ pop_range 18, 27
+ pop r2
+ ret
+/* nibble value (0..15) to ASCII hex digit, read with lpm by cli_putb */
+hextable:
+ .byte '0', '1', '2', '3', '4', '5', '6', '7'
+ .byte '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
+
+/* cli_putchar: calls cli_putc (defined elsewhere) with whatever arguments
+ * are currently in the registers, with r18..r31 saved around the call so
+ * that they hold their old values afterwards (a value cli_putc may return
+ * in that range is therefore discarded). */
+cli_putchar:
+ push_range 18, 31
+ call cli_putc
+ pop_range 18, 31
+ ret
--- /dev/null
+/* bmw_small.c */
+/*
+ This file is part of the AVR-Crypto-Lib.
+ Copyright (C) 2009 Daniel Otte (daniel.otte@rub.de)
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ * \file bmw_small.c
+ * \author Daniel Otte
+ * \email daniel.otte@rub.de
+ * \date 2009-04-27
+ * \license GPLv3 or later
+ *
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <avr/pgmspace.h>
+#include "bmw_small.h"
+
+
+/* 32-bit shift and rotate helpers. The rotate macros evaluate both arguments
+ * twice and shift by (32-(n)), so n has to stay within 1..31. */
+#define SHL32(a,n) ((a)<<(n))
+#define SHR32(a,n) ((a)>>(n))
+#define ROTL32(a,n) (((a)<<(n))|((a)>>(32-(n))))
+#define ROTR32(a,n) (((a)>>(n))|((a)<<(32-(n))))
+
+/* set to a non-zero value to compile ctx_dump() and dump_x() as real
+ * functions; with 0 both expand to nothing */
+#define DEBUG 0
+
+
+#if DEBUG
+ #include "cli.h"
+
+ /* Debug aid: writes the 16 chaining words of ctx and its block counter
+    to the command line interface. */
+ void ctx_dump(const bmw_small_ctx_t* ctx){
+     uint8_t idx = 0;
+     cli_putstr_P(PSTR("\r\n==== ctx dump ===="));
+     while(idx < 16){
+         cli_putstr_P(PSTR("\r\n h["));
+         cli_hexdump(&idx, 1);
+         cli_putstr_P(PSTR("] = "));
+         cli_hexdump_rev(ctx->h + idx, 4);
+         ++idx;
+     }
+     cli_putstr_P(PSTR("\r\n counter = "));
+     cli_hexdump(&(ctx->counter), 4);
+ }
+
+ /* Debug aid: writes `elements` 32-bit words starting at q to the command
+    line interface, each line tagged with the character x. */
+ void dump_x(const uint32_t* q, uint8_t elements, char x){
+     uint8_t idx = 0;
+     cli_putstr_P(PSTR("\r\n==== "));
+     cli_putc(x);
+     cli_putstr_P(PSTR(" dump ===="));
+     while(idx < elements){
+         cli_putstr_P(PSTR("\r\n "));
+         cli_putc(x);
+         cli_putstr_P(PSTR("["));
+         cli_hexdump(&idx, 1);
+         cli_putstr_P(PSTR("] = "));
+         cli_hexdump_rev(q + idx, 4);
+         ++idx;
+     }
+ }
+#else
+ /* DEBUG is 0: both dump helpers expand to nothing, so their arguments are
+    not evaluated */
+ #define ctx_dump(x)
+ #define dump_x(a,b,c)
+#endif
+
+/* the three compression steps; implemented in assembly (bmw_small-asm.S) */
+void bmw_small_f1(uint32_t* q, const void* m, const void* h);
+void bmw_small_f0(uint32_t* h, const void* m, uint32_t* q);
+/* q is not const here: the assembly version overwrites q[9] .. q[15] */
+void bmw_small_f2(uint32_t* h, uint32_t* q, const void* m);
+
+/*
+static
+void bmw_small_f2(uint32_t* h, const uint32_t* q, const void* m){
+ uint32_t xl=0, xh;
+ uint8_t i;
+ for(i=16;i<24;++i){
+ xl ^= q[i];
+ }
+ xh = xl;
+ for(i=24;i<32;++i){
+ xh ^= q[i];
+ }
+#if DEBUG
+ cli_putstr_P(PSTR("\r\n XL = "));
+ cli_hexdump_rev(&xl, 4);
+ cli_putstr_P(PSTR("\r\n XH = "));
+ cli_hexdump_rev(&xh, 4);
+#endif
+ memcpy(h, m, 16*4);
+ h[0] ^= SHL32(xh, 5) ^ SHR32(q[16], 5);
+ h[5] ^= SHL32(xh, 6) ^ SHR32(q[21], 6);
+ h[3] ^= SHR32(xh, 1) ^ SHL32(q[19], 5);
+ h[4] ^= SHR32(xh, 3) ^ q[20];
+ h[6] ^= SHR32(xh, 4) ^ SHL32(q[22], 6);
+ h[2] ^= SHR32(xh, 5) ^ SHL32(q[18], 5);
+ h[1] ^= SHR32(xh, 7) ^ SHL32(q[17], 8);
+ h[7] ^= SHR32(xh,11) ^ SHL32(q[23], 2);
+ for(i=0; i<8; ++i){
+ h[i] += xl ^ q[24+i] ^ q[i];
+ }
+ for(i=0; i<8; ++i){
+ h[8+i] ^= xh ^ q[24+i];
+ h[8+i] += ROTL32(h[(4+i)%8],i+9);
+ }
+ h[11] += SHL32(xl, 4) ^ q[18] ^ q[11];
+ h[10] += SHL32(xl, 6) ^ q[17] ^ q[10];
+ h[ 8] += SHL32(xl, 8) ^ q[23] ^ q[ 8];
+ h[15] += SHR32(xl, 2) ^ q[22] ^ q[15];
+ h[12] += SHR32(xl, 3) ^ q[19] ^ q[12];
+ h[13] += SHR32(xl, 4) ^ q[20] ^ q[13];
+ h[ 9] += SHR32(xl, 6) ^ q[16] ^ q[ 9];
+ h[14] += SHR32(xl, 7) ^ q[21] ^ q[14];
+}
+*/
+/*
+ * Absorbs one message block into the chaining state ctx->h by running the
+ * three stages f0, f1 and f2 over a shared 32-word scratch array, then
+ * counts the block in ctx->counter.
+ *
+ *  ctx   - hashing context, updated in place
+ *  block - the message block to process
+ */
+void bmw_small_nextBlock(bmw_small_ctx_t* ctx, const void* block){
+    uint32_t quad_pipe[32]; /* scratch handed from f0 to f1 to f2 */
+    dump_x(block, 16, 'M');
+    bmw_small_f0(ctx->h, block, quad_pipe);
+    dump_x(quad_pipe, 16, 'Q');
+    bmw_small_f1(quad_pipe, block, ctx->h);
+    dump_x(quad_pipe, 32, 'Q');
+    bmw_small_f2(ctx->h, quad_pipe, block);
+    ++ctx->counter;
+    ctx_dump(ctx);
+}
+
+/*
+ * Processes the final, possibly partial, piece of the message and finalizes
+ * the hash state in ctx->h.
+ *
+ *  ctx      - hashing context; h and counter are updated in place
+ *  block    - remaining message data
+ *  length_b - length of the remaining data in bits; complete blocks contained
+ *             in it are processed first via bmw_small_nextBlock()
+ *
+ * The remaining data is padded with a single 1 bit followed by zeros. The
+ * message length in bits (ctx->counter*512 + length_b) is written into the
+ * last 8 bytes of the final block in native byte order. Afterwards one more
+ * f0/f1/f2 pass is run with ctx->h in the message role and a constant block
+ * (all bytes 0xaa, every fourth byte replaced by 0xa0+i) in the chaining
+ * role; its result becomes the new ctx->h.
+ */
+void bmw_small_lastBlock(bmw_small_ctx_t* ctx, const void* block, uint16_t length_b){
+    /*
+     * One 64-byte buffer with byte, 32-bit and 64-bit views. The union gives
+     * the buffer the alignment of its widest member and makes the length
+     * store a plain member access, replacing the former type-punned store
+     * through (uint64_t*)&buffer[56] and the uint8_t* -> uint32_t* casts.
+     * The bytes produced are the same as before.
+     */
+    union {
+        uint8_t  v8[64];
+        uint32_t v32[16];
+        uint64_t v64[8];
+    } buffer;
+    uint32_t q[32];
+    uint8_t i;
+
+    /* consume any complete blocks still contained in the input */
+    while(length_b >= BMW_SMALL_BLOCKSIZE){
+        bmw_small_nextBlock(ctx, block);
+        length_b -= BMW_SMALL_BLOCKSIZE;
+        block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
+    }
+
+    /* copy the tail and append the single 1 bit right after the last data bit */
+    memset(buffer.v8, 0, 64);
+    memcpy(buffer.v8, block, (length_b+7)/8);
+    buffer.v8[length_b>>3] |= 0x80 >> (length_b&0x07);
+
+    /* no room left for the 64-bit length field: emit an extra block */
+    if(length_b+1>64*8-64){
+        bmw_small_nextBlock(ctx, buffer.v8);
+        memset(buffer.v8, 0, 64-8);
+        ctx->counter -= 1; /* the extra padding block must not count as message data */
+    }
+
+    /* total message length in bits goes into the last 8 bytes */
+    buffer.v64[7] = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
+    bmw_small_nextBlock(ctx, buffer.v8);
+
+    /* final pass: constant block as chaining value, ctx->h as message */
+    memset(buffer.v8, 0xaa, 64);
+    for(i=0; i<16;++i){
+        buffer.v8[i*4] = i+0xa0;
+    }
+    dump_x(ctx->h, 16, 'M');
+    bmw_small_f0(buffer.v32, ctx->h, q);
+    dump_x(buffer.v32, 16, 'a');
+    dump_x(q, 16, 'Q');
+    bmw_small_f1(q, ctx->h, buffer.v32);
+    dump_x(q, 32, 'Q');
+    bmw_small_f2(buffer.v32, q, ctx->h);
+    memcpy(ctx->h, buffer.v32, 64);
+}
+
+/*
+ * Initializes ctx for BMW-224: h[0] is 0x00010203, each following word is the
+ * previous one plus 0x04040404, and the block counter is reset to zero.
+ */
+void bmw224_init(bmw224_ctx_t* ctx){
+    uint32_t iv_word = 0x00010203;
+    uint8_t slot;
+    for(slot=0; slot<16; ++slot){
+        ctx->h[slot] = iv_word;
+        iv_word += 0x04040404;
+    }
+    ctx->counter=0;
+    /* debug: ctx_dump(ctx); */
+}
+
+/*
+ * Initializes ctx for BMW-256: h[0] is 0x40414243, each following word is the
+ * previous one plus 0x04040404, and the block counter is reset to zero.
+ */
+void bmw256_init(bmw256_ctx_t* ctx){
+    uint32_t iv_word = 0x40414243;
+    uint8_t slot;
+    for(slot=0; slot<16; ++slot){
+        ctx->h[slot] = iv_word;
+        iv_word += 0x04040404;
+    }
+    ctx->counter=0;
+    /* debug: ctx_dump(ctx); */
+}
+
+/* BMW-224 entry point for one message block; forwards to bmw_small_nextBlock(). */
+void bmw224_nextBlock(bmw224_ctx_t* ctx, const void* block)
+{
+    bmw_small_nextBlock(ctx, block);
+}
+
+/* BMW-256 entry point for one message block; forwards to bmw_small_nextBlock(). */
+void bmw256_nextBlock(bmw256_ctx_t* ctx, const void* block)
+{
+    bmw_small_nextBlock(ctx, block);
+}
+
+/*
+ * BMW-224 entry point for the final data (length_b bits);
+ * forwards to bmw_small_lastBlock().
+ */
+void bmw224_lastBlock(bmw224_ctx_t* ctx, const void* block, uint16_t length_b)
+{
+    bmw_small_lastBlock(ctx, block, length_b);
+}
+
+/*
+ * BMW-256 entry point for the final data (length_b bits);
+ * forwards to bmw_small_lastBlock().
+ */
+void bmw256_lastBlock(bmw256_ctx_t* ctx, const void* block, uint16_t length_b)
+{
+    bmw_small_lastBlock(ctx, block, length_b);
+}
+
+/* Copies the 224-bit digest, i.e. 224/8 bytes starting at ctx->h[9], to dest. */
+void bmw224_ctx2hash(void* dest, const bmw224_ctx_t* ctx){
+    const void* digest_start = ctx->h + 9;
+    memcpy(dest, digest_start, 224/8);
+}
+
+/* Copies the 256-bit digest, i.e. 256/8 bytes starting at ctx->h[8], to dest. */
+void bmw256_ctx2hash(void* dest, const bmw256_ctx_t* ctx){
+    const void* digest_start = ctx->h + 8;
+    memcpy(dest, digest_start, 256/8);
+}
+
+/*
+ * One-shot BMW-224 over a complete message.
+ *
+ *  dest     - receives the digest written by bmw224_ctx2hash() (224/8 bytes)
+ *  msg      - message data
+ *  length_b - message length in bits
+ */
+void bmw224(void* dest, const void* msg, uint32_t length_b){
+    bmw_small_ctx_t ctx;
+    const uint8_t* cursor = (const uint8_t*)msg;
+    bmw224_init(&ctx);
+    /* feed every complete block; the remainder goes to the last-block routine */
+    for(; length_b>=BMW_SMALL_BLOCKSIZE; length_b-=BMW_SMALL_BLOCKSIZE){
+        bmw_small_nextBlock(&ctx, cursor);
+        cursor += BMW_SMALL_BLOCKSIZE_B;
+    }
+    bmw_small_lastBlock(&ctx, cursor, length_b);
+    bmw224_ctx2hash(dest, &ctx);
+}
+
+/*
+ * One-shot BMW-256 over a complete message.
+ *
+ *  dest     - receives the digest written by bmw256_ctx2hash() (256/8 bytes)
+ *  msg      - message data
+ *  length_b - message length in bits
+ */
+void bmw256(void* dest, const void* msg, uint32_t length_b){
+    bmw_small_ctx_t ctx;
+    const uint8_t* cursor = (const uint8_t*)msg;
+    bmw256_init(&ctx);
+    /* feed every complete block; the remainder goes to the last-block routine */
+    for(; length_b>=BMW_SMALL_BLOCKSIZE; length_b-=BMW_SMALL_BLOCKSIZE){
+        bmw_small_nextBlock(&ctx, cursor);
+        cursor += BMW_SMALL_BLOCKSIZE_B;
+    }
+    bmw_small_lastBlock(&ctx, cursor, length_b);
+    bmw256_ctx2hash(dest, &ctx);
+}
+
+