/* bmw_small-asm.S */
/*
    This file is part of the AVR-Crypto-Lib.
    Copyright (C) 2009  Daniel Otte (daniel.otte@rub.de)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

/*
 * File:        bmw_small-asm.S
 * Author:      Daniel Otte
 * Date:        2009-11-13
 * License:     GPLv3 or later
 * Description: implementation of BlueMidnightWish
 *
 */

#include "avr-asm-macros.S"

/*
 * Rotation code table, read with lpm by rotl32p9 and rotl_addel.
 * The entry for a left rotation by n bits (n is the trailing comment of
 * each .byte) is decoded by those routines as follows:
 *   bit 4     : rotate left by 8 bits first (byte moves)
 *   bit 5     : rotate left by 16 bits first (word swap)
 *   bit 3     : direction of the remaining bit-wise rotation
 *               (set = rotr32, clear = rotl32)
 *   bits 2..0 : number of remaining single-bit rotations (0 = none)
 * Example: 0x1B = left by 8, then right by 3 = left by 5.
 */
shiftcodetable:
;   .byte 0x00 ;  0
shiftcodetable_1:
    .byte 0x01 ;  1
    .byte 0x02 ;  2
    .byte 0x03 ;  3
    .byte 0x04 ;  4
    .byte 0x1B ;  5
    .byte 0x1A ;  6
    .byte 0x19 ;  7
    .byte 0x10 ;  8
shiftcodetable_9:
    .byte 0x11 ;  9
    .byte 0x12 ; 10
    .byte 0x13 ; 11
    .byte 0x2C ; 12
    .byte 0x2B ; 13
    .byte 0x2A ; 14
    .byte 0x29 ; 15
    .byte 0x20 ; 16
;   .byte 0x21 ; 17 unused but necessary for padding

/*******************************************************************************
 * shiftl32
 *  Logical left shift of a 32-bit value, one bit per loop iteration.
 *  value: r25:r22 (in/out)
 *  shift: r20     (number of bits; counted down to zero, so it must be
 *                  non-zero on entry, a zero count would wrap around)
 */
shiftl32:
1:
;   clc
    lsl r22
    rol r23
    rol r24
    rol r25
    dec r20
    brne 1b
    ret

/*******************************************************************************
 * shiftr32
 *  Logical right shift of a 32-bit value, one bit per loop iteration.
 *  value: r25:r22 (in/out)
 *  shift: r20     (number of bits, non-zero; counted down to zero)
 */
shiftr32:
1:
;   clc
    lsr r25
    ror r24
    ror r23
    ror r22
    dec r20
    brne 1b
    ret

/*******************************************************************************
 * rotl32
 *  Rotate a 32-bit value left, one bit per loop iteration.
 *  value: r25:r22 (in/out)
 *  shift: r20     (number of bits, non-zero; counted down to zero)
 *  r21 is used as scratch: it starts as a copy of the top byte and supplies
 *  the bit that wraps around into bit 0. As r21 is only 8 bits wide this
 *  covers at most 8 iterations.
 */
rotl32:
    mov r21, r25
1:
    lsl r21         /* carry = bit that has to wrap around */
    rol r22
    rol r23
    rol r24
    rol r25
    dec r20
    brne 1b
    ret

/*******************************************************************************
 * rotr32
 *  Rotate a 32-bit value right, one bit per loop iteration.
 *  value: r25:r22 (in/out)
 *  shift: r20     (number of bits, non-zero; counted down to zero)
 *  r21 is used as scratch (copy of the lowest byte, supplies the wrapping
 *  bit); at most 8 iterations are covered.
 *  some_ret is a shared plain return used by rotl32p9 and rotl_addel when no
 *  bit-wise rotation is left to do.
 */
rotr32:
    mov r21, r22
1:
    lsr r21         /* carry = bit that has to wrap around */
    ror r25
    ror r24
    ror r23
    ror r22
    dec r20
    brne 1b
some_ret:
    ret

/*******************************************************************************
 * rotl32p9
 *  Rotate left by (r20 + 9) bits; r20 indexes shiftcodetable_9, which holds
 *  the codes for rotations by 9..16.
 *  value: r25:r22 (in/out)
 *  shift: r20     (table index on entry, destroyed)
 *  Z is preserved (saved around the table lookup). r0 and r21 are used as
 *  scratch; r1 is used as zero for the address carry and is cleared again
 *  after being used as scratch by the word swap.
 */
rotl32p9:
    push_range 30, 31
    ldi r30, lo8(shiftcodetable_9)
    ldi r31, hi8(shiftcodetable_9)
    add r30, r20
    adc r31, r1
    lpm r20, Z              /* r20 = rotation code */
    pop_range 30, 31
    sbrs r20, 4
    rjmp 2f
    mov r0, r25             /* code bit 4: rotate left by 8 */
    mov r25, r24
    mov r24, r23
    mov r23, r22
    mov r22, r0
2:
    sbrs r20, 5
    rjmp 3f
    movw r0, r24            /* code bit 5: swap the 16-bit halves */
    movw r24, r22
    movw r22, r0
    clr r1                  /* r1 was used as scratch, make it zero again */
3:
    bst r20, 3              /* T = direction of the remaining rotation */
    andi r20, 0x07          /* remaining single-bit count */
    breq some_ret           /* nothing left to rotate */
    brts rotr32             /* tail call, returns to our caller */
    rjmp rotl32

/*******************************************************************************
 * uint32_t rotl_addel(uint32_t x, uint8_t v){
 *  uint32_t r;
 *  r =  ROTL32(x, (v&0xf)+1);
 *  return r;
 * }
 * param x: r25:r22
 * param v: r20
 * result in r25:r22; r20, r21, r30 and r31 are used as scratch
 */
.global rotl_addel
rotl_addel:
    andi r20, 0x0f
    ldi r30, lo8(shiftcodetable_1)
    ldi r31, hi8(shiftcodetable_1)
    add r30, r20
    adc r31, r1
    lpm r20, Z              /* r20 = rotation code for (v&0xf)+1 */
    sbrs r20, 4
    rjmp 1f
    mov r21, r25            /* code bit 4: rotate left by 8 */
    mov r25, r24
    mov r24, r23
    mov r23, r22
    mov r22, r21
1:
    sbrs r20, 5
    rjmp 2f
    movw r30, r24           /* code bit 5: swap the 16-bit halves */
    movw r24, r22
    movw r22, r30
2:
    bst r20, 3              /* T = direction of the remaining rotation */
    andi r20, 0x07
    breq some_ret
3:
    brts rotr32; 4f
    rjmp rotl32
;4: rjmp rotr32

/******************************************************************************/

/*
 * Register aliases used by bmw_small_s0 .. bmw_small_s5.
 * The accumulator overlays r0/r1 and Z, which is why the common exit
 * (outro_2) clears r1 before returning.
 */
preg0 = 22 /* preg for processing register */
preg1 = 23
preg2 = 24
preg3 = 25
breg0 = 26 /* breg for backup register */
breg1 = 27
breg2 = 18
breg3 = 19
areg0 = 0 /* areg for accumulator register */
areg1 = 1
areg2 = 30
areg3 = 31

/*******************************************************************************
 * uint32_t bmw_small_s0(uint32_t x){
 *  uint32_t r;
 *  r =   SHR32(x, 1)
 *      ^ SHL32(x, 3)
 *      ^ ROTL32(x, 4)
 *      ^ ROTR32(x, 13);
 *  return r;
 * }
 * param x: r25:r22, result in r25:r22
 * modifies r0, r18-r21, r26, r27, r30, r31; r1 is zero on return
 *
 * outro_1 / outro_2 are shared exits that the other s-functions jump to:
 *  outro_1: rotate preg right by one bit, then continue with outro_2
 *  outro_2: preg ^= areg, clear r1, return
 */
.global bmw_small_s0
bmw_small_s0:
    movw breg0, preg0       /* keep a copy of x */
    movw breg2, preg2
    ldi r20, 1
    rcall shiftr32
    movw areg2, preg2       /* acc = SHR32(x, 1) */
    movw areg0, preg0
    movw preg2, breg2
    movw preg0, breg0
    ldi r20, 3
    rcall shiftl32
    eor areg0, preg0        /* acc ^= SHL32(x, 3) */
    eor areg1, preg1
    eor areg2, preg2
    eor areg3, preg3
    movw preg2, breg2
    movw preg0, breg0
    ldi r20, 4
    rcall rotl32
    eor areg0, preg0        /* acc ^= ROTL32(x, 4) */
    eor areg1, preg1
    eor areg2, preg2
    eor areg3, preg3
    /* now the trick, we simply can rotate the old value to the right by 17 */
    /* (ROTL32(x, 4) rotated right by 16+1 bits is ROTR32(x, 13)) */
    movw breg0, preg0 /* first rotate by 16 */
    movw preg0, preg2
    movw preg2, breg0
outro_1:
    ldi r20, 1
    rcall rotr32
outro_2:
    eor preg0, areg0
    eor preg1, areg1
    eor preg2, areg2
    eor preg3, areg3
    clr r1                  /* areg1 is r1, it has to be zero again */
    ret

/*******************************************************************************
 * uint32_t bmw_small_s1(uint32_t x){
 *  uint32_t r;
 *  r =   SHR32(x, 1)
 *      ^ SHL32(x, 2)
 *      ^ ROTL32(x, 8)
 *      ^ ROTR32(x, 9);
 *  return r;
 * }
 * param x: r25:r22, result in r25:r22 (same register usage as bmw_small_s0)
 */
.global bmw_small_s1
bmw_small_s1:
    movw breg0, preg0       /* keep a copy of x */
    movw breg2, preg2
    ldi r20, 1
    rcall shiftr32
    movw areg2, preg2       /* acc = SHR32(x, 1) */
    movw areg0, preg0
    movw preg2, breg2
    movw preg0, breg0
    ldi r20, 2
    rcall shiftl32
    eor areg0, preg0        /* acc ^= SHL32(x, 2) */
    eor areg1, preg1
    eor areg2, preg2
    eor areg3, preg3
    eor areg0, breg3        /* acc ^= ROTL32(x, 8), done by byte selection */
    eor areg1, breg0
    eor areg2, breg1
    eor areg3, breg2
    mov preg0, breg1        /* preg = ROTR32(x, 8) */
    mov preg1, breg2
    mov preg2, breg3
    mov preg3, breg0
    rjmp outro_1            /* one more bit to the right, xor, return */

/*******************************************************************************
 * uint32_t bmw_small_s2(uint32_t x){
 *  uint32_t r;
 *  r =   SHR32(x, 2)
 *      ^ SHL32(x, 1)
 *      ^ ROTL32(x, 12)
 *      ^ ROTR32(x, 7);
 *  return r;
 * }
 * param x: r25:r22, result in r25:r22 (same register usage as bmw_small_s0)
 */
.global bmw_small_s2
bmw_small_s2:
    movw breg0, preg0       /* keep a copy of x */
    movw breg2, preg2
    ldi r20, 2
    rcall shiftr32
    movw areg2, preg2       /* acc = SHR32(x, 2) */
    movw areg0, preg0
    movw preg2, breg2
    movw preg0, breg0
    ldi r20, 1
    rcall shiftl32
    eor areg0, preg0        /* acc ^= SHL32(x, 1) */
    eor areg1, preg1
    eor areg2, preg2
    eor areg3, preg3
    movw preg0, breg2       /* preg = x rotated by 16 */
    movw preg2, breg0
    ldi r20, 4
    rcall rotr32            /* 16 + 4 to the right = ROTL32(x, 12) */
    eor areg0, preg0
    eor areg1, preg1
    eor areg2, preg2
    eor areg3, preg3
    mov preg0, breg1        /* preg = ROTR32(x, 8) */
    mov preg1, breg2
    mov preg2, breg3
    mov preg3, breg0
    ldi r20, 1
    rcall rotl32            /* back one bit to the left = ROTR32(x, 7) */
    rjmp outro_2

/*******************************************************************************
 * uint32_t bmw_small_s3(uint32_t x){
 *  uint32_t r;
 *  r =   SHR32(x, 2)
 *      ^ SHL32(x, 2)
 *      ^ ROTL32(x, 15)
 *      ^ ROTR32(x, 3);
 *  return r;
 * }
 * param x: r25:r22, result in r25:r22 (same register usage as bmw_small_s0)
 */
.global bmw_small_s3
bmw_small_s3:
    movw breg0, preg0       /* keep a copy of x */
    movw breg2, preg2
    ldi r20, 2
    rcall shiftr32
    movw areg2, preg2       /* acc = SHR32(x, 2) */
    movw areg0, preg0
    movw preg2, breg2
    movw preg0, breg0
    ldi r20, 2
    rcall shiftl32
    eor areg0, preg0        /* acc ^= SHL32(x, 2) */
    eor areg1, preg1
    eor areg2, preg2
    eor areg3, preg3
    movw preg0, breg2       /* preg = x rotated by 16 */
    movw preg2, breg0
    ldi r20, 1
    rcall rotr32            /* 16 + 1 to the right = ROTL32(x, 15) */
    eor areg0, preg0
    eor areg1, preg1
    eor areg2, preg2
    eor areg3, preg3
    movw preg0, breg0       /* preg = x */
    movw preg2, breg2
    ldi r20, 3
    rcall rotr32            /* ROTR32(x, 3) */
    rjmp outro_2

/*******************************************************************************
 * uint32_t bmw_small_s4(uint32_t x){
 *  uint32_t r;
 *  r =   SHR32(x, 1)
 *      ^ x;
 *  return r;
 * }
 * param x: r25:r22, result in r25:r22
 * modifies r0, r20, r30, r31; r1 is zero on return
 */
.global bmw_small_s4
bmw_small_s4:
    movw areg0, preg0       /* acc = x */
    movw areg2, preg2
    ldi r20, 1
    rcall shiftr32
    rjmp outro_2

/*******************************************************************************
 * uint32_t bmw_small_s5(uint32_t x){
 *  uint32_t r;
 *  r =   SHR32(x, 2)
 *      ^ x;
 *  return r;
 * }
 * param x: r25:r22, result in r25:r22
 * modifies r0, r20, r30, r31; r1 is zero on return
 */
.global bmw_small_s5
bmw_small_s5:
    movw areg0, preg0       /* acc = x */
    movw areg2, preg2
    ldi r20, 2
    rcall shiftr32
    rjmp outro_2

/*******************************************************************************
 * uint32_t bmw_small_r1(uint32_t x){
 *  uint32_t r;
 *  r =   ROTL32(x, 3);
 *  return r;
 * }
 * param x: r25:r22, result in r25:r22; modifies r20, r21
 */
.global bmw_small_r1
bmw_small_r1:
    ldi r20, 3
    rjmp rotl32

/*******************************************************************************
 * uint32_t bmw_small_r2(uint32_t x){
 *  uint32_t r;
 *  r =   ROTL32(x, 7);
 *  return r;
 * }
 * param x: r25:r22, result in r25:r22; modifies r20, r21
 */
.global bmw_small_r2
bmw_small_r2:
    ldi r20, 7
    rjmp rotl32

/*******************************************************************************
 * uint32_t bmw_small_r3(uint32_t x){
 *  uint32_t r;
 *  r =   ROTL32(x, 13);
 *  return r;
 * }
 * param x: r25:r22, result in r25:r22; modifies r18-r21
 */
.global bmw_small_r3
bmw_small_r3:
    movw r18, r24           /* swap halves (rotate by 16) ... */
    movw r24, r22
    movw r22, r18
    ldi r20, 3              /* ... then 3 bits back to the right */
    rjmp rotr32

/*******************************************************************************
 * uint32_t bmw_small_r4(uint32_t x){
 *  uint32_t r;
 *  r =   ROTL32(x, 16);
 *  return r;
 * }
 * param x: r25:r22, result in r25:r22; modifies r18, r19
 */
.global bmw_small_r4
bmw_small_r4:
    movw r18, r24           /* swap the 16-bit halves */
    movw r24, r22
    movw r22, r18
    ret
/*******************************************************************************
 * uint32_t bmw_small_r5(uint32_t x){
 *  uint32_t r;
 *  r =   ROTR32(x, 13);
 *  return r;
 * }
 * param x: r25:r22, result in r25:r22; modifies r18-r21
 */
.global bmw_small_r5
bmw_small_r5:
    movw r18, r24           /* swap halves (rotate by 16) ... */
    movw r24, r22
    movw r22, r18
    ldi r20, 3              /* ... then 3 bits further to the left */
    rjmp rotl32

/*******************************************************************************
 * uint32_t bmw_small_r6(uint32_t x){
 *  uint32_t r;
 *  r =   ROTR32(x, 9);
 *  return r;
 * }
 * param x: r25:r22, result in r25:r22; modifies r18, r20, r21
 */
.global bmw_small_r6
bmw_small_r6:
    mov r18, r22            /* rotate right by 8 using byte moves ... */
    mov r22, r23
    mov r23, r24
    mov r24, r25
    mov r25, r18
    ldi r20, 1              /* ... then one more bit */
    rjmp rotr32

/*******************************************************************************
 * uint32_t bmw_small_r7(uint32_t x){
 *  uint32_t r;
 *  r =   ROTR32(x, 5);
 *  return r;
 * }
 * param x: r25:r22, result in r25:r22; modifies r20, r21
 */
.global bmw_small_r7
bmw_small_r7:
    ldi r20, 5
    rjmp rotr32

/******************************************************************************/

/* constants used by addelement, one 32-bit word per value of j (read via lpm) */
const_lut:
    .long 0x55555550, 0x5aaaaaa5, 0x5ffffffa, 0x6555554f
    .long 0x6aaaaaa4, 0x6ffffff9, 0x7555554e, 0x7aaaaaa3
    .long 0x7ffffff8, 0x8555554d, 0x8aaaaaa2, 0x8ffffff7
    .long 0x9555554c, 0x9aaaaaa1, 0x9ffffff6, 0xa555554b

/*******************************************************************************
 * uint32_t addelement(uint8_t j, const uint32_t *m, const uint32_t *h){
 *  uint32_t r;
 *  r  = pgm_read_dword(k_lut+j);
 *  r += rotl_addel(((uint32_t*)m)[j&0xf], j+0);
 *  r += rotl_addel(((uint32_t*)m)[(j+3)&0xf], j+3);
 *  r -= rotl_addel(((uint32_t*)m)[(j+10)&0xf], j+10);
 *  r ^= ((uint32_t*)h)[(j+7)&0xf];
 *  return r;
 * }
 * param j: r24
 * param m: r22:r23
 * param h: r20:r21
 * result in r25:r22
 *
 * (k_lut of the reference code is const_lut in this file.)
 * r8-r16 are saved and restored; r20, r21, r26, r27, r30, r31 are modified.
 * r1 is expected to be zero (used for the address carries).
 */
j = 16
acc2 = 8
acc3 = 9
h0 = 10
h1 = 11
m0 = 12
m1 = 13
acc0 = 14
acc1 = 15

.global addelement
addelement:
    push_range 8, 16
    mov j, r24
    movw h0, r20
    movw m0, r22
    lsl r24                 /* byte offset into const_lut = 4*j */
    lsl r24
    ldi r30, lo8(const_lut)
    ldi r31, hi8(const_lut)
    add r30, r24
    adc r31, r1
    lpm acc0, Z+            /* acc = const_lut[j] */
    lpm acc1, Z+
    lpm acc2, Z+
    lpm acc3, Z+
    /* acc += rotl_addel(m[j&0xf], j) */
    mov r20, j
    andi r20, 0x0f
    lsl r20
    lsl r20
    movw r26, m0
    add r26, r20
    adc r27, r1
    ld r22, X+
    ld r23, X+
    ld r24, X+
    ld r25, X+
    mov r20, j
    rcall rotl_addel
    add acc0, r22
    adc acc1, r23
    adc acc2, r24
    adc acc3, r25
    /* acc += rotl_addel(m[(j+3)&0xf], j+3) */
    subi j, -3
    mov r20, j
    andi r20, 0x0f
    lsl r20
    lsl r20
    movw r26, m0
    add r26, r20
    adc r27, r1
    ld r22, X+
    ld r23, X+
    ld r24, X+
    ld r25, X+
    mov r20, j
    rcall rotl_addel
    add acc0, r22
    adc acc1, r23
    adc acc2, r24
    adc acc3, r25
    /* acc -= rotl_addel(m[(j+10)&0xf], j+10); j is now j+3+7 */
    subi j, -7
    mov r20, j
    andi r20, 0x0f
    lsl r20
    lsl r20
    movw r26, m0
    add r26, r20
    adc r27, r1
    ld r22, X+
    ld r23, X+
    ld r24, X+
    ld r25, X+
    mov r20, j
    rcall rotl_addel
    sub acc0, r22
    sbc acc1, r23
    sbc acc2, r24
    sbc acc3, r25
    /* result = acc ^ h[(j+7)&0xf]; j is now j+10-3 */
    subi j, 3
    mov r20, j
    andi r20, 0x0f
    lsl r20
    lsl r20
    movw r26, h0
    add r26, r20
    adc r27, r1
    ld r22, X+
    ld r23, X+
    ld r24, X+
    ld r25, X+
    eor r22, acc0
    eor r23, acc1
    eor r24, acc2
    eor r25, acc3
    pop_range 8, 16
    ret

/*******************************************************************************
 * uint32_t bmw_small_expand1(uint8_t j, const void *m, const void *h, const uint32_t *q){
 *  uint32_t(*s[])(uint32_t) = {bmw_small_s1, bmw_small_s2, bmw_small_s3, bmw_small_s0};
 *  uint32_t r;
 *  uint8_t i;
 *  r = addelement(j, m, h);
 *  i=15;
 *  do{
 *      r += s[i%4](q[j+i]);
 *  }while(i--!=0);
 *  return r;
 * }
 *
 * param j: r24
 * param m: r22:r23
 * param h: r20:r21
 * param q: r18:r19
 * result in r25:r22
 *
 * The loop below walks q[j] .. q[j+15] in ascending order, four words per
 * iteration (s1, s2, s3, s0).
 * r2-r5, r16, r28, r29 are saved and restored.
 * expand1_exit is the common epilogue, also used by bmw_small_expand2.
 */
acc0 = 2
acc1 = 3
acc2 = 4
acc3 = 5

.global bmw_small_expand1
bmw_small_expand1:
    push_range 28, 29
    movw r28, r18
    mov r18, r24
    lsl r18
    lsl r18
    add r28, r18            /* Y = &q[j] */
    adc r29, r1
    rcall addelement        /* j, m, h are still in r24, r22:r23, r20:r21 */
    push_range 2, 5
    push r16
    ldi r16, 4              /* 4 rounds of 4 words */
    movw acc0, r22          /* acc = addelement(j, m, h) */
    movw acc2, r24
1:
    ld r22, Y+
    ld r23, Y+
    ld r24, Y+
    ld r25, Y+
    rcall bmw_small_s1
    add acc0, r22
    adc acc1, r23
    adc acc2, r24
    adc acc3, r25
    ld r22, Y+
    ld r23, Y+
    ld r24, Y+
    ld r25, Y+
    rcall bmw_small_s2
    add acc0, r22
    adc acc1, r23
    adc acc2, r24
    adc acc3, r25
    ld r22, Y+
    ld r23, Y+
    ld r24, Y+
    ld r25, Y+
    rcall bmw_small_s3
    add acc0, r22
    adc acc1, r23
    adc acc2, r24
    adc acc3, r25
    ld r22, Y+
    ld r23, Y+
    ld r24, Y+
    ld r25, Y+
    rcall bmw_small_s0
    add acc0, r22
    adc acc1, r23
    adc acc2, r24
    adc acc3, r25
    dec r16
    brne 1b
expand1_exit:
    movw r22, acc0          /* return value */
    movw r24, acc2
    pop r16
    pop_range 2, 5
    pop_range 28, 29
    ret

/*******************************************************************************
 * uint32_t bmw_small_expand2(uint8_t j, const void *m, const void *h, const uint32_t *q){
 *  uint32_t(*rf[])(uint32_t) = {bmw_small_r1, bmw_small_r2, bmw_small_r3,
 *                               bmw_small_r4, bmw_small_r5, bmw_small_r6,
 *                               bmw_small_r7};
 *  uint32_t r;
 *  uint8_t i;
 *  r = addelement(j, m, h);
 *  for(i=0; i<14; i+=2){
 *      r += q[j+i];
 *  }
 *  for(i=0; i<14; i+=2){
 *      r += rf[i/2](q[j+i+1]);
 *  }
 *  r += bmw_small_s4(q[j+14]);
 *  r += bmw_small_s5(q[j+15]);
 *  return r;
 * }
 *
 * param j: r24
 * param m: r22:r23
 * param h: r20:r21
 * param q: r18:r19
 * result in r25:r22
 *
 * expand2_jumptable has one single-word entry per word q[j+i], i = 0..15,
 * and is entered with icall: a plain ret leaves the word unchanged (even i),
 * the rjmp entries apply r1..r7 (odd i) and s4/s5 (i = 14, 15).
 */
expand2_jumptable:
    ret
    rjmp bmw_small_r1
    ret
    rjmp bmw_small_r2
    ret
    rjmp bmw_small_r3
    ret
    rjmp bmw_small_r4
    ret
    rjmp bmw_small_r5
    ret
    rjmp bmw_small_r6
    ret
    rjmp bmw_small_r7
    rjmp bmw_small_s4
    rjmp bmw_small_s5

.global bmw_small_expand2
bmw_small_expand2:
    push_range 28, 29
    movw r28, r18
    mov r18, r24
    lsl r18
    lsl r18
    add r28, r18            /* Y = &q[j] */
    adc r29, r1
    rcall addelement
    push_range 2, 5
    push r16
    ldi r16, 16             /* 16 words */
    movw acc0, r22          /* acc = addelement(j, m, h) */
    movw acc2, r24
    ldi r30, pm_lo8(expand2_jumptable)
    ldi r31, pm_hi8(expand2_jumptable)
1:
    ld r22, Y+
    ld r23, Y+
    ld r24, Y+
    ld r25, Y+
    push r30                /* s4/s5 use Z as accumulator, keep the table pointer */
    push r31
    icall
    pop r31
    pop r30
    adiw r30, 1             /* next table entry */
    add acc0, r22
    adc acc1, r23
    adc acc2, r24
    adc acc3, r25
    dec r16
    brne 1b
    rjmp expand1_exit       /* same stack layout as bmw_small_expand1 */

/*******************************************************************************
 * void bmw_small_f1(uint32_t *q, const void *m, const void *h){
 *  uint8_t i;
 *  q[16] = bmw_small_expand1(0, m, h, q);
 *  q[17] = bmw_small_expand1(1, m, h, q);
 *  for(i=2; i<16; ++i){
 *      q[16+i] = bmw_small_expand2(i, m, h, q);
 *  }
 * }
 *
 * param q: r24:r25
 * param m: r22:r23
 * param h: r20:r21
 *
 * NOTE(review): r2-r7 and r28/r29 are modified here without being saved
 * (the push_range/pop_range lines are commented out); only r16 is saved.
 * The caller in this file, bmw_small_nextBlock, saves r2-r17 and r28/r29
 * itself. Any other caller has to take this into account.
 */
m0 = 2
m1 = 3
h0 = 4
h1 = 5
q0 = 6
q1 = 7

.global bmw_small_f1
bmw_small_f1:
;   push_range 2, 7
;   push_range 28, 29
    push r16
    movw q0, r24
    movw m0, r22
    movw h0, r20
    movw r28, q0
    adiw r28, 63            /* Y = &q[16] (64 bytes, added in two steps) */
    adiw r28, 1
    /* q[16] = expand1(0, m, h, q); m and h are still in r22:r23, r20:r21 */
    clr r24
    clr r25 /* not required */
    movw r18, q0
    rcall bmw_small_expand1
    st Y+, r22
    st Y+, r23
    st Y+, r24
    st Y+, r25
    /* q[17] = expand1(1, m, h, q) */
    ldi r16, 1
    mov r24, r16
    clr r25 /* not required */
    movw r22, m0
    movw r20, h0
    movw r18, q0
    rcall bmw_small_expand1
    st Y+, r22
    st Y+, r23
    st Y+, r24
    st Y+, r25
    inc r16
    /* q[16+i] = expand2(i, m, h, q) for i = 2..15 */
1:
    mov r24, r16
    movw r22, m0
    movw r20, h0
    movw r18, q0
    rcall bmw_small_expand2
    st Y+, r22
    st Y+, r23
    st Y+, r24
    st Y+, r25
    inc r16
    cpi r16, 16
    brne 1b
    pop r16
;   pop_range 28, 29
;   pop_range 2, 7
    ret

/*******************************************************************************
 * uint16_t hack_table[5]   PROGMEM = { 0x0311, 0xDDB3, 0x2A79, 0x07AA, 0x51C2 };
 * uint8_t  offset_table[5] PROGMEM = { 4+16, 6+16, 9+16, 12+16, 13+16 };
 *
 * void bmw_small_f0(uint32_t *h, const void *m, uint32_t *q){
 *  uint16_t hack_reg;
 *  uint8_t c,i,j;
 *  uint32_t(*s[])(uint32_t)={ bmw_small_s0, bmw_small_s1, bmw_small_s2,
 *                             bmw_small_s3, bmw_small_s4 };
 *  for(i=0; i<16; ++i){
 *      ((uint32_t*)h)[i] ^= ((uint32_t*)m)[i];
 *  }
 *  dump_x(h, 16, 'T');
 *  memset(q, 0, 4*16);
 *  c=4;
 *  do{
 *      i=15;
 *      j=pgm_read_byte(offset_table+c);
 *      hack_reg=pgm_read_word(&(hack_table[c]));
 *      do{
 *          if(hack_reg&1){
 *              q[i]-= h[j&15];
 *          }else{
 *              q[i]+= h[j&15];
 *          }
 *          --j;
 *          hack_reg>>= 1;
 *      }while(i--!=0);
 *  }while(c--!=0);
 *  dump_x(q, 16, 'W');
 *  for(i=0; i<16; ++i){
 *      q[i] = s[i%5](q[i]);
 *  }
 *  for(i=0; i<16; ++i){
 *      ((uint32_t*)h)[i] ^= ((uint32_t*)m)[i];
 *  }
 *  for(i=0; i<16; ++i){
 *      q[i] += h[(i+1)&0xf];
 *  }
 * }
 *
 * param h: r24:r25
 * param m: r22:r23
 * param q: r20:r21
 *
 * Differences to the reference code above that are visible in the assembly:
 *  - each pass walks q[0] .. q[15] in ascending order, so the 16-bit sign
 *    masks loaded into r17:r16 are the hack_table values with their bits
 *    reversed (0x0311 -> 0x88C0, ...), consumed LSB first;
 *  - the wrap-around of the h index is handled by calling f0_helper twice
 *    per pass (from the start offset to h[15], then from h[0]).
 *
 * NOTE(review): r4-r11, r16, r17, r28 and r29 are modified without being
 * saved (the push_range/pop_range lines are commented out). The caller in
 * this file, bmw_small_nextBlock, saves r2-r17 and r28/r29 itself.
 */
h0 = 24
h1 = 25
m0 = 22
m1 = 23
q0 = 20
q1 = 21
acc0 = 4
acc1 = 5
acc2 = 6
acc3 = 7
bcc0 = 8
bcc1 = 9
bcc2 = 10
bcc3 = 11
hack = 16

/*
 * f0_helper: for r18 consecutive words do  *Z (+/-)= *X,  advancing X and Z.
 *  X       : source words (h)
 *  Z       : destination words (q)
 *  r17:r16 : sign mask, shifted right once per word; bit set = subtract
 *  r18     : word count (non-zero)
 */
f0_helper:
20:
    ldd acc0, Z+0
    ldd acc1, Z+1
    ldd acc2, Z+2
    ldd acc3, Z+3
    ld bcc0, X+
    ld bcc1, X+
    ld bcc2, X+
    ld bcc3, X+
    lsr r17                 /* carry = next sign bit */
    ror r16
    brcs l20_sub
    add acc0, bcc0
    adc acc1, bcc1
    adc acc2, bcc2
    adc acc3, bcc3
    rjmp l20_post
l20_sub:
    sub acc0, bcc0
    sbc acc1, bcc1
    sbc acc2, bcc2
    sbc acc3, bcc3
l20_post:
    st Z+, acc0
    st Z+, acc1
    st Z+, acc2
    st Z+, acc3
    dec r18
    brne 20b
    ret

/* s[i%5] for i = 0..15, one single-word entry per i, entered with icall */
f0_jumptable:
    rjmp bmw_small_s0
    rjmp bmw_small_s1
    rjmp bmw_small_s2
    rjmp bmw_small_s3
    rjmp bmw_small_s4
    rjmp bmw_small_s0
    rjmp bmw_small_s1
    rjmp bmw_small_s2
    rjmp bmw_small_s3
    rjmp bmw_small_s4
    rjmp bmw_small_s0
    rjmp bmw_small_s1
    rjmp bmw_small_s2
    rjmp bmw_small_s3
    rjmp bmw_small_s4
    rjmp bmw_small_s0

.global bmw_small_f0
bmw_small_f0:
;   push_range 28, 29
;   push_range 4, 11
;   push_range 16, 17
    /* h[i] ^= m[i]; q[i]= 0 */
    movw r26, h0 ; h
    movw r30, m0 ; m
    movw r28, q0 ; q
    ldi r18, 64
1:  ld r0, X
    ld r19, Z+
    eor r0, r19
    st X+, r0
    st Y+, r1
    dec r18
    brne 1b
;------ pass starting at h[5]
    ldi r17, 0x88
    ldi r16, 0xC0
    movw r26, h0 ; X = h
    adiw r26, 5*4
    ldi r18, 16-5
    movw r30, q0 ; Z = q
    rcall f0_helper
    movw r26, h0 ; X = h
    ldi r18, 5
    rcall f0_helper
;--- pass starting at h[7]
    ldi r17, 0xCD
    ldi r16, 0xBB
    movw r26, h0 ; X = h
    adiw r26, 7*4
    ldi r18, 16-7
    movw r30, q0 ; Z = q
    rcall f0_helper
    movw r26, h0 ; X = h
    ldi r18, 7
    rcall f0_helper
;--- pass starting at h[10]
    ldi r17, 0x9E
    ldi r16, 0x54
    movw r26, h0 ; X = h
    adiw r26, 10*4
    ldi r18, 16-10
    movw r30, q0 ; Z = q
    rcall f0_helper
    movw r26, h0 ; X = h
    ldi r18, 10
    rcall f0_helper
;--- pass starting at h[13]
    ldi r17, 0x55
    ldi r16, 0xE0
    movw r26, h0 ; X = h
    adiw r26, 13*4
    ldi r18, 16-13
    movw r30, q0 ; Z = q
    rcall f0_helper
    movw r26, h0 ; X = h
    ldi r18, 13
    rcall f0_helper
;--- pass starting at h[14]
    ldi r17, 0x43
    ldi r16, 0x8A
    movw r26, h0 ; X = h
    adiw r26, 14*4
    ldi r18, 16-14
    movw r30, q0 ; Z = q
    rcall f0_helper
    movw r26, h0 ; X = h
    ldi r18, 14
    rcall f0_helper
;--------------- h[i] ^= m[i]
    movw r26, h0 ; h
    movw r30, m0 ; m
    ldi r18, 64
25: ld r0, X
    ld r19, Z+
    eor r0, r19
    st X+, r0
    dec r18
    brne 25b
;--------------- q[i] = s[i%5](q[i])
    /* the s-functions destroy r18-r27, r30, r31: keep the table pointer,
       h and q in r8:r9, r10:r11 and r4:r5 */
    ldi r16, 16
    ldi r30, pm_lo8(f0_jumptable)
    ldi r31, pm_hi8(f0_jumptable)
    movw bcc0, r30
    movw bcc2, h0 ; h
    movw acc0, q0 ; q
    movw r28, q0 ; Y = q
30:
    ldd r22, Y+0
    ldd r23, Y+1
    ldd r24, Y+2
    ldd r25, Y+3
    icall
    st Y+, r22
    st Y+, r23
    st Y+, r24
    st Y+, r25
    movw r30, bcc0
    adiw r30, 1
    movw bcc0, r30
    dec r16
    brne 30b
;--------------- q[i] += h[(i+1)%16]
    movw r30, acc0 ; q
    movw r26, bcc2 ; h
    adiw r26, 4
    ldi r18, 15
    /* q[0..14] += h[1..15]; dec does not touch the carry, and each word
       starts with a fresh add */
40:
    ld acc0, Z
    ld acc1, X+
    add acc0, acc1
    st Z+, acc0
    ld acc0, Z
    ld acc1, X+
    adc acc0, acc1
    st Z+, acc0
    ld acc0, Z
    ld acc1, X+
    adc acc0, acc1
    st Z+, acc0
    ld acc0, Z
    ld acc1, X+
    adc acc0, acc1
    st Z+, acc0
    dec r18
    brne 40b
    /* q[15] += h[0] */
    movw r26, bcc2 ; h
    ld acc0, Z
    ld acc1, X+
    add acc0, acc1
    st Z+, acc0
    ld acc0, Z
    ld acc1, X+
    adc acc0, acc1
    st Z+, acc0
    ld acc0, Z
    ld acc1, X+
    adc acc0, acc1
    st Z+, acc0
    ld acc0, Z
    ld acc1, X+
    adc acc0, acc1
    st Z+, acc0
;   pop_range 16, 17
;   pop_range 4, 11
;   pop_range 28, 29
    ret

/*******************************************************************************
 * void bmw_small_f2(uint32_t *h, const uint32_t *q, const void *m){
 *  uint32_t xl=0, xh;
 *  uint8_t i;
 *  for(i=16;i<24;++i){
 *      xl ^= q[i];
 *  }
 *  xh = xl;
 *  for(i=24;i<32;++i){
 *      xh ^= q[i];
 *  }
 *  memcpy(h, m, 16*4);
 *  h[0] ^= SHL32(xh, 5) ^ SHR32(q[16], 5);
 *  h[5] ^= SHL32(xh, 6) ^ SHR32(q[21], 6);
 *  h[3] ^= SHR32(xh, 1) ^ SHL32(q[19], 5);
 *  h[4] ^= SHR32(xh, 3) ^ q[20];
 *  h[6] ^= SHR32(xh, 4) ^ SHL32(q[22], 6);
 *  h[2] ^= SHR32(xh, 5) ^ SHL32(q[18], 5);
 *  h[1] ^= SHR32(xh, 7) ^ SHL32(q[17], 8);
 *  h[7] ^= SHR32(xh,11) ^ SHL32(q[23], 2);
 *  for(i=0; i<8; ++i){
 *      h[i] += xl ^ q[24+i] ^ q[i];
 *  }
 *  for(i=0; i<8; ++i){
 *      h[8+i] ^= xh ^ q[24+i];
 *      h[8+i] += ROTL32(h[(4+i)%8],i+9);
 *  }
 *  h[11] += SHL32(xl, 4) ^ q[18] ^ q[11];
 *  h[10] += SHL32(xl, 6) ^ q[17] ^ q[10];
 *  h[ 8] += SHL32(xl, 8) ^ q[23] ^ q[ 8];
 *  h[15] += SHR32(xl, 2) ^ q[22] ^ q[15];
 *  h[12] += SHR32(xl, 3) ^ q[19] ^ q[12];
 *  h[13] += SHR32(xl, 4) ^ q[20] ^ q[13];
 *  h[ 9] += SHR32(xl, 6) ^ q[16] ^ q[ 9];
 *  h[14] += SHR32(xl, 7) ^ q[21] ^ q[14];
 * }
 *
 * param h: r24:r25
 * param q: r22:r23
 * param m: r20:r21
 *
 * Notes on the assembly:
 *  - the statements for h[0]..h[7] are ordered so that the shifted copy of
 *    xh in t3:t0 can be reused and only shifted by the difference;
 *  - q is written to, although the reference prototype declares it const:
 *    q[9..15] are overwritten with q[9+k] ^ q[16+k] before the last seven
 *    additions (which is why the short comments there name only one q word);
 *  - NOTE(review): r2-r17, r28 and r29 are modified without being saved
 *    (the push_range/pop_range lines are commented out). The caller in this
 *    file, bmw_small_nextBlock, saves them itself.
 */
xl0 = 2
xl1 = 3
xl2 = 4
xl3 = 5
xh0 = 6
xh1 = 7
xh2 = 8
xh3 = 9
q0 = 10
q1 = 11
h0 = 12
h1 = 13
t0 = 14
t1 = 15
t2 = 16
t3 = 17

/*
 * modify_h_2 addr:  Z[addr] += Y[addr] ^ t   (32-bit words, t = t3:t0)
 * Used with Z = &h[9] and Y = &q[9]. Uses r0 and r22-r25 as scratch.
 */
.macro modify_h_2 addr:req
    ldd r22, Y+\addr*4+0
    ldd r23, Y+\addr*4+1
    ldd r24, Y+\addr*4+2
    ldd r25, Y+\addr*4+3
    eor r22, t0
    eor r23, t1
    eor r24, t2
    eor r25, t3
    ldd r0, Z+\addr*4+0
    add r0, r22
    std Z+\addr*4+0, r0
    ldd r0, Z+\addr*4+1
    adc r0, r23
    std Z+\addr*4+1, r0
    ldd r0, Z+\addr*4+2
    adc r0, r24
    std Z+\addr*4+2, r0
    ldd r0, Z+\addr*4+3
    adc r0, r25
    std Z+\addr*4+3, r0
.endm

/* tshiftr: logical right shift of t3:t0 by r20 bits (r20 non-zero) */
tshiftr:
    lsr t3
    ror t2
    ror t1
    ror t0
    dec r20
    brne tshiftr
    ret

/* tshiftl: logical left shift of t3:t0 by r20 bits (r20 non-zero) */
tshiftl:
    lsl t0
    rol t1
    rol t2
    rol t3
    dec r20
    brne tshiftl
    ret

.global bmw_small_f2
bmw_small_f2:
    /* memcpy(h, m, 64) */
    movw r26, r24
    movw r30, r20
    ldi r18, 64
1:  ld r0, Z+
    st X+, r0
    dec r18
    brne 1b
;   push_range 28, 29
;   push_range 2, 17
    movw q0, r22
    movw h0, r24
    /* calc xl */
/*  for(i=16;i<24;++i){
        xl ^= q[i];
    }
*/
    movw r26, q0
    adiw r26, 63
    adiw r26, 1 ; X points at q[16]
    ld xl0, X+
    ld xl1, X+
    ld xl2, X+
    ld xl3, X+
    ldi r18, 8-1
20: ld r0, X+
    eor xl0, r0
    ld r0, X+
    eor xl1, r0
    ld r0, X+
    eor xl2, r0
    ld r0, X+
    eor xl3, r0
    dec r18
    brne 20b
    /* calc xh */
/*  xh = xl
    for(i=24;i<32;++i){
        xh ^= q[i];
    }
*/
    movw xh0, xl0
    movw xh2, xl2
    ldi r18, 8
25: ld r0, X+
    eor xh0, r0
    ld r0, X+
    eor xh1, r0
    ld r0, X+
    eor xh2, r0
    ld r0, X+
    eor xh3, r0
    dec r18
    brne 25b
/* h[0]..h[7] */
    movw r30, h0
    movw r28, q0
    adiw r28, 60 ; Y points at q[15]
/*  h[0] ^= SHL32(xh, 5) ^ SHR32(q[16], 5); */
    movw t0, xh0
    movw t2, xh2
    ldi r20, 5
    rcall tshiftl
    ldd r22, Y+4
    ldd r23, Y+5
    ldd r24, Y+6
    ldd r25, Y+7
    ldi r20, 5
    rcall shiftr32
    eor r22, t0
    eor r23, t1
    eor r24, t2
    eor r25, t3
    ldd r0, Z+0
    eor r22, r0
    ldd r0, Z+1
    eor r23, r0
    ldd r0, Z+2
    eor r24, r0
    ldd r0, Z+3
    eor r25, r0
    std Z+0, r22
    std Z+1, r23
    std Z+2, r24
    std Z+3, r25
/*  h[5] ^= SHL32(xh, 6) ^ SHR32(q[21], 6); */
    lsl t0                  /* t = xh << 6 (one more bit) */
    rol t1
    rol t2
    rol t3
    ldd r22, Y+24
    ldd r23, Y+25
    ldd r24, Y+26
    ldd r25, Y+27
    ldi r20, 6
    rcall shiftr32
    eor r22, t0
    eor r23, t1
    eor r24, t2
    eor r25, t3
    ldd r0, Z+20
    eor r22, r0
    ldd r0, Z+21
    eor r23, r0
    ldd r0, Z+22
    eor r24, r0
    ldd r0, Z+23
    eor r25, r0
    std Z+20, r22
    std Z+21, r23
    std Z+22, r24
    std Z+23, r25
/*  h[3] ^= SHR32(xh, 1) ^ SHL32(q[19], 5); */
    movw t0, xh0
    movw t2, xh2
    lsr t3                  /* t = xh >> 1 */
    ror t2
    ror t1
    ror t0
    ldd r22, Y+16
    ldd r23, Y+17
    ldd r24, Y+18
    ldd r25, Y+19
    ldi r20, 5
    rcall shiftl32
    eor r22, t0
    eor r23, t1
    eor r24, t2
    eor r25, t3
    ldd r0, Z+12
    eor r22, r0
    ldd r0, Z+13
    eor r23, r0
    ldd r0, Z+14
    eor r24, r0
    ldd r0, Z+15
    eor r25, r0
    std Z+12, r22
    std Z+13, r23
    std Z+14, r24
    std Z+15, r25
/*  h[4] ^= SHR32(xh, 3) ^ q[20]; */
    ldi r20, 2              /* t = xh >> 3 (two more bits) */
    rcall tshiftr
    ldd r22, Y+20
    ldd r23, Y+21
    ldd r24, Y+22
    ldd r25, Y+23
    eor r22, t0
    eor r23, t1
    eor r24, t2
    eor r25, t3
    ldd r0, Z+16
    eor r22, r0
    ldd r0, Z+17
    eor r23, r0
    ldd r0, Z+18
    eor r24, r0
    ldd r0, Z+19
    eor r25, r0
    std Z+16, r22
    std Z+17, r23
    std Z+18, r24
    std Z+19, r25
/*  h[6] ^= SHR32(xh, 4) ^ SHL32(q[22], 6); */
    lsr t3                  /* t = xh >> 4 (one more bit) */
    ror t2
    ror t1
    ror t0
    ldd r22, Y+28
    ldd r23, Y+29
    ldd r24, Y+30
    ldd r25, Y+31
    ldi r20, 6
    rcall shiftl32
    eor r22, t0
    eor r23, t1
    eor r24, t2
    eor r25, t3
    ldd r0, Z+24
    eor r22, r0
    ldd r0, Z+25
    eor r23, r0
    ldd r0, Z+26
    eor r24, r0
    ldd r0, Z+27
    eor r25, r0
    std Z+24, r22
    std Z+25, r23
    std Z+26, r24
    std Z+27, r25
/*  h[2] ^= SHR32(xh, 5) ^ SHL32(q[18], 5); */
    lsr t3                  /* t = xh >> 5 (one more bit) */
    ror t2
    ror t1
    ror t0
    ldd r22, Y+12
    ldd r23, Y+13
    ldd r24, Y+14
    ldd r25, Y+15
    ldi r20, 5
    rcall shiftl32
    eor r22, t0
    eor r23, t1
    eor r24, t2
    eor r25, t3
    ldd r0, Z+8
    eor r22, r0
    ldd r0, Z+9
    eor r23, r0
    ldd r0, Z+10
    eor r24, r0
    ldd r0, Z+11
    eor r25, r0
    std Z+8 , r22
    std Z+9 , r23
    std Z+10, r24
    std Z+11, r25
/*  h[1] ^= SHR32(xh, 7) ^ SHL32(q[17], 8); */
    ldi r20, 2              /* t = xh >> 7 (two more bits) */
    rcall tshiftr
    ldd r23, Y+8            /* q[17] << 8: bytes loaded one position up, */
    ldd r24, Y+9            /* the lowest byte of the shifted value is 0  */
    ldd r25, Y+10
    mov r22, t0             /* 0 ^ t0 */
    eor r23, t1
    eor r24, t2
    eor r25, t3
    ldd r0, Z+4
    eor r22, r0
    ldd r0, Z+5
    eor r23, r0
    ldd r0, Z+6
    eor r24, r0
    ldd r0, Z+7
    eor r25, r0
    std Z+4 , r22
    std Z+5 , r23
    std Z+6 , r24
    std Z+7 , r25
/*  h[7] ^= SHR32(xh,11) ^ SHL32(q[23], 2); */
    ldi r20, 4              /* t = xh >> 11 (four more bits) */
    rcall tshiftr
    ldd r22, Y+32
    ldd r23, Y+33
    ldd r24, Y+34
    ldd r25, Y+35
    ldi r20, 2
    rcall shiftl32
    eor r22, t0
    eor r23, t1
    eor r24, t2
    eor r25, t3
    ldd r0, Z+28
    eor r22, r0
    ldd r0, Z+29
    eor r23, r0
    ldd r0, Z+30
    eor r24, r0
    ldd r0, Z+31
    eor r25, r0
    std Z+28, r22
    std Z+29, r23
    std Z+30, r24
    std Z+31, r25
/*  for(i=0; i<8; ++i){
 *      h[i] += xl ^ q[24+i] ^ q[i];
 *  }
 */
    movw r26, q0            /* X = &q[0] */
    movw r28, q0
    adiw r28, 63
    adiw r28, 24*4-63       /* Y = &q[24] (96 bytes, added in two steps) */
    ldi r18, 8
10:
    movw t0, xl0
    movw t2, xl2
    ld r0, X+
    eor t0, r0
    ld r0, X+
    eor t1, r0
    ld r0, X+
    eor t2, r0
    ld r0, X+
    eor t3, r0
    ld r0, Y+
    eor t0, r0
    ld r0, Y+
    eor t1, r0
    ld r0, Y+
    eor t2, r0
    ld r0, Y+
    eor t3, r0
    ldd r22, Z+0
    ldd r23, Z+1
    ldd r24, Z+2
    ldd r25, Z+3
    add r22, t0
    adc r23, t1
    adc r24, t2
    adc r25, t3
    st Z+, r22
    st Z+, r23
    st Z+, r24
    st Z+, r25
    dec r18
    brne 10b
    ; Z points to h[8]
/*  for(i=0; i<8; ++i){
        h[8+i] ^= xh ^ q[24+i];
        h[8+i] += ROTL32(h[(4+i)%8],i+9);
    }
*/
    ; Z points at h[8]
    ; r18 is zero here (it was counted down to zero by the loop above)
;   clr r18
    sbiw r28, 8*4 ; Y points at q[24]
    movw r26, r30
    sbiw r26, 4*4 ; X points at h[4]
15:
    ldd t0, Z+0
    ldd t1, Z+1
    ldd t2, Z+2
    ldd t3, Z+3
    eor t0, xh0
    eor t1, xh1
    eor t2, xh2
    eor t3, xh3
    ld r0, Y+
    eor t0, r0
    ld r0, Y+
    eor t1, r0
    ld r0, Y+
    eor t2, r0
    ld r0, Y+
    eor t3, r0
    ld r22, X+
    ld r23, X+
    ld r24, X+
    ld r25, X+
    mov r20, r18            /* rotl32p9 rotates by r20+9 = i+9 */
    rcall rotl32p9
    add t0, r22
    adc t1, r23
    adc t2, r24
    adc t3, r25
    st Z+, t0
    st Z+, t1
    st Z+, t2
    st Z+, t3
    inc r18
    cpi r18, 4
    brne 16f
    movw r26, h0            /* after h[7] continue with h[0] */
16:
    sbrs r18, 3             /* done when i reaches 8 */
    rjmp 15b
    sbiw r30, 4*8 ; adjust Z to point at h[8]
    sbiw r28, 16*4-1
    sbiw r28, 1 ; adjust Y to point at q[16]
    movw r26, r28
    sbiw r26, 7*4 ; adjust X to point at q[9]
    ldi r18, 7*4
20: /* now we do the memxor stuff */
    /* q[9+k] ^= q[16+k] for k = 0..6, done byte by byte */
    ld t0, X
    ld t1, Y+
    eor t0, t1
    st X+, t0
    dec r18
    brne 20b
    ; X points at q[16]
    ; Y points at q[23]
    sbiw r26, 4*8 ; X points at q[8]
    clr t0                  /* t = xl << 8, done with byte moves */
    mov t1, xl0
    mov t2, xl1
    mov t3, xl2
/*  h[ 8] += SHL32(xl, 8) ^ q[23] ^ q[ 8]; */
    ld r22, X+
    ld r23, X+
    ld r24, X+
    ld r25, X+
    ld r0, Y+
    eor r22, r0
    ld r0, Y+
    eor r23, r0
    ld r0, Y+
    eor r24, r0
    ld r0, Y+
    eor r25, r0
    eor r22, t0
    eor r23, t1
    eor r24, t2
    eor r25, t3
    ld r0, Z
    add r0, r22
    st Z+, r0
    ld r0, Z
    adc r0, r23
    st Z+, r0
    ld r0, Z
    adc r0, r24
    st Z+, r0
    ld r0, Z
    adc r0, r25
    st Z+, r0
    movw r28, r26
    ; Z points at h[9]
    ; X points at q[9] but we won't need it anymore
    ; Y points at q[9]
/*  h[11] += SHL32(xl, 4) ^ q[11]; */
    movw t0, xl0
    movw t2, xl2
    ldi r20, 4
    rcall tshiftl
    modify_h_2 2
/*  h[10] += SHL32(xl, 6) ^ q[10]; */
    ldi r20, 2
    rcall tshiftl
    modify_h_2 1
/*  h[15] += SHR32(xl, 2) ^ q[15]; */
    movw t0, xl0
    movw t2, xl2
    ldi r20, 2
    rcall tshiftr
    modify_h_2 6
/*  h[12] += SHR32(xl, 3) ^ q[12]; */
    ldi r20, 1
    rcall tshiftr
    modify_h_2 3
/*  h[13] += SHR32(xl, 4) ^ q[13]; */
    ldi r20, 1
    rcall tshiftr
    modify_h_2 4
/*  h[ 9] += SHR32(xl, 6) ^ q[ 9]; */
    ldi r20, 2
    rcall tshiftr
    modify_h_2 0
/*  h[14] += SHR32(xl, 7) ^ q[14]; */
    ldi r20, 1
    rcall tshiftr
    modify_h_2 5
bmw_small_f2_exit:
;   pop_range 2, 17
;   pop_range 28, 29
    ret

#if DEBUG_FUNCTIONS

/*
 * cli_putb: print the byte in r24 as two hex digits (upper nibble first)
 * by calling cli_putc once per digit. r2, r18-r26, r30 and r31 are saved
 * and restored around the calls. cli_putc is defined outside this file.
 */
cli_putb:
    push r2
    push_range 18, 26
    push_range 30, 31
    mov r2, r24
    swap r24
    andi r24, 0xf
    ldi r30, lo8(hextable)
    ldi r31, hi8(hextable)
    add r30, r24
    adc r31, r1
    lpm r24, Z
    clr r25
    call cli_putc
    mov r24, r2
    andi r24, 0xf
    ldi r30, lo8(hextable)
    ldi r31, hi8(hextable)
    add r30, r24
    adc r31, r1
    lpm r24, Z
    clr r25
    call cli_putc
    pop_range 30, 31
    pop_range 18, 26
    pop r2
    ret
hextable:
    .byte '0', '1', '2', '3', '4', '5', '6', '7'
    .byte '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'

/* cli_putchar: call cli_putc with r18-r31 saved and restored around it */
cli_putchar:
    push_range 18, 31
    call cli_putc
    pop_range 18, 31
    ret

#endif

/*******************************************************************************
 * void bmw_small_nextBlock(bmw_small_ctx_t *ctx, const void *block){
 *  uint32_t q[32];
 *  dump_x(block, 16, 'M');
 *  bmw_small_f0(ctx->h, block, q);
 *  dump_x(q, 16, 'Q');
 *  bmw_small_f1(q, block, ctx->h);
 *  dump_x(q, 32, 'Q');
 *  bmw_small_f2(ctx->h, q, block);
 *  ctx->counter += 1;
 *  ctx_dump(ctx);
 * }
 *
 * param ctx:   r24:r25
 * param block: r22:r23
 *
 * Notes on the assembly:
 *  - the 32-bit counter at ctx+64 is incremented before f0/f1/f2 run
 *    (f0, f1 and f2 do not access it);
 *  - r2-r17 and r28/r29 are saved here on behalf of f0/f1/f2, which modify
 *    them without saving;
 *  - the three pointers are passed from call to call on the stack because
 *    f0/f1/f2 also destroy the registers they were kept in;
 *  - push_, pop_, push_range, pop_range, stack_alloc_large and
 *    stack_free_large3 are macros that are not defined in this file
 *    (presumably they come from avr-asm-macros.S).
 */
h0 = 2
h1 = 3
b0 = 4
b1 = 5
q0 = 6
q1 = 7
.global bmw_small_nextBlock
.global bmw224_nextBlock
.global bmw256_nextBlock
bmw_small_nextBlock:
bmw224_nextBlock:
bmw256_nextBlock:
    push_range 28, 29
    push_range 2, 17
    stack_alloc_large 32*4, 30, 31
    adiw r30, 1             /* presumably Z = first byte of the new q[32] area - TODO confirm against the macro */
    movw q0, r30
    movw h0, r24
    movw b0, r22
    /* increment counter */
    movw r30, r24
    adiw r30, 60
    ldd r22, Z+4            /* counter is at ctx+64 */
    ldd r23, Z+5
    ldd r24, Z+6
    ldd r25, Z+7
    ldi r21, 1
    add r22, r21
    adc r23, r1
    adc r24, r1
    adc r25, r1
    std Z+4, r22
    std Z+5, r23
    std Z+6, r24
    std Z+7, r25
    /* call bmw_small_f0(ctx->h, block, q) */
    movw r24, h0
    movw r22, b0
    movw r20, q0
    push_ q1, q0, b1, b0, h1, h0
    rcall bmw_small_f0
    /*  call bmw_small_f1(q, block, ctx->h) */
    /* presumably pops in the listed order, giving r21:r20 = h,
       r23:r22 = block, r25:r24 = q - TODO confirm against the macro */
    pop_ 20, 21, 22, 23, 24, 25,
    push_ 21, 20, 25, 24, 23, 22
    rcall bmw_small_f1
    /*  call bmw_small_f2(ctx->h, q, block) */
    /* presumably r21:r20 = block, r23:r22 = q, r25:r24 = h */
    pop_ 20, 21, 22, 23, 24, 25,
    rcall bmw_small_f2
    stack_free_large3 32*4
    pop_range 2, 17
    pop_range 28, 29
    ret

/*******************************************************************************
 * void bmw224_init(bmw224_ctx_t *ctx){
 *  uint8_t i;
 *  ctx->h[0] = 0x00010203;
 *  for(i=1; i<16; ++i){
 *      ctx->h[i] = ctx->h[i-1]+ 0x04040404;
 *  }
 *  ctx->counter=0;
 * }
 *
 * param ctx:  r24:r25
 *
 * bmw_small_init is the part shared with bmw256_init: it expects X = ctx and
 * the value for h[0] in r25:r22. Modifies r18, r20, r22-r27.
 */
.global bmw224_init
bmw224_init:
    movw r26, r24
    ldi r22, 0x03           /* h[0] = 0x00010203 */
    ldi r23, 0x02
    ldi r24, 0x01
    ldi r25, 0x00
bmw_small_init:
    st X+, r22
    st X+, r23
    st X+, r24
    st X+, r25
    ldi r18, 16-1
    ldi r20, 0x04
1:
    add r22, r20            /* h[i] = h[i-1] + 0x04040404 */
    adc r23, r20
    adc r24, r20
    adc r25, r20
    st X+, r22
    st X+, r23
    st X+, r24
    st X+, r25
    dec r18
    brne 1b
    st X+, r1               /* counter = 0 */
    st X+, r1
    st X+, r1
    st X+, r1
    ret

/*
 * void bmw256_init(bmw256_ctx_t *ctx)
 * param ctx:  r24:r25
 * Same as bmw224_init, but with h[0] = 0x40414243.
 */
.global bmw256_init
bmw256_init:
    movw r26, r24
    ldi r22, 0x43
    ldi r23, 0x42
    ldi r24, 0x41
    ldi r25, 0x40
    rjmp bmw_small_init

/*******************************************************************************
 * void bmw_small_lastBlock(bmw_small_ctx_t *ctx, const void *block, uint16_t length_b){
 *  struct {
 *      uint8_t  buffer[64];
 *      uint32_t ctr;
 *  } pctx;
 *  while(length_b >= BMW_SMALL_BLOCKSIZE){
 *      bmw_small_nextBlock(ctx, block);
 *      length_b -= BMW_SMALL_BLOCKSIZE;
 *      block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
 *  }
 *  memset(pctx.buffer, 0, 64);
 *  memcpy(pctx.buffer, block, (length_b+7)/8);
 *  pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
 *  if(length_b+1>64*8-64){
 *      bmw_small_nextBlock(ctx, pctx.buffer);
 *      memset(pctx.buffer, 0, 64-8);
 *      ctx->counter -= 1;
 *  }
 *  *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
 *  bmw_small_nextBlock(ctx, pctx.buffer);
 *  uint8_t i;
 *  memset(pctx.buffer, 0xaa, 64);
 *  for(i=0; i<16;++i){
 *      pctx.buffer[i*4] = i+0xa0;
 *  }
 *  bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
 *  memcpy(ctx->h, pctx.buffer, 64);
 * }
 *
 * param ctx:      r24:r25
 * param block:    r22:r23
 * param length_b: r20:r21
 */
ctx0 = 2
ctx1 = 3
blc0 = 4
blc1 = 5
len0 = 28
len1 = 29
buf0 = 6 /* buf1:buf0 = address of the 68 byte padding context on the stack */
buf1 = 7

/*
 * bmw_small_lastBlock (also exported as bmw224_lastBlock / bmw256_lastBlock)
 * Processes all complete blocks, then pads and processes the remaining bits
 * and runs the final compression with the constant chaining value.
 * Arguments: ctx in r24:r25, block in r22:r23, length_b (bits) in r20:r21.
 * r2-r7 and r28/r29 are saved and restored. r1 is expected to be zero.
 * stack_alloc_large / stack_free_large are macros that are not defined in
 * this file (presumably they come from avr-asm-macros.S).
 */
.global bmw_small_lastBlock
.global bmw224_lastBlock
.global bmw256_lastBlock
bmw_small_lastBlock:
bmw224_lastBlock:
bmw256_lastBlock:
/*  while(length_b >= BMW_SMALL_BLOCKSIZE){
        bmw_small_nextBlock(ctx, block);
        length_b -= BMW_SMALL_BLOCKSIZE;
        block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
    }
*/
    push_range 2, 7
    push_range 28, 29
    movw ctx0, r24
    movw blc0, r22
    movw len0, r20
1:
    cpi len1, hi8(512)      /* only the high byte matters for the 512 bit steps */
    brlo 2f
    movw r24, ctx0
    movw r22, blc0
    rcall bmw_small_nextBlock
    ldi r24, 64
    add blc0, r24
    adc blc1, r1
    subi len1, hi8(512)
    rjmp 1b
2:
/*  struct {
        uint8_t  buffer[64];
        uint32_t ctr;
    } pctx;
*/
    stack_alloc_large 68
    adiw r30, 1             /* presumably Z = first byte of the new area - TODO confirm against the macro */
    movw buf0, r30
/*  memset(pctx.buffer, 0, 64);
    memcpy(pctx.buffer, block, (length_b+7)/8);
    pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
*/
    /* done in one pass: copy the complete bytes, write the byte with the
       padding bit, then fill the rest of the 64 bytes with zero */
    movw r24, len0
    lsr r25                 /* r24 = length_b >> 3 (length_b < 512 here) */
    ror r24
    lsr r24
    lsr r24
;   inc r24
    ldi r23, 63
    sub r23, r24            /* r23 = number of zero bytes after the padding byte */
    movw r26, blc0
    tst r24
    breq 301f
30: ld r20, X+
    st Z+, r20
    dec r24
    brne 30b
301:
    clr r20
    mov r21, len0
    ldi r24, 0x80
    andi r21, 0x07
    breq 305f               /* no partial byte: the padding byte is 0x80 */
    ld r20, X+              /* partial byte of the message */
303:
    lsr r24                 /* r24 = 0x80 >> (length_b & 7) */
    dec r21
    brne 303b
305:
    or r20, r24
    st Z+, r20
    tst r23
    breq 32f
31: st Z+, r1
    dec r23
    brne 31b
32:
/*  if(length_b+1>64*8-64){ ; = 64*7-1 = 447 max(length_b)=511
        bmw_small_nextBlock(ctx, pctx.buffer);
        memset(pctx.buffer, 0, 64-8);
        ctx->counter -= 1;
    }
*/
    /* length_b >= 448 is tested as: high byte != 0 and low byte >= 192 */
    tst len1
    breq 400f
    cpi len0, 192
    brlo 400f
    movw r24, ctx0
    movw r22, buf0
    rcall bmw_small_nextBlock
    movw r26, buf0
    ldi r20, 64-8
350:
    st X+, r1
    dec r20
    brne 350b
    movw r30, ctx0
    adiw r30, 60
    ldd r21, Z+4            /* counter is at ctx+64 */
    ldd r22, Z+5
    ldd r23, Z+6
    ldd r24, Z+7
    /* NOTE(review): the decremented counter is only kept in r24:r21 for the
       length computation below, it is not written back to ctx (unlike the
       reference code above). No later code in this file reads
       ctx->counter. */
    subi r21, 1
    sbc r22, r1
    sbc r23, r1
    sbc r24, r1
    rjmp 410f
/*  *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
    bmw_small_nextBlock(ctx, pctx.buffer);
*/
400:
    movw r30, ctx0
    adiw r30, 60
    ldd r21, Z+4
    ldd r22, Z+5
    ldd r23, Z+6
    ldd r24, Z+7
410:
    /* counter*512 = (counter*2) << 8: the low byte of the sum is len0,
       the bytes above it are counter*2 + len1 */
    clr r25
    lsl r21
    rol r22
    rol r23
    rol r24
    rol r25
    mov r20, len0
    add r21, len1
    adc r22, r1
    adc r23, r1
    adc r24, r1
    adc r25, r1
    movw r30, buf0
    adiw r30, 64-8
    st Z+, r20
    st Z+, r21
    st Z+, r22
    st Z+, r23
    st Z+, r24
    st Z+, r25
    st Z+, r1
    st Z+, r1
    movw r24, ctx0
    movw r22, buf0
    rcall bmw_small_nextBlock
/*  memset(pctx.buffer, 0xaa, 64);
    for(i=0; i<16;++i){
        pctx.buffer[i*4] = i+0xa0;
    }
*/
    ldi r18, 0xa0
    ldi r19, 0xaa
    movw r26, buf0
500:
    st X+, r18
    st X+, r19
    st X+, r19
    st X+, r19
    inc r18
    sbrs r18, 4             /* bit 4 gets set when r18 reaches 0xb0 (16 words done) */
    rjmp 500b
/*  bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
    memcpy(ctx->h, pctx.buffer, 64);
*/
    movw r24, buf0
    movw r22, ctx0
    rcall bmw_small_nextBlock
    ldi r18, 64
    movw r26, ctx0
    movw r30, buf0
600:
    ld r20, Z+
    st X+, r20
    dec r18
    brne 600b
    stack_free_large 68
    pop_range 28, 29
    pop_range 2, 7
    ret

/*******************************************************************************
 * void bmw224_ctx2hash(void *dest, const bmw224_ctx_t *ctx){
 *  memcpy(dest, &(ctx->h[9]), 224/8);
 * }
 *
 * param dest:  r24:r25
 * param ctx:   r22:r23
 *
 * The copy loop itself is shared with bmw256_ctx2hash (label 1: below).
 */
.global bmw224_ctx2hash
bmw224_ctx2hash:
    movw r26, r24
    movw r30, r22
    adiw r30, 9*4
    ldi r22, 28
    rjmp 1f                 /* continue in the copy loop of bmw256_ctx2hash */

/*******************************************************************************
 * void bmw256_ctx2hash(void *dest, const bmw256_ctx_t *ctx){
 *  memcpy(dest, &(ctx->h[8]), 256/8);
 * }
 *
 * param dest:  r24:r25
 * param ctx:   r22:r23
 */
.global bmw256_ctx2hash
bmw256_ctx2hash:
    movw r26, r24
    movw r30, r22
    adiw r30, 8*4
    ldi r22, 32
1:
    /* copy r22 bytes from Z to X */
    ld r23, Z+
    st X+, r23
    dec r22
    brne 1b
    ret

/*******************************************************************************
 * void bmw256(void *dest, const void *msg, uint32_t length_b){
 *  bmw_small_ctx_t ctx;
 *  bmw256_init(&ctx);
 *  while(length_b>=BMW_SMALL_BLOCKSIZE){
 *      bmw_small_nextBlock(&ctx, msg);
 *      length_b -= BMW_SMALL_BLOCKSIZE;
 *      msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B;
 *  }
 *  bmw_small_lastBlock(&ctx, msg, length_b);
 *  bmw256_ctx2hash(dest, &ctx);
 * }
 *
 * param dest:     r24:r25
 * param msg:      r22:r23
 * param length_b: r18:r21
 *
 * The work is done by the common code at bmw_small_all; r16 = 1 selects the
 * second entry (the 256 bit variant) of init_lut and c2h_lut there. r16 is
 * pushed here and popped by bmw_small_all.
 */
ctx0 = 2
ctx1 = 3
msg0 = 4
msg1 = 5
len0 = 6
len1 = 7
len2 = 8
len3 = 9
dst0 = 10
dst1 = 11
.global bmw256
bmw256:
    push r16
    ldi r16, 1
    rjmp bmw_small_all
/*******************************************************************************
 * void bmw224(void *dest, const void *msg, uint32_t length_b){
 *  bmw_small_ctx_t ctx;
 *  bmw224_init(&ctx);
 *  while(length_b>=BMW_SMALL_BLOCKSIZE){
 *      bmw_small_nextBlock(&ctx, msg);
 *      length_b -= BMW_SMALL_BLOCKSIZE;
 *      msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B;
 *  }
 *  bmw_small_lastBlock(&ctx, msg, length_b);
 *  bmw224_ctx2hash(dest, &ctx);
 * }
 *
 * param dest:     r24:r25
 * param msg:      r22:r23
 * param length_b: r18:r21
 *
 * bmw_small_all is the code shared by bmw224 and bmw256. It is entered with
 * r16 already pushed and holding the variant (0 = 224 bit, 1 = 256 bit);
 * r16 is the word index into init_lut and c2h_lut.
 *
 * Difference to the reference code above: the loop here only runs while the
 * upper 16 bits of length_b are non-zero. The remaining length (below 65536
 * bits) is passed to bmw_small_lastBlock, which takes a 16 bit length and
 * processes the remaining complete blocks itself.
 *
 * r2-r11 are saved and restored. r1 is expected to be zero.
 * stack_alloc_large / stack_free_large are macros that are not defined in
 * this file (presumably they come from avr-asm-macros.S).
 */
ctx0 = 2
ctx1 = 3
msg0 = 4
msg1 = 5
len0 = 6
len1 = 7
len2 = 8
len3 = 9
dst0 = 10
dst1 = 11
.global bmw224
bmw224:
    push r16
    clr r16
bmw_small_all:
    push_range 2, 11
    stack_alloc_large 64+4
    adiw r30, 1             /* presumably Z = first byte of the new ctx area - TODO confirm against the macro */
    movw ctx0, r30
    movw dst0, r24
    movw msg0, r22
    movw len0, r18
    movw len2, r20
    /* init(&ctx), variant selected through init_lut */
    movw r24, ctx0
    ldi r30, pm_lo8(init_lut)
    ldi r31, pm_hi8(init_lut)
    add r30, r16
    adc r31, r1
    icall
20:
    mov r18, len2
    or r18, len3
    breq 50f                /* less than 65536 bits left */
    movw r24, ctx0
    movw r22, msg0
    rcall bmw_small_nextBlock
    ldi r20, 2
    sub len1, r20           /* length_b -= 512 (0x200) */
    sbc len2, r1
    sbc len3, r1
    ldi r20, 64
    add msg0, r20           /* msg += 64 bytes */
    adc msg1, r1
    rjmp 20b
50:
    movw r24, ctx0
    movw r22, msg0
    movw r20, len0
    rcall bmw_small_lastBlock
    /* ctx2hash(dest, &ctx), variant selected through c2h_lut */
    movw r24, dst0
    movw r22, ctx0
    ldi r30, pm_lo8(c2h_lut)
    ldi r31, pm_hi8(c2h_lut)
    add r30, r16
    adc r31, r1
    icall
    stack_free_large 64+4
    pop_range 2, 11
    pop r16                 /* pushed by bmw224 / bmw256 */
    ret

/* one single-word entry per variant, index 0 = 224 bit, index 1 = 256 bit */
init_lut:
    rjmp bmw224_init
    rjmp bmw256_init
c2h_lut:
    rjmp bmw224_ctx2hash
    rjmp bmw256_ctx2hash