From: bg Date: Sat, 12 Dec 2009 01:12:49 +0000 (+0000) Subject: first impression of BMW in assembler X-Git-Url: https://git.cryptolib.org/?p=avr-crypto-lib.git;a=commitdiff_plain;h=3d99e4ba447ef04801609c5459b7c0c332ae332f first impression of BMW in assembler --- diff --git a/bmw/bmw_small-asm.S b/bmw/bmw_small-asm.S new file mode 100644 index 0000000..62bd166 --- /dev/null +++ b/bmw/bmw_small-asm.S @@ -0,0 +1,1697 @@ +/* bmw_small-asm.S */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2009 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ + +/* + * File: bmw_small-asm.S + * Author: Daniel Otte + * Date: 2009-11-13 + * License: GPLv3 or later + * Description: implementation of BlueMidnightWish + * + */ + +#include "avr-asm-macros.S" + +shiftcodetable: /* rotation codes indexed by rotate-left amount (read with lpm): bit 4 = rotate left by 8 via byte moves, bit 5 = rotate by 16 via word swap, bit 3 = direction of the remaining rotate (set: right, clear: left), bits 2..0 = remaining bit count */ + .byte 0x00 ; 0 + .byte 0x01 ; 1 + .byte 0x02 ; 2 + .byte 0x03 ; 3 + .byte 0x04 ; 4 + .byte 0x1B ; 5 + .byte 0x1A ; 6 + .byte 0x19 ; 7 + .byte 0x10 ; 8 +shiftcodetable_9: /* entry point used by rotl32p9: index 0 here is the code for a rotate by 9 */ + .byte 0x11 ; 9 + .byte 0x12 ; 10 + .byte 0x13 ; 11 + .byte 0x2C ; 12 + .byte 0x2B ; 13 + .byte 0x2A ; 14 + .byte 0x29 ; 15 + .byte 0x20 ; 16 + .byte 0x21 ; 17 unused but necessary for padding + + + +/******************************************************************************* + * shiftl32: logical left shift of a 32-bit value, one bit per loop pass + * value: r25:r22 (in/out) + * shift: r20 (bit count, counted down to 0; must be non-zero, 0 would run the loop 256 times) + */ +shiftl32: +1: +; clc + lsl r22 + rol r23 + rol r24 + rol r25 + dec r20 + brne 1b + ret + +/******************************************************************************* + * shiftr32: logical right shift of a 32-bit value, one bit per loop pass + * value: r25:r22 (in/out) + * shift: r20 (bit count, counted down to 0; must be non-zero, 0 would run the loop 256 times) + */ +shiftr32: +1: +; clc + lsr r25 + ror r24 + ror r23 + ror r22 + dec r20 + brne 1b + ret + +/******************************************************************************* + * rotl32: rotate a 32-bit value left; clobbers r21, which holds a copy of the top byte and feeds the wrap-around bits + * value: r25:r22 (in/out) + * shift: r20 (1..8, r21 only supplies eight wrap-around bits) + */ +rotl32: + mov r21, r25 +1: + lsl r21 + rol r22 + rol r23 + rol r24 + rol r25 + dec r20 + brne 1b + ret + +/******************************************************************************* + * rotr32: rotate a 32-bit value right; clobbers r21, which holds a copy of the low byte and feeds the wrap-around bits + * value: r25:r22 (in/out) + * shift: r20 (1..8, r21 only supplies eight wrap-around bits) + */ +rotr32: + mov r21, r22 +1: + lsr r21 + ror r25 + ror r24 + ror r23 + ror r22 + dec r20 + brne 1b +some_ret: + ret + +/******************************************************************************* + * rotl32p9: rotate left by (r20 + 9) bits, decoding the code byte fetched from shiftcodetable_9 + * value: r25:r22 (in/out) + * shift: r20 (table index; overwritten with the fetched code byte) + */ +rotl32p9: + push_range 30, 31 + ldi r30, lo8(shiftcodetable_9) + ldi r31, hi8(shiftcodetable_9) + add r30, r20 + adc r31, r1 + lpm r20, Z + pop_range 30, 31 + sbrs r20, 4 + rjmp 2f + mov r0, r25 + mov r25, r24 + mov r24, r23 + mov r23, r22 + mov r22, r0 +2: sbrs r20, 5 + rjmp 3f + movw r0, r24 + movw r24, r22 + movw r22, r0 + clr r1 
+3: bst r20, 3 + andi r20, 0x07 + breq some_ret + brts rotr32 + rjmp rotl32 + + +/******************************************************************************* +* uint32_t rotl_addel(uint32_t x, uint8_t v){ +* uint32_t r; +* r = ROTL32(x, (v&0xf)+1); +* return r; +* } +* param x: r25:r22 +* param v: r20 +*/ +.global rotl_addel +rotl_addel: + andi r20, 0x0f + inc r20 + ldi r30, lo8(shiftcodetable) + ldi r31, hi8(shiftcodetable) + add r30, r20 + adc r31, r1 + lpm r20, Z + sbrs r20, 4 + rjmp 1f + mov r21, r25 + mov r25, r24 + mov r24, r23 + mov r23, r22 + mov r22, r21 +1: sbrs r20, 5 + rjmp 2f + movw r30, r24 + movw r24, r22 + movw r22, r30 +2: bst r20, 3 + andi r20, 0x07 + brne 3f + ret +3: + brts rotr32; 4f + rjmp rotl32 +;4: rjmp rotr32 + +/******************************************************************************/ + +preg0 = 22 /* preg for processing register */ +preg1 = 23 +preg2 = 24 +preg3 = 25 +breg0 = 26 /* breg for backup register */ +breg1 = 27 +breg2 = 18 +breg3 = 19 +areg0 = 0 /* areg for accumulator register */ +areg1 = 1 +areg2 = 30 +areg3 = 31 + +/******************************************************************************* +* uint32_t bmw_small_s0(uint32_t x){ +* uint32_t r; +* r = SHR32(x, 1) +* ^ SHL32(x, 3) +* ^ ROTL32(x, 4) +* ^ ROTR32(x, 13); +* return r; +* } +*/ +.global bmw_small_s0 +bmw_small_s0: + movw breg0, preg0 + movw breg2, preg2 + ldi r20, 1 + rcall shiftr32 + movw areg2, preg2 + movw areg0, preg0 + movw preg2, breg2 + movw preg0, breg0 + ldi r20, 3 + rcall shiftl32 + eor areg0, preg0 + eor areg1, preg1 + eor areg2, preg2 + eor areg3, preg3 + movw preg2, breg2 + movw preg0, breg0 + ldi r20, 4 + rcall rotl32 + eor areg0, preg0 + eor areg1, preg1 + eor areg2, preg2 + eor areg3, preg3 + /* now the trick, we simply can rotate the old value to the right by 17 */ + movw breg0, preg0 /* first rotate by 16 */ + movw preg0, preg2 + movw preg2, breg0 +outro_1: + ldi r20, 1 + rcall rotr32 +outro_2: + eor preg0, areg0 + eor preg1, areg1 + 
eor preg2, areg2 + eor preg3, areg3 + clr r1 + ret + +/******************************************************************************* +* uint32_t bmw_small_s1(uint32_t x){ +* uint32_t r; +* r = SHR32(x, 1) +* ^ SHL32(x, 2) +* ^ ROTL32(x, 8) +* ^ ROTR32(x, 9); +* return r; +* } +*/ +.global bmw_small_s1 +bmw_small_s1: + movw breg0, preg0 + movw breg2, preg2 + ldi r20, 1 + rcall shiftr32 + movw areg2, preg2 + movw areg0, preg0 + movw preg2, breg2 + movw preg0, breg0 + ldi r20, 2 + rcall shiftl32 + eor areg0, preg0 + eor areg1, preg1 + eor areg2, preg2 + eor areg3, preg3 + eor areg0, breg3 + eor areg1, breg0 + eor areg2, breg1 + eor areg3, breg2 + mov preg0, breg1 + mov preg1, breg2 + mov preg2, breg3 + mov preg3, breg0 + rjmp outro_1 + +/******************************************************************************* +* uint32_t bmw_small_s2(uint32_t x){ +* uint32_t r; +* r = SHR32(x, 2) +* ^ SHL32(x, 1) +* ^ ROTL32(x, 12) +* ^ ROTR32(x, 7); +* return r; +* } +*/ +.global bmw_small_s2 +bmw_small_s2: + movw breg0, preg0 + movw breg2, preg2 + ldi r20, 2 + rcall shiftr32 + movw areg2, preg2 + movw areg0, preg0 + movw preg2, breg2 + movw preg0, breg0 + ldi r20, 1 + rcall shiftl32 + eor areg0, preg0 + eor areg1, preg1 + eor areg2, preg2 + eor areg3, preg3 + movw preg0, breg2 + movw preg2, breg0 + ldi r20, 4 + rcall rotr32 + eor areg0, preg0 + eor areg1, preg1 + eor areg2, preg2 + eor areg3, preg3 + mov preg0, breg1 + mov preg1, breg2 + mov preg2, breg3 + mov preg3, breg0 + ldi r20, 1 + rcall rotl32 + rjmp outro_2 + +/******************************************************************************* +* uint32_t bmw_small_s3(uint32_t x){ +* uint32_t r; +* r = SHR32(x, 2) +* ^ SHL32(x, 2) +* ^ ROTL32(x, 15) +* ^ ROTR32(x, 3); +* return r; +* } +*/ +.global bmw_small_s3 +bmw_small_s3: + movw breg0, preg0 + movw breg2, preg2 + ldi r20, 2 + rcall shiftr32 + movw areg2, preg2 + movw areg0, preg0 + movw preg2, breg2 + movw preg0, breg0 + ldi r20, 2 + rcall shiftl32 + eor areg0, 
preg0 + eor areg1, preg1 + eor areg2, preg2 + eor areg3, preg3 + movw preg0, breg2 + movw preg2, breg0 + ldi r20, 1 + rcall rotr32 + eor areg0, preg0 + eor areg1, preg1 + eor areg2, preg2 + eor areg3, preg3 + movw preg0, breg0 + movw preg2, breg2 + ldi r20, 3 + rcall rotr32 + rjmp outro_2 + +/******************************************************************************* +* uint32_t bmw_small_s4(uint32_t x){ +* uint32_t r; +* r = SHR32(x, 1) +* ^ x; +* return r; +* } +*/ +.global bmw_small_s4 +bmw_small_s4: + movw areg0, preg0 + movw areg2, preg2 + ldi r20, 1 + rcall shiftr32 + rjmp outro_2 + +/******************************************************************************* +* uint32_t bmw_small_s5(uint32_t x){ +* uint32_t r; +* r = SHR32(x, 2) +* ^ x; +* return r; +* } +*/ +.global bmw_small_s5 +bmw_small_s5: + movw areg0, preg0 + movw areg2, preg2 + ldi r20, 2 + rcall shiftr32 + rjmp outro_2 + +/******************************************************************************* +* uint32_t bmw_small_r1(uint32_t x){ +* uint32_t r; +* r = ROTL32(x, 3); +* return r; +* } +*/ +.global bmw_small_r1 +bmw_small_r1: + ldi r20, 3 + rjmp rotl32 + +/******************************************************************************* +* uint32_t bmw_small_r2(uint32_t x){ +* uint32_t r; +* r = ROTL32(x, 7); +* return r; +* } +*/ +.global bmw_small_r2 +bmw_small_r2: + ldi r20, 7 + rjmp rotl32 + +/******************************************************************************* +* uint32_t bmw_small_r3(uint32_t x){ +* uint32_t r; +* r = ROTL32(x, 13); +* return r; +* } +*/ +.global bmw_small_r3 +bmw_small_r3: + movw r18, r24 + movw r24, r22 + movw r22, r18 + ldi r20, 3 + rjmp rotr32 + + +/******************************************************************************* +* uint32_t bmw_small_r4(uint32_t x){ +* uint32_t r; +* r = ROTL32(x, 16); +* return r; +* } +*/ +.global bmw_small_r4 +bmw_small_r4: + movw r18, r24 + movw r24, r22 + movw r22, r18 + ret + 
+/******************************************************************************* +* uint32_t bmw_small_r5(uint32_t x){ +* uint32_t r; +* r = ROTR32(x, 13); +* return r; +* } +*/ +.global bmw_small_r5 +bmw_small_r5: + movw r18, r24 /* swap the 16-bit halves through r19:r18 = rotate by 16 */ + movw r24, r22 + movw r22, r18 + ldi r20, 3 /* 16 + 3 = ROTL 19, which equals ROTR 13 */ + rjmp rotl32 + +/******************************************************************************* +* uint32_t bmw_small_r6(uint32_t x){ +* uint32_t r; +* r = ROTR32(x, 9); +* return r; +* } +*/ +.global bmw_small_r6 +bmw_small_r6: + mov r18, r22 /* move every byte one position down = ROTR 8 (r18 carries the low byte to the top) */ + mov r22, r23 + mov r23, r24 + mov r24, r25 + mov r25, r18 + ldi r20, 1 /* one more bit to the right gives ROTR 9 */ + rjmp rotr32 + +/******************************************************************************* +* uint32_t bmw_small_r7(uint32_t x){ +* uint32_t r; +* r = ROTR32(x, 5); +* return r; +* } +*/ +.global bmw_small_r7 +bmw_small_r7: + ldi r20, 5 + rjmp rotr32 + +/******************************************************************************/ + +const_lut: /* 16 constants read with lpm by addelement; entry i equals (16 + i) * 0x05555555 */ + .long 0x55555550, 0x5aaaaaa5, 0x5ffffffa, 0x6555554f + .long 0x6aaaaaa4, 0x6ffffff9, 0x7555554e, 0x7aaaaaa3 + .long 0x7ffffff8, 0x8555554d, 0x8aaaaaa2, 0x8ffffff7 + .long 0x9555554c, 0x9aaaaaa1, 0x9ffffff6, 0xa555554b + +/******************************************************************************* +* uint32_t addelement(uint8_t j, const uint32_t* m, const uint32_t* h){ +* uint32_t r; +* r = pgm_read_dword(k_lut+j); +* r += rotl_addel(((uint32_t*)m)[j&0xf], j+0); +* r += rotl_addel(((uint32_t*)m)[(j+3)&0xf], j+3); +* r -= rotl_addel(((uint32_t*)m)[(j+10)&0xf], j+10); +* r ^= ((uint32_t*)h)[(j+7)&0xf]; +* return r; +* } +* param j: r24 +* param m: r22:r23 +* param h: r20:r21 +*/ +j = 16 +acc2 = 8 +acc3 = 9 +h0 = 10 +h1 = 11 +m0 = 12 +m1 = 13 +acc0 = 14 +acc1 = 15 +.global addelement +addelement: + push_range 8, 16 + mov j, r24 + movw h0, r20 + movw m0, r22 + mov r25, r24 /* r25 = 4*j: byte offset of the 32-bit entry in const_lut */ + lsl r25 + lsl r25 + ldi r30, lo8(const_lut) + ldi r31, hi8(const_lut) + add r30, r25 + adc r31, r1 + lpm acc0, Z+ + lpm acc1, Z+ + lpm acc2, Z+ + lpm acc3, Z+ 
+ + mov r20, j + andi r20, 0x0f + lsl r20 + lsl r20 + movw r26, m0 + add r26, r20 + adc r27, r1 + ld r22, X+ + ld r23, X+ + ld r24, X+ + ld r25, X+ + mov r20, j + rcall rotl_addel + add acc0, r22 + adc acc1, r23 + adc acc2, r24 + adc acc3, r25 + + subi j, -3 + mov r20, j + andi r20, 0x0f + lsl r20 + lsl r20 + movw r26, m0 + add r26, r20 + adc r27, r1 + ld r22, X+ + ld r23, X+ + ld r24, X+ + ld r25, X+ + mov r20, j + rcall rotl_addel + add acc0, r22 + adc acc1, r23 + adc acc2, r24 + adc acc3, r25 + + subi j, -7 + mov r20, j + andi r20, 0x0f + lsl r20 + lsl r20 + movw r26, m0 + add r26, r20 + adc r27, r1 + ld r22, X+ + ld r23, X+ + ld r24, X+ + ld r25, X+ + mov r20, j + rcall rotl_addel + sub acc0, r22 + sbc acc1, r23 + sbc acc2, r24 + sbc acc3, r25 + + subi j, 3 + mov r20, j + andi r20, 0x0f + lsl r20 + lsl r20 + movw r26, h0 + add r26, r20 + adc r27, r1 + ld r22, X+ + ld r23, X+ + ld r24, X+ + ld r25, X+ + eor r22, acc0 + eor r23, acc1 + eor r24, acc2 + eor r25, acc3 + pop_range 8, 16 + ret + +/******************************************************************************* +* uint32_t bmw_small_expand1(uint8_t j, const void* m, const void* h, const uint32_t* q){ +* uint32_t(*s[])(uint32_t) = {bmw_small_s1, bmw_small_s2, bmw_small_s3, bmw_small_s0}; +* uint32_t r; +* uint8_t i; +* r = addelement(j, m, h); +* i=15; +* do{ +* r += s[i%4](q[j+i]); +* }while(i--!=0); +* return r; +* +* param j: r24 +* param m: r22:r23 +* param h: r20:r21 +* param q: r18:r19 +*/ +acc0 = 2 +acc1 = 3 +acc2 = 4 +acc3 = 5 +.global bmw_small_expand1 +bmw_small_expand1: + push_range 28, 29 + movw r28, r18 + mov r18, r24 + lsl r18 + lsl r18 + add r28, r18 + adc r29, r1 + rcall addelement + push_range 2, 5 + push r16 + ldi r16, 4 + movw acc0, r22 + movw acc2, r24 +1: + ld r22, Y+ + ld r23, Y+ + ld r24, Y+ + ld r25, Y+ + rcall bmw_small_s1 + add acc0, r22 + adc acc1, r23 + adc acc2, r24 + adc acc3, r25 + ld r22, Y+ + ld r23, Y+ + ld r24, Y+ + ld r25, Y+ + rcall bmw_small_s2 + add acc0, r22 + adc 
acc1, r23 + adc acc2, r24 + adc acc3, r25 + ld r22, Y+ + ld r23, Y+ + ld r24, Y+ + ld r25, Y+ + rcall bmw_small_s3 + add acc0, r22 + adc acc1, r23 + adc acc2, r24 + adc acc3, r25 + ld r22, Y+ + ld r23, Y+ + ld r24, Y+ + ld r25, Y+ + rcall bmw_small_s0 + add acc0, r22 + adc acc1, r23 + adc acc2, r24 + adc acc3, r25 + dec r16 + brne 1b +expand1_exit: + movw r22, acc0 + movw r24, acc2 + pop r16 + pop_range 2, 5 + pop_range 28, 29 + ret + +/******************************************************************************* +* uint32_t bmw_small_expand2(uint8_t j, const void* m, const void* h, const uint32_t* q){ +* uint32_t(*rf[])(uint32_t) = {bmw_small_r1, bmw_small_r2, bmw_small_r3, +* bmw_small_r4, bmw_small_r5, bmw_small_r6, +* bmw_small_r7}; +* uint32_t r; +* uint8_t i; +* r = addelement(j, m, h); +* for(i=0; i<14; i+=2){ +* r += q[j+i]; +* } +* for(i=0; i<14; i+=2){ +* r += rf[i/2](q[j+i+1]); +* } +* r += bmw_small_s4(q[j+14]); +* r += bmw_small_s5(q[j+15]); +* return r; +* } +*/ +expand2_jumptable: + ret + rjmp bmw_small_r1 + ret + rjmp bmw_small_r2 + ret + rjmp bmw_small_r3 + ret + rjmp bmw_small_r4 + ret + rjmp bmw_small_r5 + ret + rjmp bmw_small_r6 + ret + rjmp bmw_small_r7 + rjmp bmw_small_s4 + rjmp bmw_small_s5 + +.global bmw_small_expand2 +bmw_small_expand2: + push_range 28, 29 + movw r28, r18 + mov r18, r24 + lsl r18 + lsl r18 + add r28, r18 + adc r29, r1 + rcall addelement + push_range 2, 5 + push r16 + ldi r16, 16 + movw acc0, r22 + movw acc2, r24 + ldi r30, pm_lo8(expand2_jumptable) + ldi r31, pm_hi8(expand2_jumptable) +1: + ld r22, Y+ + ld r23, Y+ + ld r24, Y+ + ld r25, Y+ + push r30 + push r31 + icall + pop r31 + pop r30 + adiw r30, 1 + add acc0, r22 + adc acc1, r23 + adc acc2, r24 + adc acc3, r25 + dec r16 + brne 1b + rjmp expand1_exit + +/******************************************************************************* +* void bmw_small_f1(uint32_t* q, const void* m, const void* h){ +* uint8_t i; +* q[16] = bmw_small_expand1(0, m, h, q); +* q[17] = 
bmw_small_expand1(1, m, h, q); +* for(i=2; i<16; ++i){ +* q[16+i] = bmw_small_expand2(i, m, h, q); +* } +* } +*/ +m0 = 2 +m1 = 3 +h0 = 4 +h1 = 5 +q0 = 6 +q1 = 7 +.global bmw_small_f1 +bmw_small_f1: + push_range 2, 7 + push_range 28, 29 + push r16 + movw q0, r24 + movw m0, r22 + movw h0, r20 + movw r28, q0 + adiw r28, 63 + adiw r28, 1 + clr r24 + clr r25 /* not required */ + movw r18, q0 + rcall bmw_small_expand1 + st Y+, r22 + st Y+, r23 + st Y+, r24 + st Y+, r25 + ldi r16, 1 + mov r24, r16 + clr r25 /* not required */ + movw r22, m0 + movw r20, h0 + movw r18, q0 + rcall bmw_small_expand1 + st Y+, r22 + st Y+, r23 + st Y+, r24 + st Y+, r25 + inc r16 +1: + mov r24, r16 + movw r22, m0 + movw r20, h0 + movw r18, q0 + rcall bmw_small_expand2 + st Y+, r22 + st Y+, r23 + st Y+, r24 + st Y+, r25 + inc r16 + cpi r16, 16 + brne 1b + pop r16 + pop_range 28, 29 + pop_range 2, 7 + ret + +/******************************************************************************* +* uint16_t hack_table[5] PROGMEM = { 0x0311, 0xDDB3, 0x2A79, 0x07AA, 0x51C2 }; +* uint8_t offset_table[5] PROGMEM = { 4+16, 6+16, 9+16, 12+16, 13+16 }; +* +* void bmw_small_f0(uint32_t* h, const void* m, uint32_t* q){ +* uint16_t hack_reg; +* uint8_t c,i,j; +* uint32_t(*s[])(uint32_t)={ bmw_small_s0, bmw_small_s1, bmw_small_s2, +* bmw_small_s3, bmw_small_s4 }; +* for(i=0; i<16; ++i){ +* ((uint32_t*)h)[i] ^= ((uint32_t*)m)[i]; +* } +* dump_x(h, 16, 'T'); +* memset(q, 0, 4*16); +* c=4; +* do{ +* i=15; +* j=pgm_read_byte(offset_table+c); +* hack_reg=pgm_read_word(&(hack_table[c])); +* do{ +* if(hack_reg&1){ +* q[i]-= h[j&15]; +* }else{ +* q[i]+= h[j&15]; +* } +* --j; +* hack_reg>>= 1; +* }while(i--!=0); +* }while(c--!=0); +* dump_x(q, 16, 'W'); +* for(i=0; i<16; ++i){ +* q[i] = s[i%5](q[i]); +* } +* for(i=0; i<16; ++i){ +* ((uint32_t*)h)[i] ^= ((uint32_t*)m)[i]; +* } +* for(i=0; i<16; ++i){ +* q[i] += h[(i+1)&0xf]; +* } +* } +* +* param h: r24:r25 +* param m: r22:r23 +* param q: r20:r21 +*/ +h0 = 24 +h1 = 25 +m0 = 
22 +m1 = 23 +q0 = 20 +q1 = 21 +acc0 = 4 +acc1 = 5 +acc2 = 6 +acc3 = 7 +bcc0 = 8 +bcc1 = 9 +bcc2 = 10 +bcc3 = 11 +hack = 16 + +f0_helper: +20: + ldd acc0, Z+0 + ldd acc1, Z+1 + ldd acc2, Z+2 + ldd acc3, Z+3 + ld bcc0, X+ + ld bcc1, X+ + ld bcc2, X+ + ld bcc3, X+ + lsr r17 + ror r16 + brcs l20_sub + add acc0, bcc0 + adc acc1, bcc1 + adc acc2, bcc2 + adc acc3, bcc3 + rjmp l20_post +l20_sub: + sub acc0, bcc0 + sbc acc1, bcc1 + sbc acc2, bcc2 + sbc acc3, bcc3 +l20_post: + st Z+, acc0 + st Z+, acc1 + st Z+, acc2 + st Z+, acc3 + dec r18 + brne 20b + ret + +f0_jumptable: + rjmp bmw_small_s0 + rjmp bmw_small_s1 + rjmp bmw_small_s2 + rjmp bmw_small_s3 + rjmp bmw_small_s4 + rjmp bmw_small_s0 + rjmp bmw_small_s1 + rjmp bmw_small_s2 + rjmp bmw_small_s3 + rjmp bmw_small_s4 + rjmp bmw_small_s0 + rjmp bmw_small_s1 + rjmp bmw_small_s2 + rjmp bmw_small_s3 + rjmp bmw_small_s4 + rjmp bmw_small_s0 + +.global bmw_small_f0 +bmw_small_f0: + push_range 28, 29 + push_range 4, 11 + push_range 16, 17 + /* h[i] ^= m[i]; q[i]= 0 */ + movw r26, h0 ; h + movw r30, m0 ; m + movw r28, q0 ; q + ldi r18, 64 +1: ld r0, X + ld r19, Z+ + eor r0, r19 + st X+, r0 + st Y+, r1 + dec r18 + brne 1b +;------ + ldi r17, 0x88 + ldi r16, 0xC0 + movw r26, h0 ; X = h + adiw r26, 5*4 + ldi r18, 16-5 + movw r30, q0 ; Z = q + rcall f0_helper + movw r26, h0 ; X = h + ldi r18, 5 + rcall f0_helper +;--- + ldi r17, 0xCD + ldi r16, 0xBB + movw r26, h0 ; X = h + adiw r26, 7*4 + ldi r18, 16-7 + movw r30, q0 ; Z = q + rcall f0_helper + movw r26, h0 ; X = h + ldi r18, 7 + rcall f0_helper +;--- + ldi r17, 0x9E + ldi r16, 0x54 + movw r26, h0 ; X = h + adiw r26, 10*4 + ldi r18, 16-10 + movw r30, q0 ; Z = q + rcall f0_helper + movw r26, h0 ; X = h + ldi r18, 10 + rcall f0_helper +;--- + ldi r17, 0x55 + ldi r16, 0xE0 + movw r26, h0 ; X = h + adiw r26, 13*4 + ldi r18, 16-13 + movw r30, q0 ; Z = q + rcall f0_helper + movw r26, h0 ; X = h + ldi r18, 13 + rcall f0_helper +;--- + ldi r17, 0x43 + ldi r16, 0x8A + movw r26, h0 ; X = h + 
adiw r26, 14*4 + ldi r18, 16-14 + movw r30, q0 ; Z = q + rcall f0_helper + movw r26, h0 ; X = h + ldi r18, 14 + rcall f0_helper +;--------------- h[i] ^= m[i] + movw r26, h0 ; h + movw r30, m0 ; m + ldi r18, 64 +25: ld r0, X + ld r19, Z+ + eor r0, r19 + st X+, r0 + dec r18 + brne 25b +;--------------- q[i] = s[i%5](q[i]) + ldi r16, 16 + ldi r30, pm_lo8(f0_jumptable) + ldi r31, pm_hi8(f0_jumptable) + movw bcc0, r30 + movw bcc2, h0 ; h + movw acc0, q0 ; q + movw r28, q0 ; Y = q +30: + ldd r22, Y+0 + ldd r23, Y+1 + ldd r24, Y+2 + ldd r25, Y+3 + icall + st Y+, r22 + st Y+, r23 + st Y+, r24 + st Y+, r25 + movw r30, bcc0 + adiw r30, 1 + movw bcc0, r30 + dec r16 + brne 30b +;--------------- q[i] += h[(i+1)%16] + movw r30, acc0 ; q + movw r26, bcc2 ; h + adiw r26, 4 + ldi r18, 15 +40: + ld acc0, Z + ld acc1, X+ + add acc0, acc1 + st Z+, acc0 + ld acc0, Z + ld acc1, X+ + adc acc0, acc1 + st Z+, acc0 + ld acc0, Z + ld acc1, X+ + adc acc0, acc1 + st Z+, acc0 + ld acc0, Z + ld acc1, X+ + adc acc0, acc1 + st Z+, acc0 + dec r18 + brne 40b + movw r26, bcc2 ; h + ld acc0, Z + ld acc1, X+ + add acc0, acc1 + st Z+, acc0 + ld acc0, Z + ld acc1, X+ + adc acc0, acc1 + st Z+, acc0 + ld acc0, Z + ld acc1, X+ + adc acc0, acc1 + st Z+, acc0 + ld acc0, Z + ld acc1, X+ + adc acc0, acc1 + st Z+, acc0 + + pop_range 16, 17 + pop_range 4, 11 + pop_range 28, 29 + ret + +/******************************************************************************* +* void bmw_small_f2(uint32_t* h, const uint32_t* q, const void* m){ +* uint32_t xl=0, xh; +* uint8_t i; +* for(i=16;i<24;++i){ +* xl ^= q[i]; +* } +* xh = xl; +* for(i=24;i<32;++i){ +* xh ^= q[i]; +* } +* memcpy(h, m, 16*4); +* h[0] ^= SHL32(xh, 5) ^ SHR32(q[16], 5); +* h[5] ^= SHL32(xh, 6) ^ SHR32(q[21], 6); +* h[3] ^= SHR32(xh, 1) ^ SHL32(q[19], 5); +* h[4] ^= SHR32(xh, 3) ^ q[20]; +* h[6] ^= SHR32(xh, 4) ^ SHL32(q[22], 6); +* h[2] ^= SHR32(xh, 5) ^ SHL32(q[18], 5); +* h[1] ^= SHR32(xh, 7) ^ SHL32(q[17], 8); +* h[7] ^= SHR32(xh,11) ^ SHL32(q[23], 
2); +* for(i=0; i<8; ++i){ +* h[i] += xl ^ q[24+i] ^ q[i]; +* } +* for(i=0; i<8; ++i){ +* h[8+i] ^= xh ^ q[24+i]; +* h[8+i] += ROTL32(h[(4+i)%8],i+9); +* } +* h[11] += SHL32(xl, 4) ^ q[18] ^ q[11]; +* h[10] += SHL32(xl, 6) ^ q[17] ^ q[10]; +* h[ 8] += SHL32(xl, 8) ^ q[23] ^ q[ 8]; +* h[15] += SHR32(xl, 2) ^ q[22] ^ q[15]; +* h[12] += SHR32(xl, 3) ^ q[19] ^ q[12]; +* h[13] += SHR32(xl, 4) ^ q[20] ^ q[13]; +* h[ 9] += SHR32(xl, 6) ^ q[16] ^ q[ 9]; +* h[14] += SHR32(xl, 7) ^ q[21] ^ q[14]; +* } +* +* param h: r24:r25 +* param q: r22:r23 +* param m: r20:r21 +*/ +xl0 = 2 +xl1 = 3 +xl2 = 4 +xl3 = 5 +xh0 = 6 +xh1 = 7 +xh2 = 8 +xh3 = 9 +q0 = 10 +q1 = 11 +h0 = 12 +h1 = 13 +t0 = 14 +t1 = 15 +t2 = 16 +t3 = 17 + + +.macro modify_h_2 addr:req + ldd r22, Y+\addr*4+0 + ldd r23, Y+\addr*4+1 + ldd r24, Y+\addr*4+2 + ldd r25, Y+\addr*4+3 + eor r22, t0 + eor r23, t1 + eor r24, t2 + eor r25, t3 + ldd r0, Z+\addr*4+0 + add r0, r22 + std Z+\addr*4+0, r0 + ldd r0, Z+\addr*4+1 + adc r0, r23 + std Z+\addr*4+1, r0 + ldd r0, Z+\addr*4+2 + adc r0, r24 + std Z+\addr*4+2, r0 + ldd r0, Z+\addr*4+3 + adc r0, r25 + std Z+\addr*4+3, r0 +.endm + +tshiftr: + lsr t3 + ror t2 + ror t1 + ror t0 + dec r20 + brne tshiftr + ret + +tshiftl: + lsl t0 + rol t1 + rol t2 + rol t3 + dec r20 + brne tshiftl + ret + +.global bmw_small_f2 +bmw_small_f2: + /* memcpy(h, m, 64) */ + movw r26, r24 + movw r30, r20 + ldi r18, 64 +1: ld r0, Z+ + st X+, r0 + dec r18 + brne 1b + push_range 28, 29 + push_range 2, 17 + movw q0, r22 + movw h0, r24 + /* calc xl */ +/* for(i=16;i<24;++i){ + xl ^= q[i]; + } +*/ + movw r26, q0 + adiw r26, 63 + adiw r26, 1 ; X points at q[16] + ld xl0, X+ + ld xl1, X+ + ld xl2, X+ + ld xl3, X+ + ldi r18, 8-1 +20: ld r0, X+ + eor xl0, r0 + ld r0, X+ + eor xl1, r0 + ld r0, X+ + eor xl2, r0 + ld r0, X+ + eor xl3, r0 + dec r18 + brne 20b + /* calc xh */ +/* xh = xl + for(i=24;i<32;++i){ + xh ^= q[i]; + } +*/ + movw xh0, xl0 + movw xh2, xl2 + ldi r18, 8 +25: ld r0, X+ + eor xh0, r0 + ld r0, X+ + eor xh1, 
r0 + ld r0, X+ + eor xh2, r0 + ld r0, X+ + eor xh3, r0 + dec r18 + brne 25b +/* h[0]..h[7] */ + movw r30, h0 + movw r28, q0 + adiw r28, 60 ; Y points at q[15] +/* h[0] ^= SHL32(xh, 5) ^ SHR32(q[16], 5); */ + movw t0, xh0 + movw t2, xh2 + ldi r20, 5 + rcall tshiftl + ldd r22, Y+4 + ldd r23, Y+5 + ldd r24, Y+6 + ldd r25, Y+7 + ldi r20, 5 + rcall shiftr32 + eor r22, t0 + eor r23, t1 + eor r24, t2 + eor r25, t3 + ldd r0, Z+0 + eor r22, r0 + ldd r0, Z+1 + eor r23, r0 + ldd r0, Z+2 + eor r24, r0 + ldd r0, Z+3 + eor r25, r0 + std Z+0, r22 + std Z+1, r23 + std Z+2, r24 + std Z+3, r25 +/* h[5] ^= SHL32(xh, 6) ^ SHR32(q[21], 6); */ + lsl t0 + rol t1 + rol t2 + rol t3 + ldd r22, Y+24 + ldd r23, Y+25 + ldd r24, Y+26 + ldd r25, Y+27 + ldi r20, 6 + rcall shiftr32 + eor r22, t0 + eor r23, t1 + eor r24, t2 + eor r25, t3 + ldd r0, Z+20 + eor r22, r0 + ldd r0, Z+21 + eor r23, r0 + ldd r0, Z+22 + eor r24, r0 + ldd r0, Z+23 + eor r25, r0 + std Z+20, r22 + std Z+21, r23 + std Z+22, r24 + std Z+23, r25 +/* h[3] ^= SHR32(xh, 1) ^ SHL32(q[19], 5); */ + movw t0, xh0 + movw t2, xh2 + lsr t3 + ror t2 + ror t1 + ror t0 + ldd r22, Y+16 + ldd r23, Y+17 + ldd r24, Y+18 + ldd r25, Y+19 + ldi r20, 5 + rcall shiftl32 + eor r22, t0 + eor r23, t1 + eor r24, t2 + eor r25, t3 + ldd r0, Z+12 + eor r22, r0 + ldd r0, Z+13 + eor r23, r0 + ldd r0, Z+14 + eor r24, r0 + ldd r0, Z+15 + eor r25, r0 + std Z+12, r22 + std Z+13, r23 + std Z+14, r24 + std Z+15, r25 +/* h[4] ^= SHR32(xh, 3) ^ q[20]; */ + ldi r20, 2 + rcall tshiftr + ldd r22, Y+20 + ldd r23, Y+21 + ldd r24, Y+22 + ldd r25, Y+23 + eor r22, t0 + eor r23, t1 + eor r24, t2 + eor r25, t3 + ldd r0, Z+16 + eor r22, r0 + ldd r0, Z+17 + eor r23, r0 + ldd r0, Z+18 + eor r24, r0 + ldd r0, Z+19 + eor r25, r0 + std Z+16, r22 + std Z+17, r23 + std Z+18, r24 + std Z+19, r25 +/* h[6] ^= SHR32(xh, 4) ^ SHL32(q[22], 6); */ + lsr t3 + ror t2 + ror t1 + ror t0 + ldd r22, Y+28 + ldd r23, Y+29 + ldd r24, Y+30 + ldd r25, Y+31 + ldi r20, 6 + rcall shiftl32 + eor r22, t0 + 
eor r23, t1 + eor r24, t2 + eor r25, t3 + ldd r0, Z+24 + eor r22, r0 + ldd r0, Z+25 + eor r23, r0 + ldd r0, Z+26 + eor r24, r0 + ldd r0, Z+27 + eor r25, r0 + std Z+24, r22 + std Z+25, r23 + std Z+26, r24 + std Z+27, r25 +/* h[2] ^= SHR32(xh, 5) ^ SHL32(q[18], 5); */ + lsr t3 + ror t2 + ror t1 + ror t0 + ldd r22, Y+12 + ldd r23, Y+13 + ldd r24, Y+14 + ldd r25, Y+15 + ldi r20, 5 + rcall shiftl32 + eor r22, t0 + eor r23, t1 + eor r24, t2 + eor r25, t3 + ldd r0, Z+8 + eor r22, r0 + ldd r0, Z+9 + eor r23, r0 + ldd r0, Z+10 + eor r24, r0 + ldd r0, Z+11 + eor r25, r0 + std Z+8 , r22 + std Z+9 , r23 + std Z+10, r24 + std Z+11, r25 +/* h[1] ^= SHR32(xh, 7) ^ SHL32(q[17], 8); */ + ldi r20, 2 + rcall tshiftr + ldd r23, Y+8 + ldd r24, Y+9 + ldd r25, Y+10 + mov r22, t0 + eor r23, t1 + eor r24, t2 + eor r25, t3 + ldd r0, Z+4 + eor r22, r0 + ldd r0, Z+5 + eor r23, r0 + ldd r0, Z+6 + eor r24, r0 + ldd r0, Z+7 + eor r25, r0 + std Z+4 , r22 + std Z+5 , r23 + std Z+6 , r24 + std Z+7 , r25 +/* h[7] ^= SHR32(xh,11) ^ SHL32(q[23], 2); */ + ldi r20, 4 + rcall tshiftr + ldd r22, Y+32 + ldd r23, Y+33 + ldd r24, Y+34 + ldd r25, Y+35 + ldi r20, 2 + rcall shiftl32 + eor r22, t0 + eor r23, t1 + eor r24, t2 + eor r25, t3 + ldd r0, Z+28 + eor r22, r0 + ldd r0, Z+29 + eor r23, r0 + ldd r0, Z+30 + eor r24, r0 + ldd r0, Z+31 + eor r25, r0 + std Z+28, r22 + std Z+29, r23 + std Z+30, r24 + std Z+31, r25 +/* for(i=0; i<8; ++i){ +* h[i] += xl ^ q[24+i] ^ q[i]; +* } +*/ + movw r26, q0 + movw r28, q0 + adiw r28, 63 + adiw r28, 24*4-63 + ldi r18, 8 +10: + movw t0, xl0 + movw t2, xl2 + ld r0, X+ + eor t0, r0 + ld r0, X+ + eor t1, r0 + ld r0, X+ + eor t2, r0 + ld r0, X+ + eor t3, r0 + ld r0, Y+ + eor t0, r0 + ld r0, Y+ + eor t1, r0 + ld r0, Y+ + eor t2, r0 + ld r0, Y+ + eor t3, r0 + ldd r22, Z+0 + ldd r23, Z+1 + ldd r24, Z+2 + ldd r25, Z+3 + add r22, t0 + adc r23, t1 + adc r24, t2 + adc r25, t3 + st Z+, r22 + st Z+, r23 + st Z+, r24 + st Z+, r25 + dec r18 + brne 10b + ; Z points to h[8] +/* for(i=0; i<8; 
++i){ + h[8+i] ^= xh ^ q[24+i]; + h[8+i] += ROTL32(h[(4+i)%8],i+9); + } +*/ + ; Z points at h[8] +; clr r18 + sbiw r28, 8*4 ; Y points at q[24] + movw r26, r30 + sbiw r26, 4*4 ; X points at h[4] +15: + ldd t0, Z+0 + ldd t1, Z+1 + ldd t2, Z+2 + ldd t3, Z+3 + eor t0, xh0 + eor t1, xh1 + eor t2, xh2 + eor t3, xh3 + ld r0, Y+ + eor t0, r0 + ld r0, Y+ + eor t1, r0 + ld r0, Y+ + eor t2, r0 + ld r0, Y+ + eor t3, r0 + ld r22, X+ + ld r23, X+ + ld r24, X+ + ld r25, X+ + mov r20, r18 + rcall rotl32p9 + add t0, r22 + adc t1, r23 + adc t2, r24 + adc t3, r25 + st Z+, t0 + st Z+, t1 + st Z+, t2 + st Z+, t3 + inc r18 + cpi r18, 4 + brne 16f + movw r26, h0 +16: + sbrs r18, 3 + rjmp 15b + sbiw r30, 4*8 ; adjust Z to point at h[8] + sbiw r28, 16*4-1 + sbiw r28, 1 ; adjust Y to point at q[16] + movw r26, r28 + sbiw r26, 7*4 ; adjust X to point at q[9] + ldi r18, 7*4 +20: /* now we do the memxor stuff */ + ld t0, X + ld t1, Y+ + eor t0, t1 + st X+, t0 + dec r18 + brne 20b + ; X points at q[16] + ; Y points at q[23] + sbiw r26, 4*8 ; X points at q[8] + + clr t0 + mov t1, xl0 + mov t2, xl1 + mov t3, xl2 +/* h[ 8] += SHL32(xl, 8) ^ q[23] ^ q[ 8]; */ + ld r22, X+ + ld r23, X+ + ld r24, X+ + ld r25, X+ + ld r0, Y+ + eor r22, r0 + ld r0, Y+ + eor r23, r0 + ld r0, Y+ + eor r24, r0 + ld r0, Y+ + eor r25, r0 + eor r22, t0 + eor r23, t1 + eor r24, t2 + eor r25, t3 + ld r0, Z + add r0, r22 + st Z+, r0 + ld r0, Z + adc r0, r23 + st Z+, r0 + ld r0, Z + adc r0, r24 + st Z+, r0 + ld r0, Z + adc r0, r25 + st Z+, r0 + movw r28, r26 + ; Z points at h[9] + ; X points at q[9] but we won't need it anymore + ; Y points at q[9] +/* h[11] += SHL32(xl, 4) ^ q[11]; */ + movw t0, xl0 + movw t2, xl2 + ldi r20, 4 + rcall tshiftl + modify_h_2 2 +/* h[10] += SHL32(xl, 6) ^ q[10]; */ + ldi r20, 2 + rcall tshiftl + modify_h_2 1 +/* h[15] += SHR32(xl, 2) ^ q[15]; */ + movw t0, xl0 + movw t2, xl2 + ldi r20, 2 + rcall tshiftr + modify_h_2 6 +/* h[12] += SHR32(xl, 3) ^ q[12]; */ + ldi r20, 1 + rcall tshiftr + modify_h_2 
3 +/* h[13] += SHR32(xl, 4) ^ q[13]; */ + ldi r20, 1 + rcall tshiftr + modify_h_2 4 +/* h[ 9] += SHR32(xl, 6) ^ q[ 9]; */ + ldi r20, 2 + rcall tshiftr + modify_h_2 0 +/* h[14] += SHR32(xl, 7) ^ q[14]; */ + ldi r20, 1 + rcall tshiftr + modify_h_2 5 +bmw_small_f2_exit: + pop_range 2, 17 + pop_range 28, 29 + ret + +cli_putb: /* debug helper: prints the byte in r24 as two upper-case hex digits through the external cli_putc; relies on r1 == 0 */ + push r2 /* r2 keeps the byte across the first cli_putc call */ + push_range 18, 27 /* r18..r27: call-clobbered registers, including r27 (high byte of X) */ + push_range 30, 31 + mov r2, r24 + swap r24 /* high nibble first */ + andi r24, 0xf + ldi r30, lo8(hextable) + ldi r31, hi8(hextable) + add r30, r24 + adc r31, r1 + lpm r24, Z + clr r25 + call cli_putc + mov r24, r2 /* then the low nibble */ + andi r24, 0xf + ldi r30, lo8(hextable) + ldi r31, hi8(hextable) + add r30, r24 + adc r31, r1 + lpm r24, Z + clr r25 + call cli_putc + pop_range 30, 31 + pop_range 18, 27 /* must mirror the push_range above */ + pop r2 + ret +hextable: /* nibble value -> ASCII hex digit, read with lpm by cli_putb */ + .byte '0', '1', '2', '3', '4', '5', '6', '7' + .byte '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' + +cli_putchar: /* debug helper: calls cli_putc with r18..r31 saved and restored around the call */ + push_range 18, 31 + call cli_putc + pop_range 18, 31 + ret diff --git a/bmw/bmw_small-cstub.c b/bmw/bmw_small-cstub.c new file mode 100644 index 0000000..af26144 --- /dev/null +++ b/bmw/bmw_small-cstub.c @@ -0,0 +1,239 @@ +/* bmw_small.c */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2009 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ +/* + * \file bmw_small.c + * \author Daniel Otte + * \email daniel.otte@rub.de + * \date 2009-04-27 + * \license GPLv3 or later + * + */ + +#include +#include +#include +#include "bmw_small.h" + + +#define SHL32(a,n) ((a)<<(n)) +#define SHR32(a,n) ((a)>>(n)) +#define ROTL32(a,n) (((a)<<(n))|((a)>>(32-(n)))) +#define ROTR32(a,n) (((a)>>(n))|((a)<<(32-(n)))) + +#define DEBUG 0 + + +#if DEBUG + #include "cli.h" + + void ctx_dump(const bmw_small_ctx_t* ctx){ + uint8_t i; + cli_putstr_P(PSTR("\r\n==== ctx dump ====")); + for(i=0; i<16;++i){ + cli_putstr_P(PSTR("\r\n h[")); + cli_hexdump(&i, 1); + cli_putstr_P(PSTR("] = ")); + cli_hexdump_rev(&(ctx->h[i]), 4); + } + cli_putstr_P(PSTR("\r\n counter = ")); + cli_hexdump(&(ctx->counter), 4); + } + + void dump_x(const uint32_t* q, uint8_t elements, char x){ + uint8_t i; + cli_putstr_P(PSTR("\r\n==== ")); + cli_putc(x); + cli_putstr_P(PSTR(" dump ====")); + for(i=0; i. +*/ + +/* + * File: memxor.S + * Author: Daniel Otte + * Date: 2008-08-07 + * License: GPLv3 or later + * Description: memxor, XORing one block into another + * + */ + +/* + * void memxor(void* dest, const void* src, uint16_t n); + */ + /* + * param dest is passed in r24:r25 + * param src is passed in r22:r23 + * param n is passed in r20:r21 + */ +.global memxor +memxor: + movw r30, r24 + movw r26, r22 + movw r24, r20 + adiw r24, 0 + breq 2f +1: + ld r20, X+ + ld r21, Z + eor r20, r21 + st Z+, r20 + sbiw r24, 1 + brne 1b +2: + ret + + + + + + + + + + + + + + diff --git a/bmw/memxor.h b/bmw/memxor.h new file mode 100644 index 0000000..a62a616 --- /dev/null +++ b/bmw/memxor.h @@ -0,0 +1,7 @@ +#ifndef MEMXOR_H_ +#define MEMXOR_H_ +#include + +void memxor(void* dest, const void* src, uint16_t n); + +#endif diff --git a/mkfiles/bmw.mk b/mkfiles/bmw.mk new file mode 100644 index 0000000..6a57584 --- /dev/null +++ b/mkfiles/bmw.mk @@ -0,0 +1,12 @@ +# Makefile for BlueMidnightWish +ALGO_NAME := BMW + +# comment out the following line for removement of 
BlueMidnightWish from the build process +HASHES += $(ALGO_NAME) + +$(ALGO_NAME)_DIR := bmw/ +$(ALGO_NAME)_OBJ := bmw_small-asm.o bmw_small-cstub.o bmw_large.o +$(ALGO_NAME)_TEST_BIN := main-bmw-test.o hfal_bmw_small.o hfal_bmw_large.o $(CLI_STD) $(HFAL_STD) +$(ALGO_NAME)_NESSIE_TEST := test nessie +$(ALGO_NAME)_PERFORMANCE_TEST := performance + diff --git a/mkfiles/bmw_c.mk b/mkfiles/bmw_c.mk index 585bbb2..03a1e9e 100644 --- a/mkfiles/bmw_c.mk +++ b/mkfiles/bmw_c.mk @@ -5,7 +5,7 @@ ALGO_NAME := BMW_C HASHES += $(ALGO_NAME) $(ALGO_NAME)_DIR := bmw/ -$(ALGO_NAME)_OBJ := bmw_small.o bmw_large.o +$(ALGO_NAME)_OBJ := bmw_small.o bmw_large.o memxor.o $(ALGO_NAME)_TEST_BIN := main-bmw-test.o hfal_bmw_small.o hfal_bmw_large.o $(CLI_STD) $(HFAL_STD) $(ALGO_NAME)_NESSIE_TEST := test nessie $(ALGO_NAME)_PERFORMANCE_TEST := performance