/* noekeon_asm.S */
/*
    This file is part of the AVR-Crypto-Lib.
    Copyright (C) 2006-2015 Daniel Otte (bg@nerilex.org)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
/*
 * noekeon assembler implementation for avr
 * author:  Daniel Otte
 * email:   bg@nerilex.org
 * license: GPLv3
 */

#include <avr/io.h>

/*
 * push_all / pop_all
 * Save and restore r2-r17 and r28/r29 (the registers this file uses for the
 * cipher state and for the Y pointer). pop_all additionally clears r1.
 */
.macro push_all
	push r2
	push r3
	push r4
	push r5
	push r6
	push r7
	push r8
	push r9
	push r10
	push r11
	push r12
	push r13
	push r14
	push r15
	push r16
	push r17
	push r28
	push r29
.endm

.macro pop_all
	pop r29
	pop r28
	pop r17
	pop r16
	pop r15
	pop r14
	pop r13
	pop r12
	pop r11
	pop r10
	pop r9
	pop r8
	pop r7
	pop r6
	pop r5
	pop r4
	pop r3
	pop r2
	clr r1
.endm

/*
 * push_all_func / pop_all_func
 * Out-of-line versions of push_all / pop_all, entered with rcall.
 * The return address is first popped into Z (r31:r30) so that the register
 * save area can be pushed/popped underneath it; control goes back to the
 * caller through ijmp instead of ret.
 * Clobbers: r30, r31.
 * NOTE(review): exactly two return-address bytes are popped, so this
 * presumably assumes a device with a 2-byte program counter - confirm before
 * using on large-flash parts.
 */
push_all_func:
	pop r31
	pop r30
	push_all
	ijmp

pop_all_func:
	pop r31
	pop r30
	pop_all
	ijmp

/* xchg a, b: swap two registers with the three-eor trick (a and b must differ) */
.macro xchg a b
	eor \a, \b
	eor \b, \a
	eor \a, \b
.endm

/* op32 op, a, b: apply \op bytewise to the 32-bit words a_0..a_3 and b_0..b_3 */
.macro op32 op a b
	\op \a\()_0, \b\()_0
	\op \a\()_1, \b\()_1
	\op \a\()_2, \b\()_2
	\op \a\()_3, \b\()_3
.endm

/* op32_4t op, a,b,c,d, w,x,y,z: apply \op to the four explicit pairs (a,w) (b,x) (c,y) (d,z) */
.macro op32_4t op a b c d w x y z
	\op \a, \w
	\op \b, \x
	\op \c, \y
	\op \d, \z
.endm

/* op32_prefix: like op32_4t, but operand names are built as prefix p + a..d and prefix q + w..z */
.macro op32_prefix op p q a b c d w x y z
	\op \p\()\a, \q\()\w
	\op \p\()\b, \q\()\x
	\op \p\()\c, \q\()\y
	\op \p\()\d, \q\()\z
.endm

; === bigendian_rotl32 ===
; this function rotates a 32bit bigendian word n bits to the left
;  param1: the 32-bit value
;	given in r25,r24,r23,r22 (r22 is most significant)
;  param2: the 8-bit parameter giving the number of bits to rotate
;	given in r20 (must be nonzero: the loop decrements before testing,
;	so 0 would run 256 iterations)
;  return: the rotated 32-bit word
;	given in r25,r24,r23,r22
;  clobbers: r1 (cleared again on exit), r20 (0 on exit), flags
bigendian_rotl32:
	/* copy high bit of r22 to carry */
	mov r1, r22
2:
	rol r1		/* carry := current top bit of the word */

	rol r25		/* shift the word left, least significant byte first */
	rol r24
	rol r23
	rol r22

	dec r20		/* dec leaves the carry flag untouched */
	brne 2b
bigendian_rotl32_exit:
	clr r1
	ret

/******************************************************************************/

; === bigendian_rotr32 ===
; this function rotates a 32bit bigendian word n bits to the right
;  param1: the 32-bit value
;	given in r25,r24,r23,r22 (r22 is most significant)
;  param2: the 8-bit parameter giving the number of bits to rotate
;	given in r20 (must be nonzero: the loop decrements before testing,
;	so 0 would run 256 iterations)
;  return: the rotated 32-bit word
;	given in r25,r24,r23,r22
;  clobbers: r1 (cleared again on exit), r20 (0 on exit), flags
bigendian_rotr32:
	/* copy low bit of r25 to carry */
	mov r1, r25
2:
	ror r1		/* carry := current bottom bit of the word */

	ror r22		/* shift the word right, most significant byte first */
	ror r23
	ror r24
	ror r25

	dec r20		/* dec leaves the carry flag untouched */
	brne 2b
bigendian_rotr32_exit:
	clr r1
	ret

/******************************************************************************/
/*
void theta(uint32_t *k, uint32_t *a){
	uint32_t temp;
	temp = a[0] ^ a[2]; temp ^= ROTR32(temp, 8) ^ ROTL32(temp, 8);
	a[1] ^= temp;
	a[3] ^= temp;

	a[0] ^= k[0];
	a[1] ^= k[1];
	a[2] ^= k[2];
	a[3] ^= k[3];

	temp = a[1] ^ a[3]; temp ^= ROTR32(temp, 8) ^ ROTL32(temp, 8);
	a[0] ^= temp;
	a[2] ^= temp;
}
*/

/*
 * Round constant table, read with lpm.
 * noekeon_enc walks it forwards, noekeon_dec backwards; the remaining
 * constant 0x80 is loaded as an immediate in both routines.
 */
round_const:
.byte	0x1B, 0x36, 0x6C, 0xD8, 0xAB, 0x4D, 0x9A, 0x2F
.byte	0x5E, 0xBC, 0x63, 0xC6, 0x97, 0x35, 0x6A, 0xD4

/* register numbers holding the 16-byte cipher state (r2..r17) */
;-- a[0]
state0_0 =  2
state0_1 =  3
state0_2 =  4
state0_3 =  5
;-- a[1]
state1_0 =  6
state1_1 =  7
state1_2 =  8
state1_3 =  9
;-- a[2]
state2_0 = 10
state2_1 = 11
state2_2 = 12
state2_3 = 13
;-- a[3]
state3_0 = 14
state3_1 = 15
state3_2 = 16
state3_3 = 17

; === theta ===
;
;  param1: the state in r2-r17 (updated in place)
;  param2: pointer to k in X (r26,r27); X is restored before returning
;  clobbers: r0, r1 (cleared on exit), r18-r21, Y (r28,r29), flags
;
temp_a = 18
temp_b = 19
temp_c = 20
temp_d = 21

theta:
	/* temp = a[0] ^ a[2]; temp ^= temp>>>8 ^ temp<<<8 */
	op32_prefix mov, temp_, state0_, a,b,c,d, 0,1,2,3
	op32_prefix eor, temp_, state2_, a,b,c,d, 0,1,2,3

	/* r1 = xor of all four temp bytes; xoring it into each byte leaves */
	/* every byte holding the xor of the other three                   */
	mov r1, temp_a
	eor r1, temp_b
	eor r1, temp_c
	eor r1, temp_d

	op32_prefix eor, temp_, r, a,b,c,d, 1,1,1,1

	/* temp is now a little bit mixed c,d,a,b (if abcd is normal order) */
	/* a[1] ^= temp */
	eor state1_0, temp_c
	eor state1_1, temp_d
	eor state1_2, temp_a
	eor state1_3, temp_b
	/* a[3] ^= temp */
	eor state3_0, temp_c
	eor state3_1, temp_d
	eor state3_2, temp_a
	eor state3_3, temp_b

	/* state ^ k (X points to K) */
	/* The state registers are accessed through Y as data memory. */
	/* NOTE(review): this relies on r2 being readable at data address 2 - */
	/* confirm for the target device.                                     */
	ldi r28, 2
	clr r29 /* Y points to r2 aka state0_0 */
	ldi temp_a, 16
1:
	ld r1, X+
	ld r0, Y
	eor r1, r0
	st Y+, r1
	dec temp_a
	brne 1b
	sbiw r26, 16 /* set X back to key */

	/* temp = a[1] ^ a[3]; temp ^= temp>>>8 ^ temp<<<8 */
	mov temp_a, state1_0
	mov temp_b, state1_1
	mov temp_c, state1_2
	mov temp_d, state1_3
	eor temp_a, state3_0
	eor temp_b, state3_1
	eor temp_c, state3_2
	eor temp_d, state3_3
	mov r1, temp_a
	eor r1, temp_b
	eor r1, temp_c
	eor r1, temp_d
	eor temp_a, r1
	eor temp_b, r1
	eor temp_c, r1
	eor temp_d, r1
	/* temp is now a little bit mixed c,d,a,b (if abcd is normal order) */
	/* a[0] ^= temp */
	eor state0_0, temp_c
	eor state0_1, temp_d
	eor state0_2, temp_a
	eor state0_3, temp_b
	/* a[2] ^= temp */
	eor state2_0, temp_c
	eor state2_1, temp_d
	eor state2_2, temp_a
	eor state2_3, temp_b

	clr r1
	ret

/******************************************************************************/
#ifndef NOEKEON_NO_ENC
; === noekeon_enc ===
;
;  param1: pointer to buffer (r24,r25); the 16 bytes are replaced by the result
;  param2: pointer to k (r22,r23)
;
;  Runs 16 calls to round (first with constant 0x80, then with
;  round_const[0..14]), then xors 0xD4 into state0_3 and applies theta.
;  r2-r17 and r28/r29 are saved and restored via push_all_func/pop_all_func.
;
.global noekeon_enc
noekeon_enc:
	rcall push_all_func
	/* load state */
	movw r26, r22	/* X = key pointer, used by theta */
	ldi r28, 2
	clr r29 /* Y points at r2 aka state0_0 */
	movw r30, r24 /* Z points at state */
	push r30	/* keep the buffer pointer for the write-back */
	push r31
	ldi r22, 16
	push r22 /* 16 is also the number of rounds and gets pushed here */
1:
	ld r0, Z+
	st Y+, r0
	dec r22
	brne 1b
	/* state loaded */
	push r1 /* push round constant2 (0x00); r1 is expected to be zero here */
	ldi r20, 0x80
	push r20 /* push round constant1 (0x80) */
	rjmp 3f
2:
	/* r22 = rounds left (15..1): Z = &round_const[15 - r22] */
	ldi r30, lo8(round_const+15)
	ldi r31, hi8(round_const+15)
	sub r30, r22
	sbci r31, 0
	clr r1
	push r1		/* round constant2 = 0x00 */
	lpm r0, Z
	push r0		/* round constant1 from the table */
3:
	rcall round /* pops rc2 & rc1 */
	pop r22		/* round counter lives on the stack across the call */
	dec r22
	push r22
	brne 2b

	pop r22		/* drop the round counter */

	/* final step: a[0] ^= 0xD4, then theta */
	ldi r22, 0xD4
	eor state0_3, r22
	rcall theta

	/* write the state registers back to the buffer */
	pop r31
	pop r30
	clr r29
	ldi r28, 2
	ldi r22, 16
1:
	ld r0, Y+
	st Z+, r0
	dec r22
	brne 1b

	rcall pop_all_func
	ret
#endif
/******************************************************************************/
/******************************************************************************/

/*
 * write_sp_from_z
 * Store Z (r31:r30) into the stack pointer as one uninterruptible update:
 * SREG is saved, interrupts are disabled, SPH is written, SREG is restored
 * and SPL is written last. Writing SPH and SPL with interrupts enabled would
 * let an interrupt handler run on a half-updated stack pointer.
 * Clobbers: r0.
 */
.macro write_sp_from_z
	in r0, _SFR_IO_ADDR(SREG)
	cli
	out _SFR_IO_ADDR(SPH), r31
	out _SFR_IO_ADDR(SREG), r0
	out _SFR_IO_ADDR(SPL), r30
.endm

#ifndef NOEKEON_NO_DEC

; === noekeon_dec ===
;
;  param1: pointer to buffer/state (r24,r25); the 16 bytes are replaced by the result
;  param2: pointer to k (r22,r23)
;
;  Builds a working key on the stack as theta(zero key, k), runs 16 calls to
;  round (first with constant2 0xD4, then with round_const[14..0]), applies
;  theta and finally xors 0x80 into state0_3.
;  r2-r17 and r28/r29 are saved and restored via push_all_func/pop_all_func.
;
.global noekeon_dec
noekeon_dec:
	rcall push_all_func
	/* allocate 16 bytes on the stack */
	in r30, _SFR_IO_ADDR(SPL)
	in r31, _SFR_IO_ADDR(SPH)
	sbiw r30, 16
	write_sp_from_z
	adiw r30, 1	/* Z = first byte of the 16-byte stack buffer */
	/* push state pointer */
	push r24
	push r25
	movw r26, r22 /* move key ptr to X */
	/* set stackkey to zero (r1 is expected to be zero here) */
	ldi r22, 16
1:
	st Z+, r1
	dec r22
	brne 1b

	/* copy key to state */
	clr r29
	ldi r28, 2	/* Y points at r2 aka state0_0 */
	ldi r22, 16
1:
	ld r0, X+
	st Y+, r0
	dec r22
	brne 1b
	movw r26, r30
	sbiw r26, 16 /* set X back to beginning of stack key */
	rcall theta	/* state = theta(zero key, k) */

	/* mov state to stackkey */
	clr r29
	ldi r28, 2
	ldi r22, 16
1:
	ld r0, Y+
	st X+, r0
	dec r22
	brne 1b
	sbiw r26, 16 /* set X back to beginning of stack key */

	/* move data from stateptr to state */
	pop r31
	pop r30
	push r30	/* keep the state pointer for the write-back */
	push r31
	clr r29
	ldi r28, 2
	ldi r22, 16
	push r22	/* 16 is also the round counter */
1:
	ld r0, Z+
	st Y+, r0
	dec r22
	brne 1b

	/* --- round loop --- */
	ldi r20, 0xD4
	push r20 /* push round constant2 (0xD4) */
	push r22 /* push round constant1 (0x00), r22 is zero after the copy loop */
	rjmp 3f
2:
	/* r22 = rounds left (15..1): Z = &round_const[r22 - 1] */
	ldi r30, lo8(round_const-1)
	ldi r31, hi8(round_const-1)
	clr r1
	add r30, r22
	adc r31, r1
	lpm r0, Z
	push r0		/* round constant2 from the table */
	push r1		/* round constant1 = 0x00 */
3:
	rcall round /* pops rc2 & rc1 */
	pop r22		/* round counter lives on the stack across the call */
	dec r22
	push r22
	brne 2b
	/* --- end of round loop --- */
	pop r22		/* drop the round counter */

	rcall theta
	ldi r22, 0x80
	eor state0_3, r22

write_state_back:
	/* write state back */
	pop r31 /* pop state pointer */
	pop r30
	clr r29
	ldi r28, 2
	ldi r22, 16
1:
	ld r0, Y+
	st Z+, r0
	dec r22
	brne 1b

	/* remove key from stack */
	in r30, _SFR_IO_ADDR(SPL)
	in r31, _SFR_IO_ADDR(SPH)
	adiw r30, 16
	write_sp_from_z
	rcall pop_all_func
	ret
#endif
/******************************************************************************/

; === round ===
;
;  One cipher round on the state in r2-r17:
;    a[0] ^= rc1; theta; a[0] ^= rc2; pi (rotate left); gamma; pi (rotate right)
;  Only the last state byte (state0_3) is touched by the constants.
;  stack on entry (top first): return address, rc1, rc2 - both constants are
;  removed from the stack before returning
;  param: pointer to k in X (r26,r27), passed through to theta
;  clobbers: r0, r1 (cleared on exit), r18-r25, Y, Z, flags
;
round:
	pop r24		/* lift the return address out of the way */
	pop r25
	pop r1		/* rc1 */
	eor state0_3, r1
	rcall theta
	pop r1		/* rc2 */
	eor state0_3, r1
	push r25	/* put the return address back */
	push r24
pi_gamma_pi:
	ldi r30, pm_lo8(bigendian_rotl32)
	ldi r31, pm_hi8(bigendian_rotl32)
	rcall pi
	/* pi1 done; now gamma */
	rcall gamma_1
	/* a[0] <-> a[3] */
	xchg state0_0, state3_0
	xchg state0_1, state3_1
	xchg state0_2, state3_2
	xchg state0_3, state3_3
	/* a[2] ^= a[0] ^ a[1] ^ a[3] */
	op32 eor, state2, state0
	op32 eor, state2, state1
	op32 eor, state2, state3

	rcall gamma_1
	ldi r30, pm_lo8(bigendian_rotr32)
	ldi r31, pm_hi8(bigendian_rotr32)
	rcall pi
	ret

; === gamma_1 ===
;
;  Nonlinear half-step on the state in r2-r17:
;    a[1] ^= ~(a[3] | a[2]);  a[0] ^= a[2] & a[1]
;  clobbers: r1 (left nonzero), flags
;
gamma_1:
	/* a[1] ^= ~(a[3]|a[2])*/
	mov r1, state3_0
	or  r1, state2_0
	com r1
	eor state1_0, r1

	mov r1, state3_1
	or  r1, state2_1
	com r1
	eor state1_1, r1

	mov r1, state3_2
	or  r1, state2_2
	com r1
	eor state1_2, r1

	mov r1, state3_3
	or  r1, state2_3
	com r1
	eor state1_3, r1

	/* a[0] ^= a[2]&a[1] */
	mov r1, state2_0
	and r1, state1_0
	eor state0_0, r1

	mov r1, state2_1
	and r1, state1_1
	eor state0_1, r1

	mov r1, state2_2
	and r1, state1_2
	eor state0_2, r1

	mov r1, state2_3
	and r1, state1_3
	eor state0_3, r1
	ret

; === pi ===
;
;  Rotates a[1] by 1, a[2] by 5 and a[3] by 2 bits using the rotate routine
;  whose word address is in Z (bigendian_rotl32 or bigendian_rotr32), so the
;  direction is chosen by the caller.
;  clobbers: r1 (cleared by the rotate routine), r20, r22-r25, flags
;
pi:
	/* a[1] rotated by 1 */
	mov r22, state1_0
	mov r23, state1_1
	mov r24, state1_2
	mov r25, state1_3
	ldi r20, 1
	icall
	mov state1_0, r22
	mov state1_1, r23
	mov state1_2, r24
	mov state1_3, r25
	/* a[2] rotated by 5 */
	mov r22, state2_0
	mov r23, state2_1
	mov r24, state2_2
	mov r25, state2_3
	ldi r20, 5
	icall
	mov state2_0, r22
	mov state2_1, r23
	mov state2_2, r24
	mov state2_3, r25
	/* a[3] rotated by 2 */
	mov r22, state3_0
	mov r23, state3_1
	mov r24, state3_2
	mov r25, state3_3
	ldi r20, 2
	icall
	mov state3_0, r22
	mov state3_1, r23
	mov state3_2, r24
	mov state3_3, r25
	ret

/******************************************************************************/

/*
void noekeon_init(void *key, noekeon_ctx_t *ctx){
	uint8_t nullv[16];

	memset(nullv, 0, 16);
	memcpy(ctx, key, 16);
	noekeon_enc(ctx, nullv);
}
*/

#ifndef NOEKEON_NO_INIT

; === noekeon_init ===
;
;  param1: pointer to key (r24,r25)
;  param2: pointer to context (r22,r23); receives the 16 key bytes encrypted
;          with an all-zero key
;
;  Uses 16 bytes of stack for the zero key and calls noekeon_enc.
;
.global noekeon_init
noekeon_init:
	/* allocate 16 bytes on the stack for nullv */
	in r30, _SFR_IO_ADDR(SPL)
	in r31, _SFR_IO_ADDR(SPH)
	sbiw r30, 16
	write_sp_from_z

	movw r26, r22	/* X = ctx */
	adiw r30, 1	/* Z = first byte of nullv */
	movw r22, r30	/* second argument of noekeon_enc = nullv */
	/* set nullv(stack) to zero (r1 is expected to be zero here) */
	ldi r20, 16
1:
	st Z+, r1
	dec r20
	brne 1b

	/* copy key data to ctx */
	movw r30, r24
	ldi r20, 16
1:
	ld r1, Z+	/* r1 doubles as the copy temporary ... */
	st X+, r1
	dec r20
	brne 1b
	clr r1		/* ... so clear it again afterwards */

	sbiw r26, 16
	movw r24, r26	/* first argument of noekeon_enc = ctx */
	rcall noekeon_enc

	/* release nullv */
	in r30, _SFR_IO_ADDR(SPL)
	in r31, _SFR_IO_ADDR(SPH)
	adiw r30, 16
	write_sp_from_z
	ret

#endif