+++ /dev/null
-/* noekeon_asm.S */
-/*
- This file is part of the AVR-Crypto-Lib.
- Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de)
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-/*
- * noekeon assembler implementation for avr
- * author: Daniel Otte
- * email: daniel.otte@rub.de
- * license: GPLv3
- */
-
-#include <avr/io.h>
-
-.macro push_all
- push r2
- push r3
- push r4
- push r5
- push r6
- push r7
- push r8
- push r9
- push r10
- push r11
- push r12
- push r13
- push r14
- push r15
- push r16
- push r17
- push r28
- push r29
-.endm
-
-.macro pop_all
- pop r29
- pop r28
- pop r17
- pop r16
- pop r15
- pop r14
- pop r13
- pop r12
- pop r11
- pop r10
- pop r9
- pop r8
- pop r7
- pop r6
- pop r5
- pop r4
- pop r3
- pop r2
- clr r1
-.endm
-
-push_all_func:
- pop r31
- pop r30
- push_all
- ijmp
-
-pop_all_func:
- pop r31
- pop r30
- pop_all
- ijmp
-
-.macro xchg a b
- eor \a, \b
- eor \b, \a
- eor \a, \b
-.endm
-
-.macro op32 op a b
- \op \a\()_0, \b\()_0
- \op \a\()_1, \b\()_1
- \op \a\()_2, \b\()_2
- \op \a\()_3, \b\()_3
-.endm
-
-
-.macro op32_4t op a b c d w x y z
- \op \a, \w
- \op \b, \x
- \op \c, \y
- \op \d, \z
-.endm
-
-
-.macro op32_prefix op p q a b c d w x y z
- \op \p\()\a, \q\()\w
- \op \p\()\b, \q\()\x
- \op \p\()\c, \q\()\y
- \op \p\()\d, \q\()\z
-.endm
-
-; === bigendian_rotl32 ===
-; this function rotates a 32bit bigendian word n bits to the left
-; param1: the 32-bit value
-; given in r25,r24,r23,r22 (r22 is most significant)
-; param2: the 8-bit parameter giving the number of bits to rotate
-; given in r20
-; return: the rotatet 32-bit word
-; given in r25,r24,r23,r22
-
-bigendian_rotl32:
- /* copy high bit of r22 to carry */
- mov r1, r22
-2:
- rol r1
-
- rol r25
- rol r24
- rol r23
- rol r22
-
- dec r20
- brne 2b
-bigendian_rotl32_exit:
- clr r1
- ret
-
-
-/******************************************************************************/
-
-; === bigendian_rotl32 ===
-; this function rotates a 32bit bigendian word n bits to the right
-; param1: the 32-bit value
-; given in r25,r24,r23,r22 (r22 is most significant)
-; param2: the 8-bit parameter giving the number of bits to rotate
-; given in r20
-; return: the rotatet 32-bit word
-; given in r25,r24,r23,r22
-
-bigendian_rotr32:
- /* copy high bit of r25 to carry */
-
- mov r1, r25
-2:
- ror r1
-
- ror r22
- ror r23
- ror r24
- ror r25
- dec r20
- brne 2b
-bigendian_rotr32_exit:
- clr r1
- ret
-
-/******************************************************************************/
-/*
-void theta(uint32_t* k, uint32_t* a){
- uint32_t temp;
- temp = a[0] ^ a[2]; temp ^= ROTR32(temp, 8) ^ ROTL32(temp, 8);
- a[1] ^= temp;
- a[3] ^= temp;
-
- a[0] ^= k[0];
- a[1] ^= k[1];
- a[2] ^= k[2];
- a[3] ^= k[3];
-
- temp = a[1] ^ a[3]; temp ^= ROTR32(temp, 8) ^ ROTL32(temp, 8);
- a[0] ^= temp;
- a[2] ^= temp;
-}
-*/
-
-round_const: .byte 0x1B, 0x36, 0x6C, 0xD8, 0xAB, 0x4D, 0x9A, \
- 0x2F, 0x5E, 0xBC, 0x63, 0xC6, 0x97, 0x35, 0x6A, \
- 0xD4
-
-;-- a[0]
-state0_0 = 2
-state0_1 = 3
-state0_2 = 4
-state0_3 = 5
-;-- a[1]
-state1_0 = 6
-state1_1 = 7
-state1_2 = 8
-state1_3 = 9
-;-- a[2]
-state2_0 = 10
-state2_1 = 11
-state2_2 = 12
-state2_3 = 13
-;-- a[3]
-state3_0 = 14
-state3_1 = 15
-state3_2 = 16
-state3_3 = 17
-
-; === theta ===
-;
-; param1: the state in r2-r17
-; param2: pointer to k in X (r26,r27)
-;
-temp_a = 18
-temp_b = 19
-temp_c = 20
-temp_d = 21
-
-theta:
- /* temp = a[0] ^ a[2]; temp ^= temp>>>8 ^ temp<<<8 */
- op32_prefix mov, temp_, state0_, a,b,c,d, 0,1,2,3
- op32_prefix eor, temp_, state2_, a,b,c,d, 0,1,2,3
-
- mov r1, temp_a
- eor r1, temp_b
- eor r1, temp_c
- eor r1, temp_d
-
- op32_prefix eor, temp_, r, a,b,c,d, 1,1,1,1
-
- /* temp is know a little bit mixed c,d,a,b (if abcd is normal order) */
- /* a[1] ^= temp */
- eor state1_0, temp_c
- eor state1_1, temp_d
- eor state1_2, temp_a
- eor state1_3, temp_b
- /* a[3] ^= temp */
- eor state3_0, temp_c
- eor state3_1, temp_d
- eor state3_2, temp_a
- eor state3_3, temp_b
-
- /* state ^ k (X points to K) */
- ldi r28, 2
- clr r29 /* Y points to r2 aka state0_0 */
- ldi temp_a, 16
-1:
- ld r1, X+
- ld r0, Y
- eor r1, r0
- st Y+, r1
- dec temp_a
- brne 1b
- sbiw r26, 16 /* set X back to key */
-
- mov temp_a, state1_0
- mov temp_b, state1_1
- mov temp_c, state1_2
- mov temp_d, state1_3
- eor temp_a, state3_0
- eor temp_b, state3_1
- eor temp_c, state3_2
- eor temp_d, state3_3
- mov r1, temp_a
- eor r1, temp_b
- eor r1, temp_c
- eor r1, temp_d
- eor temp_a, r1
- eor temp_b, r1
- eor temp_c, r1
- eor temp_d, r1
- /* temp is know a little bit mixed c,d,a,b (if abcd is normal order) */
- /* a[0] ^= temp */
- eor state0_0, temp_c
- eor state0_1, temp_d
- eor state0_2, temp_a
- eor state0_3, temp_b
- /* a[2] ^= temp */
- eor state2_0, temp_c
- eor state2_1, temp_d
- eor state2_2, temp_a
- eor state2_3, temp_b
-
- clr r1
- ret
-
-/******************************************************************************/
-#ifndef NOEKEON_NO_ENC
-; === noekeon_enc ===
-;
-; param1: pointer to buffer (r24,r25)
-; param2: pointer to k (r22,r23)
-;
-.global noekeon_enc
-noekeon_enc:
- rcall push_all_func
- /* load state */
- movw r26, r22
- ldi r28, 2
- clr r29 /* Y points at r2 aka state0_0 */
- movw r30, r24 /* Z points at state */
- push r30
- push r31
- ldi r22, 16
- push r22 /* 16 is also the number of rounds and gets pushed here */
-1:
- ld r0, Z+
- st Y+, r0
- dec r22
- brne 1b
- /* state loaded */
- push r1 /* push round constan2 (0x00) */
- ldi r20, 0x80
- push r20 /* push round constan2 (0x00) */
- rjmp 3f
-2:
- ldi r30, lo8(round_const+15)
- ldi r31, hi8(round_const+15)
- sub r30, r22
- sbci r31, 0
- clr r1
- push r1
- lpm r0, Z
- push r0
-3:
- rcall round /* pops rc2 & rc1 */
- pop r22
- dec r22
- push r22
- brne 2b
-
- pop r22
-
- ldi r22, 0xD4
- eor state0_3, r22
- rcall theta
-
- pop r31
- pop r30
- clr r29
- ldi r28, 2
- ldi r22, 16
-1:
- ld r0, Y+
- st Z+, r0
- dec r22
- brne 1b
-
- rcall pop_all_func
- ret
-#endif
-/******************************************************************************/
-/******************************************************************************/
-#ifndef NOEKEON_NO_DEC
-
-; === noekeon_dec ===
-;
-; param1: pointer to buffer/state (r24,r25)
-; param2: pointer to k (r22,r23)
-;
-.global noekeon_dec
-noekeon_dec:
- rcall push_all_func
- /* allocate 16 bytes on the stack */
- in r30, _SFR_IO_ADDR(SPL)
- in r31, _SFR_IO_ADDR(SPH)
- sbiw r30, 16
- out _SFR_IO_ADDR(SPH), r31
- out _SFR_IO_ADDR(SPL), r30
-
- adiw r30, 1
- /* push state pointer */
- push r24
- push r25
- movw r26, r22 /* move key ptr to X */
-
- /* set stackkey to zero */
- ldi r22, 16
-1: st Z+, r1
- dec r22
- brne 1b
-
- /* copy key to state */
- clr r29
- ldi r28, 2
- ldi r22, 16
-1: ld r0, X+
- st Y+, r0
- dec r22
- brne 1b
-
- movw r26, r30
- sbiw r26, 16 /* set X back to begining of stack key */
- rcall theta
-
- /* mov state to stackkey */
- clr r29
- ldi r28, 2
- ldi r22, 16
-1: ld r0, Y+
- st X+, r0
- dec r22
- brne 1b
- sbiw r26, 16 /* set X back to begining of stack key */
-
- /* move data from stateptr to state */
- pop r31
- pop r30
- push r30
- push r31
- clr r29
- ldi r28, 2
- ldi r22, 16
- push r22
-1: ld r0, Z+
- st Y+, r0
- dec r22
- brne 1b
-
-;--- snip 8< ----
-
- ldi r20, 0xD4
- push r20 /* push round constant2 (0xD4) */
- push r22 /* push round constan1 (0x00) */
- rjmp 3f
-2:
- ldi r30, lo8(round_const-1)
- ldi r31, hi8(round_const-1)
- clr r1
- add r30, r22
- adc r31, r1
- lpm r0, Z
- push r0
- push r1
-3:
- rcall round /* pops rc2 & rc1 */
- pop r22
- dec r22
- push r22
- brne 2b
-;----
- pop r22
-
- rcall theta
- ldi r22, 0x80
- eor state0_3, r22
-
-write_state_back:
- /* write state back */
- pop r31 /* pop state pointer */
- pop r30
- clr r29
- ldi r28, 2
- ldi r22, 16
-1:
- ld r0, Y+
- st Z+, r0
- dec r22
- brne 1b
-
- /* remove key from stack */
- in r30, _SFR_IO_ADDR(SPL)
- in r31, _SFR_IO_ADDR(SPH)
- adiw r30, 16
- out _SFR_IO_ADDR(SPH), r31
- out _SFR_IO_ADDR(SPL), r30
- rcall pop_all_func
- ret
-#endif
-/******************************************************************************/
-
-
-round:
- pop r24
- pop r25
- pop r1
- eor state0_3, r1
- rcall theta
- pop r1
- eor state0_3, r1
- push r25
- push r24
-pi_gamma_pi:
- ldi r30, pm_lo8(bigendian_rotl32)
- ldi r31, pm_hi8(bigendian_rotl32)
- rcall pi
- /* pi1 done; now gamma */
- rcall gamma_1
- /* a[0] <-> a[3] */
- xchg state0_0, state3_0
- xchg state0_1, state3_1
- xchg state0_2, state3_2
- xchg state0_3, state3_3
- /* a[2] ^= a[0] ^ a[1] ^ a[3] */
- op32 eor, state2, state0
- op32 eor, state2, state1
- op32 eor, state2, state3
-
- rcall gamma_1
- ldi r30, pm_lo8(bigendian_rotr32)
- ldi r31, pm_hi8(bigendian_rotr32)
- rcall pi
- ret
-
-gamma_1:
- /* a[1] ^= ~(a[3]|a[2])*/
- mov r1, state3_0
- or r1, state2_0
- com r1
- eor state1_0, r1
-
- mov r1, state3_1
- or r1, state2_1
- com r1
- eor state1_1, r1
-
- mov r1, state3_2
- or r1, state2_2
- com r1
- eor state1_2, r1
-
- mov r1, state3_3
- or r1, state2_3
- com r1
- eor state1_3, r1
-
- /* a[0] ^= a[2]&a[1] */
- mov r1, state2_0
- and r1, state1_0
- eor state0_0, r1
-
- mov r1, state2_1
- and r1, state1_1
- eor state0_1, r1
-
- mov r1, state2_2
- and r1, state1_2
- eor state0_2, r1
-
- mov r1, state2_3
- and r1, state1_3
- eor state0_3, r1
- ret
-
-pi:
- /* a[1] <<<= 1*/
- mov r22, state1_0
- mov r23, state1_1
- mov r24, state1_2
- mov r25, state1_3
- ldi r20, 1
- icall
- mov state1_0, r22
- mov state1_1, r23
- mov state1_2, r24
- mov state1_3, r25
- /* a[2] <<<= 5*/
- mov r22, state2_0
- mov r23, state2_1
- mov r24, state2_2
- mov r25, state2_3
- ldi r20, 5
- icall
- mov state2_0, r22
- mov state2_1, r23
- mov state2_2, r24
- mov state2_3, r25
- /* a[3] <<<= 2*/
- mov r22, state3_0
- mov r23, state3_1
- mov r24, state3_2
- mov r25, state3_3
- ldi r20, 2
- icall
- mov state3_0, r22
- mov state3_1, r23
- mov state3_2, r24
- mov state3_3, r25
- ret
-
-/******************************************************************************/
-
-/*
-void noekeon_init(void* key, noekeon_ctx_t* ctx){
- uint8_t nullv[16];
-
- memset(nullv, 0, 16);
- memcpy(ctx, key, 16);
- noekeon_enc(ctx, nullv);
-}
-*/
-
-#ifndef NOEKEON_NO_INIT
-
-.global noekeon_init
-noekeon_init:
-; === noekeon_init ===
-;
-; param1: pointer to key (r24,r25)
-; param2: pointer to context (r22,r23)
-;
- in r30, _SFR_IO_ADDR(SPL)
- in r31, _SFR_IO_ADDR(SPH)
- sbiw r30, 16
- out _SFR_IO_ADDR(SPH), r31
- out _SFR_IO_ADDR(SPL), r30
-
- movw r26, r22
- adiw r30, 1
- movw r22, r30
- /* set nullv(stack) to zero */
- ldi r20, 16
-1: st Z+, r1
- dec r20
- brne 1b
-
- /* copy key data to ctx */
- movw r30, r24
- ldi r20, 16
-1: ld r1, Z+
- st X+, r1
- dec r20
- brne 1b
- clr r1
-
- sbiw r26, 16
- movw r24, r26
- rcall noekeon_enc
-
- in r30, _SFR_IO_ADDR(SPL)
- in r31, _SFR_IO_ADDR(SPH)
- adiw r30, 16
- out _SFR_IO_ADDR(SPH), r31
- out _SFR_IO_ADDR(SPL), r30
- ret
-
-#endif
-
-