--- /dev/null
+/*
+ * noekeon assembler implementation for avr
+ * author: Daniel Otte
+ * email: daniel.otte@rub.de
+ * license: GPLv3
+ */
+
+#include <avr/io.h>
+
+/*
+ * push_all: save r2-r17, the Y pointer (r28/r29) and SREG on the stack.
+ * SREG is read through r28 after r28 itself has been saved, so r28 holds
+ * the SREG value when the macro ends. Undone by pop_all.
+ */
+.macro push_all
+ .irp reg, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, r16, r17, r28, r29
+ push \reg
+ .endr
+ in r28, _SFR_IO_ADDR(SREG)
+ push r28
+.endm
+
+/*
+ * pop_all: restore what push_all saved (SREG, r29, r28, r17 ... r2) and
+ * leave r1 cleared.
+ *
+ * r1 is cleared first: clr is an eor and rewrites the arithmetic flags,
+ * so doing it after the SREG write would overwrite the flags that were
+ * just restored. pop does not change any flag, so SREG stays intact
+ * through the rest of the macro.
+ */
+.macro pop_all
+ clr r1
+ pop r28
+ out _SFR_IO_ADDR(SREG), r28
+ pop r29
+ pop r28
+ pop r17
+ pop r16
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop r11
+ pop r10
+ pop r9
+ pop r8
+ pop r7
+ pop r6
+ pop r5
+ pop r4
+ pop r3
+ pop r2
+.endm
+
+/*
+ * xchg x, y: swap two registers with three exclusive-ors, no scratch
+ * register needed. Both operands must be different registers; passing
+ * the same register twice would leave it zero.
+ */
+.macro xchg x, y
+ eor \x, \y /* x = x ^ y */
+ eor \y, \x /* y = old x */
+ eor \x, \y /* x = old y */
+.endm
+
+/*
+ * op32 insn, dst, src: apply a two-operand instruction to the four bytes
+ * of a 32-bit word. dst and src are symbol stems; the suffixes _0 ... _3
+ * are appended to form the operand names (e.g. state2 gives state2_0).
+ */
+.macro op32 insn, dst, src
+ \insn \dst\()_0, \src\()_0
+ \insn \dst\()_1, \src\()_1
+ \insn \dst\()_2, \src\()_2
+ \insn \dst\()_3, \src\()_3
+.endm
+
+
+/*
+ * op32_4t insn, d0, d1, d2, d3, s0, s1, s2, s3: apply a two-operand
+ * instruction to four explicitly named destination/source pairs.
+ */
+.macro op32_4t insn, d0, d1, d2, d3, s0, s1, s2, s3
+ \insn \d0, \s0
+ \insn \d1, \s1
+ \insn \d2, \s2
+ \insn \d3, \s3
+.endm
+
+
+/*
+ * op32_prefix insn, dpfx, spfx, d0, d1, d2, d3, s0, s1, s2, s3:
+ * like op32_4t, but every destination name is built as dpfx followed by
+ * dN and every source name as spfx followed by sN. For example
+ *   op32_prefix eor, temp_, state2_, a,b,c,d, 0,1,2,3
+ * expands to eor temp_a, state2_0 ... eor temp_d, state2_3.
+ */
+.macro op32_prefix insn, dpfx, spfx, d0, d1, d2, d3, s0, s1, s2, s3
+ \insn \dpfx\()\d0, \spfx\()\s0
+ \insn \dpfx\()\d1, \spfx\()\s1
+ \insn \dpfx\()\d2, \spfx\()\s2
+ \insn \dpfx\()\d3, \spfx\()\s3
+.endm
+
+.global bigendian_rotl32
+; === bigendian_rotl32 ===
+; this function rotates a 32bit bigendian word n bits to the left
+; param1: the 32-bit value
+; given in r25,r24,r23,r22 (r22 is most significant)
+; param2: the 8-bit parameter giving the number of bits to rotate
+; given in r20 (taken modulo 32; 0 returns the value unchanged)
+; return: the rotated 32-bit word
+; given in r25,r24,r23,r22
+; clobbers: r0 (holds the saved SREG), r20 (0 on return)
+; r1 is used as scratch and is zero on return, SREG is restored
+
+bigendian_rotl32:
+ in r0, _SFR_IO_ADDR(SREG)
+ /* a rotation by a multiple of 32 is the identity */
+ andi r20, 0x1F
+ breq bigendian_rotl32_exit
+2:
+ /*
+  * Fetch the current top bit (bit 7 of r22) into carry on every pass.
+  * Taking it from a copy made once before the loop only works for
+  * the first eight passes.
+  */
+ mov r1, r22
+ rol r1
+
+ rol r25 /* least significant byte takes the wrapped bit */
+ rol r24
+ rol r23
+ rol r22
+
+ dec r20 /* dec leaves the carry flag alone */
+ brne 2b
+bigendian_rotl32_exit:
+ clr r1
+ out _SFR_IO_ADDR(SREG), r0
+ ret
+
+
+/******************************************************************************/
+
+.global bigendian_rotr32
+; === bigendian_rotr32 ===
+; this function rotates a 32bit bigendian word n bits to the right
+; param1: the 32-bit value
+; given in r25,r24,r23,r22 (r22 is most significant)
+; param2: the 8-bit parameter giving the number of bits to rotate
+; given in r20 (taken modulo 32; 0 returns the value unchanged)
+; return: the rotated 32-bit word
+; given in r25,r24,r23,r22
+; clobbers: r0 (holds the saved SREG), r20 (0 on return)
+; r1 is used as scratch and is zero on return, SREG is restored
+
+bigendian_rotr32:
+ in r0, _SFR_IO_ADDR(SREG)
+ /* a rotation by a multiple of 32 is the identity */
+ andi r20, 0x1F
+ breq bigendian_rotr32_exit
+2:
+ /*
+  * Fetch the current bottom bit (bit 0 of r25) into carry on every
+  * pass. Taking it from a copy made once before the loop only works
+  * for the first eight passes.
+  */
+ mov r1, r25
+ ror r1
+
+ ror r22 /* most significant byte takes the wrapped bit */
+ ror r23
+ ror r24
+ ror r25
+
+ dec r20 /* dec leaves the carry flag alone */
+ brne 2b
+bigendian_rotr32_exit:
+ clr r1
+ out _SFR_IO_ADDR(SREG), r0
+ ret
+
+/******************************************************************************/
+/*
+void theta(uint32_t* k, uint32_t* a){
+ uint32_t temp;
+ temp = a[0] ^ a[2]; temp ^= ROTR32(temp, 8) ^ ROTL32(temp, 8);
+ a[1] ^= temp;
+ a[3] ^= temp;
+
+ a[0] ^= k[0];
+ a[1] ^= k[1];
+ a[2] ^= k[2];
+ a[3] ^= k[3];
+
+ temp = a[1] ^ a[3]; temp ^= ROTR32(temp, 8) ^ ROTL32(temp, 8);
+ a[0] ^= temp;
+ a[2] ^= temp;
+}
+*/
+
+/*
+ * Round constant table, 16 bytes. noekeon_enc and noekeon_dec index it
+ * and read it from program memory with lpm. The constants 0x80 (enc,
+ * first round) and 0xD4 (dec, first round) are loaded as immediates
+ * by those routines instead of being fetched from this table.
+ */
+round_const:
+ .byte 0x1B, 0x36, 0x6C, 0xD8, 0xAB, 0x4D, 0x9A, 0x2F
+ .byte 0x5E, 0xBC, 0x63, 0xC6, 0x97, 0x35, 0x6A, 0xD4
+
+/*
+ * Register allocation of the 128-bit cipher state: word a[i], byte j
+ * lives in register number stateI_J. The sixteen bytes occupy r2-r17
+ * in the same order as they are stored in the buffer.
+ */
+;-- a[0]
+.equ state0_0, 2
+.equ state0_1, 3
+.equ state0_2, 4
+.equ state0_3, 5
+;-- a[1]
+.equ state1_0, 6
+.equ state1_1, 7
+.equ state1_2, 8
+.equ state1_3, 9
+;-- a[2]
+.equ state2_0, 10
+.equ state2_1, 11
+.equ state2_2, 12
+.equ state2_3, 13
+;-- a[3]
+.equ state3_0, 14
+.equ state3_1, 15
+.equ state3_2, 16
+.equ state3_3, 17
+
+; === theta ===
+;
+; param1: the state in r2-r17
+; param2: pointer to k in X (r26,r27)
+;
+; Applies the theta step shown in the C sketch above to the state held
+; in registers. X is advanced over the 16 key bytes and then moved back,
+; so it points at k again on return.
+; clobbers: r0, r18-r21 (temp_a..temp_d), Y (r28,r29), flags
+; r1 is used as scratch and is zero on return
+; The key is mixed in by addressing r2-r17 through Y at data address 2.
+;
+.equ temp_a, 18
+.equ temp_b, 19
+.equ temp_c, 20
+.equ temp_d, 21
+
+theta:
+ /* temp = a[0] ^ a[2] */
+ mov temp_a, state0_0
+ mov temp_b, state0_1
+ mov temp_c, state0_2
+ mov temp_d, state0_3
+ eor temp_a, state2_0
+ eor temp_b, state2_1
+ eor temp_c, state2_2
+ eor temp_d, state2_3
+
+ /*
+  * temp ^= temp>>>8 ^ temp<<<8
+  * Each result byte is the xor of three of the four temp bytes, so xor
+  * every byte with the xor of all four (collected in r1).
+  */
+ mov r1, temp_a
+ eor r1, temp_b
+ eor r1, temp_c
+ eor r1, temp_d
+ eor temp_a, r1
+ eor temp_b, r1
+ eor temp_c, r1
+ eor temp_d, r1
+
+ /* the result bytes now sit in the order c,d,a,b */
+ /* a[1] ^= temp */
+ op32_prefix eor, state1_, temp_, 0,1,2,3, c,d,a,b
+ /* a[3] ^= temp */
+ op32_prefix eor, state3_, temp_, 0,1,2,3, c,d,a,b
+
+ /* state ^= k, byte by byte (X walks over k, Y over r2-r17) */
+ ldi r28, 2
+ clr r29 /* Y = 0x0002, the data address of r2 aka state0_0 */
+ ldi temp_a, 16
+.Ltheta_add_key:
+ ld r1, X+
+ ld r0, Y
+ eor r1, r0
+ st Y+, r1
+ dec temp_a
+ brne .Ltheta_add_key
+ sbiw r26, 16 /* set X back to key */
+
+ /* temp = a[1] ^ a[3]; temp ^= temp>>>8 ^ temp<<<8 */
+ op32_prefix mov, temp_, state1_, a,b,c,d, 0,1,2,3
+ op32_prefix eor, temp_, state3_, a,b,c,d, 0,1,2,3
+ mov r1, temp_a
+ eor r1, temp_b
+ eor r1, temp_c
+ eor r1, temp_d
+ op32_prefix eor, temp_, r, a,b,c,d, 1,1,1,1
+
+ /* again the result bytes sit in the order c,d,a,b */
+ /* a[0] ^= temp */
+ op32_prefix eor, state0_, temp_, 0,1,2,3, c,d,a,b
+ /* a[2] ^= temp */
+ op32_prefix eor, state2_, temp_, 0,1,2,3, c,d,a,b
+
+ clr r1
+ ret
+
+/******************************************************************************/
+; === noekeon_enc ===
+;
+; param1: pointer to buffer/state (r24,r25)
+; param2: pointer to k (r22,r23)
+;
+; Encrypts the 16-byte buffer in place: the buffer is loaded into r2-r17,
+; round is called 16 times, a final theta is applied and the registers
+; are written back.
+; Stack use while the rounds run: buffer pointer (2 bytes), round counter
+; (1 byte) and, per call of round, two round-constant bytes which round
+; pops itself.
+; r1 is expected to be zero on entry (it is pushed as a constant).
+;
+.global noekeon_enc
+noekeon_enc:
+ push_all
+ movw r26, r22 /* X = key pointer, used by theta */
+ movw r30, r24 /* Z = buffer pointer */
+ push r30 /* keep the buffer pointer for the write back */
+ push r31
+ clr r29
+ ldi r28, 2 /* Y = data address of r2 aka state0_0 */
+ ldi r22, 16
+ push r22 /* 16 is also the number of rounds and gets pushed here */
+.Lenc_load:
+ ld r0, Z+
+ st Y+, r0
+ dec r22
+ brne .Lenc_load
+ /* state loaded */
+
+ /* first round: second constant 0x00, first constant 0x80 */
+ push r1 /* push round constant 2 (0x00) */
+ ldi r20, 0x80
+ push r20 /* push round constant 1 (0x80) */
+ rjmp .Lenc_round
+.Lenc_next_rc:
+ /* r22 = rounds left (15..1): fetch round_const[15 - r22] */
+ ldi r30, lo8(round_const+15)
+ ldi r31, hi8(round_const+15)
+ sub r30, r22
+ sbci r31, 0
+ clr r1
+ push r1 /* push round constant 2 (0x00) */
+ lpm r0, Z
+ push r0 /* push round constant 1 (from the table) */
+.Lenc_round:
+ call round /* pops rc2 & rc1 */
+ pop r22
+ dec r22
+ push r22
+ brne .Lenc_next_rc
+
+ pop r22 /* drop the round counter */
+
+ /* final step: a[0] ^= 0xD4, then theta */
+ ldi r22, 0xD4
+ eor state0_3, r22
+ call theta
+
+ /* write the state back to the buffer */
+ pop r31
+ pop r30
+ ldi r28, 2
+ clr r29
+ ldi r22, 16
+.Lenc_store:
+ ld r0, Y+
+ st Z+, r0
+ dec r22
+ brne .Lenc_store
+
+ pop_all
+ ret
+/******************************************************************************/
+/******************************************************************************/
+; === noekeon_dec ===
+;
+; param1: pointer to buffer/state (r24,r25)
+; param2: pointer to k (r22,r23)
+;
+; Decrypts the 16-byte buffer in place. A 16-byte working key is built
+; on the stack by running theta over the key with an all-zero key
+; block; the rounds then use that working key through X.
+; r1 is expected to be zero on entry (it is stored as the zero key).
+;
+; The stack pointer is moved by hand here. Both SP updates are done with
+; interrupts masked between the two byte writes: otherwise an interrupt
+; arriving after the SPH write but before the SPL write would push onto
+; a half-updated stack pointer.
+;
+.global noekeon_dec
+noekeon_dec:
+ push_all
+ /* allocate 16 bytes on the stack */
+ in r30, _SFR_IO_ADDR(SPL)
+ in r31, _SFR_IO_ADDR(SPH)
+ sbiw r30, 16
+ in r0, _SFR_IO_ADDR(SREG)
+ cli
+ out _SFR_IO_ADDR(SPH), r31
+ out _SFR_IO_ADDR(SREG), r0 /* restore the previous interrupt state */
+ out _SFR_IO_ADDR(SPL), r30
+
+ adiw r30, 1 /* Z = first byte of the allocated block */
+ /* push state pointer */
+ push r24
+ push r25
+ movw r26, r22 /* move key ptr to X */
+
+ /* set stackkey to zero */
+ ldi r22, 16
+1: st Z+, r1
+ dec r22
+ brne 1b
+
+ /* copy key to state */
+ clr r29
+ ldi r28, 2
+ ldi r22, 16
+1: ld r0, X+
+ st Y+, r0
+ dec r22
+ brne 1b
+
+ movw r26, r30
+ sbiw r26, 16 /* set X back to beginning of stack key */
+ call theta /* state = theta(zero key, key) */
+
+ /* mov state to stackkey */
+ clr r29
+ ldi r28, 2
+ ldi r22, 16
+1: ld r0, Y+
+ st X+, r0
+ dec r22
+ brne 1b
+ sbiw r26, 16 /* set X back to beginning of stack key */
+
+ /* move data from stateptr to state */
+ pop r31
+ pop r30
+ push r30 /* keep the state pointer on the stack for the write back */
+ push r31
+ clr r29
+ ldi r28, 2
+ ldi r22, 16
+ push r22 /* 16 is also the number of rounds and gets pushed here */
+1: ld r0, Z+
+ st Y+, r0
+ dec r22
+ brne 1b
+
+;--- snip 8< ----
+
+ /* first round: second constant 0xD4, first constant 0x00 */
+ ldi r20, 0xD4
+ push r20 /* push round constant 2 (0xD4) */
+ push r22 /* push round constant 1 (0x00, r22 is zero after the loop) */
+ rjmp 3f
+2:
+ /* r22 = rounds left (15..1): fetch round_const[r22 - 1] */
+ ldi r30, lo8(round_const-1)
+ ldi r31, hi8(round_const-1)
+ clr r1
+ add r30, r22
+ adc r31, r1
+ lpm r0, Z
+ push r0 /* push round constant 2 (from the table) */
+ push r1 /* push round constant 1 (0x00) */
+3:
+ call round /* pops rc2 & rc1 */
+ pop r22
+ dec r22
+ push r22
+ brne 2b
+;----
+ pop r22 /* drop the round counter */
+
+ /* final step: theta, then a[0] ^= 0x80 */
+ call theta
+ ldi r22, 0x80
+ eor state0_3, r22
+
+write_state_back:
+ /* write state back */
+ pop r31 /* pop state pointer */
+ pop r30
+ clr r29
+ ldi r28, 2
+ ldi r22, 16
+1:
+ ld r0, Y+
+ st Z+, r0
+ dec r22
+ brne 1b
+
+ /* remove key from stack */
+ in r30, _SFR_IO_ADDR(SPL)
+ in r31, _SFR_IO_ADDR(SPH)
+ adiw r30, 16
+ in r0, _SFR_IO_ADDR(SREG)
+ cli
+ out _SFR_IO_ADDR(SPH), r31
+ out _SFR_IO_ADDR(SREG), r0 /* restore the previous interrupt state */
+ out _SFR_IO_ADDR(SPL), r30
+ pop_all
+ ret
+/******************************************************************************/
+
+/*
+ * round: one cipher round on the state in r2-r17.
+ *
+ * The caller pushes two bytes before the call: first round constant 2,
+ * then round constant 1. round pops its own return address (two bytes,
+ * parked in r24/r25), consumes both constants and pushes the return
+ * address back, so the stack is already cleaned up when it returns.
+ *
+ *   a[0] ^= rc1; theta; a[0] ^= rc2; pi (left); gamma; pi (right)
+ *
+ * X must point at the key for theta. Clobbers whatever theta, pi and
+ * gamma_1 clobber, plus r24/r25.
+ */
+round:
+ pop r24 /* return address, restored below */
+ pop r25
+ pop r1 /* round constant 1 */
+ eor state0_3, r1
+ call theta /* leaves r24/r25 untouched */
+ pop r1 /* round constant 2 */
+ eor state0_3, r1
+ push r25
+ push r24
+pi_gamma_pi:
+ clc /* carry clear: pi rotates to the left */
+ call pi
+ /* pi1 done; now gamma */
+ call gamma_1
+ /* a[0] <-> a[3] */
+ xchg state0_0, state3_0
+ xchg state0_1, state3_1
+ xchg state0_2, state3_2
+ xchg state0_3, state3_3
+ /* a[2] ^= a[0] ^ a[1] ^ a[3] */
+ eor state2_0, state0_0
+ eor state2_1, state0_1
+ eor state2_2, state0_2
+ eor state2_3, state0_3
+ eor state2_0, state1_0
+ eor state2_1, state1_1
+ eor state2_2, state1_2
+ eor state2_3, state1_3
+ eor state2_0, state3_0
+ eor state2_1, state3_1
+ eor state2_2, state3_2
+ eor state2_3, state3_3
+ call gamma_1
+ sec /* carry set: pi rotates to the right */
+ call pi
+ ret
+
+/*
+ * gamma_1: the nonlinear half step used twice by round, applied to each
+ * of the four byte columns of the state:
+ *   a[1] ^= ~(a[3] | a[2])
+ *   a[0] ^=   a[2] & a[1]     (using the a[1] just updated)
+ * r1 is used as scratch and is NOT zero afterwards.
+ */
+gamma_1:
+ /* a[1] ^= ~(a[3]|a[2]) */
+ .irp i, 0, 1, 2, 3
+ mov r1, state3_\i
+ or r1, state2_\i
+ com r1
+ eor state1_\i, r1
+ .endr
+
+ /* a[0] ^= a[2]&a[1] */
+ .irp i, 0, 1, 2, 3
+ mov r1, state2_\i
+ and r1, state1_\i
+ eor state0_\i, r1
+ .endr
+ ret
+
+/*
+ * pi_rotate_word idx, count: pass state word a[idx] through the rotate
+ * routine whose word address is in Z, rotating by count bits.
+ * Uses r22-r25 for the value and r20 for the count.
+ */
+.macro pi_rotate_word idx, count
+ mov r22, state\idx\()_0
+ mov r23, state\idx\()_1
+ mov r24, state\idx\()_2
+ mov r25, state\idx\()_3
+ ldi r20, \count
+ icall
+ mov state\idx\()_0, r22
+ mov state\idx\()_1, r23
+ mov state\idx\()_2, r24
+ mov state\idx\()_3, r25
+.endm
+
+/*
+ * pi: rotate a[1], a[2] and a[3] by 1, 5 and 2 bits.
+ * The carry flag on entry selects the direction: carry clear rotates to
+ * the left (bigendian_rotl32), carry set rotates to the right
+ * (bigendian_rotr32). a[0] is not touched.
+ * Clobbers r20, r22-r25 and Z, plus whatever the rotate routine clobbers.
+ */
+pi:
+ brcs 1f
+ ldi r30, lo8(bigendian_rotl32)
+ ldi r31, hi8(bigendian_rotl32)
+ rjmp 2f
+1:
+ ldi r30, lo8(bigendian_rotr32)
+ ldi r31, hi8(bigendian_rotr32)
+2:
+ /* icall wants a word address: halve the byte address in Z */
+ lsr r31
+ ror r30
+ pi_rotate_word 1, 1 /* a[1] rotated by 1 */
+ pi_rotate_word 2, 5 /* a[2] rotated by 5 */
+ pi_rotate_word 3, 2 /* a[3] rotated by 2 */
+ ret
+
+;------- trash follows --------
+/*
+ * Leftover fragment of an earlier key set-up sequence. It sits behind the
+ * ret of pi, carries no named label and nothing branches into it, so it
+ * can never run. It is kept for reference only and excluded from the
+ * build so that it does not take up program memory.
+ */
+#if 0
+
+ /* load state */
+ movw r26, r22
+ ldi r28, 2
+ clr r29 /* Y points at r2 aka state0_0 */
+ ldi r22, 16
+1: /* copy key to state */
+ ld r0, X+
+ st Y+, r0
+ dec r22
+ brne 1b
+
+ movw r26, r30
+
+ clr r1
+ ldi r22, 16
+1: /* set key to zero */
+ st Z+, r1
+ dec r22
+ brne 1b
+
+ call theta
+
+ ldi r22, 16
+1: /* write key back */
+ ld r0, -Y
+ st -Z, r0
+ dec r22
+ brne 1b
+
+; movw r26, r30 /* move keypointer to X */
+; adiw r26, 1
+ movw r30, r24 /* Z points at state */
+ push r30 /* push state pointer */
+ push r31
+
+ ;--
+ clr r29
+ ldi r28, 2
+ ;--
+ ldi r22, 16
+ push r22 /* 16 is also the number of rounds and gets pushed here */
+ ldi r22, 16
+1: /* load state */
+ ld r0, Z+
+ st Y+, r0
+ dec r22
+ brne 1b
+ /* state loaded */
+
+#endif
+
+;------- ------------- --------
+
+