X-Git-Url: https://git.cryptolib.org/?p=avr-crypto-lib.git;a=blobdiff_plain;f=serpent%2Fserpent-asm.S;fp=serpent%2Fserpent-asm.S;h=f5f7cc5acf414c55b32980850e0a4d442efbfc78;hp=0000000000000000000000000000000000000000;hb=d32eba56ce10ea6b9eff123b50d9842673b38f2b;hpb=8f855d283a31a468ea014774c4723a8b77b81644 diff --git a/serpent/serpent-asm.S b/serpent/serpent-asm.S new file mode 100644 index 0000000..f5f7cc5 --- /dev/null +++ b/serpent/serpent-asm.S @@ -0,0 +1,754 @@ +/* serpent_asm.S */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/* + * File: serpent_sboxes.S + * Author: Daniel Otte + * Date: 2008-08-07 + * License: GPLv3 or later + * Description: Implementation of the serpent sbox function. + * + */ + +#include +#include "avr-asm-macros.S" + +/* +static void serpent_lt(uint8_t *b){ + X0 = rotl32(X0, 13); + X2 = rotl32(X2, 3); + X1 ^= X0 ^ X2; + X3 ^= X2 ^ (X0 << 3); + X1 = rotl32(X1, 1); + X3 = rotl32(X3, 7); + X0 ^= X1 ^ X3; + X2 ^= X3 ^ (X1 << 7); + X0 = rotl32(X0, 5); + X2 = rotr32(X2, 10); +} +*/ + +#if 0 +A0 = 4 +A1 = 5 +A2 = 6 +A3 = 7 +B0 = 8 +B1 = 9 +B2 = 10 +B3 = 11 +C0 = 12 +C1 = 13 +C2 = 14 +C3 = 15 +D0 = 16 +D1 = 17 +D2 = 18 +D3 = 19 +T0 = 20 +T1 = 21 +T2 = 22 +T3 = 23 + +serpent_lt: + push_range 4, 17 + movw r26, r24 + ld A2, X+ + ld A3, X+ + ld A0, X+ + ld A1, X+ + ldi r20, 3 + mov r0, A0 +1: + lsr r0 + ror A3 + ror A2 + ror A1 + ror A0 + dec r20 + brne 1b + ld B0, X+ + ld B1, X+ + ld B2, X+ + ld B3, X+ + + ld C2, X+ + ld C3, X+ + ld C0, X+ + ld C1, X+ + ldi r20, 3 + mov r0, C0 +1: + lsr r0 + ror C3 + ror C2 + ror C1 + ror C0 + dec r20 + brne 1b + + ld D0, X+ + ld D1, X+ + ld D2, X+ + ld D3, X+ + /* X1 ^= X0 ^ X2; */ + eor B0, A0 + eor B0, C0 + eor B1, A1 + eor B1, C1 + eor B2, A2 + eor B2, C2 + eor B3, A3 + eor B3, C3 + /* X3 ^= X2 ^ (X0 << 3); */ + mov T0, A0 + mov T1, A1 + mov T2, A2 + mov T3, A3 + ldi r24, 3 +1: + lsl T0 + rol T1 + rol T2 + rol T3 + dec r24 + brne 1b + eor C0, B0 + eor C0, T0 + eor C1, B1 + eor C1, T1 + eor C2, B2 + eor C2, T2 + eor C3, B3 + eor C3, T3 + /* X1 = rotl32(X1, 1); */ + mov r0, B3 + lsl r0 + rol B0 + rol B1 + rol B2 + rol B3 + /* X3 = rotl32(X3, 7); */ + mov r0, D3 + mov D3, D2 + mov D2, D1 + mov D1, D0 + mov D0, r0 + lsr r0 + ror D3 + ror D2 + ror D1 + ror D0 + /* X0 ^= X1 ^ X3; */ + eor A0, B0 + eor A0, D0 + eor A1, B1 + eor A1, D1 + eor A2, B2 + eor A2, D2 + eor A3, B3 + eor A3, D3 + /* X2 ^= X3 ^ (X1 << 7); */ + mov T1, B0 + mov T2, B1 + mov T3, B2 + clr T0 + mov r0, B3 + lsr r0 + ror T2 + ror T1 + ror T0 + eor C0, D0 + eor C0, T0 + eor C1, D1 + eor C1, T1 + eor C2, D2 + eor C2, T2 + eor C3, D3 + eor C3, T3 + /* X0 = rotl32(X0, 5); */ + ldi r24, 5 + mov r0, A3 +1: + lsl r0 + rol A0 + rol A1 + rol A2 + rol A3 + dec r24 + brne 1b + /* X2 = rotr32(X2, 10); */ + mov r0, C0 + mov C0, C1 + mov C1, C2 + mov C2, C3 + mov C3, r0 + ldi r24, 2 +1: + lsr r0 + ror C2 + ror C1 + ror C0 + ror C3 + dec r24 + brne 1b + + clr r31 + ldi r30, D3+1 + ldi r24, 16 +1: + ld r0, -Z + st -X, r0 + dec r24 + brne 1b + + pop_range 4, 17 + ret +#endif + +T0 = 22 +T1 = 23 +T2 = 24 +T3 = 25 +TT = 21 +/* rotate the data word (4 byte) pointed to by X by r20 bits to the right */ +memrotr32: + ld T0, X+ + ld T1, X+ + ld T2, X+ + ld T3, X+ + mov TT, T0 +1: + lsr TT + ror T3 + ror T2 + ror T1 + ror T0 + dec r20 + brne 1b + st -X, T3 + st -X, T2 + st -X, T1 + st -X, T0 + ret + +/* rotate the data word (4 byte) pointed to by X by r20 bits to the left */ +memrotl32: + ld T0, X+ + ld T1, X+ + ld T2, X+ + ld T3, X+ + mov TT, T3 +1: + lsl TT + rol T0 + rol T1 + rol T2 + rol T3 + dec r20 + brne 1b + st -X, T3 + st -X, T2 + st -X, T1 + st -X, T0 + ret + +/* xor the dataword (4 byte) pointed by Z into X */ +memeor32: + ldi T2, 4 +1: + ld T0, X + ld T1, Z+ + eor T0, T1 + st X+, T0 + dec T2 + brne 1b + ret + +serpent_lt: + /* X0 := X0 <<< 13 */ + movw r26, r24 + ldi r20, 7 + rcall memrotl32 + ldi r20, 6 + rcall memrotl32 + /* X2 := X2 <<< 3 */ + adiw r26, 8 + ldi r20, 3 + rcall memrotl32 + /* X1 ^= X2 */ + movw r30, r26 + sbiw r26, 4 + rcall memeor32 + /* X1 ^= X0 */ + sbiw r26, 4 + sbiw r30, 12 + rcall memeor32 + /* X3 ^= X2 */ + movw r30, r26 + adiw r26, 4 + rcall memeor32 + /* T := X0 */ + sbiw r26, 16 + ld r18, X+ + ld r19, X+ + ld r20, X+ + ld r21, X+ + /* T := T<<3 */ + ldi r22, 3 +1: + lsl r18 + rol r19 + rol r20 + rol r21 + dec r22 + brne 1b + clr r31 + /* X3 ^= T */ + adiw r26, 8 + ldi r30, 18 + rcall memeor32 + /* X1 := X1<<<1 */ + sbiw r26, 12 + ldi r20, 1 + rcall memrotl32 + /* X3 := X3<<<7 */ + adiw r26, 8 + ldi r20, 7 + rcall memrotl32 + /* X0 ^= X3 */ + movw r30, r26 + sbiw r26, 12 + rcall memeor32 + /* X0 ^= X1 */ + movw r30, r26 + sbiw r26, 4 + rcall memeor32 + /* X2 ^= X3 */ + adiw r26, 4 + adiw r30, 4 + rcall memeor32 + /* T := X1<<<8 */ + sbiw r26, 8 + ld r19, X+ + ld r20, X+ + ld r21, X+ + ld r18, X+ + /* T := T>>>1; T&=0xfffffff8 */ + lsr r18 + ror r21 + ror r20 + ror r19 + clr r18 + ror r18 + clr r31 + ldi r30, 18 + /* X2 ^= T */ + rcall memeor32 + /* X0 := X0 <<< 5 */ + sbiw r26, 12 + ldi r20, 5 + rcall memrotl32 + /* X3 := X3 >>> 10 */ + adiw r26, 8 + ldi r20, 7 + rcall memrotr32 + ldi r20, 3 + rcall memrotr32 + ret + +serpent_inv_lt: + /* X0 := X0 >>> 5 */ + movw r26, r24 + ldi r20, 5 + rcall memrotr32 + /* X2 := X2 <<< 10 */ + adiw r26, 8 + ldi r20, 7 + rcall memrotl32 + ldi r20, 3 + rcall memrotl32 + /* X2 ^= X3 */ + movw r30, r26 + adiw r30, 4 + rcall memeor32 + sbiw r26, 4 + sbiw r30, 12 + /* T := X1<<7 */ + ld r19, Z+ + ld r20, Z+ + ld r21, Z+ + ld r18, Z+ + lsr r18 + ror r21 + ror r20 + ror r19 + clr r18 + ror r18 + clr r31 + /* X2 ^= T */ + ldi r30, 18 + rcall memeor32 + /* X0 ^= X1 */ + sbiw r26, 12 + movw r30, r26 + adiw r30, 4 + rcall memeor32 + /* X0 ^= X3 */ + sbiw r26, 4 + adiw r30, 4 + rcall memeor32 + /* X1 := X1>>>1 */ + ldi r20, 1 + rcall memrotr32 + /* X3 := X3>>>7 */ + adiw r26, 8 + ldi r20, 7 + rcall memrotr32 + /* X3 ^= X2 */ + sbiw r30, 8 + rcall memeor32 + sbiw r26, 4 + /* T:= X0<<3 */ + sbiw r30, 12 + ld r18, Z+ + ld r19, Z+ + ld r20, Z+ + ld r21, Z+ + ldi r24, 3 +1: + lsl r18 + rol r19 + rol r20 + rol r21 + dec r24 + brne 1b + /* X3 ^= T */ + clr r31 + ldi r30, 18 + rcall memeor32 + /* X1 ^= X0 */ + sbiw r26, 12 + movw r30, r26 + sbiw r30, 4 + rcall memeor32 + /* X1 ^= X2 */ + movw r26, r30 + adiw r30, 4 + rcall memeor32 + /* X2 := X2 >>> 3 */ + ldi r20, 3 + rcall memrotr32 + /* X0 := X0 >>> 13 */ + sbiw r26, 8 + ldi r20, 7 + rcall memrotr32 + ldi r20, 6 + rcall memrotr32 + ret + +/* +#define GOLDEN_RATIO 0x9e3779b9l + +static uint32_t serpent_gen_w(uint32_t * b, uint8_t i){ + uint32_t ret; + ret = b[0] ^ b[3] ^ b[5] ^ b[7] ^ GOLDEN_RATIO ^ (uint32_t)i; + ret = rotl32(ret, 11); + return ret; +} +*/ +/* + * param b is passed in r24:r25 + * param i is passed in r22 + * return value is returned in r22.r23.r24.r25 + */ + /* trashes: + * r20-r25, r30-r31 + */ +serpent_gen_w: + movw r30, r24 + /* ^i^b[0]*/ + ld r21, Z+ + eor r22, r21 + ld r23, Z+ + ld r24, Z+ + ld r25, Z+ + /* ^b[3]^b[5]^[b7] */ + adiw r30, 4 + ldi r20, 3 +1: + adiw r30, 4 + ld r21, Z+ + eor r22, r21 + ld r21, Z+ + eor r23, r21 + ld r21, Z+ + eor r24, r21 + ld r21, Z+ + eor r25, r21 + dec r20 + brne 1b + /* ^0x9e3779b9l */ + ldi r21, 0xb9 + eor r22, r21 + ldi r21, 0x79 + eor r23, r21 + ldi r21, 0x37 + eor r24, r21 + ldi r21, 0x9e + eor r25, r21 + /* <<<11 */ + mov r21, r25 + mov r25, r24 + mov r24, r23 + mov r23, r22 + mov r22, r21 + mov r21, r25 + ldi r20, 3 +1: + lsl r21 + rol r22 + rol r23 + rol r24 + rol r25 + dec r20 + brne 1b + ret + +/* + * void serpent_init(const void* key, uint16_t keysize_b, serpent_ctx_t* ctx) + */ +/* + * param key is passed in r24:r25 + * param keysize is passed in r22:r23 + * param ctx is passed in r20:r21 + */ +.global serpent_init +serpent_init: + stack_alloc 32 + adiw r30, 1 + push_ r30, r31 + movw r26, r22 + adiw r26, 7 + tst r27 + breq 1f + ldi r26, 32 + rjmp 2f +1: + lsr r26 + lsr r26 + lsr r26 +2: + mov r22, r26 + bst r22, 5 /* store in T if we have to do the "append 1 thing"*/ + ldi r27, 32 +3: /* set buffer to zero */ + st Z+, r1 + dec r27 + brne 3b + + movw r26, r24 /* X points to the key */ + sbiw r30, 32 + tst r22 + breq 5f /* if keylength_b==0 */ +4: /* copy keybytes to buffer */ + ld r19, X+ + st Z+, r19 + dec r22 + brne 4b +5: + brts 7f /* if keylength_b == 256 */ + ldi r18, 0x01 + andi r22, 0x07 + brne 6f + st Z, r18 + rjmp 7f +6: /* shift the one to the right position */ + lsl r18 + dec r22 + brne 6b + or r18, r19 + st -Z, r18 +7: /* post "appending 1 thing" buffer is ready for subkey generation */ + movw r26, r20 /* X points to the context */ + + pop_ r19, r18 /* r18:r19 points to the buffer */ + push r16 + clr r16 +8: + movw r24, r18 + mov r22, r16 + rcall serpent_gen_w + movw r30, r18 + ldi r20, 7*4 +1: /* the memmove */ + ldd r0, Z+4 + st Z+, r0 + dec r20 + brne 1b + /* store new word in buffer and context */ + st Z+, r22 + st Z+, r23 + st Z+, r24 + st Z+, r25 + st X+, r22 + st X+, r23 + st X+, r24 + st X+, r25 + + inc r16 + cpi r16, 132 + brne 8b + + push_ r28, r29 + movw r28, r26 + subi r28, lo8(132*4) + sbci r29, hi8(132*4) + ldi r16, 33 +2: + movw r24, r28 + adiw r28, 16 + ldi r22, 2 + add r22, r16 + rcall sbox128 + dec r16 + brne 2b + pop_ r29, r28, r16 + stack_free 32 + ret + +/* + * void serpent_enc(void* buffer, const serpent_ctx_t* ctx){ + */ +/* + * param buffer is passed in r24:r25 + * param ctx is passed in r22:r23 + */ +.global serpent_enc +serpent_enc: + + push_ r12, r13, r14, r15, r16 + clr r16 + movw r14, r24 + movw r12, r22 +1: + movw r24, r14 + movw r22, r12 + ldi r20, 16 + add r12, r20 + adc r13, r1 + clr r21 + rcall memxor + movw r24, r14 + mov r22, r16 + rcall sbox128 + movw r24, r14 + rcall serpent_lt + + inc r16 + cpi r16, 31 + brne 1b + + movw r24, r14 + movw r22, r12 + ldi r20, 16 + add r12, r20 + adc r13, r1 + clr r21 + rcall memxor + movw r24, r14 + mov r22, r16 + rcall sbox128 + + inc r16 + movw r24, r14 + movw r22, r12 + ldi r20, 16 + clr r21 + pop_ r16, r15, r14, r13, r12 + rjmp memxor + +/* + * void serpent_dec(void* buffer, const serpent_ctx_t* ctx){ + */ +/* + * param buffer is passed in r24:r25 + * param ctx is passed in r22:r23 + */ +.global serpent_dec +serpent_dec: + push_ r12, r13, r14, r15, r16 + movw r14, r24 +// ldi r16, lo8(32*16) +// add r22, r16 + ldi r16, hi8(32*16) + add r23, r16 + movw r12, r22 + ldi r20, 16 + clr r21 + rcall memxor + + movw r24, r14 + ldi r22, 31 + call inv_sbox128 + + movw r24, r14 + ldi r20, 16 + sub r12, r20 + sbc r13, r1 + movw r22, r12 + clr r21 + rcall memxor + ldi r16, 31 +1: + dec r16 + movw r24, r14 + rcall serpent_inv_lt + movw r24, r14 + mov r22, r16 + rcall inv_sbox128 + movw r24, r14 + ldi r20, 16 + sub r12, r20 + sbc r13, r1 + movw r22, r12 + clr r21 + rcall memxor + + tst r16 + brne 1b + pop_ r16, r15, r14, r13, r12 + ret + + + + + + + + + + + + + + + + +