/* shabal-asm.S */
/*
    This file is part of the AVR-Crypto-Lib.
    Copyright (C) 2009  Daniel Otte (daniel.otte@rub.de)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
/*
 * \file    shabal-asm.S
 * \author  Daniel Otte
 * \email   daniel.otte@rub.de
 * \date    2009-04-27
 * \license GPLv3 or later
 */

/*
 * Conventions shared by all three routines in this file:
 *
 *  - Context layout, as it is accessed below:
 *        offset  0.. 7   w  (64-bit counter)
 *        offset  8.. 9   pointer to b (16 words of 32 bit)
 *        offset 10..11   pointer to c (16 words of 32 bit)
 *        offset 12..59   a  (12 words of 32 bit, stored inline)
 *  - All 32-bit words are handled least significant byte first.
 *  - r1 is used as a constant zero (e.g. "adc r31, r1") and is never
 *    written here; this presumably relies on the avr-gcc convention
 *    that r1 is always zero -- confirm for non-gcc callers.
 *  - push_range, pop_range, stack_alloc_large and stack_free_large are
 *    macros from avr-asm-macros.S, which is not visible here; the notes
 *    on them below describe how this file uses them, not their
 *    definition.
 */

#include "avr-asm-macros.S"

/******************************************************************************/
/*
void shabal_p(shabal_ctx_t *ctx, const void *m){
	uint8_t i,j;
	for(i=0;i<16;++i){
		ctx->b[i] = ROTL32(ctx->b[i],17);
	}
	for(j=0;j<3;++j){
		for(i=0;i<16;++i){
			ctx->a[(i+16*j)%SHABAL_R] =
				shabal_u(ctx->a[(i+16*j)%SHABAL_R]
					^ shabal_v(ROTL32(ctx->a[(i+16*j+SHABAL_R-1)%SHABAL_R],15))
					^ ctx->c[(8-i+16)%16])
				^ ctx->b[(i+SHABAL_O1)%16]
				^ ((ctx->b[(i+SHABAL_O2)%16]) & ~(ctx->b[(i+SHABAL_O3)%16]))
				^ ((uint32_t*)m)[i];
			ctx->b[i] = ROTL32(ctx->b[i], 1) ^ ~(ctx->a[(i+16*j)%SHABAL_R]);
		}
	}
	for(j=0;j<36;++j){
		ctx->a[j%SHABAL_R] += ctx->c[(j+3)%16];
	}
}

NOTE: the two nested loop headers above were garbled in this copy of the
file; they are reconstructed from the assembly loop bounds below (J runs
to 3, I runs to 16). The original may have used a named constant instead
of the literal 3.
*/

/* Register aliases used by shabal_p (the numbers are register numbers). */
/* MB: pointer to the message block m */
MB0 =  2
MB1 =  3
/* AB: address of ctx->a[0] */
AB0 =  4
AB1 =  5
/* BB: value of the pointer ctx->b */
BB0 =  6
BB1 =  7
/* CB: value of the pointer ctx->c */
CB0 =  8
CB1 =  9
/* AL: 32-bit accumulator; holds the most recently written a-word */
AL0 = 10
AL1 = 11
AL2 = 12
AL3 = 13
/* A: 32-bit scratch word */
A0  = 14
A1  = 15
A2  = 16
A3  = 17
/* B: the b-word that belongs to the current index I */
B0  = 18
B1  = 19
B2  = 20
B3  = 21
/* I: inner index 0..15, J: outer index 0..2 (reused as 0..35 in the addition) */
I   = 22
J   = 23
/* T: temporaries; T0:T1 is the X pair, T2:T3 is the Y pair */
T0  = 26
T1  = 27
T2  = 28
T3  = 29

/*
 * shabal_p: applies the permutation described by the C reference above
 * to the state referenced by ctx, in place.
 *
 * param ctx: r24:r25
 * param m:   r22:r23
 *
 * The symbol is intentionally not exported (the .global directive below
 * is commented out); it is reached through rcall from the other two
 * routines in this file.
 *
 * r28/r29 are saved explicitly; push_range 2, 17 presumably saves
 * r2..r17. Everything else that is written (r0, r18..r24, r26, r27,
 * r30, r31) is left modified.
 */
; .global shabal_p
shabal_p:
	push_range 2, 17
	push r28
	push r29
	/* keep m; r22/r23 are reused as I/J below */
	movw MB0, r22
	/* fetch the b and c pointers from ctx+8 .. ctx+11 */
	movw r30, r24
	adiw r30, 8
	ld BB0, Z+
	ld BB1, Z+
	ld CB0, Z+
	ld CB1, Z+
	/* Z is now ctx+12, i.e. the address of a[0] */
	movw AB0, r30
	/* Z = b + 64 (one past the last word); adiw takes at most 63,
	   so the offset is split into two additions */
	movw r30, BB0
	adiw r30, 16*4-1
	adiw r30, 1
	ldi r24, 16
1:
	/* b[k] = ROTL32(b[k], 17), walking k from 15 down to 0 */
	ld A3, -Z
	ld A2, -Z
	ld A1, -Z
	ld A0, -Z
	/* bring bit 31 into carry, then rotate the word left by one */
	mov r0, A3
	rol r0
	rol A0
	rol A1
	rol A2
	rol A3
	/* store with the 16-bit halves swapped: 1 + 16 = rotate by 17 */
	std Z+0, A2
	std Z+1, A3
	std Z+2, A0
	std Z+3, A1
	dec r24
	brne 1b
	/* the last word handled was b[0]; keep its rotated value in B
	   (same half swap as the store above) for the first round */
	movw B0, A2
	movw B2, A0
	/* load ctx->a[(i+16*j-1)%12], which is a[11] for i = j = 0 */
	movw r26, AB0
	adiw r26, 4*11
	ld AL0, X+
	ld AL1, X+
	ld AL2, X+
	ld AL3, X+
	clr I
	clr J
1:
	/* ---- one round; entered with AL = previous a-word, B = b[I] ---- */
	/* ROTL32(AL, 15): swap the halves (rotate by 16), then rotate
	   right by one */
	movw T0, AL2
	movw T2, AL0
	/* bring bit 0 into carry so it re-enters at the top */
	mov r0, T0
	ror r0
	ror T3
	ror T2
	ror T1
	ror T0
	movw AL0, T0
	movw AL2, T2
	/* apply V to AL: A = (AL << 2) + AL = 5 * AL */
	movw A0, AL0
	movw A2, AL2
	lsl A0
	rol A1
	rol A2
	rol A3
	lsl A0
	rol A1
	rol A2
	rol A3
	add A0, AL0
	adc A1, AL1
	adc A2, AL2
	adc A3, AL3
	/* xor in ctx->c[(8-i+16)%16]; T0 becomes the byte offset
	   4 * ((24 - I) & 15) */
	ldi T0, 24
	sub T0, I
	andi T0, 0x0f
	lsl T0
	lsl T0
	movw r30, CB0
	add r30, T0
	adc r31, r1
	ld r0, Z+
	eor A0, r0
	ld r0, Z+
	eor A1, r0
	ld r0, Z+
	eor A2, r0
	ld r0, Z+
	eor A3, r0
	/* xor in ctx->a[(i+16*j)%12]; the byte offset 4*((16*J+I)%12)
	   is read from mod12table in program memory */
	mov T0, J
	swap T0 /* *=16 */
	add T0, I
	ldi r30, lo8(mod12table)
	ldi r31, hi8(mod12table)
	add r30, T0
	adc r31, r1
	lpm T0, Z
	movw r30, AB0
	add r30, T0
	adc r31, r1
	/* remember the address of this a-word for the store further down */
	movw T2, r30
	ld r0, Z+
	eor A0, r0
	ld r0, Z+
	eor A1, r0
	ld r0, Z+
	eor A2, r0
	ld r0, Z+
	eor A3, r0
	/* AL = 3*A (this is U): (A << 1) + A */
	movw AL0, A0
	movw AL2, A2
	lsl AL0
	rol AL1
	rol AL2
	rol AL3
	add AL0, A0
	adc AL1, A1
	adc AL2, A2
	adc AL3, A3
	/* xor in ctx->b[(i+13)%16] */
	ldi T0, 13
	add T0, I
	andi T0, 0x0f
	lsl T0
	lsl T0
	movw r30, BB0
	add r30, T0
	adc r31, r1
	ld r0, Z+
	eor AL0, r0
	ld r0, Z+
	eor AL1, r0
	ld r0, Z+
	eor AL2, r0
	ld r0, Z+
	eor AL3, r0
	/* load ctx->b[(i+9)%16] into A */
	ldi T0, 9
	add T0, I
	andi T0, 0x0f
	lsl T0
	lsl T0
	movw r30, BB0
	add r30, T0
	adc r31, r1
	ld A0, Z+
	ld A1, Z+
	ld A2, Z+
	ld A3, Z+
	/* and in ~(ctx->b[(i+6)%16]) */
	ldi T0, 6
	add T0, I
	andi T0, 0x0f
	lsl T0
	lsl T0
	movw r30, BB0
	add r30, T0
	adc r31, r1
	ld r0, Z+
	com r0
	and A0, r0
	ld r0, Z+
	com r0
	and A1, r0
	ld r0, Z+
	com r0
	and A2, r0
	ld r0, Z+
	com r0
	and A3, r0
	/* xor A into AL */
	eor AL0, A0
	eor AL1, A1
	eor AL2, A2
	eor AL3, A3
	/* xor m[i] into AL */
	mov T0, I
	lsl T0
	lsl T0
	movw r30, MB0
	add r30, T0
	adc r31, r1
	ld r0, Z+
	eor AL0, r0
	ld r0, Z+
	eor AL1, r0
	ld r0, Z+
	eor AL2, r0
	ld r0, Z+
	eor AL3, r0
	/* A (AL) is done, now store it at the address saved in T2:T3;
	   AL also stays live as the "previous a-word" of the next round */
	movw r30, T2
	st Z+, AL0
	st Z+, AL1
	st Z+, AL2
	st Z+, AL3
	/* process ctx->b[i] */
	/* ROTL32(b, 1): bit 31 goes to carry first, then rotate through */
	mov r0, B3
	rol r0
	rol B0
	rol B1
	rol B2
	rol B3
	/* xor in ~(ctx->a[(i+16*j)%SHABAL_R]) */
	movw A0, AL0
	movw A2, AL2
	com A0
	com A1
	com A2
	com A3
	eor B0, A0
	eor B1, A1
	eor B2, A2
	eor B3, A3
	/* store B back to b[I] */
	movw r30, BB0
	mov T0, I
	lsl T0
	lsl T0
	add r30, T0
	adc r31, r1
	st Z+, B0
	st Z+, B1
	st Z+, B2
	st Z+, B3
	/* advance I; after 16 inner steps advance J; stop after J == 3 */
	inc I
	cpi I, 16
	brne local_reload
	inc J
	cpi J, 3
	brne global_reload
	rjmp addition
global_reload:
	/* new outer pass: restart the inner index */
	clr I
local_reload:
	/* fetch b[I] for the next round into B */
	mov T0, I
	lsl T0
	lsl T0
	movw r30, BB0
	add r30, T0
	adc r31, r1
	ld B0, Z+
	ld B1, Z+
	ld B2, Z+
	ld B3, Z+
	/* back to the round loop (nearest preceding label 1) */
	rjmp 1b
addition:
	/*
	 * a[j%12] += c[(j+3)%16] for j = 0..35.
	 * Z walks over a, X walks over c. Both indices wrap at different
	 * lengths (12 and 16), so the 36 steps are cut into runs in which
	 * neither pointer wraps; the matching pointer is reset between
	 * runs. Each 32-bit addition is add/adc/adc/adc; the ld, st and
	 * inc instructions in between leave the carry flag untouched.
	 */
	clr J
	movw r30, AB0
	movw r26, CB0
	adiw r26, 3*4
1:
	/* J = 0..11: a[0..11] += c[3..14] */
	ld AL0, X+
	ld A0, Z
	add A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	inc J
	cpi J, 12
	brne 1b
	/* J = 12: a wraps to a[0], c continues with c[15] */
	movw r30, AB0
	ld AL0, X+
	ld A0, Z
	add A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	inc J
	/* J= 13..23: c wraps to c[0]; a[1..11] += c[0..10] */
	movw r26, CB0
1:
	ld AL0, X+
	ld A0, Z
	add A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	inc J
	cpi J, 24
	brne 1b
	/* J= 24..28: a wraps to a[0]; a[0..4] += c[11..15] */
	movw r30, AB0
1:
	ld AL0, X+
	ld A0, Z
	add A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	inc J
	cpi J, 29
	brne 1b
	/* J= 29..35: c wraps to c[0]; a[5..11] += c[0..6] */
	movw r26, CB0
1:
	ld AL0, X+
	ld A0, Z
	add A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	inc J
	cpi J, 36
	brne 1b
exit:
	/* restore in reverse order of the prologue */
	pop r29
	pop r28
	pop_range 2, 17
	ret

/*
 * mod12table[n] = 4 * (n % 12) for n = 0..47: the byte offset of
 * a[(i+16*j)%12], indexed by n = 16*j + i. It is placed in the code
 * section and read with lpm.
 */
mod12table:
	.byte  0,  4,  8, 12, 16, 20, 24, 28
	.byte 32, 36, 40, 44,  0,  4,  8, 12
	.byte 16, 20, 24, 28, 32, 36, 40, 44
	.byte  0,  4,  8, 12, 16, 20, 24, 28
	.byte 32, 36, 40, 44,  0,  4,  8, 12
	.byte 16, 20, 24, 28, 32, 36, 40, 44

/******************************************************************************/
/*
void shabal_nextBlock(shabal_ctx_t *ctx, const void *block){
	uint8_t i;
	uint32_t *t;
	for(i=0;i<16;++i){
		ctx->b[i] += ((uint32_t*)block)[i];
	}
	ctx->a[0] ^= ctx->w.w32[0];
	ctx->a[1] ^= ctx->w.w32[1];
	shabal_p(ctx, block);
	for(i=0;i<16;++i){
		ctx->c[i] -= ((uint32_t*)block)[i];
	}
	ctx->w.w64++;
	t = ctx->c;
	ctx->c = ctx->b;
	ctx->b = t;
}
*/
/*
 * shabal_nextBlock: absorbs one full 64-byte message block.
 *
 * param ctx:   r24:r25
 * param block: r22:r23
 *
 * Differences in ordering compared with the C reference above:
 * the counter w is xored into a[0..1] and incremented in a single pass
 * before shabal_p is called. The xor uses the value from before the
 * increment, and shabal_p does not access w, so the outcome is the same.
 *
 * r24:r25 and r22:r23 are not written before the rcall, so shabal_p
 * receives the caller's ctx and block unchanged.
 *
 * The aliases below REDEFINE MB0/MB1 (and introduce CTX0/CTX1) for the
 * code that follows; shabal_p above was assembled with the earlier
 * values.
 */
MB0  = 14
MB1  = 15
CTX0 = 16
CTX1 = 17
.global shabal_nextBlock
shabal_nextBlock:
	push_range 14, 17
	movw CTX0, r24
	movw MB0, r22
	/* xor W into A and increment W */
	movw r30, CTX0
	ldi r19, 8
	/* carry = 1 is the "+1" that ripples through the 8 bytes of w;
	   eor, ld, st and dec do not modify the carry flag */
	sec
1:
	ld r20, Z
	ldd r21, Z+(8+4)
	eor r21, r20
	std Z+(8+4), r21
	adc r20, r1
	st Z+, r20
	dec r19
	brne 1b
	/* add block to ctx->b; Z is now ctx+8, where the b pointer lives */
	ld r26, Z+
	ld r27, Z
	movw r30, MB0
	ldi r19, 16
1:
	/* one 32-bit word per pass: add for the low byte, adc for the rest */
	ld r0, X
	ld r18, Z+
	add r0, r18
	st X+, r0
	ld r0, X
	ld r18, Z+
	adc r0, r18
	st X+, r0
	ld r0, X
	ld r18, Z+
	adc r0, r18
	st X+, r0
	ld r0, X
	ld r18, Z+
	adc r0, r18
	st X+, r0
	dec r19
	brne 1b
	/* call shabal_p */
	rcall shabal_p
	/* sub block from ctx->c; the c pointer lives at ctx+10 */
	movw r30, CTX0
	adiw r30, 8+2
	ld r26, Z+
	ld r27, Z
	movw r30, MB0
	ldi r19, 16
1:
	/* one 32-bit word per pass: sub for the low byte, sbc for the rest */
	ld r0, X
	ld r18, Z+
	sub r0, r18
	st X+, r0
	ld r0, X
	ld r18, Z+
	sbc r0, r18
	st X+, r0
	ld r0, X
	ld r18, Z+
	sbc r0, r18
	st X+, r0
	ld r0, X
	ld r18, Z+
	sbc r0, r18
	st X+, r0
	dec r19
	brne 1b
	/* exchange ctx->b with ctx->c (only the two pointers are swapped) */
	movw r30, CTX0
	ldd r22, Z+8
	ldd r23, Z+9
	ldd r24, Z+10
	ldd r25, Z+11
	std Z+10, r22
	std Z+11, r23
	std Z+8, r24
	std Z+9, r25
	pop_range 14, 17
	ret

/******************************************************************************/
/*
void shabal_lastBlock(shabal_ctx_t *ctx, const void *block, uint16_t length_b){
	uint8_t i,j;
	uint32_t *t;
	uint8_t buffer[64];
	while(length_b>=SHABAL_BLOCKSIZE){
		shabal_nextBlock(ctx, block);
		block = (uint8_t*)block + SHABAL_BLOCKSIZE_B;
		length_b -= SHABAL_BLOCKSIZE;
	}
	memset(buffer, 0, 64);
	memcpy(buffer, block, (length_b+7)/8);
	buffer[length_b/8] |= 0x80>>(length_b%8);
	for(i=0;i<16;++i){
		ctx->b[i] += ((uint32_t*)buffer)[i];
	}
	for(j=0; j<4;++j){
		ctx->a[0] ^= ctx->w.w32[0];
		ctx->a[1] ^= ctx->w.w32[1];
		shabal_p(ctx, buffer);
		t = ctx->c;
		ctx->c = ctx->b;
		ctx->b = t;
	}
}
*/
/*
 * Register aliases for shabal_lastBlock. I and LEN0 deliberately share
 * register 16: the length is no longer needed once the padded buffer
 * has been built, so the register is reused as the final-round counter.
 * CTX0/CTX1 and MB0/MB1 are redefined again for the code that follows.
 */
I    = 16
LEN0 = 16
LEN1 = 17
CTX0 = 14
CTX1 = 15
MB0  = 12
MB1  = 13
/*
 * shabal_lastBlock: absorbs the remaining message bits, pads them into
 * a 64-byte buffer on the stack and runs the four closing rounds.
 *
 * param ctx:      r24:r25
 * param block:    r22:r23
 * param length_b: r20:r21  (length in bits)
 *
 * As in the C reference, the bits of the last partial byte that lie
 * behind the inserted marker bit are taken from the input unchanged
 * (the marker is or-ed in, nothing is masked off).
 */
.global shabal_lastBlock
shabal_lastBlock:
	push_range 12, 17
	movw CTX0, r24
	movw MB0, r22
	movw LEN0, r20
1:
	/* while length_b >= 512 (high byte >= 2): consume one full block */
	cpi LEN1, 0x02
	brlo 2f
	movw r24, CTX0
	movw r22, MB0
	rcall shabal_nextBlock
	/* length_b -= 512; block += 64 */
	subi LEN1, 0x02
	ldi r18, 64
	add MB0, r18
	adc MB1, r1
	rjmp 1b
2:
	/* reserve 64 bytes of stack; the macro presumably leaves the new
	   stack pointer in Z, which the following adiw turns into the
	   address of the first buffer byte -- confirm in avr-asm-macros.S */
	stack_alloc_large 64
	adiw r30, 1 /* Z points at buffer */
	movw r26, MB0
	/* r24 = LEN/8 (number of complete message bytes); the first
	   lsr/ror pair shifts the 16-bit value, the result fits 8 bits */
	movw r24, LEN0
	lsr r25
	ror r24
	lsr r24
	lsr r24
	/* r25 = 63 - r24: zero bytes to append after the marker byte */
	ldi r25, 64-1
	sub r25, r24
	tst r24
	breq 32f
31:
	/* copy the complete message bytes */
	ld r0, X+
	st Z+, r0
	dec r24
	brne 31b
32:
	/* marker byte: 0x80 >> (LEN % 8), or-ed into the partial byte if
	   there is one, otherwise stored as a byte of its own */
	ldi r18, 0x80
	andi LEN0, 0x07
	breq append_0x80
	ld r0, X+
33:
	lsr r18
	dec LEN0
	brne 33b
	or r0, r18
	st Z+, r0
	rjmp append_zeros
append_0x80:
	st Z+, r18
append_zeros:
	/* fill the rest of the 64-byte buffer with zero (r1) */
	tst r25
	breq 4f
34:
	st Z+, r1
	dec r25
	brne 34b
4:
	/* Z is one past the buffer; step back 64 bytes (sbiw takes at
	   most 63) and use the buffer as the message block from here on */
	sbiw r30, 63
	sbiw r30, 1
	movw MB0, r30
	/* X = ctx->b (pointer stored at ctx+8) */
	movw r26, CTX0
	adiw r26, 8
	ld r24, X+
	ld r25, X
	movw r26, r24
	ldi r18, 16
41:
	/* ctx->b[k] += buffer[k], one 32-bit word per pass */
	ld r24, X
	ld r25, Z+
	add r24, r25
	st X+, r24
	ld r24, X
	ld r25, Z+
	adc r24, r25
	st X+, r24
	ld r24, X
	ld r25, Z+
	adc r24, r25
	st X+, r24
	ld r24, X
	ld r25, Z+
	adc r24, r25
	st X+, r24
	dec r18
	brne 41b
	/* final loop: four rounds, w is NOT incremented here */
	ldi I, 4
5:
	/* xor W into A */
	movw r30, CTX0
	ldi r19, 8
51:
	/* Z was post-incremented by the ld, hence the -1 in the
	   displacement used to reach the matching byte of a */
	ld r24, Z+
	ldd r25, Z+(8+4-1)
	eor r24, r25
	std Z+(8+4-1), r24
	dec r19
	brne 51b
	movw r24, CTX0
	movw r22, MB0
	rcall shabal_p
	/* exchange ctx->b with ctx->c (pointer swap) */
	movw r30, CTX0
	ldd r22, Z+8
	ldd r23, Z+9
	ldd r24, Z+10
	ldd r25, Z+11
	std Z+10, r22
	std Z+11, r23
	std Z+8, r24
	std Z+9, r25
	dec I
	brne 5b
	/* release the buffer and restore the saved registers */
	stack_free_large 64
	pop_range 12, 17
	ret