X-Git-Url: https://git.cryptolib.org/?p=avr-crypto-lib.git;a=blobdiff_plain;f=xtea%2Fxtea-asm.S;h=f8aac8cd1380adbdd55c1ac3f77a32adb7e03c84;hp=826f12386a4aab0d68241f4beef6370bc10e99be;hb=b246a2a0589f234db6247255555df98f4c281c41;hpb=35dc9566e40c9f68fa216c70eaa6d5b0597448fe diff --git a/xtea/xtea-asm.S b/xtea/xtea-asm.S index 826f123..f8aac8c 100644 --- a/xtea/xtea-asm.S +++ b/xtea/xtea-asm.S @@ -1,7 +1,7 @@ -/* xtea-asm.S */ +/* xtea-enc.S */ /* - This file is part of the AVR-Crypto-Lib. - Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de) + This file is part of the ARM-Crypto-Lib. + Copyright (C) 2006-2011 Daniel Otte (daniel.otte@rub.de) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,570 +16,221 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . */ -/* xtea-asm.S - * Author: Daniel Otte - * Date: 2006-06-06 - * License: GPLv3 or later - * Implementation of XTEA for AVR - * include xtea.h in your C-Project to use this functions. -*/ -V01 = 2 -V02 = 3 -V03 = 4 -V04 = 5 -V11 = 6 -V12 = 7 -V13 = 8 -V14 = 9 -Accu1 = 14 -Accu2 = 15 -Accu3 = 16 -Accu4 = 17 -Sum1 = 18 -Sum2 = 19 -Sum3 = 20 -Sum4 = 21 -Func1 = 22 -Func2 = 23 -Func3 = 24 -Func4 = 25 -C = 28 /* der kleine Zaehler fuer zwischendurch */ +#include "avr-asm-macros.S" + +B0 = 4 +B1 = 5 +B2 = 6 +B3 = 7 + +A0 = 8 +A1 = 9 +A2 = 10 +A3 = 11 + +V10 = 12 +V11 = 13 +V12 = 14 +V13 = 15 + +V00 = 16 +V01 = 17 +V02 = 18 +V03 = 19 + +S0 = 20 +S1 = 21 +S2 = 22 +S3 = 23 + +xchg_V0V1: + movw r26, V10 + movw V10, V00 + movw V00, r26 + movw r26, V12 + movw V12, V02 + movw V02, r26 + ret + +eor_AB: + eor A0, B0 + eor A1, B1 + eor A2, B2 + eor A3, B3 + ret + +g_func: + movw A0, V10 + movw A2, V12 + movw B0, V10 + movw B2, V12 + + ldi r24, 4 +10: + lsl A0 + rol A1 + rol A2 + rol A3 + dec r24 + brne 10b + + ldi r24, 5 +10: + lsr B3 + ror B2 + ror B1 + ror B0 + dec r24 + brne 10b + + rcall eor_AB + + add A0, V10 + adc A1, V11 + adc A2, V12 + adc A3, V13 + + ret + +sum_plus_k: + andi r24, (3<<2) + movw r26, r30 + add r26, r24 + adc r27, r1 + ld B0, X+ + ld B1, X+ + ld B2, X+ + ld B3, X+ + add B0, S0 + adc B1, S1 + adc B2, S2 + adc B3, S3 + rcall eor_AB + brtc 20f + add V00, A0 + adc V01, A1 + adc V02, A2 + adc V03, A3 + ret +20: sub V00, A0 + sbc V01, A1 + sbc V02, A2 + sbc V03, A3 + ret + +main1: + rcall g_func + mov r24, S0 + lsl r24 + lsl r24 + rcall sum_plus_k + ret + +main2: + rcall xchg_V0V1 + rcall g_func + mov r24, S1 + lsr r24 + rcall sum_plus_k + rcall xchg_V0V1 + ret .global xtea_enc -; == xtea_enc == -; xtea encrytion function -; param1: 16-bit pointer to destination for encrypted block -; given in r25,r24 -; param2: 16-bit pointer to the block (64-bit) which is to encrypt -; given in r23,r22 -; param3: 16-bit pointer to the key (128-bit) -; given in r21,r20 -; xtea_enc: - /* prolog */ - push r2 - push r3 - push r4 - push r5 - push r6 - push r7 - push r8 - push r9 - push r14 - push r15 - push r16 - push r17 - push r28 - - /* load the block */ - movw r26, r22 /* X points to block */ - movw r30, r20 /* Z points to key */ - ld V01, X+ - ld V02, X+ - ld V03, X+ - ld V04, X+ - ld V11, X+ - ld V12, X+ - ld V13, X+ - ld V14, X+ -; push r25 -; push r24 - movw r26, r24 /* X points to destination */ - - ldi Func1, 32 - mov r0, Func1 /* r0 is cycle-counter */ - clr Sum1 - clr Sum2 - movw Sum3, Sum1 - clt - -1: - movw Accu1, V11 - movw Accu3, V13 - ldi C, 4 -2: lsl Accu1 - rol Accu2 - rol Accu3 - rol Accu4 - dec C - brne 2b /* Accu == V1 << 4 */ - - movw Func1, V11 - movw Func3, V13 - ldi C, 5 -3: lsr Func4 - ror Func3 - ror Func2 - ror Func1 - dec C - brne 3b /* Func == V1 >> 5 */ - - eor Accu1, Func1 - eor Accu2, Func2 - eor Accu3, Func3 - eor Accu4, Func4 - add Accu1, V11 - adc Accu2, V12 - adc Accu3, V13 - adc Accu4, V14 /* Accu == ( (V1<<4)^(V1>>5) ) + V1 */ - - brtc 4f - mov C, Sum2 - lsr C - andi C,(0x03 <<2) - clt - rjmp 5f -4: - mov C, Sum1 /* calc key offset */ - andi C, 0x03 - lsl C - lsl C set - -5: - add r30, C - adc r31, r1 - ld Func1, Z - ldd Func2, Z+1 - ldd Func3, Z+2 - ldd Func4, Z+3 /* Func = key[sum & 3] */ - sub r30, C - sbci r31, 0 - add Func1, Sum1 - adc Func2, Sum2 - adc Func3, Sum3 - adc Func4, Sum4 - eor Accu1, Func1 - eor Accu2, Func2 - eor Accu3, Func3 - eor Accu4, Func4 /* Accu = ((V1<<4 ^ V1>>5) + V1) ^ (sum + key[sum&3]) */ - add Accu1, V01 - adc Accu2, V02 - adc Accu3, V03 - adc Accu4, V04 - - movw V01, V11 - movw V03, V13 - movw V11, Accu1 - movw V13, Accu3 - - /* sum += delta */ /* delta == 0x9E3779B9 */ - brtc 6f - ldi C, 0xB9 - add Sum1, C - ldi C, 0x79 - adc Sum2, C - ldi C, 0x37 - adc Sum3, C - ldi C, 0x9E - adc Sum4, C - rjmp 1b - -6: +xtea_intro: + clr r27 + ldi r26, 4 + ldi r30, 14 +10: + ld r0, X+ + push r0 + dec r30 + brne 10b + + push r24 + push r25 + movw r30, r20 +/* load block */ + movw r26, r22 + ld V00, X+ + ld V01, X+ + ld V02, X+ + ld V03, X+ + ld V10, X+ + ld V11, X+ + ld V12, X+ + ld V13, X+ + ldi r24, 32 + mov r0, r24 + brtc xtea_dec_start + clr S0 + clr S1 + movw S2, S0 + +10: + rcall main1 + subi S0, 0x47 + sbci S1, 0x86 + sbci S2, 0xC8 + sbci S3, 0x61 + rcall main2 + dec r0 - breq 7f - rjmp 1b - - 7: - /* write block back */ - ; pop r26 - ; pop r27 - st X+, V01 - st X+, V02 - st X+, V03 - st X+, V04 - st X+, V11 - st X+, V12 - st X+, V13 - st X+, V14 - - /* epilog */ - pop r28 - pop r17 - pop r16 - pop r15 - pop r14 - pop r9 - pop r8 - pop r7 - pop r6 - pop r5 - pop r4 - pop r3 - pop r2 - ret - -;#################################################################### - - /* #endif TWO_IN_ONE */ - - /* #ifdef TWO_IN_ONE */ - /* now we use the same base-structure for enc- and decryption - to indicate operation mode we use the highest bit of param3 (16 bit pointer to key), - this is ok, since even the larges atmel today has "only" 8k of ram, - but you shouldn't use this feature while using external ram. - */ -.global xtea_enc - ori r21, 0x80 - -.global xtea_dec -; == xtea_dec == -; xtea decrytion function -; param1: 16-bit pointer to destination for decrypted block -; given in r25,r24 -; param2: 16-bit pointer to the block (64-bit) which is to derypt -; given in r23,r22 -; param3: 16-bit pointer to the key (128-bit) -; given in r21,r20 -; -/* -void xtea_dec(uint32_t* dest, uint32_t* v, uint32_t* k) { - uint32_t v0=v[0], v1=v[1], i; - uint32_t sum=0xC6EF3720, delta=0x9E3779B9; - for(i=0; i<32; i++) { - v1 -= ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]); - sum -= delta; - v0 -= ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]); - } - dest[0]=v0; dest[1]=v1; -} -*/ + brne 10b + +/* store back */ +xtea_enc_exit: + pop r27 + pop r26 + st X+, V00 + st X+, V01 + st X+, V02 + st X+, V03 + st X+, V10 + st X+, V11 + st X+, V12 + st X+, V13 + + clr r27 + ldi r26, 18 + ldi r24, 14 +10: + pop r0 + st -X, r0 + dec r24 + brne 10b + ret + + +/******************************************************************************/ +/******************************************************************************/ +/******************************************************************************/ +/******************************************************************************/ +.global xtea_dec xtea_dec: - /* prolog */ - push r2 - push r3 - push r4 - push r5 - push r6 - push r7 - push r8 - push r9 - push r14 - push r15 - push r16 - push r17 - push r28 - /* load the block */ - movw r26, r22 /* Z points to block */ - movw r30, r20 /* X points to key */ - ld V01, X+ - ld V02, X+ - ld V03, X+ - ld V04, X+ - ld V11, X+ - ld V12, X+ - ld V13, X+ - ld V14, X+ - movw r26, r24 /* Z points to destination */ - - ldi Sum1, 32 - mov r0, Sum1 /* r1 is cycle-counter */ - ldi Sum1, 0x20 /* sum = 0xC6EF3720 */ - ldi Sum2, 0x37 - ldi Sum3, 0xEF - ldi Sum4, 0xC6 clt + rjmp xtea_intro +xtea_dec_start: + ldi S0, 0x20 /* sum = 0xC6EF3720 */ + ldi S1, 0x37 + ldi S2, 0xEF + ldi S3, 0xC6 + +10: + rcall main2 + subi S0, 0xB9 + sbci S1, 0x79 + sbci S2, 0x37 + sbci S3, 0x9E + rcall main1 -1: - movw Accu1, V01 - movw Accu3, V03 - ldi C, 4 -2: lsl Accu1 - rol Accu2 - rol Accu3 - rol Accu4 - dec C - brne 2b /* Accu == V0 << 4 */ - - movw Func1, V01 - movw Func3, V03 - ldi C, 5 -3: lsr Func4 - ror Func3 - ror Func2 - ror Func1 - dec C - brne 3b /* Func == V0 >> 5 */ - - eor Accu1, Func1 - eor Accu2, Func2 - eor Accu3, Func3 - eor Accu4, Func4 - add Accu1, V01 - adc Accu2, V02 - adc Accu3, V03 - adc Accu4, V04 /* Accu == ( (V0<<4)^(V0>>5) ) + V0 */ - - brts 4f - mov C, Sum2 - lsr C - andi C,(0x03 <<2) - set - rjmp 5f -4: - mov C, Sum1 /* calc key offset */ - andi C, 0x03 - lsl C - lsl C - clt - -5: - add r30, C - adc r31, r1 - ld Func1, Z - ldd Func2, Z+1 - ldd Func3, Z+2 - ldd Func4, Z+3 /* Func = key[sum & 3] */ - sub r30, C - sbci r31, 0 - add Func1, Sum1 - adc Func2, Sum2 - adc Func3, Sum3 - adc Func4, Sum4 - eor Accu1, Func1 - eor Accu2, Func2 - eor Accu3, Func3 - eor Accu4, Func4 /* Accu = ((V0<<4 ^ V0>>5) + V0) ^ (sum + key[sum&3]) */ - sub V11, Accu1 - sbc V12, Accu2 - sbc V13, Accu3 - sbc V14, Accu4 - - movw Accu1, V01 - movw Accu3, V03 - movw V01, V11 - movw V03, V13 - movw V11, Accu1 - movw V13, Accu3 - - /* sum += delta */ /* delta == 0x9E3779B9 */ - brtc 6f - subi Sum1, 0xB9 - sbci Sum2, 0x79 - sbci Sum3, 0x37 - sbci Sum4, 0x9E - rjmp 1b - -6: dec r0 - breq 7f - rjmp 1b - -7: - /* write block back */ - st X+, V01 - st X+, V02 - st X+, V03 - st X+, V04 - st X+, V11 - st X+, V12 - st X+, V13 - st X+, V14 - - /* epilog */ - pop r28 - pop r17 - pop r16 - pop r15 - pop r14 - pop r9 - pop r8 - pop r7 - pop r6 - pop r5 - pop r4 - pop r3 - pop r2 - ret - - /* #endif */ - -;#################################################################### - - #ifdef TWO_IN_ONE - /* now we use the same base-structure for enc- and decryption - to indicate operation mode we use the highest bit of param3 (16 bit pointer to key), - this is ok, since even the larges atmel today has "only" 8k of ram, - but you shouldn't use this feature while using external ram. - */ -.global xtea_enc - ori r21, 0x80 - -.global xtea_dec -; == xtea_dec == -; xtea decrytion function -; param1: 16-bit pointer to destination for decrypted block -; given in r25,r24 -; param2: 16-bit pointer to the block (64-bit) which is to derypt -; given in r23,r22 -; param3: 16-bit pointer to the key (128-bit) -; given in r21,r20 -; -/* -void xtea_dec(uint32_t* dest, uint32_t* v, uint32_t* k) { - uint32_t v0=v[0], v1=v[1], i; - uint32_t sum=0xC6EF3720, delta=0x9E3779B9; - for(i=0; i<32; i++) { - v1 -= ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]); - sum -= delta; - v0 -= ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]); - } - dest[0]=v0; dest[1]=v1; -} -*/ + brne 10b +/* store back */ + rjmp xtea_enc_exit -xtea_dec: - /* prolog */ - push r2 - push r3 - push r4 - push r5 - push r6 - push r7 - push r8 - push r9 - push r14 - push r15 - push r16 - push r17 - push r28 - /* set T-bit if we are going to encrypt, clear otherwise */ - bst r21, 7 - andi r21, 0x7f /* fix r21:r22 to a real addr */ - /* load the block */ - movw r26, r22 /* Z points to block */ - movw r30, r20 /* X points to key */ - ld V01, X+ - ld V02, X+ - ld V03, X+ - ld V04, X+ - ld V11, X+ - ld V12, X+ - ld V13, X+ - ld V14, X+ - movw r26, r24 /* Z points to destination */ - - ldi Sum1, 32 - mov r0, Sum1 /* r1 is cycle-counter */ - ldi Sum1, 0x20 /* sum = 0xC6EF3720 */ - ldi Sum2, 0x37 - ldi Sum3, 0xEF - ldi Sum4, 0xC6 - clt -1: - movw Accu1, V01 - movw Accu3, V03 - ldi C, 4 -2: lsl Accu1 - rol Accu2 - rol Accu3 - rol Accu4 - dec C - brne 2b /* Accu == V0 << 4 */ - - movw Func1, V01 - movw Func3, V03 - ldi C, 5 -3: lsr Func4 - ror Func3 - ror Func2 - ror Func1 - dec C - brne 3b /* Func == V0 >> 5 */ - - eor Accu1, Func1 - eor Accu2, Func2 - eor Accu3, Func3 - eor Accu4, Func4 - add Accu1, V01 - adc Accu2, V02 - adc Accu3, V03 - adc Accu4, V04 /* Accu == ( (V0<<4)^(V0>>5) ) + V0 */ - - brts 4f - mov C, Sum2 - lsr C - andi C,(0x03 <<2) - set - rjmp 5f -4: - mov C, Sum1 /* calc key offset */ - andi C, 0x03 - lsl C - lsl C - clt - -5: - add r30, C - adc r31, r1 - ld Func1, Z - ldd Func2, Z+1 - ldd Func3, Z+2 - ldd Func4, Z+3 /* Func = key[sum & 3] */ - sub r30, C - sbci r31, 0 - add Func1, Sum1 - adc Func2, Sum2 - adc Func3, Sum3 - adc Func4, Sum4 - eor Accu1, Func1 - eor Accu2, Func2 - eor Accu3, Func3 - eor Accu4, Func4 /* Accu = ((V0<<4 ^ V0>>5) + V0) ^ (sum + key[sum&3]) */ - sub V11, Accu1 - sbc V12, Accu2 - sbc V13, Accu3 - sbc V14, Accu4 - - movw Accu1, V01 - movw Accu3, V03 - movw V01, V11 - movw V03, V13 - movw V11, Accu1 - movw V13, Accu3 - - /* sum += delta */ /* delta == 0x9E3779B9 */ - brtc 6f - subi Sum1, 0xB9 - sbci Sum2, 0x79 - sbci Sum3, 0x37 - sbci Sum4, 0x9E - rjmp 1b - -6: - dec r0 - breq 7f - rjmp 1b - -7: - /* write block back */ - st X+, V01 - st X+, V02 - st X+, V03 - st X+, V04 - st X+, V11 - st X+, V12 - st X+, V13 - st X+, V14 - - /* epilog */ - pop r28 - pop r17 - pop r16 - pop r15 - pop r14 - pop r9 - pop r8 - pop r7 - pop r6 - pop r5 - pop r4 - pop r3 - pop r2 - ret - - #endif