From: bg Date: Mon, 30 Mar 2009 23:42:48 +0000 (+0000) Subject: threefish decryption (256, 512 and 1024 bit) in assembler X-Git-Url: https://git.cryptolib.org/?p=avr-crypto-lib.git;a=commitdiff_plain;h=877bd61c0f8df66c000d0f9f35d87cddc5dd0957 threefish decryption (256, 512 and 1024 bit) in assembler --- diff --git a/host/optimize_shift.rb b/host/optimize_shift.rb index e667697..0cc277e 100644 --- a/host/optimize_shift.rb +++ b/host/optimize_shift.rb @@ -74,9 +74,27 @@ for i in -3..4 printf("%+d: %4d\n", i, ss_hist[i]) end +puts "\ntransformed:" (0..shift_values.length-1).each{|i| + puts " for 256 bit:" if i==0 + puts " for 512 bit:" if i==16 + puts " for 1024 bit:" if i==16+32 + a = transform_shift(shift_values[i]) a[0] = transform_singleshift(a[0]) printf("0x%01x%01x, ", a[1], a[0]) puts("") if (i%8==7) } + + +puts "\ntransformed (decryption):" +(0..shift_values.length-1).each{|i| + puts " for 256 bit:" if i==0 + puts " for 512 bit:" if i==16 + puts " for 1024 bit:" if i==16+32 + + a = transform_shift(shift_values[(i/8)*8+7-(i%8)]) + a[0] = transform_singleshift(a[0]) + printf("0x%01x%01x, ", a[1], a[0]) + puts("") if (i%8==7) +} diff --git a/mkfiles/threefish.mk b/mkfiles/threefish.mk index d46769e..ef68ed4 100644 --- a/mkfiles/threefish.mk +++ b/mkfiles/threefish.mk @@ -6,8 +6,8 @@ BLOCK_CIPHERS += $(ALGO_NAME) $(ALGO_NAME)_OBJ := threefish256_enc_asm.o threefish512_enc_asm.o threefish1024_enc_asm.o\ - threefish_mix.o threefish_mix_4c.o threefish_invmix_c.o\ - threefish256_dec.o threefish512_dec.o threefish1024_dec.o + threefish_mix.o threefish_invmix.o \ + threefish256_dec_asm.o threefish512_dec_asm.o threefish1024_dec_asm.o $(ALGO_NAME)_TEST_BIN := main-threefish-test.o debug.o uart.o hexdigit_tab.o \ nessie_bc_test.o dbz_strings.o nessie_common.o cli.o string-extras.o performance_test.o $(ALGO_NAME)_NESSIE_TEST := test nessie diff --git a/test_src/main-threefish-test.c b/test_src/main-threefish-test.c index e40e59a..668eca0 100644 --- 
a/test_src/main-threefish-test.c +++ b/test_src/main-threefish-test.c @@ -125,6 +125,10 @@ void testrun_stdtest_threefish256(void){ threefish256_enc(data, &ctx); cli_putstr_P(PSTR("\r\ncipher: ")); cli_hexdump_block(data, 32, 4, 16); + cli_putstr_P(PSTR("\r\ndecipher: ")); + threefish256_dec(data, &ctx); + cli_hexdump_block(data, 32, 4, 16); + /* second test */ for(i=0; i<32; ++i){ key[i] = 0x10+i; @@ -143,6 +147,9 @@ void testrun_stdtest_threefish256(void){ threefish256_enc(data, &ctx); cli_putstr_P(PSTR("\r\ncipher: ")); cli_hexdump_block(data, 32, 4, 16); + cli_putstr_P(PSTR("\r\ndecipher: ")); + threefish256_dec(data, &ctx); + cli_hexdump_block(data, 32, 4, 16); } void testrun_stdtest_threefish512(void){ @@ -168,6 +175,10 @@ void testrun_stdtest_threefish512(void){ threefish512_enc(data, &ctx); cli_putstr_P(PSTR("\r\ncipher: ")); cli_hexdump_block(data, 64, 4, 16); + threefish512_dec(data, &ctx); + cli_putstr_P(PSTR("\r\ndecipher: ")); + cli_hexdump_block(data, 64, 4, 16); + for(i=0; i<64; ++i){ key[i] = 0x10+i; @@ -188,6 +199,10 @@ void testrun_stdtest_threefish512(void){ threefish512_enc(data, &ctx); cli_putstr_P(PSTR("\r\ncipher: ")); cli_hexdump_block(data, 64, 4, 16); + threefish512_dec(data, &ctx); + cli_putstr_P(PSTR("\r\ndecipher: ")); + cli_hexdump_block(data, 64, 4, 16); + } void testrun_stdtest_threefish1024(void){ @@ -211,6 +226,9 @@ void testrun_stdtest_threefish1024(void){ threefish1024_enc(data, &ctx); cli_putstr_P(PSTR("\r\ncipher: ")); cli_hexdump_block(data, 128, 4, 16); + threefish1024_dec(data, &ctx); + cli_putstr_P(PSTR("\r\ndecipher: ")); + cli_hexdump_block(data, 128, 4, 16); for(i=0; i<128; ++i){ key[i] = 0x10+i; @@ -229,6 +247,9 @@ void testrun_stdtest_threefish1024(void){ threefish1024_enc(data, &ctx); cli_putstr_P(PSTR("\r\ncipher: ")); cli_hexdump_block(data, 128, 4, 16); + threefish1024_dec(data, &ctx); + cli_putstr_P(PSTR("\r\ndecipher: ")); + cli_hexdump_block(data, 128, 4, 16); } @@ -268,6 +289,12 @@ void 
testrun_performance_threefish256(void){ ultoa((unsigned long)t, str, 10); cli_putstr(str); + startTimer(1); + threefish256_dec(data, &ctx); + t = stopTimer(); + cli_putstr_P(PSTR("\r\n\tdecrypt time: ")); + ultoa((unsigned long)t, str, 10); + cli_putstr(str); cli_putstr_P(PSTR("\r\n")); } @@ -301,6 +328,13 @@ void testrun_performance_threefish512(void){ ultoa((unsigned long)t, str, 10); cli_putstr(str); + startTimer(1); + threefish512_dec(data, &ctx); + t = stopTimer(); + cli_putstr_P(PSTR("\r\n\tdecrypt time: ")); + ultoa((unsigned long)t, str, 10); + cli_putstr(str); + cli_putstr_P(PSTR("\r\n")); } @@ -334,6 +368,13 @@ void testrun_performance_threefish1024(void){ ultoa((unsigned long)t, str, 10); cli_putstr(str); + startTimer(1); + threefish1024_dec(data, &ctx); + t = stopTimer(); + cli_putstr_P(PSTR("\r\n\tdecrypt time: ")); + ultoa((unsigned long)t, str, 10); + cli_putstr(str); + cli_putstr_P(PSTR("\r\n")); } diff --git a/threefish1024_dec_asm.S b/threefish1024_dec_asm.S new file mode 100644 index 0000000..5743829 --- /dev/null +++ b/threefish1024_dec_asm.S @@ -0,0 +1,474 @@ +/* threefish1024_dec_asm.S */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2009 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ +/* + * \author Daniel Otte + * \email daniel.otte@rub.de + * \date 2009-03-24 + * \license GPLv3 or later + */ + +#include "avr-asm-macros.S" + +/******************************************************************************/ +/* +void permute_inv16(void* data){ + uint64_t t; + t = X(15); + X(15) = X(7); + X(7) = X(9); + X(9) = X(1); + X(1) = t; + t = X(11); + X(11) = X(5); + X(5) = X(13); + X(13) = X(3); + X(3) = t; + t = X(4); + X(4) = X(6); + X(6) = t; + t = X(14); + X(14) = X(12); + X(12) = X(10); + X(10) = X(8); + X(8) = t; +} +void add_key_16(void* data, const threefish1024_ctx_t* ctx, uint8_t s){ + uint8_t i; + for(i=0; i<13; ++i){ + X(i) -= ctx->k[(s+i)%17]; + } + X(13) -= ctx->k[(s+13)%17] + ctx->t[s%3]; + X(14) -= ctx->k[(s+14)%17] + ctx->t[(s+1)%3]; + X(15) -= ctx->k[(s+15)%17] + s; +} +void threefish1024_dec(void* data, const threefish1024_ctx_t* ctx){ + uint8_t i=0,s=20; + uint8_t r0[8] = {0x69, 0x72, 0x21, 0x34, 0x42, 0x41, 0x31, 0x79}; + uint8_t r1[8] = {0x61, 0x19, 0x1a, 0x19, 0x53, 0x10, 0x31, 0x53}; + uint8_t r2[8] = {0x33, 0x40, 0x22, 0x69, 0x31, 0x22, 0x6a, 0x5b}; + uint8_t r3[8] = {0x72, 0x6b, 0x31, 0x60, 0x74, 0x71, 0x2b, 0x50}; + uint8_t r4[8] = {0x5b, 0x23, 0x53, 0x63, 0x54, 0x3b, 0x2a, 0x20}; + uint8_t r5[8] = {0x60, 0x22, 0x52, 0x11, 0x11, 0x14, 0x2b, 0x3a}; + uint8_t r6[8] = {0x7b, 0x02, 0x50, 0x43, 0x73, 0x40, 0x64, 0x5a}; + uint8_t r7[8] = {0x70, 0x70, 0x29, 0x51, 0x42, 0x7a, 0x71, 0x14}; + + do{ + if(i%4==0){ + add_key_16(data, ctx, s); + --s; + } + permute_inv16(data); + threefish_invmix((uint8_t*)data + 0, r0[i%8]); + threefish_invmix((uint8_t*)data + 16, r1[i%8]); + threefish_invmix((uint8_t*)data + 32, r2[i%8]); + threefish_invmix((uint8_t*)data + 48, r3[i%8]); + threefish_invmix((uint8_t*)data + 64, r4[i%8]); + threefish_invmix((uint8_t*)data + 80, r5[i%8]); + threefish_invmix((uint8_t*)data + 96, r6[i%8]); + threefish_invmix((uint8_t*)data +112, r7[i%8]); + ++i; + }while(i!=80); + add_key_16(data, ctx, s); +} +*/ +I = 2 +S = 
3 +DATA0 = 4 +DATA1 = 5 +CTX0 = 6 +CTX1 = 7 +IDX0 = 8 +IDX1 = 9 +IDX2 = 10 +IDX3 = 11 +IDX4 = 12 +IDX5 = 13 +IDX6 = 14 +IDX7 = 15 + +/* + * param data: r24:r25 + * param ctx: r22:r23 + */ +.global threefish1024_dec +threefish1024_dec: + push r28 + push r29 + push_range 2, 17 + movw DATA0, r24 + movw CTX0, r22 + clr I + ldi r26, 20 + mov S, r26 +1: + mov r30, I + andi r30, 0x03 + breq 2f + rjmp 4f +2: + ldi r30, lo8(threefish1024_slut17) + ldi r31, hi8(threefish1024_slut17) + add r30, S + adc r31, r1 + lpm IDX0, Z+ + lpm IDX1, Z+ + lpm IDX2, Z+ + lpm IDX3, Z+ + lpm IDX4, Z+ + lpm IDX5, Z+ + lpm IDX6, Z+ + lpm IDX7, Z + movw r30, CTX0 + movw r26, DATA0 + add r30, IDX0 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX1 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX2 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX3 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX4 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX5 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX6 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX7 + adc r31, r1 + rcall sub_z_from_x8 + /* second half */ + ldi r30, lo8(threefish1024_slut17) + ldi r31, hi8(threefish1024_slut17) + add r30, S + adc r31, r1 + adiw r30, 8 + lpm IDX0, Z+ + lpm IDX1, Z+ + lpm IDX2, Z+ + lpm IDX3, Z+ + lpm IDX4, Z+ + lpm IDX5, Z+ + lpm IDX6, Z+ + lpm IDX7, Z + movw r30, CTX0 + add r30, IDX0 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX1 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX2 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX3 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX4 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX5 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX6 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX7 + adc r31, r1 + rcall sub_z_from_x8 
+ /* now the remaining key */ + sbiw r26, 3*8 + ldi r30, lo8(threefish1024_slut3) + ldi r31, hi8(threefish1024_slut3) + add r30, S + adc r31, r1 + lpm IDX0, Z+ + lpm IDX1, Z + movw r30, CTX0 + adiw r30, 7*8 /* make Z pointing to (extended tweak) */ + adiw r30, 7*8 + adiw r30, 3*8 + movw IDX2, r30 + add r30, IDX0 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, IDX2 + add r30, IDX1 + adc r31, r1 + rcall sub_z_from_x8 + ld r0, X + sub r0, S + st X+, r0 + ld r0, X + sbc r0, r1 + st X+, r0 + ld r0, X + sbc r0, r1 + st X+, r0 + ld r0, X + sbc r0, r1 + st X+, r0 + ld r0, X + sbc r0, r1 + st X+, r0 + ld r0, X + sbc r0, r1 + st X+, r0 + ld r0, X + sbc r0, r1 + st X+, r0 + ld r0, X + sbc r0, r1 + st X+, r0 + tst S + brne 3f +exit: + pop_range 2, 17 + pop r29 + pop r28 + ret +3: + dec S +4: + /* now the permutation */ + movw r26, DATA0 /* X1 <-> X15 */ + adiw r26, 1*8 + movw r30, DATA0 + adiw r30, 7*8+4 + adiw r30, 7*8+4 + rcall xchg_zx8 + movw r26, DATA0 /* X15 <-> X7 */ + adiw r26, 7*8+4 + adiw r26, 7*8+4 + movw r30, DATA0 + adiw r30, 7*8 + rcall xchg_zx8 + movw r26, DATA0 /* X9 <-> X7 */ + adiw r26, 7*8 + adiw r26, 2*8 + movw r30, DATA0 + adiw r30, 7*8 + rcall xchg_zx8 + /* --- */ + movw r26, DATA0 /* X3 <-> X11 */ + adiw r26, 3*8 + movw r30, DATA0 + adiw r30, 7*8 + adiw r30, 4*8 + rcall xchg_zx8 + movw r26, DATA0 /* X11 <-> X5 */ + adiw r26, 7*8 + adiw r26, 4*8 + movw r30, DATA0 + adiw r30, 5*8 + rcall xchg_zx8 + movw r26, DATA0 /* X13 <-> X5 */ + adiw r26, 7*8 + adiw r26, 6*8 + movw r30, DATA0 + adiw r30, 5*8 + rcall xchg_zx8 + /* --- */ + movw r26, DATA0 /* X8 <-> X14 */ + adiw r26, 7*8 + adiw r26, 1*8 + movw r30, DATA0 + adiw r30, 7*8 + adiw r30, 7*8 + rcall xchg_zx8 + movw r26, DATA0 /* X14 <-> X12 */ + adiw r26, 7*8 + adiw r26, 7*8 + movw r30, DATA0 + adiw r30, 7*8 + adiw r30, 5*8 + rcall xchg_zx8 + movw r26, DATA0 /* X10 <-> X12 */ + adiw r26, 7*8 + adiw r26, 3*8 + movw r30, DATA0 + adiw r30, 7*8 + adiw r30, 5*8 + rcall xchg_zx8 + /* --- */ + movw r26, DATA0 /* X4 
<-> X6 */ + adiw r26, 4*8 + movw r30, DATA0 + adiw r30, 6*8 + rcall xchg_zx8 + + /* call mix */ + ldi r30, lo8(threefish1024_rc0) + ldi r31, hi8(threefish1024_rc0) + mov r26, I + andi r26, 0x07 + add r30, r26 + adc r31, r1 + lpm r22, Z + adiw r30, 8 + lpm IDX0, Z + adiw r30, 8 + lpm IDX1, Z + adiw r30, 8 + lpm IDX2, Z + adiw r30, 8 + lpm IDX3, Z + adiw r30, 8 + lpm IDX4, Z + adiw r30, 8 + lpm IDX5, Z + adiw r30, 8 + lpm IDX6, Z + push IDX6 + push IDX5 + push IDX4 + push IDX3 + push IDX2 + + movw r24, DATA0 + call threefish_invmix_asm /* no rcall? */ + movw r24, DATA0 + adiw r24, 16 + mov r22, IDX0 + call threefish_invmix_asm /* no rcall? */ + movw r24, DATA0 + adiw r24, 32 + mov r22, IDX1 + call threefish_invmix_asm /* no rcall? */ + movw r24, DATA0 + adiw r24, 48 + pop r22 + call threefish_invmix_asm /* no rcall? */ + movw r24, DATA0 + adiw r24, 63 + adiw r24, 1 + pop r22 + call threefish_invmix_asm /* no rcall? */ + movw r24, DATA0 + adiw r24, 63 + adiw r24, 17 + pop r22 + call threefish_invmix_asm /* no rcall? */ + movw r24, DATA0 + adiw r24, 63 + adiw r24, 33 + pop r22 + call threefish_invmix_asm /* no rcall? */ + movw r24, DATA0 + adiw r24, 63 + adiw r24, 49 + pop r22 + call threefish_invmix_asm /* no rcall? 
*/ + inc I +9: + rjmp 1b + +threefish1024_slut17: + .byte 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38 + .byte 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 + .byte 0x80, 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30 + .byte 0x38, 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70 + .byte 0x78, 0x80, 0x00, 0x08, 0x10 +threefish1024_slut3: + .byte 0x00, 0x08, 0x10, 0x00, 0x08, 0x10, 0x00, 0x08 + .byte 0x10, 0x00, 0x08, 0x10, 0x00, 0x08, 0x10, 0x00 + .byte 0x08, 0x10, 0x00, 0x08, 0x10, 0x00, 0x08, 0x10 + .byte 0x00 + +threefish1024_rc0: .byte 0x69, 0x72, 0x21, 0x34, 0x42, 0x41, 0x31, 0x79 +threefish1024_rc1: .byte 0x61, 0x19, 0x1a, 0x19, 0x53, 0x10, 0x31, 0x53 +threefish1024_rc2: .byte 0x33, 0x40, 0x22, 0x69, 0x31, 0x22, 0x6a, 0x5b +threefish1024_rc3: .byte 0x72, 0x6b, 0x31, 0x60, 0x74, 0x71, 0x2b, 0x50 +threefish1024_rc4: .byte 0x5b, 0x23, 0x53, 0x63, 0x54, 0x3b, 0x2a, 0x20 +threefish1024_rc5: .byte 0x60, 0x22, 0x52, 0x11, 0x11, 0x14, 0x2b, 0x3a +threefish1024_rc6: .byte 0x7b, 0x02, 0x50, 0x43, 0x73, 0x40, 0x64, 0x5a +threefish1024_rc7: .byte 0x70, 0x70, 0x29, 0x51, 0x42, 0x7a, 0x71, 0x14 + +sub_z_from_x8: + ld r0, Z+ + ld r1, X + sub r1, r0 + st X+, r1 + ld r0, Z+ + ld r1, X + sbc r1, r0 + st X+, r1 + ld r0, Z+ + ld r1, X + sbc r1, r0 + st X+, r1 + ld r0, Z+ + ld r1, X + sbc r1, r0 + st X+, r1 + ld r0, Z+ + ld r1, X + sbc r1, r0 + st X+, r1 + ld r0, Z+ + ld r1, X + sbc r1, r0 + st X+, r1 + ld r0, Z+ + ld r1, X + sbc r1, r0 + st X+, r1 + ld r0, Z+ + ld r1, X + sbc r1, r0 + st X+, r1 + clr r1 + ret + +T0 = IDX0 +T1 = 0 +CNT = 24 +xchg_zx8: + ldi CNT, 8 +1: ld T0, X + ld T1, Z + st X+, T1 + st Z+, T0 + dec CNT + brne 1b + ret + + + diff --git a/threefish256_dec_asm.S b/threefish256_dec_asm.S new file mode 100644 index 0000000..e55ca68 --- /dev/null +++ b/threefish256_dec_asm.S @@ -0,0 +1,279 @@ +/* threefish256_dec_asm.S */ +/* + This file is part of the AVR-Crypto-Lib. 
+ Copyright (C) 2009 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ +/* + * \author Daniel Otte + * \email daniel.otte@rub.de + * \date 2009-03-16 + * \license GPLv3 or later + */ + +#include "avr-asm-macros.S" + +/******************************************************************************/ +/* +void permute_4(void* data){ + uint64_t t; + t = X(1); + X(1) = X(3); + X(3) = t; +} +void add_key_4(void* data, const threefish256_ctx_t* ctx, uint8_t s){ + X(0) -= ctx->k[(s+0)%5]; + X(1) -= ctx->k[(s+1)%5] + ctx->t[s%3]; + X(2) -= ctx->k[(s+2)%5] + ctx->t[(s+1)%3]; + X(3) -= ctx->k[(s+3)%5] + s; +} +void threefish256_dec(void* data, const threefish256_ctx_t* ctx){ + uint8_t i=0,s=18; + uint8_t r0[8] = {0x73, 0x13, 0x7b, 0x32, 0x72, 0x2b, 0x44, 0x1b}; + uint8_t r1[8] = {0x62, 0x52, 0x43, 0x24, 0x54, 0x6a, 0x34, 0x70}; + do{ + if(i%4==0){ + add_key_4(data, ctx, s); + --s; + } + permute_4(data); + threefish_invmix(data, r0[i%8]); + threefish_invmix((uint8_t*)data + 16, r1[i%8]); + ++i; + }while(i!=72); + add_key_4(data, ctx, s); +} +*/ +I = 2 +S = 3 +DATA0 = 4 +DATA1 = 5 +CTX0 = 6 +CTX1 = 7 +IDX0 = 8 +IDX1 = 9 +IDX2 = 10 +IDX3 = 11 +/* + * param data: r24:r25 + * param ctx: r22:r23 + */ +.global threefish256_dec +threefish256_dec: + push r28 + push r29 + push_range 2, 17 + movw DATA0, r24 + movw CTX0, r22 + clr I + ldi r26, 18 + mov S, r26 +1: + mov r30, I + andi r30, 0x03 + breq 2f + rjmp 
4f +2: + ldi r30, lo8(threefish256_slut5) + ldi r31, hi8(threefish256_slut5) + add r30, S + adc r31, r1 + lpm IDX0, Z+ + lpm IDX1, Z+ + lpm IDX2, Z+ + lpm IDX3, Z + movw r30, CTX0 + movw r26, DATA0 + add r30, IDX0 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX1 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX2 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX3 + adc r31, r1 + rcall sub_z_from_x8 + + /* now the remaining key */ + sbiw r26, 3*8 + ldi r30, lo8(threefish256_slut3) + ldi r31, hi8(threefish256_slut3) + add r30, S + adc r31, r1 + lpm IDX0, Z+ + lpm IDX1, Z + movw r30, CTX0 + adiw r30, 5*8 + movw IDX2, r30 + add r30, IDX0 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, IDX2 + add r30, IDX1 + adc r31, r1 + rcall sub_z_from_x8 + ld r0, X + sub r0, S /* X(3) -= s: 64-bit subtract, low byte first; r1 is zero here */ + st X+, r0 + ld r0, X + sbc r0, r1 + st X+, r0 + ld r0, X + sbc r0, r1 + st X+, r0 + ld r0, X + sbc r0, r1 + st X+, r0 + ld r0, X + sbc r0, r1 + st X+, r0 + ld r0, X + sbc r0, r1 + st X+, r0 + ld r0, X + sbc r0, r1 + st X+, r0 + ld r0, X + sbc r0, r1 /* top byte takes the borrow like the other bytes */ + st X+, r0 + tst S + brne 3f +exit: + pop_range 2, 17 + pop r29 + pop r28 + ret +3: + dec S +4: + /* now the permutation */ + movw r26, DATA0 + adiw r26, 8 + movw r30, r26 + adiw r30, 16 + ld IDX0, X + ld IDX1, Z + st X+, IDX1 + st Z+, IDX0 + ld IDX0, X + ld IDX1, Z + st X+, IDX1 + st Z+, IDX0 + ld IDX0, X + ld IDX1, Z + st X+, IDX1 + st Z+, IDX0 + ld IDX0, X + ld IDX1, Z + st X+, IDX1 + st Z+, IDX0 + ld IDX0, X + ld IDX1, Z + st X+, IDX1 + st Z+, IDX0 + ld IDX0, X + ld IDX1, Z + st X+, IDX1 + st Z+, IDX0 + ld IDX0, X + ld IDX1, Z + st X+, IDX1 + st Z+, IDX0 + ld IDX0, X + ld IDX1, Z + st X+, IDX1 + st Z+, IDX0 + /* call mix */ + ldi r30, lo8(threefish256_rc0) + ldi r31, hi8(threefish256_rc0) + mov r26, I + andi r26, 0x07 + add r30, r26 + adc r31, r1 + lpm r22, Z + adiw r30, 8 + lpm IDX0, Z + movw r24, DATA0 + call threefish_invmix_asm /* no rcall? 
*/ + movw r24, DATA0 + adiw r24, 16 + mov r22, IDX0 + call threefish_invmix_asm /* no rcall? */ + inc I + rjmp 1b + +threefish256_slut5: + .byte 0x00, 0x08, 0x10, 0x18, 0x20, 0x00, 0x08, 0x10 + .byte 0x18, 0x20, 0x00, 0x08, 0x10, 0x18, 0x20, 0x00 + .byte 0x08, 0x10, 0x18, 0x20, 0x00, 0x08, 0x10 +threefish256_slut3: + .byte 0x00, 0x08, 0x10, 0x00, 0x08, 0x10, 0x00, 0x08 + .byte 0x10, 0x00, 0x08, 0x10, 0x00, 0x08, 0x10, 0x00 + .byte 0x08, 0x10, 0x00, 0x08, 0x10, 0x00, 0x08 + +threefish256_rc0: .byte 0x73, 0x13, 0x7b, 0x32, 0x72, 0x2b, 0x44, 0x1b +threefish256_rc1: .byte 0x62, 0x52, 0x43, 0x24, 0x54, 0x6a, 0x34, 0x70 + +sub_z_from_x8: + ld r0, Z+ + ld r1, X + sub r1, r0 + st X+, r1 + ld r0, Z+ + ld r1, X + sbc r1, r0 + st X+, r1 + ld r0, Z+ + ld r1, X + sbc r1, r0 + st X+, r1 + ld r0, Z+ + ld r1, X + sbc r1, r0 + st X+, r1 + ld r0, Z+ + ld r1, X + sbc r1, r0 + st X+, r1 + ld r0, Z+ + ld r1, X + sbc r1, r0 + st X+, r1 + ld r0, Z+ + ld r1, X + sbc r1, r0 + st X+, r1 + ld r0, Z+ + ld r1, X + sbc r1, r0 + st X+, r1 + clr r1 + ret + + + + + + + + + + diff --git a/threefish512_dec_asm.S b/threefish512_dec_asm.S new file mode 100644 index 0000000..5ac9c0d --- /dev/null +++ b/threefish512_dec_asm.S @@ -0,0 +1,330 @@ +/* threefish512_dec_asm.S */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2009 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ +/* + * \author Daniel Otte + * \email daniel.otte@rub.de + * \date 2009-03-24 + * \license GPLv3 or later + */ + +#include "avr-asm-macros.S" + +/******************************************************************************/ +/* +#define X(a) (((uint64_t*)data)[(a)]) + + +static +void permute_inv8(void* data){ + uint64_t t; + t = X(6); + X(6) = X(4); + X(4) = X(2); + X(2) = X(0); + X(0) = t; + t = X(7); + X(7) = X(3); + X(3) = t; +} + +static +void add_key_8(void* data, const threefish512_ctx_t* ctx, uint8_t s){ + uint8_t i; + for(i=0; i<5; ++i){ + X(i) -= ctx->k[(s+i)%9]; + } + X(5) -= ctx->k[(s+5)%9] + ctx->t[s%3]; + X(6) -= ctx->k[(s+6)%9] + ctx->t[(s+1)%3]; + X(7) -= ctx->k[(s+7)%9] + s; +} + +void threefish512_dec(void* data, const threefish512_ctx_t* ctx){ + uint8_t i=0,s=18; + uint8_t r0[8] = {0x41, 0x4b, 0x59, 0x41, 0x32, 0x42, 0x60, 0x5a}; + uint8_t r1[8] = {0x63, 0x32, 0x33, 0x61, 0x14, 0x2a, 0x24, 0x4a}; + uint8_t r2[8] = {0x59, 0x13, 0x51, 0x10, 0x72, 0x29, 0x53, 0x62}; + uint8_t r3[8] = {0x43, 0x11, 0x2a, 0x52, 0x19, 0x33, 0x49, 0x7b}; + do{ + if(i%4==0){ + add_key_8(data, ctx, s); + --s; + } + permute_inv8(data); + threefish_invmix((uint8_t*)data + 0, r0[i%8]); + threefish_invmix((uint8_t*)data + 16, r1[i%8]); + threefish_invmix((uint8_t*)data + 32, r2[i%8]); + threefish_invmix((uint8_t*)data + 48, r3[i%8]); + ++i; + }while(i!=72); + add_key_8(data, ctx, s); +} +*/ +I = 2 +S = 3 +DATA0 = 4 +DATA1 = 5 +CTX0 = 6 +CTX1 = 7 +IDX0 = 8 +IDX1 = 9 +IDX2 = 10 +IDX3 = 11 +IDX4 = 12 +IDX5 = 13 +IDX6 = 14 +IDX7 = 15 +/* + * param data: r24:r25 + * param ctx: r22:r23 + */ +.global threefish512_dec +threefish512_dec: + push r28 + push r29 + push_range 2, 17 + movw DATA0, r24 + movw CTX0, r22 + clr I + ldi r26, 18 + mov S, r26 +1: + mov r30, I + andi r30, 0x03 + breq 2f + rjmp 4f +2: + ldi r30, lo8(threefish512_slut9) + ldi r31, hi8(threefish512_slut9) + add r30, S + adc r31, r1 + lpm IDX0, Z+ + lpm IDX1, Z+ + lpm IDX2, Z+ + lpm IDX3, Z+ + lpm IDX4, Z+ + lpm 
IDX5, Z+ + lpm IDX6, Z+ + lpm IDX7, Z + movw r30, CTX0 + movw r26, DATA0 + add r30, IDX0 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX1 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX2 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX3 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX4 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX5 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX6 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, CTX0 + add r30, IDX7 + adc r31, r1 + rcall sub_z_from_x8 + + /* now the remaining key */ + sbiw r26, 3*8 + ldi r30, lo8(threefish512_slut3) + ldi r31, hi8(threefish512_slut3) + add r30, S + adc r31, r1 + lpm IDX0, Z+ + lpm IDX1, Z + movw r30, CTX0 + adiw r30, 7*8 /* make Z pointing to (extended tweak) */ + adiw r30, 2*8 + movw IDX2, r30 + add r30, IDX0 + adc r31, r1 + rcall sub_z_from_x8 + movw r30, IDX2 + add r30, IDX1 + adc r31, r1 + rcall sub_z_from_x8 + ld r0, X + sub r0, S + st X+, r0 + ld r0, X + sbc r0, r1 + st X+, r0 + ld r0, X + sbc r0, r1 + st X+, r0 + ld r0, X + sbc r0, r1 + st X+, r0 + ld r0, X + sbc r0, r1 + st X+, r0 + ld r0, X + sbc r0, r1 + st X+, r0 + ld r0, X + sbc r0, r1 + st X+, r0 + ld r0, X + sbc r0, r1 + st X+, r0 + tst S + brne 3f +exit: + pop_range 2, 17 + pop r29 + pop r28 + ret +3: + dec S +4: + /* now the permutation */ + movw r26, DATA0 + movw r30, DATA0 + adiw r30, 6*8 + rcall xchg_zx8 + movw r26, DATA0 + adiw r26, 6*8 + movw r30, DATA0 + adiw r30, 4*8 + rcall xchg_zx8 + movw r26, DATA0 + adiw r26, 2*8 + movw r30, DATA0 + adiw r30, 4*8 + rcall xchg_zx8 + movw r26, DATA0 + adiw r26, 3*8 + movw r30, DATA0 + adiw r30, 7*8 + rcall xchg_zx8 + /* call mix */ + ldi r30, lo8(threefish512_rc0) + ldi r31, hi8(threefish512_rc0) + mov r26, I + andi r26, 0x07 + add r30, r26 + adc r31, r1 + lpm r22, Z + adiw r30, 8 + lpm IDX0, Z + adiw r30, 8 + lpm IDX1, Z + push IDX1 + adiw r30, 8 + lpm IDX1, Z + + movw r24, 
DATA0 + call threefish_invmix_asm /* no rcall? */ + movw r24, DATA0 + adiw r24, 16 + mov r22, IDX0 + call threefish_invmix_asm /* no rcall? */ + movw r24, DATA0 + adiw r24, 32 + pop r22 + ;mov r22, IDX0 + call threefish_invmix_asm /* no rcall? */ + movw r24, DATA0 + adiw r24, 48 + mov r22, IDX1 + call threefish_invmix_asm /* no rcall? */ + inc I + rjmp 1b + +threefish512_slut9: + .byte 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38 + .byte 0x40, 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30 + .byte 0x38, 0x40, 0x00, 0x08, 0x10, 0x18, 0x20, 0x28 + .byte 0x30, 0x38, 0x40 +threefish512_slut3: + .byte 0x00, 0x08, 0x10, 0x00, 0x08, 0x10, 0x00, 0x08 + .byte 0x10, 0x00, 0x08, 0x10, 0x00, 0x08, 0x10, 0x00 + .byte 0x08, 0x10, 0x00, 0x08, 0x10, 0x00, 0x08 + +threefish512_rc0: .byte 0x41, 0x4b, 0x59, 0x41, 0x32, 0x42, 0x60, 0x5a +threefish512_rc1: .byte 0x63, 0x32, 0x33, 0x61, 0x14, 0x2a, 0x24, 0x4a +threefish512_rc2: .byte 0x59, 0x13, 0x51, 0x10, 0x72, 0x29, 0x53, 0x62 +threefish512_rc3: .byte 0x43, 0x11, 0x2a, 0x52, 0x19, 0x33, 0x49, 0x7b + +sub_z_from_x8: + ld r0, Z+ + ld r1, X + sub r1, r0 + st X+, r1 + ld r0, Z+ + ld r1, X + sbc r1, r0 + st X+, r1 + ld r0, Z+ + ld r1, X + sbc r1, r0 + st X+, r1 + ld r0, Z+ + ld r1, X + sbc r1, r0 + st X+, r1 + ld r0, Z+ + ld r1, X + sbc r1, r0 + st X+, r1 + ld r0, Z+ + ld r1, X + sbc r1, r0 + st X+, r1 + ld r0, Z+ + ld r1, X + sbc r1, r0 + st X+, r1 + ld r0, Z+ + ld r1, X + sbc r1, r0 + st X+, r1 + clr r1 + ret + +T0 = IDX0 +T1 = 0 +CNT = 24 +xchg_zx8: + ldi CNT, 8 +1: ld T0, X + ld T1, Z + st X+, T1 + st Z+, T0 + dec CNT + brne 1b + ret + + + diff --git a/threefish_invmix.S b/threefish_invmix.S new file mode 100644 index 0000000..1e55cf2 --- /dev/null +++ b/threefish_invmix.S @@ -0,0 +1,299 @@ +/* threefish_invmix.S */ +/* + This file is part of the AVR-Crypto-Lib. 
+ Copyright (C) 2009 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ +/* + * \author Daniel Otte + * \email daniel.otte@rub.de + * \date 2009-03-21 + * \license GPLv3 or later + */ + +#include "avr-asm-macros.S" + +/* +#define X0 (((uint64_t*)data)[0]) +#define X1 (((uint64_t*)data)[1]) +void threefish_invmix(void* data, uint8_t rot){ + uint64_t x; + x = X1; + x ^= X0; + X1 = ((x>>rot)|(x<<(64-rot))); + X0 -= X1; +} +*/ +A0 = 10 +A1 = 11 +A2 = 12 +A3 = 13 +A4 = 14 +A5 = 15 +A6 = 16 +A7 = 17 + +B0 = 18 +B1 = 19 +B2 = 20 +B3 = 21 +B4 = 22 +B5 = 23 +B6 = 24 +B7 = 25 +vROT = 27 +/* + * param data: r24:r25 + * param rot: r22 + */ + +.global threefish_invmix_asm +threefish_invmix_asm: + movw r28, r24 + mov vROT,r22 + ldd A0, Y+ 0 + ldd A1, Y+ 1 + ldd A2, Y+ 2 + ldd A3, Y+ 3 + ldd A4, Y+ 4 + ldd A5, Y+ 5 + ldd A6, Y+ 6 + ldd A7, Y+ 7 + ldd B0, Y+ 8 + ldd B1, Y+ 9 + ldd B2, Y+10 + ldd B3, Y+11 + ldd B4, Y+12 + ldd B5, Y+13 + ldd B6, Y+14 + ldd B7, Y+15 + eor B0, A0 + eor B1, A1 + eor B2, A2 + eor B3, A3 + eor B4, A4 + eor B5, A5 + eor B6, A6 + eor B7, A7 + + mov r26, vROT + swap r26 + andi r26, 0x07 + ldi r30, pm_lo8(byte_rot_jmptable) + ldi r31, pm_hi8(byte_rot_jmptable) + add r30, r26 + adc r31, r1 + ijmp +post_byterot: + bst vROT, 3 + andi vROT, 0x07 + brts 1f + rjmp bit_rotr +1: rjmp bit_rotl +post_bitrot: + sub A0, B0 + sbc A1, B1 + sbc A2, B2 + sbc A3, B3 + sbc A4, B4 + sbc A5, B5 + 
sbc A6, B6 + sbc A7, B7 + + std Y+ 0, A0 + std Y+ 1, A1 + std Y+ 2, A2 + std Y+ 3, A3 + std Y+ 4, A4 + std Y+ 5, A5 + std Y+ 6, A6 + std Y+ 7, A7 + std Y+ 8, B0 + std Y+ 9, B1 + std Y+10, B2 + std Y+11, B3 + std Y+12, B4 + std Y+13, B5 + std Y+14, B6 + std Y+15, B7 +exit: + ret + +byte_rot_jmptable: + rjmp post_byterot;ret; rjmp byte_rotr_0 + rjmp byte_rotr_1 + rjmp byte_rotr_2 + rjmp byte_rotr_3 + rjmp byte_rotr_4 + rjmp byte_rotr_5 + rjmp byte_rotr_6 + rjmp byte_rotr_7 + rjmp post_byterot;ret; rjmp byte_rotr_0 + + + +; 0 1 2 3 4 5 6 7 +; 1 2 3 4 5 6 7 0 +;.global byte_rotr_1 +;.global byte_rotr_0 +byte_rotr_1: /* 10 words */ + mov r0, B0 + mov B0, B1 + mov B1, B2 + mov B2, B3 + mov B3, B4 + mov B4, B5 + mov B5, B6 + mov B6, B7 + mov B7, r0 +byte_rotr_0: + rjmp post_byterot + +; 0 1 2 3 4 5 6 7 +; 2 3 4 5 6 7 0 1 +;.global byte_rotr_2 +byte_rotr_2: /* 11 words */ + mov r0, B0 + mov B0, B2 + mov B2, B4 + mov B4, B6 + mov B6, r0 + mov r0, B1 + mov B1, B3 + mov B3, B5 + mov B5, B7 + mov B7, r0 + rjmp post_byterot + +; 0 1 2 3 4 5 6 7 +; 3 4 5 6 7 0 1 2 +;.global byte_rotr_3 +byte_rotr_3: /* 10 words */ + mov r0, B0 + mov B0, B3 + mov B3, B6 + mov B6, B1 + mov B1, B4 + mov B4, B7 + mov B7, B2 + mov B2, B5 + mov B5, r0 + rjmp post_byterot + +; 0 1 2 3 4 5 6 7 +; 4 5 6 7 0 1 2 3 +;.global byte_rotr_4 +byte_rotr_4: /* 13 words */ + mov r0, B0 + mov B0, B4 + mov B4, r0 + + mov r0, B1 + mov B1, B5 + mov B5, r0 + + mov r0, B2 + mov B2, B6 + mov B6, r0 + + mov r0, B3 + mov B3, B7 + mov B7, r0 + rjmp post_byterot + +; 0 1 2 3 4 5 6 7 +; 5 6 7 0 1 2 3 4 +;.global byte_rotr_5 +byte_rotr_5: /* 10 words */ + mov r0, B0 + mov B0, B5 + mov B5, B2 + mov B2, B7 + mov B7, B4 + mov B4, B1 + mov B1, B6 + mov B6, B3 + mov B3, r0 + rjmp post_byterot + +; 0 1 2 3 4 5 6 7 +; 6 7 0 1 2 3 4 5 +;.global byte_rotr_6 +byte_rotr_6: /* 11 words */ + mov r0, B0 + mov B0, B6 + mov B6, B4 + mov B4, B2 + mov B2, r0 + + mov r0, B1 + mov B1, B7 + mov B7, B5 + mov B5, B3 + mov B3, r0 + rjmp post_byterot 
+ +; 0 1 2 3 4 5 6 7 +; 7 0 1 2 3 4 5 6 +;.global byte_rotr_7 +byte_rotr_7: /* 10 words */ + mov r0, B7 + mov B7, B6 + mov B6, B5 + mov B5, B4 + mov B4, B3 + mov B3, B2 + mov B2, B1 + mov B1, B0 + mov B0, r0 + rjmp post_byterot + +;.global bit_rotl +bit_rotl: + tst vROT + brne 1f + rjmp post_bitrot +1: mov r0, B7 + rol r0 + rol B0 + rol B1 + rol B2 + rol B3 + rol B4 + rol B5 + rol B6 + rol B7 + dec vROT + rjmp bit_rotl + +;.global bit_rotr +bit_rotr: + tst vROT + brne 1f + rjmp post_bitrot +1: mov r0, B0 + ror r0 + ror B7 + ror B6 + ror B5 + ror B4 + ror B3 + ror B2 + ror B1 + ror B0 + dec vROT + rjmp bit_rotr + + diff --git a/threefish_mix.S b/threefish_mix.S index 00952a3..07d076f 100644 --- a/threefish_mix.S +++ b/threefish_mix.S @@ -145,8 +145,8 @@ byte_rot_jmptable: ; 0 1 2 3 4 5 6 7 ; 1 2 3 4 5 6 7 0 -.global byte_rotr_1 -.global byte_rotr_0 +;.global byte_rotr_1 +;.global byte_rotr_0 byte_rotr_1: /* 10 words */ mov r0, B0 mov B0, B1 @@ -162,7 +162,7 @@ byte_rotr_0: ; 0 1 2 3 4 5 6 7 ; 2 3 4 5 6 7 0 1 -.global byte_rotr_2 +;.global byte_rotr_2 byte_rotr_2: /* 11 words */ mov r0, B0 mov B0, B2 @@ -178,7 +178,7 @@ byte_rotr_2: /* 11 words */ ; 0 1 2 3 4 5 6 7 ; 3 4 5 6 7 0 1 2 -.global byte_rotr_3 +;.global byte_rotr_3 byte_rotr_3: /* 10 words */ mov r0, B0 mov B0, B3 @@ -193,7 +193,7 @@ byte_rotr_3: /* 10 words */ ; 0 1 2 3 4 5 6 7 ; 4 5 6 7 0 1 2 3 -.global byte_rotr_4 +;.global byte_rotr_4 byte_rotr_4: /* 13 words */ mov r0, B0 mov B0, B4 @@ -214,7 +214,7 @@ byte_rotr_4: /* 13 words */ ; 0 1 2 3 4 5 6 7 ; 5 6 7 0 1 2 3 4 -.global byte_rotr_5 +;.global byte_rotr_5 byte_rotr_5: /* 10 words */ mov r0, B0 mov B0, B5 @@ -229,7 +229,7 @@ byte_rotr_5: /* 10 words */ ; 0 1 2 3 4 5 6 7 ; 6 7 0 1 2 3 4 5 -.global byte_rotr_6 +;.global byte_rotr_6 byte_rotr_6: /* 11 words */ mov r0, B0 mov B0, B6 @@ -246,7 +246,7 @@ byte_rotr_6: /* 11 words */ ; 0 1 2 3 4 5 6 7 ; 7 0 1 2 3 4 5 6 -.global byte_rotr_7 +;.global byte_rotr_7 byte_rotr_7: /* 10 words */ mov r0, B7 mov B7, 
B6 @@ -259,7 +259,7 @@ byte_rotr_7: /* 10 words */ mov B0, r0 rjmp post_byterot -.global bit_rotl +;.global bit_rotl bit_rotl: tst vROT brne 1f @@ -277,7 +277,7 @@ bit_rotl: dec vROT rjmp bit_rotl -.global bit_rotr +;.global bit_rotr bit_rotr: tst vROT brne 1f