From 6bca96e560e6097aa5b225fad67f2e2a27f4182f Mon Sep 17 00:00:00 2001 From: bg Date: Thu, 3 Jul 2008 04:11:34 +0000 Subject: [PATCH] modyfied copyright --- A5_1.c | 9 +- Makefile | 9 +- arcfour-asm.S | 8 +- arcfour.c | 9 +- camellia-asm.S | 8 +- cast5.c | 5 +- cli.c | 2 +- des.c | 11 +- entropium.c | 9 +- grain.c | 2 +- grain_h_lutgen.c | 60 -- grain_nfsr_lutgen.c | 91 -- hmac-sha256.c | 6 +- main-seed-test.c | 5 +- main-shabea-test.c | 2 +- md5.c | 4 +- noekeon.c | 2 +- noekeon_genrc.c | 35 - seed-asm.S | 2 +- sha1-asm.S | 1954 ++++++++++++++++++++--------------------- sha256-asm.S | 2050 +++++++++++++++++++++---------------------- xtea-asm.S | 6 +- 22 files changed, 2054 insertions(+), 2235 deletions(-) delete mode 100644 grain_h_lutgen.c delete mode 100644 grain_nfsr_lutgen.c delete mode 100644 noekeon_genrc.c diff --git a/A5_1.c b/A5_1.c index 526e6cf..a22d185 100644 --- a/A5_1.c +++ b/A5_1.c @@ -17,10 +17,11 @@ along with this program. If not, see . */ /* - * File: A5_1.c - * Author: Daniel Otte - * Date: 24.06.2006 - * License: GPL + * File: A5_1.c + * Author: Daniel Otte + * email: daniel.otte@rub.de + * Date: 2006-06-24 + * License: GPLv3 or later * Description: Implementation of the A5/1 stream cipher algorithm, as used in GSM. * ! Warning, this is weak crypto ! 
* diff --git a/Makefile b/Makefile index 09457be..c8a52b7 100644 --- a/Makefile +++ b/Makefile @@ -30,6 +30,10 @@ PRG = remove_me #------------------------------------------------------------------------------- +all: $(foreach algo, $(ALGORITHMS), $(algo)_OBJ) + +#------------------------------------------------------------------------------- + define BLA_TEMPLATE2 $(2): $(3) @echo "[gcc]: $$@" @@ -151,11 +155,6 @@ $(foreach algo, $(ALGORITHMS),$(eval $(call FLASH_TEMPLATE, $(algo), \ $(patsubst %.o,%.hex,$(firstword $($(algo)_TEST_BIN)))) )) #------------------------------------------------------------------------------- - -.PHONY: all -all: $(foreach algo, $(ALGORITHMS), $(algo)_OBJ) -#all: $(PRG).elf lst text eeprom - .PHONY: clean clean: diff --git a/arcfour-asm.S b/arcfour-asm.S index eafd771..ec0eeeb 100644 --- a/arcfour-asm.S +++ b/arcfour-asm.S @@ -17,10 +17,10 @@ along with this program. If not, see . */ /* - * File: arcfour-asm.S - * Author: Daniel Otte - * Date: 07.06.2006 - * License: GPL + * File: arcfour-asm.S + * Author: Daniel Otte + * Date: 2006-07-06 + * License: GPLv3 or later * Description: Implementation of the ARCFOUR (RC4 compatible) stream cipher algorithm. * */ diff --git a/arcfour.c b/arcfour.c index abed9dd..f8d01a6 100644 --- a/arcfour.c +++ b/arcfour.c @@ -17,10 +17,11 @@ along with this program. If not, see . */ /* - * File: arcfour.c - * Author: Daniel Otte - * Date: 07.06.2006 - * License: GPL + * File: arcfour.c + * Author: Daniel Otte + * email: daniel.otte@rub.de + * Date: 2006-06-07 + * License: GPLv3 or later * Description: Implementation of the ARCFOUR (RC4 compatible) stream cipher algorithm. * */ diff --git a/camellia-asm.S b/camellia-asm.S index bcc190e..6e58ca5 100644 --- a/camellia-asm.S +++ b/camellia-asm.S @@ -17,10 +17,10 @@ along with this program. If not, see . 
*/ /* - * File: camellis-asm.S - * Author: Daniel Otte - * Date: 10.11.2006 - * License: GPL + * File: camellis-asm.S + * Author: Daniel Otte + * Date: 2006-11-10 + * License: GPLv3 or later * Description: Implementation of the camellia block cipher algorithm. * */ diff --git a/cast5.c b/cast5.c index a938bfb..51e9e93 100644 --- a/cast5.c +++ b/cast5.c @@ -19,9 +19,10 @@ /* * \file cast5.c * \author Daniel Otte - * \date 26.07.2006 + * \email daniel.otte@rub.de + * \date 2006-07-26 * \par License: - * GPL + * GPLv3 or later * \brief Implementation of the CAST5 (aka CAST-128) cipher algorithm as described in RFC 2144 * */ diff --git a/cli.c b/cli.c index 1b5467c..8c15f5f 100644 --- a/cli.c +++ b/cli.c @@ -20,7 +20,7 @@ * * author: Daniel Otte * email: daniel.otte@rub.de - * license: GPLv3 + * license: GPLv3 or later * * components to help implementing simple command based interaction * diff --git a/des.c b/des.c index dc16750..d4b8ce1 100644 --- a/des.c +++ b/des.c @@ -17,12 +17,13 @@ along with this program. If not, see . */ /** - * \file des.c - * \author Daniel Otte - * \date 2007-06-16 - * \brief DES and EDE-DES implementation + * \file des.c + * \author Daniel Otte + * \email daniel.otte@rub.de + * \date 2007-06-16 + * \brief DES and EDE-DES implementation * \par License - * GPL + * GPLv3 or later * */ #include "config.h" diff --git a/entropium.c b/entropium.c index 1bbf583..fdbf13c 100644 --- a/entropium.c +++ b/entropium.c @@ -17,11 +17,12 @@ along with this program. If not, see . */ /** - * \file entropium.c - * \author Daniel Otte - * \date 17.05.2006 + * \file entropium.c + * \author Daniel Otte + * \email daniel.otte@rub.de + * \date 2006-05-17 * \par License: - * GPL + * GPLv3 or later * \brief This file contains an implementaition of a pseudo-random-number generator. 
* * Extension 1: diff --git a/grain.c b/grain.c index 612d18b..05571f4 100644 --- a/grain.c +++ b/grain.c @@ -20,7 +20,7 @@ * * author: Daniel Otte * email: daniel.otte@rub.de - * license: GPLv3 + * license: GPLv3 or later * */ diff --git a/grain_h_lutgen.c b/grain_h_lutgen.c deleted file mode 100644 index 4b5ede0..0000000 --- a/grain_h_lutgen.c +++ /dev/null @@ -1,60 +0,0 @@ -/** - * - * author: Daniel Otte - * email: daniel.otte@rub.de - * license: GPLv3 - * - * this program generate a lookuptable for the h-function in grain - */ - -#include -#include - -#define X(i) ((x)>>((i))) -uint8_t h(uint8_t x){ - uint8_t h; - - h = (X(1)) ^ (X(4)) ^ - (X(0)&X(3)) ^ (X(2)&X(3)) ^ (X(3)&X(4)) ^ - (X(0)&X(1)&X(2)) ^ (X(0)&X(2)&X(3)) ^ (X(0)&X(2)&X(4)) ^ - (X(1)&X(2)&X(4)) ^ (X(2)&X(3)&X(4)) ; - - return h&1; -} - -int main(void){ - uint8_t i; - uint32_t lut; - puts( - "/* \n" - " * author: Daniel Otte \n" - " * email: daniel.otte@rub.de \n" - " * license: GPLv3 \n" - " * \n" - " * this program generate a lookuptable for the h-function in grain \n" - " * \n" - " */ \n"); - puts("/* \n" - " * x0 x1 x2 x3 x4 - h"); - - for(i=0; i<0x20; ++i){ - printf(" * %c %c %c %c %c - %c\n", - (i&0x01)?'1':'0', - (i&0x02)?'1':'0', - (i&0x04)?'1':'0', - (i&0x08)?'1':'0', - (i&0x10)?'1':'0', - (h(i))?'1':'0' ); - lut >>=1; - lut |= h(i)?0x80000000:0x00000000; - if(i%4==3){ - puts(" * --"); - } - } - puts(" */\n"); - printf(" uint8_t lut[4]= {0x%2.2X, 0x%2.2X, 0x%2.2X, 0x%2.2X} \n", - lut&0xFF, (lut>>8)&0xFF, (lut>>16)&0xFF, (lut>>24)&0xFF); - - return 0; -} - diff --git a/grain_nfsr_lutgen.c b/grain_nfsr_lutgen.c deleted file mode 100644 index 9b9277d..0000000 --- a/grain_nfsr_lutgen.c +++ /dev/null @@ -1,91 +0,0 @@ -/** - * - * author: Daniel Otte - * email: daniel.otte@rub.de - * license: GPLv3 - * - * this program generate a lookuptable for the nfsr-feedback-function in grain - */ - -#include -#include - -#define X(i) ((x)>>((i))) -#define B63 X(0) -#define B60 X(3) -#define B52 X(5) 
-#define B45 X(6) -#define B37 X(4) -#define B33 X(8) -#define B28 X(2) -#define B21 X(9) -#define B15 X(1) -#define B09 X(7) - -uint8_t g(uint16_t x){ - uint8_t a,b,d,e; - uint8_t ret; - - ret = B60 ^ B52 ^ B45 ^ B37 ^ B33 ^ B28 ^ B21 ^ B09; - ret ^= (a = B63 & B60); - ret ^= (b = B37 & B33); - ret ^= B15 & B09; - ret ^= (d = B60 & B52 & B45); - ret ^= (e = B33 & B28 & B21); - ret ^= B63 & B45 & B28 & B09; - ret ^= b & B60 & B52; - ret ^= a & B21 & B15; - ret ^= d & B63 & B37; - ret ^= e & B15 & B09; - ret ^= e & B52 & B45 & B37; - - return ret&1; -} - -int main(void){ - uint16_t i; - uint8_t t, lut[128]={0}; /* 2**10 / 8 == 2**(10-3) == 2**7 == 128 */ - puts( - "/* \n" - " * author: Daniel Otte \n" - " * email: daniel.otte@rub.de \n" - " * license: GPLv3 \n" - " * \n" - " * this program generate a lookuptable for the h-function in grain \n" - " * \n" - " */ \n"); - puts("/* \n" - " * b63 b15 b28 b60 b37 b52 b45 b09 b33 b21 - g"); - - for(i=0; i<0x0400; ++i){ - t = g(i); - printf(" * %c %c %c %c %c %c %c %c %c %c - %c\n", - (i&0x01)?'1':'0', - (i&0x02)?'1':'0', - (i&0x04)?'1':'0', - (i&0x08)?'1':'0', - (i&0x10)?'1':'0', - (i&0x20)?'1':'0', - (i&0x40)?'1':'0', - (i&0x80)?'1':'0', - (i&0x0100)?'1':'0', - (i&0x0200)?'1':'0', - t?'1':'0' ); - lut[i/8] |= t<<(i%8); -// if(i%4==3){ -// puts(" * --"); -// } - } - puts(" */\n"); - - printf(" uint8_t g_lut[128]= {"); - for(i=0; i<128; ++i){ - if(i%16==0){ - printf("\n\t"); - } - printf("0x%2.2X%c ", lut[i], (i!=127)?',':' '); - } - printf("};\n\n"); - return 0; -} - diff --git a/hmac-sha256.c b/hmac-sha256.c index c57ba95..a0ad1dc 100644 --- a/hmac-sha256.c +++ b/hmac-sha256.c @@ -19,9 +19,9 @@ /** * * implementation of HMAC as described in RFC2104 - * Author: Daniel Otte - * - * License: GPL + * Author: Daniel Otte + * email: daniel.otte@rub.de + * License: GPLv3 or later **/ /* diff --git a/main-seed-test.c b/main-seed-test.c index 6bff1d5..813cb5c 100644 --- a/main-seed-test.c +++ b/main-seed-test.c @@ -18,11 +18,12 @@ 
*/ /** * \file main-seed-test.c - * \author Daniel Otte + * \author Daniel Otte + * \email daniel.otte@rub.de * \date 2007-06-01 * \brief test suit for SEED * \par License - * GPL + * GPLv3 or later * */ #include "config.h" diff --git a/main-shabea-test.c b/main-shabea-test.c index aac85c1..a83d0c7 100644 --- a/main-shabea-test.c +++ b/main-shabea-test.c @@ -22,7 +22,7 @@ * \date 2007-06-07 * \brief test suit for SHABEA * \par License - * GPL + * GPLv3 or later * */ #include "config.h" diff --git a/md5.c b/md5.c index bd43a38..5edb36b 100644 --- a/md5.c +++ b/md5.c @@ -19,9 +19,9 @@ /* * \file md5.c * \author Daniel Otte - * \date 31.07.2006 + * \date 2006-07-31 * \par License: - * GPL + * GPLv3 or later * \brief Implementation of the MD5 hash algorithm as described in RFC 1321 * */ diff --git a/noekeon.c b/noekeon.c index 5ae3ec2..7627cc1 100644 --- a/noekeon.c +++ b/noekeon.c @@ -19,7 +19,7 @@ /* * author: Daniel Otte * email: daniel.otte@rub.de - * license: GPLv3 + * license: GPLv3 or later * * * diff --git a/noekeon_genrc.c b/noekeon_genrc.c deleted file mode 100644 index cb8fac2..0000000 --- a/noekeon_genrc.c +++ /dev/null @@ -1,35 +0,0 @@ -/** - * - * author: Daniel Otte - * email: daniel.otte@rub.de - * license: GPLv3 - * - */ - -#include -#include - -uint8_t getnextrc(uint8_t a){ - if((a&0x80) != 0){ - return (a<<1) ^ 0x1B; - } else { - return (a<<1); - } -} - -#define N 32 - -int main(void){ - uint8_t c=0x80; - uint32_t i; - puts("\nNoekeon Round Constants:"); - for(i=0; i. 
*/ -/* - * Author: Daniel Otte - * - * License: GPL -*/ -; SHA1 implementation in assembler for AVR -SHA1_BLOCK_BITS = 512 -SHA1_HASH_BITS = 160 - -.macro precall - /* push r18 - r27, r30 - r31*/ - push r0 - push r1 - push r18 - push r19 - push r20 - push r21 - push r22 - push r23 - push r24 - push r25 - push r26 - push r27 - push r30 - push r31 - clr r1 -.endm - -.macro postcall - pop r31 - pop r30 - pop r27 - pop r26 - pop r25 - pop r24 - pop r23 - pop r22 - pop r21 - pop r20 - pop r19 - pop r18 - pop r1 - pop r0 -.endm - - -.macro hexdump length - push r27 - push r26 - ldi r25, '\r' - mov r24, r25 - call uart_putc - ldi r25, '\n' - mov r24, r25 - call uart_putc - pop r26 - pop r27 - movw r24, r26 -.if \length > 16 - ldi r22, lo8(16) - ldi r23, hi8(16) - push r27 - push r26 - call uart_hexdump - pop r26 - pop r27 - adiw r26, 16 - hexdump \length-16 -.else - ldi r22, lo8(\length) - ldi r23, hi8(\length) - call uart_hexdump -.endif -.endm - -.macro delay -/* - push r0 - push r1 - clr r0 -1: clr r1 -2: dec r1 - brne 2b - dec r0 - brne 1b - pop r1 - pop r0 // */ -.endm - -/* X points to Block */ -.macro dbg_hexdump length -/* - precall - hexdump \length - postcall - // */ -.endm - - - -.section .text - -SPL = 0x3D -SPH = 0x3E -SREG = 0x3F - - -; -;sha1_ctx_t is: -; -; [h0][h1][h2][h3][h4][length] -; hn is 32 bit large, length is 64 bit large - -;########################################################### - -.global sha1_ctx2hash -; === sha1_ctx2hash === -; this function converts a state into a normal hash (bytestring) -; param1: the 16-bit destination pointer -; given in r25,r24 (r25 is most significant) -; param2: the 16-bit pointer to sha1_ctx structure -; given in r23,r22 -sha1_ctx2hash: - movw r26, r22 - movw r30, r24 - ldi r21, 5 - sbiw r26, 4 -1: - ldi r20, 4 - adiw r26, 8 -2: - ld r0, -X - st Z+, r0 - dec r20 - brne 2b - - dec r21 - brne 1b - - ret - -;########################################################### - -.global sha1 -; === sha1 === -; this function 
calculates SHA-1 hashes from messages in RAM -; param1: the 16-bit hash destination pointer -; given in r25,r24 (r25 is most significant) -; param2: the 16-bit pointer to message -; given in r23,r22 -; param3: 32-bit length value (length of message in bits) -; given in r21,r20,r19,r18 -sha1: -sha1_prolog: - push r8 - push r9 - push r10 - push r11 - push r12 - push r13 - push r16 - push r17 - in r16, SPL - in r17, SPH - subi r16, 5*4+8 - sbci r17, 0 - in r0, SREG - cli - out SPL, r16 - out SPH, r17 - out SREG, r0 - - push r25 - push r24 - inc r16 - adc r17, r1 - - movw r8, r18 /* backup of length*/ - movw r10, r20 - - movw r12, r22 /* backup pf msg-ptr */ - - movw r24, r16 - rcall sha1_init - /* if length >= 512 */ -1: - tst r11 - brne 4f - tst r10 - brne 4f - mov r19, r9 - cpi r19, 0x02 - brlo 4f - - movw r24, r16 - movw r22, r12 - rcall sha1_nextBlock - ldi r19, 0x64 - add r22, r19 - adc r23, r1 - /* length -= 512 */ - ldi r19, 0x02 - sub r9, r19 - sbc r10, r1 - sbc r11, r1 - rjmp 1b - -4: - movw r24, r16 - movw r22, r12 - movw r20, r8 - rcall sha1_lastBlock - - pop r24 - pop r25 - movw r22, r16 - rcall sha1_ctx2hash - -sha1_epilog: - in r30, SPL - in r31, SPH - adiw r30, 5*4+8 - in r0, SREG - cli - out SPL, r30 - out SPH, r31 - out SREG, r0 - pop r17 - pop r16 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop r8 - ret - -;########################################################### - - -; block MUST NOT be larger than 64 bytes - -.global sha1_lastBlock -; === sha1_lastBlock === -; this function does padding & Co. 
for calculating SHA-1 hashes -; param1: the 16-bit pointer to sha1_ctx structure -; given in r25,r24 (r25 is most significant) -; param2: an 16-bit pointer to 64 byte block to hash -; given in r23,r22 -; param3: an 16-bit integer specifing length of block in bits -; given in r21,r20 -sha1_lastBlock_localSpace = (SHA1_BLOCK_BITS/8+1) - - -sha1_lastBlock: - tst r20 - brne sha1_lastBlock_prolog - cpi r21, 0x02 - brne sha1_lastBlock_prolog - push r25 - push r24 - push r23 - push r22 - rcall sha1_nextBlock - pop r22 - pop r23 - pop r24 - pop r25 - clr r21 - clr r22 -sha1_lastBlock_prolog: - /* allocate space on stack */ - in r30, SPL - in r31, SPH - in r1, SREG - subi r30, lo8(64) - sbci r31, hi8(64) /* ??? */ - cli - out SPL, r30 - out SPH, r31 - out SREG,r1 - - adiw r30, 1 /* SP points to next free byte on stack */ - mov r18, r20 /* r20 = LSB(length) */ - lsr r18 - lsr r18 - lsr r18 - bst r21, 0 /* may be we should explain this ... */ - bld r18, 5 /* now: r18 == length/8 (aka. length in bytes) */ - - - movw r26, r22 /* X points to begin of msg */ - tst r18 - breq sha1_lastBlock_post_copy - mov r1, r18 -sha1_lastBlock_copy_loop: - ld r0, X+ - st Z+, r0 - dec r1 - brne sha1_lastBlock_copy_loop -sha1_lastBlock_post_copy: -sha1_lastBlock_insert_stuffing_bit: - ldi r19, 0x80 - mov r0,r19 - ldi r19, 0x07 - and r19, r20 /* if we are in bitmode */ - breq 2f /* no bitmode */ -1: - lsr r0 - dec r19 - brne 1b - ld r19, X -/* maybe we should do some ANDing here, just for safety */ - or r0, r19 -2: - st Z+, r0 - inc r18 - -/* checking stuff here */ - cpi r18, 64-8+1 - brsh 0f - rjmp sha1_lastBlock_insert_zeros -0: - /* oh shit, we landed here */ - /* first we have to fill it up with zeros */ - ldi r19, 64 - sub r19, r18 - breq 2f -1: - st Z+, r1 - dec r19 - brne 1b -2: - sbiw r30, 63 - sbiw r30, 1 - movw r22, r30 - - push r31 - push r30 - push r25 - push r24 - push r21 - push r20 - rcall sha1_nextBlock - pop r20 - pop r21 - pop r24 - pop r25 - pop r30 - pop r31 - - /* now we 
should subtract 512 from length */ - movw r26, r24 - adiw r26, 4*5+1 /* we can skip the lowest byte */ - ld r19, X - subi r19, hi8(512) - st X+, r19 - ldi r18, 6 -1: - ld r19, X - sbci r19, 0 - st X+, r19 - dec r18 - brne 1b - -; clr r18 /* not neccessary ;-) */ - /* reset Z pointer to begin of block */ - -sha1_lastBlock_insert_zeros: - ldi r19, 64-8 - sub r19, r18 - breq sha1_lastBlock_insert_length - clr r1 -1: - st Z+, r1 /* r1 is still zero */ - dec r19 - brne 1b - -; rjmp sha1_lastBlock_epilog -sha1_lastBlock_insert_length: - movw r26, r24 /* X points to state */ - adiw r26, 5*4 /* X points to (state.length) */ - adiw r30, 8 /* Z points one after the last byte of block */ - ld r0, X+ - add r0, r20 - st -Z, r0 - ld r0, X+ - adc r0, r21 - st -Z, r0 - ldi r19, 6 -1: - ld r0, X+ - adc r0, r1 - st -Z, r0 - dec r19 - brne 1b - - sbiw r30, 64-8 - movw r22, r30 - rcall sha1_nextBlock - -sha1_lastBlock_epilog: - in r30, SPL - in r31, SPH - in r1, SREG - adiw r30, 63 ; lo8(64) - adiw r30, 1 ; hi8(64) - cli - out SPL, r30 - out SPH, r31 - out SREG,r1 - clr r1 - clr r0 - ret - -/**/ -;########################################################### - -.global sha1_nextBlock -; === sha1_nextBlock === -; this is the core function for calculating SHA-1 hashes -; param1: the 16-bit pointer to sha1_ctx structure -; given in r25,r24 (r25 is most significant) -; param2: an 16-bit pointer to 64 byte block to hash -; given in r23,r22 -sha1_nextBlock_localSpace = (16+5+1)*4 ; 16 32-bit values for w array and 5 32-bit values for a array (total 84 byte) - -xtmp = 0 -xNULL = 1 -W1 = 10 -W2 = 11 -T1 = 12 -T2 = 13 -T3 = 14 -T4 = 15 -LoopC = 16 -S = 17 -tmp1 = 18 -tmp2 = 19 -tmp3 = 20 -tmp4 = 21 -F1 = 22 -F2 = 23 -F3 = 24 -F4 = 25 - -/* byteorder: high number <--> high significance */ -sha1_nextBlock: - ; initial, let's make some space ready for local vars - /* replace push & pop by mem ops? 
*/ - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - push r16 - push r17 - push r28 - push r29 - in r20, SPL - in r21, SPH - movw r18, r20 ;backup SP -; movw r26, r20 ; X points to free space on stack /* maybe removeable? */ - movw r30, r22 ; Z points to message - subi r20, lo8(sha1_nextBlock_localSpace) ;sbiw can do only up to 63 - sbci r21, hi8(sha1_nextBlock_localSpace) - movw r26, r20 ; X points to free space on stack - in r0, SREG - cli ; we want to be uninterrupted while updating SP - out SPL, r20 - out SPH, r21 - out SREG, r0 - - push r18 - push r19 /* push old SP on new stack */ - push r24 - push r25 /* param1 will be needed later */ - - /* load a[] with state */ - movw 28, r24 /* load pointer to state in Y */ - adiw r26, 1 ; X++ - - ldi LoopC, 5*4 -1: ld tmp1, Y+ - st X+, tmp1 - dec LoopC - brne 1b - - movw W1, r26 /* save pointer to w[0] */ - /* load w[] with endian fixed message */ - /* we might also use the changeendian32() function at bottom */ - movw r30, r22 /* mv param2 (ponter to msg) to Z */ - ldi LoopC, 16 -1: - ldd tmp1, Z+3 - st X+, tmp1 - ldd tmp1, Z+2 - st X+, tmp1 - ldd tmp1, Z+1 - st X+, tmp1 - ld tmp1, Z - st X+, tmp1 - adiw r30, 4 - dec LoopC - brne 1b - - ;clr LoopC /* LoopC is named t in FIPS 180-2 */ - clr xtmp -sha1_nextBlock_mainloop: - mov S, LoopC - lsl S - lsl S - andi S, 0x3C /* S is a bytepointer so *4 */ - /* load w[s] */ - movw r26, W1 - add r26, S /* X points at w[s] */ - adc r27, xNULL - ld T1, X+ - ld T2, X+ - ld T3, X+ - ld T4, X+ - - /**/ - push r26 - push r27 - push T4 - push T3 - push T2 - push T1 - in r26, SPL - in r27, SPH - adiw r26, 1 - dbg_hexdump 4 - pop T1 - pop T2 - pop T3 - pop T4 - pop r27 - pop r26 - /**/ - - cpi LoopC, 16 - brlt sha1_nextBlock_mainloop_core - /* update w[s] */ - ldi tmp1, 2*4 - rcall 1f - ldi tmp1, 8*4 - rcall 1f - ldi tmp1, 13*4 - rcall 1f - rjmp 2f -1: /* this might be "outsourced" to save the jump above */ - add tmp1, S - andi tmp1, 0x3f - movw r26, W1 - add r26, tmp1 - 
adc r27, xNULL - ld tmp2, X+ - eor T1, tmp2 - ld tmp2, X+ - eor T2, tmp2 - ld tmp2, X+ - eor T3, tmp2 - ld tmp2, X+ - eor T4, tmp2 - ret -2: /* now we just hav to do a ROTL(T) and save T back */ - mov tmp2, T4 - rol tmp2 - rol T1 - rol T2 - rol T3 - rol T4 - movw r26, W1 - add r26, S - adc r27, xNULL - st X+, T1 - st X+, T2 - st X+, T3 - st X+, T4 - -sha1_nextBlock_mainloop_core: /* ther core function; T=ROTL5(a) ....*/ - /* T already contains w[s] */ - movw r26, W1 - sbiw r26, 4*1 /* X points at a[4] aka e */ - ld tmp1, X+ - add T1, tmp1 - ld tmp1, X+ - adc T2, tmp1 - ld tmp1, X+ - adc T3, tmp1 - ld tmp1, X+ - adc T4, tmp1 /* T = w[s]+e */ - sbiw r26, 4*5 /* X points at a[0] aka a */ - ld F1, X+ - ld F2, X+ - ld F3, X+ - ld F4, X+ - mov tmp1, F4 /* X points at a[1] aka b */ - ldi tmp2, 5 -1: - rol tmp1 - rol F1 - rol F2 - rol F3 - rol F4 - dec tmp2 - brne 1b - - add T1, F1 - adc T2, F2 - adc T3, F3 - adc T4, F4 /* T = ROTL(a,5) + e + w[s] */ - - /* now we have to do this fucking conditional stuff */ - ldi r30, lo8(sha1_nextBlock_xTable) - ldi r31, hi8(sha1_nextBlock_xTable) - add r30, xtmp - adc r31, xNULL - lpm tmp1, Z - cp tmp1, LoopC - brne 1f - inc xtmp -1: ldi r30, lo8(sha1_nextBlock_KTable) - ldi r31, hi8(sha1_nextBlock_KTable) - lsl xtmp - lsl xtmp - add r30, xtmp - adc r31, xNULL - lsr xtmp - lsr xtmp - - lpm tmp1, Z+ - add T1, tmp1 - lpm tmp1, Z+ - adc T2, tmp1 - lpm tmp1, Z+ - adc T3, tmp1 - lpm tmp1, Z+ - adc T4, tmp1 - /* T = ROTL(a,5) + e + kt + w[s] */ - - /* wo Z-4 gerade auf kt zeigt ... 
*/ - movw r28, r26 /* copy X in Y */ - adiw r30, 3*4 /* now Z points to the rigth locatin in our jump-vector-table */ - lsr r31 - ror r30 - - icall - mov F1, tmp1 - icall - mov F2, tmp1 - icall - mov F3, tmp1 - icall - - add T1, F1 - adc T2, F2 - adc T3, F3 - adc T4, tmp1 /* T = ROTL5(a) + f_t(b,c,d) + e + k_t + w[s] */ - /* X points still at a[1] aka b, Y points at a[2] aka c */ - /* update a[] */ -sha1_nextBlock_update_a: - /*first we move all vars in a[] "one up" e=d, d=c, c=b, b=a*/ - //adiw r28, 3*4 /* Y should point at a[4] aka e */ - movw r28, W1 - sbiw r28, 4 - - ldi tmp2, 4*4 -1: - ld tmp1, -Y - std Y+4, tmp1 - dec tmp2 - brne 1b - /* Y points at a[0] aka a*/ - - movw r28, W1 - sbiw r28, 5*4 - /* store T in a[0] aka a */ - st Y+, T1 - st Y+, T2 - st Y+, T3 - st Y+, T4 - /* Y points at a[1] aka b*/ - - /* rotate c */ - ldd T1, Y+1*4 - ldd T2, Y+1*4+1 - ldd T3, Y+1*4+2 - ldd T4, Y+1*4+3 - mov tmp1, T1 - ldi tmp2, 2 -1: ror tmp1 - ror T4 - ror T3 - ror T2 - ror T1 - dec tmp2 - brne 1b - std Y+1*4+0, T1 - std Y+1*4+1, T2 - std Y+1*4+2, T3 - std Y+1*4+3, T4 - - push r27 - push r26 - movw r26, W1 - sbiw r26, 4*5 - dbg_hexdump 4*5 - pop r26 - pop r27 - - inc LoopC - cpi LoopC, 80 - brge 1f - jmp sha1_nextBlock_mainloop -/**************************************/ -1: - /* littel patch */ - sbiw r28, 4 - -/* add a[] to state and inc length */ - pop r27 - pop r26 /* now X points to state (and Y still at a[0]) */ - ldi tmp4, 5 -1: clc - ldi tmp3, 4 -2: ld tmp1, X - ld tmp2, Y+ - adc tmp1, tmp2 - st X+, tmp1 - dec tmp3 - brne 2b - dec tmp4 - brne 1b - - /* now length += 512 */ - adiw r26, 1 /* we skip the least significant byte */ - ld tmp1, X - ldi tmp2, hi8(512) /* 2 */ - add tmp1, tmp2 - st X+, tmp1 - ldi tmp2, 6 -1: - ld tmp1, X - adc tmp1, xNULL - st X+, tmp1 - dec tmp2 - brne 1b - -; EPILOG -sha1_nextBlock_epilog: -/* now we should clean up the stack */ - pop r21 - pop r20 - in r0, SREG - cli ; we want to be uninterrupted while updating SP - out SPL, r20 - out 
SPH, r21 - out SREG, r0 - - clr r1 - pop r29 - pop r28 - pop r17 - pop r16 - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - ret - -sha1_nextBlock_xTable: -.byte 20,40,60,0 -sha1_nextBlock_KTable: -.int 0x5a827999 -.int 0x6ed9eba1 -.int 0x8f1bbcdc -.int 0xca62c1d6 -sha1_nextBlock_JumpTable: -jmp sha1_nextBlock_Ch -jmp sha1_nextBlock_Parity -jmp sha1_nextBlock_Maj -jmp sha1_nextBlock_Parity - - /* X and Y still point at a[1] aka b ; return value in tmp1 */ -sha1_nextBlock_Ch: - ld tmp1, Y+ - mov tmp2, tmp1 - com tmp2 - ldd tmp3, Y+3 /* load from c */ - and tmp1, tmp3 - ldd tmp3, Y+7 /* load from d */ - and tmp2, tmp3 - eor tmp1, tmp2 - /** - precall - ldi r24, lo8(ch_str) - ldi r25, hi8(ch_str) - call uart_putstr_P - postcall - /**/ - ret - -sha1_nextBlock_Maj: - ld tmp1, Y+ - mov tmp2, tmp1 - ldd tmp3, Y+3 /* load from c */ - and tmp1, tmp3 - ldd tmp4, Y+7 /* load from d */ - and tmp2, tmp4 - eor tmp1, tmp2 - and tmp3, tmp4 - eor tmp1, tmp3 - /** - precall - ldi r24, lo8(maj_str) - ldi r25, hi8(maj_str) - call uart_putstr_P - postcall - /**/ - ret - -sha1_nextBlock_Parity: - ld tmp1, Y+ - ldd tmp2, Y+3 /* load from c */ - eor tmp1, tmp2 - ldd tmp2, Y+7 /* load from d */ - eor tmp1, tmp2 - - /** - precall - ldi r24, lo8(parity_str) - ldi r25, hi8(parity_str) - call uart_putstr_P - postcall - /**/ - ret -/* -ch_str: .asciz "\r\nCh" -maj_str: .asciz "\r\nMaj" -parity_str: .asciz "\r\nParity" -*/ -;########################################################### - -.global sha1_init -;void sha1_init(sha1_ctx_t *state){ -; DEBUG_S("\r\nSHA1_INIT"); -; state->h[0] = 0x67452301; -; state->h[1] = 0xefcdab89; -; state->h[2] = 0x98badcfe; -; state->h[3] = 0x10325476; -; state->h[4] = 0xc3d2e1f0; -; state->length = 0; -;} -; param1: (Func3,r24) 16-bit pointer to sha1_ctx_t struct in ram -; modifys: Z(r30,r31), Func1, r22 -sha1_init: - movw r26, r24 ; (24,25) --> (26,27) load X with param1 - ldi r30, lo8((sha1_init_vector)) - ldi r31, hi8((sha1_init_vector)) - ldi r22, 
5*4 /* bytes to copy */ -sha1_init_vloop: - lpm r23, Z+ - st X+, r23 - dec r22 - brne sha1_init_vloop - ldi r22, 8 - clr r1 /* this should not be needed */ -sha1_init_lloop: - st X+, r1 - dec r22 - brne sha1_init_lloop - ret - -sha1_init_vector: -.int 0x67452301; -.int 0xefcdab89; -.int 0x98badcfe; -.int 0x10325476; -.int 0xc3d2e1f0; -/* -;########################################################### - -.global rotl32 -; === ROTL32 === -; function that rotates a 32 bit word to the left -; param1: the 32-bit word to rotate -; given in r25,r24,r23,r22 (r25 is most significant) -; param2: an 8-bit value telling how often to rotate -; given in r20 -; modifys: r21, r22 -rotl32: - cpi r20, 8 - brlo bitrotl - mov r21, r25 - mov r25, r24 - mov r24, r23 - mov r23, r22 - mov r22, r21 - subi r20, 8 - rjmp rotr32 -bitrotl: - clr r21 - clc -bitrotl_loop: - tst r20 - breq fixrotl - rol r22 - rol r23 - rol r24 - rol r25 - rol r21 - dec r20 - rjmp bitrotl_loop -fixrotl: - or r22, r21 - ret - - -;########################################################### - -.global rotr32 -; === ROTR32 === -; function that rotates a 32 bit word to the right -; param1: the 32-bit word to rotate -; given in r25,r24,r23,22 (r25 is most significant) -; param2: an 8-bit value telling how often to rotate -; given in r20 -; modifys: r21, r22 -rotr32: - cpi r20, 8 - brlo bitrotr - mov r21, r22 - mov r22, r23 - mov r23, r24 - mov r24, r25 - mov r25, r21 - subi r20, 8 - rjmp rotr32 -bitrotr: - clr r21 - clc -bitrotr_loop: - tst r20 - breq fixrotr - ror r25 - ror r24 - ror r23 - ror r22 - ror r21 - dec r20 - rjmp bitrotr_loop -fixrotr: - or r25, r21 - ret - - -;########################################################### - -.global change_endian32 -; === change_endian32 === -; function that changes the endianess of a 32-bit word -; param1: the 32-bit word -; given in r25,r24,r23,22 (r25 is most significant) -; modifys: r21, r22 -change_endian32: - movw r20, r22 ; (r22,r23) --> (r20,r21) - mov r22, r25 - mov 
r23, r24 - mov r24, r21 - mov r25, r20 - ret -*/ +/* + * Author: Daniel Otte + * + * License: GPLv3 or later +*/ +; SHA1 implementation in assembler for AVR +SHA1_BLOCK_BITS = 512 +SHA1_HASH_BITS = 160 + +.macro precall + /* push r18 - r27, r30 - r31*/ + push r0 + push r1 + push r18 + push r19 + push r20 + push r21 + push r22 + push r23 + push r24 + push r25 + push r26 + push r27 + push r30 + push r31 + clr r1 +.endm + +.macro postcall + pop r31 + pop r30 + pop r27 + pop r26 + pop r25 + pop r24 + pop r23 + pop r22 + pop r21 + pop r20 + pop r19 + pop r18 + pop r1 + pop r0 +.endm + + +.macro hexdump length + push r27 + push r26 + ldi r25, '\r' + mov r24, r25 + call uart_putc + ldi r25, '\n' + mov r24, r25 + call uart_putc + pop r26 + pop r27 + movw r24, r26 +.if \length > 16 + ldi r22, lo8(16) + ldi r23, hi8(16) + push r27 + push r26 + call uart_hexdump + pop r26 + pop r27 + adiw r26, 16 + hexdump \length-16 +.else + ldi r22, lo8(\length) + ldi r23, hi8(\length) + call uart_hexdump +.endif +.endm + +.macro delay +/* + push r0 + push r1 + clr r0 +1: clr r1 +2: dec r1 + brne 2b + dec r0 + brne 1b + pop r1 + pop r0 // */ +.endm + +/* X points to Block */ +.macro dbg_hexdump length +/* + precall + hexdump \length + postcall + // */ +.endm + + + +.section .text + +SPL = 0x3D +SPH = 0x3E +SREG = 0x3F + + +; +;sha1_ctx_t is: +; +; [h0][h1][h2][h3][h4][length] +; hn is 32 bit large, length is 64 bit large + +;########################################################### + +.global sha1_ctx2hash +; === sha1_ctx2hash === +; this function converts a state into a normal hash (bytestring) +; param1: the 16-bit destination pointer +; given in r25,r24 (r25 is most significant) +; param2: the 16-bit pointer to sha1_ctx structure +; given in r23,r22 +sha1_ctx2hash: + movw r26, r22 + movw r30, r24 + ldi r21, 5 + sbiw r26, 4 +1: + ldi r20, 4 + adiw r26, 8 +2: + ld r0, -X + st Z+, r0 + dec r20 + brne 2b + + dec r21 + brne 1b + + ret + 
+;########################################################### + +.global sha1 +; === sha1 === +; this function calculates SHA-1 hashes from messages in RAM +; param1: the 16-bit hash destination pointer +; given in r25,r24 (r25 is most significant) +; param2: the 16-bit pointer to message +; given in r23,r22 +; param3: 32-bit length value (length of message in bits) +; given in r21,r20,r19,r18 +sha1: +sha1_prolog: + push r8 + push r9 + push r10 + push r11 + push r12 + push r13 + push r16 + push r17 + in r16, SPL + in r17, SPH + subi r16, 5*4+8 + sbci r17, 0 + in r0, SREG + cli + out SPL, r16 + out SPH, r17 + out SREG, r0 + + push r25 + push r24 + inc r16 + adc r17, r1 + + movw r8, r18 /* backup of length*/ + movw r10, r20 + + movw r12, r22 /* backup pf msg-ptr */ + + movw r24, r16 + rcall sha1_init + /* if length >= 512 */ +1: + tst r11 + brne 4f + tst r10 + brne 4f + mov r19, r9 + cpi r19, 0x02 + brlo 4f + + movw r24, r16 + movw r22, r12 + rcall sha1_nextBlock + ldi r19, 0x64 + add r22, r19 + adc r23, r1 + /* length -= 512 */ + ldi r19, 0x02 + sub r9, r19 + sbc r10, r1 + sbc r11, r1 + rjmp 1b + +4: + movw r24, r16 + movw r22, r12 + movw r20, r8 + rcall sha1_lastBlock + + pop r24 + pop r25 + movw r22, r16 + rcall sha1_ctx2hash + +sha1_epilog: + in r30, SPL + in r31, SPH + adiw r30, 5*4+8 + in r0, SREG + cli + out SPL, r30 + out SPH, r31 + out SREG, r0 + pop r17 + pop r16 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop r8 + ret + +;########################################################### + + +; block MUST NOT be larger than 64 bytes + +.global sha1_lastBlock +; === sha1_lastBlock === +; this function does padding & Co. 
for calculating SHA-1 hashes +; param1: the 16-bit pointer to sha1_ctx structure +; given in r25,r24 (r25 is most significant) +; param2: an 16-bit pointer to 64 byte block to hash +; given in r23,r22 +; param3: an 16-bit integer specifing length of block in bits +; given in r21,r20 +sha1_lastBlock_localSpace = (SHA1_BLOCK_BITS/8+1) + + +sha1_lastBlock: + tst r20 + brne sha1_lastBlock_prolog + cpi r21, 0x02 + brne sha1_lastBlock_prolog + push r25 + push r24 + push r23 + push r22 + rcall sha1_nextBlock + pop r22 + pop r23 + pop r24 + pop r25 + clr r21 + clr r22 +sha1_lastBlock_prolog: + /* allocate space on stack */ + in r30, SPL + in r31, SPH + in r1, SREG + subi r30, lo8(64) + sbci r31, hi8(64) /* ??? */ + cli + out SPL, r30 + out SPH, r31 + out SREG,r1 + + adiw r30, 1 /* SP points to next free byte on stack */ + mov r18, r20 /* r20 = LSB(length) */ + lsr r18 + lsr r18 + lsr r18 + bst r21, 0 /* may be we should explain this ... */ + bld r18, 5 /* now: r18 == length/8 (aka. length in bytes) */ + + + movw r26, r22 /* X points to begin of msg */ + tst r18 + breq sha1_lastBlock_post_copy + mov r1, r18 +sha1_lastBlock_copy_loop: + ld r0, X+ + st Z+, r0 + dec r1 + brne sha1_lastBlock_copy_loop +sha1_lastBlock_post_copy: +sha1_lastBlock_insert_stuffing_bit: + ldi r19, 0x80 + mov r0,r19 + ldi r19, 0x07 + and r19, r20 /* if we are in bitmode */ + breq 2f /* no bitmode */ +1: + lsr r0 + dec r19 + brne 1b + ld r19, X +/* maybe we should do some ANDing here, just for safety */ + or r0, r19 +2: + st Z+, r0 + inc r18 + +/* checking stuff here */ + cpi r18, 64-8+1 + brsh 0f + rjmp sha1_lastBlock_insert_zeros +0: + /* oh shit, we landed here */ + /* first we have to fill it up with zeros */ + ldi r19, 64 + sub r19, r18 + breq 2f +1: + st Z+, r1 + dec r19 + brne 1b +2: + sbiw r30, 63 + sbiw r30, 1 + movw r22, r30 + + push r31 + push r30 + push r25 + push r24 + push r21 + push r20 + rcall sha1_nextBlock + pop r20 + pop r21 + pop r24 + pop r25 + pop r30 + pop r31 + + /* now we 
should subtract 512 from length */ + movw r26, r24 + adiw r26, 4*5+1 /* we can skip the lowest byte */ + ld r19, X + subi r19, hi8(512) + st X+, r19 + ldi r18, 6 +1: + ld r19, X + sbci r19, 0 + st X+, r19 + dec r18 + brne 1b + +; clr r18 /* not neccessary ;-) */ + /* reset Z pointer to begin of block */ + +sha1_lastBlock_insert_zeros: + ldi r19, 64-8 + sub r19, r18 + breq sha1_lastBlock_insert_length + clr r1 +1: + st Z+, r1 /* r1 is still zero */ + dec r19 + brne 1b + +; rjmp sha1_lastBlock_epilog +sha1_lastBlock_insert_length: + movw r26, r24 /* X points to state */ + adiw r26, 5*4 /* X points to (state.length) */ + adiw r30, 8 /* Z points one after the last byte of block */ + ld r0, X+ + add r0, r20 + st -Z, r0 + ld r0, X+ + adc r0, r21 + st -Z, r0 + ldi r19, 6 +1: + ld r0, X+ + adc r0, r1 + st -Z, r0 + dec r19 + brne 1b + + sbiw r30, 64-8 + movw r22, r30 + rcall sha1_nextBlock + +sha1_lastBlock_epilog: + in r30, SPL + in r31, SPH + in r1, SREG + adiw r30, 63 ; lo8(64) + adiw r30, 1 ; hi8(64) + cli + out SPL, r30 + out SPH, r31 + out SREG,r1 + clr r1 + clr r0 + ret + +/**/ +;########################################################### + +.global sha1_nextBlock +; === sha1_nextBlock === +; this is the core function for calculating SHA-1 hashes +; param1: the 16-bit pointer to sha1_ctx structure +; given in r25,r24 (r25 is most significant) +; param2: an 16-bit pointer to 64 byte block to hash +; given in r23,r22 +sha1_nextBlock_localSpace = (16+5+1)*4 ; 16 32-bit values for w array and 5 32-bit values for a array (total 84 byte) + +xtmp = 0 +xNULL = 1 +W1 = 10 +W2 = 11 +T1 = 12 +T2 = 13 +T3 = 14 +T4 = 15 +LoopC = 16 +S = 17 +tmp1 = 18 +tmp2 = 19 +tmp3 = 20 +tmp4 = 21 +F1 = 22 +F2 = 23 +F3 = 24 +F4 = 25 + +/* byteorder: high number <--> high significance */ +sha1_nextBlock: + ; initial, let's make some space ready for local vars + /* replace push & pop by mem ops? 
*/ + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push r16 + push r17 + push r28 + push r29 + in r20, SPL + in r21, SPH + movw r18, r20 ;backup SP +; movw r26, r20 ; X points to free space on stack /* maybe removeable? */ + movw r30, r22 ; Z points to message + subi r20, lo8(sha1_nextBlock_localSpace) ;sbiw can do only up to 63 + sbci r21, hi8(sha1_nextBlock_localSpace) + movw r26, r20 ; X points to free space on stack + in r0, SREG + cli ; we want to be uninterrupted while updating SP + out SPL, r20 + out SPH, r21 + out SREG, r0 + + push r18 + push r19 /* push old SP on new stack */ + push r24 + push r25 /* param1 will be needed later */ + + /* load a[] with state */ + movw 28, r24 /* load pointer to state in Y */ + adiw r26, 1 ; X++ + + ldi LoopC, 5*4 +1: ld tmp1, Y+ + st X+, tmp1 + dec LoopC + brne 1b + + movw W1, r26 /* save pointer to w[0] */ + /* load w[] with endian fixed message */ + /* we might also use the changeendian32() function at bottom */ + movw r30, r22 /* mv param2 (ponter to msg) to Z */ + ldi LoopC, 16 +1: + ldd tmp1, Z+3 + st X+, tmp1 + ldd tmp1, Z+2 + st X+, tmp1 + ldd tmp1, Z+1 + st X+, tmp1 + ld tmp1, Z + st X+, tmp1 + adiw r30, 4 + dec LoopC + brne 1b + + ;clr LoopC /* LoopC is named t in FIPS 180-2 */ + clr xtmp +sha1_nextBlock_mainloop: + mov S, LoopC + lsl S + lsl S + andi S, 0x3C /* S is a bytepointer so *4 */ + /* load w[s] */ + movw r26, W1 + add r26, S /* X points at w[s] */ + adc r27, xNULL + ld T1, X+ + ld T2, X+ + ld T3, X+ + ld T4, X+ + + /**/ + push r26 + push r27 + push T4 + push T3 + push T2 + push T1 + in r26, SPL + in r27, SPH + adiw r26, 1 + dbg_hexdump 4 + pop T1 + pop T2 + pop T3 + pop T4 + pop r27 + pop r26 + /**/ + + cpi LoopC, 16 + brlt sha1_nextBlock_mainloop_core + /* update w[s] */ + ldi tmp1, 2*4 + rcall 1f + ldi tmp1, 8*4 + rcall 1f + ldi tmp1, 13*4 + rcall 1f + rjmp 2f +1: /* this might be "outsourced" to save the jump above */ + add tmp1, S + andi tmp1, 0x3f + movw r26, W1 + add r26, tmp1 + 
adc r27, xNULL + ld tmp2, X+ + eor T1, tmp2 + ld tmp2, X+ + eor T2, tmp2 + ld tmp2, X+ + eor T3, tmp2 + ld tmp2, X+ + eor T4, tmp2 + ret +2: /* now we just hav to do a ROTL(T) and save T back */ + mov tmp2, T4 + rol tmp2 + rol T1 + rol T2 + rol T3 + rol T4 + movw r26, W1 + add r26, S + adc r27, xNULL + st X+, T1 + st X+, T2 + st X+, T3 + st X+, T4 + +sha1_nextBlock_mainloop_core: /* ther core function; T=ROTL5(a) ....*/ + /* T already contains w[s] */ + movw r26, W1 + sbiw r26, 4*1 /* X points at a[4] aka e */ + ld tmp1, X+ + add T1, tmp1 + ld tmp1, X+ + adc T2, tmp1 + ld tmp1, X+ + adc T3, tmp1 + ld tmp1, X+ + adc T4, tmp1 /* T = w[s]+e */ + sbiw r26, 4*5 /* X points at a[0] aka a */ + ld F1, X+ + ld F2, X+ + ld F3, X+ + ld F4, X+ + mov tmp1, F4 /* X points at a[1] aka b */ + ldi tmp2, 5 +1: + rol tmp1 + rol F1 + rol F2 + rol F3 + rol F4 + dec tmp2 + brne 1b + + add T1, F1 + adc T2, F2 + adc T3, F3 + adc T4, F4 /* T = ROTL(a,5) + e + w[s] */ + + /* now we have to do this fucking conditional stuff */ + ldi r30, lo8(sha1_nextBlock_xTable) + ldi r31, hi8(sha1_nextBlock_xTable) + add r30, xtmp + adc r31, xNULL + lpm tmp1, Z + cp tmp1, LoopC + brne 1f + inc xtmp +1: ldi r30, lo8(sha1_nextBlock_KTable) + ldi r31, hi8(sha1_nextBlock_KTable) + lsl xtmp + lsl xtmp + add r30, xtmp + adc r31, xNULL + lsr xtmp + lsr xtmp + + lpm tmp1, Z+ + add T1, tmp1 + lpm tmp1, Z+ + adc T2, tmp1 + lpm tmp1, Z+ + adc T3, tmp1 + lpm tmp1, Z+ + adc T4, tmp1 + /* T = ROTL(a,5) + e + kt + w[s] */ + + /* wo Z-4 gerade auf kt zeigt ... 
*/ + movw r28, r26 /* copy X in Y */ + adiw r30, 3*4 /* now Z points to the rigth locatin in our jump-vector-table */ + lsr r31 + ror r30 + + icall + mov F1, tmp1 + icall + mov F2, tmp1 + icall + mov F3, tmp1 + icall + + add T1, F1 + adc T2, F2 + adc T3, F3 + adc T4, tmp1 /* T = ROTL5(a) + f_t(b,c,d) + e + k_t + w[s] */ + /* X points still at a[1] aka b, Y points at a[2] aka c */ + /* update a[] */ +sha1_nextBlock_update_a: + /*first we move all vars in a[] "one up" e=d, d=c, c=b, b=a*/ + //adiw r28, 3*4 /* Y should point at a[4] aka e */ + movw r28, W1 + sbiw r28, 4 + + ldi tmp2, 4*4 +1: + ld tmp1, -Y + std Y+4, tmp1 + dec tmp2 + brne 1b + /* Y points at a[0] aka a*/ + + movw r28, W1 + sbiw r28, 5*4 + /* store T in a[0] aka a */ + st Y+, T1 + st Y+, T2 + st Y+, T3 + st Y+, T4 + /* Y points at a[1] aka b*/ + + /* rotate c */ + ldd T1, Y+1*4 + ldd T2, Y+1*4+1 + ldd T3, Y+1*4+2 + ldd T4, Y+1*4+3 + mov tmp1, T1 + ldi tmp2, 2 +1: ror tmp1 + ror T4 + ror T3 + ror T2 + ror T1 + dec tmp2 + brne 1b + std Y+1*4+0, T1 + std Y+1*4+1, T2 + std Y+1*4+2, T3 + std Y+1*4+3, T4 + + push r27 + push r26 + movw r26, W1 + sbiw r26, 4*5 + dbg_hexdump 4*5 + pop r26 + pop r27 + + inc LoopC + cpi LoopC, 80 + brge 1f + jmp sha1_nextBlock_mainloop +/**************************************/ +1: + /* littel patch */ + sbiw r28, 4 + +/* add a[] to state and inc length */ + pop r27 + pop r26 /* now X points to state (and Y still at a[0]) */ + ldi tmp4, 5 +1: clc + ldi tmp3, 4 +2: ld tmp1, X + ld tmp2, Y+ + adc tmp1, tmp2 + st X+, tmp1 + dec tmp3 + brne 2b + dec tmp4 + brne 1b + + /* now length += 512 */ + adiw r26, 1 /* we skip the least significant byte */ + ld tmp1, X + ldi tmp2, hi8(512) /* 2 */ + add tmp1, tmp2 + st X+, tmp1 + ldi tmp2, 6 +1: + ld tmp1, X + adc tmp1, xNULL + st X+, tmp1 + dec tmp2 + brne 1b + +; EPILOG +sha1_nextBlock_epilog: +/* now we should clean up the stack */ + pop r21 + pop r20 + in r0, SREG + cli ; we want to be uninterrupted while updating SP + out SPL, r20 + out 
SPH, r21 + out SREG, r0 + + clr r1 + pop r29 + pop r28 + pop r17 + pop r16 + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + ret + +sha1_nextBlock_xTable: +.byte 20,40,60,0 +sha1_nextBlock_KTable: +.int 0x5a827999 +.int 0x6ed9eba1 +.int 0x8f1bbcdc +.int 0xca62c1d6 +sha1_nextBlock_JumpTable: +jmp sha1_nextBlock_Ch +jmp sha1_nextBlock_Parity +jmp sha1_nextBlock_Maj +jmp sha1_nextBlock_Parity + + /* X and Y still point at a[1] aka b ; return value in tmp1 */ +sha1_nextBlock_Ch: + ld tmp1, Y+ + mov tmp2, tmp1 + com tmp2 + ldd tmp3, Y+3 /* load from c */ + and tmp1, tmp3 + ldd tmp3, Y+7 /* load from d */ + and tmp2, tmp3 + eor tmp1, tmp2 + /** + precall + ldi r24, lo8(ch_str) + ldi r25, hi8(ch_str) + call uart_putstr_P + postcall + /**/ + ret + +sha1_nextBlock_Maj: + ld tmp1, Y+ + mov tmp2, tmp1 + ldd tmp3, Y+3 /* load from c */ + and tmp1, tmp3 + ldd tmp4, Y+7 /* load from d */ + and tmp2, tmp4 + eor tmp1, tmp2 + and tmp3, tmp4 + eor tmp1, tmp3 + /** + precall + ldi r24, lo8(maj_str) + ldi r25, hi8(maj_str) + call uart_putstr_P + postcall + /**/ + ret + +sha1_nextBlock_Parity: + ld tmp1, Y+ + ldd tmp2, Y+3 /* load from c */ + eor tmp1, tmp2 + ldd tmp2, Y+7 /* load from d */ + eor tmp1, tmp2 + + /** + precall + ldi r24, lo8(parity_str) + ldi r25, hi8(parity_str) + call uart_putstr_P + postcall + /**/ + ret +/* +ch_str: .asciz "\r\nCh" +maj_str: .asciz "\r\nMaj" +parity_str: .asciz "\r\nParity" +*/ +;########################################################### + +.global sha1_init +;void sha1_init(sha1_ctx_t *state){ +; DEBUG_S("\r\nSHA1_INIT"); +; state->h[0] = 0x67452301; +; state->h[1] = 0xefcdab89; +; state->h[2] = 0x98badcfe; +; state->h[3] = 0x10325476; +; state->h[4] = 0xc3d2e1f0; +; state->length = 0; +;} +; param1: (Func3,r24) 16-bit pointer to sha1_ctx_t struct in ram +; modifys: Z(r30,r31), Func1, r22 +sha1_init: + movw r26, r24 ; (24,25) --> (26,27) load X with param1 + ldi r30, lo8((sha1_init_vector)) + ldi r31, hi8((sha1_init_vector)) + ldi r22, 
5*4 /* bytes to copy */ +sha1_init_vloop: + lpm r23, Z+ + st X+, r23 + dec r22 + brne sha1_init_vloop + ldi r22, 8 + clr r1 /* this should not be needed */ +sha1_init_lloop: + st X+, r1 + dec r22 + brne sha1_init_lloop + ret + +sha1_init_vector: +.int 0x67452301; +.int 0xefcdab89; +.int 0x98badcfe; +.int 0x10325476; +.int 0xc3d2e1f0; +/* +;########################################################### + +.global rotl32 +; === ROTL32 === +; function that rotates a 32 bit word to the left +; param1: the 32-bit word to rotate +; given in r25,r24,r23,r22 (r25 is most significant) +; param2: an 8-bit value telling how often to rotate +; given in r20 +; modifys: r21, r22 +rotl32: + cpi r20, 8 + brlo bitrotl + mov r21, r25 + mov r25, r24 + mov r24, r23 + mov r23, r22 + mov r22, r21 + subi r20, 8 + rjmp rotr32 +bitrotl: + clr r21 + clc +bitrotl_loop: + tst r20 + breq fixrotl + rol r22 + rol r23 + rol r24 + rol r25 + rol r21 + dec r20 + rjmp bitrotl_loop +fixrotl: + or r22, r21 + ret + + +;########################################################### + +.global rotr32 +; === ROTR32 === +; function that rotates a 32 bit word to the right +; param1: the 32-bit word to rotate +; given in r25,r24,r23,22 (r25 is most significant) +; param2: an 8-bit value telling how often to rotate +; given in r20 +; modifys: r21, r22 +rotr32: + cpi r20, 8 + brlo bitrotr + mov r21, r22 + mov r22, r23 + mov r23, r24 + mov r24, r25 + mov r25, r21 + subi r20, 8 + rjmp rotr32 +bitrotr: + clr r21 + clc +bitrotr_loop: + tst r20 + breq fixrotr + ror r25 + ror r24 + ror r23 + ror r22 + ror r21 + dec r20 + rjmp bitrotr_loop +fixrotr: + or r25, r21 + ret + + +;########################################################### + +.global change_endian32 +; === change_endian32 === +; function that changes the endianess of a 32-bit word +; param1: the 32-bit word +; given in r25,r24,r23,22 (r25 is most significant) +; modifys: r21, r22 +change_endian32: + movw r20, r22 ; (r22,r23) --> (r20,r21) + mov r22, r25 + mov 
r23, r24 + mov r24, r21 + mov r25, r20 + ret +*/ diff --git a/sha256-asm.S b/sha256-asm.S index 392bf42..403506e 100644 --- a/sha256-asm.S +++ b/sha256-asm.S @@ -16,1028 +16,1028 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . */ -/* - * Author: Daniel Otte - * - * License: GPL -*/ -; sha-256 implementation in assembler -SHA256_BLOCK_BITS = 512 -SHA256_HASH_BITS = 256 - -.macro precall - /* push r18 - r27, r30 - r31*/ - push r0 - push r1 - push r18 - push r19 - push r20 - push r21 - push r22 - push r23 - push r24 - push r25 - push r26 - push r27 - push r30 - push r31 - clr r1 -.endm - -.macro postcall - pop r31 - pop r30 - pop r27 - pop r26 - pop r25 - pop r24 - pop r23 - pop r22 - pop r21 - pop r20 - pop r19 - pop r18 - pop r1 - pop r0 -.endm - - -.macro hexdump length - push r27 - push r26 - ldi r25, '\r' - mov r24, r25 - call uart_putc - ldi r25, '\n' - mov r24, r25 - call uart_putc - pop r26 - pop r27 - movw r24, r26 -.if \length > 16 - ldi r22, lo8(16) - ldi r23, hi8(16) - push r27 - push r26 - call uart_hexdump - pop r26 - pop r27 - adiw r26, 16 - hexdump \length-16 -.else - ldi r22, lo8(\length) - ldi r23, hi8(\length) - call uart_hexdump -.endif -.endm - -/* X points to Block */ -.macro dbg_hexdump length - precall - hexdump \length - postcall -.endm - -.section .text - -SPL = 0x3D -SPH = 0x3E -SREG = 0x3F - - -; -;sha256_ctx_t is: -; -; [h0][h1][h2][h3][h4][h5][h6][h7][length] -; hn is 32 bit large, length is 64 bit large - -;########################################################### - -.global sha256_ctx2hash -; === sha256_ctx2hash === -; this function converts a state into a normal hash (bytestring) -; param1: the 16-bit destination pointer -; given in r25,r24 (r25 is most significant) -; param2: the 16-bit pointer to sha256_ctx structure -; given in r23,r22 -sha256_ctx2hash: - movw r26, r22 - movw r30, r24 - ldi r21, 8 - sbiw r26, 4 -1: - ldi r20, 4 - adiw r26, 8 -2: - ld r0, -X - st Z+, r0 - 
dec r20 - brne 2b - - dec r21 - brne 1b - - ret - -;########################################################### - -.global sha256 -; === sha256 === -; this function calculates SHA-256 hashes from messages in RAM -; param1: the 16-bit hash destination pointer -; given in r25,r24 (r25 is most significant) -; param2: the 16-bit pointer to message -; given in r23,r22 -; param3: 32-bit length value (length of message in bits) -; given in r21,r20,r19,r18 -sha256: -sha256_prolog: - push r8 - push r9 - push r10 - push r11 - push r12 - push r13 - push r16 - push r17 - in r16, SPL - in r17, SPH - subi r16, 8*4+8 - sbci r17, 0 - in r0, SREG - cli - out SPL, r16 - out SPH, r17 - out SREG, r0 - - push r25 - push r24 - inc r16 - adc r17, r1 - - movw r8, r18 /* backup of length*/ - movw r10, r20 - - movw r12, r22 /* backup pf msg-ptr */ - - movw r24, r16 - rcall sha256_init - /* if length >= 512 */ -1: - tst r11 - brne 4f - tst r10 - brne 4f - mov r19, r9 - cpi r19, 0x02 - brlo 4f - - movw r24, r16 - movw r22, r12 - rcall sha256_nextBlock - ldi r19, 0x64 - add r22, r19 - adc r23, r1 - /* length -= 512 */ - ldi r19, 0x02 - sub r9, r19 - sbc r10, r1 - sbc r11, r1 - rjmp 1b - -4: - movw r24, r16 - movw r22, r12 - movw r20, r8 - rcall sha256_lastBlock - - pop r24 - pop r25 - movw r22, r16 - rcall sha256_ctx2hash - -sha256_epilog: - in r30, SPL - in r31, SPH - adiw r30, 8*4+8 - in r0, SREG - cli - out SPL, r30 - out SPH, r31 - out SREG, r0 - pop r17 - pop r16 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop r8 - ret - -;########################################################### - - -; block MUST NOT be larger than 64 bytes - -.global sha256_lastBlock -; === sha256_lastBlock === -; this function does padding & Co. 
for calculating SHA-256 hashes -; param1: the 16-bit pointer to sha256_ctx structure -; given in r25,r24 (r25 is most significant) -; param2: an 16-bit pointer to 64 byte block to hash -; given in r23,r22 -; param3: an 16-bit integer specifing length of block in bits -; given in r21,r20 -sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1) - - -sha256_lastBlock: - tst r20 - brne sha256_lastBlock_prolog - cpi r21, 0x02 - brne sha256_lastBlock_prolog - push r25 - push r24 - push r23 - push r22 - rcall sha256_nextBlock - pop r22 - pop r23 - pop r24 - pop r25 - clr r21 - clr r22 -sha256_lastBlock_prolog: - /* allocate space on stack */ - in r30, SPL - in r31, SPH - in r1, SREG - subi r30, lo8(64) - sbci r31, hi8(64) - cli - out SPL, r30 - out SPH, r31 - out SREG,r1 - - adiw r30, 1 /* SP points to next free byte on stack */ - mov r18, r20 /* r20 = LSB(length) */ - lsr r18 - lsr r18 - lsr r18 - bst r21, 0 /* may be we should explain this ... */ - bld r18, 5 /* now: r18 == length/8 (aka. length in bytes) */ - - - movw r26, r22 /* X points to begin of msg */ - tst r18 - breq sha256_lastBlock_post_copy - mov r1, r18 -sha256_lastBlock_copy_loop: - ld r0, X+ - st Z+, r0 - dec r1 - brne sha256_lastBlock_copy_loop -sha256_lastBlock_post_copy: -sha256_lastBlock_insert_stuffing_bit: - ldi r19, 0x80 - mov r0,r19 - ldi r19, 0x07 - and r19, r20 /* if we are in bitmode */ - breq 2f /* no bitmode */ -1: - lsr r0 - dec r19 - brne 1b - ld r19, X -/* maybe we should do some ANDing here, just for safety */ - or r0, r19 -2: - st Z+, r0 - inc r18 - -/* checking stuff here */ - cpi r18, 64-8+1 - brsh 0f - rjmp sha256_lastBlock_insert_zeros -0: - /* oh shit, we landed here */ - /* first we have to fill it up with zeros */ - ldi r19, 64 - sub r19, r18 - breq 2f -1: - st Z+, r1 - dec r19 - brne 1b -2: - sbiw r30, 63 - sbiw r30, 1 - movw r22, r30 - - push r31 - push r30 - push r25 - push r24 - push r21 - push r20 - rcall sha256_nextBlock - pop r20 - pop r21 - pop r24 - pop r25 - pop r30 - pop 
r31 - - /* now we should subtract 512 from length */ - movw r26, r24 - adiw r26, 4*8+1 /* we can skip the lowest byte */ - ld r19, X - subi r19, hi8(512) - st X+, r19 - ldi r18, 6 -1: - ld r19, X - sbci r19, 0 - st X+, r19 - dec r18 - brne 1b - -; clr r18 /* not neccessary ;-) */ - /* reset Z pointer to begin of block */ - -sha256_lastBlock_insert_zeros: - ldi r19, 64-8 - sub r19, r18 - breq sha256_lastBlock_insert_length - clr r1 -1: - st Z+, r1 /* r1 is still zero */ - dec r19 - brne 1b - -; rjmp sha256_lastBlock_epilog -sha256_lastBlock_insert_length: - movw r26, r24 /* X points to state */ - adiw r26, 8*4 /* X points to (state.length) */ - adiw r30, 8 /* Z points one after the last byte of block */ - ld r0, X+ - add r0, r20 - st -Z, r0 - ld r0, X+ - adc r0, r21 - st -Z, r0 - ldi r19, 6 -1: - ld r0, X+ - adc r0, r1 - st -Z, r0 - dec r19 - brne 1b - - sbiw r30, 64-8 - movw r22, r30 - rcall sha256_nextBlock - -sha256_lastBlock_epilog: - in r30, SPL - in r31, SPH - in r1, SREG - adiw r30, 63 ; lo8(64) - adiw r30, 1 ; hi8(64) - cli - out SPL, r30 - out SPH, r31 - out SREG,r1 - clr r1 - clr r0 - ret - -/**/ -;########################################################### - -.global sha256_nextBlock -; === sha256_nextBlock === -; this is the core function for calculating SHA-256 hashes -; param1: the 16-bit pointer to sha256_ctx structure -; given in r25,r24 (r25 is most significant) -; param2: an 16-bit pointer to 64 byte block to hash -; given in r23,r22 -sha256_nextBlock_localSpace = (64+8)*4 ; 64 32-bit values for w array and 8 32-bit values for a array (total 288 byte) - -Bck1 = 12 -Bck2 = 13 -Bck3 = 14 -Bck4 = 15 -Func1 = 22 -Func2 = 23 -Func3 = 24 -Func4 = 25 -Accu1 = 16 -Accu2 = 17 -Accu3 = 18 -Accu4 = 19 -XAccu1 = 8 -XAccu2 = 9 -XAccu3 = 10 -XAccu4 = 11 -T1 = 4 -T2 = 5 -T3 = 6 -T4 = 7 -LoopC = 1 -/* byteorder: high number <--> high significance */ -sha256_nextBlock: - ; initial, let's make some space ready for local vars - push r4 /* replace push & pop by mem 
ops? */ - push r5 - push r6 - push r7 - push r8 - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 - push r15 - push r16 - push r17 - push r28 - push r29 - in r20, SPL - in r21, SPH - movw r18, r20 ;backup SP -; movw r26, r20 ; X points to free space on stack - movw r30, r22 ; Z points to message - subi r20, lo8(sha256_nextBlock_localSpace) ;sbiw can do only up to 63 - sbci r21, hi8(sha256_nextBlock_localSpace) - movw r26, r20 ; X points to free space on stack - in r0, SREG - cli ; we want to be uninterrupted while updating SP - out SPL, r20 - out SPH, r21 - out SREG, r0 - push r18 - push r19 - push r24 - push r25 /* param1 will be needed later */ - ; now we fill the w array with message (think about endianess) - adiw r26, 1 ; X++ - ldi r20, 16 -sha256_nextBlock_wcpyloop: - ld r23, Z+ - ld r22, Z+ - ld r19, Z+ - ld r18, Z+ - st X+, r18 - st X+, r19 - st X+, r22 - st X+, r23 - dec r20 - brne sha256_nextBlock_wcpyloop -/* for (i=16; i<64; ++i){ - w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16]; - } */ - /* r25,r24,r23,r24 (r21,r20) are function values - r19,r18,r17,r16 are the accumulator - r15,r14,r13,rBck1 are backup1 - r11,r10,r9 ,r8 are xor accu - r1 is round counter */ - - ldi r20, 64-16 - mov LoopC, r20 -sha256_nextBlock_wcalcloop: - movw r30, r26 ; cp X to Z - sbiw r30, 63 - sbiw r30, 1 ; substract 64 = 16*4 - ld Accu1, Z+ - ld Accu2, Z+ - ld Accu3, Z+ - ld Accu4, Z+ /* w[i] = w[i-16] */ - ld Bck1, Z+ - ld Bck2, Z+ - ld Bck3, Z+ - ld Bck4, Z+ /* backup = w[i-15] */ - /* now sigma 0 */ - mov Func1, Bck2 - mov Func2, Bck3 - mov Func3, Bck4 - mov Func4, Bck1 /* prerotated by 8 */ - ldi r20, 1 - rcall bitrotl - movw XAccu1, Func1 - movw XAccu3, Func3 /* store ROTR(w[i-15],7) in xor accu */ - movw Func1, Bck3 - movw Func3, Bck1 /* prerotated by 16 */ - ldi r20, 2 - rcall bitrotr - eor XAccu1, Func1 /* xor ROTR(w[i-15], 18)*/ - eor XAccu2, Func2 - eor XAccu3, Func3 - eor XAccu4, Func4 - ldi Func2, 3 /* now shr3 */ /*we can destroy backup 
now*/ -sigma0_shr: - lsr Bck4 - ror Bck3 - ror Bck2 - ror Bck1 - dec Func2 - brne sigma0_shr - eor XAccu1, Bck1 - eor XAccu2, Bck2 - eor XAccu3, Bck3 - eor XAccu4, Bck4 /* xor SHR(w[i-15], 3)*/ /* xor accu == sigma1(w[i-15]) */ - add Accu1, XAccu1 - adc Accu2, XAccu2 - adc Accu3, XAccu3 - adc Accu4, XAccu4 /* finished with sigma0 */ - ldd Func1, Z+7*4 /* now accu += w[i-7] */ - ldd Func2, Z+7*4+1 - ldd Func3, Z+7*4+2 - ldd Func4, Z+7*4+3 - add Accu1, Func1 - adc Accu2, Func2 - adc Accu3, Func3 - adc Accu4, Func4 - ldd Bck1, Z+12*4 /* now backup = w[i-2]*/ - ldd Bck2, Z+12*4+1 - ldd Bck3, Z+12*4+2 - ldd Bck4, Z+12*4+3 - /* now sigma 1 */ - movw Func1, Bck3 - movw Func3, Bck1 /* prerotated by 16 */ - ldi r20, 1 - rcall bitrotr - movw XAccu3, Func3 - movw XAccu1, Func1 /* store in ROTR(w[i-2], 17) xor accu */ -; movw Func1, Bck3 -; movw Func3, Bck1 /* prerotated by 16 */ - ldi r20, 2 - rcall bitrotr - eor XAccu1, Func1 /* xor ROTR(w[i-2], 19)*/ - eor XAccu2, Func2 - eor XAccu3, Func3 - eor XAccu4, Func4 - ldi Func2, 2 /* now shr10 (dirty trick, skipping a byte) */ /*we can destroy backup now*/ -sigma1_shr: - lsr Bck4 - ror Bck3 - ror Bck2 - dec Func2 - brne sigma1_shr - eor XAccu1, Bck2 - eor XAccu2, Bck3 - eor XAccu3, Bck4 /* xor SHR(w[i-2], 10)*/ /* xor accu == sigma1(w[i-15]) */ - add Accu1, XAccu1 - adc Accu2, XAccu2 - adc Accu3, XAccu3 - adc Accu4, XAccu4 /* finished with sigma0 */ - /* now let's store the shit */ - st X+, Accu1 - st X+, Accu2 - st X+, Accu3 - st X+, Accu4 - dec LoopC - breq 3f ; skip if zero - rjmp sha256_nextBlock_wcalcloop -3: - /* we are finished with w array X points one byte post w */ -/* init a array */ - pop r31 - pop r30 - push r30 - push r31 - ldi r25, 8*4 /* 8 32-bit values to copy from ctx to a array */ -init_a_array: - ld r1, Z+ - st X+, r1 - dec r25 - brne init_a_array - -/* now the real fun begins */ -/* for (i=0; i<64; ++i){ - t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i]; - t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]); - 
memmove(&(a[1]), &(a[0]), 7*4); // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; - a[4] += t1; - a[0] = t1 + t2; - } */ - /* Y points to a[0], Z ('cause lpm wants it) points to k[i], X points to w[i] */ - sbiw r26, 8*4 /* X still points at a[7]+1*/ - movw r28, r26 - ldi r30, lo8(sha256_kv) - ldi r31, hi8(sha256_kv) - dec r27 /* X - (64*4 == 256) */ - ldi r25, 64 - mov LoopC, r25 -sha256_main_loop: - /* now calculate t1 */ - /*CH(x,y,z) = (x&y)^((~x)&z)*/ - ldd T1, Y+5*4 - ldd T2, Y+5*4+1 - ldd T3, Y+5*4+2 - ldd T4, Y+5*4+3 /* y in T */ - ldd Func1, Y+4*4 - ldd Func2, Y+4*4+1 - ldd Func3, Y+4*4+2 - ldd Func4, Y+4*4+3 /* x in Func */ - ldd Bck1, Y+6*4 - ldd Bck2, Y+6*4+1 - ldd Bck3, Y+6*4+2 - ldd Bck4, Y+6*4+3 /* z in Bck */ - and T1, Func1 - and T2, Func2 - and T3, Func3 - and T4, Func4 - com Func1 - com Func2 - com Func3 - com Func4 - and Bck1, Func1 - and Bck2, Func2 - and Bck3, Func3 - and Bck4, Func4 - eor T1, Bck1 - eor T2, Bck2 - eor T3, Bck3 - eor T4, Bck4 /* done, CH(x,y,z) is in T */ - /* now SIGMA1(a[4]) */ - ldd Bck4, Y+4*4 /* think about using it from Func reg above*/ - ldd Bck1, Y+4*4+1 - ldd Bck2, Y+4*4+2 - ldd Bck3, Y+4*4+3 /* load prerotate by 8-bit */ - movw Func1, Bck1 - movw Func3, Bck3 - ldi r20, 2 - rcall bitrotl /* rotr(x,6) */ - movw XAccu1, Func1 - movw XAccu3, Func3 - movw Func1, Bck1 - movw Func3, Bck3 - ldi r20, 3 - rcall bitrotr /* rotr(x,11) */ - eor XAccu1, Func1 - eor XAccu2, Func2 - eor XAccu3, Func3 - eor XAccu4, Func4 - movw Func1, Bck3 /* this prerotates furteh 16 bits*/ - movw Func3, Bck1 /* so we have now prerotated by 24 bits*/ - ldi r20, 1 - rcall bitrotr /* rotr(x,11) */ - eor XAccu1, Func1 - eor XAccu2, Func2 - eor XAccu3, Func3 - eor XAccu4, Func4 /* finished with SIGMA1, add it to T */ - add T1, XAccu1 - adc T2, XAccu2 - adc T3, XAccu3 - adc T4, XAccu4 - /* now we've to add a[7], w[i] and k[i] */ - ldd XAccu1, Y+4*7 - ldd XAccu2, Y+4*7+1 - ldd XAccu3, Y+4*7+2 - ldd XAccu4, Y+4*7+3 - add T1, 
XAccu1 - adc T2, XAccu2 - adc T3, XAccu3 - adc T4, XAccu4 /* add a[7] */ - ld XAccu1, X+ - ld XAccu2, X+ - ld XAccu3, X+ - ld XAccu4, X+ - add T1, XAccu1 - adc T2, XAccu2 - adc T3, XAccu3 - adc T4, XAccu4 /* add w[i] */ - lpm XAccu1, Z+ - lpm XAccu2, Z+ - lpm XAccu3, Z+ - lpm XAccu4, Z+ - add T1, XAccu1 - adc T2, XAccu2 - adc T3, XAccu3 - adc T4, XAccu4 /* add k[i] */ /* finished with t1 */ - /*now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */ /*i did to much x86 asm, i always see 4 32bit regs*/ - /* starting with MAJ(x,y,z) */ - ldd Func1, Y+4*0+0 - ldd Func2, Y+4*0+1 - ldd Func3, Y+4*0+2 - ldd Func4, Y+4*0+3 /* load x=a[0] */ - ldd XAccu1, Y+4*1+0 - ldd XAccu2, Y+4*1+1 - ldd XAccu3, Y+4*1+2 - ldd XAccu4, Y+4*1+3 /* load y=a[1] */ - and XAccu1, Func1 - and XAccu2, Func2 - and XAccu3, Func3 - and XAccu4, Func4 /* XAccu == (x & y) */ - ldd Bck1, Y+4*2+0 - ldd Bck2, Y+4*2+1 - ldd Bck3, Y+4*2+2 - ldd Bck4, Y+4*2+3 /* load z=a[2] */ - and Func1, Bck1 - and Func2, Bck2 - and Func3, Bck3 - and Func4, Bck4 - eor XAccu1, Func1 - eor XAccu2, Func2 - eor XAccu3, Func3 - eor XAccu4, Func4 /* XAccu == (x & y) ^ (x & z) */ - ldd Func1, Y+4*1+0 - ldd Func2, Y+4*1+1 - ldd Func3, Y+4*1+2 - ldd Func4, Y+4*1+3 /* load y=a[1] */ - and Func1, Bck1 - and Func2, Bck2 - and Func3, Bck3 - and Func4, Bck4 - eor XAccu1, Func1 - eor XAccu2, Func2 - eor XAccu3, Func3 - eor XAccu4, Func4 /* XAccu == Maj(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */ - /* SIGMA0(a[0]) */ - ldd Bck1, Y+4*0+0 /* we should combine this with above */ - ldd Bck2, Y+4*0+1 - ldd Bck3, Y+4*0+2 - ldd Bck4, Y+4*0+3 - movw Func1, Bck1 - movw Func3, Bck3 - ldi r20, 2 - rcall bitrotr - movw Accu1, Func1 - movw Accu3, Func3 /* Accu = shr(a[0], 2) */ - movw Func1, Bck3 - movw Func3, Bck1 /* prerotate by 16 bits */ - ldi r20, 3 - rcall bitrotl - eor Accu1, Func1 - eor Accu2, Func2 - eor Accu3, Func3 - eor Accu4, Func4 /* Accu ^= shr(a[0], 13) */ - mov Func1, Bck4 - mov Func2, Bck1 - mov Func3, Bck2 - mov Func4, Bck3 /* prerotate by 24 
bits */ - ldi r20, 2 - rcall bitrotl - eor Accu1, Func1 - eor Accu2, Func2 - eor Accu3, Func3 - eor Accu4, Func4 /* Accu ^= shr(a[0], 22) */ - add Accu1, XAccu1 /* add previous result (MAJ)*/ - adc Accu2, XAccu2 - adc Accu3, XAccu3 - adc Accu4, XAccu4 - /* now we are finished with the computing stuff (t1 in T, t2 in Accu)*/ - /* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */ - - ldi r21, 7*4 - adiw r28, 7*4 -a_shift_loop: - ld r25, -Y /* warning: this is PREdecrement */ - std Y+4, r25 - dec r21 - brne a_shift_loop - - ldd Bck1, Y+4*4+0 - ldd Bck2, Y+4*4+1 - ldd Bck3, Y+4*4+2 - ldd Bck4, Y+4*4+3 - add Bck1, T1 - adc Bck2, T2 - adc Bck3, T3 - adc Bck4, T4 - std Y+4*4+0, Bck1 - std Y+4*4+1, Bck2 - std Y+4*4+2, Bck3 - std Y+4*4+3, Bck4 - add Accu1, T1 - adc Accu2, T2 - adc Accu3, T3 - adc Accu4, T4 - std Y+4*0+0, Accu1 - std Y+4*0+1, Accu2 - std Y+4*0+2, Accu3 - std Y+4*0+3, Accu4 /* a array updated */ - - - dec LoopC - breq update_state - rjmp sha256_main_loop ;brne sha256_main_loop -update_state: - /* update state */ - /* pointers to state should still exist on the stack ;-) */ - pop r31 - pop r30 - ldi r21, 8 -update_state_loop: - ldd Accu1, Z+0 - ldd Accu2, Z+1 - ldd Accu3, Z+2 - ldd Accu4, Z+3 - ld Func1, Y+ - ld Func2, Y+ - ld Func3, Y+ - ld Func4, Y+ - add Accu1, Func1 - adc Accu2, Func2 - adc Accu3, Func3 - adc Accu4, Func4 - st Z+, Accu1 - st Z+, Accu2 - st Z+, Accu3 - st Z+, Accu4 - dec r21 - brne update_state_loop - /* now we just have to update the length */ - adiw r30, 1 /* since we add 512, we can simply skip the LSB */ - ldi r21, 2 - ldi r22, 6 - ld r20, Z - add r20, r21 - st Z+, r20 - clr r21 -sha256_nextBlock_fix_length: - brcc sha256_nextBlock_epilog - ld r20, Z - adc r20, r21 - st Z+, r20 - dec r22 - brne sha256_nextBlock_fix_length - -; EPILOG -sha256_nextBlock_epilog: -/* now we should clean up the stack */ - - pop r21 - pop r20 - in r0, SREG - cli ; we want to be uninterrupted while updating SP - out SPL, r20 - out 
SPH, r21 - out SREG, r0 - - clr r1 - pop r29 - pop r28 - pop r17 - pop r16 - pop r15 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop r8 - pop r7 - pop r6 - pop r5 - pop r4 - ret - -sha256_kv: ; round-key-vector stored in ProgMem -.word 0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c -.word 0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b -.word 0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9 -.word 0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429 -.word 0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272 -.word 0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a -.word 0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e -.word 0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671 - - -;########################################################### - -.global sha256_init -;uint32_t sha256_init_vector[]={ -; 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, -; 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 }; -; -;void sha256_init(sha256_ctx_t *state){ -; state->length=0; -; memcpy(state->h, sha256_init_vector, 8*4); -;} -; param1: (Func3,r24) 16-bit pointer to sha256_ctx_t struct in ram -; modifys: Z(r30,r31), Func1, r22 -sha256_init: - movw r26, r24 ; (24,25) --> (26,27) load X with param1 - ldi r30, lo8((sha256_init_vector)) - ldi r31, hi8((sha256_init_vector)) - ldi r22, 32 -sha256_init_vloop: - lpm r23, Z+ - st X+, r23 - dec r22 - brne 
sha256_init_vloop - ldi r22, 8 - clr r1 ;this should not be needed -sha256_init_lloop: - st X+, r1 - dec r22 - brne sha256_init_lloop - ret - -sha256_init_vector: -.word 0xE667, 0x6A09 -.word 0xAE85, 0xBB67 -.word 0xF372, 0x3C6E -.word 0xF53A, 0xA54F -.word 0x527F, 0x510E -.word 0x688C, 0x9B05 -.word 0xD9AB, 0x1F83 -.word 0xCD19, 0x5BE0 - -;########################################################### - -.global rotl32 -; === ROTL32 === -; function that rotates a 32 bit word to the left -; param1: the 32-bit word to rotate -; given in r25,r24,r23,r22 (r25 is most significant) -; param2: an 8-bit value telling how often to rotate -; given in r20 -; modifys: r21, r22 -rotl32: - cpi r20, 8 - brlo bitrotl - mov r21, r25 - mov r25, r24 - mov r24, r23 - mov r23, r22 - mov r22, r21 - subi r20, 8 - rjmp rotr32 -bitrotl: - clr r21 - clc -bitrotl_loop: - tst r20 - breq fixrotl - rol r22 - rol r23 - rol r24 - rol r25 - rol r21 - dec r20 - rjmp bitrotl_loop -fixrotl: - or r22, r21 - ret - - -;########################################################### - -.global rotr32 -; === ROTR32 === -; function that rotates a 32 bit word to the right -; param1: the 32-bit word to rotate -; given in r25,r24,r23,22 (r25 is most significant) -; param2: an 8-bit value telling how often to rotate -; given in r20 -; modifys: r21, r22 -rotr32: - cpi r20, 8 - brlo bitrotr - mov r21, r22 - mov r22, r23 - mov r23, r24 - mov r24, r25 - mov r25, r21 - subi r20, 8 - rjmp rotr32 -bitrotr: - clr r21 - clc -bitrotr_loop: - tst r20 - breq fixrotr - ror r25 - ror r24 - ror r23 - ror r22 - ror r21 - dec r20 - rjmp bitrotr_loop -fixrotr: - or r25, r21 - ret - - -;########################################################### - -.global change_endian32 -; === change_endian32 === -; function that changes the endianess of a 32-bit word -; param1: the 32-bit word -; given in r25,r24,r23,22 (r25 is most significant) -; modifys: r21, r22 -change_endian32: - movw r20, r22 ; (r22,r23) --> (r20,r21) - mov r22, r25 - mov 
r23, r24 - mov r24, r21 - mov r25, r20 - ret - +/* + * Author: Daniel Otte + * + * License: GPLv3 or later +*/ +; sha-256 implementation in assembler +SHA256_BLOCK_BITS = 512 +SHA256_HASH_BITS = 256 + +.macro precall + /* push r18 - r27, r30 - r31*/ + push r0 + push r1 + push r18 + push r19 + push r20 + push r21 + push r22 + push r23 + push r24 + push r25 + push r26 + push r27 + push r30 + push r31 + clr r1 +.endm + +.macro postcall + pop r31 + pop r30 + pop r27 + pop r26 + pop r25 + pop r24 + pop r23 + pop r22 + pop r21 + pop r20 + pop r19 + pop r18 + pop r1 + pop r0 +.endm + + +.macro hexdump length + push r27 + push r26 + ldi r25, '\r' + mov r24, r25 + call uart_putc + ldi r25, '\n' + mov r24, r25 + call uart_putc + pop r26 + pop r27 + movw r24, r26 +.if \length > 16 + ldi r22, lo8(16) + ldi r23, hi8(16) + push r27 + push r26 + call uart_hexdump + pop r26 + pop r27 + adiw r26, 16 + hexdump \length-16 +.else + ldi r22, lo8(\length) + ldi r23, hi8(\length) + call uart_hexdump +.endif +.endm + +/* X points to Block */ +.macro dbg_hexdump length + precall + hexdump \length + postcall +.endm + +.section .text + +SPL = 0x3D +SPH = 0x3E +SREG = 0x3F + + +; +;sha256_ctx_t is: +; +; [h0][h1][h2][h3][h4][h5][h6][h7][length] +; hn is 32 bit large, length is 64 bit large + +;########################################################### + +.global sha256_ctx2hash +; === sha256_ctx2hash === +; this function converts a state into a normal hash (bytestring) +; param1: the 16-bit destination pointer +; given in r25,r24 (r25 is most significant) +; param2: the 16-bit pointer to sha256_ctx structure +; given in r23,r22 +sha256_ctx2hash: + movw r26, r22 + movw r30, r24 + ldi r21, 8 + sbiw r26, 4 +1: + ldi r20, 4 + adiw r26, 8 +2: + ld r0, -X + st Z+, r0 + dec r20 + brne 2b + + dec r21 + brne 1b + + ret + +;########################################################### + +.global sha256 +; === sha256 === +; this function calculates SHA-256 hashes from messages in RAM +; param1: the 
16-bit hash destination pointer
+;          given in r25,r24 (r25 is most significant)
+; param2: the 16-bit pointer to message
+;          given in r23,r22
+; param3: 32-bit length value (length of message in bits)
+;          given in r21,r20,r19,r18
+sha256:
+sha256_prolog:
+	push r8
+	push r9
+	push r10
+	push r11
+	push r12
+	push r13
+	push r16
+	push r17
+	in r16, SPL
+	in r17, SPH
+	subi r16, 8*4+8 /* reserve room for a sha256_ctx_t (8*4 state + 8 length bytes) */
+	sbci r17, 0
+	in r0, SREG
+	cli /* SP must not be used by an ISR while it is half-updated */
+	out SPL, r16
+	out SPH, r17
+	out SREG, r0
+
+	push r25
+	push r24
+	subi r16, lo8(-1) /* r17:r16 = SP+1 = start of ctx; 'inc' leaves C untouched, so inc/adc would lose the carry */
+	sbci r17, hi8(-1)
+
+	movw r8, r18 /* backup of length (r11:r8) */
+	movw r10, r20
+
+	movw r12, r22 /* backup of msg-ptr (r13:r12) */
+
+	movw r24, r16
+	rcall sha256_init
+	/* while (length >= 512): hash one full 64-byte block */
+1:
+	tst r11
+	brne 2f /* upper bytes set: certainly >= 512 bits */
+	tst r10
+	brne 2f
+	mov r19, r9
+	cpi r19, 0x02
+	brlo 4f /* fewer than 512 bits left: go padding */
+2:
+	movw r24, r16
+	movw r22, r12
+	rcall sha256_nextBlock
+	ldi r19, 64 /* advance the saved msg-ptr by one block (r22/r23 are clobbered by nextBlock) */
+	add r12, r19
+	adc r13, r1 /* r1 == 0, nextBlock clears it before returning */
+	/* length -= 512 */
+	ldi r19, 0x02
+	sub r9, r19
+	sbc r10, r1
+	sbc r11, r1
+	rjmp 1b
+
+4:
+	movw r24, r16
+	movw r22, r12
+	movw r20, r8 /* remaining length is < 512, so 16 bits are enough */
+	rcall sha256_lastBlock
+
+	pop r24
+	pop r25
+	movw r22, r16
+	rcall sha256_ctx2hash
+
+sha256_epilog:
+	in r30, SPL
+	in r31, SPH
+	adiw r30, 8*4+8 /* release the ctx again */
+	in r0, SREG
+	cli
+	out SPL, r30
+	out SPH, r31
+	out SREG, r0
+	pop r17
+	pop r16
+	pop r13
+	pop r12
+	pop r11
+	pop r10
+	pop r9
+	pop r8
+	ret
+
+;###########################################################
+
+
+; block MUST NOT be larger than 64 bytes
+
+.global sha256_lastBlock
+; === sha256_lastBlock ===
+; this function does padding & Co.
for calculating SHA-256 hashes
+; param1: the 16-bit pointer to sha256_ctx structure
+;          given in r25,r24 (r25 is most significant)
+; param2: an 16-bit pointer to 64 byte block to hash
+;          given in r23,r22
+; param3: an 16-bit integer specifying length of block in bits
+;          given in r21,r20
+sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1)
+
+
+sha256_lastBlock:
+	tst r20
+	brne sha256_lastBlock_prolog
+	cpi r21, 0x02
+	brne sha256_lastBlock_prolog
+	push r25 /* length == 512: hash the full block first, then pad an empty one */
+	push r24
+	push r23
+	push r22
+	rcall sha256_nextBlock
+	pop r22
+	pop r23
+	pop r24
+	pop r25
+	clr r21 /* remaining length = 0 ... */
+	clr r20 /* ... r20 was clobbered by nextBlock; r22 (msg-ptr) must stay intact */
+sha256_lastBlock_prolog:
+	/* allocate space on stack */
+	in r30, SPL
+	in r31, SPH
+	in r1, SREG
+	subi r30, lo8(64)
+	sbci r31, hi8(64)
+	cli
+	out SPL, r30
+	out SPH, r31
+	out SREG,r1
+
+	adiw r30, 1 /* SP points to next free byte on stack */
+	mov r18, r20 /* r20 = LSB(length) */
+	lsr r18
+	lsr r18
+	lsr r18
+	bst r21, 0 /* bit 8 of the bit length (256 bits) ... */
+	bld r18, 5 /* ... is bit 5 of the byte count; now: r18 == length/8 (aka. length in bytes) */
+
+
+	movw r26, r22 /* X points to begin of msg */
+	tst r18
+	breq sha256_lastBlock_post_copy
+	mov r1, r18 /* r1 is the copy counter and ends up 0 again */
+sha256_lastBlock_copy_loop:
+	ld r0, X+
+	st Z+, r0
+	dec r1
+	brne sha256_lastBlock_copy_loop
+sha256_lastBlock_post_copy:
+sha256_lastBlock_insert_stuffing_bit:
+	ldi r19, 0x80
+	mov r0,r19
+	ldi r19, 0x07
+	and r19, r20 /* if we are in bitmode */
+	breq 2f /* no bitmode */
+1:
+	lsr r0
+	dec r19
+	brne 1b
+	ld r19, X
+/* maybe we should do some ANDing here, just for safety */
+	or r0, r19
+2:
+	st Z+, r0
+	inc r18
+
+/* does the 8-byte length field still fit into this block? */
+	cpi r18, 64-8+1
+	brsh 0f
+	rjmp sha256_lastBlock_insert_zeros
+0:
+	/* it does not fit: this block is hashed without the length */
+	/* first we have to fill it up with zeros */
+	ldi r19, 64
+	sub r19, r18
+	breq 2f
+1:
+	st Z+, r1
+	dec r19
+	brne 1b
+2:
+	sbiw r30, 63
+	sbiw r30, 1
+	movw r22, r30
+
+	push r31
+	push r30
+	push r25
+	push r24
+	push r21
+	push r20
+	rcall sha256_nextBlock
+	pop r20
+	pop r21
+	pop r24
+	pop r25
+	pop r30
+	pop
r31 + + /* now we should subtract 512 from length */ + movw r26, r24 + adiw r26, 4*8+1 /* we can skip the lowest byte */ + ld r19, X + subi r19, hi8(512) + st X+, r19 + ldi r18, 6 +1: + ld r19, X + sbci r19, 0 + st X+, r19 + dec r18 + brne 1b + +; clr r18 /* not neccessary ;-) */ + /* reset Z pointer to begin of block */ + +sha256_lastBlock_insert_zeros: + ldi r19, 64-8 + sub r19, r18 + breq sha256_lastBlock_insert_length + clr r1 +1: + st Z+, r1 /* r1 is still zero */ + dec r19 + brne 1b + +; rjmp sha256_lastBlock_epilog +sha256_lastBlock_insert_length: + movw r26, r24 /* X points to state */ + adiw r26, 8*4 /* X points to (state.length) */ + adiw r30, 8 /* Z points one after the last byte of block */ + ld r0, X+ + add r0, r20 + st -Z, r0 + ld r0, X+ + adc r0, r21 + st -Z, r0 + ldi r19, 6 +1: + ld r0, X+ + adc r0, r1 + st -Z, r0 + dec r19 + brne 1b + + sbiw r30, 64-8 + movw r22, r30 + rcall sha256_nextBlock + +sha256_lastBlock_epilog: + in r30, SPL + in r31, SPH + in r1, SREG + adiw r30, 63 ; lo8(64) + adiw r30, 1 ; hi8(64) + cli + out SPL, r30 + out SPH, r31 + out SREG,r1 + clr r1 + clr r0 + ret + +/**/ +;########################################################### + +.global sha256_nextBlock +; === sha256_nextBlock === +; this is the core function for calculating SHA-256 hashes +; param1: the 16-bit pointer to sha256_ctx structure +; given in r25,r24 (r25 is most significant) +; param2: an 16-bit pointer to 64 byte block to hash +; given in r23,r22 +sha256_nextBlock_localSpace = (64+8)*4 ; 64 32-bit values for w array and 8 32-bit values for a array (total 288 byte) + +Bck1 = 12 +Bck2 = 13 +Bck3 = 14 +Bck4 = 15 +Func1 = 22 +Func2 = 23 +Func3 = 24 +Func4 = 25 +Accu1 = 16 +Accu2 = 17 +Accu3 = 18 +Accu4 = 19 +XAccu1 = 8 +XAccu2 = 9 +XAccu3 = 10 +XAccu4 = 11 +T1 = 4 +T2 = 5 +T3 = 6 +T4 = 7 +LoopC = 1 +/* byteorder: high number <--> high significance */ +sha256_nextBlock: + ; initial, let's make some space ready for local vars + push r4 /* replace push & pop by mem 
ops? */ + push r5 + push r6 + push r7 + push r8 + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push r16 + push r17 + push r28 + push r29 + in r20, SPL + in r21, SPH + movw r18, r20 ;backup SP +; movw r26, r20 ; X points to free space on stack + movw r30, r22 ; Z points to message + subi r20, lo8(sha256_nextBlock_localSpace) ;sbiw can do only up to 63 + sbci r21, hi8(sha256_nextBlock_localSpace) + movw r26, r20 ; X points to free space on stack + in r0, SREG + cli ; we want to be uninterrupted while updating SP + out SPL, r20 + out SPH, r21 + out SREG, r0 + push r18 + push r19 + push r24 + push r25 /* param1 will be needed later */ + ; now we fill the w array with message (think about endianess) + adiw r26, 1 ; X++ + ldi r20, 16 +sha256_nextBlock_wcpyloop: + ld r23, Z+ + ld r22, Z+ + ld r19, Z+ + ld r18, Z+ + st X+, r18 + st X+, r19 + st X+, r22 + st X+, r23 + dec r20 + brne sha256_nextBlock_wcpyloop +/* for (i=16; i<64; ++i){ + w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16]; + } */ + /* r25,r24,r23,r24 (r21,r20) are function values + r19,r18,r17,r16 are the accumulator + r15,r14,r13,rBck1 are backup1 + r11,r10,r9 ,r8 are xor accu + r1 is round counter */ + + ldi r20, 64-16 + mov LoopC, r20 +sha256_nextBlock_wcalcloop: + movw r30, r26 ; cp X to Z + sbiw r30, 63 + sbiw r30, 1 ; substract 64 = 16*4 + ld Accu1, Z+ + ld Accu2, Z+ + ld Accu3, Z+ + ld Accu4, Z+ /* w[i] = w[i-16] */ + ld Bck1, Z+ + ld Bck2, Z+ + ld Bck3, Z+ + ld Bck4, Z+ /* backup = w[i-15] */ + /* now sigma 0 */ + mov Func1, Bck2 + mov Func2, Bck3 + mov Func3, Bck4 + mov Func4, Bck1 /* prerotated by 8 */ + ldi r20, 1 + rcall bitrotl + movw XAccu1, Func1 + movw XAccu3, Func3 /* store ROTR(w[i-15],7) in xor accu */ + movw Func1, Bck3 + movw Func3, Bck1 /* prerotated by 16 */ + ldi r20, 2 + rcall bitrotr + eor XAccu1, Func1 /* xor ROTR(w[i-15], 18)*/ + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 + ldi Func2, 3 /* now shr3 */ /*we can destroy backup 
now*/ +sigma0_shr: + lsr Bck4 + ror Bck3 + ror Bck2 + ror Bck1 + dec Func2 + brne sigma0_shr + eor XAccu1, Bck1 + eor XAccu2, Bck2 + eor XAccu3, Bck3 + eor XAccu4, Bck4 /* xor SHR(w[i-15], 3)*/ /* xor accu == sigma1(w[i-15]) */ + add Accu1, XAccu1 + adc Accu2, XAccu2 + adc Accu3, XAccu3 + adc Accu4, XAccu4 /* finished with sigma0 */ + ldd Func1, Z+7*4 /* now accu += w[i-7] */ + ldd Func2, Z+7*4+1 + ldd Func3, Z+7*4+2 + ldd Func4, Z+7*4+3 + add Accu1, Func1 + adc Accu2, Func2 + adc Accu3, Func3 + adc Accu4, Func4 + ldd Bck1, Z+12*4 /* now backup = w[i-2]*/ + ldd Bck2, Z+12*4+1 + ldd Bck3, Z+12*4+2 + ldd Bck4, Z+12*4+3 + /* now sigma 1 */ + movw Func1, Bck3 + movw Func3, Bck1 /* prerotated by 16 */ + ldi r20, 1 + rcall bitrotr + movw XAccu3, Func3 + movw XAccu1, Func1 /* store in ROTR(w[i-2], 17) xor accu */ +; movw Func1, Bck3 +; movw Func3, Bck1 /* prerotated by 16 */ + ldi r20, 2 + rcall bitrotr + eor XAccu1, Func1 /* xor ROTR(w[i-2], 19)*/ + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 + ldi Func2, 2 /* now shr10 (dirty trick, skipping a byte) */ /*we can destroy backup now*/ +sigma1_shr: + lsr Bck4 + ror Bck3 + ror Bck2 + dec Func2 + brne sigma1_shr + eor XAccu1, Bck2 + eor XAccu2, Bck3 + eor XAccu3, Bck4 /* xor SHR(w[i-2], 10)*/ /* xor accu == sigma1(w[i-15]) */ + add Accu1, XAccu1 + adc Accu2, XAccu2 + adc Accu3, XAccu3 + adc Accu4, XAccu4 /* finished with sigma0 */ + /* now let's store the shit */ + st X+, Accu1 + st X+, Accu2 + st X+, Accu3 + st X+, Accu4 + dec LoopC + breq 3f ; skip if zero + rjmp sha256_nextBlock_wcalcloop +3: + /* we are finished with w array X points one byte post w */ +/* init a array */ + pop r31 + pop r30 + push r30 + push r31 + ldi r25, 8*4 /* 8 32-bit values to copy from ctx to a array */ +init_a_array: + ld r1, Z+ + st X+, r1 + dec r25 + brne init_a_array + +/* now the real fun begins */ +/* for (i=0; i<64; ++i){ + t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i]; + t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]); + 
memmove(&(a[1]), &(a[0]), 7*4); // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; + a[4] += t1; + a[0] = t1 + t2; + } */ + /* Y points to a[0], Z ('cause lpm wants it) points to k[i], X points to w[i] */ + sbiw r26, 8*4 /* X still points at a[7]+1*/ + movw r28, r26 + ldi r30, lo8(sha256_kv) + ldi r31, hi8(sha256_kv) + dec r27 /* X - (64*4 == 256) */ + ldi r25, 64 + mov LoopC, r25 +sha256_main_loop: + /* now calculate t1 */ + /*CH(x,y,z) = (x&y)^((~x)&z)*/ + ldd T1, Y+5*4 + ldd T2, Y+5*4+1 + ldd T3, Y+5*4+2 + ldd T4, Y+5*4+3 /* y in T */ + ldd Func1, Y+4*4 + ldd Func2, Y+4*4+1 + ldd Func3, Y+4*4+2 + ldd Func4, Y+4*4+3 /* x in Func */ + ldd Bck1, Y+6*4 + ldd Bck2, Y+6*4+1 + ldd Bck3, Y+6*4+2 + ldd Bck4, Y+6*4+3 /* z in Bck */ + and T1, Func1 + and T2, Func2 + and T3, Func3 + and T4, Func4 + com Func1 + com Func2 + com Func3 + com Func4 + and Bck1, Func1 + and Bck2, Func2 + and Bck3, Func3 + and Bck4, Func4 + eor T1, Bck1 + eor T2, Bck2 + eor T3, Bck3 + eor T4, Bck4 /* done, CH(x,y,z) is in T */ + /* now SIGMA1(a[4]) */ + ldd Bck4, Y+4*4 /* think about using it from Func reg above*/ + ldd Bck1, Y+4*4+1 + ldd Bck2, Y+4*4+2 + ldd Bck3, Y+4*4+3 /* load prerotate by 8-bit */ + movw Func1, Bck1 + movw Func3, Bck3 + ldi r20, 2 + rcall bitrotl /* rotr(x,6) */ + movw XAccu1, Func1 + movw XAccu3, Func3 + movw Func1, Bck1 + movw Func3, Bck3 + ldi r20, 3 + rcall bitrotr /* rotr(x,11) */ + eor XAccu1, Func1 + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 + movw Func1, Bck3 /* this prerotates furteh 16 bits*/ + movw Func3, Bck1 /* so we have now prerotated by 24 bits*/ + ldi r20, 1 + rcall bitrotr /* rotr(x,11) */ + eor XAccu1, Func1 + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 /* finished with SIGMA1, add it to T */ + add T1, XAccu1 + adc T2, XAccu2 + adc T3, XAccu3 + adc T4, XAccu4 + /* now we've to add a[7], w[i] and k[i] */ + ldd XAccu1, Y+4*7 + ldd XAccu2, Y+4*7+1 + ldd XAccu3, Y+4*7+2 + ldd XAccu4, Y+4*7+3 + add T1, 
XAccu1 + adc T2, XAccu2 + adc T3, XAccu3 + adc T4, XAccu4 /* add a[7] */ + ld XAccu1, X+ + ld XAccu2, X+ + ld XAccu3, X+ + ld XAccu4, X+ + add T1, XAccu1 + adc T2, XAccu2 + adc T3, XAccu3 + adc T4, XAccu4 /* add w[i] */ + lpm XAccu1, Z+ + lpm XAccu2, Z+ + lpm XAccu3, Z+ + lpm XAccu4, Z+ + add T1, XAccu1 + adc T2, XAccu2 + adc T3, XAccu3 + adc T4, XAccu4 /* add k[i] */ /* finished with t1 */ + /*now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */ /*i did to much x86 asm, i always see 4 32bit regs*/ + /* starting with MAJ(x,y,z) */ + ldd Func1, Y+4*0+0 + ldd Func2, Y+4*0+1 + ldd Func3, Y+4*0+2 + ldd Func4, Y+4*0+3 /* load x=a[0] */ + ldd XAccu1, Y+4*1+0 + ldd XAccu2, Y+4*1+1 + ldd XAccu3, Y+4*1+2 + ldd XAccu4, Y+4*1+3 /* load y=a[1] */ + and XAccu1, Func1 + and XAccu2, Func2 + and XAccu3, Func3 + and XAccu4, Func4 /* XAccu == (x & y) */ + ldd Bck1, Y+4*2+0 + ldd Bck2, Y+4*2+1 + ldd Bck3, Y+4*2+2 + ldd Bck4, Y+4*2+3 /* load z=a[2] */ + and Func1, Bck1 + and Func2, Bck2 + and Func3, Bck3 + and Func4, Bck4 + eor XAccu1, Func1 + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 /* XAccu == (x & y) ^ (x & z) */ + ldd Func1, Y+4*1+0 + ldd Func2, Y+4*1+1 + ldd Func3, Y+4*1+2 + ldd Func4, Y+4*1+3 /* load y=a[1] */ + and Func1, Bck1 + and Func2, Bck2 + and Func3, Bck3 + and Func4, Bck4 + eor XAccu1, Func1 + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 /* XAccu == Maj(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */ + /* SIGMA0(a[0]) */ + ldd Bck1, Y+4*0+0 /* we should combine this with above */ + ldd Bck2, Y+4*0+1 + ldd Bck3, Y+4*0+2 + ldd Bck4, Y+4*0+3 + movw Func1, Bck1 + movw Func3, Bck3 + ldi r20, 2 + rcall bitrotr + movw Accu1, Func1 + movw Accu3, Func3 /* Accu = shr(a[0], 2) */ + movw Func1, Bck3 + movw Func3, Bck1 /* prerotate by 16 bits */ + ldi r20, 3 + rcall bitrotl + eor Accu1, Func1 + eor Accu2, Func2 + eor Accu3, Func3 + eor Accu4, Func4 /* Accu ^= shr(a[0], 13) */ + mov Func1, Bck4 + mov Func2, Bck1 + mov Func3, Bck2 + mov Func4, Bck3 /* prerotate by 24 
bits */ + ldi r20, 2 + rcall bitrotl + eor Accu1, Func1 + eor Accu2, Func2 + eor Accu3, Func3 + eor Accu4, Func4 /* Accu ^= shr(a[0], 22) */ + add Accu1, XAccu1 /* add previous result (MAJ)*/ + adc Accu2, XAccu2 + adc Accu3, XAccu3 + adc Accu4, XAccu4 + /* now we are finished with the computing stuff (t1 in T, t2 in Accu)*/ + /* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */ + + ldi r21, 7*4 + adiw r28, 7*4 +a_shift_loop: + ld r25, -Y /* warning: this is PREdecrement */ + std Y+4, r25 + dec r21 + brne a_shift_loop + + ldd Bck1, Y+4*4+0 + ldd Bck2, Y+4*4+1 + ldd Bck3, Y+4*4+2 + ldd Bck4, Y+4*4+3 + add Bck1, T1 + adc Bck2, T2 + adc Bck3, T3 + adc Bck4, T4 + std Y+4*4+0, Bck1 + std Y+4*4+1, Bck2 + std Y+4*4+2, Bck3 + std Y+4*4+3, Bck4 + add Accu1, T1 + adc Accu2, T2 + adc Accu3, T3 + adc Accu4, T4 + std Y+4*0+0, Accu1 + std Y+4*0+1, Accu2 + std Y+4*0+2, Accu3 + std Y+4*0+3, Accu4 /* a array updated */ + + + dec LoopC + breq update_state + rjmp sha256_main_loop ;brne sha256_main_loop +update_state: + /* update state */ + /* pointers to state should still exist on the stack ;-) */ + pop r31 + pop r30 + ldi r21, 8 +update_state_loop: + ldd Accu1, Z+0 + ldd Accu2, Z+1 + ldd Accu3, Z+2 + ldd Accu4, Z+3 + ld Func1, Y+ + ld Func2, Y+ + ld Func3, Y+ + ld Func4, Y+ + add Accu1, Func1 + adc Accu2, Func2 + adc Accu3, Func3 + adc Accu4, Func4 + st Z+, Accu1 + st Z+, Accu2 + st Z+, Accu3 + st Z+, Accu4 + dec r21 + brne update_state_loop + /* now we just have to update the length */ + adiw r30, 1 /* since we add 512, we can simply skip the LSB */ + ldi r21, 2 + ldi r22, 6 + ld r20, Z + add r20, r21 + st Z+, r20 + clr r21 +sha256_nextBlock_fix_length: + brcc sha256_nextBlock_epilog + ld r20, Z + adc r20, r21 + st Z+, r20 + dec r22 + brne sha256_nextBlock_fix_length + +; EPILOG +sha256_nextBlock_epilog: +/* now we should clean up the stack */ + + pop r21 + pop r20 + in r0, SREG + cli ; we want to be uninterrupted while updating SP + out SPL, r20 + out 
SPH, r21 + out SREG, r0 + + clr r1 + pop r29 + pop r28 + pop r17 + pop r16 + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop r8 + pop r7 + pop r6 + pop r5 + pop r4 + ret + +sha256_kv: ; round-key-vector stored in ProgMem +.word 0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c +.word 0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b +.word 0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9 +.word 0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429 +.word 0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272 +.word 0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a +.word 0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e +.word 0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671 + + +;########################################################### + +.global sha256_init +;uint32_t sha256_init_vector[]={ +; 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, +; 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 }; +; +;void sha256_init(sha256_ctx_t *state){ +; state->length=0; +; memcpy(state->h, sha256_init_vector, 8*4); +;} +; param1: (Func3,r24) 16-bit pointer to sha256_ctx_t struct in ram +; modifys: Z(r30,r31), Func1, r22 +sha256_init: + movw r26, r24 ; (24,25) --> (26,27) load X with param1 + ldi r30, lo8((sha256_init_vector)) + ldi r31, hi8((sha256_init_vector)) + ldi r22, 32 +sha256_init_vloop: + lpm r23, Z+ + st X+, r23 + dec r22 + brne 
sha256_init_vloop
+	ldi r22, 8
+	clr r1 ;this should not be needed
+sha256_init_lloop:
+	st X+, r1
+	dec r22
+	brne sha256_init_lloop
+	ret
+
+sha256_init_vector:
+.word 0xE667, 0x6A09
+.word 0xAE85, 0xBB67
+.word 0xF372, 0x3C6E
+.word 0xF53A, 0xA54F
+.word 0x527F, 0x510E
+.word 0x688C, 0x9B05
+.word 0xD9AB, 0x1F83
+.word 0xCD19, 0x5BE0
+
+;###########################################################
+
+.global rotl32
+; === ROTL32 ===
+; function that rotates a 32 bit word to the left
+; param1: the 32-bit word to rotate
+;          given in r25,r24,r23,r22 (r25 is most significant)
+; param2: an 8-bit value telling how often to rotate
+;          given in r20
+; modifies: r20, r21; the rotated word is left in r25,r24,r23,r22
+rotl32:
+	cpi r20, 8
+	brlo bitrotl
+	mov r21, r25 ; 8 or more to go: rotate left by one whole byte
+	mov r25, r24
+	mov r24, r23
+	mov r23, r22
+	mov r22, r21
+	subi r20, 8
+	rjmp rotl32 ; keep rotating LEFT (jumping to rotr32 reversed the direction)
+bitrotl:
+	clr r21
+	clc
+bitrotl_loop:
+	tst r20
+	breq fixrotl
+	rol r22
+	rol r23
+	rol r24
+	rol r25
+	rol r21 ; r21 collects the bits shifted out of r25
+	dec r20
+	rjmp bitrotl_loop
+fixrotl:
+	or r22, r21 ; wrap the collected bits around into the low byte
+	ret
+
+
+;###########################################################
+
+.global rotr32
+; === ROTR32 ===
+; function that rotates a 32 bit word to the right
+; param1: the 32-bit word to rotate
+;          given in r25,r24,r23,r22 (r25 is most significant)
+; param2: an 8-bit value telling how often to rotate
+;          given in r20
+; modifies: r20, r21; the rotated word is left in r25,r24,r23,r22
+rotr32:
+	cpi r20, 8
+	brlo bitrotr
+	mov r21, r22 ; 8 or more to go: rotate right by one whole byte
+	mov r22, r23
+	mov r23, r24
+	mov r24, r25
+	mov r25, r21
+	subi r20, 8
+	rjmp rotr32
+bitrotr:
+	clr r21
+	clc
+bitrotr_loop:
+	tst r20
+	breq fixrotr
+	ror r25
+	ror r24
+	ror r23
+	ror r22
+	ror r21 ; r21 collects the bits shifted out of r22
+	dec r20
+	rjmp bitrotr_loop
+fixrotr:
+	or r25, r21 ; wrap the collected bits around into the high byte
+	ret
+
+
+;###########################################################
+
+.global change_endian32
+; === change_endian32 ===
+; function that changes the endianess of a 32-bit word
+; param1: the 32-bit word
+;          given in r25,r24,r23,r22 (r25 is most significant)
+; modifies: r20, r21
+change_endian32:
+	movw r20, r22 ; (r22,r23) --> (r20,r21)
+	mov r22, r25
+	mov
r23, r24 + mov r24, r21 + mov r25, r20 + ret + diff --git a/xtea-asm.S b/xtea-asm.S index f3c5b12..20f1d63 100644 --- a/xtea-asm.S +++ b/xtea-asm.S @@ -17,9 +17,9 @@ along with this program. If not, see . */ /* xtea-asm.S - * Author: Daniel Otte - * Date: 06.06.2006 - * License: GPL + * Author: Daniel Otte + * Date: 2006-06-06 + * License: GPLv3 or later * Implementation of XTEA for AVR * include xtea.h in your C-Project to use this functions. */ -- 2.39.5