diff --git a/shabea/sha256-asm.S b/shabea/sha256-asm.S
new file mode 100644
index 0000000..d9eb6b6
--- /dev/null
+++ b/shabea/sha256-asm.S
@@ -0,0 +1,1042 @@
+/* sha256-asm.S */
+/*
+    This file is part of the AVR-Crypto-Lib.
+    Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de)
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ * Author: Daniel Otte
+ *
+ * License: GPLv3 or later
+*/
+; SHA-256 implementation in AVR assembler
+SHA256_BLOCK_BITS = 512
+SHA256_HASH_BITS = 256
+
+.macro precall
+	/* push r0 - r1, r18 - r27, r30 - r31 */
+	push r0
+	push r1
+	push r18
+	push r19
+	push r20
+	push r21
+	push r22
+	push r23
+	push r24
+	push r25
+	push r26
+	push r27
+	push r30
+	push r31
+	clr r1
+.endm
+
+.macro postcall
+	pop r31
+	pop r30
+	pop r27
+	pop r26
+	pop r25
+	pop r24
+	pop r23
+	pop r22
+	pop r21
+	pop r20
+	pop r19
+	pop r18
+	pop r1
+	pop r0
+.endm
+
+
+.macro hexdump length
+	push r27
+	push r26
+	ldi r25, '\r'
+	mov r24, r25
+	call uart_putc
+	ldi r25, '\n'
+	mov r24, r25
+	call uart_putc
+	pop r26
+	pop r27
+	movw r24, r26
+.if \length > 16
+	ldi r22, lo8(16)
+	ldi r23, hi8(16)
+	push r27
+	push r26
+	call uart_hexdump
+	pop r26
+	pop r27
+	adiw r26, 16
+	hexdump \length-16
+.else
+	ldi r22, lo8(\length)
+	ldi r23, hi8(\length)
+	call uart_hexdump
+.endif
+.endm
+
+/* X points to the block to dump */
+.macro dbg_hexdump length
+	precall
+	hexdump \length
+	postcall
+.endm
+
+.section .text
+
+SPL = 0x3D
+SPH = 0x3E
+SREG = 0x3F
+
+
+;
+;sha256_ctx_t is:
+;
+; [h0][h1][h2][h3][h4][h5][h6][h7][length]
+; hn is 32 bit large, length is 64 bit large
+
+;###########################################################
+
+.global sha256_ctx2hash
+; === sha256_ctx2hash ===
+; this function converts the state into the final hash value
+; (a big-endian byte string)
+; param1: the 16-bit destination pointer
+;	given in r25,r24 (r25 is most significant)
+; param2: the 16-bit pointer to the sha256_ctx structure
+;	given in r23,r22
+sha256_ctx2hash:
+	movw r26, r22
+	movw r30, r24
+	ldi r21, 8
+	sbiw r26, 4
+1:
+	ldi r20, 4
+	adiw r26, 8
+2:
+	ld r0, -X	; read each state word from its most significant byte down
+	st Z+, r0
+	dec r20
+	brne 2b
+
+	dec r21
+	brne 1b
+
+	ret
+
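+; For reference, the same conversion in C (a sketch only; the struct
+; follows the layout described in the sha256_ctx_t comment above):
+/*
+	#include <stdint.h>
+	typedef struct { uint32_t h[8]; uint64_t length; } sha256_ctx_t;
+
+	void sha256_ctx2hash(void *dest, const sha256_ctx_t *state){
+		uint8_t *d = (uint8_t*)dest;
+		uint8_t i, j;
+		for (i = 0; i < 8; ++i)         // for each state word ...
+			for (j = 0; j < 4; ++j) // ... emit its bytes MSB first
+				*d++ = (uint8_t)(state->h[i] >> (24 - 8*j));
+	}
+*/
+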
+;###########################################################
+
+.global sha256
+; === sha256 ===
+; this function calculates SHA-256 hashes from messages in RAM
+; param1: the 16-bit hash destination pointer
+;	given in r25,r24 (r25 is most significant)
+; param2: the 16-bit pointer to the message
+;	given in r23,r22
+; param3: 32-bit length value (length of the message in bits)
+;	given in r21,r20,r19,r18
+sha256:
+sha256_prolog:
+	push r8
+	push r9
+	push r10
+	push r11
+	push r12
+	push r13
+	push r16
+	push r17
+	in r16, SPL
+	in r17, SPH
+	subi r16, 8*4+8
+	sbci r17, 0
+	in r0, SREG
+	cli
+	out SPL, r16
+	out SPH, r17
+	out SREG, r0
+
+	push r25
+	push r24
+	inc r16
+	adc r17, r1 /* r17:r16 now points to the ctx buffer on the stack */
+
+	movw r8, r18	/* backup of length */
+	movw r10, r20
+
+	movw r12, r22	/* backup of msg-ptr */
+
+	movw r24, r16
+	rcall sha256_init
+	/* while length >= 512 */
+1:
+	tst r11
+	brne 4f
+	tst r10
+	brne 4f
+	mov r19, r9
+	cpi r19, 0x02
+	brlo 4f
+
+	movw r24, r16
+	movw r22, r12
+	rcall sha256_nextBlock
+	ldi r19, 64	/* advance the message pointer by one 64-byte block */
+	add r12, r19
+	adc r13, r1
+	/* length -= 512 */
+	ldi r19, 0x02
+	sub r9, r19
+	sbc r10, r1
+	sbc r11, r1
+	rjmp 1b
+
+4:
+	movw r24, r16
+	movw r22, r12
+	movw r20, r8
+	rcall sha256_lastBlock
+
+	pop r24
+	pop r25
+	movw r22, r16
+	rcall sha256_ctx2hash
+
+sha256_epilog:
+	in r30, SPL
+	in r31, SPH
+	adiw r30, 8*4+8
+	in r0, SREG
+	cli
+	out SPL, r30
+	out SPH, r31
+	out SREG, r0
+	pop r17
+	pop r16
+	pop r13
+	pop r12
+	pop r11
+	pop r10
+	pop r9
+	pop r8
+	ret
+
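+; The control flow above, written out in C (a sketch; it mirrors the
+; register usage of the assembler version):
+/*
+	void sha256(void *dest, const void *msg, uint32_t length_b){
+		sha256_ctx_t ctx;                       // lives on the stack, like above
+		sha256_init(&ctx);
+		while (length_b >= 512){                // hash all full 512-bit blocks
+			sha256_nextBlock(&ctx, msg);
+			msg = (const uint8_t*)msg + 64; // advance by one block
+			length_b -= 512;
+		}
+		sha256_lastBlock(&ctx, msg, (uint16_t)length_b); // pad the rest
+		sha256_ctx2hash(dest, &ctx);
+	}
+*/
+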
+;###########################################################
+
+
+; the final part of the message may be up to 64 bytes;
+; longer input is reduced by hashing full blocks first
+
+.global sha256_lastBlock
+; === sha256_lastBlock ===
+; this function does the padding for calculating SHA-256 hashes
+; param1: the 16-bit pointer to the sha256_ctx structure
+;	given in r25,r24 (r25 is most significant)
+; param2: a 16-bit pointer to the block to hash
+;	given in r23,r22
+; param3: a 16-bit integer specifying the length of the block in bits
+;	given in r21,r20
+sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1)
+
+
+sha256_lastBlock:
+	cpi r21, 0x02
+	brlo sha256_lastBlock_prolog
+	push r25
+	push r24
+	push r23
+	push r22
+	push r21
+	push r20
+	rcall sha256_nextBlock
+	pop r20
+	pop r21
+	pop r22
+	pop r23
+	pop r24
+	pop r25
+	subi r21, 0x02		/* length -= 512 bits */
+	subi r22, lo8(-64)	/* advance the block pointer by 64 bytes */
+	sbci r23, hi8(-64)
+	rjmp sha256_lastBlock
+sha256_lastBlock_prolog:
+	/* allocate space on stack */
+	in r30, SPL
+	in r31, SPH
+	in r1, SREG
+	subi r30, lo8(64)
+	sbci r31, hi8(64)
+	cli
+	out SPL, r30
+	out SPH, r31
+	out SREG, r1
+
+	adiw r30, 1 /* SP points to the next free byte on stack */
+	mov r18, r20 /* r20 = LSB(length) */
+	lsr r18
+	lsr r18
+	lsr r18
+	bst r21, 0 /* bit 8 of the 16-bit bit-count ... */
+	bld r18, 5 /* ... is bit 5 of the byte count; now: r18 == length/8 (the length in bytes) */
+
+
+	movw r26, r22 /* X points to the begin of the msg */
+	tst r18
+	breq sha256_lastBlock_post_copy
+	mov r1, r18
+sha256_lastBlock_copy_loop:
+	ld r0, X+
+	st Z+, r0
+	dec r1
+	brne sha256_lastBlock_copy_loop
+sha256_lastBlock_post_copy:
+sha256_lastBlock_insert_stuffing_bit:
+	ldi r19, 0x80
+	mov r0, r19
+	ldi r19, 0x07
+	and r19, r20 /* are there single bits left over? */
+	breq 2f /* no bitmode */
+1:
+	lsr r0
+	dec r19
+	brne 1b
+	ld r19, X /* strictly, the bits below the stuffing bit should be masked off here */
+	or r0, r19
+2:
+	st Z+, r0
+	inc r18
+
+/* does the 64-bit length still fit into this block? */
+	cpi r18, 64-8+1
+	brsh 0f
+	rjmp sha256_lastBlock_insert_zeros
+0:
+	/* it does not; fill this block up with zeros and process it */
+	ldi r19, 64
+	sub r19, r18
+	breq 2f
+1:
+	st Z+, r1 /* r1 is zero after the copy loop */
+	dec r19
+	brne 1b
+2:
+	sbiw r30, 63
+	sbiw r30, 1 /* Z points to the begin of the block again */
+	movw r22, r30
+
+	push r31
+	push r30
+	push r25
+	push r24
+	push r21
+	push r20
+	rcall sha256_nextBlock
+	pop r20
+	pop r21
+	pop r24
+	pop r25
+	pop r30
+	pop r31
+
+	/* subtract 512 from the length again: the padding bits must not be counted */
+	movw r26, r24
+	adiw r26, 4*8+1 /* we can skip the lowest byte */
+	ld r19, X
+	subi r19, hi8(512)
+	st X+, r19
+	ldi r18, 6
+1:
+	ld r19, X
+	sbci r19, 0
+	st X+, r19
+	dec r18
+	brne 1b
+
+	clr r18 /* a fresh block follows: zeros plus the length field only */
+	/* Z already points to the begin of the block (restored by the pops above) */
+
+sha256_lastBlock_insert_zeros:
+	ldi r19, 64-8
+	sub r19, r18
+	breq sha256_lastBlock_insert_length
+	clr r1
+1:
+	st Z+, r1 /* r1 is zero */
+	dec r19
+	brne 1b
+
+;	rjmp sha256_lastBlock_epilog
+sha256_lastBlock_insert_length:
+	/* the total length in bits is stored big-endian at the end of the block */
+	movw r26, r24 /* X points to the state */
+	adiw r26, 8*4 /* X points to state.length */
+	adiw r30, 8 /* Z points one after the last byte of the block */
+	ld r0, X+
+	add r0, r20
+	st -Z, r0
+	ld r0, X+
+	adc r0, r21
+	st -Z, r0
+	ldi r19, 6
+1:
+	ld r0, X+
+	adc r0, r1
+	st -Z, r0
+	dec r19
+	brne 1b
+
+	sbiw r30, 64-8
+	movw r22, r30
+	rcall sha256_nextBlock
+
+sha256_lastBlock_epilog:
+	in r30, SPL
+	in r31, SPH
+	in r1, SREG
+	adiw r30, 63	; add 64 in two steps,
+	adiw r30, 1	; adiw can add at most 63
+	cli
+	out SPL, r30
+	out SPH, r31
+	out SREG, r1
+	clr r1
+	clr r0
+	ret
+
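+; The padding scheme implemented above, as a C sketch (it assumes that
+; state->length already counts the bits of all previously hashed blocks):
+/*
+	#include <string.h>
+	void sha256_lastBlock(sha256_ctx_t *state, const void *block, uint16_t length_b){
+		uint8_t lb[64];                             // local block buffer
+		while (length_b >= 512){                    // reduce oversized input
+			sha256_nextBlock(state, block);
+			block = (const uint8_t*)block + 64;
+			length_b -= 512;
+		}
+		memset(lb, 0, 64);
+		memcpy(lb, block, (length_b + 7) / 8);
+		lb[length_b / 8] |= 0x80 >> (length_b & 7); // the single stuffing bit
+		if (length_b / 8 + 1 > 64 - 8){             // no room for the length
+			sha256_nextBlock(state, lb);
+			state->length -= 512;               // padding must not count
+			memset(lb, 0, 64);
+		}
+		uint64_t len = state->length + length_b;    // total length in bits
+		for (uint8_t i = 0; i < 8; ++i)
+			lb[63 - i] = (uint8_t)(len >> (8 * i)); // stored big-endian
+		sha256_nextBlock(state, lb);
+	}
+*/
+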
+;###########################################################
+
+.global sha256_nextBlock
+; === sha256_nextBlock ===
+; this is the core function for calculating SHA-256 hashes
+; param1: the 16-bit pointer to the sha256_ctx structure
+;	given in r25,r24 (r25 is most significant)
+; param2: a 16-bit pointer to the 64-byte block to hash
+;	given in r23,r22
+sha256_nextBlock_localSpace = (64+8)*4 ; 64 32-bit values for the w array and 8 32-bit values for the a array (288 bytes total)
+
+Bck1 = 12
+Bck2 = 13
+Bck3 = 14
+Bck4 = 15
+Func1 = 22
+Func2 = 23
+Func3 = 24
+Func4 = 25
+Accu1 = 16
+Accu2 = 17
+Accu3 = 18
+Accu4 = 19
+XAccu1 = 8
+XAccu2 = 9
+XAccu3 = 10
+XAccu4 = 11
+T1 = 4
+T2 = 5
+T3 = 6
+T4 = 7
+LoopC = 1
+/* byteorder: high number <--> high significance */
+sha256_nextBlock:
+	; first, make room for the local variables
+	push r4 /* these pushes & pops could be replaced by memory operations */
+	push r5
+	push r6
+	push r7
+	push r8
+	push r9
+	push r10
+	push r11
+	push r12
+	push r13
+	push r14
+	push r15
+	push r16
+	push r17
+	push r28
+	push r29
+	in r20, SPL
+	in r21, SPH
+	movw r18, r20	; backup SP
+;	movw r26, r20	; X points to free space on stack
+	movw r30, r22	; Z points to the message
+	subi r20, lo8(sha256_nextBlock_localSpace) ; sbiw can handle at most 63
+	sbci r21, hi8(sha256_nextBlock_localSpace)
+	movw r26, r20 ; X points to free space on stack
+	in r0, SREG
+	cli ; we want to be uninterrupted while updating SP
+	out SPL, r20
+	out SPH, r21
+	out SREG, r0
+	push r18
+	push r19
+	push r24
+	push r25 /* param1 will be needed later */
+	; now fill the w array with the message
+	; (the message is big-endian, w is kept little-endian)
+	adiw r26, 1 ; X++
+	ldi r20, 16
+sha256_nextBlock_wcpyloop:
+	ld r23, Z+
+	ld r22, Z+
+	ld r19, Z+
+	ld r18, Z+
+	st X+, r18
+	st X+, r19
+	st X+, r22
+	st X+, r23
+	dec r20
+	brne sha256_nextBlock_wcpyloop
+/* for (i=16; i<64; ++i){
+	w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16];
+	} */
+	/* r25,r24,r23,r22 (r21,r20) are function values
+	   r19,r18,r17,r16 are the accumulator
+	   r15,r14,r13,r12 are backup1
+	   r11,r10,r9 ,r8  are the xor accu
+	   r1 is the round counter */
+
+	ldi r20, 64-16
+	mov LoopC, r20
+sha256_nextBlock_wcalcloop:
+	movw r30, r26 ; copy X to Z
+	sbiw r30, 63
+	sbiw r30, 1	; subtract 64 = 16*4
+	ld Accu1, Z+
+	ld Accu2, Z+
+	ld Accu3, Z+
+	ld Accu4, Z+ /* w[i] = w[i-16] */
+	ld Bck1, Z+
+	ld Bck2, Z+
+	ld Bck3, Z+
+	ld Bck4, Z+ /* backup = w[i-15] */
+	/* now sigma 0 */
+	mov Func1, Bck2
+	mov Func2, Bck3
+	mov Func3, Bck4
+	mov Func4, Bck1 /* prerotated by 8 */
+	ldi r20, 1
+	rcall bitrotl
+	movw XAccu1, Func1
+	movw XAccu3, Func3 /* store ROTR(w[i-15], 7) in the xor accu */
+	movw Func1, Bck3
+	movw Func3, Bck1 /* prerotated by 16 */
+	ldi r20, 2
+	rcall bitrotr
+	eor XAccu1, Func1 /* xor ROTR(w[i-15], 18) */
+	eor XAccu2, Func2
+	eor XAccu3, Func3
+	eor XAccu4, Func4
+	ldi Func2, 3 /* now shr3 */ /* we can destroy the backup now */
+sigma0_shr:
+	lsr Bck4
+	ror Bck3
+	ror Bck2
+	ror Bck1
+	dec Func2
+	brne sigma0_shr
+	eor XAccu1, Bck1
+	eor XAccu2, Bck2
+	eor XAccu3, Bck3
+	eor XAccu4, Bck4 /* xor SHR(w[i-15], 3) */ /* xor accu == sigma0(w[i-15]) */
+	add Accu1, XAccu1
+	adc Accu2, XAccu2
+	adc Accu3, XAccu3
+	adc Accu4, XAccu4 /* finished with sigma0 */
+	ldd Func1, Z+7*4 /* now accu += w[i-7] */
+	ldd Func2, Z+7*4+1
+	ldd Func3, Z+7*4+2
+	ldd Func4, Z+7*4+3
+	add Accu1, Func1
+	adc Accu2, Func2
+	adc Accu3, Func3
+	adc Accu4, Func4
+	ldd Bck1, Z+12*4 /* now backup = w[i-2] */
+	ldd Bck2, Z+12*4+1
+	ldd Bck3, Z+12*4+2
+	ldd Bck4, Z+12*4+3
+	/* now sigma 1 */
+	movw Func1, Bck3
+	movw Func3, Bck1 /* prerotated by 16 */
+	ldi r20, 1
+	rcall bitrotr
+	movw XAccu3, Func3
+	movw XAccu1, Func1 /* store ROTR(w[i-2], 17) in the xor accu */
+;	movw Func1, Bck3
+;	movw Func3, Bck1 /* prerotated by 16 */
+	ldi r20, 2
+	rcall bitrotr
+	eor XAccu1, Func1 /* xor ROTR(w[i-2], 19) */
+	eor XAccu2, Func2
+	eor XAccu3, Func3
+	eor XAccu4, Func4
+	ldi Func2, 2 /* now shr10: shift by 2, then skip a whole byte below */ /* we can destroy the backup now */
+sigma1_shr:
+	lsr Bck4
+	ror Bck3
+	ror Bck2
+	dec Func2
+	brne sigma1_shr
+	eor XAccu1, Bck2
+	eor XAccu2, Bck3
+	eor XAccu3, Bck4 /* xor SHR(w[i-2], 10) */ /* xor accu == sigma1(w[i-2]) */
+	add Accu1, XAccu1
+	adc Accu2, XAccu2
+	adc Accu3, XAccu3
+	adc Accu4, XAccu4 /* finished with sigma1 */
+	/* store w[i] */
+	st X+, Accu1
+	st X+, Accu2
+	st X+, Accu3
+	st X+, Accu4
+	dec LoopC
+	breq 3f ; done if zero
+	rjmp sha256_nextBlock_wcalcloop
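+; The bitrotl/bitrotr calls above implement 32-bit rotations as a byte
+; permutation ("prerotated by 8/16/24") followed by at most three single-bit
+; rotations. A C sketch of the identity being exploited (rotr32_ref is a
+; hypothetical reference helper, not part of this library):
+/*
+	uint32_t rotr32_ref(uint32_t x, uint8_t n){   // reference rotation
+		return (x >> n) | (x << (32 - n));
+	}
+	// e.g. ROTR(x,18) == ROTR(swap-halves(x), 2):
+	// rotr32_ref(x, 18) == rotr32_ref((x >> 16) | (x << 16), 2)
+*/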
+3:
+	/* we are finished with the w array; X points one byte past w */
+/* init a array */
+	pop r31
+	pop r30
+	push r30
+	push r31
+	ldi r25, 8*4 /* 8 32-bit values to copy from ctx to the a array */
+init_a_array:
+	ld r1, Z+
+	st X+, r1
+	dec r25
+	brne init_a_array
+
+/* now the real fun begins */
+/* for (i=0; i<64; ++i){
+	t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i];
+	t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]);
+	memmove(&(a[1]), &(a[0]), 7*4); // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0];
+	a[4] += t1;
+	a[0] = t1 + t2;
+	} */
+	/* Y points to a[0], Z ('cause lpm wants it) points to k[i], X points to w[i] */
+	sbiw r26, 8*4 /* X pointed one past a[7]; now it points at a[0] */
+	movw r28, r26
+	ldi r30, lo8(sha256_kv)
+	ldi r31, hi8(sha256_kv)
+	dec r27 /* X -= 256 (== 64*4); X now points at w[0] */
+	ldi r25, 64
+	mov LoopC, r25
+sha256_main_loop:
+	/* now calculate t1 */
+	/* CH(x,y,z) = (x&y)^((~x)&z) */
+	ldd T1, Y+5*4
+	ldd T2, Y+5*4+1
+	ldd T3, Y+5*4+2
+	ldd T4, Y+5*4+3 /* y in T */
+	ldd Func1, Y+4*4
+	ldd Func2, Y+4*4+1
+	ldd Func3, Y+4*4+2
+	ldd Func4, Y+4*4+3 /* x in Func */
+	ldd Bck1, Y+6*4
+	ldd Bck2, Y+6*4+1
+	ldd Bck3, Y+6*4+2
+	ldd Bck4, Y+6*4+3 /* z in Bck */
+	and T1, Func1
+	and T2, Func2
+	and T3, Func3
+	and T4, Func4
+	com Func1
+	com Func2
+	com Func3
+	com Func4
+	and Bck1, Func1
+	and Bck2, Func2
+	and Bck3, Func3
+	and Bck4, Func4
+	eor T1, Bck1
+	eor T2, Bck2
+	eor T3, Bck3
+	eor T4, Bck4 /* done, CH(x,y,z) is in T */
+	/* now SIGMA1(a[4]) */
+	ldd Bck4, Y+4*4 /* could be taken from the Func regs above */
+	ldd Bck1, Y+4*4+1
+	ldd Bck2, Y+4*4+2
+	ldd Bck3, Y+4*4+3 /* load prerotated by 8 bits */
+	movw Func1, Bck1
+	movw Func3, Bck3
+	ldi r20, 2
+	rcall bitrotl /* rotr(x,6) */
+	movw XAccu1, Func1
+	movw XAccu3, Func3
+	movw Func1, Bck1
+	movw Func3, Bck3
+	ldi r20, 3
+	rcall bitrotr /* rotr(x,11) */
+	eor XAccu1, Func1
+	eor XAccu2, Func2
+	eor XAccu3, Func3
+	eor XAccu4, Func4
+	movw Func1, Bck3 /* this prerotates by a further 16 bits */
+	movw Func3, Bck1 /* so we have now prerotated by 24 bits */
+	ldi r20, 1
+	rcall bitrotr /* rotr(x,25) */
+	eor XAccu1, Func1
+	eor XAccu2, Func2
+	eor XAccu3, Func3
+	eor XAccu4, Func4 /* finished with SIGMA1, add it to T */
+	add T1, XAccu1
+	adc T2, XAccu2
+	adc T3, XAccu3
+	adc T4, XAccu4
+	/* now add a[7], w[i] and k[i] */
+	ldd XAccu1, Y+4*7
+	ldd XAccu2, Y+4*7+1
+	ldd XAccu3, Y+4*7+2
+	ldd XAccu4, Y+4*7+3
+	add T1, XAccu1
+	adc T2, XAccu2
+	adc T3, XAccu3
+	adc T4, XAccu4 /* add a[7] */
+	ld XAccu1, X+
+	ld XAccu2, X+
+	ld XAccu3, X+
+	ld XAccu4, X+
+	add T1, XAccu1
+	adc T2, XAccu2
+	adc T3, XAccu3
+	adc T4, XAccu4 /* add w[i] */
+	lpm XAccu1, Z+
+	lpm XAccu2, Z+
+	lpm XAccu3, Z+
+	lpm XAccu4, Z+
+	add T1, XAccu1
+	adc T2, XAccu2
+	adc T3, XAccu3
+	adc T4, XAccu4 /* add k[i] */ /* finished with t1 */
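+	/* in C, the full round computes (t1 is finished above, t2 follows
+	   below; rotr32 is the helper defined at the end of this file):
+		#define CH(x,y,z)  (((x) & (y)) ^ (~(x) & (z)))
+		#define MAJ(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
+		#define SIGMA0(x)  (rotr32((x), 2) ^ rotr32((x), 13) ^ rotr32((x), 22))
+		#define SIGMA1(x)  (rotr32((x), 6) ^ rotr32((x), 11) ^ rotr32((x), 25))
+		t1 = a[7] + SIGMA1(a[4]) + CH(a[4], a[5], a[6]) + k[i] + w[i];
+		t2 = SIGMA0(a[0]) + MAJ(a[0], a[1], a[2]);
+	*/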
+	/* now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */
+	/* starting with MAJ(x,y,z) */
+	ldd Func1, Y+4*0+0
+	ldd Func2, Y+4*0+1
+	ldd Func3, Y+4*0+2
+	ldd Func4, Y+4*0+3 /* load x=a[0] */
+	ldd XAccu1, Y+4*1+0
+	ldd XAccu2, Y+4*1+1
+	ldd XAccu3, Y+4*1+2
+	ldd XAccu4, Y+4*1+3 /* load y=a[1] */
+	and XAccu1, Func1
+	and XAccu2, Func2
+	and XAccu3, Func3
+	and XAccu4, Func4 /* XAccu == (x & y) */
+	ldd Bck1, Y+4*2+0
+	ldd Bck2, Y+4*2+1
+	ldd Bck3, Y+4*2+2
+	ldd Bck4, Y+4*2+3 /* load z=a[2] */
+	and Func1, Bck1
+	and Func2, Bck2
+	and Func3, Bck3
+	and Func4, Bck4
+	eor XAccu1, Func1
+	eor XAccu2, Func2
+	eor XAccu3, Func3
+	eor XAccu4, Func4 /* XAccu == (x & y) ^ (x & z) */
+	ldd Func1, Y+4*1+0
+	ldd Func2, Y+4*1+1
+	ldd Func3, Y+4*1+2
+	ldd Func4, Y+4*1+3 /* load y=a[1] */
+	and Func1, Bck1
+	and Func2, Bck2
+	and Func3, Bck3
+	and Func4, Bck4
+	eor XAccu1, Func1
+	eor XAccu2, Func2
+	eor XAccu3, Func3
+	eor XAccu4, Func4 /* XAccu == MAJ(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */
+	/* SIGMA0(a[0]) */
+	ldd Bck1, Y+4*0+0 /* could be combined with the load above */
+	ldd Bck2, Y+4*0+1
+	ldd Bck3, Y+4*0+2
+	ldd Bck4, Y+4*0+3
+	movw Func1, Bck1
+	movw Func3, Bck3
+	ldi r20, 2
+	rcall bitrotr
+	movw Accu1, Func1
+	movw Accu3, Func3 /* Accu = rotr(a[0], 2) */
+	movw Func1, Bck3
+	movw Func3, Bck1 /* prerotate by 16 bits */
+	ldi r20, 3
+	rcall bitrotl
+	eor Accu1, Func1
+	eor Accu2, Func2
+	eor Accu3, Func3
+	eor Accu4, Func4 /* Accu ^= rotr(a[0], 13) */
+	mov Func1, Bck4
+	mov Func2, Bck1
+	mov Func3, Bck2
+	mov Func4, Bck3 /* prerotate by 24 bits */
+	ldi r20, 2
+	rcall bitrotl
+	eor Accu1, Func1
+	eor Accu2, Func2
+	eor Accu3, Func3
+	eor Accu4, Func4 /* Accu ^= rotr(a[0], 22) */
+	add Accu1, XAccu1 /* add the previous result (MAJ) */
+	adc Accu2, XAccu2
+	adc Accu3, XAccu3
+	adc Accu4, XAccu4
+	/* the computation is done (t1 in T, t2 in Accu) */
+	/* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */
+
+	ldi r21, 7*4
+	adiw r28, 7*4
+a_shift_loop:
+	ld r25, -Y /* warning: this is PREdecrement */
+	std Y+4, r25
+	dec r21
+	brne a_shift_loop
+
+	ldd Bck1, Y+4*4+0
+	ldd Bck2, Y+4*4+1
+	ldd Bck3, Y+4*4+2
+	ldd Bck4, Y+4*4+3
+	add Bck1, T1
+	adc Bck2, T2
+	adc Bck3, T3
+	adc Bck4, T4
+	std Y+4*4+0, Bck1
+	std Y+4*4+1, Bck2
+	std Y+4*4+2, Bck3
+	std Y+4*4+3, Bck4 /* a[4] += t1 */
+	add Accu1, T1
+	adc Accu2, T2
+	adc Accu3, T3
+	adc Accu4, T4 /* a[0] = t1 + t2 */
+	std Y+4*0+0, Accu1
+	std Y+4*0+1, Accu2
+	std Y+4*0+2, Accu3
+	std Y+4*0+3, Accu4 /* a array updated */
+
+
+	dec LoopC
+	breq update_state
+	rjmp sha256_main_loop ;brne sha256_main_loop
+update_state:
+	/* update state */
+	/* the pointer to the state still exists on the stack */
+	pop r31
+	pop r30
+	ldi r21, 8
+update_state_loop:
+	ldd Accu1, Z+0
+	ldd Accu2, Z+1
+	ldd Accu3, Z+2
+	ldd Accu4, Z+3
+	ld Func1, Y+
+	ld Func2, Y+
+	ld Func3, Y+
+	ld Func4, Y+
+	add Accu1, Func1
+	adc Accu2, Func2
+	adc Accu3, Func3
+	adc Accu4, Func4
+	st Z+, Accu1
+	st Z+, Accu2
+	st Z+, Accu3
+	st Z+, Accu4
+	dec r21
+	brne update_state_loop
+	/* now we just have to update the length */
+	adiw r30, 1 /* since we add 512, we can simply skip the LSB */
+	ldi r21, 2
+	ldi r22, 6
+	ld r20, Z
+	add r20, r21
+	st Z+, r20
+	clr r21
+sha256_nextBlock_fix_length:
+	brcc sha256_nextBlock_epilog
+	ld r20, Z
+	adc r20, r21
+	st Z+, r20
+	dec r22
+	brne sha256_nextBlock_fix_length
+
+; EPILOG
+sha256_nextBlock_epilog:
+/* now clean up the stack */
+	pop r21
+	pop r20
+	in r0, SREG
+	cli ; we want to be uninterrupted while updating SP
+	out SPL, r20
+	out SPH, r21
+	out SREG, r0
+
+	clr r1
+	pop r29
+	pop r28
+	pop r17
+	pop r16
+	pop r15
+	pop r14
+	pop r13
+	pop r12
+	pop r11
+	pop r10
+	pop r9
+	pop r8
+	pop r7
+	pop r6
+	pop r5
+	pop r4
+	ret
+
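+; Each 32-bit round constant below is stored as two little-endian 16-bit
+; .words, low half first: the first pair 0x2f98, 0x428a is k[0] = 0x428a2f98.
+; Reading it back from C could look like this (a sketch; avr-libc pgmspace):
+/*
+	#include <avr/pgmspace.h>
+	extern const uint32_t sha256_kv[64] PROGMEM;
+	uint32_t k_i;
+	memcpy_P(&k_i, &sha256_kv[i], 4);   // the AVR core is little-endian
+*/
+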
+sha256_kv: ; round-key-vector stored in ProgMem
+.word 0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c
+.word 0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b
+.word 0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9
+.word 0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429
+.word 0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272
+.word 0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a
+.word 0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e
+.word 0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671
+
+
+;###########################################################
+
+.global sha256_init
+;uint32_t sha256_init_vector[]={
+;	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+;	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };
+;
+;void sha256_init(sha256_ctx_t *state){
+;	state->length=0;
+;	memcpy(state->h, sha256_init_vector, 8*4);
+;}
+; param1: (r25,r24) 16-bit pointer to a sha256_ctx_t struct in ram
+; modifies: X(r26,r27), Z(r30,r31), r22, r23
+sha256_init:
+	movw r26, r24 ; (r24,r25) --> (r26,r27) load X with param1
+	ldi r30, lo8((sha256_init_vector))
+	ldi r31, hi8((sha256_init_vector))
+	ldi r22, 32+8 ; 8 state words plus the 8 zero bytes of the length field
+sha256_init_vloop:
+	lpm r23, Z+
+	st X+, r23
+	dec r22
+	brne sha256_init_vloop
+	ret
+
+sha256_init_vector:
+.word 0xE667, 0x6A09
+.word 0xAE85, 0xBB67
+.word 0xF372, 0x3C6E
+.word 0xF53A, 0xA54F
+.word 0x527F, 0x510E
+.word 0x688C, 0x9B05
+.word 0xD9AB, 0x1F83
+.word 0xCD19, 0x5BE0
+.word 0x0000, 0x0000
+.word 0x0000, 0x0000
+
+;###########################################################
+
+.global rotl32
+; === ROTL32 ===
+; function that rotates a 32 bit word to the left
+; param1: the 32-bit word to rotate
+;	given in r25,r24,r23,r22 (r25 is most significant)
+; param2: an 8-bit value telling how far to rotate
+;	given in r20
+; modifies: r20, r21; the rotated word is returned in r25,r24,r23,r22
+rotl32:
+	cpi r20, 8
+	brlo bitrotl
+	mov r21, r25
+	mov r25, r24
+	mov r24, r23
+	mov r23, r22
+	mov r22, r21
+	subi r20, 8
+	rjmp rotl32
+bitrotl:
+	clr r21
+	clc
+bitrotl_loop:
+	tst r20
+	breq fixrotl
+	rol r22
+	rol r23
+	rol r24
+	rol r25
+	rol r21
+	dec r20
+	rjmp bitrotl_loop
+fixrotl:
+	or r22, r21
+	ret
+
+
+;###########################################################
+
+.global rotr32
+; === ROTR32 ===
+; function that rotates a 32 bit word to the right
+; param1: the 32-bit word to rotate
+;	given in r25,r24,r23,r22 (r25 is most significant)
+; param2: an 8-bit value telling how far to rotate
+;	given in r20
+; modifies: r20, r21; the rotated word is returned in r25,r24,r23,r22
+rotr32:
+	cpi r20, 8
+	brlo bitrotr
+	mov r21, r22
+	mov r22, r23
+	mov r23, r24
+	mov r24, r25
+	mov r25, r21
+	subi r20, 8
+	rjmp rotr32
+bitrotr:
+	clr r21
+	clc
+bitrotr_loop:
+	tst r20
+	breq fixrotr
+	ror r25
+	ror r24
+	ror r23
+	ror r22
+	ror r21
+	dec r20
+	rjmp bitrotr_loop
+fixrotr:
+	or r25, r21
+	ret
+
+
+;###########################################################
+
+.global change_endian32
+; === change_endian32 ===
+; function that changes the endianness of a 32-bit word
+; param1: the 32-bit word
+;	given in r25,r24,r23,r22 (r25 is most significant)
+; modifies: r20, r21; the result is returned in r25,r24,r23,r22
+change_endian32:
+	movw r20, r22 ; (r22,r23) --> (r20,r21)
+	mov r22, r25
+	mov r23, r24
+	mov r24, r21
+	mov r25, r20
+	ret
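+
+; Example use from C (a sketch; the prototype follows the parameter
+; description in the sha256 header comment above):
+/*
+	#include <stdint.h>
+	extern void sha256(void *dest, const void *msg, uint32_t length_b);
+
+	uint8_t hash[32];
+	const char msg[] = "abc";
+	sha256(hash, msg, 3*8);   // note: the length is given in bits
+*/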