X-Git-Url: https://git.cryptolib.org/?a=blobdiff_plain;f=sha1-asm.S;h=f571685984c5046fd0f4280adbc9a5fe513720a8;hb=53f8a8d7ca5c03120224128e1f8552c99f6af11e;hp=278d52a6a2bd35c715d1170fbd5542a96d7aeb67;hpb=6e51024d966bd015cb8f9c8460c5c21da6a08f9e;p=avr-crypto-lib.git

diff --git a/sha1-asm.S b/sha1-asm.S
index 278d52a..f571685 100644
--- a/sha1-asm.S
+++ b/sha1-asm.S
@@ -1,978 +1,886 @@
-/*
- * Author:	Daniel Otte
- *
- * License: GPL
-*/
-; SHA1 implementation in assembler for AVR
-SHA1_BLOCK_BITS = 512
-SHA1_HASH_BITS = 160
-
-.macro precall
-	/* push r18 - r27, r30 - r31*/
-	push r0
-	push r1
-	push r18
-	push r19
-	push r20
-	push r21
-	push r22
-	push r23
-	push r24
-	push r25
-	push r26
-	push r27
-	push r30
-	push r31
-	clr r1
-.endm
-
-.macro postcall
-	pop r31
-	pop r30
-	pop r27
-	pop r26
-	pop r25
-	pop r24
-	pop r23
-	pop r22
-	pop r21
-	pop r20
-	pop r19
-	pop r18
-	pop r1
-	pop r0
-.endm
-
-
-.macro hexdump length
-	push r27
-	push r26
-	ldi r25, '\r'
-	mov r24, r25
-	call uart_putc
-	ldi r25, '\n'
-	mov r24, r25
-	call uart_putc
-	pop r26
-	pop r27
-	movw r24, r26
-.if \length > 16
-	ldi r22, lo8(16)
-	ldi r23, hi8(16)
-	push r27
-	push r26
-	call uart_hexdump
-	pop r26
-	pop r27
-	adiw r26, 16
-	hexdump \length-16
-.else
-	ldi r22, lo8(\length)
-	ldi r23, hi8(\length)
-	call uart_hexdump
-.endif
-.endm
-
-.macro delay
-/*	
-	push r0
-	push r1
-	clr r0
-1:	clr r1
-2:	dec r1
-	brne 2b
-	dec r0
-	brne 1b
-	pop r1
-	pop r0  // */
-.endm
-
-/* X points to Block */
-.macro dbg_hexdump length
-/*	
-	precall
-	hexdump \length
-	postcall
-	// */
-.endm
-
-
-
-.section .text
-
-SPL = 0x3D
-SPH = 0x3E
-SREG = 0x3F
-
-
-;
-;sha1_ctx_t is:
-;
-; [h0][h1][h2][h3][h4][length]
-; hn is 32 bit large, length is 64 bit large
-
-;###########################################################	
-
-.global sha1_ctx2hash
-; === sha1_ctx2hash ===
-; this function converts a state into a normal hash (bytestring)
-;  param1: the 16-bit destination pointer
-;	given in r25,r24 (r25 is most significant)
-;  param2: the 16-bit pointer to sha1_ctx structure
-;	given in r23,r22
-sha1_ctx2hash:
-	movw r26, r22
-	movw r30, r24
-	ldi r21, 5
-	sbiw r26, 4
-1:	
-	ldi r20, 4
-	adiw r26, 8
-2:	
-		ld r0, -X
-		st Z+, r0	
-	dec r20
-	brne 2b
-	
-	dec r21
-	brne 1b
-	
-	ret
-
-;###########################################################	
-
-.global sha1
-; === sha1 ===
-; this function calculates SHA-1 hashes from messages in RAM
-;  param1: the 16-bit hash destination pointer
-;	given in r25,r24 (r25 is most significant)
-;  param2: the 16-bit pointer to message
-;	given in r23,r22
-;  param3: 32-bit length value (length of message in bits)
-;   given in r21,r20,r19,r18
-sha1:
-sha1_prolog:
-	push r8
-	push r9
-	push r10
-	push r11
-	push r12
-	push r13
-	push r16
-	push r17
-	in r16, SPL
-	in r17, SPH
-	subi r16, 5*4+8 
-	sbci r17, 0	
-	in r0, SREG
-	cli
-	out SPL, r16
-	out SPH, r17
-	out SREG, r0
-	
-	push r25
-	push r24
-	inc r16
-	adc r17, r1
-	
-	movw r8, r18		/* backup of length*/
-	movw r10, r20
-	
-	movw r12, r22	/* backup pf msg-ptr */
-	
-	movw r24, r16
-	rcall sha1_init
-	/* if length >= 512 */
-1:
-	tst r11
-	brne 4f
-	tst r10
-	brne 4f
-	mov r19, r9
-	cpi r19, 0x02
-	brlo 4f
-	
-	movw r24, r16
-	movw r22, r12
-	rcall sha1_nextBlock
-	ldi r19, 0x64
-	add r22, r19
-	adc r23, r1
-	/* length -= 512 */
-	ldi r19, 0x02
-	sub r9, r19
-	sbc r10, r1
-	sbc r11, r1
-	rjmp 1b
-	
-4:
-	movw r24, r16
-	movw r22, r12
-	movw r20, r8
-	rcall sha1_lastBlock
-	
-	pop r24
-	pop r25
-	movw r22, r16
-	rcall sha1_ctx2hash	
-	
-sha1_epilog:
-	in r30, SPL
-	in r31, SPH
-	adiw r30, 5*4+8 	
-	in r0, SREG
-	cli
-	out SPL, r30
-	out SPH, r31
-	out SREG, r0
-	pop r17
-	pop r16
-	pop r13
-	pop r12
-	pop r11
-	pop r10
-	pop r9
-	pop r8
-	ret
-
-;###########################################################	
-
-
-; block MUST NOT be larger than 64 bytes
-
-.global sha1_lastBlock
-; === sha1_lastBlock ===
-; this function does padding & Co. for calculating SHA-1 hashes
-;  param1: the 16-bit pointer to sha1_ctx structure
-;	given in r25,r24 (r25 is most significant)
-;  param2: an 16-bit pointer to 64 byte block to hash
-;	given in r23,r22
-;  param3: an 16-bit integer specifing length of block in bits
-;	given in r21,r20
-sha1_lastBlock_localSpace = (SHA1_BLOCK_BITS/8+1)
-
-
-sha1_lastBlock:
-	tst r20
-	brne sha1_lastBlock_prolog
-	cpi r21, 0x02
-	brne sha1_lastBlock_prolog
-	push r25
-	push r24
-	push r23
-	push r22
-	rcall sha1_nextBlock
-	pop r22
-	pop r23
-	pop r24
-	pop r25
-	clr r21
-	clr r22
-sha1_lastBlock_prolog:
-	/* allocate space on stack */
-	in r30, SPL
-	in r31, SPH
-	in r1, SREG
-	subi r30, lo8(64)
-	sbci r31, hi8(64) /* ??? */
-	cli
-	out SPL, r30
-	out SPH, r31
-	out SREG,r1
-
-	adiw r30, 1 /* SP points to next free byte on stack */
-	mov r18, r20 /* r20 = LSB(length) */
-	lsr r18
-	lsr r18
-	lsr r18
-	bst r21, 0	/* may be we should explain this ... */
-	bld r18, 5  /* now: r18 == length/8 (aka. length in bytes) */
-	
-	
-	movw r26, r22 /* X points to begin of msg */
-	tst r18
-	breq sha1_lastBlock_post_copy
-	mov r1, r18
-sha1_lastBlock_copy_loop:
-	ld r0, X+
-	st Z+, r0
-	dec r1
-	brne sha1_lastBlock_copy_loop
-sha1_lastBlock_post_copy:	
-sha1_lastBlock_insert_stuffing_bit:	
-	ldi r19, 0x80
-	mov r0,r19 	
-	ldi r19, 0x07
-	and r19, r20 /* if we are in bitmode */
-	breq 2f	/* no bitmode */
-1:	
-	lsr r0
-	dec r19
-	brne 1b
-	ld r19, X
-/* maybe we should do some ANDing here, just for safety */
-	or r0, r19
-2:	
-	st Z+, r0
-	inc r18
-
-/* checking stuff here */
-	cpi r18, 64-8+1
-	brsh 0f 
-	rjmp sha1_lastBlock_insert_zeros
-0:
-	/* oh shit, we landed here */
-	/* first we have to fill it up with zeros */
-	ldi r19, 64
-	sub r19, r18
-	breq 2f
-1:	
-	st Z+, r1
-	dec r19
-	brne 1b	
-2:	
-	sbiw r30, 63
-	sbiw r30,  1
-	movw r22, r30
-	
-	push r31
-	push r30
-	push r25
-	push r24
-	push r21
-	push r20
-	rcall sha1_nextBlock
-	pop r20
-	pop r21
-	pop r24
-	pop r25
-	pop r30
-	pop r31
-	
-	/* now we should subtract 512 from length */
-	movw r26, r24
-	adiw r26, 4*5+1 /* we can skip the lowest byte */
-	ld r19, X
-	subi r19, hi8(512)
-	st X+, r19
-	ldi r18, 6
-1:
-	ld r19, X
-	sbci r19, 0
-	st X+, r19
-	dec r18
-	brne 1b
-	
-;	clr r18 /* not neccessary ;-) */
-	/* reset Z pointer to begin of block */
-
-sha1_lastBlock_insert_zeros:	
-	ldi r19, 64-8
-	sub r19, r18
-	breq sha1_lastBlock_insert_length
-	clr r1
-1:
-	st Z+, r1	/* r1 is still zero */
-	dec r19
-	brne 1b
-
-;	rjmp sha1_lastBlock_epilog
-sha1_lastBlock_insert_length:
-	movw r26, r24	/* X points to state */
-	adiw r26, 5*4	/* X points to (state.length) */
-	adiw r30, 8		/* Z points one after the last byte of block */
-	ld r0, X+
-	add r0, r20
-	st -Z, r0
-	ld r0, X+
-	adc r0, r21
-	st -Z, r0
-	ldi r19, 6
-1:
-	ld r0, X+
-	adc r0, r1
-	st -Z, r0
-	dec r19
-	brne 1b
-
-	sbiw r30, 64-8
-	movw r22, r30
-	rcall sha1_nextBlock
-
-sha1_lastBlock_epilog:
-	in r30, SPL
-	in r31, SPH
-	in r1, SREG
-	adiw r30, 63 ; lo8(64)
-	adiw r30,  1  ; hi8(64)
-	cli
-	out SPL, r30
-	out SPH, r31
-	out SREG,r1
-	clr r1
-	clr r0
-	ret
-
-/**/
-;###########################################################	
-
-.global sha1_nextBlock
-; === sha1_nextBlock ===
-; this is the core function for calculating SHA-1 hashes
-;  param1: the 16-bit pointer to sha1_ctx structure
-;	given in r25,r24 (r25 is most significant)
-;  param2: an 16-bit pointer to 64 byte block to hash
-;	given in r23,r22
-sha1_nextBlock_localSpace = (16+5+1)*4 ; 16 32-bit values for w array and 5 32-bit values for a array (total 84 byte)
-
-xtmp = 0
-xNULL = 1
-W1 = 10
-W2 = 11
-T1	= 12
-T2	= 13
-T3	= 14
-T4	= 15
-LoopC = 16
-S	  = 17
-tmp1 = 18
-tmp2 = 19
-tmp3 = 20
-tmp4 = 21
-F1 = 22
-F2 = 23
-F3 = 24
-F4 = 25
-
-/* byteorder: high number <--> high significance */
-sha1_nextBlock:
- ; initial, let's make some space ready for local vars
- 			 /* replace push & pop by mem ops? */
-	push r10
-	push r11
-	push r12
-	push r13
-	push r14
-	push r15
-	push r16
-	push r17
-	push r28
-	push r29
-	in r20, SPL
-	in r21, SPH
-	movw r18, r20			;backup SP
-;	movw r26, r20			; X points to free space on stack /* maybe removeable? */ 
-	movw r30, r22			; Z points to message
-	subi r20, lo8(sha1_nextBlock_localSpace) ;sbiw can do only up to 63
-	sbci r21, hi8(sha1_nextBlock_localSpace)
-	movw r26, r20			; X points to free space on stack 
-	in r0, SREG
-	cli ; we want to be uninterrupted while updating SP
-	out SPL, r20
-	out SPH, r21
-	out SREG, r0
-	
-	push r18
-	push r19 /* push old SP on new stack */
-	push r24
-	push r25 /* param1 will be needed later */
-	
-	/* load a[] with state */
-	movw 28, r24 /* load pointer to state in Y */
-	adiw r26, 1 ; X++
-
-	ldi LoopC, 5*4	
-1:	ld tmp1, Y+
-	st X+, tmp1
-	dec LoopC
-	brne 1b
-
-	movw W1, r26 /* save pointer to w[0] */
-	/* load w[] with endian fixed message */
-		/* we might also use the changeendian32() function at bottom */
-	movw r30, r22 /* mv param2 (ponter to msg) to Z */	
-	ldi LoopC, 16
-1:
-	ldd tmp1, Z+3
-	st X+, tmp1
-	ldd tmp1, Z+2
-	st X+, tmp1
-	ldd tmp1, Z+1
-	st X+, tmp1
-	ld tmp1, Z
-	st X+, tmp1
-	adiw r30, 4
-	dec LoopC
-	brne 1b
-	
-	;clr LoopC /* LoopC is named t in FIPS 180-2 */	
-	clr xtmp
-sha1_nextBlock_mainloop:
-	mov S, LoopC
-	lsl S
-	lsl S
-	andi S, 0x3C /* S is a bytepointer so *4 */
-	/* load w[s] */
-	movw r26, W1
-	add r26, S /* X points at w[s] */
-	adc r27, xNULL
-	ld T1, X+
-	ld T2, X+
-	ld T3, X+
-	ld T4, X+
-
-	/**/
-	push r26
-	push r27
-	push T4
-	push T3
-	push T2
-	push T1
-	in r26, SPL
-	in r27, SPH
-	adiw r26, 1
-	dbg_hexdump 4
-	pop T1
-	pop T2
-	pop T3
-	pop T4
-	pop r27
-	pop r26
-	/**/
-
-	cpi LoopC, 16
-	brlt sha1_nextBlock_mainloop_core
-	/* update w[s] */
-	ldi tmp1, 2*4
-	rcall 1f
-	ldi tmp1, 8*4
-	rcall 1f
-	ldi tmp1, 13*4
-	rcall 1f
-	rjmp 2f
-1:		/* this might be "outsourced" to save the jump above */
-	add tmp1, S
-	andi tmp1, 0x3f
-	movw r26, W1
-	add r26, tmp1
-	adc r27, xNULL
-	ld tmp2, X+
-	eor T1, tmp2
-	ld tmp2, X+
-	eor T2, tmp2
-	ld tmp2, X+
-	eor T3, tmp2
-	ld tmp2, X+
-	eor T4, tmp2
-	ret
-2:	/* now we just hav to do a ROTL(T) and save T back */
-	mov tmp2, T4
-	rol tmp2
-	rol T1
-	rol T2
-	rol T3
-	rol T4
-	movw r26, W1
-	add r26, S
-	adc r27, xNULL
-	st X+, T1
-	st X+, T2
-	st X+, T3
-	st X+, T4
-	
-sha1_nextBlock_mainloop_core:	/* ther core function; T=ROTL5(a) ....*/	
-								/* T already contains w[s] */
-	movw r26, W1
-	sbiw r26, 4*1		/* X points at a[4] aka e */
-	ld tmp1, X+ 
-	add T1, tmp1
-	ld tmp1, X+ 
-	adc T2, tmp1
-	ld tmp1, X+ 
-	adc T3, tmp1
-	ld tmp1, X+ 
-	adc T4, tmp1		/* T = w[s]+e */
-	sbiw r26, 4*5		/* X points at a[0] aka a */
-	ld F1, X+ 
-	ld F2, X+ 
-	ld F3, X+ 
-	ld F4, X+ 
-	mov tmp1, F4		/* X points at a[1] aka b */
-	ldi tmp2, 5
-1:
-	rol tmp1
-	rol F1
-	rol F2
-	rol F3
-	rol F4
-	dec tmp2
-	brne 1b
-	
-	add T1, F1
-	adc T2, F2
-	adc T3, F3
-	adc T4, F4 /* T = ROTL(a,5) + e + w[s] */
-	
-	/* now we have to do this fucking conditional stuff */
-	ldi r30, lo8(sha1_nextBlock_xTable)
-	ldi r31, hi8(sha1_nextBlock_xTable)
-	add r30, xtmp
-	adc r31, xNULL
-	lpm tmp1, Z
-	cp tmp1, LoopC
-	brne 1f
-	inc xtmp
-1:	ldi r30, lo8(sha1_nextBlock_KTable)
-	ldi r31, hi8(sha1_nextBlock_KTable)
-	lsl xtmp
-	lsl xtmp
-	add r30, xtmp
-	adc r31, xNULL
-	lsr xtmp
-	lsr xtmp
-	 
-	lpm tmp1, Z+
-	add T1, tmp1
-	lpm tmp1, Z+
-	adc T2, tmp1
-	lpm tmp1, Z+
-	adc T3, tmp1
-	lpm tmp1, Z+
-	adc T4, tmp1
-			/* T = ROTL(a,5) + e + kt + w[s] */
-	
-	/* wo Z-4 gerade auf kt zeigt ... */
-	movw r28, r26 /* copy X in Y */
-	adiw r30, 3*4 /* now Z points to the rigth locatin in our jump-vector-table */
-	clc
-	ror r31
-	ror r30
-		
-	icall
-	mov F1, tmp1
-	icall
-	mov F2, tmp1
-	icall
-	mov F3, tmp1
-	icall
-	
-	add T1, F1
-	adc T2, F2
-	adc T3, F3
-	adc T4, tmp1 /* T = ROTL5(a) + f_t(b,c,d) + e + k_t + w[s] */
-				 /* X points still at a[1] aka b, Y points at a[2] aka c */	
-	/* update a[] */
-sha1_nextBlock_update_a:
-	/*first we move all vars in a[] "one up" e=d, d=c, c=b, b=a*/
-	//adiw r28, 3*4  /* Y should point at a[4] aka e */
-	movw r28, W1
-	sbiw r28, 4
-	
-	ldi tmp2, 4*4 
-1:	
-	ld tmp1, -Y
-	std Y+4, tmp1
-	dec tmp2
-	brne 1b
-	/* Y points at a[0] aka a*/
-	
-	movw r28, W1
-	sbiw r28, 5*4
-	/* store T in a[0] aka a */
-	st Y+, T1
-	st Y+, T2
-	st Y+, T3
-	st Y+, T4
-	/* Y points at a[1] aka b*/
-	
-	/* rotate c */
-	ldd T1, Y+1*4
-	ldd T2, Y+1*4+1
-	ldd T3, Y+1*4+2
-	ldd T4, Y+1*4+3
-	mov tmp1, T1
-	ldi tmp2, 2
-1:	ror tmp1
-	ror T4
-	ror T3
-	ror T2
-	ror T1
-	dec tmp2
-	brne 1b
-	std Y+1*4+0, T1
-	std Y+1*4+1, T2
-	std Y+1*4+2, T3
-	std Y+1*4+3, T4
-	
-	push r27
-	push r26
-	movw r26, W1
-	sbiw r26, 4*5
-	dbg_hexdump 4*5
-	pop r26
-	pop r27
-	
-	inc LoopC
-	cpi LoopC, 80
-	brge 1f
-	jmp sha1_nextBlock_mainloop
-/**************************************/
-1:	
-   /* littel patch */
-	sbiw r28, 4
-
-/* add a[] to state and inc length */	
-	pop r27
-	pop r26		/* now X points to state (and Y still at a[0]) */
-	ldi tmp4, 5
-1:	clc
-	ldi tmp3, 4
-2:	ld tmp1, X
-	ld tmp2, Y+
-	adc tmp1, tmp2
-	st X+, tmp1
-	dec tmp3
-	brne 2b
-	dec tmp4
-	brne 1b
-	
-	/* now length += 512 */
-	adiw r26, 1 /* we skip the least significant byte */
-	ld tmp1, X
-	ldi tmp2, hi8(512) /* 2 */
-	add tmp1, tmp2
-	st X+, tmp1
-	ldi tmp2, 6
-1:
-	ld tmp1, X
-	adc tmp1, xNULL
-	st X+, tmp1
-	dec tmp2
-	brne 1b
-	
-; EPILOG
-sha1_nextBlock_epilog:
-/* now we should clean up the stack */
-	pop r21
-	pop r20
-	in r0, SREG
-	cli ; we want to be uninterrupted while updating SP
-	out SPL, r20
-	out SPH, r21
-	out SREG, r0
-	
-	clr r1
-	pop r29
-	pop r28
-	pop r17
-	pop r16
-	pop r15
-	pop r14
-	pop r13
-	pop r12
-	pop r11
-	pop r10
-	ret
-
-sha1_nextBlock_xTable:
-.byte 20,40,60,0
-sha1_nextBlock_KTable:
-.int	0x5a827999 
-.int	0x6ed9eba1 
-.int	0x8f1bbcdc 
-.int	0xca62c1d6
-sha1_nextBlock_JumpTable:
-jmp sha1_nextBlock_Ch	
-jmp sha1_nextBlock_Parity
-jmp sha1_nextBlock_Maj
-jmp sha1_nextBlock_Parity
-
-	 /* X and Y still point at a[1] aka b ; return value in tmp1 */
-sha1_nextBlock_Ch:
-	ld tmp1, Y+
-	mov tmp2, tmp1
-	com tmp2
-	ldd tmp3, Y+3	/* load from c */
-	and tmp1, tmp3
-	ldd tmp3, Y+7	/* load from d */
-	and tmp2, tmp3
-	eor tmp1, tmp2
-	/**
-	precall
-	ldi r24, lo8(ch_str)
-	ldi r25, hi8(ch_str)
-	call uart_putstr_P
-	postcall
-	/**/
-	ret
-	
-sha1_nextBlock_Maj:
-	ld tmp1, Y+
-	mov tmp2, tmp1
-	ldd tmp3, Y+3	/* load from c */
-	and tmp1, tmp3
-	ldd tmp4, Y+7	/* load from d */
-	and tmp2, tmp4
-	eor tmp1, tmp2
-	and tmp3, tmp4
-	eor tmp1, tmp3
-	/**
-	precall
-	ldi r24, lo8(maj_str)
-	ldi r25, hi8(maj_str)
-	call uart_putstr_P
-	postcall
-	/**/
-	ret
-
-sha1_nextBlock_Parity:
-	ld tmp1, Y+
-	ldd tmp2, Y+3	/* load from c */
-	eor tmp1, tmp2
-	ldd tmp2, Y+7	/* load from d */
-	eor tmp1, tmp2
-	
-	/**
-	precall
-	ldi r24, lo8(parity_str)
-	ldi r25, hi8(parity_str)
-	call uart_putstr_P
-	postcall
-	/**/
-	ret
-/*	
-ch_str:			.asciz "\r\nCh"
-maj_str:		.asciz "\r\nMaj"
-parity_str:	.asciz "\r\nParity"
-*/
-;###########################################################	
-
-.global sha1_init 
-;void sha1_init(sha1_ctx_t *state){
-;	DEBUG_S("\r\nSHA1_INIT");
-;	state->h[0] = 0x67452301;
-;	state->h[1] = 0xefcdab89;
-;	state->h[2] = 0x98badcfe;
-;	state->h[3] = 0x10325476;
-;	state->h[4] = 0xc3d2e1f0;
-;	state->length = 0;
-;}
-; param1: (Func3,r24) 16-bit pointer to sha1_ctx_t struct in ram
-; modifys: Z(r30,r31), Func1, r22
-sha1_init:
-	movw r26, r24 ; (24,25) --> (26,27) load X with param1
-	ldi r30, lo8((sha1_init_vector))
-	ldi r31, hi8((sha1_init_vector))
-	ldi r22, 5*4 /* bytes to copy */
-sha1_init_vloop:	
-	lpm r23, Z+ 
-	st X+, r23
-	dec r22
-	brne sha1_init_vloop
-	ldi r22, 8
-	clr r1 /* this should not be needed */
-sha1_init_lloop:
-	st X+, r1
-	dec r22
-	brne sha1_init_lloop
-	ret
-	
-sha1_init_vector:
-.int 0x67452301;
-.int 0xefcdab89;
-.int 0x98badcfe;
-.int 0x10325476;
-.int 0xc3d2e1f0;
-/*
-;###########################################################	
-
-.global rotl32
-; === ROTL32 ===
-; function that rotates a 32 bit word to the left
-;  param1: the 32-bit word to rotate
-;	given in r25,r24,r23,r22 (r25 is most significant)
-;  param2: an 8-bit value telling how often to rotate
-;	given in r20
-; modifys: r21, r22
-rotl32:
-	cpi r20, 8
-	brlo bitrotl
-	mov r21, r25
-	mov r25, r24
-	mov r24, r23
-	mov r23, r22
-	mov r22, r21
-	subi r20, 8
-	rjmp rotr32
-bitrotl:
-	clr r21
-	clc
-bitrotl_loop:	
-	tst r20
-	breq fixrotl
-	rol r22
-	rol r23
-	rol r24
-	rol r25
-	rol r21
-	dec r20
-	rjmp bitrotl_loop
-fixrotl:
-	or r22, r21
-	ret
-	
-
-;###########################################################	
-
-.global rotr32
-; === ROTR32 ===
-; function that rotates a 32 bit word to the right
-;  param1: the 32-bit word to rotate
-;	given in r25,r24,r23,22 (r25 is most significant)
-;  param2: an 8-bit value telling how often to rotate
-;	given in r20
-; modifys: r21, r22
-rotr32:
-	cpi r20, 8
-	brlo bitrotr
-	mov r21, r22
-	mov r22, r23
-	mov r23, r24
-	mov r24, r25
-	mov r25, r21
-	subi r20, 8
-	rjmp rotr32
-bitrotr:
-	clr r21
-	clc
-bitrotr_loop:	
-	tst r20
-	breq fixrotr
-	ror r25
-	ror r24
-	ror r23
-	ror r22
-	ror r21
-	dec r20
-	rjmp bitrotr_loop
-fixrotr:
-	or r25, r21
-	ret
-	
-	
-;###########################################################	
-	
-.global change_endian32
-; === change_endian32 ===
-; function that changes the endianess of a 32-bit word
-;  param1: the 32-bit word
-;	given in r25,r24,r23,22 (r25 is most significant)
-;  modifys: r21, r22
-change_endian32:
-	movw r20,  r22 ; (r22,r23) --> (r20,r21)
-	mov r22, r25
-	mov r23, r24
-	mov r24, r21
-	mov r25, r20 
-	ret
-*/
+/* sha1-asm.S */
+/*
+    This file is part of the AVR-Crypto-Lib.
+    Copyright (C) 2008  Daniel Otte (daniel.otte@rub.de)
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ * Author:	Daniel Otte
+ *
+ * License: GPLv3 or later
+*/
+; SHA1 implementation in assembler for AVR
+SHA1_BLOCK_BITS = 512
+SHA1_HASH_BITS = 160
+
+.macro precall
+	/* push r18 - r27, r30 - r31*/
+	push r0
+	push r1
+	push r18
+	push r19
+	push r20
+	push r21
+	push r22
+	push r23
+	push r24
+	push r25
+	push r26
+	push r27
+	push r30
+	push r31
+	clr r1
+.endm
+
+.macro postcall
+	pop r31
+	pop r30
+	pop r27
+	pop r26
+	pop r25
+	pop r24
+	pop r23
+	pop r22
+	pop r21
+	pop r20
+	pop r19
+	pop r18
+	pop r1
+	pop r0
+.endm
+
+
+.macro hexdump length
+	push r27
+	push r26
+	ldi r25, '\r'
+	mov r24, r25
+	call uart_putc
+	ldi r25, '\n'
+	mov r24, r25
+	call uart_putc
+	pop r26
+	pop r27
+	movw r24, r26
+.if \length > 16
+	ldi r22, lo8(16)
+	ldi r23, hi8(16)
+	push r27
+	push r26
+	call uart_hexdump
+	pop r26
+	pop r27
+	adiw r26, 16
+	hexdump \length-16
+.else
+	ldi r22, lo8(\length)
+	ldi r23, hi8(\length)
+	call uart_hexdump
+.endif
+.endm
+
+.macro delay
+/*	
+	push r0
+	push r1
+	clr r0
+1:	clr r1
+2:	dec r1
+	brne 2b
+	dec r0
+	brne 1b
+	pop r1
+	pop r0  // */
+.endm
+
+/* X points to Block */
+.macro dbg_hexdump length
+/*	
+	precall
+	hexdump \length
+	postcall
+	// */
+.endm
+
+
+
+.section .text
+
+SPL = 0x3D
+SPH = 0x3E
+SREG = 0x3F
+
+
+;
+;sha1_ctx_t is:
+;
+; [h0][h1][h2][h3][h4][length]
+; hn is 32 bit large, length is 64 bit large
+
+;###########################################################	
+
+.global sha1_ctx2hash
+; === sha1_ctx2hash ===
+; this function converts a state into a normal hash (bytestring)
+;  param1: the 16-bit destination pointer
+;	given in r25,r24 (r25 is most significant)
+;  param2: the 16-bit pointer to sha1_ctx structure
+;	given in r23,r22
+sha1_ctx2hash:
+	movw r26, r22
+	movw r30, r24
+	ldi r21, 5
+	sbiw r26, 4
+1:	
+	ldi r20, 4
+	adiw r26, 8
+2:	
+		ld r0, -X
+		st Z+, r0	
+	dec r20
+	brne 2b
+	
+	dec r21
+	brne 1b
+	
+	ret
+
+;###########################################################	
+
+.global sha1
+; === sha1 ===
+; this function calculates SHA-1 hashes from messages in RAM
+;  param1: the 16-bit hash destination pointer
+;	given in r25,r24 (r25 is most significant)
+;  param2: the 16-bit pointer to message
+;	given in r23,r22
+;  param3: 32-bit length value (length of message in bits)
+;   given in r21,r20,r19,r18
+sha1:
+sha1_prolog:
+	push r8
+	push r9
+	push r10
+	push r11
+	push r12
+	push r13
+	push r16
+	push r17
+	in r16, SPL
+	in r17, SPH
+	subi r16, 5*4+8 
+	sbci r17, 0	
+	in r0, SREG
+	cli
+	out SPL, r16
+	out SPH, r17
+	out SREG, r0
+	
+	push r25
+	push r24
+	inc r16
+	adc r17, r1
+	
+	movw r8, r18		/* backup of length*/
+	movw r10, r20
+	
+	movw r12, r22	/* backup pf msg-ptr */
+	
+	movw r24, r16
+	rcall sha1_init
+	/* if length >= 512 */
+1:
+	tst r11
+	brne 4f
+	tst r10
+	brne 4f
+	mov r19, r9
+	cpi r19, 0x02
+	brlo 4f
+	
+	movw r24, r16
+	movw r22, r12
+	rcall sha1_nextBlock
+	ldi r19, 0x64
+	add r22, r19
+	adc r23, r1
+	/* length -= 512 */
+	ldi r19, 0x02
+	sub r9, r19
+	sbc r10, r1
+	sbc r11, r1
+	rjmp 1b
+	
+4:
+	movw r24, r16
+	movw r22, r12
+	movw r20, r8
+	rcall sha1_lastBlock
+	
+	pop r24
+	pop r25
+	movw r22, r16
+	rcall sha1_ctx2hash	
+	
+sha1_epilog:
+	in r30, SPL
+	in r31, SPH
+	adiw r30, 5*4+8 	
+	in r0, SREG
+	cli
+	out SPL, r30
+	out SPH, r31
+	out SREG, r0
+	pop r17
+	pop r16
+	pop r13
+	pop r12
+	pop r11
+	pop r10
+	pop r9
+	pop r8
+	ret
+
+;###########################################################	
+
+
+; block MUST NOT be larger than 64 bytes
+
+.global sha1_lastBlock
+; === sha1_lastBlock ===
+; this function does padding & Co. for calculating SHA-1 hashes
+;  param1: the 16-bit pointer to sha1_ctx structure
+;	given in r25,r24 (r25 is most significant)
+;  param2: an 16-bit pointer to 64 byte block to hash
+;	given in r23,r22
+;  param3: an 16-bit integer specifing length of block in bits
+;	given in r21,r20
+sha1_lastBlock_localSpace = (SHA1_BLOCK_BITS/8+1)
+
+
+sha1_lastBlock:
+	cpi r21, 0x02
+	brlo sha1_lastBlock_prolog
+	push r25
+	push r24
+	push r23
+	push r22
+	push r21
+	push r20
+	rcall sha1_nextBlock
+	pop r20
+	pop r21
+	pop r22
+	pop r23
+	pop r24
+	pop r25
+	subi r21, 2
+	subi r23, -2
+	rjmp sha1_lastBlock
+sha1_lastBlock_prolog:
+	/* allocate space on stack */
+	in r30, SPL
+	in r31, SPH
+	in r1, SREG
+	subi r30, lo8(64)
+	sbci r31, hi8(64) /* ??? */
+	cli
+	out SPL, r30
+	out SPH, r31
+	out SREG,r1
+
+	adiw r30, 1 /* SP points to next free byte on stack */
+	mov r18, r20 /* r20 = LSB(length) */
+	lsr r18
+	lsr r18
+	lsr r18
+	bst r21, 0	/* may be we should explain this ... */
+	bld r18, 5  /* now: r18 == length/8 (aka. length in bytes) */
+	
+	
+	movw r26, r22 /* X points to begin of msg */
+	tst r18
+	breq sha1_lastBlock_post_copy
+	mov r1, r18
+sha1_lastBlock_copy_loop:
+	ld r0, X+
+	st Z+, r0
+	dec r1
+	brne sha1_lastBlock_copy_loop
+sha1_lastBlock_post_copy:	
+sha1_lastBlock_insert_stuffing_bit:	
+	ldi r19, 0x80
+	mov r0,r19 	
+	ldi r19, 0x07
+	and r19, r20 /* if we are in bitmode */
+	breq 2f	/* no bitmode */
+1:	
+	lsr r0
+	dec r19
+	brne 1b
+	ld r19, X
+/* maybe we should do some ANDing here, just for safety */
+	or r0, r19
+2:	
+	st Z+, r0
+	inc r18
+
+/* checking stuff here */
+	cpi r18, 64-8+1
+	brsh 0f 
+	rjmp sha1_lastBlock_insert_zeros
+0:
+	/* oh shit, we landed here */
+	/* first we have to fill it up with zeros */
+	ldi r19, 64
+	sub r19, r18
+	breq 2f
+1:	
+	st Z+, r1
+	dec r19
+	brne 1b	
+2:	
+	sbiw r30, 63
+	sbiw r30,  1
+	movw r22, r30
+	
+	push r31
+	push r30
+	push r25
+	push r24
+	push r21
+	push r20
+	rcall sha1_nextBlock
+	pop r20
+	pop r21
+	pop r24
+	pop r25
+	pop r30
+	pop r31
+	
+	/* now we should subtract 512 from length */
+	movw r26, r24
+	adiw r26, 4*5+1 /* we can skip the lowest byte */
+	ld r19, X
+	subi r19, hi8(512)
+	st X+, r19
+	ldi r18, 6
+1:
+	ld r19, X
+	sbci r19, 0
+	st X+, r19
+	dec r18
+	brne 1b
+	
+;	clr r18 /* not neccessary ;-) */
+	/* reset Z pointer to begin of block */
+
+sha1_lastBlock_insert_zeros:	
+	ldi r19, 64-8
+	sub r19, r18
+	breq sha1_lastBlock_insert_length
+	clr r1
+1:
+	st Z+, r1	/* r1 is still zero */
+	dec r19
+	brne 1b
+
+;	rjmp sha1_lastBlock_epilog
+sha1_lastBlock_insert_length:
+	movw r26, r24	/* X points to state */
+	adiw r26, 5*4	/* X points to (state.length) */
+	adiw r30, 8		/* Z points one after the last byte of block */
+	ld r0, X+
+	add r0, r20
+	st -Z, r0
+	ld r0, X+
+	adc r0, r21
+	st -Z, r0
+	ldi r19, 6
+1:
+	ld r0, X+
+	adc r0, r1
+	st -Z, r0
+	dec r19
+	brne 1b
+
+	sbiw r30, 64-8
+	movw r22, r30
+	rcall sha1_nextBlock
+
+sha1_lastBlock_epilog:
+	in r30, SPL
+	in r31, SPH
+	in r1, SREG
+	adiw r30, 63 ; lo8(64)
+	adiw r30,  1  ; hi8(64)
+	cli
+	out SPL, r30
+	out SPH, r31
+	out SREG,r1
+	clr r1
+	clr r0
+	ret
+
+/**/
+;###########################################################	
+
+.global sha1_nextBlock
+; === sha1_nextBlock ===
+; this is the core function for calculating SHA-1 hashes
+;  param1: the 16-bit pointer to sha1_ctx structure
+;	given in r25,r24 (r25 is most significant)
+;  param2: an 16-bit pointer to 64 byte block to hash
+;	given in r23,r22
+sha1_nextBlock_localSpace = (16+5+1)*4 ; 16 32-bit values for w array and 5 32-bit values for a array (total 84 byte)
+
+xtmp = 0
+xNULL = 1
+W1 = 10
+W2 = 11
+T1	= 12
+T2	= 13
+T3	= 14
+T4	= 15
+LoopC = 16
+S	  = 17
+tmp1 = 18
+tmp2 = 19
+tmp3 = 20
+tmp4 = 21
+F1 = 22
+F2 = 23
+F3 = 24
+F4 = 25
+
+/* byteorder: high number <--> high significance */
+sha1_nextBlock:
+ ; initial, let's make some space ready for local vars
+ 			 /* replace push & pop by mem ops? */
+	push r10
+	push r11
+	push r12
+	push r13
+	push r14
+	push r15
+	push r16
+	push r17
+	push r28
+	push r29
+	in r20, SPL
+	in r21, SPH
+	movw r18, r20			;backup SP
+;	movw r26, r20			; X points to free space on stack /* maybe removeable? */ 
+	movw r30, r22			; Z points to message
+	subi r20, lo8(sha1_nextBlock_localSpace) ;sbiw can do only up to 63
+	sbci r21, hi8(sha1_nextBlock_localSpace)
+	movw r26, r20			; X points to free space on stack 
+	in r0, SREG
+	cli ; we want to be uninterrupted while updating SP
+	out SPL, r20
+	out SPH, r21
+	out SREG, r0
+	
+	push r18
+	push r19 /* push old SP on new stack */
+	push r24
+	push r25 /* param1 will be needed later */
+	
+	/* load a[] with state */
+	movw 28, r24 /* load pointer to state in Y */
+	adiw r26, 1 ; X++
+
+	ldi LoopC, 5*4	
+1:	ld tmp1, Y+
+	st X+, tmp1
+	dec LoopC
+	brne 1b
+
+	movw W1, r26 /* save pointer to w[0] */
+	/* load w[] with endian fixed message */
+		/* we might also use the changeendian32() function at bottom */
+	movw r30, r22 /* mv param2 (ponter to msg) to Z */	
+	ldi LoopC, 16
+1:
+	ldd tmp1, Z+3
+	st X+, tmp1
+	ldd tmp1, Z+2
+	st X+, tmp1
+	ldd tmp1, Z+1
+	st X+, tmp1
+	ld tmp1, Z
+	st X+, tmp1
+	adiw r30, 4
+	dec LoopC
+	brne 1b
+	
+	;clr LoopC /* LoopC is named t in FIPS 180-2 */	
+	clr xtmp
+sha1_nextBlock_mainloop:
+	mov S, LoopC
+	lsl S
+	lsl S
+	andi S, 0x3C /* S is a bytepointer so *4 */
+	/* load w[s] */
+	movw r26, W1
+	add r26, S /* X points at w[s] */
+	adc r27, xNULL
+	ld T1, X+
+	ld T2, X+
+	ld T3, X+
+	ld T4, X+
+
+	/**/
+	push r26
+	push r27
+	push T4
+	push T3
+	push T2
+	push T1
+	in r26, SPL
+	in r27, SPH
+	adiw r26, 1
+	dbg_hexdump 4
+	pop T1
+	pop T2
+	pop T3
+	pop T4
+	pop r27
+	pop r26
+	/**/
+
+	cpi LoopC, 16
+	brlt sha1_nextBlock_mainloop_core
+	/* update w[s] */
+	ldi tmp1, 2*4
+	rcall 1f
+	ldi tmp1, 8*4
+	rcall 1f
+	ldi tmp1, 13*4
+	rcall 1f
+	rjmp 2f
+1:		/* this might be "outsourced" to save the jump above */
+	add tmp1, S
+	andi tmp1, 0x3f
+	movw r26, W1
+	add r26, tmp1
+	adc r27, xNULL
+	ld tmp2, X+
+	eor T1, tmp2
+	ld tmp2, X+
+	eor T2, tmp2
+	ld tmp2, X+
+	eor T3, tmp2
+	ld tmp2, X+
+	eor T4, tmp2
+	ret
+2:	/* now we just hav to do a ROTL(T) and save T back */
+	mov tmp2, T4
+	rol tmp2
+	rol T1
+	rol T2
+	rol T3
+	rol T4
+	movw r26, W1
+	add r26, S
+	adc r27, xNULL
+	st X+, T1
+	st X+, T2
+	st X+, T3
+	st X+, T4
+	
+sha1_nextBlock_mainloop_core:	/* ther core function; T=ROTL5(a) ....*/	
+								/* T already contains w[s] */
+	movw r26, W1
+	sbiw r26, 4*1		/* X points at a[4] aka e */
+	ld tmp1, X+ 
+	add T1, tmp1
+	ld tmp1, X+ 
+	adc T2, tmp1
+	ld tmp1, X+ 
+	adc T3, tmp1
+	ld tmp1, X+ 
+	adc T4, tmp1		/* T = w[s]+e */
+	sbiw r26, 4*5		/* X points at a[0] aka a */
+	ld F1, X+ 
+	ld F2, X+ 
+	ld F3, X+ 
+	ld F4, X+ 
+	mov tmp1, F4		/* X points at a[1] aka b */
+	ldi tmp2, 5
+1:
+	rol tmp1
+	rol F1
+	rol F2
+	rol F3
+	rol F4
+	dec tmp2
+	brne 1b
+	
+	add T1, F1
+	adc T2, F2
+	adc T3, F3
+	adc T4, F4 /* T = ROTL(a,5) + e + w[s] */
+	
+	/* now we have to do this fucking conditional stuff */
+	ldi r30, lo8(sha1_nextBlock_xTable)
+	ldi r31, hi8(sha1_nextBlock_xTable)
+	add r30, xtmp
+	adc r31, xNULL
+	lpm tmp1, Z
+	cp tmp1, LoopC
+	brne 1f
+	inc xtmp
+1:	ldi r30, lo8(sha1_nextBlock_KTable)
+	ldi r31, hi8(sha1_nextBlock_KTable)
+	lsl xtmp
+	lsl xtmp
+	add r30, xtmp
+	adc r31, xNULL
+	lsr xtmp
+	lsr xtmp
+	 
+	lpm tmp1, Z+
+	add T1, tmp1
+	lpm tmp1, Z+
+	adc T2, tmp1
+	lpm tmp1, Z+
+	adc T3, tmp1
+	lpm tmp1, Z+
+	adc T4, tmp1
+			/* T = ROTL(a,5) + e + kt + w[s] */
+	
+	/* Z-4 is just pointing to kt ... */
+	movw r28, r26 /* copy X in Y */
+	adiw r30, 3*4 /* now Z points to the rigth locatin in our jump-vector-table */
+	lsr r31
+	ror r30
+		
+	icall
+	mov F1, tmp1
+	icall
+	mov F2, tmp1
+	icall
+	mov F3, tmp1
+	icall
+	
+	add T1, F1
+	adc T2, F2
+	adc T3, F3
+	adc T4, tmp1 /* T = ROTL5(a) + f_t(b,c,d) + e + k_t + w[s] */
+				 /* X points still at a[1] aka b, Y points at a[2] aka c */	
+	/* update a[] */
+sha1_nextBlock_update_a:
+	/*first we move all vars in a[] "one up" e=d, d=c, c=b, b=a*/
+	//adiw r28, 3*4  /* Y should point at a[4] aka e */
+	movw r28, W1
+	sbiw r28, 4
+	
+	ldi tmp2, 4*4 
+1:	
+	ld tmp1, -Y
+	std Y+4, tmp1
+	dec tmp2
+	brne 1b
+	/* Y points at a[0] aka a*/
+	
+	movw r28, W1
+	sbiw r28, 5*4
+	/* store T in a[0] aka a */
+	st Y+, T1
+	st Y+, T2
+	st Y+, T3
+	st Y+, T4
+	/* Y points at a[1] aka b*/
+	
+	/* rotate c */
+	ldd T1, Y+1*4
+	ldd T2, Y+1*4+1
+	ldd T3, Y+1*4+2
+	ldd T4, Y+1*4+3
+	mov tmp1, T1
+	ldi tmp2, 2
+1:	ror tmp1
+	ror T4
+	ror T3
+	ror T2
+	ror T1
+	dec tmp2
+	brne 1b
+	std Y+1*4+0, T1
+	std Y+1*4+1, T2
+	std Y+1*4+2, T3
+	std Y+1*4+3, T4
+	
+	push r27
+	push r26
+	movw r26, W1
+	sbiw r26, 4*5
+	dbg_hexdump 4*5
+	pop r26
+	pop r27
+	
+	inc LoopC
+	cpi LoopC, 80
+	brge 1f
+	rjmp sha1_nextBlock_mainloop
+/**************************************/
+1:	
+   /* littel patch */
+	sbiw r28, 4
+
+/* add a[] to state and inc length */	
+	pop r27
+	pop r26		/* now X points to state (and Y still at a[0]) */
+	ldi tmp4, 5
+1:	clc
+	ldi tmp3, 4
+2:	ld tmp1, X
+	ld tmp2, Y+
+	adc tmp1, tmp2
+	st X+, tmp1
+	dec tmp3
+	brne 2b
+	dec tmp4
+	brne 1b
+	
+	/* now length += 512 */
+	adiw r26, 1 /* we skip the least significant byte */
+	ld tmp1, X
+	ldi tmp2, hi8(512) /* 2 */
+	add tmp1, tmp2
+	st X+, tmp1
+	ldi tmp2, 6
+1:
+	ld tmp1, X
+	adc tmp1, xNULL
+	st X+, tmp1
+	dec tmp2
+	brne 1b
+	
+; EPILOG
+sha1_nextBlock_epilog:
+/* now we should clean up the stack */
+	pop r21
+	pop r20
+	in r0, SREG
+	cli ; we want to be uninterrupted while updating SP
+	out SPL, r20
+	out SPH, r21
+	out SREG, r0
+	
+	clr r1
+	pop r29
+	pop r28
+	pop r17
+	pop r16
+	pop r15
+	pop r14
+	pop r13
+	pop r12
+	pop r11
+	pop r10
+	ret
+
+sha1_nextBlock_xTable:
+.byte 20,40,60,0
+sha1_nextBlock_KTable:
+.int	0x5a827999 
+.int	0x6ed9eba1 
+.int	0x8f1bbcdc 
+.int	0xca62c1d6
+sha1_nextBlock_JumpTable:
+rjmp sha1_nextBlock_Ch
+	nop	
+rjmp sha1_nextBlock_Parity
+	nop
+rjmp sha1_nextBlock_Maj
+	nop
+rjmp sha1_nextBlock_Parity
+
+	 /* X and Y still point at a[1] aka b ; return value in tmp1 */
+sha1_nextBlock_Ch:
+	ld tmp1, Y+
+	mov tmp2, tmp1
+	com tmp2
+	ldd tmp3, Y+3	/* load from c */
+	and tmp1, tmp3
+	ldd tmp3, Y+7	/* load from d */
+	and tmp2, tmp3
+	eor tmp1, tmp2
+	ret
+	
+sha1_nextBlock_Maj:
+	ld tmp1, Y+
+	mov tmp2, tmp1
+	ldd tmp3, Y+3	/* load from c */
+	and tmp1, tmp3
+	ldd tmp4, Y+7	/* load from d */
+	and tmp2, tmp4
+	eor tmp1, tmp2
+	and tmp3, tmp4
+	eor tmp1, tmp3
+	ret
+
+sha1_nextBlock_Parity:
+	ld tmp1, Y+
+	ldd tmp2, Y+3	/* load from c */
+	eor tmp1, tmp2
+	ldd tmp2, Y+7	/* load from d */
+	eor tmp1, tmp2
+	ret
+/*	
+ch_str:			.asciz "\r\nCh"
+maj_str:		.asciz "\r\nMaj"
+parity_str:	.asciz "\r\nParity"
+*/
+;###########################################################	
+
+.global sha1_init 
+;void sha1_init(sha1_ctx_t *state){
+;	DEBUG_S("\r\nSHA1_INIT");
+;	state->h[0] = 0x67452301;
+;	state->h[1] = 0xefcdab89;
+;	state->h[2] = 0x98badcfe;
+;	state->h[3] = 0x10325476;
+;	state->h[4] = 0xc3d2e1f0;
+;	state->length = 0;
+;}
+; param1: (Func3,r24) 16-bit pointer to sha1_ctx_t struct in ram
+; modifys: Z(r30,r31), Func1, r22
+sha1_init:
+	movw r26, r24 ; (24,25) --> (26,27) load X with param1
+	ldi r30, lo8((sha1_init_vector))
+	ldi r31, hi8((sha1_init_vector))
+	ldi r22, 5*4 /* bytes to copy */
+sha1_init_vloop:	
+	lpm r23, Z+ 
+	st X+, r23
+	dec r22
+	brne sha1_init_vloop
+	ldi r22, 8
+sha1_init_lloop:
+	st X+, r1
+	dec r22
+	brne sha1_init_lloop
+	ret
+	
+sha1_init_vector:
+.int 0x67452301;
+.int 0xefcdab89;
+.int 0x98badcfe;
+.int 0x10325476;
+.int 0xc3d2e1f0;
+