clean up

[avr-crypto-lib.git] / shabea / sha256-asm.S
diff --git a/shabea/sha256-asm.S b/shabea/sha256-asm.S

deleted file mode 100644 (file)

index d9eb6b6..0000000
--- a/shabea/sha256-asm.S
+++ /dev/null
@@ -1,1042 +0,0 @@
-/* sha256-asm.S */
-/*
-    This file is part of the AVR-Crypto-Lib.
-    Copyright (C) 2008  Daniel Otte (daniel.otte@rub.de)
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-/*
- * Author:     Daniel Otte
- *
- * License: GPLv3 or later
-*/
-; sha-256 implementation in assembler  
-SHA256_BLOCK_BITS = 512
-SHA256_HASH_BITS = 256
-
-.macro precall
-       /* push r0, r1, r18 - r27 and r30 - r31; postcall pops them again in reverse order */
-       push r0
-       push r1
-       push r18
-       push r19
-       push r20
-       push r21
-       push r22
-       push r23
-       push r24
-       push r25
-       push r26
-       push r27
-       push r30
-       push r31
-       clr r1 /* r1 = 0 from here on (avr-gcc compiled code expects a zero r1) */
-.endm
-
-.macro postcall
-       pop r31 /* restore everything precall pushed, in exactly the reverse order */
-       pop r30
-       pop r27
-       pop r26
-       pop r25
-       pop r24
-       pop r23
-       pop r22
-       pop r21
-       pop r20
-       pop r19
-       pop r18
-       pop r1
-       pop r0
-.endm
-
-
-.macro hexdump length
-       push r27 /* keep X (r27:r26) across the two uart_putc calls */
-       push r26
-       ldi r25, '\r'
-       mov r24, r25 /* r24 = '\r' for uart_putc */
-       call uart_putc
-       ldi r25, '\n'
-       mov r24, r25 /* r24 = '\n' for uart_putc */
-       call uart_putc
-       pop r26
-       pop r27
-       movw r24, r26 /* r25:r24 = X; presumably the pointer argument of uart_hexdump - confirm */
-.if \length > 16
-       ldi r22, lo8(16) /* r23:r22 = 16: this call gets a chunk of 16 bytes */
-       ldi r23, hi8(16)
-       push r27
-       push r26
-       call uart_hexdump
-       pop r26
-       pop r27
-       adiw r26, 16 /* advance X by the 16 bytes just handed over, then recurse for the rest */
-       hexdump \length-16
-.else
-       ldi r22, lo8(\length) /* r23:r22 = the remaining length (16 or less) */
-       ldi r23, hi8(\length)
-       call uart_hexdump
-.endif
-.endm
-
-/* X points to Block; runs hexdump between precall and postcall so the registers saved there are restored */
-.macro dbg_hexdump length
-       precall
-       hexdump \length
-       postcall
-.endm
-
-.section .text
-
-SPL = 0x3D
-SPH = 0x3E
-SREG = 0x3F
-
-
-;
-;sha256_ctx_t is:
-;
-; [h0][h1][h2][h3][h4][h5][h6][h7][length]
-; hn is 32 bit large, length is 64 bit large
-
-;###########################################################   
-
-.global sha256_ctx2hash
-; === sha256_ctx2hash ===
-; Copies the eight 32-bit state words of a context to the destination,
-; reversing the byte order inside every word (32 bytes are written).
-;  param1: 16-bit destination pointer, given in r25,r24 (r25 is most significant)
-;  param2: 16-bit pointer to the sha256_ctx structure, given in r23,r22
-;  scratch: r0, r20, r21, X, Z
-sha256_ctx2hash:
-       movw r30, r24           ; Z = destination
-       movw r26, r22           ; X = ctx
-       adiw r26, 4             ; X = one past the last byte of the first word
-       ldi r21, 8              ; eight words to convert
-1:
-       ldi r20, 4              ; four bytes per word
-2:
-       ld r0, -X               ; read the word back to front ...
-       st Z+, r0               ; ... while writing front to back
-       dec r20
-       brne 2b
-       adiw r26, 8             ; X = one past the last byte of the next word
-       dec r21
-       brne 1b
-
-
-       ret
-
-;###########################################################   
-
-.global sha256
-; === sha256 ===
-; this function calculates SHA-256 hashes from messages in RAM
-;  param1: the 16-bit hash destination pointer
-;      given in r25,r24 (r25 is most significant)
-;  param2: the 16-bit pointer to message
-;      given in r23,r22
-;  param3: 32-bit length value (length of message in bits)
-;   given in r21,r20,r19,r18
-sha256:
-sha256_prolog:
-       push r8
-       push r9
-       push r10
-       push r11
-       push r12
-       push r13
-       push r16
-       push r17
-       in r16, SPL
-       in r17, SPH
-       subi r16, 8*4+8 /* reserve 40 bytes on the stack for a sha256_ctx */
-       sbci r17, 0
-       in r0, SREG
-       cli
-       out SPL, r16
-       out SPH, r17
-       out SREG, r0
-
-       push r25 /* keep the destination pointer for sha256_ctx2hash */
-       push r24
-       subi r16, lo8(-1) /* r17:r16 = old SP+1 = ctx; 16-bit add, because inc leaves the carry flag untouched */
-       sbci r17, hi8(-1)
-
-       movw r8, r18            /* backup of length: r11:r8 */
-       movw r10, r20
-
-       movw r12, r22   /* backup of msg-ptr: r13:r12 */
-
-       movw r24, r16
-       rcall sha256_init
-       /* while length >= 512: hash one full block */
-1:
-       tst r11
-       brne 2f /* a non-zero upper length byte means more than 512 bits remain */
-       tst r10
-       brne 2f
-       mov r19, r9
-       cpi r19, 0x02
-       brlo 4f /* fewer than 512 bits remain: go to the padding */
-2:
-       movw r24, r16
-       movw r22, r12
-       rcall sha256_nextBlock
-       ldi r19, 64 /* msg-ptr += 64 bytes; kept in r13:r12 since r23:r22 do not survive the call */
-       add r12, r19
-       adc r13, r1
-       /* length -= 512 */
-       ldi r19, 0x02
-       sub r9, r19
-       sbc r10, r1
-       sbc r11, r1
-       rjmp 1b
-
-4:
-       movw r24, r16
-       movw r22, r12
-       movw r20, r8 /* remaining length is below 512 here, so 16 bits are enough */
-       rcall sha256_lastBlock
-
-       pop r24
-       pop r25
-       movw r22, r16
-       rcall sha256_ctx2hash
-
-sha256_epilog:
-       in r30, SPL
-       in r31, SPH
-       adiw r30, 8*4+8 /* release the 40 byte ctx */
-       in r0, SREG
-       cli
-       out SPL, r30
-       out SPH, r31
-       out SREG, r0
-       pop r17
-       pop r16
-       pop r13
-       pop r12
-       pop r11
-       pop r10
-       pop r9
-       pop r8
-       ret
-
-;###########################################################   
-
-
-; block MUST NOT be larger than 64 bytes
-
-.global sha256_lastBlock
-; === sha256_lastBlock ===
-; this function does the padding and the length encoding for the final part of a message
-;  param1: the 16-bit pointer to sha256_ctx structure
-;      given in r25,r24 (r25 is most significant)
-;  param2: a 16-bit pointer to the remaining message bytes
-;      given in r23,r22
-;  param3: a 16-bit integer specifying the remaining length in bits
-;      given in r21,r20
-sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1)
-
-sha256_lastBlock:
-       cpi r21, 0x02 /* length >= 512 bits: hash one complete block first */
-       brlo sha256_lastBlock_prolog
-       push r25
-       push r24
-       push r23
-       push r22
-       push r21
-       push r20
-       rcall sha256_nextBlock
-       pop r20
-       pop r21
-       pop r22
-       pop r23
-       pop r24
-       pop r25
-       subi r21, 0x02 /* length -= 512 bits */
-       subi r22, lo8(-64) /* block pointer += 64 bytes, i.e. exactly the block just hashed */
-       sbci r23, hi8(-64)
-       rjmp sha256_lastBlock
-sha256_lastBlock_prolog:
-       /* allocate 64 bytes on the stack for the padded block */
-       in r30, SPL
-       in r31, SPH
-       in r1, SREG /* r1 is used as a temporary here */
-       subi r30, lo8(64)
-       sbci r31, hi8(64)
-       cli
-       out SPL, r30
-       out SPH, r31
-       out SREG,r1
-
-       adiw r30, 1 /* SP points to next free byte on stack, so Z = start of the buffer */
-       mov r18, r20 /* r20 = LSB(length) */
-       lsr r18
-       lsr r18
-       lsr r18
-       bst r21, 0      /* length < 512 here, so bit 8 of the bit count ... */
-       bld r18, 5  /* ... is bit 5 of the byte count; now: r18 == length/8 (aka. length in bytes) */
-
-       movw r26, r22 /* X points to begin of msg */
-       tst r18
-       breq sha256_lastBlock_post_copy
-       mov r1, r18
-sha256_lastBlock_copy_loop:
-       ld r0, X+
-       st Z+, r0
-       dec r1
-       brne sha256_lastBlock_copy_loop /* r1 is zero again when the loop ends */
-sha256_lastBlock_post_copy:
-sha256_lastBlock_insert_stuffing_bit:
-       ldi r19, 0x80
-       mov r0,r19
-       ldi r19, 0x07
-       and r19, r20 /* if we are in bitmode */
-       breq 2f /* no bitmode */
-1:
-       lsr r0 /* move the stuffing bit behind the last message bit */
-       dec r19
-       brne 1b
-       ld r19, X /* the partial last message byte */
-/* maybe we should do some ANDing here, just for safety */
-       or r0, r19
-2:
-       st Z+, r0
-       inc r18
-
-/* check whether the 8 byte length field still fits into this block */
-       cpi r18, 64-8+1
-       brsh 0f
-       rjmp sha256_lastBlock_insert_zeros
-0:
-       /* it does not fit: an additional block is needed */
-       /* first we have to fill it up with zeros */
-       ldi r19, 64
-       sub r19, r18
-       breq 2f
-1:
-       st Z+, r1
-       dec r19
-       brne 1b
-2:
-       sbiw r30, 63
-       sbiw r30,  1 /* Z back to the begin of the block */
-       movw r22, r30
-
-       push r31
-       push r30
-       push r25
-       push r24
-       push r21
-       push r20
-       rcall sha256_nextBlock
-       pop r20
-       pop r21
-       pop r24
-       pop r25
-       pop r30
-       pop r31
-
-       /* sha256_nextBlock added 512 to the length; subtract it again */
-       movw r26, r24
-       adiw r26, 4*8+1 /* we can skip the lowest byte */
-       ld r19, X
-       subi r19, hi8(512)
-       st X+, r19
-       ldi r18, 6
-1:
-       ld r19, X
-       sbci r19, 0
-       st X+, r19
-       dec r18 /* dec leaves the carry flag alone, so the borrow chain stays intact */
-       brne 1b
-
-;      clr r18 /* not necessary, r18 is already zero after the loop */
-       /* Z was restored by the pops above and points to the begin of the block */
-
-sha256_lastBlock_insert_zeros:
-       ldi r19, 64-8
-       sub r19, r18
-       breq sha256_lastBlock_insert_length
-       clr r1
-1:
-       st Z+, r1       /* r1 is still zero */
-       dec r19
-       brne 1b
-
-;      rjmp sha256_lastBlock_epilog
-sha256_lastBlock_insert_length:
-       movw r26, r24   /* X points to state */
-       adiw r26, 8*4   /* X points to (state.length) */
-       adiw r30, 8             /* Z points one after the last byte of block */
-       ld r0, X+
-       add r0, r20 /* total length = state.length + remaining bits, stored most significant byte first */
-       st -Z, r0
-       ld r0, X+
-       adc r0, r21
-       st -Z, r0
-       ldi r19, 6
-1:
-       ld r0, X+
-       adc r0, r1
-       st -Z, r0
-       dec r19
-       brne 1b
-
-       sbiw r30, 64-8 /* Z back to the begin of the block */
-       movw r22, r30
-       rcall sha256_nextBlock
-
-sha256_lastBlock_epilog:
-       in r30, SPL
-       in r31, SPH
-       in r1, SREG
-       adiw r30, 63 ; release the 64 byte buffer (adiw can add at most 63)
-       adiw r30,  1
-       cli
-       out SPL, r30
-       out SPH, r31
-       out SREG,r1
-       clr r1
-       clr r0
-       ret
-
-/**/
-;###########################################################   
-
-.global sha256_nextBlock
-; === sha256_nextBlock ===
-; this is the core function for calculating SHA-256 hashes
-;  param1: the 16-bit pointer to sha256_ctx structure
-;      given in r25,r24 (r25 is most significant)
-;  param2: an 16-bit pointer to 64 byte block to hash
-;      given in r23,r22
-sha256_nextBlock_localSpace = (64+8)*4 ; 64 32-bit values for w array and 8 32-bit values for a array (total 288 byte)
-
-Bck1 = 12
-Bck2 = 13
-Bck3 = 14
-Bck4 = 15
-Func1 = 22
-Func2 = 23
-Func3 = 24
-Func4 = 25
-Accu1 = 16
-Accu2 = 17
-Accu3 = 18
-Accu4 = 19
-XAccu1 = 8
-XAccu2 = 9
-XAccu3 = 10
-XAccu4 = 11
-T1     = 4
-T2     = 5
-T3     = 6
-T4     = 7
-LoopC = 1
-/* byteorder: high number <--> high significance */
-sha256_nextBlock:
- ; save the registers used below and reserve stack space for the w and a arrays
-       push r4 /* replace push & pop by mem ops? */
-       push r5
-       push r6
-       push r7
-       push r8
-       push r9
-       push r10
-       push r11
-       push r12
-       push r13
-       push r14
-       push r15
-       push r16
-       push r17
-       push r28
-       push r29
-       in r20, SPL
-       in r21, SPH
-       movw r18, r20                   ;backup SP
-;      movw r26, r20                   ; X points to free space on stack 
-       movw r30, r22                   ; Z points to message
-       subi r20, lo8(sha256_nextBlock_localSpace) ;sbiw can do only up to 63
-       sbci r21, hi8(sha256_nextBlock_localSpace)
-       movw r26, r20                   ; X = new SP, one byte below the reserved space
-       in r0, SREG
-       cli ; we want to be uninterrupted while updating SP
-       out SPL, r20
-       out SPH, r21
-       out SREG, r0
-       push r18 /* old SP, popped again in the epilog */
-       push r19
-       push r24
-       push r25 /* param1 will be needed later */
- ; now we fill the w array with message (each 32-bit word is byte-swapped)
-       adiw r26, 1 ; X = first byte of the w array
-       ldi r20, 16
-sha256_nextBlock_wcpyloop:
-       ld r23, Z+
-       ld r22, Z+
-       ld r19, Z+
-       ld r18, Z+
-       st X+, r18
-       st X+, r19
-       st X+, r22
-       st X+, r23
-       dec r20
-       brne sha256_nextBlock_wcpyloop
-/*     for (i=16; i<64; ++i){
-               w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16];
-       } */
-       /* r25,r24,r23,r22 (Func4..Func1) are function values, r20 is the rotate count
-          r19,r18,r17,r16 (Accu4..Accu1) are the accumulator
-          r15,r14,r13,r12 (Bck4..Bck1) are backup1
-          r11,r10,r9 ,r8  (XAccu4..XAccu1) are xor accu
-          r1 (LoopC) is round counter                                                  */
-
-       ldi r20, 64-16
-       mov LoopC, r20
-sha256_nextBlock_wcalcloop:
-       movw r30, r26 ; cp X to Z
-       sbiw r30, 63
-       sbiw r30, 1             ; subtract 64 = 16*4, Z = &w[i-16]
-       ld Accu1, Z+
-       ld Accu2, Z+
-       ld Accu3, Z+
-       ld Accu4, Z+ /* w[i] = w[i-16] */
-       ld Bck1, Z+
-       ld Bck2, Z+
-       ld Bck3, Z+
-       ld Bck4, Z+ /* backup = w[i-15]; Z = &w[i-14] from here on */
-       /* now sigma 0 */
-       mov Func1, Bck2
-       mov Func2, Bck3
-       mov Func3, Bck4
-       mov Func4, Bck1  /* prerotated by 8 */
-       ldi r20, 1
-       rcall bitrotl
-       movw XAccu1, Func1
-       movw XAccu3, Func3       /* store ROTR(w[i-15],7) in xor accu */
-       movw Func1, Bck3
-       movw Func3, Bck1 /* prerotated by 16 */
-       ldi r20, 2
-       rcall bitrotr
-       eor XAccu1, Func1  /* xor ROTR(w[i-15], 18)*/
-       eor XAccu2, Func2
-       eor XAccu3, Func3
-       eor XAccu4, Func4
-       ldi Func2, 3             /* now shr3 */ /*we can destroy backup now*/
-sigma0_shr:
-       lsr Bck4
-       ror Bck3
-       ror Bck2
-       ror Bck1
-       dec Func2
-       brne sigma0_shr
-       eor XAccu1, Bck1
-       eor XAccu2, Bck2
-       eor XAccu3, Bck3
-       eor XAccu4, Bck4        /* xor SHR(w[i-15], 3)*/ /* xor accu == sigma0(w[i-15]) */
-       add Accu1, XAccu1
-       adc Accu2, XAccu2
-       adc Accu3, XAccu3
-       adc Accu4, XAccu4 /* finished with sigma0 */
-       ldd Func1, Z+7*4  /* now accu += w[i-7] */
-       ldd Func2, Z+7*4+1
-       ldd Func3, Z+7*4+2
-       ldd Func4, Z+7*4+3
-       add Accu1, Func1
-       adc Accu2, Func2
-       adc Accu3, Func3
-       adc Accu4, Func4
-       ldd Bck1, Z+12*4 /* now backup = w[i-2]*/
-       ldd Bck2, Z+12*4+1
-       ldd Bck3, Z+12*4+2
-       ldd Bck4, Z+12*4+3
-       /* now sigma 1 */
-       movw Func1, Bck3
-       movw Func3, Bck1 /* prerotated by 16 */
-       ldi r20, 1
-       rcall bitrotr
-       movw XAccu3, Func3
-       movw XAccu1, Func1       /* store ROTR(w[i-2], 17) in xor accu */
-;      movw Func1, Bck3
-;      movw Func3, Bck1 /* prerotated by 16 */
-       ldi r20, 2 /* Func still holds ROTR 17, two more bits give ROTR 19 */
-       rcall bitrotr
-       eor XAccu1, Func1  /* xor ROTR(w[i-2], 19)*/
-       eor XAccu2, Func2
-       eor XAccu3, Func3
-       eor XAccu4, Func4
-       ldi Func2, 2     /* now shr10: shift the upper three bytes by 2 and use them one byte lower */ /*we can destroy backup now*/
-sigma1_shr:
-       lsr Bck4
-       ror Bck3
-       ror Bck2
-       dec Func2
-       brne sigma1_shr
-       eor XAccu1, Bck2
-       eor XAccu2, Bck3
-       eor XAccu3, Bck4  /* xor SHR(w[i-2], 10)*/ /* xor accu == sigma1(w[i-2]) */
-       add Accu1, XAccu1
-       adc Accu2, XAccu2
-       adc Accu3, XAccu3
-       adc Accu4, XAccu4 /* finished with sigma1 */
-       /* now store the result as w[i] */
-       st X+, Accu1
-       st X+, Accu2
-       st X+, Accu3
-       st X+, Accu4
-       dec LoopC
-       breq 3f  ; skip if zero
-       rjmp sha256_nextBlock_wcalcloop
-3:
-       /* we are finished with w array X points one byte post w */
-/* init a array */
-       pop r31 /* Z = param1 (ctx); pushed back at once because it is needed again in update_state */
-       pop r30
-       push r30
-       push r31
-       ldi r25, 8*4 /* 8 32-bit values to copy from ctx to a array */
-init_a_array:
-       ld r1, Z+
-       st X+, r1
-       dec r25
-       brne init_a_array
-
-/* now the real fun begins */
-/* for (i=0; i<64; ++i){
-                       t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i];
-                       t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]);
-                       memmove(&(a[1]), &(a[0]), 7*4);         // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; 
-                       a[4] += t1;
-                       a[0] = t1 + t2;
-               } */
-       /* Y points to a[0], Z (because lpm wants it) points to k[i], X points to w[i] */
-       sbiw r26, 8*4  /* X pointed one past a[7]; now X = &a[0] */
-       movw r28, r26
-       ldi r30, lo8(sha256_kv)
-       ldi r31, hi8(sha256_kv)
-       dec r27  /* X - (64*4 == 256), X = &w[0] */
-       ldi r25, 64
-       mov LoopC, r25
-sha256_main_loop:
-       /* now calculate t1 */
-        /*CH(x,y,z) = (x&y)^((~x)&z)*/
-       ldd T1, Y+5*4
-       ldd T2, Y+5*4+1
-       ldd T3, Y+5*4+2
-       ldd T4, Y+5*4+3 /* y in T */
-       ldd Func1, Y+4*4
-       ldd Func2, Y+4*4+1
-       ldd Func3, Y+4*4+2
-       ldd Func4, Y+4*4+3  /* x in Func */
-       ldd Bck1, Y+6*4
-       ldd Bck2, Y+6*4+1
-       ldd Bck3, Y+6*4+2
-       ldd Bck4, Y+6*4+3 /* z in Bck */
-       and T1, Func1
-       and T2, Func2
-       and T3, Func3
-       and T4, Func4
-       com Func1
-       com Func2
-       com Func3
-       com Func4
-       and Bck1, Func1
-       and Bck2, Func2
-       and Bck3, Func3
-       and Bck4, Func4
-       eor T1, Bck1
-       eor T2, Bck2
-       eor T3, Bck3
-       eor T4, Bck4 /* done, CH(x,y,z) is in T */
-       /* now SIGMA1(a[4]) */
-       ldd Bck4, Y+4*4         /* think about using it from Func reg above*/
-       ldd Bck1, Y+4*4+1
-       ldd Bck2, Y+4*4+2
-       ldd Bck3, Y+4*4+3 /* load prerotate by 8-bit */
-       movw Func1, Bck1
-       movw Func3, Bck3
-       ldi r20, 2
-       rcall bitrotl           /* rotr(x,6) */
-       movw XAccu1, Func1
-       movw XAccu3, Func3
-       movw Func1, Bck1
-       movw Func3, Bck3
-       ldi r20, 3
-       rcall bitrotr   /* rotr(x,11) */
-       eor XAccu1, Func1
-       eor XAccu2, Func2
-       eor XAccu3, Func3
-       eor XAccu4, Func4
-       movw Func1, Bck3 /* this prerotates further 16 bits*/
-       movw Func3, Bck1 /* so we have now prerotated by 24 bits*/
-       ldi r20, 1
-       rcall bitrotr   /* rotr(x,25) */
-       eor XAccu1, Func1
-       eor XAccu2, Func2
-       eor XAccu3, Func3
-       eor XAccu4, Func4 /* finished with SIGMA1, add it to T */
-       add T1, XAccu1
-       adc T2, XAccu2
-       adc T3, XAccu3
-       adc T4, XAccu4
-       /* now we have to add a[7], w[i] and k[i] */
-       ldd XAccu1, Y+4*7
-       ldd XAccu2, Y+4*7+1
-       ldd XAccu3, Y+4*7+2
-       ldd XAccu4, Y+4*7+3
-       add T1, XAccu1
-       adc T2, XAccu2
-       adc T3, XAccu3
-       adc T4, XAccu4 /* add a[7] */
-       ld XAccu1, X+
-       ld XAccu2, X+
-       ld XAccu3, X+
-       ld XAccu4, X+
-       add T1, XAccu1
-       adc T2, XAccu2
-       adc T3, XAccu3
-       adc T4, XAccu4 /* add w[i] */
-       lpm XAccu1, Z+
-       lpm XAccu2, Z+
-       lpm XAccu3, Z+
-       lpm XAccu4, Z+
-       add T1, XAccu1
-       adc T2, XAccu2
-       adc T3, XAccu3
-       adc T4, XAccu4 /* add k[i] */ /* finished with t1 */
-       /*now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */
-               /* starting with MAJ(x,y,z) */
-       ldd Func1, Y+4*0+0
-       ldd Func2, Y+4*0+1
-       ldd Func3, Y+4*0+2
-       ldd Func4, Y+4*0+3 /* load x=a[0] */
-       ldd XAccu1, Y+4*1+0
-       ldd XAccu2, Y+4*1+1
-       ldd XAccu3, Y+4*1+2
-       ldd XAccu4, Y+4*1+3 /* load y=a[1] */
-       and XAccu1, Func1
-       and XAccu2, Func2
-       and XAccu3, Func3
-       and XAccu4, Func4       /* XAccu == (x & y) */
-       ldd Bck1, Y+4*2+0
-       ldd Bck2, Y+4*2+1
-       ldd Bck3, Y+4*2+2
-       ldd Bck4, Y+4*2+3 /* load z=a[2] */
-       and Func1, Bck1
-       and Func2, Bck2
-       and Func3, Bck3
-       and Func4, Bck4
-       eor XAccu1, Func1
-       eor XAccu2, Func2
-       eor XAccu3, Func3
-       eor XAccu4, Func4       /* XAccu == (x & y) ^ (x & z) */
-       ldd Func1, Y+4*1+0
-       ldd Func2, Y+4*1+1
-       ldd Func3, Y+4*1+2
-       ldd Func4, Y+4*1+3 /* load y=a[1] */
-       and Func1, Bck1
-       and Func2, Bck2
-       and Func3, Bck3
-       and Func4, Bck4
-       eor XAccu1, Func1
-       eor XAccu2, Func2
-       eor XAccu3, Func3
-       eor XAccu4, Func4       /* XAccu == Maj(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */
-       /* SIGMA0(a[0]) */
-       ldd Bck1, Y+4*0+0 /* we should combine this with above */
-       ldd Bck2, Y+4*0+1
-       ldd Bck3, Y+4*0+2
-       ldd Bck4, Y+4*0+3
-       movw Func1, Bck1
-       movw Func3, Bck3
-       ldi r20, 2
-       rcall bitrotr
-       movw Accu1, Func1
-       movw Accu3, Func3 /* Accu = rotr(a[0], 2) */
-       movw Func1, Bck3
-       movw Func3, Bck1 /* prerotate by 16 bits */
-       ldi r20, 3
-       rcall bitrotl
-       eor Accu1, Func1
-       eor Accu2, Func2
-       eor Accu3, Func3
-       eor Accu4, Func4 /* Accu ^= rotr(a[0], 13) */
-       mov Func1, Bck4
-       mov Func2, Bck1
-       mov Func3, Bck2
-       mov Func4, Bck3  /* prerotate by 24 bits */
-       ldi r20, 2
-       rcall bitrotl
-       eor Accu1, Func1
-       eor Accu2, Func2
-       eor Accu3, Func3
-       eor Accu4, Func4 /* Accu ^= rotr(a[0], 22) */
-       add Accu1, XAccu1 /* add previous result (MAJ)*/
-       adc Accu2, XAccu2
-       adc Accu3, XAccu3
-       adc Accu4, XAccu4
-       /* now we are finished with the computing stuff (t1 in T, t2 in Accu)*/
-       /* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */
-
-       ldi r21, 7*4
-       adiw r28, 7*4
-a_shift_loop:
-       ld  r25, -Y /* warning: this is PREdecrement */
-       std Y+4, r25 /* move the byte one word (4 bytes) up */
-       dec r21
-       brne a_shift_loop /* Y is back at a[0] after 28 bytes */
-
-       ldd Bck1, Y+4*4+0
-       ldd Bck2, Y+4*4+1
-       ldd Bck3, Y+4*4+2
-       ldd Bck4, Y+4*4+3
-       add Bck1, T1
-       adc Bck2, T2
-       adc Bck3, T3
-       adc Bck4, T4
-       std Y+4*4+0, Bck1 /* a[4] += t1 */
-       std Y+4*4+1, Bck2
-       std Y+4*4+2, Bck3
-       std Y+4*4+3, Bck4
-       add Accu1, T1
-       adc Accu2, T2
-       adc Accu3, T3
-       adc Accu4, T4
-       std Y+4*0+0, Accu1 /* a[0] = t1 + t2 */
-       std Y+4*0+1, Accu2
-       std Y+4*0+2, Accu3
-       std Y+4*0+3, Accu4 /* a array updated */
-
-
-       dec LoopC
-       breq update_state
-       rjmp sha256_main_loop ;brne sha256_main_loop
-update_state:
-       /* update state: h[n] += a[n] for the eight state words */
-       /* the pointer to the state (param1) is still on the stack */
-       pop r31
-       pop r30
-       ldi r21, 8
-update_state_loop:
-       ldd Accu1, Z+0
-       ldd Accu2, Z+1
-       ldd Accu3, Z+2
-       ldd Accu4, Z+3
-       ld Func1, Y+
-       ld Func2, Y+
-       ld Func3, Y+
-       ld Func4, Y+
-       add Accu1, Func1
-       adc Accu2, Func2
-       adc Accu3, Func3
-       adc Accu4, Func4
-       st Z+, Accu1
-       st Z+, Accu2
-       st Z+, Accu3
-       st Z+, Accu4
-       dec r21
-       brne update_state_loop
-       /* now we just have to update the length */
-       adiw r30, 1 /* since we add 512, we can simply skip the LSB */
-       ldi r21, 2
-       ldi r22, 6
-       ld r20, Z
-       add r20, r21
-       st Z+, r20
-       clr r21 /* clr leaves the carry of the add above intact */
-sha256_nextBlock_fix_length:
-       brcc sha256_nextBlock_epilog /* stop as soon as no carry is left to propagate */
-       ld r20, Z
-       adc r20, r21
-       st Z+, r20
-       dec r22
-       brne sha256_nextBlock_fix_length
-
-; EPILOG
-sha256_nextBlock_epilog:
-/* now we should clean up the stack */
-
-       pop r21 /* r21:r20 = the SP value saved in the prolog */
-       pop r20
-       in r0, SREG
-       cli ; we want to be uninterrupted while updating SP
-       out SPL, r20
-       out SPH, r21
-       out SREG, r0
-
-       clr r1 /* r1 (LoopC) is zero again on return */
-       pop r29
-       pop r28
-       pop r17
-       pop r16
-       pop r15
-       pop r14
-       pop r13
-       pop r12
-       pop r11
-       pop r10
-       pop r9
-       pop r8
-       pop r7
-       pop r6
-       pop r5
-       pop r4
-       ret
-
-sha256_kv: ; 64 round constants k[0..63] in ProgMem, read with lpm; every 32-bit value is two words, low half first
-.word  0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c
-.word  0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b
-.word  0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9
-.word  0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429
-.word  0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272
-.word  0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a
-.word  0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e
-.word  0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671
-
-       
-;###########################################################   
-
-.global sha256_init 
-;uint32_t sha256_init_vector[]={
-;      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
-;      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };
-;
-;void sha256_init(sha256_ctx_t *state){
-;      state->length=0;
-;      memcpy(state->h, sha256_init_vector, 8*4);
-;}
-; param1: (r25,r24) 16-bit pointer to sha256_ctx_t struct in ram
-; modifies: X(r26,r27), Z(r30,r31), r22, r23
-sha256_init:
-       ldi r22, 32+8                       ; bytes to copy: 8 state words plus the 64-bit length
-       ldi r30, lo8((sha256_init_vector))  ; Z = flash address of the initial context image
-       ldi r31, hi8((sha256_init_vector))
-       movw r26, r24                       ; X = ctx (param1)
-sha256_init_vloop:
-       lpm r23, Z+                         ; fetch one byte from flash ...
-       st X+, r23                          ; ... and store it into the context
-       dec r22
-       brne sha256_init_vloop
-       ret
-       
-sha256_init_vector: ; 40 byte image copied by sha256_init: h0..h7 (low word first), then a zero length
-.word 0xE667, 0x6A09
-.word 0xAE85, 0xBB67 
-.word 0xF372, 0x3C6E 
-.word 0xF53A, 0xA54F 
-.word 0x527F, 0x510E 
-.word 0x688C, 0x9B05 
-.word 0xD9AB, 0x1F83 
-.word 0xCD19, 0x5BE0
-.word 0x0000, 0x0000
-.word 0x0000, 0x0000
-
-;###########################################################   
-
-.global rotl32
-; === ROTL32 ===
-; rotates a 32-bit word to the left
-;  param1: the 32-bit word to rotate
-;      given in r25,r24,r23,r22 (r25 is most significant)
-;  param2: an 8-bit value telling how many bits to rotate
-;      given in r20
-; result in r25,r24,r23,r22; modifies: r20 (zero on return), r21
-rotl32:
-       cpi r20, 8
-       brlo bitrotl            ; fewer than 8 bits left: finish bit by bit
-       subi r20, 8             ; otherwise rotate by one whole byte
-       mov r21, r25
-       mov r25, r24
-       mov r24, r23
-       mov r23, r22
-       mov r22, r21
-       rjmp rotl32
-bitrotl:
-       clr r21                 ; r21 collects the bits shifted out at the top
-       clc
-       tst r20
-       breq fixrotl            ; nothing to do for a count of zero
-bitrotl_loop:
-       rol r22
-       rol r23
-       rol r24
-       rol r25
-       rol r21
-       dec r20                 ; dec keeps the carry flag for the next rol r22
-       brne bitrotl_loop
-fixrotl:
-       or r22, r21             ; wrap the collected bits around into the low byte
-       ret
-       
-
-;###########################################################   
-
-.global rotr32
-; === ROTR32 ===
-; rotates a 32-bit word to the right
-;  param1: the 32-bit word to rotate
-;      given in r25,r24,r23,r22 (r25 is most significant)
-;  param2: an 8-bit value telling how many bits to rotate
-;      given in r20
-; result in r25,r24,r23,r22; modifies: r20 (zero on return), r21
-rotr32:
-       cpi r20, 8
-       brlo bitrotr            ; fewer than 8 bits left: finish bit by bit
-       subi r20, 8             ; otherwise rotate by one whole byte
-       mov r21, r22
-       mov r22, r23
-       mov r23, r24
-       mov r24, r25
-       mov r25, r21
-       rjmp rotr32
-bitrotr:
-       clr r21                 ; r21 collects the bits shifted out at the bottom
-       clc
-       tst r20
-       breq fixrotr            ; nothing to do for a count of zero
-bitrotr_loop:
-       ror r25
-       ror r24
-       ror r23
-       ror r22
-       ror r21
-       dec r20                 ; dec keeps the carry flag for the next ror r25
-       brne bitrotr_loop
-fixrotr:
-       or r25, r21             ; wrap the collected bits around into the high byte
-       ret
-       
-       
-;###########################################################   
-       
-.global change_endian32
-; === change_endian32 ===
-; reverses the byte order of a 32-bit word
-;  param1: the 32-bit word
-;      given in r25,r24,r23,r22 (r25 is most significant)
-;  result in r25,r24,r23,r22; modifies: r20, r21
-change_endian32:
-       movw r20,  r22 ; keep the two low bytes: (r22,r23) --> (r20,r21)
-       mov r23, r24   ; old byte 2 becomes byte 1
-       mov r22, r25   ; old byte 3 becomes byte 0
-       mov r25, r20   ; old byte 0 becomes byte 3
-       mov r24, r21   ; old byte 1 becomes byte 2
-       ret
-