diff --git a/shabea/sha256-asm.S b/shabea/sha256-asm.S
new file mode 100644
index 0000000..d9eb6b6
--- /dev/null
+++ b/shabea/sha256-asm.S
@@ -0,0 +1,1042 @@
+/* sha256-asm.S */
+/*
+    This file is part of the AVR-Crypto-Lib.
+    Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de)
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+ * Author: Daniel Otte
+ *
+ * License: GPLv3 or later
+*/
+; SHA-256 implementation in AVR assembler
+SHA256_BLOCK_BITS = 512
+SHA256_HASH_BITS = 256
+
+.macro precall
+	/* push r0 - r1, r18 - r27, r30 - r31 */
+	push r0
+	push r1
+	push r18
+	push r19
+	push r20
+	push r21
+	push r22
+	push r23
+	push r24
+	push r25
+	push r26
+	push r27
+	push r30
+	push r31
+	clr r1
+.endm
+
+.macro postcall
+	pop r31
+	pop r30
+	pop r27
+	pop r26
+	pop r25
+	pop r24
+	pop r23
+	pop r22
+	pop r21
+	pop r20
+	pop r19
+	pop r18
+	pop r1
+	pop r0
+.endm
+
+
+.macro hexdump length
+	push r27
+	push r26
+	ldi r25, '\r'
+	mov r24, r25
+	call uart_putc
+	ldi r25, '\n'
+	mov r24, r25
+	call uart_putc
+	pop r26
+	pop r27
+	movw r24, r26
+.if \length > 16
+	ldi r22, lo8(16)
+	ldi r23, hi8(16)
+	push r27
+	push r26
+	call uart_hexdump
+	pop r26
+	pop r27
+	adiw r26, 16
+	hexdump \length-16
+.else
+	ldi r22, lo8(\length)
+	ldi r23, hi8(\length)
+	call uart_hexdump
+.endif
+.endm
+
+/* X points to the block to dump */
+.macro dbg_hexdump length
+	precall
+	hexdump \length
+	postcall
+.endm
+
+.section .text
+
+SPL = 0x3D
+SPH = 0x3E
+SREG = 0x3F
+
+
+;
+;sha256_ctx_t is:
+;
+; [h0][h1][h2][h3][h4][h5][h6][h7][length]
+; hn is 32 bit large, length is 64 bit large
+
+;###########################################################
+
+.global sha256_ctx2hash
+; === sha256_ctx2hash ===
+; this function converts the state into the final hash value
+; (a big-endian byte string)
+; param1: the 16-bit destination pointer
+;	given in r25,r24 (r25 is most significant)
+; param2: the 16-bit pointer to the sha256_ctx structure
+;	given in r23,r22
+sha256_ctx2hash:
+	movw r26, r22
+	movw r30, r24
+	ldi r21, 8
+	sbiw r26, 4
+1:
+	ldi r20, 4
+	adiw r26, 8
+2:
+	ld r0, -X	; read each state word from its most significant byte down
+	st Z+, r0
+	dec r20
+	brne 2b
+
+	dec r21
+	brne 1b
+
+	ret
+
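+; For reference, the same conversion in C (a sketch only; the struct
+; follows the layout described in the sha256_ctx_t comment above):
+/*
+	#include <stdint.h>
+	typedef struct { uint32_t h[8]; uint64_t length; } sha256_ctx_t;
+
+	void sha256_ctx2hash(void *dest, const sha256_ctx_t *state){
+		uint8_t *d = (uint8_t*)dest;
+		uint8_t i, j;
+		for (i = 0; i < 8; ++i)         // for each state word ...
+			for (j = 0; j < 4; ++j) // ... emit its bytes MSB first
+				*d++ = (uint8_t)(state->h[i] >> (24 - 8*j));
+	}
+*/
+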
+;###########################################################
+
+.global sha256
+; === sha256 ===
+; this function calculates SHA-256 hashes from messages in RAM
+; param1: the 16-bit hash destination pointer
+;	given in r25,r24 (r25 is most significant)
+; param2: the 16-bit pointer to the message
+;	given in r23,r22
+; param3: 32-bit length value (length of the message in bits)
+;	given in r21,r20,r19,r18
+sha256:
+sha256_prolog:
+	push r8
+	push r9
+	push r10
+	push r11
+	push r12
+	push r13
+	push r16
+	push r17
+	in r16, SPL
+	in r17, SPH
+	subi r16, 8*4+8
+	sbci r17, 0
+	in r0, SREG
+	cli
+	out SPL, r16
+	out SPH, r17
+	out SREG, r0
+
+	push r25
+	push r24
+	inc r16
+	adc r17, r1 /* r17:r16 now points to the ctx buffer on the stack */
+
+	movw r8, r18	/* backup of length */
+	movw r10, r20
+
+	movw r12, r22	/* backup of msg-ptr */
+
+	movw r24, r16
+	rcall sha256_init
+	/* while length >= 512 */
+1:
+	tst r11
+	brne 4f
+	tst r10
+	brne 4f
+	mov r19, r9
+	cpi r19, 0x02
+	brlo 4f
+
+	movw r24, r16
+	movw r22, r12
+	rcall sha256_nextBlock
+	ldi r19, 64	/* advance the message pointer by one 64-byte block */
+	add r12, r19
+	adc r13, r1
+	/* length -= 512 */
+	ldi r19, 0x02
+	sub r9, r19
+	sbc r10, r1
+	sbc r11, r1
+	rjmp 1b
+
+4:
+	movw r24, r16
+	movw r22, r12
+	movw r20, r8
+	rcall sha256_lastBlock
+
+	pop r24
+	pop r25
+	movw r22, r16
+	rcall sha256_ctx2hash
+
+sha256_epilog:
+	in r30, SPL
+	in r31, SPH
+	adiw r30, 8*4+8
+	in r0, SREG
+	cli
+	out SPL, r30
+	out SPH, r31
+	out SREG, r0
+	pop r17
+	pop r16
+	pop r13
+	pop r12
+	pop r11
+	pop r10
+	pop r9
+	pop r8
+	ret
+
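+; The control flow above, written out in C (a sketch; it mirrors the
+; register usage of the assembler version):
+/*
+	void sha256(void *dest, const void *msg, uint32_t length_b){
+		sha256_ctx_t ctx;                       // lives on the stack, like above
+		sha256_init(&ctx);
+		while (length_b >= 512){                // hash all full 512-bit blocks
+			sha256_nextBlock(&ctx, msg);
+			msg = (const uint8_t*)msg + 64; // advance by one block
+			length_b -= 512;
+		}
+		sha256_lastBlock(&ctx, msg, (uint16_t)length_b); // pad the rest
+		sha256_ctx2hash(dest, &ctx);
+	}
+*/
+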
+;###########################################################
+
+
+; the final part of the message may be up to 64 bytes;
+; longer input is reduced by hashing full blocks first
+
+.global sha256_lastBlock
+; === sha256_lastBlock ===
+; this function does the padding for calculating SHA-256 hashes
+; param1: the 16-bit pointer to the sha256_ctx structure
+;	given in r25,r24 (r25 is most significant)
+; param2: a 16-bit pointer to the block to hash
+;	given in r23,r22
+; param3: a 16-bit integer specifying the length of the block in bits
+;	given in r21,r20
+sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1)
+
+
+sha256_lastBlock:
+	cpi r21, 0x02
+	brlo sha256_lastBlock_prolog
+	push r25
+	push r24
+	push r23
+	push r22
+	push r21
+	push r20
+	rcall sha256_nextBlock
+	pop r20
+	pop r21
+	pop r22
+	pop r23
+	pop r24
+	pop r25
+	subi r21, 0x02		/* length -= 512 bits */
+	subi r22, lo8(-64)	/* advance the block pointer by 64 bytes */
+	sbci r23, hi8(-64)
+	rjmp sha256_lastBlock
+sha256_lastBlock_prolog:
+	/* allocate space on stack */
+	in r30, SPL
+	in r31, SPH
+	in r1, SREG
+	subi r30, lo8(64)
+	sbci r31, hi8(64)
+	cli
+	out SPL, r30
+	out SPH, r31
+	out SREG, r1
+
+	adiw r30, 1 /* SP points to the next free byte on stack */
+	mov r18, r20 /* r20 = LSB(length) */
+	lsr r18
+	lsr r18
+	lsr r18
+	bst r21, 0 /* bit 8 of the 16-bit bit-count ... */
+	bld r18, 5 /* ... is bit 5 of the byte count; now: r18 == length/8 (the length in bytes) */
+
+
+	movw r26, r22 /* X points to the begin of the msg */
+	tst r18
+	breq sha256_lastBlock_post_copy
+	mov r1, r18
+sha256_lastBlock_copy_loop:
+	ld r0, X+
+	st Z+, r0
+	dec r1
+	brne sha256_lastBlock_copy_loop
+sha256_lastBlock_post_copy:
+sha256_lastBlock_insert_stuffing_bit:
+	ldi r19, 0x80
+	mov r0, r19
+	ldi r19, 0x07
+	and r19, r20 /* are there single bits left over? */
+	breq 2f /* no bitmode */
+1:
+	lsr r0
+	dec r19
+	brne 1b
+	ld r19, X /* strictly, the bits below the stuffing bit should be masked off here */
+	or r0, r19
+2:
+	st Z+, r0
+	inc r18
+
+/* does the 64-bit length still fit into this block? */
+	cpi r18, 64-8+1
+	brsh 0f
+	rjmp sha256_lastBlock_insert_zeros
+0:
+	/* it does not; fill this block up with zeros and process it */
+	ldi r19, 64
+	sub r19, r18
+	breq 2f
+1:
+	st Z+, r1 /* r1 is zero after the copy loop */
+	dec r19
+	brne 1b
+2:
+	sbiw r30, 63
+	sbiw r30, 1 /* Z points to the begin of the block again */
+	movw r22, r30
+
+	push r31
+	push r30
+	push r25
+	push r24
+	push r21
+	push r20
+	rcall sha256_nextBlock
+	pop r20
+	pop r21
+	pop r24
+	pop r25
+	pop r30
+	pop r31
+
+	/* subtract 512 from the length again: the padding bits must not be counted */
+	movw r26, r24
+	adiw r26, 4*8+1 /* we can skip the lowest byte */
+	ld r19, X
+	subi r19, hi8(512)
+	st X+, r19
+	ldi r18, 6
+1:
+	ld r19, X
+	sbci r19, 0
+	st X+, r19
+	dec r18
+	brne 1b
+
+	clr r18 /* a fresh block follows: zeros plus the length field only */
+	/* Z already points to the begin of the block (restored by the pops above) */
+
+sha256_lastBlock_insert_zeros:
+	ldi r19, 64-8
+	sub r19, r18
+	breq sha256_lastBlock_insert_length
+	clr r1
+1:
+	st Z+, r1 /* r1 is zero */
+	dec r19
+	brne 1b
+
+;	rjmp sha256_lastBlock_epilog
+sha256_lastBlock_insert_length:
+	/* the total length in bits is stored big-endian at the end of the block */
+	movw r26, r24 /* X points to the state */
+	adiw r26, 8*4 /* X points to state.length */
+	adiw r30, 8 /* Z points one after the last byte of the block */
+	ld r0, X+
+	add r0, r20
+	st -Z, r0
+	ld r0, X+
+	adc r0, r21
+	st -Z, r0
+	ldi r19, 6
+1:
+	ld r0, X+
+	adc r0, r1
+	st -Z, r0
+	dec r19
+	brne 1b
+
+	sbiw r30, 64-8
+	movw r22, r30
+	rcall sha256_nextBlock
+
+sha256_lastBlock_epilog:
+	in r30, SPL
+	in r31, SPH
+	in r1, SREG
+	adiw r30, 63	; add 64 in two steps,
+	adiw r30, 1	; adiw can add at most 63
+	cli
+	out SPL, r30
+	out SPH, r31
+	out SREG, r1
+	clr r1
+	clr r0
+	ret
+
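+; The padding scheme implemented above, as a C sketch (it assumes that
+; state->length already counts the bits of all previously hashed blocks):
+/*
+	#include <string.h>
+	void sha256_lastBlock(sha256_ctx_t *state, const void *block, uint16_t length_b){
+		uint8_t lb[64];                             // local block buffer
+		while (length_b >= 512){                    // reduce oversized input
+			sha256_nextBlock(state, block);
+			block = (const uint8_t*)block + 64;
+			length_b -= 512;
+		}
+		memset(lb, 0, 64);
+		memcpy(lb, block, (length_b + 7) / 8);
+		lb[length_b / 8] |= 0x80 >> (length_b & 7); // the single stuffing bit
+		if (length_b / 8 + 1 > 64 - 8){             // no room for the length
+			sha256_nextBlock(state, lb);
+			state->length -= 512;               // padding must not count
+			memset(lb, 0, 64);
+		}
+		uint64_t len = state->length + length_b;    // total length in bits
+		for (uint8_t i = 0; i < 8; ++i)
+			lb[63 - i] = (uint8_t)(len >> (8 * i)); // stored big-endian
+		sha256_nextBlock(state, lb);
+	}
+*/
+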
+;###########################################################
+
+.global sha256_nextBlock
+; === sha256_nextBlock ===
+; this is the core function for calculating SHA-256 hashes
+; param1: the 16-bit pointer to the sha256_ctx structure
+;	given in r25,r24 (r25 is most significant)
+; param2: a 16-bit pointer to the 64-byte block to hash
+;	given in r23,r22
+sha256_nextBlock_localSpace = (64+8)*4 ; 64 32-bit values for the w array and 8 32-bit values for the a array (288 bytes total)
+
+Bck1 = 12
+Bck2 = 13
+Bck3 = 14
+Bck4 = 15
+Func1 = 22
+Func2 = 23
+Func3 = 24
+Func4 = 25
+Accu1 = 16
+Accu2 = 17
+Accu3 = 18
+Accu4 = 19
+XAccu1 = 8
+XAccu2 = 9
+XAccu3 = 10
+XAccu4 = 11
+T1 = 4
+T2 = 5
+T3 = 6
+T4 = 7
+LoopC = 1
+/* byteorder: high number <--> high significance */
+sha256_nextBlock:
+	; first, make room for the local variables
+	push r4 /* these pushes & pops could be replaced by memory operations */
+	push r5
+	push r6
+	push r7
+	push r8
+	push r9
+	push r10
+	push r11
+	push r12
+	push r13
+	push r14
+	push r15
+	push r16
+	push r17
+	push r28
+	push r29
+	in r20, SPL
+	in r21, SPH
+	movw r18, r20	; backup SP
+;	movw r26, r20	; X points to free space on stack
+	movw r30, r22	; Z points to the message
+	subi r20, lo8(sha256_nextBlock_localSpace) ; sbiw can handle at most 63
+	sbci r21, hi8(sha256_nextBlock_localSpace)
+	movw r26, r20 ; X points to free space on stack
+	in r0, SREG
+	cli ; we want to be uninterrupted while updating SP
+	out SPL, r20
+	out SPH, r21
+	out SREG, r0
+	push r18
+	push r19
+	push r24
+	push r25 /* param1 will be needed later */
+	; now fill the w array with the message
+	; (the message is big-endian, w is kept little-endian)
+	adiw r26, 1 ; X++
+	ldi r20, 16
+sha256_nextBlock_wcpyloop:
+	ld r23, Z+
+	ld r22, Z+
+	ld r19, Z+
+	ld r18, Z+
+	st X+, r18
+	st X+, r19
+	st X+, r22
+	st X+, r23
+	dec r20
+	brne sha256_nextBlock_wcpyloop
+/* for (i=16; i<64; ++i){
+	w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16];
+	} */
+	/* r25,r24,r23,r22 (r21,r20) are function values
+	   r19,r18,r17,r16 are the accumulator
+	   r15,r14,r13,r12 are backup1
+	   r11,r10,r9 ,r8  are the xor accu
+	   r1 is the round counter */
+
+	ldi r20, 64-16
+	mov LoopC, r20
+sha256_nextBlock_wcalcloop:
+	movw r30, r26 ; copy X to Z
+	sbiw r30, 63
+	sbiw r30, 1	; subtract 64 = 16*4
+	ld Accu1, Z+
+	ld Accu2, Z+
+	ld Accu3, Z+
+	ld Accu4, Z+ /* w[i] = w[i-16] */
+	ld Bck1, Z+
+	ld Bck2, Z+
+	ld Bck3, Z+
+	ld Bck4, Z+ /* backup = w[i-15] */
+	/* now sigma 0 */
+	mov Func1, Bck2
+	mov Func2, Bck3
+	mov Func3, Bck4
+	mov Func4, Bck1 /* prerotated by 8 */
+	ldi r20, 1
+	rcall bitrotl
+	movw XAccu1, Func1
+	movw XAccu3, Func3 /* store ROTR(w[i-15], 7) in the xor accu */
+	movw Func1, Bck3
+	movw Func3, Bck1 /* prerotated by 16 */
+	ldi r20, 2
+	rcall bitrotr
+	eor XAccu1, Func1 /* xor ROTR(w[i-15], 18) */
+	eor XAccu2, Func2
+	eor XAccu3, Func3
+	eor XAccu4, Func4
+	ldi Func2, 3 /* now shr3 */ /* we can destroy the backup now */
+sigma0_shr:
+	lsr Bck4
+	ror Bck3
+	ror Bck2
+	ror Bck1
+	dec Func2
+	brne sigma0_shr
+	eor XAccu1, Bck1
+	eor XAccu2, Bck2
+	eor XAccu3, Bck3
+	eor XAccu4, Bck4 /* xor SHR(w[i-15], 3) */ /* xor accu == sigma0(w[i-15]) */
+	add Accu1, XAccu1
+	adc Accu2, XAccu2
+	adc Accu3, XAccu3
+	adc Accu4, XAccu4 /* finished with sigma0 */
+	ldd Func1, Z+7*4 /* now accu += w[i-7] */
+	ldd Func2, Z+7*4+1
+	ldd Func3, Z+7*4+2
+	ldd Func4, Z+7*4+3
+	add Accu1, Func1
+	adc Accu2, Func2
+	adc Accu3, Func3
+	adc Accu4, Func4
+	ldd Bck1, Z+12*4 /* now backup = w[i-2] */
+	ldd Bck2, Z+12*4+1
+	ldd Bck3, Z+12*4+2
+	ldd Bck4, Z+12*4+3
+	/* now sigma 1 */
+	movw Func1, Bck3
+	movw Func3, Bck1 /* prerotated by 16 */
+	ldi r20, 1
+	rcall bitrotr
+	movw XAccu3, Func3
+	movw XAccu1, Func1 /* store ROTR(w[i-2], 17) in the xor accu */
+;	movw Func1, Bck3
+;	movw Func3, Bck1 /* prerotated by 16 */
+	ldi r20, 2
+	rcall bitrotr
+	eor XAccu1, Func1 /* xor ROTR(w[i-2], 19) */
+	eor XAccu2, Func2
+	eor XAccu3, Func3
+	eor XAccu4, Func4
+	ldi Func2, 2 /* now shr10: shift by 2, then skip a whole byte below */ /* we can destroy the backup now */
+sigma1_shr:
+	lsr Bck4
+	ror Bck3
+	ror Bck2
+	dec Func2
+	brne sigma1_shr
+	eor XAccu1, Bck2
+	eor XAccu2, Bck3
+	eor XAccu3, Bck4 /* xor SHR(w[i-2], 10) */ /* xor accu == sigma1(w[i-2]) */
+	add Accu1, XAccu1
+	adc Accu2, XAccu2
+	adc Accu3, XAccu3
+	adc Accu4, XAccu4 /* finished with sigma1 */
+	/* store w[i] */
+	st X+, Accu1
+	st X+, Accu2
+	st X+, Accu3
+	st X+, Accu4
+	dec LoopC
+	breq 3f ; done if zero
+	rjmp sha256_nextBlock_wcalcloop
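+; The bitrotl/bitrotr calls above implement 32-bit rotations as a byte
+; permutation ("prerotated by 8/16/24") followed by at most three single-bit
+; rotations. A C sketch of the identity being exploited (rotr32_ref is a
+; hypothetical reference helper, not part of this library):
+/*
+	uint32_t rotr32_ref(uint32_t x, uint8_t n){   // reference rotation
+		return (x >> n) | (x << (32 - n));
+	}
+	// e.g. ROTR(x,18) == ROTR(swap-halves(x), 2):
+	// rotr32_ref(x, 18) == rotr32_ref((x >> 16) | (x << 16), 2)
+*/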
+3:
+	/* we are finished with the w array; X points one byte past w */
+/* init a array */
+	pop r31
+	pop r30
+	push r30
+	push r31
+	ldi r25, 8*4 /* 8 32-bit values to copy from ctx to the a array */
+init_a_array:
+	ld r1, Z+
+	st X+, r1
+	dec r25
+	brne init_a_array
+
+/* now the real fun begins */
+/* for (i=0; i<64; ++i){
+	t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i];
+	t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]);
+	memmove(&(a[1]), &(a[0]), 7*4); // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0];
+	a[4] += t1;
+	a[0] = t1 + t2;
+	} */
+	/* Y points to a[0], Z ('cause lpm wants it) points to k[i], X points to w[i] */
+	sbiw r26, 8*4 /* X pointed one past a[7]; now it points at a[0] */
+	movw r28, r26
+	ldi r30, lo8(sha256_kv)
+	ldi r31, hi8(sha256_kv)
+	dec r27 /* X -= 256 (== 64*4); X now points at w[0] */
+	ldi r25, 64
+	mov LoopC, r25
+sha256_main_loop:
+	/* now calculate t1 */
+	/* CH(x,y,z) = (x&y)^((~x)&z) */
+	ldd T1, Y+5*4
+	ldd T2, Y+5*4+1
+	ldd T3, Y+5*4+2
+	ldd T4, Y+5*4+3 /* y in T */
+	ldd Func1, Y+4*4
+	ldd Func2, Y+4*4+1
+	ldd Func3, Y+4*4+2
+	ldd Func4, Y+4*4+3 /* x in Func */
+	ldd Bck1, Y+6*4
+	ldd Bck2, Y+6*4+1
+	ldd Bck3, Y+6*4+2
+	ldd Bck4, Y+6*4+3 /* z in Bck */
+	and T1, Func1
+	and T2, Func2
+	and T3, Func3
+	and T4, Func4
+	com Func1
+	com Func2
+	com Func3
+	com Func4
+	and Bck1, Func1
+	and Bck2, Func2
+	and Bck3, Func3
+	and Bck4, Func4
+	eor T1, Bck1
+	eor T2, Bck2
+	eor T3, Bck3
+	eor T4, Bck4 /* done, CH(x,y,z) is in T */
+	/* now SIGMA1(a[4]) */
+	ldd Bck4, Y+4*4 /* could be taken from the Func regs above */
+	ldd Bck1, Y+4*4+1
+	ldd Bck2, Y+4*4+2
+	ldd Bck3, Y+4*4+3 /* load prerotated by 8 bits */
+	movw Func1, Bck1
+	movw Func3, Bck3
+	ldi r20, 2
+	rcall bitrotl /* rotr(x,6) */
+	movw XAccu1, Func1
+	movw XAccu3, Func3
+	movw Func1, Bck1
+	movw Func3, Bck3
+	ldi r20, 3
+	rcall bitrotr /* rotr(x,11) */
+	eor XAccu1, Func1
+	eor XAccu2, Func2
+	eor XAccu3, Func3
+	eor XAccu4, Func4
+	movw Func1, Bck3 /* this prerotates by a further 16 bits */
+	movw Func3, Bck1 /* so we have now prerotated by 24 bits */
+	ldi r20, 1
+	rcall bitrotr /* rotr(x,25) */
+	eor XAccu1, Func1
+	eor XAccu2, Func2
+	eor XAccu3, Func3
+	eor XAccu4, Func4 /* finished with SIGMA1, add it to T */
+	add T1, XAccu1
+	adc T2, XAccu2
+	adc T3, XAccu3
+	adc T4, XAccu4
+	/* now add a[7], w[i] and k[i] */
+	ldd XAccu1, Y+4*7
+	ldd XAccu2, Y+4*7+1
+	ldd XAccu3, Y+4*7+2
+	ldd XAccu4, Y+4*7+3
+	add T1, XAccu1
+	adc T2, XAccu2
+	adc T3, XAccu3
+	adc T4, XAccu4 /* add a[7] */
+	ld XAccu1, X+
+	ld XAccu2, X+
+	ld XAccu3, X+
+	ld XAccu4, X+
+	add T1, XAccu1
+	adc T2, XAccu2
+	adc T3, XAccu3
+	adc T4, XAccu4 /* add w[i] */
+	lpm XAccu1, Z+
+	lpm XAccu2, Z+
+	lpm XAccu3, Z+
+	lpm XAccu4, Z+
+	add T1, XAccu1
+	adc T2, XAccu2
+	adc T3, XAccu3
+	adc T4, XAccu4 /* add k[i] */ /* finished with t1 */
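+	/* in C, the full round computes (t1 is finished above, t2 follows
+	   below; rotr32 is the helper defined at the end of this file):
+		#define CH(x,y,z)  (((x) & (y)) ^ (~(x) & (z)))
+		#define MAJ(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
+		#define SIGMA0(x)  (rotr32((x), 2) ^ rotr32((x), 13) ^ rotr32((x), 22))
+		#define SIGMA1(x)  (rotr32((x), 6) ^ rotr32((x), 11) ^ rotr32((x), 25))
+		t1 = a[7] + SIGMA1(a[4]) + CH(a[4], a[5], a[6]) + k[i] + w[i];
+		t2 = SIGMA0(a[0]) + MAJ(a[0], a[1], a[2]);
+	*/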
+	/* now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */
+	/* starting with MAJ(x,y,z) */
+	ldd Func1, Y+4*0+0
+	ldd Func2, Y+4*0+1
+	ldd Func3, Y+4*0+2
+	ldd Func4, Y+4*0+3 /* load x=a[0] */
+	ldd XAccu1, Y+4*1+0
+	ldd XAccu2, Y+4*1+1
+	ldd XAccu3, Y+4*1+2
+	ldd XAccu4, Y+4*1+3 /* load y=a[1] */
+	and XAccu1, Func1
+	and XAccu2, Func2
+	and XAccu3, Func3
+	and XAccu4, Func4 /* XAccu == (x & y) */
+	ldd Bck1, Y+4*2+0
+	ldd Bck2, Y+4*2+1
+	ldd Bck3, Y+4*2+2
+	ldd Bck4, Y+4*2+3 /* load z=a[2] */
+	and Func1, Bck1
+	and Func2, Bck2
+	and Func3, Bck3
+	and Func4, Bck4
+	eor XAccu1, Func1
+	eor XAccu2, Func2
+	eor XAccu3, Func3
+	eor XAccu4, Func4 /* XAccu == (x & y) ^ (x & z) */
+	ldd Func1, Y+4*1+0
+	ldd Func2, Y+4*1+1
+	ldd Func3, Y+4*1+2
+	ldd Func4, Y+4*1+3 /* load y=a[1] */
+	and Func1, Bck1
+	and Func2, Bck2
+	and Func3, Bck3
+	and Func4, Bck4
+	eor XAccu1, Func1
+	eor XAccu2, Func2
+	eor XAccu3, Func3
+	eor XAccu4, Func4 /* XAccu == MAJ(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */
+	/* SIGMA0(a[0]) */
+	ldd Bck1, Y+4*0+0 /* could be combined with the load above */
+	ldd Bck2, Y+4*0+1
+	ldd Bck3, Y+4*0+2
+	ldd Bck4, Y+4*0+3
+	movw Func1, Bck1
+	movw Func3, Bck3
+	ldi r20, 2
+	rcall bitrotr
+	movw Accu1, Func1
+	movw Accu3, Func3 /* Accu = rotr(a[0], 2) */
+	movw Func1, Bck3
+	movw Func3, Bck1 /* prerotate by 16 bits */
+	ldi r20, 3
+	rcall bitrotl
+	eor Accu1, Func1
+	eor Accu2, Func2
+	eor Accu3, Func3
+	eor Accu4, Func4 /* Accu ^= rotr(a[0], 13) */
+	mov Func1, Bck4
+	mov Func2, Bck1
+	mov Func3, Bck2
+	mov Func4, Bck3 /* prerotate by 24 bits */
+	ldi r20, 2
+	rcall bitrotl
+	eor Accu1, Func1
+	eor Accu2, Func2
+	eor Accu3, Func3
+	eor Accu4, Func4 /* Accu ^= rotr(a[0], 22) */
+	add Accu1, XAccu1 /* add the previous result (MAJ) */
+	adc Accu2, XAccu2
+	adc Accu3, XAccu3
+	adc Accu4, XAccu4
+	/* the computation is done (t1 in T, t2 in Accu) */
+	/* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */
+
+	ldi r21, 7*4
+	adiw r28, 7*4
+a_shift_loop:
+	ld r25, -Y /* warning: this is PREdecrement */
+	std Y+4, r25
+	dec r21
+	brne a_shift_loop
+
+	ldd Bck1, Y+4*4+0
+	ldd Bck2, Y+4*4+1
+	ldd Bck3, Y+4*4+2
+	ldd Bck4, Y+4*4+3
+	add Bck1, T1
+	adc Bck2, T2
+	adc Bck3, T3
+	adc Bck4, T4
+	std Y+4*4+0, Bck1
+	std Y+4*4+1, Bck2
+	std Y+4*4+2, Bck3
+	std Y+4*4+3, Bck4 /* a[4] += t1 */
+	add Accu1, T1
+	adc Accu2, T2
+	adc Accu3, T3
+	adc Accu4, T4 /* a[0] = t1 + t2 */
+	std Y+4*0+0, Accu1
+	std Y+4*0+1, Accu2
+	std Y+4*0+2, Accu3
+	std Y+4*0+3, Accu4 /* a array updated */
+
+
+	dec LoopC
+	breq update_state
+	rjmp sha256_main_loop ;brne sha256_main_loop
+update_state:
+	/* update state */
+	/* the pointer to the state still exists on the stack */
+	pop r31
+	pop r30
+	ldi r21, 8
+update_state_loop:
+	ldd Accu1, Z+0
+	ldd Accu2, Z+1
+	ldd Accu3, Z+2
+	ldd Accu4, Z+3
+	ld Func1, Y+
+	ld Func2, Y+
+	ld Func3, Y+
+	ld Func4, Y+
+	add Accu1, Func1
+	adc Accu2, Func2
+	adc Accu3, Func3
+	adc Accu4, Func4
+	st Z+, Accu1
+	st Z+, Accu2
+	st Z+, Accu3
+	st Z+, Accu4
+	dec r21
+	brne update_state_loop
+	/* now we just have to update the length */
+	adiw r30, 1 /* since we add 512, we can simply skip the LSB */
+	ldi r21, 2
+	ldi r22, 6
+	ld r20, Z
+	add r20, r21
+	st Z+, r20
+	clr r21
+sha256_nextBlock_fix_length:
+	brcc sha256_nextBlock_epilog
+	ld r20, Z
+	adc r20, r21
+	st Z+, r20
+	dec r22
+	brne sha256_nextBlock_fix_length
+
+; EPILOG
+sha256_nextBlock_epilog:
+/* now clean up the stack */
+	pop r21
+	pop r20
+	in r0, SREG
+	cli ; we want to be uninterrupted while updating SP
+	out SPL, r20
+	out SPH, r21
+	out SREG, r0
+
+	clr r1
+	pop r29
+	pop r28
+	pop r17
+	pop r16
+	pop r15
+	pop r14
+	pop r13
+	pop r12
+	pop r11
+	pop r10
+	pop r9
+	pop r8
+	pop r7
+	pop r6
+	pop r5
+	pop r4
+	ret
+
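+; Each 32-bit round constant below is stored as two little-endian 16-bit
+; .words, low half first: the first pair 0x2f98, 0x428a is k[0] = 0x428a2f98.
+; Reading it back from C could look like this (a sketch; avr-libc pgmspace):
+/*
+	#include <avr/pgmspace.h>
+	extern const uint32_t sha256_kv[64] PROGMEM;
+	uint32_t k_i;
+	memcpy_P(&k_i, &sha256_kv[i], 4);   // the AVR core is little-endian
+*/
+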
+sha256_kv: ; round-key-vector stored in ProgMem
+.word 0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c
+.word 0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b
+.word 0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9
+.word 0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429
+.word 0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272
+.word 0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a
+.word 0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e
+.word 0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671
+
+
+;###########################################################
+
+.global sha256_init
+;uint32_t sha256_init_vector[]={
+;	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+;	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };
+;
+;void sha256_init(sha256_ctx_t *state){
+;	state->length=0;
+;	memcpy(state->h, sha256_init_vector, 8*4);
+;}
+; param1: (r25,r24) 16-bit pointer to a sha256_ctx_t struct in ram
+; modifies: X(r26,r27), Z(r30,r31), r22, r23
+sha256_init:
+	movw r26, r24 ; (r24,r25) --> (r26,r27) load X with param1
+	ldi r30, lo8((sha256_init_vector))
+	ldi r31, hi8((sha256_init_vector))
+	ldi r22, 32+8 ; 8 state words plus the 8 zero bytes of the length field
+sha256_init_vloop:
+	lpm r23, Z+
+	st X+, r23
+	dec r22
+	brne sha256_init_vloop
+	ret
+
+sha256_init_vector:
+.word 0xE667, 0x6A09
+.word 0xAE85, 0xBB67
+.word 0xF372, 0x3C6E
+.word 0xF53A, 0xA54F
+.word 0x527F, 0x510E
+.word 0x688C, 0x9B05
+.word 0xD9AB, 0x1F83
+.word 0xCD19, 0x5BE0
+.word 0x0000, 0x0000
+.word 0x0000, 0x0000
+
+;###########################################################
+
+.global rotl32
+; === ROTL32 ===
+; function that rotates a 32 bit word to the left
+; param1: the 32-bit word to rotate
+;	given in r25,r24,r23,r22 (r25 is most significant)
+; param2: an 8-bit value telling how far to rotate
+;	given in r20
+; modifies: r20, r21; the rotated word is returned in r25,r24,r23,r22
+rotl32:
+	cpi r20, 8
+	brlo bitrotl
+	mov r21, r25
+	mov r25, r24
+	mov r24, r23
+	mov r23, r22
+	mov r22, r21
+	subi r20, 8
+	rjmp rotl32
+bitrotl:
+	clr r21
+	clc
+bitrotl_loop:
+	tst r20
+	breq fixrotl
+	rol r22
+	rol r23
+	rol r24
+	rol r25
+	rol r21
+	dec r20
+	rjmp bitrotl_loop
+fixrotl:
+	or r22, r21
+	ret
+
+
+;###########################################################
+
+.global rotr32
+; === ROTR32 ===
+; function that rotates a 32 bit word to the right
+; param1: the 32-bit word to rotate
+;	given in r25,r24,r23,r22 (r25 is most significant)
+; param2: an 8-bit value telling how far to rotate
+;	given in r20
+; modifies: r20, r21; the rotated word is returned in r25,r24,r23,r22
+rotr32:
+	cpi r20, 8
+	brlo bitrotr
+	mov r21, r22
+	mov r22, r23
+	mov r23, r24
+	mov r24, r25
+	mov r25, r21
+	subi r20, 8
+	rjmp rotr32
+bitrotr:
+	clr r21
+	clc
+bitrotr_loop:
+	tst r20
+	breq fixrotr
+	ror r25
+	ror r24
+	ror r23
+	ror r22
+	ror r21
+	dec r20
+	rjmp bitrotr_loop
+fixrotr:
+	or r25, r21
+	ret
+
+
+;###########################################################
+
+.global change_endian32
+; === change_endian32 ===
+; function that changes the endianness of a 32-bit word
+; param1: the 32-bit word
+;	given in r25,r24,r23,r22 (r25 is most significant)
+; modifies: r20, r21; the result is returned in r25,r24,r23,r22
+change_endian32:
+	movw r20, r22 ; (r22,r23) --> (r20,r21)
+	mov r22, r25
+	mov r23, r24
+	mov r24, r21
+	mov r25, r20
+	ret
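+
+; Example use from C (a sketch; the prototype follows the parameter
+; description in the sha256 header comment above):
+/*
+	#include <stdint.h>
+	extern void sha256(void *dest, const void *msg, uint32_t length_b);
+
+	uint8_t hash[32];
+	const char msg[] = "abc";
+	sha256(hash, msg, 3*8);   // note: the length is given in bits
+*/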