/* sha256-asm.S */
/*
    This file is part of the AVR-Crypto-Lib.
    Copyright (C) 2008  Daniel Otte (daniel.otte@rub.de)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
/*
 * Author:  Daniel Otte
 *
 * License: GPLv3 or later
 */
/*
 * Provenance: text recovered from a gitweb "blobdiff_plain" page for
 * shabea/sha256-asm.S (previous blob d9eb6b6, 1042 lines):
 * X-Git-Url: https://git.cryptolib.org/?p=avr-crypto-lib.git;a=blobdiff_plain;f=shabea%2Fsha256-asm.S;fp=shabea%2Fsha256-asm.S;h=0000000000000000000000000000000000000000;hp=d9eb6b65a789a49a7dc730775685114f146ed53b;hb=7b5401ab9ce23a5da1de8b6c7de3a1aa20ac4cf8;hpb=02ac3b653f3a11f284cc1a0cb0e983575f2f431b
 *
 * Target / syntax: AVR, GNU as (preprocessed .S).  ';' starts a comment.
 * The code relies on the avr-gcc convention that r1 == 0 between calls.
 */

; sha-256 implementation in assembler
SHA256_BLOCK_BITS = 512
SHA256_HASH_BITS  = 256

;###########################################################
; debugging helpers (only expanded when used)

; precall: save r0, r1, r18-r27, r30, r31 and clear r1 before calling
; out to C code
.macro precall
	push r0
	push r1
	push r18
	push r19
	push r20
	push r21
	push r22
	push r23
	push r24
	push r25
	push r26
	push r27
	push r30
	push r31
	clr r1
.endm

; postcall: restore everything saved by precall (reverse order)
.macro postcall
	pop r31
	pop r30
	pop r27
	pop r26
	pop r25
	pop r24
	pop r23
	pop r22
	pop r21
	pop r20
	pop r19
	pop r18
	pop r1
	pop r0
.endm

; hexdump length: print CR/LF, then dump \length bytes starting at X via
; uart_hexdump, at most 16 bytes per call (the macro recurses for the rest).
; uart_putc and uart_hexdump are external symbols.
.macro hexdump length
	push r27
	push r26
	ldi r25, '\r'
	mov r24, r25
	call uart_putc
	ldi r25, '\n'
	mov r24, r25
	call uart_putc
	pop r26
	pop r27
	movw r24, r26
.if \length > 16
	ldi r22, lo8(16)
	ldi r23, hi8(16)
	push r27
	push r26
	call uart_hexdump
	pop r26
	pop r27
	adiw r26, 16
	hexdump \length-16
.else
	ldi r22, lo8(\length)
	ldi r23, hi8(\length)
	call uart_hexdump
.endif
.endm

/* X points to Block */
.macro dbg_hexdump length
	precall
	hexdump \length
	postcall
.endm

.section .text

; I/O addresses of the stack pointer and the status register
SPL  = 0x3D
SPH  = 0x3E
SREG = 0x3F


;
; sha256_ctx_t is:
;
; [h0][h1][h2][h3][h4][h5][h6][h7][length]
; hn is 32 bit large, length is 64 bit large
; (all fields are kept least significant byte first)

;###########################################################

.global sha256_ctx2hash
; === sha256_ctx2hash ===
; this function converts a state into a normal hash (bytestring):
; each of the eight 32-bit state words is written to the destination
; with its byte order reversed
;  param1: the 16-bit destination pointer
;          given in r25,r24 (r25 is most significant)
;  param2: the 16-bit pointer to sha256_ctx structure
;          given in r23,r22
;  modifies: r0, r20, r21, X, Z
sha256_ctx2hash:
	movw r26, r22           ; X = ctx
	movw r30, r24           ; Z = destination
	ldi r21, 8              ; eight state words
	sbiw r26, 4             ; bias X so the "adiw 8" below lands on word end
1:
	ldi r20, 4              ; four bytes per word
	adiw r26, 8             ; X = one past the last byte of the current word
2:
	ld r0, -X               ; read the word from its top byte downwards ...
	st Z+, r0               ; ... and emit it most significant byte first
	dec r20
	brne 2b

	dec r21
	brne 1b

	ret

;###########################################################

.global sha256
; === sha256 ===
; this function calculates SHA-256 hashes from messages in RAM
;  param1: the 16-bit hash destination pointer
;          given in r25,r24 (r25 is most significant)
;  param2: the 16-bit pointer to message
;          given in r23,r22
;  param3: 32-bit length value (length of message in bits)
;          given in r21,r20,r19,r18
;
; A sha256_ctx_t (8*4+8 bytes) is allocated on the stack, initialised with
; sha256_init, fed with all complete 512-bit blocks through
; sha256_nextBlock, finished with sha256_lastBlock and finally converted
; with sha256_ctx2hash.
;
; register use inside the function:
;  r11:r8   remaining message length in bits
;  r13:r12  pointer to the next unprocessed message byte
;  r17:r16  pointer to the context on the stack
sha256:
sha256_prolog:
	push r8
	push r9
	push r10
	push r11
	push r12
	push r13
	push r16
	push r17
	in r16, SPL
	in r17, SPH
	subi r16, 8*4+8         ; reserve room for the context
	sbci r17, 0
	in r0, SREG
	cli                     ; SP must be updated atomically
	out SPL, r16
	out SPH, r17
	out SREG, r0

	push r25                ; keep the destination pointer for later
	push r24
	; ctx = SP + 1.  A 16-bit add is done with subi/sbci because "inc"
	; does not write the carry flag, so "inc r16 / adc r17, r1" would
	; miss the carry into the high byte whenever r16 wraps from 0xFF.
	subi r16, lo8(-1)
	sbci r17, hi8(-1)

	movw r8, r18            /* backup of length */
	movw r10, r20

	movw r12, r22           /* backup of msg-ptr */

	movw r24, r16
	rcall sha256_init
	/* while (length >= 512) { hash one block } */
1:
	tst r11
	brne 2f                 ; length >= 2^24 bits: certainly a full block
	tst r10
	brne 2f                 ; length >= 2^16 bits: certainly a full block
	mov r19, r9             ; cpi needs an upper register
	cpi r19, 0x02
	brlo 4f                 ; fewer than 512 bits left -> last block
2:
	movw r24, r16
	movw r22, r12
	rcall sha256_nextBlock  ; returns with r1 == 0
	/* msg += 64 bytes (the persistent pointer, not the scratch copy) */
	ldi r19, 64
	add r12, r19
	adc r13, r1
	/* length -= 512 */
	ldi r19, 0x02
	sub r9, r19
	sbc r10, r1
	sbc r11, r1
	rjmp 1b

4:
	movw r24, r16
	movw r22, r12
	movw r20, r8            ; remaining length (< 512 bits)
	rcall sha256_lastBlock

	pop r24                 ; destination pointer
	pop r25
	movw r22, r16
	rcall sha256_ctx2hash

sha256_epilog:
	in r30, SPL
	in r31, SPH
	adiw r30, 8*4+8         ; release the context
	in r0, SREG
	cli
	out SPL, r30
	out SPH, r31
	out SREG, r0
	pop r17
	pop r16
	pop r13
	pop r12
	pop r11
	pop r10
	pop r9
	pop r8
	ret

;###########################################################


; block MUST NOT be larger than 64 bytes

.global sha256_lastBlock
; === sha256_lastBlock ===
; this function does padding & Co. for calculating SHA-256 hashes
;  param1: the 16-bit pointer to sha256_ctx structure
;          given in r25,r24 (r25 is most significant)
;  param2: an 16-bit pointer to 64 byte block to hash
;          given in r23,r22
;  param3: an 16-bit integer specifying length of block in bits
;          given in r21,r20
;
; While the length is 512 bits or more, complete blocks are handed to
; sha256_nextBlock.  The rest is copied into a 64 byte buffer on the stack,
; the padding bit and zero bytes are appended, and the 64-bit total length
; (ctx.length + remaining bits) is stored most significant byte first in
; the last eight bytes.  If the padding does not fit, an additional block
; is hashed first.
;  modifies: r0, r18-r27, r30, r31; r1 is used as scratch and is zero on
;            return
sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1)


sha256_lastBlock:
	cpi r21, 0x02
	brlo sha256_lastBlock_prolog    ; length < 512 bits
	push r25
	push r24
	push r23
	push r22
	push r21
	push r20
	rcall sha256_nextBlock
	pop r20
	pop r21
	pop r22
	pop r23
	pop r24
	pop r25
	subi r21, 0x02          ; length -= 512 bits
	subi r22, lo8(-64)      ; block pointer += 64 bytes (one block)
	sbci r23, hi8(-64)
	rjmp sha256_lastBlock
sha256_lastBlock_prolog:
	/* allocate space on stack */
	in r30, SPL
	in r31, SPH
	in r1, SREG
	subi r30, lo8(64)
	sbci r31, hi8(64)
	cli
	out SPL, r30
	out SPH, r31
	out SREG,r1

	adiw r30, 1             /* SP points to next free byte on stack */
	mov r18, r20            /* r20 = LSB(length) */
	lsr r18
	lsr r18
	lsr r18
	bst r21, 0              /* bit 8 of the length becomes bit 5 of length/8 */
	bld r18, 5              /* now: r18 == length/8 (aka. length in bytes) */


	movw r26, r22           /* X points to begin of msg */
	tst r18
	breq sha256_lastBlock_post_copy
	mov r1, r18             ; r1 = byte counter, zero again after the loop
sha256_lastBlock_copy_loop:
	ld r0, X+
	st Z+, r0
	dec r1
	brne sha256_lastBlock_copy_loop
sha256_lastBlock_post_copy:
sha256_lastBlock_insert_stuffing_bit:
	ldi r19, 0x80
	mov r0,r19              ; padding byte for a byte aligned message
	ldi r19, 0x07
	and r19, r20            /* if we are in bitmode */
	breq 2f                 /* no bitmode */
1:
	lsr r0                  ; move the padding bit below the message bits
	dec r19
	brne 1b
	ld r19, X               ; partial message byte
	; keep only the message bits above the padding bit; X is reloaded
	; before its next use, so r26 is free as scratch here
	mov r26, r0
	lsl r26
	neg r26                 ; r26 = mask of all bits above the padding bit
	and r19, r26
	or r0, r19
2:
	st Z+, r0
	inc r18                 ; r18 = bytes used in the buffer so far

/* checking stuff here */
	cpi r18, 64-8+1
	brsh 0f
	rjmp sha256_lastBlock_insert_zeros
0:
	/* no room left for the length field: */
	/* first we have to fill it up with zeros */
	ldi r19, 64
	sub r19, r18
	breq 2f
1:
	st Z+, r1               ; r1 is zero here (copy loop ran at least once)
	dec r19
	brne 1b
2:
	sbiw r30, 63
	sbiw r30, 1             ; Z back to the start of the buffer
	movw r22, r30

	push r31
	push r30
	push r25
	push r24
	push r21
	push r20
	rcall sha256_nextBlock
	pop r20
	pop r21
	pop r24
	pop r25
	pop r30
	pop r31

	/* now we should subtract 512 from length */
	/* (sha256_nextBlock has just added 512 for this padding block) */
	movw r26, r24
	adiw r26, 4*8+1         /* we can skip the lowest byte */
	ld r19, X
	subi r19, hi8(512)
	st X+, r19
	ldi r18, 6
1:
	ld r19, X
	sbci r19, 0             ; propagate the borrow (ld/st/dec keep C)
	st X+, r19
	dec r18
	brne 1b

;	clr r18 /* not necessary ;-) */
	/* Z already points to begin of block again */

sha256_lastBlock_insert_zeros:
	ldi r19, 64-8
	sub r19, r18
	breq sha256_lastBlock_insert_length
	clr r1
1:
	st Z+, r1               /* r1 is still zero */
	dec r19
	brne 1b

;	rjmp sha256_lastBlock_epilog
sha256_lastBlock_insert_length:
	movw r26, r24           /* X points to state */
	adiw r26, 8*4           /* X points to (state.length) */
	adiw r30, 8             /* Z points one after the last byte of block */
	ld r0, X+
	add r0, r20
	st -Z, r0
	ld r0, X+
	adc r0, r21
	st -Z, r0
	ldi r19, 6
1:
	ld r0, X+
	adc r0, r1              ; r1 == 0, only the carry is added
	st -Z, r0
	dec r19
	brne 1b

	sbiw r30, 64-8          ; Z = start of the buffer
	movw r22, r30
	rcall sha256_nextBlock

sha256_lastBlock_epilog:
	in r30, SPL
	in r31, SPH
	in r1, SREG
	adiw r30, 63            ; lo8(64)
	adiw r30, 1             ; hi8(64)
	cli
	out SPL, r30
	out SPH, r31
	out SREG,r1
	clr r1
	clr r0
	ret

/**/
;###########################################################

.global sha256_nextBlock
; === sha256_nextBlock ===
; this is the core function for calculating SHA-256 hashes
;  param1: the 16-bit pointer to sha256_ctx structure
;          given in r25,r24 (r25 is most significant)
;  param2: an 16-bit pointer to 64 byte block to hash
;          given in r23,r22
;
; The function expands the block into the 64 word w array, runs the 64
; compression rounds on a copy of the state (the a array), adds the result
; to the state and adds 512 to the 64-bit length field of the context.
; r4-r17, r28 and r29 are saved and restored; r1 is zero on return.
;  modifies: r0, r18-r27, r30, r31
sha256_nextBlock_localSpace = (64+8)*4 ; 64 32-bit values for w array and 8 32-bit values for a array (total 288 byte)

; register names (numbers are register numbers)
Bck1 = 12
Bck2 = 13
Bck3 = 14
Bck4 = 15
Func1 = 22
Func2 = 23
Func3 = 24
Func4 = 25
Accu1 = 16
Accu2 = 17
Accu3 = 18
Accu4 = 19
XAccu1 = 8
XAccu2 = 9
XAccu3 = 10
XAccu4 = 11
T1 = 4
T2 = 5
T3 = 6
T4 = 7
LoopC = 1
/* byteorder: high number <--> high significance */
sha256_nextBlock:
	; initial, let's make some space ready for local vars
	push r4 /* replace push & pop by mem ops? */
	push r5
	push r6
	push r7
	push r8
	push r9
	push r10
	push r11
	push r12
	push r13
	push r14
	push r15
	push r16
	push r17
	push r28
	push r29
	in r20, SPL
	in r21, SPH
	movw r18, r20           ;backup SP
;	movw r26, r20           ; X points to free space on stack
	movw r30, r22           ; Z points to message
	subi r20, lo8(sha256_nextBlock_localSpace) ;sbiw can do only up to 63
	sbci r21, hi8(sha256_nextBlock_localSpace)
	movw r26, r20           ; X points to free space on stack
	in r0, SREG
	cli                     ; we want to be uninterrupted while updating SP
	out SPL, r20
	out SPH, r21
	out SREG, r0
	push r18                ; old SP, restored in the epilog
	push r19
	push r24
	push r25                /* param1 will be needed later */
	; now we fill the w array with message (think about endianness)
	adiw r26, 1             ; X++
	ldi r20, 16
sha256_nextBlock_wcpyloop:
	ld r23, Z+              ; message words are most significant byte first
	ld r22, Z+
	ld r19, Z+
	ld r18, Z+
	st X+, r18              ; w words are kept least significant byte first
	st X+, r19
	st X+, r22
	st X+, r23
	dec r20
	brne sha256_nextBlock_wcpyloop
/*	for (i=16; i<64; ++i){
		w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16];
	} */
	/* r25,r24,r23,r22 (r21,r20) are function values
	   r19,r18,r17,r16 are the accumulator
	   r15,r14,r13,r12 are backup1
	   r11,r10,r9 ,r8  are xor accu
	   r1 is round counter */

	ldi r20, 64-16
	mov LoopC, r20
sha256_nextBlock_wcalcloop:
	movw r30, r26           ; cp X to Z
	sbiw r30, 63
	sbiw r30, 1             ; subtract 64 = 16*4
	ld Accu1, Z+
	ld Accu2, Z+
	ld Accu3, Z+
	ld Accu4, Z+            /* w[i] = w[i-16] */
	ld Bck1, Z+
	ld Bck2, Z+
	ld Bck3, Z+
	ld Bck4, Z+             /* backup = w[i-15] */
	/* now sigma 0 */
	mov Func1, Bck2
	mov Func2, Bck3
	mov Func3, Bck4
	mov Func4, Bck1         /* prerotated by 8 */
	ldi r20, 1
	rcall bitrotl
	movw XAccu1, Func1
	movw XAccu3, Func3      /* store ROTR(w[i-15],7) in xor accu */
	movw Func1, Bck3
	movw Func3, Bck1        /* prerotated by 16 */
	ldi r20, 2
	rcall bitrotr
	eor XAccu1, Func1       /* xor ROTR(w[i-15], 18)*/
	eor XAccu2, Func2
	eor XAccu3, Func3
	eor XAccu4, Func4
	ldi Func2, 3            /* now shr3 */ /*we can destroy backup now*/
sigma0_shr:
	lsr Bck4
	ror Bck3
	ror Bck2
	ror Bck1
	dec Func2
	brne sigma0_shr
	eor XAccu1, Bck1
	eor XAccu2, Bck2
	eor XAccu3, Bck3
	eor XAccu4, Bck4        /* xor SHR(w[i-15], 3)*/ /* xor accu == sigma0(w[i-15]) */
	add Accu1, XAccu1
	adc Accu2, XAccu2
	adc Accu3, XAccu3
	adc Accu4, XAccu4       /* finished with sigma0 */
	ldd Func1, Z+7*4        /* now accu += w[i-7] (Z points to w[i-14]) */
	ldd Func2, Z+7*4+1
	ldd Func3, Z+7*4+2
	ldd Func4, Z+7*4+3
	add Accu1, Func1
	adc Accu2, Func2
	adc Accu3, Func3
	adc Accu4, Func4
	ldd Bck1, Z+12*4        /* now backup = w[i-2]*/
	ldd Bck2, Z+12*4+1
	ldd Bck3, Z+12*4+2
	ldd Bck4, Z+12*4+3
	/* now sigma 1 */
	movw Func1, Bck3
	movw Func3, Bck1        /* prerotated by 16 */
	ldi r20, 1
	rcall bitrotr
	movw XAccu3, Func3
	movw XAccu1, Func1      /* store ROTR(w[i-2], 17) in xor accu */
;	movw Func1, Bck3
;	movw Func3, Bck1        /* prerotated by 16 */
	ldi r20, 2
	rcall bitrotr           ; two more bits on top of the 17 above
	eor XAccu1, Func1       /* xor ROTR(w[i-2], 19)*/
	eor XAccu2, Func2
	eor XAccu3, Func3
	eor XAccu4, Func4
	ldi Func2, 2            /* now shr10 (dirty trick, skipping a byte) */ /*we can destroy backup now*/
sigma1_shr:
	lsr Bck4
	ror Bck3
	ror Bck2
	dec Func2
	brne sigma1_shr
	eor XAccu1, Bck2
	eor XAccu2, Bck3
	eor XAccu3, Bck4        /* xor SHR(w[i-2], 10)*/ /* xor accu == sigma1(w[i-2]) */
	add Accu1, XAccu1
	adc Accu2, XAccu2
	adc Accu3, XAccu3
	adc Accu4, XAccu4       /* finished with sigma1 */
	/* now let's store the result as w[i] */
	st X+, Accu1
	st X+, Accu2
	st X+, Accu3
	st X+, Accu4
	dec LoopC
	breq 3f                 ; skip if zero
	rjmp sha256_nextBlock_wcalcloop
3:
	/* we are finished with w array X points one byte post w */
/* init a array */
	pop r31                 ; fetch the ctx pointer ...
	pop r30
	push r30                ; ... and leave it on the stack for later
	push r31
	ldi r25, 8*4            /* 8 32-bit values to copy from ctx to a array */
init_a_array:
	ld r1, Z+
	st X+, r1
	dec r25
	brne init_a_array

/* now the real fun begins */
/*	for (i=0; i<64; ++i){
		t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i];
		t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]);
		memmove(&(a[1]), &(a[0]), 7*4); // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0];
		a[4] += t1;
		a[0] = t1 + t2;
	} */
	/* Y points to a[0], Z ('cause lpm wants it) points to k[i], X points to w[i] */
	sbiw r26, 8*4           /* X still points at a[7]+1*/
	movw r28, r26
	ldi r30, lo8(sha256_kv)
	ldi r31, hi8(sha256_kv)
	dec r27                 /* X - (64*4 == 256) */
	ldi r25, 64
	mov LoopC, r25
sha256_main_loop:
	/* now calculate t1 */
	/*CH(x,y,z) = (x&y)^((~x)&z)*/
	ldd T1, Y+5*4
	ldd T2, Y+5*4+1
	ldd T3, Y+5*4+2
	ldd T4, Y+5*4+3         /* y in T */
	ldd Func1, Y+4*4
	ldd Func2, Y+4*4+1
	ldd Func3, Y+4*4+2
	ldd Func4, Y+4*4+3      /* x in Func */
	ldd Bck1, Y+6*4
	ldd Bck2, Y+6*4+1
	ldd Bck3, Y+6*4+2
	ldd Bck4, Y+6*4+3       /* z in Bck */
	and T1, Func1
	and T2, Func2
	and T3, Func3
	and T4, Func4
	com Func1
	com Func2
	com Func3
	com Func4
	and Bck1, Func1
	and Bck2, Func2
	and Bck3, Func3
	and Bck4, Func4
	eor T1, Bck1
	eor T2, Bck2
	eor T3, Bck3
	eor T4, Bck4            /* done, CH(x,y,z) is in T */
	/* now SIGMA1(a[4]) */
	ldd Bck4, Y+4*4         /* think about using it from Func reg above*/
	ldd Bck1, Y+4*4+1
	ldd Bck2, Y+4*4+2
	ldd Bck3, Y+4*4+3       /* load prerotate by 8-bit */
	movw Func1, Bck1
	movw Func3, Bck3
	ldi r20, 2
	rcall bitrotl           /* rotr(x,6) */
	movw XAccu1, Func1
	movw XAccu3, Func3
	movw Func1, Bck1
	movw Func3, Bck3
	ldi r20, 3
	rcall bitrotr           /* rotr(x,11) */
	eor XAccu1, Func1
	eor XAccu2, Func2
	eor XAccu3, Func3
	eor XAccu4, Func4
	movw Func1, Bck3        /* this prerotates further 16 bits*/
	movw Func3, Bck1        /* so we have now prerotated by 24 bits*/
	ldi r20, 1
	rcall bitrotr           /* rotr(x,25) */
	eor XAccu1, Func1
	eor XAccu2, Func2
	eor XAccu3, Func3
	eor XAccu4, Func4       /* finished with SIGMA1, add it to T */
	add T1, XAccu1
	adc T2, XAccu2
	adc T3, XAccu3
	adc T4, XAccu4
	/* now we've to add a[7], w[i] and k[i] */
	ldd XAccu1, Y+4*7
	ldd XAccu2, Y+4*7+1
	ldd XAccu3, Y+4*7+2
	ldd XAccu4, Y+4*7+3
	add T1, XAccu1
	adc T2, XAccu2
	adc T3, XAccu3
	adc T4, XAccu4          /* add a[7] */
	ld XAccu1, X+
	ld XAccu2, X+
	ld XAccu3, X+
	ld XAccu4, X+
	add T1, XAccu1
	adc T2, XAccu2
	adc T3, XAccu3
	adc T4, XAccu4          /* add w[i] */
	lpm XAccu1, Z+
	lpm XAccu2, Z+
	lpm XAccu3, Z+
	lpm XAccu4, Z+
	add T1, XAccu1
	adc T2, XAccu2
	adc T3, XAccu3
	adc T4, XAccu4          /* add k[i] */ /* finished with t1 */
	/*now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */
	/* starting with MAJ(x,y,z) */
	ldd Func1, Y+4*0+0
	ldd Func2, Y+4*0+1
	ldd Func3, Y+4*0+2
	ldd Func4, Y+4*0+3      /* load x=a[0] */
	ldd XAccu1, Y+4*1+0
	ldd XAccu2, Y+4*1+1
	ldd XAccu3, Y+4*1+2
	ldd XAccu4, Y+4*1+3     /* load y=a[1] */
	and XAccu1, Func1
	and XAccu2, Func2
	and XAccu3, Func3
	and XAccu4, Func4       /* XAccu == (x & y) */
	ldd Bck1, Y+4*2+0
	ldd Bck2, Y+4*2+1
	ldd Bck3, Y+4*2+2
	ldd Bck4, Y+4*2+3       /* load z=a[2] */
	and Func1, Bck1
	and Func2, Bck2
	and Func3, Bck3
	and Func4, Bck4
	eor XAccu1, Func1
	eor XAccu2, Func2
	eor XAccu3, Func3
	eor XAccu4, Func4       /* XAccu == (x & y) ^ (x & z) */
	ldd Func1, Y+4*1+0
	ldd Func2, Y+4*1+1
	ldd Func3, Y+4*1+2
	ldd Func4, Y+4*1+3      /* load y=a[1] */
	and Func1, Bck1
	and Func2, Bck2
	and Func3, Bck3
	and Func4, Bck4
	eor XAccu1, Func1
	eor XAccu2, Func2
	eor XAccu3, Func3
	eor XAccu4, Func4       /* XAccu == Maj(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */
	/* SIGMA0(a[0]) */
	ldd Bck1, Y+4*0+0       /* we should combine this with above */
	ldd Bck2, Y+4*0+1
	ldd Bck3, Y+4*0+2
	ldd Bck4, Y+4*0+3
	movw Func1, Bck1
	movw Func3, Bck3
	ldi r20, 2
	rcall bitrotr
	movw Accu1, Func1
	movw Accu3, Func3       /* Accu = rotr(a[0], 2) */
	movw Func1, Bck3
	movw Func3, Bck1        /* prerotate by 16 bits */
	ldi r20, 3
	rcall bitrotl
	eor Accu1, Func1
	eor Accu2, Func2
	eor Accu3, Func3
	eor Accu4, Func4        /* Accu ^= rotr(a[0], 13) */
	mov Func1, Bck4
	mov Func2, Bck1
	mov Func3, Bck2
	mov Func4, Bck3         /* prerotate by 24 bits */
	ldi r20, 2
	rcall bitrotl
	eor Accu1, Func1
	eor Accu2, Func2
	eor Accu3, Func3
	eor Accu4, Func4        /* Accu ^= rotr(a[0], 22) */
	add Accu1, XAccu1       /* add previous result (MAJ)*/
	adc Accu2, XAccu2
	adc Accu3, XAccu3
	adc Accu4, XAccu4
	/* now we are finished with the computing stuff (t1 in T, t2 in Accu)*/
	/* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */

	ldi r21, 7*4
	adiw r28, 7*4
a_shift_loop:
	ld r25, -Y              /* warning: this is PREdecrement */
	std Y+4, r25
	dec r21
	brne a_shift_loop       ; Y points to a[0] again afterwards

	ldd Bck1, Y+4*4+0
	ldd Bck2, Y+4*4+1
	ldd Bck3, Y+4*4+2
	ldd Bck4, Y+4*4+3
	add Bck1, T1
	adc Bck2, T2
	adc Bck3, T3
	adc Bck4, T4
	std Y+4*4+0, Bck1       ; a[4] += t1
	std Y+4*4+1, Bck2
	std Y+4*4+2, Bck3
	std Y+4*4+3, Bck4
	add Accu1, T1
	adc Accu2, T2
	adc Accu3, T3
	adc Accu4, T4
	std Y+4*0+0, Accu1      ; a[0] = t1 + t2
	std Y+4*0+1, Accu2
	std Y+4*0+2, Accu3
	std Y+4*0+3, Accu4      /* a array updated */


	dec LoopC
	breq update_state
	rjmp sha256_main_loop   ;brne sha256_main_loop
update_state:
	/* update state */
	/* pointers to state should still exist on the stack ;-) */
	pop r31
	pop r30
	ldi r21, 8
update_state_loop:
	ldd Accu1, Z+0
	ldd Accu2, Z+1
	ldd Accu3, Z+2
	ldd Accu4, Z+3
	ld Func1, Y+
	ld Func2, Y+
	ld Func3, Y+
	ld Func4, Y+
	add Accu1, Func1
	adc Accu2, Func2
	adc Accu3, Func3
	adc Accu4, Func4
	st Z+, Accu1            ; h[n] += a[n]
	st Z+, Accu2
	st Z+, Accu3
	st Z+, Accu4
	dec r21
	brne update_state_loop
	/* now we just have to update the length */
	adiw r30, 1             /* since we add 512, we can simply skip the LSB */
	ldi r21, 2
	ldi r22, 6
	ld r20, Z
	add r20, r21
	st Z+, r20
	clr r21                 ; clr leaves the carry flag untouched
sha256_nextBlock_fix_length:
	brcc sha256_nextBlock_epilog    ; done as soon as no carry is pending
	ld r20, Z
	adc r20, r21
	st Z+, r20
	dec r22
	brne sha256_nextBlock_fix_length

; EPILOG
sha256_nextBlock_epilog:
/* now we should clean up the stack */

	pop r21                 ; old SP saved in the prolog
	pop r20
	in r0, SREG
	cli                     ; we want to be uninterrupted while updating SP
	out SPL, r20
	out SPH, r21
	out SREG, r0

	clr r1                  ; r1 was the loop counter, restore the zero register
	pop r29
	pop r28
	pop r17
	pop r16
	pop r15
	pop r14
	pop r13
	pop r12
	pop r11
	pop r10
	pop r9
	pop r8
	pop r7
	pop r6
	pop r5
	pop r4
	ret

sha256_kv: ; round-key-vector stored in ProgMem (low word first)
.word	0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c
.word	0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b
.word	0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9
.word	0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429
.word	0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272
.word	0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a
.word	0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e
.word	0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671


;###########################################################

.global sha256_init
;uint32_t sha256_init_vector[]={
;	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
;	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };
;
;void sha256_init(sha256_ctx_t *state){
;	state->length=0;
;	memcpy(state->h, sha256_init_vector, 8*4);
;}
; param1: (r25,r24) 16-bit pointer to sha256_ctx_t struct in ram
; modifies: X(r26,r27), Z(r30,r31), r22, r23
sha256_init:
	movw r26, r24           ; (24,25) --> (26,27) load X with param1
	ldi r30, lo8((sha256_init_vector))
	ldi r31, hi8((sha256_init_vector))
	ldi r22, 32+8           ; eight state words plus the zeroed 64-bit length
sha256_init_vloop:
	lpm r23, Z+
	st X+, r23
	dec r22
	brne sha256_init_vloop
	ret

sha256_init_vector: ; initial state followed by a zero length (low word first)
.word 0xE667, 0x6A09
.word 0xAE85, 0xBB67
.word 0xF372, 0x3C6E
.word 0xF53A, 0xA54F
.word 0x527F, 0x510E
.word 0x688C, 0x9B05
.word 0xD9AB, 0x1F83
.word 0xCD19, 0x5BE0
.word 0x0000, 0x0000
.word 0x0000, 0x0000

;###########################################################

.global rotl32
; === ROTL32 ===
; function that rotates a 32 bit word to the left
;  param1: the 32-bit word to rotate
;          given in r25,r24,r23,r22 (r25 is most significant)
;  param2: an 8-bit value telling how often to rotate
;          given in r20
;  result: the rotated word in r25,r24,r23,r22
; modifies: r20, r21
; bitrotl is an internal entry point that requires r20 < 8
rotl32:
	cpi r20, 8
	brlo bitrotl
	mov r21, r25            ; rotate by a whole byte
	mov r25, r24
	mov r24, r23
	mov r23, r22
	mov r22, r21
	subi r20, 8
	rjmp rotl32
bitrotl:
	clr r21                 ; r21 collects the bits shifted out at the top
	clc
bitrotl_loop:
	tst r20
	breq fixrotl
	rol r22
	rol r23
	rol r24
	rol r25
	rol r21                 ; leaves C == 0 for the next round
	dec r20
	rjmp bitrotl_loop
fixrotl:
	or r22, r21             ; wrap the collected bits into the low byte
	ret


;###########################################################

.global rotr32
; === ROTR32 ===
; function that rotates a 32 bit word to the right
;  param1: the 32-bit word to rotate
;          given in r25,r24,r23,r22 (r25 is most significant)
;  param2: an 8-bit value telling how often to rotate
;          given in r20
;  result: the rotated word in r25,r24,r23,r22
; modifies: r20, r21
; bitrotr is an internal entry point that requires r20 < 8
rotr32:
	cpi r20, 8
	brlo bitrotr
	mov r21, r22            ; rotate by a whole byte
	mov r22, r23
	mov r23, r24
	mov r24, r25
	mov r25, r21
	subi r20, 8
	rjmp rotr32
bitrotr:
	clr r21                 ; r21 collects the bits shifted out at the bottom
	clc
bitrotr_loop:
	tst r20
	breq fixrotr
	ror r25
	ror r24
	ror r23
	ror r22
	ror r21                 ; leaves C == 0 for the next round
	dec r20
	rjmp bitrotr_loop
fixrotr:
	or r25, r21             ; wrap the collected bits into the high byte
	ret


;###########################################################

.global change_endian32
; === change_endian32 ===
; function that changes the endianness of a 32-bit word
;  param1: the 32-bit word
;          given in r25,r24,r23,r22 (r25 is most significant)
;  result: the byte-swapped word in r25,r24,r23,r22
; modifies: r20, r21
change_endian32:
	movw r20, r22           ; (r22,r23) --> (r20,r21)
	mov r22, r25
	mov r23, r24
	mov r24, r21
	mov r25, r20
	ret