3 This file is part of the AVR-Crypto-Lib.
4 Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de)
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>.
22 * License: GPLv3 or later
24 ; SHA-256 implementation in AVR assembler
25 SHA256_BLOCK_BITS = 512 ; size of one message block in bits (64 bytes)
26 SHA256_HASH_BITS = 256 ; size of the resulting hash value in bits (32 bytes)
30 /* push r18 - r27, r30 - r31*/
95 /* X points to Block */
; NOTE(review): only the macro header is visible here. Judging by the name and
; the `length` argument this is a debugging aid that dumps `length` bytes
; starting at X -- confirm against the macro body.
96 .macro dbg_hexdump length
112 ; [h0][h1][h2][h3][h4][h5][h6][h7][length]
113 ; each hn is 32 bits wide, length is 64 bits wide
115 ;###########################################################
117 .global sha256_ctx2hash
118 ; === sha256_ctx2hash ===
119 ; this function converts a hash state (sha256_ctx) into a normal hash (byte string)
120 ; param1: the 16-bit destination pointer (where the hash bytes are written)
121 ; given in r25,r24 (r25 is most significant)
122 ; param2: the 16-bit pointer to the sha256_ctx structure holding the state
143 ;###########################################################
147 ; this function calculates the SHA-256 hash of a message located in RAM
148 ; param1: the 16-bit hash destination pointer
149 ; given in r25,r24 (r25 is most significant)
150 ; param2: the 16-bit pointer to the message
152 ; param3: 32-bit length value (length of message in bits)
153 ; given in r21,r20,r19,r18
177 movw r8, r18 /* backup of length: copies r19:r18 into r9:r8 */
180 movw r12, r22 /* backup of msg-ptr: copies r23:r22 into r13:r12 */
184 /* if length > 0xffff */
193 rcall sha256_nextBlock /* core function: hashes one 64 byte block */
208 rcall sha256_lastBlock /* padding & Co. for the final part of the message */
213 rcall sha256_ctx2hash /* convert the state into the hash byte string */
234 ;###########################################################
237 ; block MUST NOT be larger than 64 bytes
; NOTE(review): the entry code below branches on a comparison and otherwise calls
; sha256_nextBlock and restarts, which looks like handling for longer input --
; confirm whether the restriction above still applies.
239 .global sha256_lastBlock
240 ; === sha256_lastBlock ===
241 ; this function does padding & Co. for calculating SHA-256 hashes
242 ; param1: the 16-bit pointer to sha256_ctx structure
243 ; given in r25,r24 (r25 is most significant)
244 ; param2: a 16-bit pointer to 64 byte block to hash
246 ; param3: a 16-bit integer specifying length of block in bits
248 sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1)
253 brlo sha256_lastBlock_prolog ; lower: go straight to the padding code (the compare is not shown here)
260 rcall sha256_nextBlock ; otherwise hash one full block first ...
271 rjmp sha256_lastBlock ; ... and start over (presumably with the remaining data)
272 sha256_lastBlock_prolog:
273 /* allocate space on stack */
284 adiw r30, 1 /* SP points to next free byte on stack, so step Z up by one (Z presumably holds the new SP) */
285 mov r18, r20 /* r18 = r20 = LSB(length) */
289 bst r21, 0 /* T flag = bit 0 of r21, i.e. bit 8 of the 16-bit length */
290 bld r18, 5 /* bit 8 of length lands in bit 5 (256/8 == 32); now: r18 == length/8 (aka. length in bytes) */
293 movw r26, r22 /* X points to begin of msg */
295 breq sha256_lastBlock_post_copy ; zero: skip the copy loop
297 sha256_lastBlock_copy_loop:
301 brne sha256_lastBlock_copy_loop
302 sha256_lastBlock_post_copy:
303 sha256_lastBlock_insert_stuffing_bit:
307 and r19, r20 /* result is non-zero if we are in bitmode */
308 breq 2f /* no bitmode */
314 /* maybe we should do some ANDing here, just for safety */
320 /* checking stuff here */
323 rjmp sha256_lastBlock_insert_zeros
325 /* the awkward case: we landed here */
326 /* first we have to fill it up with zeros */
345 rcall sha256_nextBlock
353 /* now we should subtract 512 from length */
355 adiw r26, 4*8+1 /* we can skip the lowest byte */
367 ; clr r18 /* not necessary ;-) */
368 /* reset Z pointer to begin of block */
370 sha256_lastBlock_insert_zeros:
373 breq sha256_lastBlock_insert_length ; zero: continue with inserting the length
376 st Z+, r1 /* append a zero byte; r1 is still zero */
380 ; rjmp sha256_lastBlock_epilog
381 sha256_lastBlock_insert_length:
382 movw r26, r24 /* X points to state */
383 adiw r26, 8*4 /* X points to (state.length), which follows the eight 32-bit h values */
384 adiw r30, 8 /* Z points one after the last byte of block */
401 rcall sha256_nextBlock
403 sha256_lastBlock_epilog:
407 adiw r30, 63 ; adiw takes an immediate of at most 63, so 64 is added in two steps
408 adiw r30, 1 ; second step: Z has now been advanced by 64 in total
417 ;###########################################################
419 .global sha256_nextBlock
420 ; === sha256_nextBlock ===
421 ; this is the core function for calculating SHA-256 hashes
422 ; param1: the 16-bit pointer to sha256_ctx structure
423 ; given in r25,r24 (r25 is most significant)
424 ; param2: a 16-bit pointer to 64 byte block to hash
426 sha256_nextBlock_localSpace = (64+8)*4 ; 64 32-bit values for w array and 8 32-bit values for a array (288 bytes in total)
449 /* byteorder: high number <--> high significance */
451 ; initial, let's make some space ready for local vars
452 push r4 /* replace push & pop by mem ops? */
470 movw r18, r20 ;backup SP
471 ; movw r26, r20 ; X points to free space on stack
472 movw r30, r22 ; Z points to message
473 subi r20, lo8(sha256_nextBlock_localSpace) ;sbiw can do only up to 63
474 sbci r21, hi8(sha256_nextBlock_localSpace) ; subtract the high byte including the borrow
475 movw r26, r20 ; X points to free space on stack
477 cli ; we want to be uninterrupted while updating SP
484 push r25 /* param1 will be needed later */
485 ; now we fill the w array with message (think about endianness)
488 sha256_nextBlock_wcpyloop:
498 brne sha256_nextBlock_wcpyloop
499 /* for (i=16; i<64; ++i){
500 w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16];
502 /* r25,r24,r23,r22 (r21,r20) are function values
503 r19,r18,r17,r16 are the accumulator
504 r15,r14,r13,rBck1 are backup1
505 r11,r10,r9 ,r8 are xor accu
506 r1 is round counter */
510 sha256_nextBlock_wcalcloop:
511 movw r30, r26 ; cp X to Z
513 sbiw r30, 1 ; subtract 64 = 16*4 in total; sbiw can do only up to 63, so this is presumably the second of two steps
517 ld Accu4, Z+ /* w[i] = w[i-16] */
521 ld Bck4, Z+ /* backup = w[i-15] */
526 mov Func4, Bck1 /* prerotated by 8 */
530 movw XAccu3, Func3 /* store ROTR(w[i-15],7) in xor accu */
532 movw Func3, Bck1 /* prerotated by 16 */
535 eor XAccu1, Func1 /* xor ROTR(w[i-15], 18)*/
539 ldi Func2, 3 /* now shr3 */ /*we can destroy backup now*/
550 eor XAccu4, Bck4 /* xor SHR(w[i-15], 3)*/ /* xor accu == sigma0(w[i-15]) */
554 adc Accu4, XAccu4 /* finished with sigma0 */
555 ldd Func1, Z+7*4 /* now accu += w[i-7] */
563 ldd Bck1, Z+12*4 /* now backup = w[i-2]*/
569 movw Func3, Bck1 /* prerotated by 16 */
573 movw XAccu1, Func1 /* store ROTR(w[i-2], 17) in xor accu */
575 ; movw Func3, Bck1 /* prerotated by 16 */
578 eor XAccu1, Func1 /* xor ROTR(w[i-2], 19)*/
582 ldi Func2, 2 /* now shr10 (dirty trick, skipping a byte) */ /*we can destroy backup now*/
591 eor XAccu3, Bck4 /* xor SHR(w[i-2], 10)*/ /* xor accu == sigma1(w[i-2]) */
595 adc Accu4, XAccu4 /* finished with sigma1 */
596 /* now let's store the result */
602 breq 3f ; skip if zero
603 rjmp sha256_nextBlock_wcalcloop ; otherwise go round the w calculation loop again
605 /* we are finished with w array X points one byte post w */
611 ldi r25, 8*4 /* 8 32-bit values to copy from ctx to a array */
618 /* now the real fun begins */
619 /* for (i=0; i<64; ++i){
620 t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i];
621 t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]);
622 memmove(&(a[1]), &(a[0]), 7*4); // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0];
626 /* Y points to a[0], Z ('cause lpm wants it) points to k[i], X points to w[i] */
627 sbiw r26, 8*4 /* X still points at a[7]+1; step back over the a array */
629 ldi r30, lo8(sha256_kv) ; Z (low byte) = address of the round constant table
630 ldi r31, hi8(sha256_kv) ; Z (high byte)
631 dec r27 /* X - (64*4 == 256): decrementing the high byte of X subtracts 256 */
635 /* now calculate t1 */
636 /*CH(x,y,z) = (x&y)^((~x)&z)*/
640 ldd T4, Y+5*4+3 /* y in T */
644 ldd Func4, Y+4*4+3 /* x in Func */
648 ldd Bck4, Y+6*4+3 /* z in Bck */
664 eor T4, Bck4 /* done, CH(x,y,z) is in T */
665 /* now SIGMA1(a[4]) */
666 ldd Bck4, Y+4*4 /* think about using it from Func reg above*/
669 ldd Bck3, Y+4*4+3 /* load prerotate by 8-bit */
673 rcall bitrotl /* rotr(x,6): prerotated right by 8, then rotated back to the left */
679 rcall bitrotr /* rotr(x,11) */
684 movw Func1, Bck3 /* this prerotates a further 16 bits*/
685 movw Func3, Bck1 /* so we have now prerotated by 24 bits*/
687 rcall bitrotr /* rotr(x,25): third term of SIGMA1, on top of the 24-bit prerotation */
691 eor XAccu4, Func4 /* finished with SIGMA1, add it to T */
696 /* now we've to add a[7], w[i] and k[i] */
704 adc T4, XAccu4 /* add a[7] */
712 adc T4, XAccu4 /* add w[i] */
720 adc T4, XAccu4 /* add k[i] */ /* finished with t1 */
721 /*now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */ /*i did too much x86 asm, i always see 4 32bit regs*/
722 /* starting with MAJ(x,y,z) */
726 ldd Func4, Y+4*0+3 /* load x=a[0] */
730 ldd XAccu4, Y+4*1+3 /* load y=a[1] */
734 and XAccu4, Func4 /* XAccu == (x & y) */
738 ldd Bck4, Y+4*2+3 /* load z=a[2] */
746 eor XAccu4, Func4 /* XAccu == (x & y) ^ (x & z) */
750 ldd Func4, Y+4*1+3 /* load y=a[1] */
758 eor XAccu4, Func4 /* XAccu == Maj(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */
760 ldd Bck1, Y+4*0+0 /* we should combine this with above */
769 movw Accu3, Func3 /* Accu = rotr(a[0], 2) */
771 movw Func3, Bck1 /* prerotate by 16 bits */
777 eor Accu4, Func4 /* Accu ^= rotr(a[0], 13) */
781 mov Func4, Bck3 /* prerotate by 24 bits */
787 eor Accu4, Func4 /* Accu ^= rotr(a[0], 22) */
788 add Accu1, XAccu1 /* add previous result (MAJ)*/
792 /* now we are finished with the computing stuff (t1 in T, t2 in Accu)*/
793 /* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */
798 ld r25, -Y /* warning: this is PREdecrement */
822 std Y+4*0+3, Accu4 /* a array updated */
827 rjmp sha256_main_loop ;brne sha256_main_loop
830 /* pointers to state should still exist on the stack ;-) */
852 brne update_state_loop ; non-zero: go round update_state_loop again
853 /* now we just have to update the length */
854 adiw r30, 1 /* since we add 512 (== 0x0200, low byte is zero), we can simply skip the LSB */
861 sha256_nextBlock_fix_length:
862 brcc sha256_nextBlock_epilog ; carry clear: nothing left to propagate into the higher length bytes
867 brne sha256_nextBlock_fix_length
870 sha256_nextBlock_epilog:
871 /* now we should clean up the stack */
876 cli ; we want to be uninterrupted while updating SP
899 sha256_kv: ; round-key-vector stored in ProgMem
; The 64 32-bit SHA-256 round constants k[0]..k[63]. Each constant is written as
; two 16-bit words, low half first (e.g. k[0] = 0x428a2f98 -> 0x2f98, 0x428a),
; eight constants per line.
900 .word 0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c
901 .word 0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b
902 .word 0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9
903 .word 0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429
904 .word 0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272
905 .word 0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a
906 .word 0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e
907 .word 0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671
910 ;###########################################################
913 ;uint32_t sha256_init_vector[]={
914 ; 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
915 ; 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };
917 ;void sha256_init(sha256_ctx_t *state){
919 ; memcpy(state->h, sha256_init_vector, 8*4);
921 ; param1: (r25,r24) 16-bit pointer to sha256_ctx_t struct in ram
922 ; modifies: X(r26,r27), Z(r30,r31), Func1, r22
924 movw r26, r24 ; (24,25) --> (26,27) load X with param1
925 ldi r30, lo8((sha256_init_vector)) ; Z (low byte) = address of sha256_init_vector
926 ldi r31, hi8((sha256_init_vector)) ; Z (high byte)
932 brne sha256_init_vloop ; non-zero: go round the copy loop again (loop body not shown here)
947 ;###########################################################
951 ; function that rotates a 32 bit word to the left
952 ; param1: the 32-bit word to rotate
953 ; given in r25,r24,r23,r22 (r25 is most significant)
954 ; param2: an 8-bit value telling how often to rotate
986 ;###########################################################
990 ; function that rotates a 32 bit word to the right
991 ; param1: the 32-bit word to rotate
992 ; given in r25,r24,r23,r22 (r25 is most significant)
993 ; param2: an 8-bit value telling how often to rotate
1025 ;###########################################################
1027 .global change_endian32
1028 ; === change_endian32 ===
1029 ; function that changes the endianness of a 32-bit word
1030 ; param1: the 32-bit word
1031 ; given in r25,r24,r23,r22 (r25 is most significant)
1034 movw r20, r22 ; (r22,r23) --> (r20,r21)