3 This file is part of the AVR-Crypto-Lib.
4 Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de)
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>.
22 * License: GPLv3 or later
24 ; sha-256 implementation in assembler
25 SHA256_BLOCK_BITS = 512 ; size of one message block in bits (512/8 = 64 bytes)
26 SHA256_HASH_BITS = 256 ; size of the hash value in bits (8 words of 32 bit)
30 /* push r18 - r27, r30 - r31*/
95 /* X points to Block */
96 .macro dbg_hexdump length
112 ; [h0][h1][h2][h3][h4][h5][h6][h7][length]
113 ; hn is 32 bit large, length is 64 bit large
115 ;###########################################################
117 .global sha256_ctx2hash
118 ; === sha256_ctx2hash ===
119 ; this function converts a state (sha256_ctx: [h0]..[h7][length]) into a
; normal hash (bytestring)
120 ; param1: the 16-bit destination pointer
121 ; given in r25,r24 (r25 is most significant)
122 ; param2: the 16-bit pointer to sha256_ctx structure
; NOTE(review): the register pair for param2 is not stated here; the other
; routines in this file read their second pointer argument from r23,r22 -- confirm.
143 ;###########################################################
147 ; this function calculates SHA-256 hashes from messages in RAM
148 ; param1: the 16-bit hash destination pointer
149 ; given in r25,r24 (r25 is most significant)
150 ; param2: the 16-bit pointer to message
; (read from r23,r22 below)
152 ; param3: 32-bit length value (length of message in bits)
153 ; given in r21,r20,r19,r18
177 movw r8, r18 /* backup of length: this movw copies the low word r19:r18 to r9:r8 */
180 movw r12, r22 /* backup of msg-ptr: r23:r22 copied to r13:r12 */
184 /* if length > 0xffff */
193 rcall sha256_nextBlock /* core routine: hashes one 64 byte block */
208 rcall sha256_lastBlock /* does the padding & Co. for the final block */
213 rcall sha256_ctx2hash /* converts the state into the hash bytestring */
234 ;###########################################################
237 ; block MUST NOT be larger than 64 bytes
239 .global sha256_lastBlock
240 ; === sha256_lastBlock ===
241 ; this function does padding & Co. for calculating SHA-256 hashes
242 ; param1: the 16-bit pointer to sha256_ctx structure
243 ; given in r25,r24 (r25 is most significant)
244 ; param2: a 16-bit pointer to 64 byte block to hash
; (read from r23,r22 below)
246 ; param3: a 16-bit integer specifying length of block in bits
; (r20 is the least significant byte, see below)
248 sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1) ; one block in bytes plus one extra byte
253 brlo sha256_lastBlock_prolog /* NOTE(review): presumably taken when less than one full block is left -- confirm */
260 rcall sha256_nextBlock /* hash one full 64 byte block */
271 rjmp sha256_lastBlock /* and start over with what remains */
272 sha256_lastBlock_prolog:
273 /* allocate space on stack */
284 adiw r30, 1 /* SP points to next free byte on stack, so Z (r31:r30) is advanced by one */
285 mov r18, r20 /* r20 = LSB(length) */
289 bst r21, 0 /* T flag = bit 0 of r21 (presumably bit 8 of the 16-bit length -- confirm) */
290 bld r18, 5 /* bit 8 of the length is bit 5 of length/8; now: r18 == length/8 (aka. length in bytes) */
293 movw r26, r22 /* X points to begin of msg */
295 breq sha256_lastBlock_post_copy
297 sha256_lastBlock_copy_loop:
301 brne sha256_lastBlock_copy_loop
302 sha256_lastBlock_post_copy:
303 sha256_lastBlock_insert_stuffing_bit:
307 and r19, r20 /* if we are in bitmode */
308 breq 2f /* no bitmode */
314 /* maybe we should do some ANDing here, just for safety */
320 /* checking stuff here */
323 rjmp sha256_lastBlock_insert_zeros
325 /* unlucky case: we landed here (NOTE(review): presumably the length field no longer fits into this block -- confirm) */
326 /* first we have to fill it up with zeros */
345 rcall sha256_nextBlock
353 /* now we should subtract 512 from length */
355 adiw r26, 4*8+1 /* we can skip the lowest byte */
367 ; clr r18 /* not necessary ;-) */
368 /* reset Z pointer to begin of block */
370 sha256_lastBlock_insert_zeros:
373 breq sha256_lastBlock_insert_length
376 st Z+, r1 /* r1 is still zero */
380 ; rjmp sha256_lastBlock_epilog
381 sha256_lastBlock_insert_length:
382 movw r26, r24 /* X points to state */
383 adiw r26, 8*4 /* X points to (state.length) */
384 adiw r30, 8 /* Z points one after the last byte of block */
401 rcall sha256_nextBlock
403 sha256_lastBlock_epilog:
407 adiw r30, 63 ; Z += 63 (the immediate of adiw is limited to 63)
408 adiw r30, 1 ; Z += 1, which makes 64 in total
418 ;###########################################################
420 .global sha256_nextBlock
421 ; === sha256_nextBlock ===
422 ; this is the core function for calculating SHA-256 hashes
423 ; param1: the 16-bit pointer to sha256_ctx structure
424 ; given in r25,r24 (r25 is most significant)
425 ; param2: a 16-bit pointer to 64 byte block to hash
; (read from r23,r22 below)
427 sha256_nextBlock_localSpace = (64+8)*4 ; 64 32-bit values for w array and 8 32-bit values for a array (total 288 bytes)
450 /* byteorder: high number <--> high significance */
452 ; initial, let's make some space ready for local vars
453 push r4 /* replace push & pop by mem ops? */
471 movw r18, r20 ;backup SP
472 ; movw r26, r20 ; X points to free space on stack
473 movw r30, r22 ; Z points to message
474 subi r20, lo8(sha256_nextBlock_localSpace) ;sbiw can do only up to 63
475 sbci r21, hi8(sha256_nextBlock_localSpace) ; high byte of the 16-bit subtraction (with borrow)
476 movw r26, r20 ; X points to free space on stack
478 cli ; we want to be uninterrupted while updating SP
485 push r25 /* param1 will be needed later */
486 ; now we fill the w array with message (think about endianness)
489 sha256_nextBlock_wcpyloop:
499 brne sha256_nextBlock_wcpyloop
500 /* for (i=16; i<64; ++i){
501 w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16];
503 /* r25,r24,r23,r22 (r21,r20) are function values (NOTE(review): original listed r24 twice; r22 presumed -- confirm)
504 r19,r18,r17,r16 are the accumulator
505 r15,r14,r13,rBck1 are backup1
506 r11,r10,r9 ,r8 are xor accu
507 r1 is round counter */
511 sha256_nextBlock_wcalcloop:
512 movw r30, r26 ; cp X to Z
514 sbiw r30, 1 ; subtract 64 = 16*4 in total (NOTE(review): sbiw is limited to 63, so this presumably completes a preceding sbiw by 63 -- confirm)
518 ld Accu4, Z+ /* w[i] = w[i-16] */
522 ld Bck4, Z+ /* backup = w[i-15] */
527 mov Func4, Bck1 /* prerotated by 8 */
531 movw XAccu3, Func3 /* store ROTR(w[i-15],7) in xor accu */
533 movw Func3, Bck1 /* prerotated by 16 */
536 eor XAccu1, Func1 /* xor ROTR(w[i-15], 18)*/
540 ldi Func2, 3 /* now shr3 */ /*we can destroy backup now*/
551 eor XAccu4, Bck4 /* xor SHR(w[i-15], 3)*/ /* xor accu == sigma0(w[i-15]) */
555 adc Accu4, XAccu4 /* finished with sigma0 */
556 ldd Func1, Z+7*4 /* now accu += w[i-7] */
564 ldd Bck1, Z+12*4 /* now backup = w[i-2]*/
570 movw Func3, Bck1 /* prerotated by 16 */
574 movw XAccu1, Func1 /* store ROTR(w[i-2], 17) in xor accu */
576 ; movw Func3, Bck1 /* prerotated by 16 */
579 eor XAccu1, Func1 /* xor ROTR(w[i-2], 19)*/
583 ldi Func2, 2 /* now shr10 (dirty trick, skipping a byte: 8 + 2 = 10) */ /*we can destroy backup now*/
592 eor XAccu3, Bck4 /* xor SHR(w[i-2], 10)*/ /* xor accu == sigma1(w[i-2]) */
596 adc Accu4, XAccu4 /* finished with sigma1 */
597 /* now let's store the result (the new w[i]) */
603 breq 3f ; skip if zero
604 rjmp sha256_nextBlock_wcalcloop
606 /* we are finished with the w array; X points one byte past w */
612 ldi r25, 8*4 /* 8 32-bit values to copy from ctx to a array */
619 /* now the real fun begins */
620 /* for (i=0; i<64; ++i){
621 t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i];
622 t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]);
623 memmove(&(a[1]), &(a[0]), 7*4); // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0];
627 /* Y points to a[0], Z ('cause lpm wants it) points to k[i], X points to w[i] */
628 sbiw r26, 8*4 /* X still points at a[7]+1*/
630 ldi r30, lo8(sha256_kv)
631 ldi r31, hi8(sha256_kv) /* Z now points to the constant table sha256_kv */
632 dec r27 /* X - (64*4 == 256) */
636 /* now calculate t1 */
637 /*CH(x,y,z) = (x&y)^((~x)&z)*/
641 ldd T4, Y+5*4+3 /* y in T */
645 ldd Func4, Y+4*4+3 /* x in Func */
649 ldd Bck4, Y+6*4+3 /* z in Bck */
665 eor T4, Bck4 /* done, CH(x,y,z) is in T */
666 /* now SIGMA1(a[4]) */
667 ldd Bck4, Y+4*4 /* think about using it from Func reg above*/
670 ldd Bck3, Y+4*4+3 /* load prerotate by 8-bit */
674 rcall bitrotl /* rotr(x,6) */
680 rcall bitrotr /* rotr(x,11) */
685 movw Func1, Bck3 /* this prerotates further 16 bits*/
686 movw Func3, Bck1 /* so we have now prerotated by 24 bits*/
688 rcall bitrotr /* rotr(x,25) -- NOTE(review): original comment repeated "rotr(x,11)"; with 24 bits prerotated the third SIGMA1 term is ROTR 25 -- confirm */
692 eor XAccu4, Func4 /* finished with SIGMA1, add it to T */
697 /* now we've to add a[7], w[i] and k[i] */
705 adc T4, XAccu4 /* add a[7] */
713 adc T4, XAccu4 /* add w[i] */
721 adc T4, XAccu4 /* add k[i] */ /* finished with t1 */
722 /*now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */ /*i did too much x86 asm, i always see 4 32bit regs*/
723 /* starting with MAJ(x,y,z) */
727 ldd Func4, Y+4*0+3 /* load x=a[0] */
731 ldd XAccu4, Y+4*1+3 /* load y=a[1] */
735 and XAccu4, Func4 /* XAccu == (x & y) */
739 ldd Bck4, Y+4*2+3 /* load z=a[2] */
747 eor XAccu4, Func4 /* XAccu == (x & y) ^ (x & z) */
751 ldd Func4, Y+4*1+3 /* load y=a[1] */
759 eor XAccu4, Func4 /* XAccu == Maj(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */
761 ldd Bck1, Y+4*0+0 /* we should combine this with above */
770 movw Accu3, Func3 /* Accu = shr(a[0], 2) -- NOTE(review): SIGMA0 is built from rotations (ROTR 2/13/22), so "shr" in these comments presumably means rotate right -- confirm */
772 movw Func3, Bck1 /* prerotate by 16 bits */
778 eor Accu4, Func4 /* Accu ^= shr(a[0], 13) */
782 mov Func4, Bck3 /* prerotate by 24 bits */
788 eor Accu4, Func4 /* Accu ^= shr(a[0], 22) */
789 add Accu1, XAccu1 /* add previous result (MAJ)*/
793 /* now we are finished with the computing stuff (t1 in T, t2 in Accu)*/
794 /* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */
799 ld r25, -Y /* warning: this is PREdecrement */
823 std Y+4*0+3, Accu4 /* a array updated */
828 rjmp sha256_main_loop ;brne sha256_main_loop
831 /* pointers to state should still exist on the stack ;-) */
853 brne update_state_loop
854 /* now we just have to update the length */
855 adiw r30, 1 /* since we add 512, we can simply skip the LSB */
862 sha256_nextBlock_fix_length:
863 brcc sha256_nextBlock_epilog /* no carry left: the length is up to date */
868 brne sha256_nextBlock_fix_length
871 sha256_nextBlock_epilog:
872 /* now we should clean up the stack */
877 cli ; we want to be uninterrupted while updating SP
901 sha256_kv: ; round-key-vector (the 64 round constants k[i]) stored in ProgMem
; Each 32-bit constant is emitted as two 16-bit words, low word first,
; e.g. "0x2f98, 0x428a" encodes k[0] = 0x428a2f98.
; 8 rows * 16 words = 128 words = 64 constants.
902 .word 0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c
903 .word 0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b
904 .word 0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9
905 .word 0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429
906 .word 0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272
907 .word 0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a
908 .word 0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e
909 .word 0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671
912 ;###########################################################
915 ;uint32_t sha256_init_vector[]={
916 ; 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
917 ; 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };
919 ;void sha256_init(sha256_ctx_t *state){
921 ; memcpy(state->h, sha256_init_vector, 8*4);
923 ; param1: (r25,r24) 16-bit pointer to sha256_ctx_t struct in ram
924 ; modifies: X(r26,r27), Z(r30,r31), Func1, r22
926 movw r26, r24 ; (24,25) --> (26,27) load X with param1
927 ldi r30, lo8((sha256_init_vector)) ; Z low byte = address of the init vector
928 ldi r31, hi8((sha256_init_vector)) ; Z high byte; Z now points to sha256_init_vector
934 brne sha256_init_vloop
949 ;###########################################################
953 ; function that rotates a 32 bit word to the left
954 ; param1: the 32-bit word to rotate
955 ; given in r25,r24,r23,r22 (r25 is most significant)
956 ; param2: an 8-bit value telling how often to rotate
987 ;###########################################################
991 ; function that rotates a 32 bit word to the right
992 ; param1: the 32-bit word to rotate
993 ; given in r25,r24,r23,r22 (r25 is most significant)
994 ; param2: an 8-bit value telling how often to rotate
1025 ;###########################################################
1027 .global change_endian32
1028 ; === change_endian32 ===
1029 ; function that changes the endianness of a 32-bit word
1030 ; param1: the 32-bit word
1031 ; given in r25,r24,r23,r22 (r25 is most significant)
1034 movw r20, r22 ; (r22,r23) --> (r20,r21): keep a copy of the low word