This file is part of the AVR-Crypto-Lib.
Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de)

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

* License: GPLv3 or later

; SHA-256 implementation in assembler
SHA256_BLOCK_BITS = 512
SHA256_HASH_BITS = 256
/* push r18 - r27, r30 - r31 */
/* X points to Block */
.macro dbg_hexdump length
; [h0][h1][h2][h3][h4][h5][h6][h7][length]
; each hn is 32 bits wide, length is 64 bits wide
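; as a C struct this layout is roughly (a sketch; the field names are
; assumptions, only the byte layout above is given by this file):
;   typedef struct {
;       uint32_t h[8];    /* state words h0..h7 */
;       uint64_t length;  /* number of message bits processed so far */
;   } sha256_ctx_t;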
;###########################################################
.global sha256_ctx2hash
; === sha256_ctx2hash ===
; this function converts a state into a normal hash (bytestring)
; param1: the 16-bit destination pointer
;         given in r25,r24 (r25 is most significant)
; param2: the 16-bit pointer to sha256_ctx structure
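; roughly equivalent C (a sketch, using the ctx layout sketched above; the
; digest is the eight state words written out big-endian):
;   void sha256_ctx2hash(void *dest, const sha256_ctx_t *state){
;       uint8_t *d = (uint8_t*)dest;
;       for (uint8_t i = 0; i < 8; ++i){
;           d[4*i+0] = (uint8_t)(state->h[i] >> 24);
;           d[4*i+1] = (uint8_t)(state->h[i] >> 16);
;           d[4*i+2] = (uint8_t)(state->h[i] >>  8);
;           d[4*i+3] = (uint8_t)(state->h[i] >>  0);
;       }
;   }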
;###########################################################
; this function calculates SHA-256 hashes from messages in RAM
; param1: the 16-bit hash destination pointer
;         given in r25,r24 (r25 is most significant)
; param2: the 16-bit pointer to message
; param3: 32-bit length value (length of message in bits)
;         given in r21,r20,r19,r18
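; the control flow below in C (a sketch; the parameters are as documented
; above, the local ctx and the pointer stepping are assumptions):
;   void sha256(void *dest, const void *msg, uint32_t length_b){
;       sha256_ctx_t ctx;
;       sha256_init(&ctx);
;       while (length_b >= 512){
;           sha256_nextBlock(&ctx, msg);
;           msg = (const uint8_t*)msg + 512/8;
;           length_b -= 512;
;       }
;       sha256_lastBlock(&ctx, msg, (uint16_t)length_b);
;       sha256_ctx2hash(dest, &ctx);
;   }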
    movw r8, r18 /* backup of length */
    movw r12, r22 /* backup of msg-ptr */
    /* if length >= 512 */
    rcall sha256_nextBlock
    rcall sha256_lastBlock
    rcall sha256_ctx2hash
;###########################################################
; block MUST NOT be larger than 64 bytes
.global sha256_lastBlock
; === sha256_lastBlock ===
; this function does padding & Co. for calculating SHA-256 hashes
; param1: the 16-bit pointer to sha256_ctx structure
;         given in r25,r24 (r25 is most significant)
; param2: a 16-bit pointer to the 64-byte block to hash
; param3: a 16-bit integer specifying the length of the block in bits
sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1)
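; the padding scheme in C (a sketch of the byte-aligned case with fewer than
; 512 bits; the asm below also handles a full 64-byte final block and messages
; whose length is not a multiple of 8 bits):
;   void sha256_lastBlock(sha256_ctx_t *state, const void *block, uint16_t length_b){
;       uint8_t buf[64] = {0};
;       uint64_t total_b = state->length + length_b; /* bits hashed overall */
;       uint16_t len_B = length_b / 8;
;       memcpy(buf, block, len_B);
;       buf[len_B] = 0x80;              /* the stuffing bit: a single '1' */
;       if (len_B + 1 + 8 > 64){        /* no room left for the length field */
;           sha256_nextBlock(state, buf);
;           memset(buf, 0, 64);
;       }
;       for (uint8_t i = 0; i < 8; ++i) /* 64-bit big-endian bit count */
;           buf[63 - i] = (uint8_t)(total_b >> (8*i));
;       sha256_nextBlock(state, buf);
;   }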
    brne sha256_lastBlock_prolog
    brne sha256_lastBlock_prolog
    rcall sha256_nextBlock
sha256_lastBlock_prolog:
    /* allocate space on stack */
    adiw r30, 1 /* SP points to next free byte on stack */
    mov r18, r20 /* r20 = LSB(length) */
    bst r21, 0 /* bit 8 of the bit length ... */
    bld r18, 5 /* ... becomes bit 5 of the byte count; now: r18 == length/8 (aka. length in bytes) */
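    /* e.g. length = 504 bits (r21:r20 = 0x01F8): bits 3..7 of the low byte
       give 0x1F, bit 8 of the length lands in bit 5, so r18 = 0x3F = 63 bytes */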
    movw r26, r22 /* X points to begin of msg */
    breq sha256_lastBlock_post_copy
sha256_lastBlock_copy_loop:
    brne sha256_lastBlock_copy_loop
sha256_lastBlock_post_copy:
sha256_lastBlock_insert_stuffing_bit:
    and r19, r20 /* if we are in bitmode */
    breq 2f /* no bitmode */
    /* maybe we should do some ANDing here, just for safety */
    /* checking stuff here */
    rjmp sha256_lastBlock_insert_zeros
    /* bad case: the stuffing bit and the length field no longer fit into this block */
    /* first we have to fill it up with zeros */
    rcall sha256_nextBlock
    /* now we should subtract 512 from length */
    adiw r26, 4*8+1 /* we can skip the lowest byte */
;   clr r18 /* not necessary ;-) */
    /* reset Z pointer to begin of block */
sha256_lastBlock_insert_zeros:
    breq sha256_lastBlock_insert_length
    st Z+, r1 /* r1 is still zero */
;   rjmp sha256_lastBlock_epilog
sha256_lastBlock_insert_length:
    movw r26, r24 /* X points to state */
    adiw r26, 8*4 /* X points to (state.length) */
    adiw r30, 8 /* Z points one after the last byte of block */
    rcall sha256_nextBlock
sha256_lastBlock_epilog:
    adiw r30, 63 ; add 64 in two steps,
    adiw r30, 1  ; since adiw takes at most 63
;###########################################################
.global sha256_nextBlock
; === sha256_nextBlock ===
; this is the core function for calculating SHA-256 hashes
; param1: the 16-bit pointer to sha256_ctx structure
;         given in r25,r24 (r25 is most significant)
; param2: a 16-bit pointer to the 64-byte block to hash
sha256_nextBlock_localSpace = (64+8)*4 ; 64 32-bit values for the w array and 8 32-bit values for the a array (288 bytes total)
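; what this routine computes, as a C reference sketch of SHA-256 (FIPS 180);
; k[] stands for the sha256_kv round constants below, the usual
; <stdint.h>/<string.h> helpers are assumed, and the asm works in place on the
; stack instead of using these arrays:
;   #define ROTR(x,n) (((x)>>(n)) | ((x)<<(32-(n))))
;   void sha256_nextBlock(sha256_ctx_t *state, const void *block){
;       uint32_t w[64], a[8], t1, t2;
;       const uint8_t *m = (const uint8_t*)block;
;       for (uint8_t i = 0; i < 16; ++i)    /* big-endian word load */
;           w[i] = ((uint32_t)m[4*i]<<24) | ((uint32_t)m[4*i+1]<<16)
;                | ((uint32_t)m[4*i+2]<<8) |  (uint32_t)m[4*i+3];
;       for (uint8_t i = 16; i < 64; ++i)   /* message schedule expansion */
;           w[i] = (ROTR(w[i-2],17) ^ ROTR(w[i-2],19) ^ (w[i-2]>>10)) + w[i-7]
;                + (ROTR(w[i-15],7) ^ ROTR(w[i-15],18) ^ (w[i-15]>>3)) + w[i-16];
;       memcpy(a, state->h, 8*4);
;       for (uint8_t i = 0; i < 64; ++i){   /* 64 rounds */
;           t1 = a[7] + (ROTR(a[4],6) ^ ROTR(a[4],11) ^ ROTR(a[4],25))
;              + ((a[4] & a[5]) ^ ((~a[4]) & a[6])) + k[i] + w[i];
;           t2 = (ROTR(a[0],2) ^ ROTR(a[0],13) ^ ROTR(a[0],22))
;              + ((a[0] & a[1]) ^ (a[0] & a[2]) ^ (a[1] & a[2]));
;           memmove(&a[1], &a[0], 7*4);
;           a[4] += t1;
;           a[0]  = t1 + t2;
;       }
;       for (uint8_t i = 0; i < 8; ++i)     /* feed-forward into the state */
;           state->h[i] += a[i];
;       state->length += 512;
;   }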
    /* byteorder: high number <--> high significance */
    ; first, let's set up some space for the local vars
    push r4 /* replace push & pop by mem ops? */
    movw r18, r20 ; backup SP
;   movw r26, r20 ; X points to free space on stack
    movw r30, r22 ; Z points to message
    subi r20, lo8(sha256_nextBlock_localSpace) ; sbiw can only handle up to 63
    sbci r21, hi8(sha256_nextBlock_localSpace)
    movw r26, r20 ; X points to free space on stack
    cli ; we want to be uninterrupted while updating SP
    push r25 /* param1 will be needed later */
    ; now we fill the w array with the message (think about endianness)
sha256_nextBlock_wcpyloop:
    brne sha256_nextBlock_wcpyloop
/* for (i=16; i<64; ++i){
    w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16];
/* r25,r24,r23,r22 (r21,r20) are function values
   r19,r18,r17,r16 are the accumulator
   r15,r14,r13,r12 are backup1
   r11,r10,r9,r8 are xor accu
   r1 is round counter */
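; for reference: SIGMA_a(x) = ROTR(x,7) ^ ROTR(x,18) ^ (x>>3) and
; SIGMA_b(x) = ROTR(x,17) ^ ROTR(x,19) ^ (x>>10) -- the small sigma0 and
; sigma1 of the SHA-256 spec, implemented by the rotate/shift sequence below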
sha256_nextBlock_wcalcloop:
    movw r30, r26 ; cp X to Z
    sbiw r30, 1 ; subtract 64 = 16*4
    ld Accu4, Z+ /* w[i] = w[i-16] */
    ld Bck4, Z+ /* backup = w[i-15] */
    mov Func4, Bck1 /* prerotated by 8 */
    movw XAccu3, Func3 /* store ROTR(w[i-15],7) in xor accu */
    movw Func3, Bck1 /* prerotated by 16 */
    eor XAccu1, Func1 /* xor ROTR(w[i-15],18) */
    ldi Func2, 3 /* now shr3 */ /* we can destroy the backup now */
    eor XAccu4, Bck4 /* xor SHR(w[i-15],3) */ /* xor accu == sigma0(w[i-15]) */
    adc Accu4, XAccu4 /* finished with sigma0 */
    ldd Func1, Z+7*4 /* now accu += w[i-7] */
    ldd Bck1, Z+12*4 /* now backup = w[i-2] */
    movw Func3, Bck1 /* prerotated by 16 */
    movw XAccu1, Func1 /* store ROTR(w[i-2],17) in xor accu */
;   movw Func3, Bck1 /* prerotated by 16 */
    eor XAccu1, Func1 /* xor ROTR(w[i-2],19) */
    ldi Func2, 2 /* now shr10 (dirty trick, skipping a byte) */ /* we can destroy the backup now */
    eor XAccu3, Bck4 /* xor SHR(w[i-2],10) */ /* xor accu == sigma1(w[i-2]) */
    adc Accu4, XAccu4 /* finished with sigma1 */
    /* now store the new w[i] */
    breq 3f ; skip if zero
    rjmp sha256_nextBlock_wcalcloop
    /* we are finished with the w array; X points one byte past w */
    ldi r25, 8*4 /* 8 32-bit values to copy from ctx to a array */
    /* now the real fun begins */
/* for (i=0; i<64; ++i){
    t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i];
    t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]);
    memmove(&(a[1]), &(a[0]), 7*4); // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0];
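; for reference: SIGMA1(x) = ROTR(x,6) ^ ROTR(x,11) ^ ROTR(x,25),
; SIGMA0(x) = ROTR(x,2) ^ ROTR(x,13) ^ ROTR(x,22), CH(x,y,z) = (x&y) ^ ((~x)&z),
; MAJ(x,y,z) = (x&y) ^ (x&z) ^ (y&z)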
    /* Y points to a[0], Z ('cause lpm wants it) points to k[i], X points to w[i] */
    sbiw r26, 8*4 /* X still points at a[7]+1 */
    ldi r30, lo8(sha256_kv)
    ldi r31, hi8(sha256_kv)
    dec r27 /* X - (64*4 == 256) */
    /* now calculate t1 */
    /* CH(x,y,z) = (x&y)^((~x)&z) */
    ldd T4, Y+5*4+3 /* y in T */
    ldd Func4, Y+4*4+3 /* x in Func */
    ldd Bck4, Y+6*4+3 /* z in Bck */
    eor T4, Bck4 /* done, CH(x,y,z) is in T */
    /* now SIGMA1(a[4]) */
    ldd Bck4, Y+4*4 /* think about using it from Func reg above */
    ldd Bck3, Y+4*4+3 /* load prerotated by 8 bits */
    rcall bitrotl /* rotr(x,6) */
    rcall bitrotr /* rotr(x,11) */
    movw Func1, Bck3 /* this prerotates a further 16 bits */
    movw Func3, Bck1 /* so we have now prerotated by 24 bits */
    rcall bitrotr /* rotr(x,25) */
    eor XAccu4, Func4 /* finished with SIGMA1, add it to T */
    /* now we have to add a[7], w[i] and k[i] */
    adc T4, XAccu4 /* add a[7] */
    adc T4, XAccu4 /* add w[i] */
    adc T4, XAccu4 /* add k[i] */ /* finished with t1 */
    /* now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */ /* I did too much x86 asm, I always see 4 32-bit regs */
    /* starting with MAJ(x,y,z) */
    ldd Func4, Y+4*0+3 /* load x=a[0] */
    ldd XAccu4, Y+4*1+3 /* load y=a[1] */
    and XAccu4, Func4 /* XAccu == (x & y) */
    ldd Bck4, Y+4*2+3 /* load z=a[2] */
    eor XAccu4, Func4 /* XAccu == (x & y) ^ (x & z) */
    ldd Func4, Y+4*1+3 /* load y=a[1] */
    eor XAccu4, Func4 /* XAccu == MAJ(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */
    ldd Bck1, Y+4*0+0 /* we should combine this with above */
    movw Accu3, Func3 /* Accu = rotr(a[0], 2) */
    movw Func3, Bck1 /* prerotate by 16 bits */
    eor Accu4, Func4 /* Accu ^= rotr(a[0], 13) */
    mov Func4, Bck3 /* prerotate by 24 bits */
    eor Accu4, Func4 /* Accu ^= rotr(a[0], 22) */
    add Accu1, XAccu1 /* add previous result (MAJ) */
    /* now we are finished with the computing stuff (t1 in T, t2 in Accu) */
    /* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */
    ld r25, -Y /* warning: this is PREdecrement */
    std Y+4*0+3, Accu4 /* a array updated */
    rjmp sha256_main_loop ; brne sha256_main_loop
    /* pointers to state should still exist on the stack ;-) */
    brne update_state_loop
    /* now we just have to update the length */
    adiw r30, 1 /* since we add 512, we can simply skip the LSB */
sha256_nextBlock_fix_length:
    brcc sha256_nextBlock_epilog
    brne sha256_nextBlock_fix_length
sha256_nextBlock_epilog:
    /* now we should clean up the stack */
    cli ; we want to be uninterrupted while updating SP
sha256_kv: ; round-key-vector stored in ProgMem
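; each 32-bit round constant is stored as two .word halves, low halfword first;
; e.g. the first constant 0x428a2f98 appears below as 0x2f98, 0x428a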
    .word 0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c
    .word 0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b
    .word 0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9
    .word 0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429
    .word 0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272
    .word 0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a
    .word 0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e
    .word 0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671
;###########################################################
; uint32_t sha256_init_vector[]={
;     0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
;     0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };
; void sha256_init(sha256_ctx_t *state){
;     memcpy(state->h, sha256_init_vector, 8*4);
; param1: (r25,r24) 16-bit pointer to sha256_ctx_t struct in ram
; modifies: Z(r30,r31), Func1, r22
    movw r26, r24 ; (r24,r25) --> (r26,r27) load X with param1
    ldi r30, lo8((sha256_init_vector))
    ldi r31, hi8((sha256_init_vector))
    brne sha256_init_vloop
;###########################################################
; function that rotates a 32-bit word to the left
; param1: the 32-bit word to rotate
;         given in r25,r24,r23,r22 (r25 is most significant)
; param2: an 8-bit value telling how often to rotate
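; in C this is roughly (a sketch of the semantics only):
;   uint32_t bitrotl(uint32_t x, uint8_t n){
;       return (x << n) | (x >> (32 - n)); /* assumes 0 < n < 32 */
;   }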
;###########################################################
; function that rotates a 32-bit word to the right
; param1: the 32-bit word to rotate
;         given in r25,r24,r23,r22 (r25 is most significant)
; param2: an 8-bit value telling how often to rotate
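; in C this is roughly (a sketch of the semantics only):
;   uint32_t bitrotr(uint32_t x, uint8_t n){
;       return (x >> n) | (x << (32 - n)); /* assumes 0 < n < 32 */
;   }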
;###########################################################
.global change_endian32
; === change_endian32 ===
; function that changes the endianness of a 32-bit word
; param1: the 32-bit word
;         given in r25,r24,r23,r22 (r25 is most significant)
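; in C this is roughly (a sketch of the semantics only):
;   uint32_t change_endian32(uint32_t x){
;       return ((x & 0x000000ffUL) << 24) | ((x & 0x0000ff00UL) <<  8)
;            | ((x & 0x00ff0000UL) >>  8) | ((x & 0xff000000UL) >> 24);
;   }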
    movw r20, r22 ; (r22,r23) --> (r20,r21)