3 This file is part of the AVR-Crypto-Lib.
4 Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de)
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>.
22 * License: GPLv3 or later
24 ; sha-256 implementation in assembler
25 SHA256_BLOCK_BITS = 512 /* size of one SHA-256 message block in bits (512/8 = 64 bytes) */
26 SHA256_HASH_BITS = 256 /* size of the SHA-256 hash value in bits (256/8 = 32 bytes) */
29 /* push r18 - r27, r30 - r31*/
94 /* X points to Block */
95 .macro dbg_hexdump length
111 ; [h0][h1][h2][h3][h4][h5][h6][h7][length]
112 ; each hn is 32 bits wide, length is 64 bits wide
114 ;###########################################################
116 .global sha256_ctx2hash
117 ; === sha256_ctx2hash ===
118 ; this function converts a hash state (sha256_ctx) into a normal hash (a byte string)
119 ; param1: the 16-bit destination pointer
120 ; given in r25,r24 (r25 is most significant)
121 ; param2: the 16-bit pointer to sha256_ctx structure
; NOTE(review): the register pair that carries param2 is not stated in this header -- confirm against the function body
142 ;###########################################################
146 ; this function calculates SHA-256 hashes from messages in RAM
147 ; param1: the 16-bit hash destination pointer
148 ; given in r25,r24 (r25 is most significant)
149 ; param2: the 16-bit pointer to message
151 ; param3: 32-bit length value (length of message in bits)
152 ; given in r21,r20,r19,r18
178 movw r8, r18 /* backup of length (copies the low word r19:r18 into r9:r8) */
181 movw r12, r22 /* backup of msg-ptr (r23:r22 into r13:r12) */
185 /* if length >= 512 */
197 rcall sha256_nextBlock /* core function, hashes one 64 byte block */
212 rcall sha256_lastBlock /* does the padding & Co. */
217 rcall sha256_ctx2hash /* converts the state into the hash byte string */
238 ;###########################################################
241 ; block MUST NOT be larger than 64 bytes
243 .global sha256_lastBlock
244 ; === sha256_lastBlock ===
245 ; this function does padding & Co. for calculating SHA-256 hashes
246 ; param1: the 16-bit pointer to sha256_ctx structure
247 ; given in r25,r24 (r25 is most significant)
248 ; param2: a 16-bit pointer to the 64 byte block to hash
250 ; param3: a 16-bit integer specifying the length of the block in bits
252 sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1) /* one block (512/8 = 64 bytes) plus one extra byte */
257 brlo sha256_lastBlock_prolog
264 rcall sha256_nextBlock
273 rjmp sha256_lastBlock /* jump back to the start of this function */
274 sha256_lastBlock_prolog:
275 /* allocate space on stack */
286 adiw r30, 1 /* SP points to next free byte on stack */
287 mov r18, r20 /* r20 = LSB(length) */
291 bst r21, 0 /* T flag = bit 0 of r21, i.e. bit 8 of the 16-bit length */
292 bld r18, 5 /* T flag -> bit 5 of r18 (bit 8 of length is bit 5 of length/8); now: r18 == length/8 (aka. length in bytes) */
295 movw r26, r22 /* X points to begin of msg */
297 breq sha256_lastBlock_post_copy
299 sha256_lastBlock_copy_loop:
303 brne sha256_lastBlock_copy_loop
304 sha256_lastBlock_post_copy:
305 sha256_lastBlock_insert_stuffing_bit:
309 and r19, r20 /* if we are in bitmode */
310 breq 2f /* no bitmode */
316 /* maybe we should do some ANDing here, just for safety */
322 /* checking stuff here */
325 rjmp sha256_lastBlock_insert_zeros
327 /* the unlucky case: we landed here (NOTE(review): presumably because the length field no longer fits into this block -- confirm) */
328 /* first we have to fill it up with zeros */
347 rcall sha256_nextBlock
355 /* now we should subtract 512 from length */
357 adiw r26, 4*8+1 /* we can skip the lowest byte (offset = 8 words of 4 bytes, plus 1) */
369 ; clr r18 /* not necessary ;-) */
370 /* reset Z pointer to begin of block */
372 sha256_lastBlock_insert_zeros:
375 breq sha256_lastBlock_insert_length
378 st Z+, r1 /* r1 is still zero */
382 ; rjmp sha256_lastBlock_epilog
383 sha256_lastBlock_insert_length:
384 movw r26, r24 /* X points to state */
385 adiw r26, 8*4 /* X points to (state.length) */
386 adiw r30, 8 /* Z points one after the last byte of block */
403 rcall sha256_nextBlock
405 sha256_lastBlock_epilog:
409 adiw r30, 63 ; first part of adding 64 (the adiw immediate is limited to 63)
410 adiw r30, 1 ; second part: 63 + 1 == 64 in total
420 ;###########################################################
422 .global sha256_nextBlock
423 ; === sha256_nextBlock ===
424 ; this is the core function for calculating SHA-256 hashes
425 ; param1: the 16-bit pointer to sha256_ctx structure
426 ; given in r25,r24 (r25 is most significant)
427 ; param2: a 16-bit pointer to the 64 byte block to hash
429 sha256_nextBlock_localSpace = (64+8)*4 ; 64 32-bit values for w array and 8 32-bit values for a array (total 288 bytes)
452 /* byteorder: high number <--> high significance */
454 ; initial, let's make some space ready for local vars
455 push r4 /* replace push & pop by mem ops? */
473 movw r18, r20 ;backup SP
474 ; movw r26, r20 ; X points to free space on stack
475 movw r30, r22 ; Z points to message
476 subi r20, lo8(sha256_nextBlock_localSpace) ;sbiw can do only up to 63
477 sbci r21, hi8(sha256_nextBlock_localSpace)
478 movw r26, r20 ; X points to free space on stack
480 cli ; we want to be uninterrupted while updating SP
487 push r25 /* param1 will be needed later */
488 ; now we fill the w array with message (think about endianness)
491 sha256_nextBlock_wcpyloop:
501 brne sha256_nextBlock_wcpyloop
502 /* for (i=16; i<64; ++i){
503 w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16];
505 /* r25,r24,r23,r22 (r21,r20) are function values
506 r19,r18,r17,r16 are the accumulator
507 r15,r14,r13,rBck1 are backup1
508 r11,r10,r9 ,r8 are xor accu
509 r1 is round counter */
513 sha256_nextBlock_wcalcloop:
514 movw r30, r26 ; cp X to Z
516 sbiw r30, 1 ; subtract 64 = 16*4
520 ld Accu4, Z+ /* w[i] = w[i-16] */
524 ld Bck4, Z+ /* backup = w[i-15] */
529 mov Func4, Bck1 /* prerotated by 8 */
533 movw XAccu3, Func3 /* store ROTR(w[i-15],7) in xor accu */
535 movw Func3, Bck1 /* prerotated by 16 */
538 eor XAccu1, Func1 /* xor ROTR(w[i-15], 18)*/
542 ldi Func2, 3 /* now shr3 */ /*we can destroy backup now*/
553 eor XAccu4, Bck4 /* xor SHR(w[i-15], 3)*/ /* xor accu == sigma0(w[i-15]) */
557 adc Accu4, XAccu4 /* finished with sigma0 */
558 ldd Func1, Z+7*4 /* now accu += w[i-7] */
566 ldd Bck1, Z+12*4 /* now backup = w[i-2]*/
572 movw Func3, Bck1 /* prerotated by 16 */
576 movw XAccu1, Func1 /* store ROTR(w[i-2], 17) in xor accu */
578 ; movw Func3, Bck1 /* prerotated by 16 */
581 eor XAccu1, Func1 /* xor ROTR(w[i-2], 19)*/
585 ldi Func2, 2 /* now shr10 (dirty trick, skipping a byte) */ /*we can destroy backup now*/
594 eor XAccu3, Bck4 /* xor SHR(w[i-2], 10)*/ /* xor accu == sigma1(w[i-2]) */
598 adc Accu4, XAccu4 /* finished with sigma1 */
599 /* now let's store the result (the new w[i]) */
605 breq 3f ; skip if zero
606 rjmp sha256_nextBlock_wcalcloop
608 /* we are finished with the w array; X points one byte past w */
614 ldi r25, 8*4 /* 8 32-bit values to copy from ctx to a array */
621 /* now the real fun begins */
622 /* for (i=0; i<64; ++i){
623 t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i];
624 t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]);
625 memmove(&(a[1]), &(a[0]), 7*4); // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0];
629 /* Y points to a[0], Z ('cause lpm wants it) points to k[i], X points to w[i] */
630 sbiw r26, 8*4 /* X still points at a[7]+1*/
632 ldi r30, lo8(sha256_kv)
633 ldi r31, hi8(sha256_kv)
634 dec r27 /* X - (64*4 == 256) */
638 /* now calculate t1 */
639 /*CH(x,y,z) = (x&y)^((~x)&z)*/
643 ldd T4, Y+5*4+3 /* y in T */
647 ldd Func4, Y+4*4+3 /* x in Func */
651 ldd Bck4, Y+6*4+3 /* z in Bck */
667 eor T4, Bck4 /* done, CH(x,y,z) is in T */
668 /* now SIGMA1(a[4]) */
669 ldd Bck4, Y+4*4 /* think about using it from Func reg above*/
672 ldd Bck3, Y+4*4+3 /* load prerotate by 8-bit */
676 rcall bitrotl /* rotr(x,6) */
682 rcall bitrotr /* rotr(x,11) */
687 movw Func1, Bck3 /* this prerotates further 16 bits*/
688 movw Func3, Bck1 /* so we have now prerotated by 24 bits*/
690 rcall bitrotr /* rotr(x,25) */
694 eor XAccu4, Func4 /* finished with SIGMA1, add it to T */
699 /* now we've to add a[7], w[i] and k[i] */
707 adc T4, XAccu4 /* add a[7] */
715 adc T4, XAccu4 /* add w[i] */
723 adc T4, XAccu4 /* add k[i] */ /* finished with t1 */
724 /*now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */ /*i did too much x86 asm, i always see 4 32bit regs*/
725 /* starting with MAJ(x,y,z) */
729 ldd Func4, Y+4*0+3 /* load x=a[0] */
733 ldd XAccu4, Y+4*1+3 /* load y=a[1] */
737 and XAccu4, Func4 /* XAccu == (x & y) */
741 ldd Bck4, Y+4*2+3 /* load z=a[2] */
749 eor XAccu4, Func4 /* XAccu == (x & y) ^ (x & z) */
753 ldd Func4, Y+4*1+3 /* load y=a[1] */
761 eor XAccu4, Func4 /* XAccu == Maj(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */
763 ldd Bck1, Y+4*0+0 /* we should combine this with above */
772 movw Accu3, Func3 /* Accu = rotr(a[0], 2) */
774 movw Func3, Bck1 /* prerotate by 16 bits */
780 eor Accu4, Func4 /* Accu ^= rotr(a[0], 13) */
784 mov Func4, Bck3 /* prerotate by 24 bits */
790 eor Accu4, Func4 /* Accu ^= rotr(a[0], 22) */
791 add Accu1, XAccu1 /* add previous result (MAJ)*/
795 /* now we are finished with the computing stuff (t1 in T, t2 in Accu)*/
796 /* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */
801 ld r25, -Y /* warning: this is PREdecrement */
825 std Y+4*0+3, Accu4 /* a array updated */
830 rjmp sha256_main_loop ;brne sha256_main_loop
833 /* pointers to state should still exist on the stack ;-) */
855 brne update_state_loop
856 /* now we just have to update the length */
857 adiw r30, 1 /* since we add 512, we can simply skip the LSB */
864 sha256_nextBlock_fix_length:
865 brcc sha256_nextBlock_epilog
870 brne sha256_nextBlock_fix_length
873 sha256_nextBlock_epilog:
874 /* now we should clean up the stack */
879 cli ; we want to be uninterrupted while updating SP
903 sha256_kv: ; round-key-vector stored in ProgMem: the 64 32-bit SHA-256 round constants k[0..63], each written as two 16-bit words, low word first (0x2f98, 0x428a == 0x428a2f98)
904 .word 0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c
905 .word 0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b
906 .word 0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9
907 .word 0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429
908 .word 0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272
909 .word 0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a
910 .word 0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e
911 .word 0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671
914 ;###########################################################
917 ;uint32_t sha256_init_vector[]={
918 ; 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
919 ; 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };
921 ;void sha256_init(sha256_ctx_t *state){
923 ; memcpy(state->h, sha256_init_vector, 8*4);
925 ; param1: (r25,r24) 16-bit pointer to sha256_ctx_t struct in ram
926 ; modifies: X(r26,r27), Z(r30,r31), Func1, r22
928 movw r26, r24 ; (24,25) --> (26,27) load X with param1
929 ldi r30, lo8((sha256_init_vector)) ; Z = address of sha256_init_vector (low byte)
930 ldi r31, hi8((sha256_init_vector)) ; Z = address of sha256_init_vector (high byte)
936 brne sha256_init_vloop
951 ;###########################################################
955 ; function that rotates a 32 bit word to the left
956 ; param1: the 32-bit word to rotate
957 ; given in r25,r24,r23,r22 (r25 is most significant)
958 ; param2: an 8-bit value telling how often to rotate
989 ;###########################################################
993 ; function that rotates a 32 bit word to the right
994 ; param1: the 32-bit word to rotate
995 ; given in r25,r24,r23,r22 (r25 is most significant)
996 ; param2: an 8-bit value telling how often to rotate
1027 ;###########################################################
1029 .global change_endian32
1030 ; === change_endian32 ===
1031 ; function that changes the endianness of a 32-bit word
1032 ; param1: the 32-bit word
1033 ; given in r25,r24,r23,r22 (r25 is most significant)
1036 movw r20, r22 ; (r22,r23) --> (r20,r21)