3 This file is part of the Crypto-avr-lib/microcrypt-lib.
4 Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de)
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>.
20 * Author: Daniel Otte
\r
24 ; sha-256 implementation in assembler
\r
25 SHA256_BLOCK_BITS = 512
\r
26 SHA256_HASH_BITS = 256
\r
29 /* push r18 - r27, r30 - r31*/
\r
65 .macro hexdump length
\r
88 ldi r22, lo8(\length)
\r
89 ldi r23, hi8(\length)
\r
94 /* X points to Block */
\r
95 .macro dbg_hexdump length
\r
111 ; [h0][h1][h2][h3][h4][h5][h6][h7][length]
\r
112 ; each hn is 32 bits wide, length is 64 bits wide
\r
114 ;###########################################################
\r
116 .global sha256_ctx2hash
\r
117 ; === sha256_ctx2hash ===
\r
118 ; this function converts a state into a normal hash (bytestring)
\r
119 ; param1: the 16-bit destination pointer
\r
120 ; given in r25,r24 (r25 is most significant)
\r
121 ; param2: the 16-bit pointer to sha256_ctx structure
\r
142 ;###########################################################
\r
146 ; this function calculates SHA-256 hashes from messages in RAM
\r
147 ; param1: the 16-bit hash destination pointer
\r
148 ; given in r25,r24 (r25 is most significant)
\r
149 ; param2: the 16-bit pointer to message
\r
151 ; param3: 32-bit length value (length of message in bits)
\r
152 ; given in r21,r20,r19,r18
\r
178 movw r8, r18 /* backup of length*/
\r
181 movw r12, r22 /* backup of msg-ptr */
\r
185 /* if length >= 512 */
\r
197 rcall sha256_nextBlock
\r
201 /* length -= 512 */
\r
212 rcall sha256_lastBlock
\r
217 rcall sha256_ctx2hash
\r
238 ;###########################################################
\r
241 ; block MUST NOT be larger than 64 bytes
\r
243 .global sha256_lastBlock
\r
244 ; === sha256_lastBlock ===
\r
245 ; this function does padding & Co. for calculating SHA-256 hashes
\r
246 ; param1: the 16-bit pointer to sha256_ctx structure
\r
247 ; given in r25,r24 (r25 is most significant)
\r
248 ; param2: a 16-bit pointer to the 64-byte block to hash
\r
250 ; param3: a 16-bit integer specifying the length of the block in bits
\r
252 sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1)
\r
257 brne sha256_lastBlock_prolog
\r
259 brne sha256_lastBlock_prolog
\r
264 rcall sha256_nextBlock
\r
271 sha256_lastBlock_prolog:
\r
272 /* allocate space on stack */
\r
283 adiw r30, 1 /* SP points to next free byte on stack */
\r
284 mov r18, r20 /* r20 = LSB(length) */
\r
288 bst r21, 0 /* T flag = bit 0 of r21 (presumably the high byte of length, i.e. bit 8 of the bit count) */
\r
289 bld r18, 5 /* copy T flag into bit 5 of r18; now: r18 == length/8 (aka. length in bytes) */
\r
292 movw r26, r22 /* X points to begin of msg */
\r
294 breq sha256_lastBlock_post_copy
\r
296 sha256_lastBlock_copy_loop:
\r
300 brne sha256_lastBlock_copy_loop
\r
301 sha256_lastBlock_post_copy:
\r
302 sha256_lastBlock_insert_stuffing_bit:
\r
306 and r19, r20 /* if we are in bitmode */
\r
307 breq 2f /* no bitmode */
\r
313 /* maybe we should do some ANDing here, just for safety */
\r
319 /* checking stuff here */
\r
322 rjmp sha256_lastBlock_insert_zeros
\r
324 /* oh shit, we landed here */
\r
325 /* first we have to fill it up with zeros */
\r
344 rcall sha256_nextBlock
\r
352 /* now we should subtract 512 from length */
\r
354 adiw r26, 4*8+1 /* we can skip the lowest byte */
\r
366 ; clr r18 /* not necessary ;-) */
\r
367 /* reset Z pointer to begin of block */
\r
369 sha256_lastBlock_insert_zeros:
\r
372 breq sha256_lastBlock_insert_length
\r
375 st Z+, r1 /* r1 is still zero */
\r
379 ; rjmp sha256_lastBlock_epilog
\r
380 sha256_lastBlock_insert_length:
\r
381 movw r26, r24 /* X points to state */
\r
382 adiw r26, 8*4 /* X points to (state.length) */
\r
383 adiw r30, 8 /* Z points one after the last byte of block */
\r
400 rcall sha256_nextBlock
\r
402 sha256_lastBlock_epilog:
\r
406 adiw r30, 63 ; first part of +64 (adiw immediate is limited to 63)
\r
407 adiw r30, 1 ; second part of +64 (63 + 1)
\r
417 ;###########################################################
\r
419 .global sha256_nextBlock
\r
420 ; === sha256_nextBlock ===
\r
421 ; this is the core function for calculating SHA-256 hashes
\r
422 ; param1: the 16-bit pointer to sha256_ctx structure
\r
423 ; given in r25,r24 (r25 is most significant)
\r
424 ; param2: a 16-bit pointer to the 64-byte block to hash
\r
426 sha256_nextBlock_localSpace = (64+8)*4 ; 64 32-bit values for w array and 8 32-bit values for a array (total 288 byte)
\r
449 /* byteorder: high number <--> high significance */
\r
451 ; initial, let's make some space ready for local vars
\r
452 push r4 /* replace push & pop by mem ops? */
\r
470 movw r18, r20 ;backup SP
\r
471 ; movw r26, r20 ; X points to free space on stack
\r
472 movw r30, r22 ; Z points to message
\r
473 subi r20, lo8(sha256_nextBlock_localSpace) ;sbiw can do only up to 63
\r
474 sbci r21, hi8(sha256_nextBlock_localSpace)
\r
475 movw r26, r20 ; X points to free space on stack
\r
477 cli ; we want to be uninterrupted while updating SP
\r
484 push r25 /* param1 will be needed later */
\r
485 ; now we fill the w array with the message (think about endianness)
\r
488 sha256_nextBlock_wcpyloop:
\r
498 brne sha256_nextBlock_wcpyloop
\r
499 /* for (i=16; i<64; ++i){
\r
500 w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16];
\r
502 /* r25,r24,r23,r22 (r21,r20) are function values
\r
503 r19,r18,r17,r16 are the accumulator
\r
504 r15,r14,r13,r12 are backup1
\r
505 r11,r10,r9 ,r8 are xor accu
\r
506 r1 is round counter */
\r
510 sha256_nextBlock_wcalcloop:
\r
511 movw r30, r26 ; cp X to Z
\r
513 sbiw r30, 1 ; subtract 64 = 16*4
\r
517 ld Accu4, Z+ /* w[i] = w[i-16] */
\r
521 ld Bck4, Z+ /* backup = w[i-15] */
\r
526 mov Func4, Bck1 /* prerotated by 8 */
\r
530 movw XAccu3, Func3 /* store ROTR(w[i-15],7) in xor accu */
\r
532 movw Func3, Bck1 /* prerotated by 16 */
\r
535 eor XAccu1, Func1 /* xor ROTR(w[i-15], 18)*/
\r
539 ldi Func2, 3 /* now shr3 */ /*we can destroy backup now*/
\r
550 eor XAccu4, Bck4 /* xor SHR(w[i-15], 3)*/ /* xor accu == sigma0(w[i-15]) */
\r
554 adc Accu4, XAccu4 /* finished with sigma0 */
\r
555 ldd Func1, Z+7*4 /* now accu += w[i-7] */
\r
563 ldd Bck1, Z+12*4 /* now backup = w[i-2]*/
\r
569 movw Func3, Bck1 /* prerotated by 16 */
\r
573 movw XAccu1, Func1 /* store ROTR(w[i-2], 17) in xor accu */
\r
575 ; movw Func3, Bck1 /* prerotated by 16 */
\r
578 eor XAccu1, Func1 /* xor ROTR(w[i-2], 19)*/
\r
582 ldi Func2, 2 /* now shr10 (dirty trick, skipping a byte) */ /*we can destroy backup now*/
\r
591 eor XAccu3, Bck4 /* xor SHR(w[i-2], 10)*/ /* xor accu == sigma1(w[i-2]) */
\r
595 adc Accu4, XAccu4 /* finished with sigma1 */
\r
596 /* now let's store the result */
\r
602 breq 3f ; skip if zero
\r
603 rjmp sha256_nextBlock_wcalcloop
\r
605 /* we are finished with w array X points one byte post w */
\r
611 ldi r25, 8*4 /* 8 32-bit values to copy from ctx to a array */
\r
618 /* now the real fun begins */
\r
619 /* for (i=0; i<64; ++i){
\r
620 t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i];
\r
621 t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]);
\r
622 memmove(&(a[1]), &(a[0]), 7*4); // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0];
\r
626 /* Y points to a[0], Z ('cause lpm wants it) points to k[i], X points to w[i] */
\r
627 sbiw r26, 8*4 /* X still points at a[7]+1*/
\r
629 ldi r30, lo8(sha256_kv)
\r
630 ldi r31, hi8(sha256_kv)
\r
631 dec r27 /* X - (64*4 == 256) */
\r
635 /* now calculate t1 */
\r
636 /*CH(x,y,z) = (x&y)^((~x)&z)*/
\r
640 ldd T4, Y+5*4+3 /* y in T */
\r
644 ldd Func4, Y+4*4+3 /* x in Func */
\r
648 ldd Bck4, Y+6*4+3 /* z in Bck */
\r
664 eor T4, Bck4 /* done, CH(x,y,z) is in T */
\r
665 /* now SIGMA1(a[4]) */
\r
666 ldd Bck4, Y+4*4 /* think about using it from Func reg above*/
\r
669 ldd Bck3, Y+4*4+3 /* load prerotate by 8-bit */
\r
673 rcall bitrotl /* rotr(x,6) */
\r
679 rcall bitrotr /* rotr(x,11) */
\r
684 movw Func1, Bck3 /* this prerotates a further 16 bits */
\r
685 movw Func3, Bck1 /* so we have now prerotated by 24 bits*/
\r
687 rcall bitrotr /* rotr(x,25): third SIGMA1 term, input already prerotated by 24 bits */
\r
691 eor XAccu4, Func4 /* finished with SIGMA1, add it to T */
\r
696 /* now we've to add a[7], w[i] and k[i] */
\r
698 ldd XAccu2, Y+4*7+1
\r
699 ldd XAccu3, Y+4*7+2
\r
700 ldd XAccu4, Y+4*7+3
\r
704 adc T4, XAccu4 /* add a[7] */
\r
712 adc T4, XAccu4 /* add w[i] */
\r
720 adc T4, XAccu4 /* add k[i] */ /* finished with t1 */
\r
721 /*now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */ /*i did too much x86 asm, i always see 4 32bit regs*/
\r
722 /* starting with MAJ(x,y,z) */
\r
726 ldd Func4, Y+4*0+3 /* load x=a[0] */
\r
727 ldd XAccu1, Y+4*1+0
\r
728 ldd XAccu2, Y+4*1+1
\r
729 ldd XAccu3, Y+4*1+2
\r
730 ldd XAccu4, Y+4*1+3 /* load y=a[1] */
\r
734 and XAccu4, Func4 /* XAccu == (x & y) */
\r
738 ldd Bck4, Y+4*2+3 /* load z=a[2] */
\r
746 eor XAccu4, Func4 /* XAccu == (x & y) ^ (x & z) */
\r
750 ldd Func4, Y+4*1+3 /* load y=a[1] */
\r
758 eor XAccu4, Func4 /* XAccu == Maj(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */
\r
760 ldd Bck1, Y+4*0+0 /* we should combine this with above */
\r
769 movw Accu3, Func3 /* Accu = rotr(a[0], 2) */
\r
771 movw Func3, Bck1 /* prerotate by 16 bits */
\r
777 eor Accu4, Func4 /* Accu ^= rotr(a[0], 13) */
\r
781 mov Func4, Bck3 /* prerotate by 24 bits */
\r
787 eor Accu4, Func4 /* Accu ^= rotr(a[0], 22) */
\r
788 add Accu1, XAccu1 /* add previous result (MAJ)*/
\r
792 /* now we are finished with the computing stuff (t1 in T, t2 in Accu)*/
\r
793 /* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */
\r
798 ld r25, -Y /* warning: this is PREdecrement */
\r
822 std Y+4*0+3, Accu4 /* a array updated */
\r
827 rjmp sha256_main_loop ;brne sha256_main_loop
\r
830 /* pointers to state should still exist on the stack ;-) */
\r
852 brne update_state_loop
\r
853 /* now we just have to update the length */
\r
854 adiw r30, 1 /* since we add 512, we can simply skip the LSB */
\r
861 sha256_nextBlock_fix_length:
\r
862 brcc sha256_nextBlock_epilog
\r
867 brne sha256_nextBlock_fix_length
\r
870 sha256_nextBlock_epilog:
\r
871 /* now we should clean up the stack */
\r
876 cli ; we want to be uninterrupted while updating SP
\r
900 sha256_kv: ; round-key-vector (the 64 SHA-256 round constants k[i]) stored in ProgMem; each 32-bit constant is two .word entries, low word first
\r
901 .word 0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c
\r
902 .word 0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b
\r
903 .word 0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9
\r
904 .word 0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429
\r
905 .word 0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272
\r
906 .word 0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a
\r
907 .word 0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e
\r
908 .word 0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671
\r
911 ;###########################################################
\r
913 .global sha256_init
\r
914 ;uint32_t sha256_init_vector[]={
\r
915 ; 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
\r
916 ; 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };
\r
918 ;void sha256_init(sha256_ctx_t *state){
\r
920 ; memcpy(state->h, sha256_init_vector, 8*4);
\r
922 ; param1: (r25,r24) 16-bit pointer to sha256_ctx_t struct in ram
\r
923 ; modifies: Z(r30,r31), Func1, r22
\r
925 movw r26, r24 ; (24,25) --> (26,27) load X with param1
\r
926 ldi r30, lo8((sha256_init_vector))
\r
927 ldi r31, hi8((sha256_init_vector))
\r
929 sha256_init_vloop:
\r
933 brne sha256_init_vloop
\r
935 clr r1 ;this should not be needed
\r
939 brne sha256_init_lloop
\r
942 sha256_init_vector:
\r
943 .word 0xE667, 0x6A09
\r
944 .word 0xAE85, 0xBB67
\r
945 .word 0xF372, 0x3C6E
\r
946 .word 0xF53A, 0xA54F
\r
947 .word 0x527F, 0x510E
\r
948 .word 0x688C, 0x9B05
\r
949 .word 0xD9AB, 0x1F83
\r
950 .word 0xCD19, 0x5BE0
\r
952 ;###########################################################
\r
956 ; function that rotates a 32 bit word to the left
\r
957 ; param1: the 32-bit word to rotate
\r
958 ; given in r25,r24,r23,r22 (r25 is most significant)
\r
959 ; param2: an 8-bit value telling how often to rotate
\r
961 ; modifies: r21, r22
\r
990 ;###########################################################
\r
994 ; function that rotates a 32 bit word to the right
\r
995 ; param1: the 32-bit word to rotate
\r
996 ; given in r25,r24,r23,r22 (r25 is most significant)
\r
997 ; param2: an 8-bit value telling how often to rotate
\r
999 ; modifies: r21, r22
\r
1028 ;###########################################################
\r
1030 .global change_endian32
\r
1031 ; === change_endian32 ===
\r
1032 ; function that changes the endianness of a 32-bit word
\r
1033 ; param1: the 32-bit word
\r
1034 ; given in r25,r24,r23,r22 (r25 is most significant)
\r
1035 ; modifies: r21, r22
\r
1037 movw r20, r22 ; (r22,r23) --> (r20,r21)
\r