2 * Author: Daniel Otte
\r
6 ; sha-256 implementation in assembler
\r
7 SHA256_BLOCK_BITS = 512
\r
8 SHA256_HASH_BITS = 256
\r
11 /* push r18 - r27, r30 - r31*/
\r
47 .macro hexdump length
\r
70 ldi r22, lo8(\length)
\r
71 ldi r23, hi8(\length)
\r
76 /* X points to Block */
\r
77 .macro dbg_hexdump length
\r
93 ; [h0][h1][h2][h3][h4][h5][h6][h7][length]
\r
94 ; each hn is 32 bits wide, length is 64 bits wide
\r
96 ;###########################################################
\r
98 .global sha256_ctx2hash
\r
99 ; === sha256_ctx2hash ===
\r
100 ; this function converts a state into a normal hash (bytestring)
\r
101 ; param1: the 16-bit destination pointer
\r
102 ; given in r25,r24 (r25 is most significant)
\r
103 ; param2: the 16-bit pointer to sha256_ctx structure
\r
124 ;###########################################################
\r
128 ; this function calculates SHA-256 hashes from messages in RAM
\r
129 ; param1: the 16-bit hash destination pointer
\r
130 ; given in r25,r24 (r25 is most significant)
\r
131 ; param2: the 16-bit pointer to message
\r
133 ; param3: 32-bit length value (length of message in bits)
\r
134 ; given in r21,r20,r19,r18
\r
160 movw r8, r18 /* backup of length*/
\r
163 movw r12, r22 /* backup pf msg-ptr */
\r
167 /* if length >= 512 */
\r
179 rcall sha256_nextBlock
\r
183 /* length -= 512 */
\r
194 rcall sha256_lastBlock
\r
199 rcall sha256_ctx2hash
\r
220 ;###########################################################
\r
223 ; block MUST NOT be larger than 64 bytes
\r
225 .global sha256_lastBlock
\r
226 ; === sha256_lastBlock ===
\r
227 ; this function does padding & Co. for calculating SHA-256 hashes
\r
228 ; param1: the 16-bit pointer to sha256_ctx structure
\r
229 ; given in r25,r24 (r25 is most significant)
\r
230 ; param2: a 16-bit pointer to the 64-byte block to hash
\r
232 ; param3: a 16-bit integer specifying the length of the block in bits
\r
234 sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1)
\r
239 brne sha256_lastBlock_prolog
\r
241 brne sha256_lastBlock_prolog
\r
246 rcall sha256_nextBlock
\r
253 sha256_lastBlock_prolog:
\r
254 /* allocate space on stack */
\r
265 adiw r30, 1 /* SP points to next free byte on stack */
\r
266 mov r18, r20 /* r20 = LSB(length) */
\r
270 bst r21, 0 /* may be we should explain this ... */
\r
271 bld r18, 5 /* now: r18 == length/8 (aka. length in bytes) */
\r
274 movw r26, r22 /* X points to begin of msg */
\r
276 breq sha256_lastBlock_post_copy
\r
278 sha256_lastBlock_copy_loop:
\r
282 brne sha256_lastBlock_copy_loop
\r
283 sha256_lastBlock_post_copy:
\r
284 sha256_lastBlock_insert_stuffing_bit:
\r
288 and r19, r20 /* if we are in bitmode */
\r
289 breq 2f /* no bitmode */
\r
295 /* maybe we should do some ANDing here, just for safety */
\r
301 /* checking stuff here */
\r
304 rjmp sha256_lastBlock_insert_zeros
\r
306 /* unlucky case: we landed here, so an additional block has to be processed */
\r
307 /* first we have to fill it up with zeros */
\r
326 rcall sha256_nextBlock
\r
334 /* now we should subtract 512 from length */
\r
336 adiw r26, 4*8+1 /* we can skip the lowest byte */
\r
348 ; clr r18 /* not necessary ;-) */
\r
349 /* reset Z pointer to begin of block */
\r
351 sha256_lastBlock_insert_zeros:
\r
354 breq sha256_lastBlock_insert_length
\r
357 st Z+, r1 /* r1 is still zero */
\r
361 ; rjmp sha256_lastBlock_epilog
\r
362 sha256_lastBlock_insert_length:
\r
363 movw r26, r24 /* X points to state */
\r
364 adiw r26, 8*4 /* X points to (state.length) */
\r
365 adiw r30, 8 /* Z points one after the last byte of block */
\r
382 rcall sha256_nextBlock
\r
384 sha256_lastBlock_epilog:
\r
388 adiw r30, 63 ; lo8(64)
\r
389 adiw r30, 1 ; hi8(64)
\r
399 ;###########################################################
\r
401 .global sha256_nextBlock
\r
402 ; === sha256_nextBlock ===
\r
403 ; this is the core function for calculating SHA-256 hashes
\r
404 ; param1: the 16-bit pointer to sha256_ctx structure
\r
405 ; given in r25,r24 (r25 is most significant)
\r
406 ; param2: a 16-bit pointer to the 64-byte block to hash
\r
408 sha256_nextBlock_localSpace = (64+8)*4 ; 64 32-bit values for w array and 8 32-bit values for a array (total 288 byte)
\r
431 /* byteorder: high number <--> high significance */
\r
433 ; initial, let's make some space ready for local vars
\r
434 push r4 /* replace push & pop by mem ops? */
\r
452 movw r18, r20 ;backup SP
\r
453 ; movw r26, r20 ; X points to free space on stack
\r
454 movw r30, r22 ; Z points to message
\r
455 subi r20, lo8(sha256_nextBlock_localSpace) ;sbiw can do only up to 63
\r
456 sbci r21, hi8(sha256_nextBlock_localSpace)
\r
457 movw r26, r20 ; X points to free space on stack
\r
459 cli ; we want to be uninterrupted while updating SP
\r
466 push r25 /* param1 will be needed later */
\r
467 ; now we fill the w array with the message (mind the endianness)
\r
470 sha256_nextBlock_wcpyloop:
\r
480 brne sha256_nextBlock_wcpyloop
\r
481 /* for (i=16; i<64; ++i){
\r
482 w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16];
\r
484 /* r25,r24,r23,r22 (r21,r20) are function values
\r
485 r19,r18,r17,r16 are the accumulator
\r
486 r15,r14,r13,r12 are backup1
\r
487 r11,r10,r9 ,r8 are xor accu
\r
488 r1 is round counter */
\r
492 sha256_nextBlock_wcalcloop:
/* Message-schedule expansion: one pass of this loop computes one w[i] as
   w[i] = sigma1(w[i-2]) + w[i-7] + sigma0(w[i-15]) + w[i-16]
   with sigma0 = ROTR7 ^ ROTR18 ^ SHR3 and sigma1 = ROTR17 ^ ROTR19 ^ SHR10.
   NOTE(review): only part of the instruction stream is visible in this listing;
   the comments below describe the visible lines only. */
\r
493 movw r30, r26 ; cp X to Z
\r
495 sbiw r30, 1 ; substract 64 = 16*4
\r
499 ld Accu4, Z+ /* w[i] = w[i-16] */
\r
503 ld Bck4, Z+ /* backup = w[i-15] */
\r
508 mov Func4, Bck1 /* prerotated by 8 */
\r
512 movw XAccu3, Func3 /* store ROTR(w[i-15],7) in xor accu */
\r
514 movw Func3, Bck1 /* prerotated by 16 */
\r
517 eor XAccu1, Func1 /* xor ROTR(w[i-15], 18) */
\r
521 ldi Func2, 3 /* now SHR 3 */ /* we can destroy backup now */
\r
532 eor XAccu4, Bck4 /* xor SHR(w[i-15], 3) */ /* xor accu == sigma0(w[i-15]) */
\r
536 adc Accu4, XAccu4 /* finished with sigma0 */
\r
537 ldd Func1, Z+7*4 /* now accu += w[i-7] */
\r
545 ldd Bck1, Z+12*4 /* now backup = w[i-2] */
\r
551 movw Func3, Bck1 /* prerotated by 16 */
\r
555 movw XAccu1, Func1 /* store ROTR(w[i-2], 17) in xor accu */
\r
557 ; movw Func3, Bck1 /* prerotated by 16 */
\r
560 eor XAccu1, Func1 /* xor ROTR(w[i-2], 19) */
\r
564 ldi Func2, 2 /* now SHR 10 (dirty trick, skipping a byte) */ /* we can destroy backup now */
\r
573 eor XAccu3, Bck4 /* xor SHR(w[i-2], 10) */ /* xor accu == sigma1(w[i-2]) */
\r
577 adc Accu4, XAccu4 /* finished with sigma1 */
\r
578 /* now let's store the result (the new w[i]) */
\r
584 breq 3f ; skip if zero
\r
585 rjmp sha256_nextBlock_wcalcloop
\r
587 /* we are finished with the w array; X points one byte past w */
\r
593 ldi r25, 8*4 /* 8 32-bit values to copy from ctx to a array */
\r
600 /* now the real fun begins */
\r
601 /* for (i=0; i<64; ++i){
\r
602 t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i];
\r
603 t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]);
\r
604 memmove(&(a[1]), &(a[0]), 7*4); // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0];
\r
608 /* Y points to a[0], Z ('cause lpm wants it) points to k[i], X points to w[i] */
\r
609 sbiw r26, 8*4 /* X still points at a[7]+1*/
\r
611 ldi r30, lo8(sha256_kv)
\r
612 ldi r31, hi8(sha256_kv)
\r
613 dec r27 /* X - (64*4 == 256) */
\r
617 /* now calculate t1 */
\r
618 /*CH(x,y,z) = (x&y)^((~x)&z)*/
\r
622 ldd T4, Y+5*4+3 /* y in T */
\r
626 ldd Func4, Y+4*4+3 /* x in Func */
\r
630 ldd Bck4, Y+6*4+3 /* z in Bck */
\r
646 eor T4, Bck4 /* done, CH(x,y,z) is in T */
\r
647 /* now SIGMA1(a[4]) */
\r
648 ldd Bck4, Y+4*4 /* think about using it from Func reg above*/
\r
651 ldd Bck3, Y+4*4+3 /* load prerotate by 8-bit */
\r
655 rcall bitrotl /* rotr(x,6) */
\r
661 rcall bitrotr /* rotr(x,11) */
\r
666 movw Func1, Bck3 /* this prerotates furteh 16 bits*/
\r
667 movw Func3, Bck1 /* so we have now prerotated by 24 bits*/
\r
669 rcall bitrotr /* rotr(x,11) */
\r
673 eor XAccu4, Func4 /* finished with SIGMA1, add it to T */
\r
678 /* now we've to add a[7], w[i] and k[i] */
\r
680 ldd XAccu2, Y+4*7+1
\r
681 ldd XAccu3, Y+4*7+2
\r
682 ldd XAccu4, Y+4*7+3
\r
686 adc T4, XAccu4 /* add a[7] */
\r
694 adc T4, XAccu4 /* add w[i] */
\r
702 adc T4, XAccu4 /* add k[i] */ /* finished with t1 */
\r
703 /* now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */ /* I did too much x86 asm, I always see 4 32-bit regs */
\r
704 /* starting with MAJ(x,y,z) */
\r
708 ldd Func4, Y+4*0+3 /* load x=a[0] */
\r
709 ldd XAccu1, Y+4*1+0
\r
710 ldd XAccu2, Y+4*1+1
\r
711 ldd XAccu3, Y+4*1+2
\r
712 ldd XAccu4, Y+4*1+3 /* load y=a[1] */
\r
716 and XAccu4, Func4 /* XAccu == (x & y) */
\r
720 ldd Bck4, Y+4*2+3 /* load z=a[2] */
\r
728 eor XAccu4, Func4 /* XAccu == (x & y) ^ (x & z) */
\r
732 ldd Func4, Y+4*1+3 /* load y=a[1] */
\r
740 eor XAccu4, Func4 /* XAccu == Maj(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */
\r
742 ldd Bck1, Y+4*0+0 /* we should combine this with above */
\r
751 movw Accu3, Func3 /* Accu = shr(a[0], 2) */
\r
753 movw Func3, Bck1 /* prerotate by 16 bits */
\r
759 eor Accu4, Func4 /* Accu ^= shr(a[0], 13) */
\r
763 mov Func4, Bck3 /* prerotate by 24 bits */
\r
769 eor Accu4, Func4 /* Accu ^= shr(a[0], 22) */
\r
770 add Accu1, XAccu1 /* add previous result (MAJ)*/
\r
774 /* now we are finished with the computing stuff (t1 in T, t2 in Accu)*/
\r
775 /* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */
\r
780 ld r25, -Y /* warning: this is PREdecrement */
\r
804 std Y+4*0+3, Accu4 /* a array updated */
\r
809 rjmp sha256_main_loop ;brne sha256_main_loop
\r
812 /* pointers to state should still exist on the stack ;-) */
\r
834 brne update_state_loop
\r
835 /* now we just have to update the length */
\r
836 adiw r30, 1 /* since we add 512, we can simply skip the LSB */
\r
843 sha256_nextBlock_fix_length:
\r
844 brcc sha256_nextBlock_epilog
\r
849 brne sha256_nextBlock_fix_length
\r
852 sha256_nextBlock_epilog:
\r
853 /* now we should clean up the stack */
\r
858 cli ; we want to be uninterrupted while updating SP
\r
882 sha256_kv: ; round-key-vector stored in ProgMem
/* The 64 SHA-256 round constants k[0..63], eight constants per line.
   Each 32-bit constant is written as two 16-bit words, low word first
   (e.g. 0x2f98, 0x428a encodes k[0] = 0x428a2f98). */
\r
883 .word 0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c /* k[0..7] */
\r
884 .word 0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b /* k[8..15] */
\r
885 .word 0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9 /* k[16..23] */
\r
886 .word 0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429 /* k[24..31] */
\r
887 .word 0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272 /* k[32..39] */
\r
888 .word 0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a /* k[40..47] */
\r
889 .word 0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e /* k[48..55] */
\r
890 .word 0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671 /* k[56..63] */
\r
893 ;###########################################################
\r
895 .global sha256_init
\r
896 ;uint32_t sha256_init_vector[]={
\r
897 ; 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
\r
898 ; 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };
\r
900 ;void sha256_init(sha256_ctx_t *state){
\r
902 ; memcpy(state->h, sha256_init_vector, 8*4);
\r
904 ; param1: (r25,r24) 16-bit pointer to sha256_ctx_t struct in ram
\r
905 ; modifies: Z(r30,r31), Func1, r22
\r
907 movw r26, r24 ; (24,25) --> (26,27) load X with param1
\r
908 ldi r30, lo8((sha256_init_vector))
\r
909 ldi r31, hi8((sha256_init_vector))
\r
911 sha256_init_vloop:
\r
915 brne sha256_init_vloop
\r
917 clr r1 ;this should not be needed
\r
921 brne sha256_init_lloop
\r
924 sha256_init_vector:
/* Initial SHA-256 hash values h0..h7, one 32-bit value per line.
   Each value is written as two 16-bit words, low word first
   (e.g. 0xE667, 0x6A09 encodes h0 = 0x6A09E667). */
\r
925 .word 0xE667, 0x6A09 /* h0 = 0x6A09E667 */
\r
926 .word 0xAE85, 0xBB67 /* h1 = 0xBB67AE85 */
\r
927 .word 0xF372, 0x3C6E /* h2 = 0x3C6EF372 */
\r
928 .word 0xF53A, 0xA54F /* h3 = 0xA54FF53A */
\r
929 .word 0x527F, 0x510E /* h4 = 0x510E527F */
\r
930 .word 0x688C, 0x9B05 /* h5 = 0x9B05688C */
\r
931 .word 0xD9AB, 0x1F83 /* h6 = 0x1F83D9AB */
\r
932 .word 0xCD19, 0x5BE0 /* h7 = 0x5BE0CD19 */
\r
934 ;###########################################################
\r
938 ; function that rotates a 32 bit word to the left
\r
939 ; param1: the 32-bit word to rotate
\r
940 ; given in r25,r24,r23,r22 (r25 is most significant)
\r
941 ; param2: an 8-bit value telling how often to rotate
\r
943 ; modifies: r21, r22
\r
972 ;###########################################################
\r
976 ; function that rotates a 32 bit word to the right
\r
977 ; param1: the 32-bit word to rotate
\r
978 ; given in r25,r24,r23,r22 (r25 is most significant)
\r
979 ; param2: an 8-bit value telling how often to rotate
\r
981 ; modifies: r21, r22
\r
1010 ;###########################################################
\r
1012 .global change_endian32
\r
1013 ; === change_endian32 ===
\r
1014 ; function that changes the endianness of a 32-bit word
\r
1015 ; param1: the 32-bit word
\r
1016 ; given in r25,r24,r23,r22 (r25 is most significant)
\r
1017 ; modifies: r21, r22
\r
1019 movw r20, r22 ; (r22,r23) --> (r20,r21)
\r