X-Git-Url: https://git.cryptolib.org/?a=blobdiff_plain;f=sha256%2Fsha256-asm.S;h=97c3b562b30e1d018747c081e18430e056f9aec4;hb=e9e07569721b9e005d6b602e26a03e930e796577;hp=d9eb6b65a789a49a7dc730775685114f146ed53b;hpb=d32eba56ce10ea6b9eff123b50d9842673b38f2b;p=avr-crypto-lib.git diff --git a/sha256/sha256-asm.S b/sha256/sha256-asm.S index d9eb6b6..97c3b56 100644 --- a/sha256/sha256-asm.S +++ b/sha256/sha256-asm.S @@ -1,7 +1,7 @@ /* sha256-asm.S */ /* This file is part of the AVR-Crypto-Lib. - Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de) + Copyright (C) 2006-2015 Daniel Otte (bg@nerilex.org) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -21,10 +21,11 @@ * * License: GPLv3 or later */ -; sha-256 implementation in assembler +; sha-256 implementation in assembler SHA256_BLOCK_BITS = 512 SHA256_HASH_BITS = 256 + .macro precall /* push r18 - r27, r30 - r31*/ push r0 @@ -111,7 +112,7 @@ SREG = 0x3F ; [h0][h1][h2][h3][h4][h5][h6][h7][length] ; hn is 32 bit large, length is 64 bit large -;########################################################### +;########################################################### .global sha256_ctx2hash ; === sha256_ctx2hash === @@ -125,21 +126,21 @@ sha256_ctx2hash: movw r30, r24 ldi r21, 8 sbiw r26, 4 -1: +1: ldi r20, 4 adiw r26, 8 -2: +2: ld r0, -X - st Z+, r0 + st Z+, r0 dec r20 brne 2b - + dec r21 brne 1b - + ret -;########################################################### +;########################################################### .global sha256 ; === sha256 === @@ -160,71 +161,66 @@ sha256_prolog: push r13 push r16 push r17 - in r16, SPL - in r17, SPH - subi r16, 8*4+8 - sbci r17, 0 + in r30, SPL + in r31, SPH + sbiw r30, 8*4+8 in r0, SREG cli - out SPL, r16 - out SPH, r17 + out SPL, r30 out SREG, r0 - + out SPH, r31 + push r25 push r24 - inc r16 - adc r17, r1 - + adiw r30, 1 + movw r16, r30 movw r8, r18 /* backup of length*/ movw r10, r20 - + movw r12, r22 /* backup pf msg-ptr */ - + movw r24, r16 rcall sha256_init - /* if length >= 512 */ + /* if length > 0xffff */ 1: tst r11 - brne 4f + brne 2f tst r10 - brne 4f - mov r19, r9 - cpi r19, 0x02 - brlo 4f - + breq 4f +2: movw r24, r16 movw r22, r12 rcall sha256_nextBlock - ldi r19, 0x64 - add r22, r19 - adc r23, r1 + ldi r19, 64 + add r12, r19 + adc r13, r1 /* length -= 512 */ ldi r19, 0x02 sub r9, r19 sbc r10, r1 sbc r11, r1 rjmp 1b - + 4: movw r24, r16 movw r22, r12 movw r20, r8 rcall sha256_lastBlock - + pop r24 pop r25 movw r22, r16 - rcall sha256_ctx2hash - + rcall sha256_ctx2hash + sha256_epilog: in r30, SPL in r31, SPH - adiw r30, 8*4+8 + adiw r30, 8*4+8 in r0, SREG cli out SPL, r30 - out SPH, r31 out SREG, r0 + out SPH, r31 pop r17 pop r16 pop r13 @@ -235,7 +231,7 @@ sha256_epilog: pop r8 ret -;########################################################### +;########################################################### ; block MUST NOT be larger than 64 bytes @@ -269,19 +265,21 @@ sha256_lastBlock: pop r24 pop r25 subi r21, 0x02 - subi r23, -2 - rjmp sha256_lastBlock + ldi r19, 64 + add r22, r19 + adc r23, r1 + rjmp sha256_lastBlock sha256_lastBlock_prolog: /* allocate space on stack */ in r30, SPL in r31, SPH - in r1, SREG + in r0, SREG subi r30, lo8(64) sbci r31, hi8(64) cli out SPL, r30 + out SREG,r0 out SPH, r31 - out SREG,r1 adiw r30, 1 /* SP points to next free byte on stack */ mov r18, r20 /* r20 = LSB(length) */ @@ -290,8 +288,8 @@ sha256_lastBlock_prolog: lsr r18 bst r21, 0 /* may be we should explain this ... */ bld r18, 5 /* now: r18 == length/8 (aka. length in bytes) */ - - + + movw r26, r22 /* X points to begin of msg */ tst r18 breq sha256_lastBlock_post_copy @@ -301,27 +299,27 @@ sha256_lastBlock_copy_loop: st Z+, r0 dec r1 brne sha256_lastBlock_copy_loop -sha256_lastBlock_post_copy: -sha256_lastBlock_insert_stuffing_bit: +sha256_lastBlock_post_copy: +sha256_lastBlock_insert_stuffing_bit: ldi r19, 0x80 - mov r0,r19 + mov r0,r19 ldi r19, 0x07 and r19, r20 /* if we are in bitmode */ breq 2f /* no bitmode */ -1: +1: lsr r0 dec r19 brne 1b ld r19, X /* maybe we should do some ANDing here, just for safety */ or r0, r19 -2: +2: st Z+, r0 inc r18 /* checking stuff here */ cpi r18, 64-8+1 - brsh 0f + brsh 0f rjmp sha256_lastBlock_insert_zeros 0: /* oh shit, we landed here */ @@ -329,15 +327,15 @@ sha256_lastBlock_insert_stuffing_bit: ldi r19, 64 sub r19, r18 breq 2f -1: +1: st Z+, r1 dec r19 - brne 1b -2: + brne 1b +2: sbiw r30, 63 sbiw r30, 1 movw r22, r30 - + push r31 push r30 push r25 @@ -351,7 +349,7 @@ sha256_lastBlock_insert_stuffing_bit: pop r25 pop r30 pop r31 - + /* now we should subtract 512 from length */ movw r26, r24 adiw r26, 4*8+1 /* we can skip the lowest byte */ @@ -365,11 +363,11 @@ sha256_lastBlock_insert_stuffing_bit: st X+, r19 dec r18 brne 1b - + ; clr r18 /* not neccessary ;-) */ /* reset Z pointer to begin of block */ -sha256_lastBlock_insert_zeros: +sha256_lastBlock_insert_zeros: ldi r19, 64-8 sub r19, r18 breq sha256_lastBlock_insert_length @@ -405,19 +403,18 @@ sha256_lastBlock_insert_length: sha256_lastBlock_epilog: in r30, SPL in r31, SPH - in r1, SREG + in r0, SREG adiw r30, 63 ; lo8(64) adiw r30, 1 ; hi8(64) cli out SPL, r30 + out SREG,r0 out SPH, r31 - out SREG,r1 clr r1 - clr r0 ret /**/ -;########################################################### +;########################################################### .global sha256_nextBlock ; === sha256_nextBlock === @@ -471,16 +468,16 @@ sha256_nextBlock: in r20, SPL in r21, SPH movw r18, r20 ;backup SP -; movw r26, r20 ; X points to free space on stack +; movw r26, r20 ; X points to free space on stack movw r30, r22 ; Z points to message subi r20, lo8(sha256_nextBlock_localSpace) ;sbiw can do only up to 63 sbci r21, hi8(sha256_nextBlock_localSpace) - movw r26, r20 ; X points to free space on stack + movw r26, r20 ; X points to free space on stack in r0, SREG cli ; we want to be uninterrupted while updating SP out SPL, r20 - out SPH, r21 out SREG, r0 + out SPH, r21 push r18 push r19 push r24 @@ -488,29 +485,29 @@ sha256_nextBlock: ; now we fill the w array with message (think about endianess) adiw r26, 1 ; X++ ldi r20, 16 -sha256_nextBlock_wcpyloop: +sha256_nextBlock_wcpyloop: ld r23, Z+ ld r22, Z+ ld r19, Z+ ld r18, Z+ st X+, r18 st X+, r19 - st X+, r22 + st X+, r22 st X+, r23 dec r20 brne sha256_nextBlock_wcpyloop /* for (i=16; i<64; ++i){ - w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16]; + w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16]; } */ /* r25,r24,r23,r24 (r21,r20) are function values r19,r18,r17,r16 are the accumulator r15,r14,r13,rBck1 are backup1 - r11,r10,r9 ,r8 are xor accu + r11,r10,r9 ,r8 are xor accu r1 is round counter */ ldi r20, 64-16 mov LoopC, r20 -sha256_nextBlock_wcalcloop: +sha256_nextBlock_wcalcloop: movw r30, r26 ; cp X to Z sbiw r30, 63 sbiw r30, 1 ; substract 64 = 16*4 @@ -544,7 +541,7 @@ sigma0_shr: lsr Bck4 ror Bck3 ror Bck2 - ror Bck1 + ror Bck1 dec Func2 brne sigma0_shr eor XAccu1, Bck1 @@ -586,7 +583,7 @@ sigma0_shr: sigma1_shr: lsr Bck4 ror Bck3 - ror Bck2 + ror Bck2 dec Func2 brne sigma1_shr eor XAccu1, Bck2 @@ -612,17 +609,17 @@ sigma1_shr: push r30 push r31 ldi r25, 8*4 /* 8 32-bit values to copy from ctx to a array */ -init_a_array: +init_a_array: ld r1, Z+ st X+, r1 dec r25 brne init_a_array - + /* now the real fun begins */ /* for (i=0; i<64; ++i){ t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i]; t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]); - memmove(&(a[1]), &(a[0]), 7*4); // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; + memmove(&(a[1]), &(a[0]), 7*4); // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; a[4] += t1; a[0] = t1 + t2; } */ @@ -630,7 +627,7 @@ init_a_array: sbiw r26, 8*4 /* X still points at a[7]+1*/ movw r28, r26 ldi r30, lo8(sha256_kv) - ldi r31, hi8(sha256_kv) + ldi r31, hi8(sha256_kv) dec r27 /* X - (64*4 == 256) */ ldi r25, 64 mov LoopC, r25 @@ -667,18 +664,18 @@ sha256_main_loop: eor T4, Bck4 /* done, CH(x,y,z) is in T */ /* now SIGMA1(a[4]) */ ldd Bck4, Y+4*4 /* think about using it from Func reg above*/ - ldd Bck1, Y+4*4+1 + ldd Bck1, Y+4*4+1 ldd Bck2, Y+4*4+2 - ldd Bck3, Y+4*4+3 /* load prerotate by 8-bit */ + ldd Bck3, Y+4*4+3 /* load prerotate by 8-bit */ movw Func1, Bck1 movw Func3, Bck3 - ldi r20, 2 - rcall bitrotl /* rotr(x,6) */ + ldi r20, 2 + rcall bitrotl /* rotr(x,6) */ movw XAccu1, Func1 movw XAccu3, Func3 movw Func1, Bck1 movw Func3, Bck3 - ldi r20, 3 + ldi r20, 3 rcall bitrotr /* rotr(x,11) */ eor XAccu1, Func1 eor XAccu2, Func2 @@ -686,7 +683,7 @@ sha256_main_loop: eor XAccu4, Func4 movw Func1, Bck3 /* this prerotates furteh 16 bits*/ movw Func3, Bck1 /* so we have now prerotated by 24 bits*/ - ldi r20, 1 + ldi r20, 1 rcall bitrotr /* rotr(x,11) */ eor XAccu1, Func1 eor XAccu2, Func2 @@ -770,7 +767,7 @@ sha256_main_loop: rcall bitrotr movw Accu1, Func1 movw Accu3, Func3 /* Accu = shr(a[0], 2) */ - movw Func1, Bck3 + movw Func1, Bck3 movw Func3, Bck1 /* prerotate by 16 bits */ ldi r20, 3 rcall bitrotl @@ -823,12 +820,12 @@ a_shift_loop: std Y+4*0+1, Accu2 std Y+4*0+2, Accu3 std Y+4*0+3, Accu4 /* a array updated */ - - + + dec LoopC breq update_state rjmp sha256_main_loop ;brne sha256_main_loop -update_state: +update_state: /* update state */ /* pointers to state should still exist on the stack ;-) */ pop r31 @@ -838,7 +835,7 @@ update_state_loop: ldd Accu1, Z+0 ldd Accu2, Z+1 ldd Accu3, Z+2 - ldd Accu4, Z+3 + ldd Accu4, Z+3 ld Func1, Y+ ld Func2, Y+ ld Func3, Y+ @@ -854,33 +851,32 @@ update_state_loop: dec r21 brne update_state_loop /* now we just have to update the length */ - adiw r30, 1 /* since we add 512, we can simply skip the LSB */ + adiw r30, 1 /* since we add 512, we can simply skip the LSB */ ldi r21, 2 ldi r22, 6 ld r20, Z add r20, r21 - st Z+, r20 + st Z+, r20 clr r21 -sha256_nextBlock_fix_length: +sha256_nextBlock_fix_length: brcc sha256_nextBlock_epilog ld r20, Z adc r20, r21 st Z+, r20 dec r22 brne sha256_nextBlock_fix_length - + ; EPILOG sha256_nextBlock_epilog: /* now we should clean up the stack */ - + pop r21 pop r20 in r0, SREG cli ; we want to be uninterrupted while updating SP out SPL, r20 - out SPH, r21 out SREG, r0 - + out SPH, r21 clr r1 pop r29 pop r28 @@ -897,10 +893,10 @@ sha256_nextBlock_epilog: pop r7 pop r6 pop r5 - pop r4 + pop r4 ret -sha256_kv: ; round-key-vector stored in ProgMem +sha256_kv: ; round-key-vector stored in ProgMem .word 0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c .word 0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b .word 0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9 @@ -910,10 +906,10 @@ sha256_kv: ; round-key-vector stored in ProgMem .word 0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e .word 0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671 - -;########################################################### -.global sha256_init +;########################################################### + +.global sha256_init ;uint32_t sha256_init_vector[]={ ; 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, ; 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 }; @@ -929,26 +925,26 @@ sha256_init: ldi r30, lo8((sha256_init_vector)) ldi r31, hi8((sha256_init_vector)) ldi r22, 32+8 -sha256_init_vloop: - lpm r23, Z+ +sha256_init_vloop: + lpm r23, Z+ st X+, r23 dec r22 brne sha256_init_vloop ret - + sha256_init_vector: .word 0xE667, 0x6A09 -.word 0xAE85, 0xBB67 -.word 0xF372, 0x3C6E -.word 0xF53A, 0xA54F -.word 0x527F, 0x510E -.word 0x688C, 0x9B05 -.word 0xD9AB, 0x1F83 +.word 0xAE85, 0xBB67 +.word 0xF372, 0x3C6E +.word 0xF53A, 0xA54F +.word 0x527F, 0x510E +.word 0x688C, 0x9B05 +.word 0xD9AB, 0x1F83 .word 0xCD19, 0x5BE0 .word 0x0000, 0x0000 .word 0x0000, 0x0000 -;########################################################### +;########################################################### .global rotl32 ; === ROTL32 === @@ -971,22 +967,23 @@ rotl32: bitrotl: clr r21 clc -bitrotl_loop: +bitrotl_loop: tst r20 breq fixrotl +2: rol r22 rol r23 rol r24 rol r25 rol r21 dec r20 - rjmp bitrotl_loop + brne 2b fixrotl: or r22, r21 ret - -;########################################################### + +;########################################################### .global rotr32 ; === ROTR32 === @@ -1009,23 +1006,24 @@ rotr32: bitrotr: clr r21 clc -bitrotr_loop: +bitrotr_loop: tst r20 breq fixrotr +2: ror r25 ror r24 ror r23 ror r22 ror r21 dec r20 - rjmp bitrotr_loop + brne 2b fixrotr: or r25, r21 ret - - -;########################################################### - + + +;########################################################### + .global change_endian32 ; === change_endian32 === ; function that changes the endianess of a 32-bit word @@ -1037,6 +1035,6 @@ change_endian32: mov r22, r25 mov r23, r24 mov r24, r21 - mov r25, r20 + mov r25, r20 ret