/*
 * Author:  Daniel Otte
 *
 * License: GPL
 */
; SHA1 implementation in assembler for AVR

SHA1_BLOCK_BITS = 512
SHA1_HASH_BITS  = 160

;-----------------------------------------------------------
; precall: save r0, r1, r18-r27, r30 and r31 on the stack,
; then clear r1. Undone by postcall.
;-----------------------------------------------------------
.macro precall
	push r0
	push r1
	push r18
	push r19
	push r20
	push r21
	push r22
	push r23
	push r24
	push r25
	push r26
	push r27
	push r30
	push r31
	clr  r1
.endm

;-----------------------------------------------------------
; postcall: restore everything precall saved (reverse order).
;-----------------------------------------------------------
.macro postcall
	pop  r31
	pop  r30
	pop  r27
	pop  r26
	pop  r25
	pop  r24
	pop  r23
	pop  r22
	pop  r21
	pop  r20
	pop  r19
	pop  r18
	pop  r1
	pop  r0
.endm

;-----------------------------------------------------------
; hexdump length: debugging aid.
; Sends '\r' and '\n' through uart_putc, then hands the
; address in X and a byte count to uart_hexdump. Counts above
; 16 are split into 16-byte pieces by expanding the macro
; recursively, with X advanced by 16 for each piece.
; uart_putc and uart_hexdump are external; their argument
; convention (r24 = char, r25:r24 = pointer, r23:r22 = count)
; is inferred from the register setup here - verify against
; the UART module.
;-----------------------------------------------------------
.macro hexdump length
	push r27
	push r26
	ldi  r25, '\r'
	mov  r24, r25
	call uart_putc
	ldi  r25, '\n'
	mov  r24, r25
	call uart_putc
	pop  r26
	pop  r27
	movw r24, r26
.if \length > 16
	ldi  r22, lo8(16)
	ldi  r23, hi8(16)
	push r27
	push r26
	call uart_hexdump
	pop  r26
	pop  r27
	adiw r26, 16
	hexdump \length-16
.else
	ldi  r22, lo8(\length)
	ldi  r23, hi8(\length)
	call uart_hexdump
.endif
.endm

;-----------------------------------------------------------
; delay: currently expands to nothing; the busy-wait body is
; kept inside the comment so it can be re-enabled.
;-----------------------------------------------------------
.macro delay
/*
	push r0
	push r1
	clr  r0
1:	clr  r1
2:	dec  r1
	brne 2b
	dec  r0
	brne 1b
	pop  r1
	pop  r0
// */
.endm

;-----------------------------------------------------------
; dbg_hexdump length: debug dump of the memory X points to.
; Currently expands to nothing; remove the comment markers
; around the body to turn the dumps on.
;-----------------------------------------------------------
.macro dbg_hexdump length
/*
	precall
	hexdump \length
	postcall
// */
.endm

.section .text

; I/O addresses of the stack pointer and the status register
SPL  = 0x3D
SPH  = 0x3E
SREG = 0x3F

;
; sha1_ctx_t is:
;
;  [h0][h1][h2][h3][h4][length]
;  hn is 32 bit large, length is 64 bit large

;###########################################################

.global sha1_ctx2hash
; === sha1_ctx2hash ===
; Converts a state into a normal hash (bytestring): each of the
; five 32-bit state words is copied to the destination with its
; four bytes in reversed order.
;  param1: the 16-bit destination pointer
;          given in r25,r24 (r25 is most significant)
;  param2: the 16-bit pointer to sha1_ctx structure
;          given in r23,r22
;  modifies: r0, r20, r21, X (r27:r26), Z (r31:r30), flags
sha1_ctx2hash:
	movw r30, r24                   ; Z = destination
	movw r26, r22                   ; X = ctx
	sbiw r26, 4                     ; bias X so that the +8 below lands one past word 0
	ldi  r21, SHA1_HASH_BITS/32     ; five words to convert
sha1_ctx2hash_word:
	adiw r26, 8                     ; X = one past the end of the current word
	ldi  r20, 4                     ; four bytes per word
sha1_ctx2hash_byte:
	ld   r0, -X                     ; walk the word from its highest address down
	st   Z+, r0                     ; ... while the output grows upwards
	dec  r20
	brne sha1_ctx2hash_byte
	dec  r21
	brne sha1_ctx2hash_word
	ret

;###########################################################

.global sha1
; === sha1 ===
; this function calculates SHA-1 hashes from messages in RAM
;  param1: the 16-bit hash destination pointer
;          given in r25,r24 (r25 is most significant)
;  param2: the 16-bit pointer to message
;          given in r23,r22
;  param3: 32-bit length value (length of message in bits)
;          given in r21,r20,r19,r18
;-----------------------------------------------------------
; sha1: one-shot SHA-1 of a message in RAM
;  In:    r25:r24          destination pointer for the hash
;         r23:r22          pointer to the message
;         r21:r20:r19:r18  length of the message in bits
;  Stack: reserves 5*4+8 = 28 bytes for a temporary sha1_ctx
;  Register roles inside the body:
;         r17:r16          pointer to the ctx on the stack
;         r11:r10:r9:r8    number of bits still to be hashed
;         r13:r12          pointer to the next unprocessed byte
;  Relies on r1 == 0 on entry (avr-gcc convention) and on
;  sha1_nextBlock / sha1_init returning with r1 == 0.
;-----------------------------------------------------------
sha1:
sha1_prolog:
	push r8
	push r9
	push r10
	push r11
	push r12
	push r13
	push r16
	push r17
	/* carve the ctx out of the stack */
	in   r16, SPL
	in   r17, SPH
	subi r16, 5*4+8
	sbci r17, 0
	in   r0, SREG
	cli                             ; SP must not be observed half updated
	out  SPL, r16
	out  SPH, r17
	out  SREG, r0

	push r25                        ; keep the destination pointer for the end
	push r24
	/* r17:r16 = SP+1 = first byte of the ctx.
	   A 16-bit add is required here: INC leaves the carry flag
	   untouched, so "inc r16 / adc r17, r1" would lose the carry
	   whenever the low byte wraps from 0xFF to 0x00. */
	subi r16, lo8(-1)
	sbci r17, hi8(-1)

	movw r8, r18                    ; backup of length
	movw r10, r20
	movw r12, r22                   ; backup of msg-ptr

	movw r24, r16
	rcall sha1_init

	/* while (length >= 512) hash one full block */
sha1_block_loop:
	tst  r11
	brne sha1_full_block            ; bits 24..31 set -> certainly >= 512
	tst  r10
	brne sha1_full_block            ; bits 16..23 set -> certainly >= 512
	ldi  r19, hi8(SHA1_BLOCK_BITS)
	cp   r9, r19
	brlo sha1_final                 ; fewer than 512 bits left
sha1_full_block:
	movw r24, r16
	movw r22, r12
	rcall sha1_nextBlock            ; clobbers r18-r27, r30, r31; returns r1 == 0
	/* advance the saved message pointer by one block (64 bytes);
	   r23:r22 cannot be used for this, sha1_nextBlock overwrites it */
	ldi  r19, SHA1_BLOCK_BITS/8
	add  r12, r19
	adc  r13, r1
	/* length -= 512 (the lowest byte is unaffected) */
	ldi  r19, hi8(SHA1_BLOCK_BITS)
	sub  r9, r19
	sbc  r10, r1
	sbc  r11, r1
	rjmp sha1_block_loop

sha1_final:
	movw r24, r16
	movw r22, r12
	movw r20, r8                    ; remaining length, now < 512
	rcall sha1_lastBlock

	pop  r24                        ; destination pointer
	pop  r25
	movw r22, r16
	rcall sha1_ctx2hash

sha1_epilog:
	/* release the ctx */
	in   r30, SPL
	in   r31, SPH
	adiw r30, 5*4+8
	in   r0, SREG
	cli
	out  SPL, r30
	out  SPH, r31
	out  SREG, r0
	pop  r17
	pop  r16
	pop  r13
	pop  r12
	pop  r11
	pop  r10
	pop  r9
	pop  r8
	ret

;###########################################################

; block MUST NOT be larger than 64 bytes

.global sha1_lastBlock
; === sha1_lastBlock ===
; this function does padding & Co. for calculating SHA-1 hashes
;  param1: the 16-bit pointer to sha1_ctx structure
;          given in r25,r24 (r25 is most significant)
;  param2: an 16-bit pointer to 64 byte block to hash
;          given in r23,r22
;  param3: an 16-bit integer specifing length of block in bits
;          given in r21,r20 (0..512)
;
; Outline:
;  - a length of exactly 512 is hashed as a full block first and
;    then treated as length 0
;  - the message tail is copied into a 64-byte stack buffer and
;    the 1-bit is appended
;  - if the 64-bit length field no longer fits, the buffer is
;    zero filled, hashed, and a second all-padding block is built
;  - ctx.length + length is stored big-endian in the last 8 bytes
;    and the block is hashed
; Returns with r0 == 0 and r1 == 0.

; NOTE(review): this symbol is not referenced in the visible code;
; the routine reserves exactly 64 bytes.
sha1_lastBlock_localSpace = (SHA1_BLOCK_BITS/8+1)

sha1_lastBlock:
	clr  r1                         ; r1 is used as constant zero throughout
	tst  r20
	brne sha1_lastBlock_prolog
	cpi  r21, hi8(SHA1_BLOCK_BITS)
	brne sha1_lastBlock_prolog
	/* length == 512: a complete block, hash it right away */
	push r25
	push r24
	push r23
	push r22
	rcall sha1_nextBlock
	pop  r22
	pop  r23
	pop  r24
	pop  r25
	/* nothing is left of the message. Both length bytes must be
	   cleared: sha1_nextBlock overwrites r20 and r21. */
	clr  r20
	clr  r21
sha1_lastBlock_prolog:
	/* allocate the 64-byte block buffer on the stack */
	in   r30, SPL
	in   r31, SPH
	in   r19, SREG
	subi r30, lo8(64)
	sbci r31, hi8(64)
	cli
	out  SPL, r30
	out  SPH, r31
	out  SREG, r19

	adiw r30, 1                     ; SP points to the next free byte, buffer starts at SP+1
	/* r18 = length / 8 = number of complete message bytes:
	   bits 3..7 of the low byte give 0..31, bit 8 of the length
	   (bit 0 of r21) is worth 32 bytes */
	mov  r18, r20
	lsr  r18
	lsr  r18
	lsr  r18
	bst  r21, 0
	bld  r18, 5

	movw r26, r22                   ; X points to begin of msg
	mov  r19, r18
	tst  r19
	breq sha1_lastBlock_post_copy
sha1_lastBlock_copy_loop:
	ld   r0, X+
	st   Z+, r0
	dec  r19
	brne sha1_lastBlock_copy_loop
sha1_lastBlock_post_copy:
sha1_lastBlock_insert_stuffing_bit:
	ldi  r19, 0x80
	mov  r0, r19                    ; stuffing bit for a byte aligned message
	ldi  r19, 0x07
	and  r19, r20                   ; n = number of message bits in a partial byte
	breq sha1_lastBlock_store_stuffing
sha1_lastBlock_shift_stuffing:
	lsr  r0                         ; r0 = 0x80 >> n
	dec  r19
	brne sha1_lastBlock_shift_stuffing
	ld   r22, X                     ; the partial byte (r23:r22 is free, X holds the pointer)
	mov  r19, r0
	lsl  r19
	neg  r19                        ; r19 = mask of the n most significant bits
	and  r22, r19                   ; drop whatever the caller left below the message bits
	or   r0, r22
sha1_lastBlock_store_stuffing:
	st   Z+, r0
	inc  r18                        ; r18 = bytes used in the buffer

	/* does the 8-byte length still fit behind the data? */
	cpi  r18, 64-8+1
	brsh sha1_lastBlock_extra_block
	rjmp sha1_lastBlock_insert_zeros
sha1_lastBlock_extra_block:
	/* it does not: zero fill this block and hash it */
	ldi  r19, 64
	sub  r19, r18
	breq sha1_lastBlock_flush_extra
sha1_lastBlock_fill_extra:
	st   Z+, r1
	dec  r19
	brne sha1_lastBlock_fill_extra
sha1_lastBlock_flush_extra:
	sbiw r30, 63                    ; Z back to the start of the buffer
	sbiw r30, 1                     ; (sbiw takes at most 63)
	movw r22, r30

	push r31
	push r30
	push r25
	push r24
	push r21
	push r20
	rcall sha1_nextBlock
	pop  r20
	pop  r21
	pop  r24
	pop  r25
	pop  r30
	pop  r31

	/* sha1_nextBlock added 512 to ctx.length although this block
	   held fewer message bits; take the 512 back */
	movw r26, r24
	adiw r26, 4*5+1                 ; the lowest length byte is unaffected
	ld   r19, X
	subi r19, hi8(512)
	st   X+, r19
	ldi  r18, 6
sha1_lastBlock_borrow_loop:
	ld   r19, X
	sbci r19, 0
	st   X+, r19
	dec  r18
	brne sha1_lastBlock_borrow_loop
	/* here r18 == 0 (nothing used in the fresh block) and Z points
	   to the start of the buffer again */

sha1_lastBlock_insert_zeros:
	ldi  r19, 64-8
	sub  r19, r18
	breq sha1_lastBlock_insert_length
sha1_lastBlock_zero_loop:
	st   Z+, r1
	dec  r19
	brne sha1_lastBlock_zero_loop

sha1_lastBlock_insert_length:
	movw r26, r24                   ; X points to state
	adiw r26, 5*4                   ; X points to state.length (low byte first)
	adiw r30, 8                     ; Z points one after the last byte of block
	/* block[56..63] = big-endian (state.length + length) */
	ld   r0, X+
	add  r0, r20
	st   -Z, r0
	ld   r0, X+
	adc  r0, r21
	st   -Z, r0
	ldi  r19, 6
sha1_lastBlock_length_loop:
	ld   r0, X+
	adc  r0, r1                     ; propagate the carry; DEC below leaves C alone
	st   -Z, r0
	dec  r19
	brne sha1_lastBlock_length_loop

	sbiw r30, 64-8                  ; Z = start of the buffer
	movw r22, r30
	rcall sha1_nextBlock

sha1_lastBlock_epilog:
	/* free the block buffer */
	in   r30, SPL
	in   r31, SPH
	in   r19, SREG
	adiw r30, 63                    ; 63 + 1 = 64, adiw takes at most 63
	adiw r30, 1
	cli
	out  SPL, r30
	out  SPH, r31
	out  SREG, r19
	clr  r1
	clr  r0
	ret

;###########################################################

.global sha1_nextBlock
; === sha1_nextBlock ===
; this is the core function for calculating SHA-1 hashes
;  param1: the 16-bit pointer to sha1_ctx structure
;          given in r25,r24 (r25 is most significant)
;  param2: an 16-bit pointer to 64 byte block to hash
;          given in r23,r22
;
; Effect: runs the 80 rounds over the block, adds the result to the
;         five state words and adds 512 to the 64-bit length.
; Saved and restored: r10-r17, r28, r29.
; Overwritten: r0, r18-r27, r30, r31, flags. r1 is expected to be 0
;         on entry (it is used as constant zero) and is 0 on return.
;
; Stack frame (addresses ascending, SP' = SP after allocation):
;   SP'+1  .. SP'+20   a[0..4]  working variables a, b, c, d, e
;   SP'+21 .. SP'+84   w[0..15] message schedule, used as a ring
;   SP'+85 .. SP'+88   unused
; All 32-bit values are kept least significant byte first.
;
; Debugging: define the assembler symbol SHA1_DEBUG (for example
; "SHA1_DEBUG = 1" near the top of the file) to assemble the
; dbg_hexdump hooks inside the round loop. Without it the hooks
; cost nothing.
sha1_nextBlock_localSpace = (16+5+1)*4	; 88 bytes: 16 words for w[], 5 for a[], 1 spare

xtmp  = 0	; index of the current 20-round group (0..3)
xNULL = 1	; constant zero
W1    = 10	; W2:W1 = pointer to w[0]
W2    = 11
T1    = 12	; T4:T3:T2:T1 = the new value for a
T2    = 13
T3    = 14
T4    = 15
LoopC = 16	; round counter, named t in FIPS 180-2
S     = 17	; byte offset of w[t mod 16]
tmp1  = 18
tmp2  = 19
tmp3  = 20
tmp4  = 21
F1    = 22	; F4:F3:F2:F1 = ROTL5(a), later f_t(b,c,d)
F2    = 23
F3    = 24
F4    = 25

/* byteorder: high number <--> high significance */
sha1_nextBlock:
	/* make room for the local variables */
	push r10
	push r11
	push r12
	push r13
	push r14
	push r15
	push r16
	push r17
	push r28
	push r29
	in   r20, SPL
	in   r21, SPH
	movw r18, r20                   ; backup SP
	subi r20, lo8(sha1_nextBlock_localSpace)	; sbiw can do only up to 63
	sbci r21, hi8(sha1_nextBlock_localSpace)
	movw r26, r20                   ; X points to free space on stack
	in   r0, SREG
	cli                             ; we want to be uninterrupted while updating SP
	out  SPL, r20
	out  SPH, r21
	out  SREG, r0

	push r18
	push r19                        ; old SP, popped again in the epilog
	push r24
	push r25                        ; param1 will be needed later

	/* load a[] with state */
	movw r28, r24                   ; Y = pointer to state
	adiw r26, 1                     ; X = first byte of the frame

	ldi  LoopC, 5*4
sha1_nextBlock_load_a:
	ld   tmp1, Y+
	st   X+, tmp1
	dec  LoopC
	brne sha1_nextBlock_load_a

	movw W1, r26                    ; save pointer to w[0]
	/* load w[] with the message, converting each big-endian word
	   to least-significant-byte-first */
	movw r30, r22                   ; Z = param2 (pointer to msg)
	ldi  LoopC, 16
sha1_nextBlock_load_w:
	ldd  tmp1, Z+3
	st   X+, tmp1
	ldd  tmp1, Z+2
	st   X+, tmp1
	ldd  tmp1, Z+1
	st   X+, tmp1
	ld   tmp1, Z
	st   X+, tmp1
	adiw r30, 4
	dec  LoopC
	brne sha1_nextBlock_load_w

	/* LoopC is 0 here, so the round counter needs no extra clear */
	clr  xtmp
sha1_nextBlock_mainloop:
	mov  S, LoopC
	lsl  S
	lsl  S
	andi S, 0x3C                    ; S = (t mod 16) * 4
	/* load w[s] */
	movw r26, W1
	add  r26, S                     ; X points at w[s]
	adc  r27, xNULL
	ld   T1, X+
	ld   T2, X+
	ld   T3, X+
	ld   T4, X+

.ifdef SHA1_DEBUG
	/* dump T (copied to the stack so that X can point at it) */
	push r26
	push r27
	push T4
	push T3
	push T2
	push T1
	in   r26, SPL
	in   r27, SPH
	adiw r26, 1
	dbg_hexdump 4
	pop  T1
	pop  T2
	pop  T3
	pop  T4
	pop  r27
	pop  r26
.endif

	cpi  LoopC, 16
	brlo sha1_nextBlock_mainloop_core	; rounds 0..15 use the message words as they are
	/* update w[s]: w[s] = ROTL1(w[s] ^ w[s+2] ^ w[s+8] ^ w[s+13]),
	   indices taken mod 16 */
	ldi  tmp1, 2*4
	rcall sha1_nextBlock_xor_w
	ldi  tmp1, 8*4
	rcall sha1_nextBlock_xor_w
	ldi  tmp1, 13*4
	rcall sha1_nextBlock_xor_w
	rjmp sha1_nextBlock_rotl_w

	/* local subroutine: T ^= w[(s + tmp1) mod 64 bytes] */
sha1_nextBlock_xor_w:
	add  tmp1, S
	andi tmp1, 0x3f
	movw r26, W1
	add  r26, tmp1
	adc  r27, xNULL
	ld   tmp2, X+
	eor  T1, tmp2
	ld   tmp2, X+
	eor  T2, tmp2
	ld   tmp2, X+
	eor  T3, tmp2
	ld   tmp2, X+
	eor  T4, tmp2
	ret

sha1_nextBlock_rotl_w:
	/* T = ROTL1(T), then store it back to w[s] */
	mov  tmp2, T4
	rol  tmp2                       ; carry = top bit of T
	rol  T1
	rol  T2
	rol  T3
	rol  T4
	movw r26, W1
	add  r26, S
	adc  r27, xNULL
	st   X+, T1
	st   X+, T2
	st   X+, T3
	st   X+, T4

sha1_nextBlock_mainloop_core:
	/* the core function; T already contains w[s] */
	movw r26, W1
	sbiw r26, 4*1                   ; X points at a[4] aka e
	ld   tmp1, X+
	add  T1, tmp1
	ld   tmp1, X+
	adc  T2, tmp1
	ld   tmp1, X+
	adc  T3, tmp1
	ld   tmp1, X+
	adc  T4, tmp1                   ; T = w[s] + e
	sbiw r26, 4*5                   ; X points at a[0] aka a
	ld   F1, X+
	ld   F2, X+
	ld   F3, X+
	ld   F4, X+                     ; X points at a[1] aka b
	mov  tmp1, F4
	ldi  tmp2, 5
sha1_nextBlock_rotl5:
	rol  tmp1                       ; carry = current top bit of F
	rol  F1
	rol  F2
	rol  F3
	rol  F4
	dec  tmp2                       ; DEC leaves the carry alone
	brne sha1_nextBlock_rotl5

	add  T1, F1
	adc  T2, F2
	adc  T3, F3
	adc  T4, F4                     ; T = ROTL(a,5) + e + w[s]

	/* select the constant and the function for this round:
	   xTable[xtmp] is the round number at which the next group of
	   20 rounds starts */
	ldi  r30, lo8(sha1_nextBlock_xTable)
	ldi  r31, hi8(sha1_nextBlock_xTable)
	add  r30, xtmp
	adc  r31, xNULL
	lpm  tmp1, Z
	cp   tmp1, LoopC
	brne sha1_nextBlock_load_k
	inc  xtmp
sha1_nextBlock_load_k:
	ldi  r30, lo8(sha1_nextBlock_KTable)
	ldi  r31, hi8(sha1_nextBlock_KTable)
	lsl  xtmp
	lsl  xtmp                       ; temporarily xtmp*4 = byte offset
	add  r30, xtmp
	adc  r31, xNULL
	lsr  xtmp
	lsr  xtmp

	lpm  tmp1, Z+
	add  T1, tmp1
	lpm  tmp1, Z+
	adc  T2, tmp1
	lpm  tmp1, Z+
	adc  T3, tmp1
	lpm  tmp1, Z+
	adc  T4, tmp1                   ; T = ROTL(a,5) + e + kt + w[s]

	/* Z-4 currently points at kt. The jump table follows the four
	   4-byte constants directly and has 4-byte entries as well, so
	   adding 3*4 gives the entry that belongs to kt. */
	movw r28, r26                   ; copy X in Y (a[1] aka b)
	adiw r30, 3*4
	clc
	ror  r31                        ; byte address -> word address for icall
	ror  r30

	/* every call computes one byte of f_t(b,c,d) and advances Y */
	icall
	mov  F1, tmp1
	icall
	mov  F2, tmp1
	icall
	mov  F3, tmp1
	icall

	add  T1, F1
	adc  T2, F2
	adc  T3, F3
	adc  T4, tmp1                   ; T = ROTL5(a) + f_t(b,c,d) + e + k_t + w[s]
	/* X points still at a[1] aka b, Y points at a[2] aka c */

	/* update a[] */
sha1_nextBlock_update_a:
	/* first we move all vars in a[] "one up": e=d, d=c, c=b, b=a */
	movw r28, W1
	sbiw r28, 4                     ; Y points at a[4] aka e

	ldi  tmp2, 4*4
sha1_nextBlock_shift_a:
	ld   tmp1, -Y
	std  Y+4, tmp1
	dec  tmp2
	brne sha1_nextBlock_shift_a
	/* Y points at a[0] aka a */

	movw r28, W1
	sbiw r28, 5*4
	/* store T in a[0] aka a */
	st   Y+, T1
	st   Y+, T2
	st   Y+, T3
	st   Y+, T4
	/* Y points at a[1] aka b */

	/* c = ROTR2(c), the same as ROTL30 of the old b */
	ldd  T1, Y+1*4
	ldd  T2, Y+1*4+1
	ldd  T3, Y+1*4+2
	ldd  T4, Y+1*4+3
	mov  tmp1, T1
	ldi  tmp2, 2
sha1_nextBlock_rotr2_c:
	ror  tmp1                       ; carry = current bottom bit of c
	ror  T4
	ror  T3
	ror  T2
	ror  T1
	dec  tmp2
	brne sha1_nextBlock_rotr2_c
	std  Y+1*4+0, T1
	std  Y+1*4+1, T2
	std  Y+1*4+2, T3
	std  Y+1*4+3, T4

.ifdef SHA1_DEBUG
	/* dump a[0..4] */
	push r27
	push r26
	movw r26, W1
	sbiw r26, 4*5
	dbg_hexdump 4*5
	pop  r26
	pop  r27
.endif

	inc  LoopC
	cpi  LoopC, 80
	brsh sha1_nextBlock_finish
	jmp  sha1_nextBlock_mainloop
/**************************************/
sha1_nextBlock_finish:
	sbiw r28, 4                     ; Y back from a[1] to a[0]

	/* add a[] to state and inc length */
	pop  r27
	pop  r26                        ; X = param1, points to state
	ldi  tmp4, 5
sha1_nextBlock_add_word:
	clc                             ; every 32-bit addition starts without carry
	ldi  tmp3, 4
sha1_nextBlock_add_byte:
	ld   tmp1, X
	ld   tmp2, Y+
	adc  tmp1, tmp2
	st   X+, tmp1
	dec  tmp3
	brne sha1_nextBlock_add_byte
	dec  tmp4
	brne sha1_nextBlock_add_word

	/* now length += 512 */
	adiw r26, 1                     ; we skip the least significant byte
	ld   tmp1, X
	ldi  tmp2, hi8(512)             ; 2
	add  tmp1, tmp2
	st   X+, tmp1
	ldi  tmp2, 6
sha1_nextBlock_len_carry:
	ld   tmp1, X
	adc  tmp1, xNULL
	st   X+, tmp1
	dec  tmp2
	brne sha1_nextBlock_len_carry

; EPILOG
sha1_nextBlock_epilog:
	/* now we should clean up the stack */
	pop  r21
	pop  r20                        ; r21:r20 = SP as it was before the frame
	in   r0, SREG
	cli                             ; we want to be uninterrupted while updating SP
	out  SPL, r20
	out  SPH, r21
	out  SREG, r0

	clr  r1                         ; sha1 and sha1_lastBlock use r1 as zero after this call
	pop  r29
	pop  r28
	pop  r17
	pop  r16
	pop  r15
	pop  r14
	pop  r13
	pop  r12
	pop  r11
	pop  r10
	ret

/* round numbers at which the next constant/function takes over;
   the final 0 never matches once the last group is reached */
sha1_nextBlock_xTable:
.byte 20,40,60,0

/* the four round constants K_t */
sha1_nextBlock_KTable:
.int	0x5a827999
.int	0x6ed9eba1
.int	0x8f1bbcdc
.int	0xca62c1d6

/* one entry per 20-round group, must directly follow KTable.
   NOTE(review): the address arithmetic in the round loop assumes
   every entry is a 4-byte jmp; verify that this still holds if the
   project is linked with relaxation enabled. */
sha1_nextBlock_JumpTable:
	jmp sha1_nextBlock_Ch
	jmp sha1_nextBlock_Parity
	jmp sha1_nextBlock_Maj
	jmp sha1_nextBlock_Parity

/* The three round functions work on one byte per call.
   In:  Y points at the current byte of b; the matching bytes of
        c and d are 4 and 8 bytes further up
   Out: tmp1 = result byte, Y advanced by one */

/* Ch(b,c,d) = (b & c) ^ (~b & d) */
sha1_nextBlock_Ch:
	ld   tmp1, Y+
	mov  tmp2, tmp1
	com  tmp2
	ldd  tmp3, Y+3                  ; load from c
	and  tmp1, tmp3
	ldd  tmp3, Y+7                  ; load from d
	and  tmp2, tmp3
	eor  tmp1, tmp2
	/* debug hook (disabled):
	precall
	ldi r24, lo8(ch_str)
	ldi r25, hi8(ch_str)
	call uart_putstr_P
	postcall
	*/
	ret

/* Maj(b,c,d) = (b & c) ^ (b & d) ^ (c & d) */
sha1_nextBlock_Maj:
	ld   tmp1, Y+
	mov  tmp2, tmp1
	ldd  tmp3, Y+3                  ; load from c
	and  tmp1, tmp3
	ldd  tmp4, Y+7                  ; load from d
	and  tmp2, tmp4
	eor  tmp1, tmp2
	and  tmp3, tmp4
	eor  tmp1, tmp3
	/* debug hook (disabled):
	precall
	ldi r24, lo8(maj_str)
	ldi r25, hi8(maj_str)
	call uart_putstr_P
	postcall
	*/
	ret

/* Parity(b,c,d) = b ^ c ^ d */
sha1_nextBlock_Parity:
	ld   tmp1, Y+
	ldd  tmp2, Y+3                  ; load from c
	eor  tmp1, tmp2
	ldd  tmp2, Y+7                  ; load from d
	eor  tmp1, tmp2
	/* debug hook (disabled):
	precall
	ldi r24, lo8(parity_str)
	ldi r25, hi8(parity_str)
	call uart_putstr_P
	postcall
	*/
	ret
/*
ch_str:			.asciz "\r\nCh"
maj_str:		.asciz "\r\nMaj"
parity_str:		.asciz "\r\nParity"
*/
;###########################################################

.global sha1_init
;void sha1_init(sha1_ctx_t *state){
;	DEBUG_S("\r\nSHA1_INIT");
;	state->h[0] = 0x67452301;
;	state->h[1] = 0xefcdab89;
;	state->h[2] = 0x98badcfe;
;	state->h[3] = 0x10325476;
;	state->h[4] = 0xc3d2e1f0;
;	state->length = 0;
;}
; param1: (r25,r24) 16-bit pointer to sha1_ctx_t struct in ram
; modifies: X(r26,r27), Z(r30,r31), r22, r23; r1 is cleared
sha1_init:
	movw r26, r24                   ; X = param1
	ldi  r30, lo8((sha1_init_vector))
	ldi  r31, hi8((sha1_init_vector))
	ldi  r22, 5*4                   ; bytes to copy
sha1_init_vloop:
	lpm  r23, Z+                    ; initial hash values live in flash
	st   X+, r23
	dec  r22
	brne sha1_init_vloop
	ldi  r22, 8                     ; the 64-bit length follows the state words
	clr  r1                         ; this should not be needed
sha1_init_lloop:
	st   X+, r1
	dec  r22
	brne sha1_init_lloop
	ret

sha1_init_vector:
.int 0x67452301;
.int 0xefcdab89;
.int 0x98badcfe;
.int 0x10325476;
.int 0xc3d2e1f0;

/*
   Unused helper routines, kept for reference only.
   NOTE(review): rotl32 continues with "rjmp rotr32" after its
   byte-wise step; this looks like it should be "rjmp rotl32".
   Check before re-enabling.

;###########################################################

.global rotl32
; === ROTL32 ===
; function that rotates a 32 bit word to the left
;  param1: the 32-bit word to rotate
;          given in r25,r24,r23,r22 (r25 is most significant)
;  param2: an 8-bit value telling how often to rotate
;          given in r20
; modifys: r21, r22
rotl32:
	cpi r20, 8
	brlo bitrotl
	mov r21, r25
	mov r25, r24
	mov r24, r23
	mov r23, r22
	mov r22, r21
	subi r20, 8
	rjmp rotr32
bitrotl:
	clr r21
	clc
bitrotl_loop:
	tst r20
	breq fixrotl
	rol r22
	rol r23
	rol r24
	rol r25
	rol r21
	dec r20
	rjmp bitrotl_loop
fixrotl:
	or r22, r21
	ret

;###########################################################

.global rotr32
; === ROTR32 ===
; function that rotates a 32 bit word to the right
;  param1: the 32-bit word to rotate
;          given in r25,r24,r23,22 (r25 is most significant)
;  param2: an 8-bit value telling how often to rotate
;          given in r20
; modifys: r21, r22
rotr32:
	cpi r20, 8
	brlo bitrotr
	mov r21, r22
	mov r22, r23
	mov r23, r24
	mov r24, r25
	mov r25, r21
	subi r20, 8
	rjmp rotr32
bitrotr:
	clr r21
	clc
bitrotr_loop:
	tst r20
	breq fixrotr
	ror r25
	ror r24
	ror r23
	ror r22
	ror r21
	dec r20
	rjmp bitrotr_loop
fixrotr:
	or r25, r21
	ret

;###########################################################

.global change_endian32
; === change_endian32 ===
; function that changes the endianess of a 32-bit word
;  param1: the 32-bit word
;          given in r25,r24,r23,22 (r25 is most significant)
; modifys: r21, r22
change_endian32:
	movw r20, r22 ; (r22,r23) --> (r20,r21)
	mov r22, r25
	mov r23, r24
	mov r24, r21
	mov r25, r20
	ret
*/