/* sha1-asm.S */
/*
    This file is part of the AVR-Crypto-Lib.
    Copyright (C) 2008  Daniel Otte (daniel.otte@rub.de)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see http://www.gnu.org/licenses/ .
*/
/*
 * Author:  Daniel Otte
 *
 * License: GPLv3 or later
 */
; SHA1 implementation in assembler for AVR

SHA1_BLOCK_BITS = 512
SHA1_HASH_BITS  = 160

/*
 * precall
 *   Pushes r0, r1, r18-r27, r30 and r31 (in exactly that order) and then
 *   clears r1. Must be paired with postcall.
 */
.macro precall
	.irp	reg, r0, r1, r18, r19, r20, r21, r22, r23, r24, r25, r26, r27, r30, r31
	push	\reg
	.endr
	clr	r1
.endm

/*
 * postcall
 *   Pops everything precall pushed, in the reverse order.
 */
.macro postcall
	.irp	reg, r31, r30, r27, r26, r25, r24, r23, r22, r21, r20, r19, r18, r1, r0
	pop	\reg
	.endr
.endm

/*
 * hexdump length
 *   Debug helper. Sends CR and LF through uart_putc, then hands the memory
 *   at X to uart_hexdump in pieces of at most 16 bytes (pointer in r25:r24,
 *   byte count in r23:r22). Pieces beyond the first are produced by the
 *   macro invoking itself with length-16; X is advanced by 16 in between.
 *   X is saved on the stack around the calls; r22-r25 are overwritten.
 */
.macro hexdump length
	push	r27
	push	r26
	ldi	r25, '\r'
	mov	r24, r25
	call	uart_putc
	ldi	r25, '\n'
	mov	r24, r25
	call	uart_putc
	pop	r26
	pop	r27
	movw	r24, r26		/* pointer argument = X */
.if \length > 16
	push	r27			/* keep X across the call */
	push	r26
	ldi	r22, lo8(16)
	ldi	r23, hi8(16)
	call	uart_hexdump
	pop	r26
	pop	r27
	adiw	r26, 16			/* continue with the next 16 bytes */
	hexdump	\length-16
.else
	ldi	r22, lo8(\length)
	ldi	r23, hi8(\length)
	call	uart_hexdump
.endif
.endm

/*
 * delay
 *   Currently expands to nothing; the busy-wait body is disabled:
 *
 *	push r0
 *	push r1
 *	clr r0
 * 1:	clr r1
 * 2:	dec r1
 *	brne 2b
 *	dec r0
 *	brne 1b
 *	pop r1
 *	pop r0
 */
.macro delay
.endm

/*
 * dbg_hexdump length
 *   X points to Block.
 *   Currently expands to nothing; the debugging body is disabled:
 *
 *	precall
 *	hexdump \length
 *	postcall
 */
.macro dbg_hexdump length
.endm

.section .text

/* I/O addresses of the stack pointer halves and the status register */
SPL  = 0x3D
SPH  = 0x3E
SREG = 0x3F

;
;sha1_ctx_t is:
;
; [h0][h1][h2][h3][h4][length]
; hn is 32 bit large, length is 64 bit large

;###########################################################

.global sha1_ctx2hash
; === sha1_ctx2hash ===
; this function
; converts a state into a normal hash (bytestring)
; param1: the 16-bit destination pointer
;  given in r25,r24 (r25 is most significant)
; param2: the 16-bit pointer to sha1_ctx structure
;  given in r23,r22
;
; The five 32-bit state words are copied one after another, each with its
; four bytes in reversed order.
; Overwrites r0, r20, r21, X (r27:r26), Z (r31:r30) and the flags.
sha1_ctx2hash:
	movw	r30, r24		; Z = destination
	movw	r26, r22		; X = context
	sbiw	r26, 4			; bias X so the +8 below lands one past word 0
	ldi	r21, 5			; words left to copy
sha1_ctx2hash_word:
	adiw	r26, 8			; X = one past the last byte of the current word
	ldi	r20, 4			; bytes left in this word
sha1_ctx2hash_byte:
	ld	r0, -X			; walk the word backwards ...
	st	Z+, r0			; ... while the output advances
	dec	r20
	brne	sha1_ctx2hash_byte
	dec	r21
	brne	sha1_ctx2hash_word
	ret

;###########################################################

.global sha1
; === sha1 ===
; this function calculates SHA-1 hashes from messages in RAM
; param1: the 16-bit hash destination pointer
;  given in r25,r24 (r25 is most significant)
; param2: the 16-bit pointer to message
;  given in r23,r22
; param3: 32-bit length value (length of message in bits)
;  given in r21,r20,r19,r18
;
; Register use inside the function:
;  r11:r10:r9:r8  remaining length in bits
;  r13:r12        pointer to the not yet hashed part of the message
;  r17:r16        pointer to the context that lives on the stack
; r1 is used as a zero register.

sha1_ctx_stackspace = 5*4+8		/* five state words plus the 64-bit length */

sha1:
sha1_prolog:
	.irp	reg, r8, r9, r10, r11, r12, r13, r16, r17
	push	\reg
	.endr
	/* reserve room for the context below the saved registers */
	in	r30, SPL
	in	r31, SPH
	sbiw	r30, sha1_ctx_stackspace
	in	r0, SREG
	cli
	out	SPL, r30
	out	SREG, r0
	out	SPH, r31

	movw	r12, r22		/* backup of msg-ptr */
	movw	r10, r20		/* backup of length (upper half) */
	movw	r8, r18			/* backup of length (lower half) */
	push	r25			/* the destination pointer is needed at the very end */
	push	r24
	adiw	r30, 1			/* first byte of the reserved area */
	movw	r16, r30

	movw	r24, r16
	rcall	sha1_init
	rjmp	sha1_check_length

	/* one full block per pass as long as the upper length word is not zero */
sha1_hash_block:
	movw	r22, r12
	movw	r24, r16
	rcall	sha1_nextBlock
	ldi	r19, 64			/* msg-ptr += 64 */
	add	r12, r19
	adc	r13, r1
	ldi	r19, 0x02		/* length -= 512 */
	sub	r9, r19
	sbc	r10, r1
	sbc	r11, r1
sha1_check_length:
	tst	r11
	brne	sha1_hash_block
	tst	r10
	brne	sha1_hash_block

	/* the rest fits into 16 bits; sha1_lastBlock takes it from here */
	movw	r20, r8
	movw	r22, r12
	movw	r24, r16
	rcall	sha1_lastBlock

	pop	r24			/* destination pointer */
	pop	r25
	movw	r22, r16
	rcall	sha1_ctx2hash

sha1_epilog:
	/* release the context and restore the saved registers */
	in	r30, SPL
	in	r31, SPH
	adiw	r30, sha1_ctx_stackspace
	in	r0, SREG
	cli
	out	SPL, r30
	out	SREG, r0
	out	SPH, r31
	.irp	reg, r17, r16, r13, r12, r11, r10, r9, r8
	pop	\reg
	.endr
	ret

;###########################################################

; block MUST NOT be larger than 64 bytes

.global sha1_lastBlock
; === sha1_lastBlock ===
; this function does padding & Co.
; for calculating SHA-1 hashes
; param1: the 16-bit pointer to sha1_ctx structure
;  given in r25,r24 (r25 is most significant)
; param2: an 16-bit pointer to 64 byte block to hash
;  given in r23,r22
; param3: an 16-bit integer specifying length of block in bits
;  given in r21,r20
;
; What the code below does:
;  - as long as length >= 512 a full block is handed to sha1_nextBlock and
;    the message pointer / length are advanced by 64 bytes / 512 bits
;  - the remaining whole bytes are copied into a 64 byte buffer on the stack
;    and the stuffing bit (0x80, or 0x80 >> (length & 7) merged with the
;    partial byte) is appended
;  - if fewer than 8 bytes are left for the length field, the buffer is
;    zero filled and hashed, 512 is subtracted from state.length again and
;    a fresh all-zero buffer is started
;  - state.length + length is written, most significant byte first, into
;    the last 8 bytes of the buffer, which is then hashed
; r1 is used as zero register and as loop counter; it is cleared before ret.
; r0, r18-r23, X and Z are overwritten.

/* not referenced in the code below; the prolog allocates 64 bytes */
sha1_lastBlock_localSpace = (SHA1_BLOCK_BITS/8+1)


sha1_lastBlock:
	cpi r21, 0x02			/* high byte >= 2 means length >= 512 bits */
	brlo sha1_lastBlock_prolog
	/* a complete block is available: hash it and try again with the rest */
	push r25
	push r24
	push r23
	push r22
	push r21
	push r20
	rcall sha1_nextBlock
	pop r20
	pop r21
	pop r22
	pop r23
	pop r24
	pop r25
	subi r21, 2			/* length -= 512 */
	ldi r19, 64
	add r22, r19			/* msg-ptr += 64 */
	adc r23, r1
	rjmp sha1_lastBlock
sha1_lastBlock_prolog:
	/* allocate space on stack */
	in r30, SPL
	in r31, SPH
	in r0, SREG
	subi r30, lo8(64)
	sbci r31, hi8(64) /* ??? */
	cli				/* no interrupts while SP is only half updated */
	out SPL, r30
	out SREG, r0
	out SPH, r31

	adiw r30, 1 /* SP points to next free byte on stack, so Z = first byte of the buffer */
	mov r18, r20 /* r20 = LSB(length) */
	lsr r18
	lsr r18
	lsr r18
	bst r21, 0	/* bit 8 of length (bit 0 of the high byte) ... */
	bld r18, 5	/* ... becomes bit 5 of the byte count */
	/* now: r18 == length/8 (aka. length in bytes) */


	movw r26, r22 /* X points to begin of msg */
	tst r18
	breq sha1_lastBlock_post_copy
	mov r1, r18			/* r1 counts the bytes; it is zero again after the loop */
sha1_lastBlock_copy_loop:
	ld r0, X+
	st Z+, r0
	dec r1
	brne sha1_lastBlock_copy_loop
sha1_lastBlock_post_copy:
sha1_lastBlock_insert_stuffing_bit:
	ldi r19, 0x80
	mov r0,r19
	ldi r19, 0x07
	and r19, r20 /* if we are in bitmode */
	breq 2f	/* no bitmode */
1:
	lsr r0				/* move the stuffing bit behind the last message bit */
	dec r19
	brne 1b
	ld r19, X
/* maybe we should do some ANDing here, just for safety */
/* NOTE(review): the bits of this byte below the stuffing bit are not masked;
   presumably the caller has to supply them as zero -- confirm */
	or r0, r19
2:
	st Z+, r0
	inc r18				/* r18 = bytes in the buffer including the stuffing byte */

/* checking stuff here */
	cpi r18, 64-8+1
	brsh 0f
	rjmp sha1_lastBlock_insert_zeros
0:
	/* the 8 byte length field does not fit into this block any more */
	/* first we have to fill it up with zeros */
	/* (r1 is expected to be zero here: counted down by the copy loop, or
	   untouched when the copy loop was skipped) */
	ldi r19, 64
	sub r19, r18
	breq 2f
1:
	st Z+, r1
	dec r19
	brne 1b
2:
	sbiw r30, 63			/* Z back to the begin of the buffer (64 in two steps) */
	sbiw r30,  1
	movw r22, r30

	push r31
	push r30
	push r25
	push r24
	push r21
	push r20
	rcall sha1_nextBlock
	pop r20
	pop r21
	pop r24
	pop r25
	pop r30
	pop r31

	/* now we should subtract 512 from length */
	/* (sha1_nextBlock added 512 for the block just hashed) */
	movw r26, r24
	adiw r26, 4*5+1 /* we can skip the lowest byte */
	ld r19, X
	subi r19, hi8(512)
	st X+, r19
	ldi r18, 6
1:
	ld r19, X
	sbci r19, 0			/* borrow through the remaining 6 bytes; dec leaves C alone */
	st X+, r19
	dec r18
	brne 1b

;	clr r18 /* not necessary ;-) */
	/* Z was restored by the pops above and points to the begin of the block;
	   r18 is zero, so the code below writes 56 zero bytes */
sha1_lastBlock_insert_zeros:
	ldi r19, 64-8
	sub r19, r18			/* number of zero bytes up to the length field */
	breq sha1_lastBlock_insert_length
	clr r1
1:
	st Z+, r1	/* r1 is still zero */
	dec r19
	brne 1b

;	rjmp sha1_lastBlock_epilog
sha1_lastBlock_insert_length:
	movw r26, r24	/* X points to state */
	adiw r26, 5*4	/* X points to (state.length) */
	adiw r30, 8		/* Z points one after the last byte of block */
	/* total = state.length + length; the least significant byte is stored
	   at the very end of the block, the others in front of it */
	ld r0, X+
	add r0, r20
	st -Z, r0
	ld r0, X+
	adc r0, r21
	st -Z, r0
	ldi r19, 6
1:
	ld r0, X+
	adc r0, r1			/* only the carry is propagated through the upper 6 bytes */
	st -Z, r0
	dec r19
	brne 1b

	sbiw r30, 64-8			/* Z back to the begin of the block */
	movw r22, r30
	rcall sha1_nextBlock

sha1_lastBlock_epilog:
	/* give the 64 byte buffer back */
	in r30, SPL
	in r31, SPH
	in r0, SREG
	adiw r30, 63 ; lo8(64)
	adiw r30,  1  ; hi8(64)
	cli
	out SPL, r30
	out SREG, r0
	out SPH, r31
	clr r1
	ret

/**/
;###########################################################

.global sha1_nextBlock
; === sha1_nextBlock ===
; this is the core function for calculating SHA-1 hashes
; param1: the 16-bit pointer to sha1_ctx structure
;  given in r25,r24 (r25 is most significant)
; param2: an 16-bit pointer to 64 byte block to hash
;  given in r23,r22
;
; The function processes one 64 byte block in 80 rounds, adds the result to
; the five state words and adds 512 to state.length.
; r10-r17, r28 and r29 are saved and restored; r0, r18-r27, r30 and r31 are
; overwritten; r1 is used as zero register and cleared before ret.
;
; stack frame (addresses ascending):  a[0..4] | w[0..15] | 4 spare bytes
sha1_nextBlock_localSpace = (16+5+1)*4
; 16 32-bit values for w array and 5 32-bit values for a array
; (the expression reserves 88 bytes; a[] and w[] occupy 84 of them)

/* register numbers used as names */
xtmp = 0	/* index of the current group of rounds (0..3) */
xNULL = 1	/* zero register */
W1 = 10		/* W2:W1 = pointer to w[0] */
W2 = 11
T1 = 12		/* T4..T1 = 32 bit accumulator T */
T2 = 13
T3 = 14
T4 = 15
LoopC = 16	/* loop counter / round number */
S = 17		/* byte offset of the current word inside w[] */
tmp1 = 18
tmp2 = 19
tmp3 = 20
tmp4 = 21
F1 = 22		/* F4..F1 = second 32 bit work value */
F2 = 23
F3 = 24
F4 = 25

/* byteorder: high number <--> high significance */
sha1_nextBlock:
 ; initial, let us make some space ready for local vars
 			/* replace push & pop by mem ops? */
	push r10
	push r11
	push r12
	push r13
	push r14
	push r15
	push r16
	push r17
	push r28
	push r29
	in r20, SPL
	in r21, SPH
	movw r18, r20			;backup SP
;	movw r26, r20			; X points to free space on stack /* maybe removable? */
	movw r30, r22			; Z points to message
	subi r20, lo8(sha1_nextBlock_localSpace) ;sbiw can do only up to 63
	sbci r21, hi8(sha1_nextBlock_localSpace)
	movw r26, r20			; X points to free space on stack
	in r0, SREG
	cli ; we want to be uninterrupted while updating SP
	out SPL, r20
	out SREG, r0
	out SPH, r21

	push r18
	push r19 /* push old SP on new stack */
	push r24
	push r25 /* param1 will be needed later */

	/* load a[] with state */
	movw 28, r24 /* load pointer to state in Y */
	adiw r26, 1 ; X++  (first byte of the reserved area)

	ldi LoopC, 5*4
1:	ld tmp1, Y+
	st X+, tmp1
	dec LoopC
	brne 1b

	movw W1, r26 /* save pointer to w[0] */
	/* load w[] with endian fixed message */
		/* we might also use the changeendian32() function at bottom */
	movw r30, r22 /* mv param2 (pointer to msg) to Z */
	ldi LoopC, 16
1:
	ldd tmp1, Z+3
	st X+, tmp1
	ldd tmp1, Z+2
	st X+, tmp1
	ldd tmp1, Z+1
	st X+, tmp1
	ld tmp1, Z
	st X+, tmp1
	adiw r30, 4
	dec LoopC
	brne 1b

	;clr LoopC /* LoopC is named t in FIPS 180-2 */
	/* LoopC is zero here because the loop above counted it down */
	clr xtmp
sha1_nextBlock_mainloop:
	mov S, LoopC
	lsl S
	lsl S
	andi S, 0x3C /* S is a bytepointer so *4; w[] is used as a ring of 16 words */
	/* load w[s] */
	movw r26, W1
	add r26, S /* X points at w[s] */
	adc r27, xNULL
	ld T1, X+
	ld T2, X+
	ld T3, X+
	ld T4, X+

	/*
	push r26
	push r27
	push T4
	push T3
	push T2
	push T1
	in r26, SPL
	in r27, SPH
	adiw r26, 1
	dbg_hexdump 4
	pop T1
	pop T2
	pop T3
	pop T4
	pop r27
	pop r26
	*/

	cpi LoopC, 16
	brlt sha1_nextBlock_mainloop_core	/* the first 16 rounds use the message words as they are */
	/* update w[s] */
	/* T ^= w[s+2] ^ w[s+8] ^ w[s+13], indices modulo 16 */
	ldi tmp1, 2*4
	rcall 1f
	ldi tmp1, 8*4
	rcall 1f
	ldi tmp1, 13*4
	rcall 1f
	rjmp 2f
1:		/* this might be "outsourced" to save the jump above */
	/* local subroutine: T ^= 32 bit word at w + ((tmp1 + S) & 0x3f) */
	add tmp1, S
	andi tmp1, 0x3f
	movw r26, W1
	add r26, tmp1
	adc r27, xNULL
	ld tmp2, X+
	eor T1, tmp2
	ld tmp2, X+
	eor T2, tmp2
	ld tmp2, X+
	eor T3, tmp2
	ld tmp2, X+
	eor T4, tmp2
	ret
2:	/* now we just have to do a ROTL(T) and save T back */
	mov tmp2, T4
	rol tmp2			/* C = top bit of T, so that it re-enters at the bottom */
	rol T1
	rol T2
	rol T3
	rol T4
	movw r26, W1
	add r26, S
	adc r27, xNULL
	st X+, T1
	st X+, T2
	st X+, T3
	st X+, T4

sha1_nextBlock_mainloop_core:	/* the core function; T=ROTL5(a) ....*/
								/* T already contains w[s] */
	movw r26, W1
	sbiw r26, 4*1		/* X points at a[4] aka e */
	ld tmp1, X+
	add T1, tmp1
	ld tmp1, X+
	adc T2, tmp1
	ld tmp1, X+
	adc T3, tmp1
	ld tmp1, X+
	adc T4, tmp1		/* T = w[s]+e */
	sbiw r26, 4*5		/* X points at a[0] aka a */
	ld F1, X+
	ld F2, X+
	ld F3, X+
	ld F4, X+
	mov tmp1, F4		/* X points at a[1] aka b */
	/* rotate F left by 5; tmp1 only supplies the bit leaving the top of F4 */
	ldi tmp2, 5
1:
	rol tmp1
	rol F1
	rol F2
	rol F3
	rol F4
	dec tmp2
	brne 1b

	add T1, F1
	adc T2, F2
	adc T3, F3
	adc T4, F4 /* T = ROTL(a,5) + e + w[s] */

	/* now the round dependent part: xtmp advances when LoopC reaches the
	   next entry of xTable and then selects the constant and the function */
	ldi r30, lo8(sha1_nextBlock_xTable)
	ldi r31, hi8(sha1_nextBlock_xTable)
	add r30, xtmp
	adc r31, xNULL
	lpm tmp1, Z
	cp tmp1, LoopC
	brne 1f
	inc xtmp
1:	ldi r30, lo8(sha1_nextBlock_KTable)
	ldi r31, hi8(sha1_nextBlock_KTable)
	lsl xtmp			/* xtmp*4 as table offset ... */
	lsl xtmp
	add r30, xtmp
	adc r31, xNULL
	lsr xtmp			/* ... and back to the plain index */
	lsr xtmp

	lpm tmp1, Z+
	add T1, tmp1
	lpm tmp1, Z+
	adc T2, tmp1
	lpm tmp1, Z+
	adc T3, tmp1
	lpm tmp1, Z+
	adc T4, tmp1
			/* T = ROTL(a,5) + e + kt + w[s] */

	/* Z-4 is just pointing to kt ... */
	movw r28, r26 /* copy X in Y */
	adiw r30, 3*4 /* now Z points to the right location in our jump-vector-table */
	lsr r31				/* byte address -> word address for icall */
	ror r30

	/* one call per byte of f_t(b,c,d); every call advances Y by one */
	icall
	mov F1, tmp1
	icall
	mov F2, tmp1
	icall
	mov F3, tmp1
	icall

	add T1, F1
	adc T2, F2
	adc T3, F3
	adc T4, tmp1 /* T = ROTL5(a) + f_t(b,c,d) + e + k_t + w[s] */
				 /* X points still at a[1] aka b, Y points at a[2] aka c */
	/* update a[] */
sha1_nextBlock_update_a:
	/*first we move all vars in a[] "one up" e=d, d=c, c=b, b=a*/
	/* (disabled)  adiw r28, 3*4  -- Y should point at a[4] aka e */
	movw r28, W1
	sbiw r28, 4			/* Y points at a[4] aka e */

	ldi tmp2, 4*4
1:
	ld tmp1, -Y
	std Y+4, tmp1
	dec tmp2
	brne 1b
	/* Y points at a[0] aka a*/

	movw r28, W1
	sbiw r28, 5*4
	/* store T in a[0] aka a */
	st Y+, T1
	st Y+, T2
	st Y+, T3
	st Y+, T4
	/* Y points at a[1] aka b*/

	/* rotate c (right by 2) */
	ldd T1, Y+1*4
	ldd T2, Y+1*4+1
	ldd T3, Y+1*4+2
	ldd T4, Y+1*4+3
	mov tmp1, T1
	ldi tmp2, 2
1:	ror tmp1			/* C = lowest bit, so that it re-enters at the top */
	ror T4
	ror T3
	ror T2
	ror T1
	dec tmp2
	brne 1b
	std Y+1*4+0, T1
	std Y+1*4+1, T2
	std Y+1*4+2, T3
	std Y+1*4+3, T4

	/*
	push r27
	push r26
	movw r26, W1
	sbiw r26, 4*5
	dbg_hexdump 4*5
	pop r26
	pop r27
	*/

	inc LoopC
	cpi LoopC, 80
	brge 1f
	rjmp sha1_nextBlock_mainloop
/**************************************/
1:
   /* little patch */
	sbiw r28, 4			/* Y back to a[0] */

/* add a[] to state and inc length */
	pop r27
	pop r26		/* now X points to state (and Y still at a[0]) */
	ldi tmp4, 5
1:	clc				/* every 32 bit word is added separately */
	ldi tmp3, 4
2:	ld tmp1, X
	ld tmp2, Y+
	adc tmp1, tmp2
	st X+, tmp1
	dec tmp3
	brne 2b
	dec tmp4
	brne 1b

	/* now length += 512 */
	adiw r26, 1 /* we skip the least significant byte */
	ld tmp1, X
	ldi tmp2, hi8(512) /* 2 */
	add tmp1, tmp2
	st X+, tmp1
	ldi tmp2, 6
1:
	ld tmp1, X
	adc tmp1, xNULL			/* carry through the remaining 6 bytes */
	st X+, tmp1
	dec tmp2
	brne 1b

; EPILOG
sha1_nextBlock_epilog:
/* now we should clean up the stack */
	pop r21				/* old SP (high byte, then low byte) as saved in the prolog */
	pop r20
	in r0, SREG
	cli ; we want to be uninterrupted while updating SP
	out SPL, r20
	out SREG, r0
	out SPH, r21

	clr r1
	pop r29
	pop r28
	pop r17
	pop r16
	pop r15
	pop r14
	pop r13
	pop r12
	pop r11
	pop r10
	ret

/* round numbers at which xtmp is incremented; the trailing 0 is never
   matched because LoopC is at least 60 once xtmp has reached 3 */
sha1_nextBlock_xTable:
.byte 20,40,60,0
/* the four round constants, selected by xtmp */
sha1_nextBlock_KTable:
.int	0x5a827999
.int	0x6ed9eba1
.int	0x8f1bbcdc
.int	0xca62c1d6
/* must directly follow KTable: the main loop reaches it by adding 3*4 to the
   pointer left behind by the KTable read; entries are 4 bytes (rjmp + nop)
   so that the same xtmp*4 offset fits both tables */
sha1_nextBlock_JumpTable:
rjmp sha1_nextBlock_Ch
	nop
rjmp sha1_nextBlock_Parity
	nop
rjmp sha1_nextBlock_Maj
	nop
rjmp sha1_nextBlock_Parity

	 /* X and Y still point at a[1] aka b ; return value in tmp1 */
	 /* each routine handles one byte and advances Y by one, so Y+3 / Y+7
	    address the matching byte of c / d afterwards */
sha1_nextBlock_Ch:
	/* tmp1 = (b & c) ^ (~b & d) */
	ld tmp1, Y+
	mov tmp2, tmp1
	com tmp2
	ldd tmp3, Y+3	/* load from c */
	and tmp1, tmp3
	ldd tmp3, Y+7	/* load from d */
	and tmp2, tmp3
	eor tmp1, tmp2
	ret

sha1_nextBlock_Maj:
	/* tmp1 = (b & c) ^ (b & d) ^ (c & d) */
	ld tmp1, Y+
	mov tmp2, tmp1
	ldd tmp3, Y+3	/* load from c */
	and tmp1, tmp3
	ldd tmp4, Y+7	/* load from d */
	and tmp2, tmp4
	eor tmp1, tmp2
	and tmp3, tmp4
	eor tmp1, tmp3
	ret

sha1_nextBlock_Parity:
	/* tmp1 = b ^ c ^ d */
	ld tmp1, Y+
	ldd tmp2, Y+3	/* load from c */
	eor tmp1, tmp2
	ldd tmp2, Y+7	/* load from d */
	eor tmp1, tmp2
	ret
/*
ch_str:			.asciz "\r\nCh"
maj_str:		.asciz "\r\nMaj"
parity_str:		.asciz "\r\nParity"
*/
;###########################################################

.global sha1_init
;void sha1_init(sha1_ctx_t *state){
;	DEBUG_S("\r\nSHA1_INIT");
;	state->h[0] = 0x67452301;
;	state->h[1] = 0xefcdab89;
;	state->h[2] = 0x98badcfe;
;	state->h[3] = 0x10325476;
;	state->h[4] = 0xc3d2e1f0;
;	state->length = 0;
;}
; param1: (r25,r24) 16-bit pointer to sha1_ctx_t struct in ram
; modifies: X (r26,r27), Z (r30,r31), r22, r23 and the flags
; r1 is stored as the zero byte, so it has to be zero on entry

sha1_init_hash_bytes = 5*4		/* h0..h4 */
sha1_init_length_bytes = 8		/* 64-bit length counter */

sha1_init:
	ldi	r30, lo8(sha1_init_vector)	; Z = initial hash values in program memory
	ldi	r31, hi8(sha1_init_vector)
	movw	r26, r24			; X = param1
	ldi	r22, sha1_init_hash_bytes
1:	/* copy the initial hash values into the context */
	lpm	r23, Z+
	st	X+, r23
	dec	r22
	brne	1b
	ldi	r22, sha1_init_length_bytes
2:	/* length = 0 */
	st	X+, r1
	dec	r22
	brne	2b
	ret

sha1_init_vector:
	.int	0x67452301
	.int	0xefcdab89
	.int	0x98badcfe
	.int	0x10325476
	.int	0xc3d2e1f0