/* sha1-asm.S */
/*
    Provenance: this text was recovered from the deletion diff of
    shacal1/sha1-asm.S (index f571685..0000000) in avr-crypto-lib.git:
    X-Git-Url: https://git.cryptolib.org/?a=blobdiff_plain;ds=sidebyside;f=shacal1%2Fsha1-asm.S;fp=shacal1%2Fsha1-asm.S;h=0000000000000000000000000000000000000000;hb=7b5401ab9ce23a5da1de8b6c7de3a1aa20ac4cf8;hp=f571685984c5046fd0f4280adbc9a5fe513720a8;hpb=02ac3b653f3a11f284cc1a0cb0e983575f2f431b;p=avr-crypto-lib.git
*/
/*
    This file is part of the AVR-Crypto-Lib.
    Copyright (C) 2008  Daniel Otte (daniel.otte@rub.de)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
/*
 * Author:  Daniel Otte
 *
 * License: GPLv3 or later
*/
; SHA1 implementation in assembler for AVR
;
; Syntax : GNU as for AVR (file is run through the C preprocessor)
; ABI    : arguments arrive in r25:r24, r23:r22, r21:r18 (see each function)
;          and r1 is expected to be zero on entry to every function here.
; Tables : addressed with lo8()/hi8() + lpm/icall, i.e. 16-bit flash
;          addresses only.
SHA1_BLOCK_BITS  = 512
SHA1_HASH_BITS   = 160
SHA1_BLOCK_BYTES = SHA1_BLOCK_BITS/8   ; 64 bytes of message per block
SHA1_CTX_BYTES   = 5*4+8               ; h0..h4 (5 x 32 bit) + 64-bit length

; --- debugging helpers ------------------------------------------------
; precall/postcall save and restore r0, r1, r18-r27 and r30-r31 around a
; call into C code; precall additionally clears r1.
.macro precall
    /* push r18 - r27, r30 - r31*/
    push r0
    push r1
    push r18
    push r19
    push r20
    push r21
    push r22
    push r23
    push r24
    push r25
    push r26
    push r27
    push r30
    push r31
    clr r1
.endm

.macro postcall
    pop r31
    pop r30
    pop r27
    pop r26
    pop r25
    pop r24
    pop r23
    pop r22
    pop r21
    pop r20
    pop r19
    pop r18
    pop r1
    pop r0
.endm

; hexdump <length>
; Sends CR/LF through uart_putc, then dumps <length> bytes starting at X
; through uart_hexdump, 16 bytes per call (the macro recurses for the
; remainder). uart_putc and uart_hexdump are external symbols.
.macro hexdump length
    push r27
    push r26
    ldi r25, '\r'
    mov r24, r25
    call uart_putc
    ldi r25, '\n'
    mov r24, r25
    call uart_putc
    pop r26
    pop r27
    movw r24, r26
.if \length > 16
    ldi r22, lo8(16)
    ldi r23, hi8(16)
    push r27
    push r26
    call uart_hexdump
    pop r26
    pop r27
    adiw r26, 16
    hexdump \length-16
.else
    ldi r22, lo8(\length)
    ldi r23, hi8(\length)
    call uart_hexdump
.endif
.endm

; delay: busy-wait loop, currently disabled (body is commented out, so
; the macro expands to nothing).
.macro delay
/*
    push r0
    push r1
    clr r0
1:  clr r1
2:  dec r1
    brne 2b
    dec r0
    brne 1b
    pop r1
    pop r0  // */
.endm

/* X points to Block */
; dbg_hexdump: debug dump of <length> bytes at X, currently disabled
; (body is commented out, so the macro expands to nothing).
.macro dbg_hexdump length
/*
    precall
    hexdump \length
    postcall
    // */
.endm



.section .text

; I/O addresses of the stack pointer and the status register
SPL = 0x3D
SPH = 0x3E
SREG = 0x3F


;
;sha1_ctx_t is:
;
; [h0][h1][h2][h3][h4][length]
; hn is 32 bit large, length is 64 bit large
; (both are stored least significant byte first; length counts bits)

;###########################################################

.global sha1_ctx2hash
; === sha1_ctx2hash ===
; this function converts a state into a normal hash (bytestring)
;  param1: the 16-bit destination pointer
;          given in r25,r24 (r25 is most significant)
;  param2: the 16-bit pointer to sha1_ctx structure
;          given in r23,r22
; Each of the five 32-bit words h0..h4 is copied with its byte order
; reversed, producing 20 output bytes.
; modifies: r0, r20, r21, X (r27:r26), Z (r31:r30)
sha1_ctx2hash:
    movw r26, r22           ; X = ctx
    movw r30, r24           ; Z = destination
    ldi r21, 5              ; five words
    sbiw r26, 4             ; bias X so the "+8" below lands one past word n
1:
    ldi r20, 4              ; four bytes per word
    adiw r26, 8             ; X = one past the end of the current word
2:
    ld r0, -X               ; read the word from its top byte downwards
    st Z+, r0
    dec r20
    brne 2b

    dec r21
    brne 1b

    ret

;###########################################################

.global sha1
; === sha1 ===
; this function calculates SHA-1 hashes from messages in RAM
;  param1: the 16-bit hash destination pointer
;          given in r25,r24 (r25 is most significant)
;  param2: the 16-bit pointer to message
;          given in r23,r22
;  param3: 32-bit length value (length of message in bits)
;          given in r21,r20,r19,r18
;
; A sha1_ctx_t is allocated on the stack, initialised, fed with every
; complete 512-bit block of the message, finished with sha1_lastBlock and
; finally converted to the 20 byte hash.
;
; register use: r11:r8   remaining length in bits
;               r13:r12  pointer to the next unprocessed message byte
;               r17:r16  pointer to the context on the stack
;
; Fixes compared to the previous revision:
;  * the context pointer was formed with "inc r16 / adc r17, r1"; inc does
;    not update the carry flag, so the high byte was wrong whenever the
;    low byte wrapped. A 16-bit subi/sbci pair is used instead.
;  * the ">= 512 bits left" test jumped to the final-block code when the
;    upper length bytes were non-zero; it now processes a block then.
;  * the message pointer was advanced by 0x64 in r23:r22, which
;    sha1_nextBlock clobbers and which was reloaded from the unchanged
;    r13:r12 anyway. r13:r12 is now advanced by 64 bytes per block.
sha1:
sha1_prolog:
    push r8
    push r9
    push r10
    push r11
    push r12
    push r13
    push r16
    push r17
    in r16, SPL
    in r17, SPH
    subi r16, SHA1_CTX_BYTES        ; reserve room for the context
    sbci r17, 0
    in r0, SREG
    cli                             ; SP must be updated atomically
    out SPL, r16
    out SPH, r17
    out SREG, r0

    push r25                        ; keep destination pointer for later
    push r24
    subi r16, lo8(-1)               ; r17:r16 = SP+1 = first byte of ctx
    sbci r17, hi8(-1)               ; (16-bit increment with proper carry)

    movw r8, r18   /* backup of length*/
    movw r10, r20

    movw r12, r22  /* backup of msg-ptr */

    movw r24, r16
    rcall sha1_init
    /* while length >= 512: hash one full block */
1:
    tst r11
    brne 2f                         ; length >= 2^24 -> full block left
    tst r10
    brne 2f                         ; length >= 2^16 -> full block left
    mov r19, r9                     ; cpi needs an upper register
    cpi r19, 0x02
    brlo 4f                         ; fewer than 512 bits left
2:
    movw r24, r16
    movw r22, r12
    rcall sha1_nextBlock            ; preserves r8-r17, returns with r1 = 0
    ldi r19, SHA1_BLOCK_BYTES
    add r12, r19                    ; msg-ptr += 64
    adc r13, r1
    /* length -= 512 */
    ldi r19, 0x02
    sub r9, r19
    sbc r10, r1
    sbc r11, r1
    rjmp 1b

4:
    movw r24, r16
    movw r22, r12
    movw r20, r8                    ; remaining bits (< 512)
    rcall sha1_lastBlock

    pop r24
    pop r25
    movw r22, r16
    rcall sha1_ctx2hash

sha1_epilog:
    in r30, SPL
    in r31, SPH
    adiw r30, SHA1_CTX_BYTES        ; release the context
    in r0, SREG
    cli
    out SPL, r30
    out SPH, r31
    out SREG, r0
    pop r17
    pop r16
    pop r13
    pop r12
    pop r11
    pop r10
    pop r9
    pop r8
    ret

;###########################################################


; block MUST NOT be larger than 64 bytes

.global sha1_lastBlock
; === sha1_lastBlock ===
; this function does padding & Co. for calculating SHA-1 hashes
;  param1: the 16-bit pointer to sha1_ctx structure
;          given in r25,r24 (r25 is most significant)
;  param2: an 16-bit pointer to 64 byte block to hash
;          given in r23,r22
;  param3: an 16-bit integer specifing length of block in bits
;          given in r21,r20
;
; While 512 or more bits are left, whole blocks are passed to
; sha1_nextBlock. The remainder is copied into a 64 byte buffer on the
; stack, the stuffing bit and zero padding are appended and the total
; message length (ctx.length + remaining bits) is stored in the last
; eight bytes, most significant byte first. If stuffing bit and length do
; not fit into one block, an additional block is hashed first.
;
; r1 is used as scratch in here (SREG backup, loop counter); it is zero
; again on return.
;
; Fixes compared to the previous revision:
;  * the leading loop advanced the block pointer with "subi r23, -2"
;    (+512 bytes) for every 512 bits consumed; it now advances by 64 bytes.
;  * in bit mode the bits of the partial byte below the stuffing bit are
;    masked out instead of being copied from the caller's buffer.
sha1_lastBlock_localSpace = (SHA1_BLOCK_BITS/8+1)


sha1_lastBlock:
    cpi r21, 0x02
    brlo sha1_lastBlock_prolog      ; fewer than 512 bits left
    push r25
    push r24
    push r23
    push r22
    push r21
    push r20
    rcall sha1_nextBlock
    pop r20
    pop r21
    pop r22
    pop r23
    pop r24
    pop r25
    subi r21, 2                         ; length  -= 512 bits
    subi r22, lo8(-(SHA1_BLOCK_BYTES))  ; pointer += 64 bytes
    sbci r23, hi8(-(SHA1_BLOCK_BYTES))
    rjmp sha1_lastBlock
sha1_lastBlock_prolog:
    /* allocate space on stack */
    in r30, SPL
    in r31, SPH
    in r1, SREG
    subi r30, lo8(64)
    sbci r31, hi8(64)       ; hi8(64) is 0, this only propagates the borrow
    cli
    out SPL, r30
    out SPH, r31
    out SREG,r1

    adiw r30, 1     /* SP points to next free byte on stack */
    mov r18, r20    /* r20 = LSB(length) */
    lsr r18
    lsr r18
    lsr r18
    bst r21, 0      /* length bit 8 (256 bits = 32 bytes) ... */
    bld r18, 5      /* ... becomes bit 5; now: r18 == length/8 (length in bytes) */


    movw r26, r22   /* X points to begin of msg */
    tst r18
    breq sha1_lastBlock_post_copy
    mov r1, r18     /* r1 = byte counter, zero again after the loop */
sha1_lastBlock_copy_loop:
    ld r0, X+
    st Z+, r0
    dec r1
    brne sha1_lastBlock_copy_loop
sha1_lastBlock_post_copy:
sha1_lastBlock_insert_stuffing_bit:
    ldi r19, 0x80
    mov r0,r19
    ldi r19, 0x07
    and r19, r20    /* if we are in bitmode */
    breq 2f         /* no bitmode */
1:
    lsr r0          /* move the stuffing bit behind the message bits */
    dec r19
    brne 1b
    mov r19, r0
    lsl r19
    neg r19         /* r19 = mask of the message bits above the stuffing bit */
    ld r22, X       /* partial byte; r23:r22 is free, X holds the pointer */
    and r22, r19
    or r0, r22
2:
    st Z+, r0
    inc r18

/* checking stuff here */
    cpi r18, 64-8+1
    brsh 0f
    rjmp sha1_lastBlock_insert_zeros
0:
    /* no room left for the length field in this block */
    /* first we have to fill it up with zeros */
    /* (r18 >= 57 here, so the copy loop ran and left r1 == 0) */
    ldi r19, 64
    sub r19, r18
    breq 2f
1:
    st Z+, r1
    dec r19
    brne 1b
2:
    sbiw r30, 63
    sbiw r30, 1     /* Z back at begin of buffer */
    movw r22, r30

    push r31
    push r30
    push r25
    push r24
    push r21
    push r20
    rcall sha1_nextBlock
    pop r20
    pop r21
    pop r24
    pop r25
    pop r30
    pop r31

    /* now we should subtract 512 from length */
    /* (sha1_nextBlock just added 512 for a block that is partly padding) */
    movw r26, r24
    adiw r26, 4*5+1 /* we can skip the lowest byte */
    ld r19, X
    subi r19, hi8(512)
    st X+, r19
    ldi r18, 6
1:
    ld r19, X
    sbci r19, 0
    st X+, r19
    dec r18         /* dec leaves the carry flag alone */
    brne 1b

;   clr r18 /* not neccessary ;-) r18 is zero after the loop above */
    /* Z was restored by the pops above and points to begin of buffer */

sha1_lastBlock_insert_zeros:
    ldi r19, 64-8
    sub r19, r18
    breq sha1_lastBlock_insert_length
    clr r1
1:
    st Z+, r1   /* r1 is still zero */
    dec r19
    brne 1b

;   rjmp sha1_lastBlock_epilog
sha1_lastBlock_insert_length:
    movw r26, r24   /* X points to state */
    adiw r26, 5*4   /* X points to (state.length) */
    adiw r30, 8     /* Z points one after the last byte of block */
    ld r0, X+
    add r0, r20
    st -Z, r0
    ld r0, X+
    adc r0, r21
    st -Z, r0
    ldi r19, 6
1:
    ld r0, X+
    adc r0, r1
    st -Z, r0
    dec r19
    brne 1b

    sbiw r30, 64-8  /* Z back at begin of buffer */
    movw r22, r30
    rcall sha1_nextBlock

sha1_lastBlock_epilog:
    in r30, SPL
    in r31, SPH
    in r1, SREG
    adiw r30, 63 ; lo8(64)
    adiw r30, 1  ; hi8(64)
    cli
    out SPL, r30
    out SPH, r31
    out SREG,r1
    clr r1
    clr r0
    ret

/**/
;###########################################################

.global sha1_nextBlock
; === sha1_nextBlock ===
; this is the core function for calculating SHA-1 hashes
;  param1: the 16-bit pointer to sha1_ctx structure
;          given in r25,r24 (r25 is most significant)
;  param2: an 16-bit pointer to 64 byte block to hash
;          given in r23,r22
;
; Processes one 512-bit block: runs the 80 rounds on a working copy of
; h0..h4, adds the result to the context and adds 512 to ctx.length.
; r10-r17, r28 and r29 are saved and restored; r0 and r18-r27, r30, r31
; are clobbered; r1 is zero on return.
;
; stack frame (addresses relative to SP after allocation):
;   SP+1  .. SP+20   a[0..4]  working variables a, b, c, d, e
;   SP+21 .. SP+84   w[0..15] message schedule (circular)
sha1_nextBlock_localSpace = (16+5+1)*4 ; 16 32-bit values for w array and 5 32-bit values for a array (total 84 byte)

; register names (plain register numbers)
xtmp = 0        ; index of the current 20-round stage (0..3)
xNULL = 1       ; always zero
W1 = 10         ; W2:W1 = pointer to w[0]
W2 = 11
T1 = 12         ; T4..T1 = 32-bit accumulator T
T2 = 13
T3 = 14
T4 = 15
LoopC = 16      ; round counter t (0..79)
S = 17          ; byte offset of w[t mod 16]
tmp1 = 18
tmp2 = 19
tmp3 = 20
tmp4 = 21
F1 = 22         ; F4..F1 = ROTL5(a), then f_t(b,c,d)
F2 = 23
F3 = 24
F4 = 25

/* byteorder: high number <--> high significance */
sha1_nextBlock:
    ; initial, make some space ready for local vars
    /* replace push & pop by mem ops? */
    push r10
    push r11
    push r12
    push r13
    push r14
    push r15
    push r16
    push r17
    push r28
    push r29
    in r20, SPL
    in r21, SPH
    movw r18, r20           ;backup SP
;   movw r26, r20           ; X points to free space on stack /* maybe removeable? */
    movw r30, r22           ; Z points to message
    subi r20, lo8(sha1_nextBlock_localSpace) ;sbiw can do only up to 63
    sbci r21, hi8(sha1_nextBlock_localSpace)
    movw r26, r20           ; X points to free space on stack
    in r0, SREG
    cli ; we want to be uninterrupted while updating SP
    out SPL, r20
    out SPH, r21
    out SREG, r0

    push r18
    push r19 /* push old SP on new stack */
    push r24
    push r25 /* param1 will be needed later */

    /* load a[] with state */
    movw 28, r24 /* load pointer to state in Y */
    adiw r26, 1 ; X++

    ldi LoopC, 5*4
1:  ld tmp1, Y+
    st X+, tmp1
    dec LoopC
    brne 1b

    movw W1, r26 /* save pointer to w[0] */
    /* load w[] with endian fixed message */
        /* we might also use the changeendian32() function at bottom */
    movw r30, r22 /* mv param2 (pointer to msg) to Z */
    ldi LoopC, 16
1:
    ldd tmp1, Z+3
    st X+, tmp1
    ldd tmp1, Z+2
    st X+, tmp1
    ldd tmp1, Z+1
    st X+, tmp1
    ld tmp1, Z
    st X+, tmp1
    adiw r30, 4
    dec LoopC
    brne 1b

    ;clr LoopC /* LoopC is named t in FIPS 180-2; it is zero after the loop above */
    clr xtmp
sha1_nextBlock_mainloop:
    mov S, LoopC
    lsl S
    lsl S
    andi S, 0x3C /* S is a bytepointer so *4 */
    /* load w[s] */
    movw r26, W1
    add r26, S /* X points at w[s] */
    adc r27, xNULL
    ld T1, X+
    ld T2, X+
    ld T3, X+
    ld T4, X+

    /* debug hook: put T on the stack, point X at it and dump it */
    /* (dbg_hexdump is empty at the moment, so this only saves/restores) */
    push r26
    push r27
    push T4
    push T3
    push T2
    push T1
    in r26, SPL
    in r27, SPH
    adiw r26, 1
    dbg_hexdump 4
    pop T1
    pop T2
    pop T3
    pop T4
    pop r27
    pop r26
    /**/

    cpi LoopC, 16
    brlo sha1_nextBlock_mainloop_core   ; rounds 0..15 use the message words as they are
    /* update w[s]: T = w[s] ^ w[s+2] ^ w[s+8] ^ w[s+13] (indices mod 16) */
    ldi tmp1, 2*4
    rcall 1f
    ldi tmp1, 8*4
    rcall 1f
    ldi tmp1, 13*4
    rcall 1f
    rjmp 2f
1:      /* this might be "outsourced" to save the jump above */
    /* local subroutine: T ^= w[] word at byte offset (tmp1 + S) mod 64 */
    add tmp1, S
    andi tmp1, 0x3f
    movw r26, W1
    add r26, tmp1
    adc r27, xNULL
    ld tmp2, X+
    eor T1, tmp2
    ld tmp2, X+
    eor T2, tmp2
    ld tmp2, X+
    eor T3, tmp2
    ld tmp2, X+
    eor T4, tmp2
    ret
2:  /* now we just have to do a ROTL(T) and save T back */
    mov tmp2, T4
    rol tmp2        /* carry = top bit of T */
    rol T1
    rol T2
    rol T3
    rol T4
    movw r26, W1
    add r26, S
    adc r27, xNULL
    st X+, T1
    st X+, T2
    st X+, T3
    st X+, T4

sha1_nextBlock_mainloop_core:   /* the core function; T=ROTL5(a) ....*/
                                /* T already contains w[s] */
    movw r26, W1
    sbiw r26, 4*1       /* X points at a[4] aka e */
    ld tmp1, X+
    add T1, tmp1
    ld tmp1, X+
    adc T2, tmp1
    ld tmp1, X+
    adc T3, tmp1
    ld tmp1, X+
    adc T4, tmp1        /* T = w[s]+e */
    sbiw r26, 4*5       /* X points at a[0] aka a */
    ld F1, X+
    ld F2, X+
    ld F3, X+
    ld F4, X+
    mov tmp1, F4        /* X points at a[1] aka b */
    ldi tmp2, 5
1:
    rol tmp1            /* carry = current top bit of F */
    rol F1
    rol F2
    rol F3
    rol F4
    dec tmp2
    brne 1b

    add T1, F1
    adc T2, F2
    adc T3, F3
    adc T4, F4 /* T = ROTL(a,5) + e + w[s] */

    /* select constant and function for this round: */
    /* xTable[xtmp] holds the round number at which the next stage starts */
    ldi r30, lo8(sha1_nextBlock_xTable)
    ldi r31, hi8(sha1_nextBlock_xTable)
    add r30, xtmp
    adc r31, xNULL
    lpm tmp1, Z
    cp tmp1, LoopC
    brne 1f
    inc xtmp
1:  ldi r30, lo8(sha1_nextBlock_KTable)
    ldi r31, hi8(sha1_nextBlock_KTable)
    lsl xtmp
    lsl xtmp
    add r30, xtmp       /* Z = &KTable[xtmp] */
    adc r31, xNULL
    lsr xtmp
    lsr xtmp

    lpm tmp1, Z+
    add T1, tmp1
    lpm tmp1, Z+
    adc T2, tmp1
    lpm tmp1, Z+
    adc T3, tmp1
    lpm tmp1, Z+
    adc T4, tmp1
            /* T = ROTL(a,5) + e + kt + w[s] */

    /* Z-4 is just pointing to kt ... */
    movw r28, r26 /* copy X in Y */
    adiw r30, 3*4 /* now Z points to the right location in our jump-vector-table */
    lsr r31       /* icall wants a word address */
    ror r30

    /* one call per byte of f_t(b,c,d); every call advances Y by one */
    icall
    mov F1, tmp1
    icall
    mov F2, tmp1
    icall
    mov F3, tmp1
    icall

    add T1, F1
    adc T2, F2
    adc T3, F3
    adc T4, tmp1 /* T = ROTL5(a) + f_t(b,c,d) + e + k_t + w[s] */
                 /* X points still at a[1] aka b, Y points at a[2] aka c */
    /* update a[] */
sha1_nextBlock_update_a:
    /*first we move all vars in a[] "one up" e=d, d=c, c=b, b=a*/
    /* (was: adiw r28, 3*4) Y should point at a[4] aka e */
    movw r28, W1
    sbiw r28, 4

    ldi tmp2, 4*4
1:
    ld tmp1, -Y
    std Y+4, tmp1
    dec tmp2
    brne 1b
    /* Y points at a[0] aka a*/

    movw r28, W1
    sbiw r28, 5*4
    /* store T in a[0] aka a */
    st Y+, T1
    st Y+, T2
    st Y+, T3
    st Y+, T4
    /* Y points at a[1] aka b*/

    /* rotate c right by 2 (equals ROTL30) */
    ldd T1, Y+1*4
    ldd T2, Y+1*4+1
    ldd T3, Y+1*4+2
    ldd T4, Y+1*4+3
    mov tmp1, T1
    ldi tmp2, 2
1:  ror tmp1            /* carry = current lowest bit of c */
    ror T4
    ror T3
    ror T2
    ror T1
    dec tmp2
    brne 1b
    std Y+1*4+0, T1
    std Y+1*4+1, T2
    std Y+1*4+2, T3
    std Y+1*4+3, T4

    /* debug hook: dump a[] (dbg_hexdump is empty at the moment) */
    push r27
    push r26
    movw r26, W1
    sbiw r26, 4*5
    dbg_hexdump 4*5
    pop r26
    pop r27

    inc LoopC
    cpi LoopC, 80
    brsh 1f
    rjmp sha1_nextBlock_mainloop    ; too far away for a conditional branch
/**************************************/
1:
   /* little patch: step Y back from a[1] to a[0] */
    sbiw r28, 4

/* add a[] to state and inc length */
    pop r27
    pop r26         /* now X points to state (and Y still at a[0]) */
    ldi tmp4, 5
1:  clc
    ldi tmp3, 4
2:  ld tmp1, X
    ld tmp2, Y+
    adc tmp1, tmp2
    st X+, tmp1
    dec tmp3        /* dec leaves the carry flag alone */
    brne 2b
    dec tmp4
    brne 1b

    /* now length += 512 */
    adiw r26, 1 /* we skip the least significant byte */
    ld tmp1, X
    ldi tmp2, hi8(512) /* 2 */
    add tmp1, tmp2
    st X+, tmp1
    ldi tmp2, 6
1:
    ld tmp1, X
    adc tmp1, xNULL
    st X+, tmp1
    dec tmp2
    brne 1b

; EPILOG
sha1_nextBlock_epilog:
/* now we should clean up the stack */
    pop r21
    pop r20         /* r21:r20 = SP as saved in the prolog */
    in r0, SREG
    cli ; we want to be uninterrupted while updating SP
    out SPL, r20
    out SPH, r21
    out SREG, r0

    clr r1
    pop r29
    pop r28
    pop r17
    pop r16
    pop r15
    pop r14
    pop r13
    pop r12
    pop r11
    pop r10
    ret

; round numbers at which xtmp is incremented (the last entry never matches)
sha1_nextBlock_xTable:
.byte 20,40,60,0
; the four round constants, one per stage of 20 rounds
sha1_nextBlock_KTable:
.int    0x5a827999
.int    0x6ed9eba1
.int    0x8f1bbcdc
.int    0xca62c1d6
; one 4 byte entry per stage (same stride as KTable): rjmp + nop
sha1_nextBlock_JumpTable:
rjmp sha1_nextBlock_Ch
    nop
rjmp sha1_nextBlock_Parity
    nop
rjmp sha1_nextBlock_Maj
    nop
rjmp sha1_nextBlock_Parity

     /* X and Y still point at a[1] aka b ; return value in tmp1 */
     /* each helper handles one byte and advances Y by one */
; Ch(b,c,d) = (b & c) ^ (~b & d)
sha1_nextBlock_Ch:
    ld tmp1, Y+
    mov tmp2, tmp1
    com tmp2
    ldd tmp3, Y+3   /* load from c */
    and tmp1, tmp3
    ldd tmp3, Y+7   /* load from d */
    and tmp2, tmp3
    eor tmp1, tmp2
    ret

; Maj(b,c,d) = (b & c) ^ (b & d) ^ (c & d)
sha1_nextBlock_Maj:
    ld tmp1, Y+
    mov tmp2, tmp1
    ldd tmp3, Y+3   /* load from c */
    and tmp1, tmp3
    ldd tmp4, Y+7   /* load from d */
    and tmp2, tmp4
    eor tmp1, tmp2
    and tmp3, tmp4
    eor tmp1, tmp3
    ret

; Parity(b,c,d) = b ^ c ^ d
sha1_nextBlock_Parity:
    ld tmp1, Y+
    ldd tmp2, Y+3   /* load from c */
    eor tmp1, tmp2
    ldd tmp2, Y+7   /* load from d */
    eor tmp1, tmp2
    ret
/*
ch_str:         .asciz "\r\nCh"
maj_str:        .asciz "\r\nMaj"
parity_str: .asciz "\r\nParity"
*/
;###########################################################

.global sha1_init
;void sha1_init(sha1_ctx_t *state){
;   DEBUG_S("\r\nSHA1_INIT");
;   state->h[0] = 0x67452301;
;   state->h[1] = 0xefcdab89;
;   state->h[2] = 0x98badcfe;
;   state->h[3] = 0x10325476;
;   state->h[4] = 0xc3d2e1f0;
;   state->length = 0;
;}
; param1: (r25,r24) 16-bit pointer to sha1_ctx_t struct in ram
; modifies: X (r27:r26), Z (r31:r30), r22, r23
; relies on r1 being zero (it is stored as the eight length bytes)
sha1_init:
    movw r26, r24 ; (24,25) --> (26,27) load X with param1
    ldi r30, lo8((sha1_init_vector))
    ldi r31, hi8((sha1_init_vector))
    ldi r22, 5*4 /* bytes to copy */
sha1_init_vloop:
    lpm r23, Z+
    st X+, r23
    dec r22
    brne sha1_init_vloop
    ldi r22, 8   /* length field: eight zero bytes */
sha1_init_lloop:
    st X+, r1
    dec r22
    brne sha1_init_lloop
    ret

; initial values of h0..h4
sha1_init_vector:
.int 0x67452301
.int 0xefcdab89
.int 0x98badcfe
.int 0x10325476
.int 0xc3d2e1f0