You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-/*\r
- * Author: Daniel Otte\r
- *\r
- * License: GPL\r
-*/\r
-; SHA1 implementation in assembler for AVR\r
-SHA1_BLOCK_BITS = 512\r
-SHA1_HASH_BITS = 160\r
-\r
-.macro precall\r
- /* push r18 - r27, r30 - r31*/\r
- push r0\r
- push r1\r
- push r18\r
- push r19\r
- push r20\r
- push r21\r
- push r22\r
- push r23\r
- push r24\r
- push r25\r
- push r26\r
- push r27\r
- push r30\r
- push r31\r
- clr r1\r
-.endm\r
-\r
-.macro postcall\r
- pop r31\r
- pop r30\r
- pop r27\r
- pop r26\r
- pop r25\r
- pop r24\r
- pop r23\r
- pop r22\r
- pop r21\r
- pop r20\r
- pop r19\r
- pop r18\r
- pop r1\r
- pop r0\r
-.endm\r
-\r
-\r
-.macro hexdump length\r
- push r27\r
- push r26\r
- ldi r25, '\r'\r
- mov r24, r25\r
- call uart_putc\r
- ldi r25, '\n'\r
- mov r24, r25\r
- call uart_putc\r
- pop r26\r
- pop r27\r
- movw r24, r26\r
-.if \length > 16\r
- ldi r22, lo8(16)\r
- ldi r23, hi8(16)\r
- push r27\r
- push r26\r
- call uart_hexdump\r
- pop r26\r
- pop r27\r
- adiw r26, 16\r
- hexdump \length-16\r
-.else\r
- ldi r22, lo8(\length)\r
- ldi r23, hi8(\length)\r
- call uart_hexdump\r
-.endif\r
-.endm\r
-\r
-.macro delay\r
-/* \r
- push r0\r
- push r1\r
- clr r0\r
-1: clr r1\r
-2: dec r1\r
- brne 2b\r
- dec r0\r
- brne 1b\r
- pop r1\r
- pop r0 // */\r
-.endm\r
-\r
-/* X points to Block */\r
-.macro dbg_hexdump length\r
-/* \r
- precall\r
- hexdump \length\r
- postcall\r
- // */\r
-.endm\r
-\r
-\r
-\r
-.section .text\r
-\r
-SPL = 0x3D\r
-SPH = 0x3E\r
-SREG = 0x3F\r
-\r
-\r
-;\r
-;sha1_ctx_t is:\r
-;\r
-; [h0][h1][h2][h3][h4][length]\r
-; hn is 32 bit large, length is 64 bit large\r
-\r
-;########################################################### \r
-\r
-.global sha1_ctx2hash\r
-; === sha1_ctx2hash ===\r
-; this function converts a state into a normal hash (bytestring)\r
-; param1: the 16-bit destination pointer\r
-; given in r25,r24 (r25 is most significant)\r
-; param2: the 16-bit pointer to sha1_ctx structure\r
-; given in r23,r22\r
-sha1_ctx2hash:\r
- movw r26, r22\r
- movw r30, r24\r
- ldi r21, 5\r
- sbiw r26, 4\r
-1: \r
- ldi r20, 4\r
- adiw r26, 8\r
-2: \r
- ld r0, -X\r
- st Z+, r0 \r
- dec r20\r
- brne 2b\r
- \r
- dec r21\r
- brne 1b\r
- \r
- ret\r
-\r
-;########################################################### \r
-\r
-.global sha1\r
-; === sha1 ===\r
-; this function calculates SHA-1 hashes from messages in RAM\r
-; param1: the 16-bit hash destination pointer\r
-; given in r25,r24 (r25 is most significant)\r
-; param2: the 16-bit pointer to message\r
-; given in r23,r22\r
-; param3: 32-bit length value (length of message in bits)\r
-; given in r21,r20,r19,r18\r
-sha1:\r
-sha1_prolog:\r
- push r8\r
- push r9\r
- push r10\r
- push r11\r
- push r12\r
- push r13\r
- push r16\r
- push r17\r
- in r16, SPL\r
- in r17, SPH\r
- subi r16, 5*4+8 \r
- sbci r17, 0 \r
- in r0, SREG\r
- cli\r
- out SPL, r16\r
- out SPH, r17\r
- out SREG, r0\r
- \r
- push r25\r
- push r24\r
- inc r16\r
- adc r17, r1\r
- \r
- movw r8, r18 /* backup of length*/\r
- movw r10, r20\r
- \r
- movw r12, r22 /* backup pf msg-ptr */\r
- \r
- movw r24, r16\r
- rcall sha1_init\r
- /* if length >= 512 */\r
-1:\r
- tst r11\r
- brne 4f\r
- tst r10\r
- brne 4f\r
- mov r19, r9\r
- cpi r19, 0x02\r
- brlo 4f\r
- \r
- movw r24, r16\r
- movw r22, r12\r
- rcall sha1_nextBlock\r
- ldi r19, 0x64\r
- add r22, r19\r
- adc r23, r1\r
- /* length -= 512 */\r
- ldi r19, 0x02\r
- sub r9, r19\r
- sbc r10, r1\r
- sbc r11, r1\r
- rjmp 1b\r
- \r
-4:\r
- movw r24, r16\r
- movw r22, r12\r
- movw r20, r8\r
- rcall sha1_lastBlock\r
- \r
- pop r24\r
- pop r25\r
- movw r22, r16\r
- rcall sha1_ctx2hash \r
- \r
-sha1_epilog:\r
- in r30, SPL\r
- in r31, SPH\r
- adiw r30, 5*4+8 \r
- in r0, SREG\r
- cli\r
- out SPL, r30\r
- out SPH, r31\r
- out SREG, r0\r
- pop r17\r
- pop r16\r
- pop r13\r
- pop r12\r
- pop r11\r
- pop r10\r
- pop r9\r
- pop r8\r
- ret\r
-\r
-;########################################################### \r
-\r
-\r
-; block MUST NOT be larger than 64 bytes\r
-\r
-.global sha1_lastBlock\r
-; === sha1_lastBlock ===\r
-; this function does padding & Co. for calculating SHA-1 hashes\r
-; param1: the 16-bit pointer to sha1_ctx structure\r
-; given in r25,r24 (r25 is most significant)\r
-; param2: an 16-bit pointer to 64 byte block to hash\r
-; given in r23,r22\r
-; param3: an 16-bit integer specifing length of block in bits\r
-; given in r21,r20\r
-sha1_lastBlock_localSpace = (SHA1_BLOCK_BITS/8+1)\r
-\r
-\r
-sha1_lastBlock:\r
- tst r20\r
- brne sha1_lastBlock_prolog\r
- cpi r21, 0x02\r
- brne sha1_lastBlock_prolog\r
- push r25\r
- push r24\r
- push r23\r
- push r22\r
- rcall sha1_nextBlock\r
- pop r22\r
- pop r23\r
- pop r24\r
- pop r25\r
- clr r21\r
- clr r22\r
-sha1_lastBlock_prolog:\r
- /* allocate space on stack */\r
- in r30, SPL\r
- in r31, SPH\r
- in r1, SREG\r
- subi r30, lo8(64)\r
- sbci r31, hi8(64) /* ??? */\r
- cli\r
- out SPL, r30\r
- out SPH, r31\r
- out SREG,r1\r
-\r
- adiw r30, 1 /* SP points to next free byte on stack */\r
- mov r18, r20 /* r20 = LSB(length) */\r
- lsr r18\r
- lsr r18\r
- lsr r18\r
- bst r21, 0 /* may be we should explain this ... */\r
- bld r18, 5 /* now: r18 == length/8 (aka. length in bytes) */\r
- \r
- \r
- movw r26, r22 /* X points to begin of msg */\r
- tst r18\r
- breq sha1_lastBlock_post_copy\r
- mov r1, r18\r
-sha1_lastBlock_copy_loop:\r
- ld r0, X+\r
- st Z+, r0\r
- dec r1\r
- brne sha1_lastBlock_copy_loop\r
-sha1_lastBlock_post_copy: \r
-sha1_lastBlock_insert_stuffing_bit: \r
- ldi r19, 0x80\r
- mov r0,r19 \r
- ldi r19, 0x07\r
- and r19, r20 /* if we are in bitmode */\r
- breq 2f /* no bitmode */\r
-1: \r
- lsr r0\r
- dec r19\r
- brne 1b\r
- ld r19, X\r
-/* maybe we should do some ANDing here, just for safety */\r
- or r0, r19\r
-2: \r
- st Z+, r0\r
- inc r18\r
-\r
-/* checking stuff here */\r
- cpi r18, 64-8+1\r
- brsh 0f \r
- rjmp sha1_lastBlock_insert_zeros\r
-0:\r
- /* oh shit, we landed here */\r
- /* first we have to fill it up with zeros */\r
- ldi r19, 64\r
- sub r19, r18\r
- breq 2f\r
-1: \r
- st Z+, r1\r
- dec r19\r
- brne 1b \r
-2: \r
- sbiw r30, 63\r
- sbiw r30, 1\r
- movw r22, r30\r
- \r
- push r31\r
- push r30\r
- push r25\r
- push r24\r
- push r21\r
- push r20\r
- rcall sha1_nextBlock\r
- pop r20\r
- pop r21\r
- pop r24\r
- pop r25\r
- pop r30\r
- pop r31\r
- \r
- /* now we should subtract 512 from length */\r
- movw r26, r24\r
- adiw r26, 4*5+1 /* we can skip the lowest byte */\r
- ld r19, X\r
- subi r19, hi8(512)\r
- st X+, r19\r
- ldi r18, 6\r
-1:\r
- ld r19, X\r
- sbci r19, 0\r
- st X+, r19\r
- dec r18\r
- brne 1b\r
- \r
-; clr r18 /* not neccessary ;-) */\r
- /* reset Z pointer to begin of block */\r
-\r
-sha1_lastBlock_insert_zeros: \r
- ldi r19, 64-8\r
- sub r19, r18\r
- breq sha1_lastBlock_insert_length\r
- clr r1\r
-1:\r
- st Z+, r1 /* r1 is still zero */\r
- dec r19\r
- brne 1b\r
-\r
-; rjmp sha1_lastBlock_epilog\r
-sha1_lastBlock_insert_length:\r
- movw r26, r24 /* X points to state */\r
- adiw r26, 5*4 /* X points to (state.length) */\r
- adiw r30, 8 /* Z points one after the last byte of block */\r
- ld r0, X+\r
- add r0, r20\r
- st -Z, r0\r
- ld r0, X+\r
- adc r0, r21\r
- st -Z, r0\r
- ldi r19, 6\r
-1:\r
- ld r0, X+\r
- adc r0, r1\r
- st -Z, r0\r
- dec r19\r
- brne 1b\r
-\r
- sbiw r30, 64-8\r
- movw r22, r30\r
- rcall sha1_nextBlock\r
-\r
-sha1_lastBlock_epilog:\r
- in r30, SPL\r
- in r31, SPH\r
- in r1, SREG\r
- adiw r30, 63 ; lo8(64)\r
- adiw r30, 1 ; hi8(64)\r
- cli\r
- out SPL, r30\r
- out SPH, r31\r
- out SREG,r1\r
- clr r1\r
- clr r0\r
- ret\r
-\r
-/**/\r
-;########################################################### \r
-\r
-.global sha1_nextBlock\r
-; === sha1_nextBlock ===\r
-; this is the core function for calculating SHA-1 hashes\r
-; param1: the 16-bit pointer to sha1_ctx structure\r
-; given in r25,r24 (r25 is most significant)\r
-; param2: an 16-bit pointer to 64 byte block to hash\r
-; given in r23,r22\r
-sha1_nextBlock_localSpace = (16+5+1)*4 ; 16 32-bit values for w array and 5 32-bit values for a array (total 84 byte)\r
-\r
-xtmp = 0\r
-xNULL = 1\r
-W1 = 10\r
-W2 = 11\r
-T1 = 12\r
-T2 = 13\r
-T3 = 14\r
-T4 = 15\r
-LoopC = 16\r
-S = 17\r
-tmp1 = 18\r
-tmp2 = 19\r
-tmp3 = 20\r
-tmp4 = 21\r
-F1 = 22\r
-F2 = 23\r
-F3 = 24\r
-F4 = 25\r
-\r
-/* byteorder: high number <--> high significance */\r
-sha1_nextBlock:\r
- ; initial, let's make some space ready for local vars\r
- /* replace push & pop by mem ops? */\r
- push r10\r
- push r11\r
- push r12\r
- push r13\r
- push r14\r
- push r15\r
- push r16\r
- push r17\r
- push r28\r
- push r29\r
- in r20, SPL\r
- in r21, SPH\r
- movw r18, r20 ;backup SP\r
-; movw r26, r20 ; X points to free space on stack /* maybe removeable? */ \r
- movw r30, r22 ; Z points to message\r
- subi r20, lo8(sha1_nextBlock_localSpace) ;sbiw can do only up to 63\r
- sbci r21, hi8(sha1_nextBlock_localSpace)\r
- movw r26, r20 ; X points to free space on stack \r
- in r0, SREG\r
- cli ; we want to be uninterrupted while updating SP\r
- out SPL, r20\r
- out SPH, r21\r
- out SREG, r0\r
- \r
- push r18\r
- push r19 /* push old SP on new stack */\r
- push r24\r
- push r25 /* param1 will be needed later */\r
- \r
- /* load a[] with state */\r
- movw 28, r24 /* load pointer to state in Y */\r
- adiw r26, 1 ; X++\r
-\r
- ldi LoopC, 5*4 \r
-1: ld tmp1, Y+\r
- st X+, tmp1\r
- dec LoopC\r
- brne 1b\r
-\r
- movw W1, r26 /* save pointer to w[0] */\r
- /* load w[] with endian fixed message */\r
- /* we might also use the changeendian32() function at bottom */\r
- movw r30, r22 /* mv param2 (ponter to msg) to Z */ \r
- ldi LoopC, 16\r
-1:\r
- ldd tmp1, Z+3\r
- st X+, tmp1\r
- ldd tmp1, Z+2\r
- st X+, tmp1\r
- ldd tmp1, Z+1\r
- st X+, tmp1\r
- ld tmp1, Z\r
- st X+, tmp1\r
- adiw r30, 4\r
- dec LoopC\r
- brne 1b\r
- \r
- ;clr LoopC /* LoopC is named t in FIPS 180-2 */ \r
- clr xtmp\r
-sha1_nextBlock_mainloop:\r
- mov S, LoopC\r
- lsl S\r
- lsl S\r
- andi S, 0x3C /* S is a bytepointer so *4 */\r
- /* load w[s] */\r
- movw r26, W1\r
- add r26, S /* X points at w[s] */\r
- adc r27, xNULL\r
- ld T1, X+\r
- ld T2, X+\r
- ld T3, X+\r
- ld T4, X+\r
-\r
- /**/\r
- push r26\r
- push r27\r
- push T4\r
- push T3\r
- push T2\r
- push T1\r
- in r26, SPL\r
- in r27, SPH\r
- adiw r26, 1\r
- dbg_hexdump 4\r
- pop T1\r
- pop T2\r
- pop T3\r
- pop T4\r
- pop r27\r
- pop r26\r
- /**/\r
-\r
- cpi LoopC, 16\r
- brlt sha1_nextBlock_mainloop_core\r
- /* update w[s] */\r
- ldi tmp1, 2*4\r
- rcall 1f\r
- ldi tmp1, 8*4\r
- rcall 1f\r
- ldi tmp1, 13*4\r
- rcall 1f\r
- rjmp 2f\r
-1: /* this might be "outsourced" to save the jump above */\r
- add tmp1, S\r
- andi tmp1, 0x3f\r
- movw r26, W1\r
- add r26, tmp1\r
- adc r27, xNULL\r
- ld tmp2, X+\r
- eor T1, tmp2\r
- ld tmp2, X+\r
- eor T2, tmp2\r
- ld tmp2, X+\r
- eor T3, tmp2\r
- ld tmp2, X+\r
- eor T4, tmp2\r
- ret\r
-2: /* now we just hav to do a ROTL(T) and save T back */\r
- mov tmp2, T4\r
- rol tmp2\r
- rol T1\r
- rol T2\r
- rol T3\r
- rol T4\r
- movw r26, W1\r
- add r26, S\r
- adc r27, xNULL\r
- st X+, T1\r
- st X+, T2\r
- st X+, T3\r
- st X+, T4\r
- \r
-sha1_nextBlock_mainloop_core: /* ther core function; T=ROTL5(a) ....*/ \r
- /* T already contains w[s] */\r
- movw r26, W1\r
- sbiw r26, 4*1 /* X points at a[4] aka e */\r
- ld tmp1, X+ \r
- add T1, tmp1\r
- ld tmp1, X+ \r
- adc T2, tmp1\r
- ld tmp1, X+ \r
- adc T3, tmp1\r
- ld tmp1, X+ \r
- adc T4, tmp1 /* T = w[s]+e */\r
- sbiw r26, 4*5 /* X points at a[0] aka a */\r
- ld F1, X+ \r
- ld F2, X+ \r
- ld F3, X+ \r
- ld F4, X+ \r
- mov tmp1, F4 /* X points at a[1] aka b */\r
- ldi tmp2, 5\r
-1:\r
- rol tmp1\r
- rol F1\r
- rol F2\r
- rol F3\r
- rol F4\r
- dec tmp2\r
- brne 1b\r
- \r
- add T1, F1\r
- adc T2, F2\r
- adc T3, F3\r
- adc T4, F4 /* T = ROTL(a,5) + e + w[s] */\r
- \r
- /* now we have to do this fucking conditional stuff */\r
- ldi r30, lo8(sha1_nextBlock_xTable)\r
- ldi r31, hi8(sha1_nextBlock_xTable)\r
- add r30, xtmp\r
- adc r31, xNULL\r
- lpm tmp1, Z\r
- cp tmp1, LoopC\r
- brne 1f\r
- inc xtmp\r
-1: ldi r30, lo8(sha1_nextBlock_KTable)\r
- ldi r31, hi8(sha1_nextBlock_KTable)\r
- lsl xtmp\r
- lsl xtmp\r
- add r30, xtmp\r
- adc r31, xNULL\r
- lsr xtmp\r
- lsr xtmp\r
- \r
- lpm tmp1, Z+\r
- add T1, tmp1\r
- lpm tmp1, Z+\r
- adc T2, tmp1\r
- lpm tmp1, Z+\r
- adc T3, tmp1\r
- lpm tmp1, Z+\r
- adc T4, tmp1\r
- /* T = ROTL(a,5) + e + kt + w[s] */\r
- \r
- /* wo Z-4 gerade auf kt zeigt ... */\r
- movw r28, r26 /* copy X in Y */\r
- adiw r30, 3*4 /* now Z points to the rigth locatin in our jump-vector-table */\r
- lsr r31\r
- ror r30\r
- \r
- icall\r
- mov F1, tmp1\r
- icall\r
- mov F2, tmp1\r
- icall\r
- mov F3, tmp1\r
- icall\r
- \r
- add T1, F1\r
- adc T2, F2\r
- adc T3, F3\r
- adc T4, tmp1 /* T = ROTL5(a) + f_t(b,c,d) + e + k_t + w[s] */\r
- /* X points still at a[1] aka b, Y points at a[2] aka c */ \r
- /* update a[] */\r
-sha1_nextBlock_update_a:\r
- /*first we move all vars in a[] "one up" e=d, d=c, c=b, b=a*/\r
- //adiw r28, 3*4 /* Y should point at a[4] aka e */\r
- movw r28, W1\r
- sbiw r28, 4\r
- \r
- ldi tmp2, 4*4 \r
-1: \r
- ld tmp1, -Y\r
- std Y+4, tmp1\r
- dec tmp2\r
- brne 1b\r
- /* Y points at a[0] aka a*/\r
- \r
- movw r28, W1\r
- sbiw r28, 5*4\r
- /* store T in a[0] aka a */\r
- st Y+, T1\r
- st Y+, T2\r
- st Y+, T3\r
- st Y+, T4\r
- /* Y points at a[1] aka b*/\r
- \r
- /* rotate c */\r
- ldd T1, Y+1*4\r
- ldd T2, Y+1*4+1\r
- ldd T3, Y+1*4+2\r
- ldd T4, Y+1*4+3\r
- mov tmp1, T1\r
- ldi tmp2, 2\r
-1: ror tmp1\r
- ror T4\r
- ror T3\r
- ror T2\r
- ror T1\r
- dec tmp2\r
- brne 1b\r
- std Y+1*4+0, T1\r
- std Y+1*4+1, T2\r
- std Y+1*4+2, T3\r
- std Y+1*4+3, T4\r
- \r
- push r27\r
- push r26\r
- movw r26, W1\r
- sbiw r26, 4*5\r
- dbg_hexdump 4*5\r
- pop r26\r
- pop r27\r
- \r
- inc LoopC\r
- cpi LoopC, 80\r
- brge 1f\r
- jmp sha1_nextBlock_mainloop\r
-/**************************************/\r
-1: \r
- /* littel patch */\r
- sbiw r28, 4\r
-\r
-/* add a[] to state and inc length */ \r
- pop r27\r
- pop r26 /* now X points to state (and Y still at a[0]) */\r
- ldi tmp4, 5\r
-1: clc\r
- ldi tmp3, 4\r
-2: ld tmp1, X\r
- ld tmp2, Y+\r
- adc tmp1, tmp2\r
- st X+, tmp1\r
- dec tmp3\r
- brne 2b\r
- dec tmp4\r
- brne 1b\r
- \r
- /* now length += 512 */\r
- adiw r26, 1 /* we skip the least significant byte */\r
- ld tmp1, X\r
- ldi tmp2, hi8(512) /* 2 */\r
- add tmp1, tmp2\r
- st X+, tmp1\r
- ldi tmp2, 6\r
-1:\r
- ld tmp1, X\r
- adc tmp1, xNULL\r
- st X+, tmp1\r
- dec tmp2\r
- brne 1b\r
- \r
-; EPILOG\r
-sha1_nextBlock_epilog:\r
-/* now we should clean up the stack */\r
- pop r21\r
- pop r20\r
- in r0, SREG\r
- cli ; we want to be uninterrupted while updating SP\r
- out SPL, r20\r
- out SPH, r21\r
- out SREG, r0\r
- \r
- clr r1\r
- pop r29\r
- pop r28\r
- pop r17\r
- pop r16\r
- pop r15\r
- pop r14\r
- pop r13\r
- pop r12\r
- pop r11\r
- pop r10\r
- ret\r
-\r
-sha1_nextBlock_xTable:\r
-.byte 20,40,60,0\r
-sha1_nextBlock_KTable:\r
-.int 0x5a827999 \r
-.int 0x6ed9eba1 \r
-.int 0x8f1bbcdc \r
-.int 0xca62c1d6\r
-sha1_nextBlock_JumpTable:\r
-jmp sha1_nextBlock_Ch \r
-jmp sha1_nextBlock_Parity\r
-jmp sha1_nextBlock_Maj\r
-jmp sha1_nextBlock_Parity\r
-\r
- /* X and Y still point at a[1] aka b ; return value in tmp1 */\r
-sha1_nextBlock_Ch:\r
- ld tmp1, Y+\r
- mov tmp2, tmp1\r
- com tmp2\r
- ldd tmp3, Y+3 /* load from c */\r
- and tmp1, tmp3\r
- ldd tmp3, Y+7 /* load from d */\r
- and tmp2, tmp3\r
- eor tmp1, tmp2\r
- /**\r
- precall\r
- ldi r24, lo8(ch_str)\r
- ldi r25, hi8(ch_str)\r
- call uart_putstr_P\r
- postcall\r
- /**/\r
- ret\r
- \r
-sha1_nextBlock_Maj:\r
- ld tmp1, Y+\r
- mov tmp2, tmp1\r
- ldd tmp3, Y+3 /* load from c */\r
- and tmp1, tmp3\r
- ldd tmp4, Y+7 /* load from d */\r
- and tmp2, tmp4\r
- eor tmp1, tmp2\r
- and tmp3, tmp4\r
- eor tmp1, tmp3\r
- /**\r
- precall\r
- ldi r24, lo8(maj_str)\r
- ldi r25, hi8(maj_str)\r
- call uart_putstr_P\r
- postcall\r
- /**/\r
- ret\r
-\r
-sha1_nextBlock_Parity:\r
- ld tmp1, Y+\r
- ldd tmp2, Y+3 /* load from c */\r
- eor tmp1, tmp2\r
- ldd tmp2, Y+7 /* load from d */\r
- eor tmp1, tmp2\r
- \r
- /**\r
- precall\r
- ldi r24, lo8(parity_str)\r
- ldi r25, hi8(parity_str)\r
- call uart_putstr_P\r
- postcall\r
- /**/\r
- ret\r
-/* \r
-ch_str: .asciz "\r\nCh"\r
-maj_str: .asciz "\r\nMaj"\r
-parity_str: .asciz "\r\nParity"\r
-*/\r
-;########################################################### \r
-\r
-.global sha1_init \r
-;void sha1_init(sha1_ctx_t *state){\r
-; DEBUG_S("\r\nSHA1_INIT");\r
-; state->h[0] = 0x67452301;\r
-; state->h[1] = 0xefcdab89;\r
-; state->h[2] = 0x98badcfe;\r
-; state->h[3] = 0x10325476;\r
-; state->h[4] = 0xc3d2e1f0;\r
-; state->length = 0;\r
-;}\r
-; param1: (Func3,r24) 16-bit pointer to sha1_ctx_t struct in ram\r
-; modifys: Z(r30,r31), Func1, r22\r
-sha1_init:\r
- movw r26, r24 ; (24,25) --> (26,27) load X with param1\r
- ldi r30, lo8((sha1_init_vector))\r
- ldi r31, hi8((sha1_init_vector))\r
- ldi r22, 5*4 /* bytes to copy */\r
-sha1_init_vloop: \r
- lpm r23, Z+ \r
- st X+, r23\r
- dec r22\r
- brne sha1_init_vloop\r
- ldi r22, 8\r
- clr r1 /* this should not be needed */\r
-sha1_init_lloop:\r
- st X+, r1\r
- dec r22\r
- brne sha1_init_lloop\r
- ret\r
- \r
-sha1_init_vector:\r
-.int 0x67452301;\r
-.int 0xefcdab89;\r
-.int 0x98badcfe;\r
-.int 0x10325476;\r
-.int 0xc3d2e1f0;\r
-/*\r
-;########################################################### \r
-\r
-.global rotl32\r
-; === ROTL32 ===\r
-; function that rotates a 32 bit word to the left\r
-; param1: the 32-bit word to rotate\r
-; given in r25,r24,r23,r22 (r25 is most significant)\r
-; param2: an 8-bit value telling how often to rotate\r
-; given in r20\r
-; modifys: r21, r22\r
-rotl32:\r
- cpi r20, 8\r
- brlo bitrotl\r
- mov r21, r25\r
- mov r25, r24\r
- mov r24, r23\r
- mov r23, r22\r
- mov r22, r21\r
- subi r20, 8\r
- rjmp rotr32\r
-bitrotl:\r
- clr r21\r
- clc\r
-bitrotl_loop: \r
- tst r20\r
- breq fixrotl\r
- rol r22\r
- rol r23\r
- rol r24\r
- rol r25\r
- rol r21\r
- dec r20\r
- rjmp bitrotl_loop\r
-fixrotl:\r
- or r22, r21\r
- ret\r
- \r
-\r
-;########################################################### \r
-\r
-.global rotr32\r
-; === ROTR32 ===\r
-; function that rotates a 32 bit word to the right\r
-; param1: the 32-bit word to rotate\r
-; given in r25,r24,r23,22 (r25 is most significant)\r
-; param2: an 8-bit value telling how often to rotate\r
-; given in r20\r
-; modifys: r21, r22\r
-rotr32:\r
- cpi r20, 8\r
- brlo bitrotr\r
- mov r21, r22\r
- mov r22, r23\r
- mov r23, r24\r
- mov r24, r25\r
- mov r25, r21\r
- subi r20, 8\r
- rjmp rotr32\r
-bitrotr:\r
- clr r21\r
- clc\r
-bitrotr_loop: \r
- tst r20\r
- breq fixrotr\r
- ror r25\r
- ror r24\r
- ror r23\r
- ror r22\r
- ror r21\r
- dec r20\r
- rjmp bitrotr_loop\r
-fixrotr:\r
- or r25, r21\r
- ret\r
- \r
- \r
-;########################################################### \r
- \r
-.global change_endian32\r
-; === change_endian32 ===\r
-; function that changes the endianess of a 32-bit word\r
-; param1: the 32-bit word\r
-; given in r25,r24,r23,22 (r25 is most significant)\r
-; modifys: r21, r22\r
-change_endian32:\r
- movw r20, r22 ; (r22,r23) --> (r20,r21)\r
- mov r22, r25\r
- mov r23, r24\r
- mov r24, r21\r
- mov r25, r20 \r
- ret\r
-*/\r
+/*
+ * Author: Daniel Otte
+ *
+ * License: GPLv3 or later
+*/
+; SHA1 implementation in assembler for AVR
+SHA1_BLOCK_BITS = 512
+SHA1_HASH_BITS = 160
+
+.macro precall
+ /* push r18 - r27, r30 - r31*/
+ push r0
+ push r1
+ push r18
+ push r19
+ push r20
+ push r21
+ push r22
+ push r23
+ push r24
+ push r25
+ push r26
+ push r27
+ push r30
+ push r31
+ clr r1
+.endm
+
+.macro postcall
+ pop r31
+ pop r30
+ pop r27
+ pop r26
+ pop r25
+ pop r24
+ pop r23
+ pop r22
+ pop r21
+ pop r20
+ pop r19
+ pop r18
+ pop r1
+ pop r0
+.endm
+
+
+.macro hexdump length
+ push r27
+ push r26
+ ldi r25, '\r'
+ mov r24, r25
+ call uart_putc
+ ldi r25, '\n'
+ mov r24, r25
+ call uart_putc
+ pop r26
+ pop r27
+ movw r24, r26
+.if \length > 16
+ ldi r22, lo8(16)
+ ldi r23, hi8(16)
+ push r27
+ push r26
+ call uart_hexdump
+ pop r26
+ pop r27
+ adiw r26, 16
+ hexdump \length-16
+.else
+ ldi r22, lo8(\length)
+ ldi r23, hi8(\length)
+ call uart_hexdump
+.endif
+.endm
+
+.macro delay
+/*
+ push r0
+ push r1
+ clr r0
+1: clr r1
+2: dec r1
+ brne 2b
+ dec r0
+ brne 1b
+ pop r1
+ pop r0 // */
+.endm
+
+/* X points to Block */
+.macro dbg_hexdump length
+/*
+ precall
+ hexdump \length
+ postcall
+ // */
+.endm
+
+
+
+.section .text
+
+SPL = 0x3D
+SPH = 0x3E
+SREG = 0x3F
+
+
+;
+;sha1_ctx_t is:
+;
+; [h0][h1][h2][h3][h4][length]
+; hn is 32 bit large, length is 64 bit large
+
+;###########################################################
+
+.global sha1_ctx2hash
+; === sha1_ctx2hash ===
+; this function converts a state into a normal hash (bytestring)
+; param1: the 16-bit destination pointer
+; given in r25,r24 (r25 is most significant)
+; param2: the 16-bit pointer to sha1_ctx structure
+; given in r23,r22
+sha1_ctx2hash:
+ movw r26, r22
+ movw r30, r24
+ ldi r21, 5
+ sbiw r26, 4
+1:
+ ldi r20, 4
+ adiw r26, 8
+2:
+ ld r0, -X
+ st Z+, r0
+ dec r20
+ brne 2b
+
+ dec r21
+ brne 1b
+
+ ret
+
+;###########################################################
+
+.global sha1
+; === sha1 ===
+; this function calculates SHA-1 hashes from messages in RAM
+; param1: the 16-bit hash destination pointer
+; given in r25,r24 (r25 is most significant)
+; param2: the 16-bit pointer to message
+; given in r23,r22
+; param3: 32-bit length value (length of message in bits)
+; given in r21,r20,r19,r18
+sha1:
+sha1_prolog:
+ push r8
+ push r9
+ push r10
+ push r11
+ push r12
+ push r13
+ push r16
+ push r17
+ in r16, SPL
+ in r17, SPH
+ subi r16, 5*4+8
+ sbci r17, 0
+ in r0, SREG
+ cli
+ out SPL, r16
+ out SPH, r17
+ out SREG, r0
+
+ push r25
+ push r24
+ inc r16
+ adc r17, r1
+
+ movw r8, r18 /* backup of length*/
+ movw r10, r20
+
+ movw r12, r22 /* backup of msg-ptr */
+
+ movw r24, r16
+ rcall sha1_init
+ /* if length >= 512 */
+1:
+ tst r11
+ brne 4f
+ tst r10
+ brne 4f
+ mov r19, r9
+ cpi r19, 0x02
+ brlo 4f
+
+ movw r24, r16
+ movw r22, r12
+ rcall sha1_nextBlock
+ ldi r19, 0x64
+ add r22, r19
+ adc r23, r1
+ /* length -= 512 */
+ ldi r19, 0x02
+ sub r9, r19
+ sbc r10, r1
+ sbc r11, r1
+ rjmp 1b
+
+4:
+ movw r24, r16
+ movw r22, r12
+ movw r20, r8
+ rcall sha1_lastBlock
+
+ pop r24
+ pop r25
+ movw r22, r16
+ rcall sha1_ctx2hash
+
+sha1_epilog:
+ in r30, SPL
+ in r31, SPH
+ adiw r30, 5*4+8
+ in r0, SREG
+ cli
+ out SPL, r30
+ out SPH, r31
+ out SREG, r0
+ pop r17
+ pop r16
+ pop r13
+ pop r12
+ pop r11
+ pop r10
+ pop r9
+ pop r8
+ ret
+
+;###########################################################
+
+
+; block MUST NOT be larger than 64 bytes
+
+.global sha1_lastBlock
+; === sha1_lastBlock ===
+; this function does padding & Co. for calculating SHA-1 hashes
+; param1: the 16-bit pointer to sha1_ctx structure
+; given in r25,r24 (r25 is most significant)
+; param2: a 16-bit pointer to 64 byte block to hash
+; given in r23,r22
+; param3: a 16-bit integer specifying length of block in bits
+; given in r21,r20
+sha1_lastBlock_localSpace = (SHA1_BLOCK_BITS/8+1)
+
+
+sha1_lastBlock:
+ tst r20
+ brne sha1_lastBlock_prolog
+ cpi r21, 0x02
+ brne sha1_lastBlock_prolog
+ push r25
+ push r24
+ push r23
+ push r22
+ rcall sha1_nextBlock
+ pop r22
+ pop r23
+ pop r24
+ pop r25
+ clr r21
+ clr r22
+sha1_lastBlock_prolog:
+ /* allocate space on stack */
+ in r30, SPL
+ in r31, SPH
+ in r1, SREG
+ subi r30, lo8(64)
+ sbci r31, hi8(64) /* ??? */
+ cli
+ out SPL, r30
+ out SPH, r31
+ out SREG,r1
+
+ adiw r30, 1 /* SP points to next free byte on stack */
+ mov r18, r20 /* r20 = LSB(length) */
+ lsr r18
+ lsr r18
+ lsr r18
+ bst r21, 0 /* may be we should explain this ... */
+ bld r18, 5 /* now: r18 == length/8 (aka. length in bytes) */
+
+
+ movw r26, r22 /* X points to begin of msg */
+ tst r18
+ breq sha1_lastBlock_post_copy
+ mov r1, r18
+sha1_lastBlock_copy_loop:
+ ld r0, X+
+ st Z+, r0
+ dec r1
+ brne sha1_lastBlock_copy_loop
+sha1_lastBlock_post_copy:
+sha1_lastBlock_insert_stuffing_bit:
+ ldi r19, 0x80
+ mov r0,r19
+ ldi r19, 0x07
+ and r19, r20 /* if we are in bitmode */
+ breq 2f /* no bitmode */
+1:
+ lsr r0
+ dec r19
+ brne 1b
+ ld r19, X
+/* maybe we should do some ANDing here, just for safety */
+ or r0, r19
+2:
+ st Z+, r0
+ inc r18
+
+/* checking stuff here */
+ cpi r18, 64-8+1
+ brsh 0f
+ rjmp sha1_lastBlock_insert_zeros
+0:
+ /* oh shit, we landed here */
+ /* first we have to fill it up with zeros */
+ ldi r19, 64
+ sub r19, r18
+ breq 2f
+1:
+ st Z+, r1
+ dec r19
+ brne 1b
+2:
+ sbiw r30, 63
+ sbiw r30, 1
+ movw r22, r30
+
+ push r31
+ push r30
+ push r25
+ push r24
+ push r21
+ push r20
+ rcall sha1_nextBlock
+ pop r20
+ pop r21
+ pop r24
+ pop r25
+ pop r30
+ pop r31
+
+ /* now we should subtract 512 from length */
+ movw r26, r24
+ adiw r26, 4*5+1 /* we can skip the lowest byte */
+ ld r19, X
+ subi r19, hi8(512)
+ st X+, r19
+ ldi r18, 6
+1:
+ ld r19, X
+ sbci r19, 0
+ st X+, r19
+ dec r18
+ brne 1b
+
+; clr r18 /* not necessary ;-) */
+ /* reset Z pointer to begin of block */
+
+sha1_lastBlock_insert_zeros:
+ ldi r19, 64-8
+ sub r19, r18
+ breq sha1_lastBlock_insert_length
+ clr r1
+1:
+ st Z+, r1 /* r1 is still zero */
+ dec r19
+ brne 1b
+
+; rjmp sha1_lastBlock_epilog
+sha1_lastBlock_insert_length:
+ movw r26, r24 /* X points to state */
+ adiw r26, 5*4 /* X points to (state.length) */
+ adiw r30, 8 /* Z points one after the last byte of block */
+ ld r0, X+
+ add r0, r20
+ st -Z, r0
+ ld r0, X+
+ adc r0, r21
+ st -Z, r0
+ ldi r19, 6
+1:
+ ld r0, X+
+ adc r0, r1
+ st -Z, r0
+ dec r19
+ brne 1b
+
+ sbiw r30, 64-8
+ movw r22, r30
+ rcall sha1_nextBlock
+
+sha1_lastBlock_epilog:
+ in r30, SPL
+ in r31, SPH
+ in r1, SREG
+ adiw r30, 63 ; lo8(64)
+ adiw r30, 1 ; hi8(64)
+ cli
+ out SPL, r30
+ out SPH, r31
+ out SREG,r1
+ clr r1
+ clr r0
+ ret
+
+/**/
+;###########################################################
+
+.global sha1_nextBlock
+; === sha1_nextBlock ===
+; this is the core function for calculating SHA-1 hashes
+; param1: the 16-bit pointer to sha1_ctx structure
+; given in r25,r24 (r25 is most significant)
+; param2: a 16-bit pointer to 64 byte block to hash
+; given in r23,r22
+sha1_nextBlock_localSpace = (16+5+1)*4 ; 16 32-bit values for w array and 5 32-bit values for a array (total 84 byte)
+
+xtmp = 0
+xNULL = 1
+W1 = 10
+W2 = 11
+T1 = 12
+T2 = 13
+T3 = 14
+T4 = 15
+LoopC = 16
+S = 17
+tmp1 = 18
+tmp2 = 19
+tmp3 = 20
+tmp4 = 21
+F1 = 22
+F2 = 23
+F3 = 24
+F4 = 25
+
+/* byteorder: high number <--> high significance */
+sha1_nextBlock:
+ ; initial, let's make some space ready for local vars
+ /* replace push & pop by mem ops? */
+ push r10
+ push r11
+ push r12
+ push r13
+ push r14
+ push r15
+ push r16
+ push r17
+ push r28
+ push r29
+ in r20, SPL
+ in r21, SPH
+ movw r18, r20 ;backup SP
+; movw r26, r20 ; X points to free space on stack /* maybe removeable? */
+ movw r30, r22 ; Z points to message
+ subi r20, lo8(sha1_nextBlock_localSpace) ;sbiw can do only up to 63
+ sbci r21, hi8(sha1_nextBlock_localSpace)
+ movw r26, r20 ; X points to free space on stack
+ in r0, SREG
+ cli ; we want to be uninterrupted while updating SP
+ out SPL, r20
+ out SPH, r21
+ out SREG, r0
+
+ push r18
+ push r19 /* push old SP on new stack */
+ push r24
+ push r25 /* param1 will be needed later */
+
+ /* load a[] with state */
+ movw 28, r24 /* load pointer to state in Y */
+ adiw r26, 1 ; X++
+
+ ldi LoopC, 5*4
+1: ld tmp1, Y+
+ st X+, tmp1
+ dec LoopC
+ brne 1b
+
+ movw W1, r26 /* save pointer to w[0] */
+ /* load w[] with endian fixed message */
+ /* we might also use the changeendian32() function at bottom */
+ movw r30, r22 /* mv param2 (pointer to msg) to Z */
+ ldi LoopC, 16
+1:
+ ldd tmp1, Z+3
+ st X+, tmp1
+ ldd tmp1, Z+2
+ st X+, tmp1
+ ldd tmp1, Z+1
+ st X+, tmp1
+ ld tmp1, Z
+ st X+, tmp1
+ adiw r30, 4
+ dec LoopC
+ brne 1b
+
+ ;clr LoopC /* LoopC is named t in FIPS 180-2 */
+ clr xtmp
+sha1_nextBlock_mainloop:
+ mov S, LoopC
+ lsl S
+ lsl S
+ andi S, 0x3C /* S is a bytepointer so *4 */
+ /* load w[s] */
+ movw r26, W1
+ add r26, S /* X points at w[s] */
+ adc r27, xNULL
+ ld T1, X+
+ ld T2, X+
+ ld T3, X+
+ ld T4, X+
+
+ /**/
+ push r26
+ push r27
+ push T4
+ push T3
+ push T2
+ push T1
+ in r26, SPL
+ in r27, SPH
+ adiw r26, 1
+ dbg_hexdump 4
+ pop T1
+ pop T2
+ pop T3
+ pop T4
+ pop r27
+ pop r26
+ /**/
+
+ cpi LoopC, 16
+ brlt sha1_nextBlock_mainloop_core
+ /* update w[s] */
+ ldi tmp1, 2*4
+ rcall 1f
+ ldi tmp1, 8*4
+ rcall 1f
+ ldi tmp1, 13*4
+ rcall 1f
+ rjmp 2f
+1: /* this might be "outsourced" to save the jump above */
+ add tmp1, S
+ andi tmp1, 0x3f
+ movw r26, W1
+ add r26, tmp1
+ adc r27, xNULL
+ ld tmp2, X+
+ eor T1, tmp2
+ ld tmp2, X+
+ eor T2, tmp2
+ ld tmp2, X+
+ eor T3, tmp2
+ ld tmp2, X+
+ eor T4, tmp2
+ ret
+2: /* now we just have to do a ROTL(T) and save T back */
+ mov tmp2, T4
+ rol tmp2
+ rol T1
+ rol T2
+ rol T3
+ rol T4
+ movw r26, W1
+ add r26, S
+ adc r27, xNULL
+ st X+, T1
+ st X+, T2
+ st X+, T3
+ st X+, T4
+
+sha1_nextBlock_mainloop_core: /* the core function; T=ROTL5(a) ....*/
+ /* T already contains w[s] */
+ movw r26, W1
+ sbiw r26, 4*1 /* X points at a[4] aka e */
+ ld tmp1, X+
+ add T1, tmp1
+ ld tmp1, X+
+ adc T2, tmp1
+ ld tmp1, X+
+ adc T3, tmp1
+ ld tmp1, X+
+ adc T4, tmp1 /* T = w[s]+e */
+ sbiw r26, 4*5 /* X points at a[0] aka a */
+ ld F1, X+
+ ld F2, X+
+ ld F3, X+
+ ld F4, X+
+ mov tmp1, F4 /* X points at a[1] aka b */
+ ldi tmp2, 5
+1:
+ rol tmp1
+ rol F1
+ rol F2
+ rol F3
+ rol F4
+ dec tmp2
+ brne 1b
+
+ add T1, F1
+ adc T2, F2
+ adc T3, F3
+ adc T4, F4 /* T = ROTL(a,5) + e + w[s] */
+
+ /* now we have to do the round-dependent conditional logic */
+ ldi r30, lo8(sha1_nextBlock_xTable)
+ ldi r31, hi8(sha1_nextBlock_xTable)
+ add r30, xtmp
+ adc r31, xNULL
+ lpm tmp1, Z
+ cp tmp1, LoopC
+ brne 1f
+ inc xtmp
+1: ldi r30, lo8(sha1_nextBlock_KTable)
+ ldi r31, hi8(sha1_nextBlock_KTable)
+ lsl xtmp
+ lsl xtmp
+ add r30, xtmp
+ adc r31, xNULL
+ lsr xtmp
+ lsr xtmp
+
+ lpm tmp1, Z+
+ add T1, tmp1
+ lpm tmp1, Z+
+ adc T2, tmp1
+ lpm tmp1, Z+
+ adc T3, tmp1
+ lpm tmp1, Z+
+ adc T4, tmp1
+ /* T = ROTL(a,5) + e + kt + w[s] */
+
+ /* wo Z-4 gerade auf kt zeigt ... */
+ movw r28, r26 /* copy X in Y */
+ adiw r30, 3*4 /* now Z points to the right location in our jump-vector-table */
+ lsr r31
+ ror r30
+
+ icall
+ mov F1, tmp1
+ icall
+ mov F2, tmp1
+ icall
+ mov F3, tmp1
+ icall
+
+ add T1, F1
+ adc T2, F2
+ adc T3, F3
+ adc T4, tmp1 /* T = ROTL5(a) + f_t(b,c,d) + e + k_t + w[s] */
+ /* X points still at a[1] aka b, Y points at a[2] aka c */
+ /* update a[] */
+sha1_nextBlock_update_a:
+ /*first we move all vars in a[] "one up" e=d, d=c, c=b, b=a*/
+ //adiw r28, 3*4 /* Y should point at a[4] aka e */
+ movw r28, W1
+ sbiw r28, 4
+
+ ldi tmp2, 4*4
+1:
+ ld tmp1, -Y
+ std Y+4, tmp1
+ dec tmp2
+ brne 1b
+ /* Y points at a[0] aka a*/
+
+ movw r28, W1
+ sbiw r28, 5*4
+ /* store T in a[0] aka a */
+ st Y+, T1
+ st Y+, T2
+ st Y+, T3
+ st Y+, T4
+ /* Y points at a[1] aka b*/
+
+ /* rotate c */
+ ldd T1, Y+1*4
+ ldd T2, Y+1*4+1
+ ldd T3, Y+1*4+2
+ ldd T4, Y+1*4+3
+ mov tmp1, T1
+ ldi tmp2, 2
+1: ror tmp1
+ ror T4
+ ror T3
+ ror T2
+ ror T1
+ dec tmp2
+ brne 1b
+ std Y+1*4+0, T1
+ std Y+1*4+1, T2
+ std Y+1*4+2, T3
+ std Y+1*4+3, T4
+
+ push r27
+ push r26
+ movw r26, W1
+ sbiw r26, 4*5
+ dbg_hexdump 4*5
+ pop r26
+ pop r27
+
+ inc LoopC
+ cpi LoopC, 80
+ brge 1f
+ jmp sha1_nextBlock_mainloop
+/**************************************/
+1:
+ /* little patch */
+ sbiw r28, 4
+
+/* add a[] to state and inc length */
+ pop r27
+ pop r26 /* now X points to state (and Y still at a[0]) */
+ ldi tmp4, 5
+1: clc
+ ldi tmp3, 4
+2: ld tmp1, X
+ ld tmp2, Y+
+ adc tmp1, tmp2
+ st X+, tmp1
+ dec tmp3
+ brne 2b
+ dec tmp4
+ brne 1b
+
+ /* now length += 512 */
+ adiw r26, 1 /* we skip the least significant byte */
+ ld tmp1, X
+ ldi tmp2, hi8(512) /* 2 */
+ add tmp1, tmp2
+ st X+, tmp1
+ ldi tmp2, 6
+1:
+ ld tmp1, X
+ adc tmp1, xNULL
+ st X+, tmp1
+ dec tmp2
+ brne 1b
+
+; EPILOG
+sha1_nextBlock_epilog:
+/* now we should clean up the stack */
+ pop r21
+ pop r20
+ in r0, SREG
+ cli ; we want to be uninterrupted while updating SP
+ out SPL, r20
+ out SPH, r21
+ out SREG, r0
+
+ clr r1
+ pop r29
+ pop r28
+ pop r17
+ pop r16
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop r11
+ pop r10
+ ret
+
+sha1_nextBlock_xTable:
+.byte 20,40,60,0
+sha1_nextBlock_KTable:
+.int 0x5a827999
+.int 0x6ed9eba1
+.int 0x8f1bbcdc
+.int 0xca62c1d6
+sha1_nextBlock_JumpTable:
+jmp sha1_nextBlock_Ch
+jmp sha1_nextBlock_Parity
+jmp sha1_nextBlock_Maj
+jmp sha1_nextBlock_Parity
+
+ /* Ch(b,c,d) = (b AND c) XOR ((NOT b) AND d), computed one byte per call; X and Y still point at a[1] aka b ; return value in tmp1 */
+sha1_nextBlock_Ch:
+ ld tmp1, Y+ /* tmp1 = next byte of b; post-increment advances Y for the following call */
+ mov tmp2, tmp1
+ com tmp2 /* tmp2 = NOT b */
+ ldd tmp3, Y+3 /* load from c (c = b+4; Y was already advanced by 1) */
+ and tmp1, tmp3 /* tmp1 = b AND c */
+ ldd tmp3, Y+7 /* load from d (d = b+8) */
+ and tmp2, tmp3 /* tmp2 = (NOT b) AND d */
+ eor tmp1, tmp2 /* tmp1 = Ch(b,c,d) */
+ /** debug output, disabled by this comment block
+ precall
+ ldi r24, lo8(ch_str)
+ ldi r25, hi8(ch_str)
+ call uart_putstr_P
+ postcall
+ /**/
+ ret
+
+sha1_nextBlock_Maj: /* Maj(b,c,d) = (b&c) XOR (b&d) XOR (c&d), one byte per call; result in tmp1 */
+ ld tmp1, Y+ /* tmp1 = next byte of b; post-increment advances Y for the following call */
+ mov tmp2, tmp1
+ ldd tmp3, Y+3 /* load from c (c = b+4; Y was already advanced by 1) */
+ and tmp1, tmp3 /* tmp1 = b AND c */
+ ldd tmp4, Y+7 /* load from d (d = b+8) */
+ and tmp2, tmp4 /* tmp2 = b AND d */
+ eor tmp1, tmp2
+ and tmp3, tmp4 /* tmp3 = c AND d */
+ eor tmp1, tmp3 /* tmp1 = Maj(b,c,d) */
+ /** debug output, disabled by this comment block
+ precall
+ ldi r24, lo8(maj_str)
+ ldi r25, hi8(maj_str)
+ call uart_putstr_P
+ postcall
+ /**/
+ ret
+
+sha1_nextBlock_Parity: /* Parity(b,c,d) = b XOR c XOR d, one byte per call; result in tmp1 */
+ ld tmp1, Y+ /* tmp1 = next byte of b; post-increment advances Y for the following call */
+ ldd tmp2, Y+3 /* load from c (c = b+4; Y was already advanced by 1) */
+ eor tmp1, tmp2
+ ldd tmp2, Y+7 /* load from d (d = b+8) */
+ eor tmp1, tmp2 /* tmp1 = Parity(b,c,d) */
+
+ /** debug output, disabled by this comment block
+ precall
+ ldi r24, lo8(parity_str)
+ ldi r25, hi8(parity_str)
+ call uart_putstr_P
+ postcall
+ /**/
+ ret
+/*
+ch_str: .asciz "\r\nCh"
+maj_str: .asciz "\r\nMaj"
+parity_str: .asciz "\r\nParity"
+*/
+;###########################################################
+
+.global sha1_init
+;void sha1_init(sha1_ctx_t *state){
+; DEBUG_S("\r\nSHA1_INIT");
+; state->h[0] = 0x67452301;
+; state->h[1] = 0xefcdab89;
+; state->h[2] = 0x98badcfe;
+; state->h[3] = 0x10325476;
+; state->h[4] = 0xc3d2e1f0;
+; state->length = 0;
+;}
+; param1: (Func3,r24) 16-bit pointer to sha1_ctx_t struct in ram
+; modifies: X(r26,r27), Z(r30,r31), r22, r23; also clears r1
+sha1_init:
+ movw r26, r24 ; (24,25) --> (26,27) load X with param1 (X = &state->h[0])
+ ldi r30, lo8((sha1_init_vector)) ; Z = flash address of the initial chaining values
+ ldi r31, hi8((sha1_init_vector))
+ ldi r22, 5*4 /* bytes to copy */
+sha1_init_vloop: ; copy h0..h4 (20 bytes) from flash into the context
+ lpm r23, Z+
+ st X+, r23
+ dec r22
+ brne sha1_init_vloop
+ ldi r22, 8 ; now zero the 64-bit length field (8 bytes)
+ clr r1 /* r1 is avr-gcc's zero register, so this should not be needed */
+sha1_init_lloop:
+ st X+, r1
+ dec r22
+ brne sha1_init_lloop
+ ret
+
+sha1_init_vector: ; SHA-1 initial hash values (FIPS 180-2), kept in flash and read via lpm
+.int 0x67452301;
+.int 0xefcdab89;
+.int 0x98badcfe;
+.int 0x10325476;
+.int 0xc3d2e1f0;
+/*
+;###########################################################
+
+.global rotl32
+; === ROTL32 ===
+; function that rotates a 32 bit word to the left
+; param1: the 32-bit word to rotate
+; given in r25,r24,r23,r22 (r25 is most significant)
+; param2: an 8-bit value telling how often to rotate
+; given in r20
+; modifys: r21, r22
+rotl32:
+ cpi r20, 8
+ brlo bitrotl
+ mov r21, r25
+ mov r25, r24
+ mov r24, r23
+ mov r23, r22
+ mov r22, r21
+ subi r20, 8
+ rjmp rotr32
+bitrotl:
+ clr r21
+ clc
+bitrotl_loop:
+ tst r20
+ breq fixrotl
+ rol r22
+ rol r23
+ rol r24
+ rol r25
+ rol r21
+ dec r20
+ rjmp bitrotl_loop
+fixrotl:
+ or r22, r21
+ ret
+
+
+;###########################################################
+
+.global rotr32
+; === ROTR32 ===
+; function that rotates a 32 bit word to the right
+; param1: the 32-bit word to rotate
+; given in r25,r24,r23,r22 (r25 is most significant)
+; param2: an 8-bit value telling how often to rotate
+; given in r20
+; modifys: r21, r22
+rotr32:
+ cpi r20, 8
+ brlo bitrotr
+ mov r21, r22
+ mov r22, r23
+ mov r23, r24
+ mov r24, r25
+ mov r25, r21
+ subi r20, 8
+ rjmp rotr32
+bitrotr:
+ clr r21
+ clc
+bitrotr_loop:
+ tst r20
+ breq fixrotr
+ ror r25
+ ror r24
+ ror r23
+ ror r22
+ ror r21
+ dec r20
+ rjmp bitrotr_loop
+fixrotr:
+ or r25, r21
+ ret
+
+
+;###########################################################
+
+.global change_endian32
+; === change_endian32 ===
+; function that changes the endianess of a 32-bit word
+; param1: the 32-bit word
+; given in r25,r24,r23,r22 (r25 is most significant)
+; modifys: r21, r22
+change_endian32:
+ movw r20, r22 ; (r22,r23) --> (r20,r21)
+ mov r22, r25
+ mov r23, r24
+ mov r24, r21
+ mov r25, r20
+ ret
+*/