/* md5-asm.S */
/*
    This file is part of the AVR-Crypto-Lib.
    Copyright (C) 2008  Daniel Otte (daniel.otte@rub.de)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
/*
 * Author:  Daniel Otte
 * License: GPLv3 or later
 * Date:    2008-11-15
 */

#include "avr-asm-macros.S"

;###########################################################
/*
 * T_table: the 64 additive 32-bit step constants of MD5
 * (RFC 1321: T[i] = floor(2^32 * abs(sin(i+1))), i = 0..63).
 *
 * The table lives in program memory and is read with lpm, low byte
 * first. Each entry is emitted as one little-endian 32-bit word, so
 * entry i starts at byte offset 4*i.
 */
T_table:
	/* steps  0..15 (round 1) */
	.long 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee
	.long 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501
	.long 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be
	.long 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821
	/* steps 16..31 (round 2) */
	.long 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa
	.long 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8
	.long 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed
	.long 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a
	/* steps 32..47 (round 3) */
	.long 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c
	.long 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70
	.long 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05
	.long 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665
	/* steps 48..63 (round 4) */
	.long 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039
	.long 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1
	.long 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1
	.long 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391

/* Select the unrolled (larger, loop-free) md5_init implementation. */
#define MD5_init_fast
.global md5_init
#ifndef MD5_init_fast
;###########################################################
;void md5_init(md5_ctx_t *state)
; param1: (r24,r25) 16-bit pointer to md5_ctx_t struct in ram
; modifys: r0, r24, Z(r30,r31), X(r26,r27)
; size = 9 instruction WORDS + 10 data WORDS = 19 WORDS = 38 Bytes
/*
 * Compact variant: copies the 16-byte MD5 start value plus a zeroed
 * 4-byte block counter (20 bytes total) from flash into *state.
 */
md5_init:
	movw r26, r24 ; (24,25) --> (26,27) load X with param1
	ldi r30, lo8(md5_init_vector)
	ldi r31, hi8(md5_init_vector)
	ldi r24, 16+4 ; bytes to copy: four state words + counter
md5_init_vloop:
	lpm r0, Z+
	st X+, r0
	dec r24
	brne md5_init_vloop
	ret

/* Initial chaining value A,B,C,D (little endian) followed by counter = 0 */
md5_init_vector:
	.hword 0x2301, 0x6745
	.hword 0xAB89, 0xEFCD
	.hword 0xDCFE, 0x98BA
	.hword 0x5476, 0x1032
	.hword 0x0000, 0x0000

#else
;###########################################################
.global md5_init_fast
;void md5_init(md5_ctx_t *state)
; param1: (r24,r25) 16-bit pointer to md5_ctx_t struct in ram
; modifys: r24, X(r26,r27)
; cycles = 1+16*3+4*2+4 = 1+48+12 = 61
; size = 1+16*2+4+1 WORDS = 38 WORDS = 76 Bytes
/*
 * Unrolled variant: stores the 16 bytes of the MD5 start value with
 * immediate loads, then four zero bytes (r1 is the zero register) for
 * the block counter.
 */
md5_init:
md5_init_fast:
	movw r26, r24
	/* a[0] = 0x67452301 */
	ldi r24, 0x01
	st X+, r24
	ldi r24, 0x23
	st X+, r24
	ldi r24, 0x45
	st X+, r24
	ldi r24, 0x67
	st X+, r24
	/* a[1] = 0xEFCDAB89 */
	ldi r24, 0x89
	st X+, r24
	ldi r24, 0xAB
	st X+, r24
	ldi r24, 0xCD
	st X+, r24
	ldi r24, 0xEF
	st X+, r24
	/* a[2] = 0x98BADCFE */
	ldi r24, 0xFE
	st X+, r24
	ldi r24, 0xDC
	st X+, r24
	ldi r24, 0xBA
	st X+, r24
	ldi r24, 0x98
	st X+, r24
	/* a[3] = 0x10325476 */
	ldi r24, 0x76
	st X+, r24
	ldi r24, 0x54
	st X+, r24
	ldi r24, 0x32
	st X+, r24
	ldi r24, 0x10
	st X+, r24
	/* counter = 0 */
	st X+, r1
	st X+, r1
	st X+, r1
	st X+, r1
	ret
#endif

;###########################################################
/*
 * The four MD5 round functions. They are not called; md5_core_asm
 * reaches them with ijmp through jump_table and each one jumps back to
 * md5_core_F_exit with its result in r22-r25.
 */

/*
static
uint32_t md5_F(uint32_t x, uint32_t y, uint32_t z){
	return ((x&y)|((~x)&z));
}
*/
; x: r22-r25
; y: r18-r21 (overwritten)
; z: r14-r17
md5_F:
	and r18, r22
	and r19, r23
	and r20, r24
	and r21, r25
	com r22
	com r23
	com r24
	com r25
	and r22, r14
	and r23, r15
	and r24, r16
	and r25, r17
	or r22, r18
	or r23, r19
	or r24, r20
	or r25, r21
	rjmp md5_core_F_exit

/*
static
uint32_t md5_G(uint32_t x, uint32_t y, uint32_t z){
	return ((x&z)|((~z)&y));
}
*/
; x: r22-r25
; y: r18-r21 (overwritten)
; z: r14-r17 (overwritten)
md5_G:
	and r22, r14
	and r23, r15
	and r24, r16
	and r25, r17
	com r14
	com r15
	com r16
	com r17
	and r18, r14
	and r19, r15
	and r20, r16
	and r21, r17
	or r22, r18
	or r23, r19
	or r24, r20
	or r25, r21
	rjmp md5_core_F_exit

/*
static
uint32_t md5_H(uint32_t x, uint32_t y, uint32_t z){
	return (x^y^z);
}
*/
; x: r22-r25
; y: r18-r21
; z: r14-r17
md5_H:
	eor r22, r18
	eor r22, r14
	eor r23, r19
	eor r23, r15
	eor r24, r20
	eor r24, r16
	eor r25, r21
	eor r25, r17
	rjmp md5_core_F_exit

/*
static
uint32_t md5_I(uint32_t x, uint32_t y, uint32_t z){
	return (y ^ (x | (~z)));
}
*/

/*
 * Dispatch table indexed by fi (0..3), one rjmp (one word) per entry.
 * Entry 3 is omitted on purpose: md5_I starts directly behind the
 * table, so jumping to slot 3 lands on its first instruction.
 */
jump_table:
	rjmp md5_F
	rjmp md5_G
	rjmp md5_H
;	rjmp md5_I

; x: r22-r25
; y: r18-r21
; z: r14-r17 (overwritten)
md5_I:
	com r14
	com r15
	com r16
	com r17
	or r22, r14
	or r23, r15
	or r24, r16
	or r25, r17
	eor r22, r18
	eor r23, r19
	eor r24, r20
	eor r25, r21
	rjmp md5_core_F_exit

/*
 * Byte offsets into a[4] for each value of as (row = (as-1)&3).
 * Columns are loaded into AS_SAVE0..AS_SAVE3 in this order.
 */
as_table:
; (as+0)&3  (as+3)&3  (as+1)&3  (as+2)&3
;              Z         X         Y
; AS_SAVE0  AS_SAVE1  AS_SAVE2  AS_SAVE3
	.byte 1*4, 0*4, 2*4, 3*4 ;as=1
	.byte 2*4, 1*4, 3*4, 0*4 ;as=2
	.byte 3*4, 2*4, 0*4, 1*4 ;as=3
	.byte 0*4, 3*4, 1*4, 2*4 ;as=4

;###########################################################
/*
 * C-callable entry point for one MD5 step.
 *
 * Arguments arrive in the avr-gcc layout (one even-aligned register
 * pair per argument):
 *   a     : r24:r25
 *   block : r22:r23
 *   as    : r20
 *   s     : r18
 *   i     : r16
 *   fi    : r14
 * They are repacked into the compact layout md5_core_asm expects.
 *
 * md5_core_asm itself only preserves r4-r8, r16 and r17, but also
 * overwrites r9-r15 and r28:r29. Those are call-saved registers, so
 * this wrapper saves and restores them; a C caller would otherwise see
 * its register variables and frame pointer corrupted. The internal
 * caller md5_nextBlock saves these registers itself and keeps calling
 * md5_core_asm directly.
 */
.global md5_core
md5_core:
	push_range 9, 15
	push r28
	push r29
	mov r21, r20 ; as
	mov r20, r18 ; s
	mov r19, r16 ; i
	mov r18, r14 ; fi
	rcall md5_core_asm
	pop r29
	pop r28
	pop_range 9, 15
	ret

/*
void md5_core(uint32_t *a, void *block, uint8_t as, uint8_t s, uint8_t i, uint8_t fi){
	uint32_t t;
	md5_func_t *funcs[]={md5_F, md5_G, md5_H, md5_I};
	as &= 0x3;
	/ * a = b + ((a + F(b,c,d) + X[k] + T[i]) <<< s). * /
	t = a[as] + funcs[fi](a[(as+1)&3], a[(as+2)&3], a[(as+3)&3]) + *((uint32_t*)block) + md5_T[i] ;
	a[as]=a[(as+1)&3] + ROTL32(t, s);
}
*/

/* parameter registers of md5_core_asm */
; a:     r24-r25
; block: r22-r23
; as:    r21
; s:     r20
; i:     r19
; fi:    r18
P_A0 = 24
P_A1 = 25
P_B0 = 22
P_B1 = 23
P_AS = 21
P_S  = 20
P_I  = 19
P_FI = 18

/* working registers */
; x: r22-r25
; y: r18-r21
; z: r14-r17
AS_SAVE0 = 4
AS_SAVE1 = 5
AS_SAVE2 = 6
AS_SAVE3 = 7
FI_SAVE  = 8
S_SAVE   = 9
ACCU0 = 10
ACCU1 = 11
ACCU2 = 12
ACCU3 = 13
ARG_X0 = 22
ARG_X1 = 23
ARG_X2 = 24
ARG_X3 = 25
ARG_Y0 = 18
ARG_Y1 = 19
ARG_Y2 = 20
ARG_Y3 = 21
ARG_Z0 = 14
ARG_Z1 = 15
ARG_Z2 = 16
ARG_Z3 = 17

/*
 * One MD5 step with the compact register interface listed above.
 * Expects r1 == 0 on entry.
 * Preserves r4-r8, r16, r17.
 * Overwrites r0, r9-r15, r18-r31.
 */
md5_core_asm:
	push r16
	push r17
	push_range 4, 8
	/* Z = T_table + 4*i; the shifted-out bits go into r1 (zero on entry) */
	ldi r30, lo8(T_table)
	ldi r31, hi8(T_table)
	lsl P_I
	rol r1
	lsl P_I
	rol r1
	add r30, P_I
	adc r31, r1
	clr r1 ; restore the zero register
	mov FI_SAVE, r18
	/* loading T[i] into ACCU */
	lpm ACCU0, Z+
	lpm ACCU1, Z+
	lpm ACCU2, Z+
	lpm ACCU3, Z
	/* add *block to ACCU */
	movw r30, P_B0
	ld r0, Z+
	add ACCU0, r0
	ld r0, Z+
	adc ACCU1, r0
	ld r0, Z+
	adc ACCU2, r0
	ld r0, Z+
	adc ACCU3, r0
	/* add a[as+0&3] to ACCU */
	ldi r30, lo8(as_table)
	ldi r31, hi8(as_table)
	dec P_AS
	andi P_AS, 0x03
	lsl P_AS
	lsl P_AS ; row offset = ((as-1)&3)*4
	add r30, r21
	adc r31, r1 ; Z points to the correct row in as_table
	lpm AS_SAVE0, Z+
	lpm AS_SAVE1, Z+
	lpm AS_SAVE2, Z+
	lpm AS_SAVE3, Z
	movw r26, r24 ; X points to a[0]
	add r26, AS_SAVE0
	adc r27, r1 ; X points at a[as&3]
	ld r0, X+
	add ACCU0, r0
	ld r0, X+
	adc ACCU1, r0
	ld r0, X+
	adc ACCU2, r0
	ld r0, X+
	adc ACCU3, r0
	mov S_SAVE, r20
	movw r28, r24 ; keep a[] base in Y, r24:r25 is reused for x below
	/* loading z value */
	movw r26, r28
	add r26, AS_SAVE1
	adc r27, r1
	ld ARG_Z0, X+
	ld ARG_Z1, X+
	ld ARG_Z2, X+
	ld ARG_Z3, X
	/* loading x value */
	movw r26, r28
	add r26, AS_SAVE2
	adc r27, r1
	ld ARG_X0, X+
	ld ARG_X1, X+
	ld ARG_X2, X+
	ld ARG_X3, X
	/* loading y value */
	movw r26, r28
	add r26, AS_SAVE3
	adc r27, r1
	ldi r30, pm_lo8(jump_table)
	ldi r31, pm_hi8(jump_table)
	add r30, FI_SAVE
	adc r31, r1 ; Z points to the correct entry in our jump table
	ld ARG_Y0, X+
	ld ARG_Y1, X+
	ld ARG_Y2, X+
	ld ARG_Y3, X
	ijmp /* jumps to the round function selected by Z */
md5_core_F_exit:
	/* add ACCU to result of f() */
	add r22, ACCU0
	adc r23, ACCU1
	adc r24, ACCU2
	adc r25, ACCU3
	/* rotate left by s: whole bytes first, then the remaining 0..7 bits */
	mov r20, S_SAVE
rotl32:
	cpi r20, 8
	brlo bitrotl
	mov r21, r25
	mov r25, r24
	mov r24, r23
	mov r23, r22
	mov r22, r21
	subi r20, 8
	rjmp rotl32
bitrotl:
	mov r21, r25 ; copy of the top byte feeds the wrap-around bits
bitrotl_loop:
	tst r20
	breq fixrotl
bitrotl_loop2:
	lsl r21
	rol r22
	rol r23
	rol r24
	rol r25
	dec r20
	brne bitrotl_loop2
fixrotl:
	/* add a[(as+1)&3] */
	movw r26, r28
	add r26, AS_SAVE2
	adc r27, r1
	ld r0, X+
	add r22, r0
	ld r0, X+
	adc r23, r0
	ld r0, X+
	adc r24, r0
	ld r0, X
	adc r25, r0
	/* store result */
	movw r26, r28
	add r26, AS_SAVE0
	adc r27, r1
	st X+, r22
	st X+, r23
	st X+, r24
	st X , r25
md5_core_exit:
	pop_range 4, 8
	pop r17
	pop r16
	ret

;###################################################################
/*
void md5_nextBlock(md5_ctx_t *state, void *block){
	uint32_t a[4];
	uint8_t m,n,i=0;

	a[0]=state->a[0];
	a[1]=state->a[1];
	a[2]=state->a[2];
	a[3]=state->a[3];

	/ * round 1 * /
	uint8_t s1t[]={7,12,17,22}; // 1,-1 1,4 2,-1 3,-2
	for(m=0;m<4;++m){
		for(n=0;n<4;++n){
			md5_core(a, &(((uint32_t*)block)[m*4+n]), 4-n, s1t[n],i++,0);
		}
	}
	/ * round 2 * /
	uint8_t s2t[]={5,9,14,20}; // 1,-3 1,1 2,-2 2,4
	for(m=0;m<4;++m){
		for(n=0;n<4;++n){
			md5_core(a, &(((uint32_t*)block)[(1+m*4+n*5)&0xf]), 4-n, s2t[n],i++,1);
		}
	}
	/ * round 3 * /
	uint8_t s3t[]={4,11,16,23}; // 0,4 1,3 2,0 3,-1
	for(m=0;m<4;++m){
		for(n=0;n<4;++n){
			md5_core(a, &(((uint32_t*)block)[(5-m*4+n*3)&0xf]), 4-n, s3t[n],i++,2);
		}
	}
	/ * round 4 * /
	uint8_t s4t[]={6,10,15,21}; // 1,-2 1,2 2,-1 3,-3
	for(m=0;m<4;++m){
		for(n=0;n<4;++n){
			md5_core(a, &(((uint32_t*)block)[(0-m*4+n*7)&0xf]), 4-n, s4t[n],i++,3);
		}
	}
	state->a[0] += a[0];
	state->a[1] += a[1];
	state->a[2] += a[2];
	state->a[3] += a[3];
	state->counter++;
}
*/

/* rotation amounts per round, indexed by n */
shift_table_1: .byte 7,12,17,22
shift_table_2: .byte 5, 9,14,20
shift_table_3: .byte 4,11,16,23
shift_table_4: .byte 6,10,15,21

/*
 * Message word selection for rounds 2..4, indexed by m*4+n.
 * Entries are byte offsets into the block (word index * 4).
 */
index_table_r2:
;(1+m*4+n*5)&0xf:
	.byte 0x04, 0x18, 0x2c, 0x00
	.byte 0x14, 0x28, 0x3c, 0x10
	.byte 0x24, 0x38, 0x0c, 0x20
	.byte 0x34, 0x08, 0x1c, 0x30

index_table_r3:
;(5-m*4+n*3)&0xf:
	.byte 0x14, 0x20, 0x2c, 0x38
	.byte 0x04, 0x10, 0x1c, 0x28
	.byte 0x34, 0x00, 0x0c, 0x18
	.byte 0x24, 0x30, 0x3c, 0x08

index_table_r4:
;(0-m*4+n*7)&0xf:
	.byte 0x00, 0x1c, 0x38, 0x14
	.byte 0x30, 0x0c, 0x28, 0x04
	.byte 0x20, 0x3c, 0x18, 0x34
	.byte 0x10, 0x2c, 0x08, 0x24

APTR_REG = 2 /* r2:r3 pointer to the working copy a[4] on the stack */
BPTR_REG = 4 /* r4:r5 pointer to the message block */
N_REG = 6    /* inner loop index n (0..3) */
M_REG = 7    /* outer loop index m (0..3) */
I_REG = 8    /* step number i (0..63) */

/*
 * void md5_nextBlock(md5_ctx_t *state, void *block)
 *   state : r24:r25
 *   block : r22:r23 (64 bytes)
 * Preserves r2-r17 and r28:r29; the argument and scratch registers
 * r18-r27, r30, r31 and r0 are overwritten.
 * NOTE(review): relies on stack_alloc (avr-asm-macros.S) leaving Z one
 * byte below the allocated area - confirm against the macro.
 */
.global md5_nextBlock
md5_nextBlock:
	stack_alloc 16
	push_range 2, 17
	push r28
	push r29
	push r24
	push r25
	adiw r30, 1 /* Z now points to the beginning of the allocated memory */
	movw r2, r30
	movw r4, r22
	movw r26, r24
	ldi r20, 16
1:
	ld r0, X+
	st Z+, r0
	dec r20
	brne 1b
	/* state now copied to stack memory */
	clr I_REG

	/* Round 1 */
	clr M_REG
	ldi r17, 4 /* outer loop counter */
1:
	clr N_REG
	ldi r16, 4 /* inner loop counter, also as = 4-n */
2:
	movw r24, APTR_REG
	movw r22, BPTR_REG
	mov r0, M_REG
	lsl r0
	lsl r0
	add r0, N_REG
	lsl r0
	lsl r0 /* r0 = (m*4+n)*4 */
	add r22, r0
	adc r23, r1
	mov r21, r16
	ldi r30, lo8(shift_table_1)
	ldi r31, hi8(shift_table_1)
	add r30, N_REG
	adc r31, r1
	lpm r20, Z
	mov r19, I_REG
	ldi r18, 0
	rcall md5_core_asm
	inc I_REG
	inc N_REG
	dec r16
	brne 2b
	inc M_REG
	dec r17
	brne 1b

	/* Round 2 */
	clr M_REG
	ldi r17, 4
1:
	clr N_REG
	ldi r16, 4
2:
	movw r24, APTR_REG
	movw r22, BPTR_REG
	ldi r30, lo8(index_table_r2)
	ldi r31, hi8(index_table_r2)
	mov r0, M_REG
	lsl r0
	lsl r0
	add r0, N_REG
	add r30, r0
	adc r31, r1
	lpm r0, Z
	add r22, r0
	adc r23, r1
	mov r21, r16
	ldi r30, lo8(shift_table_2)
	ldi r31, hi8(shift_table_2)
	add r30, N_REG
	adc r31, r1
	lpm r20, Z
	mov r19, I_REG
	ldi r18, 1
	rcall md5_core_asm
	inc I_REG
	inc N_REG
	dec r16
	brne 2b
	inc M_REG
	dec r17
	brne 1b

	/* Round 3 */
	clr M_REG
	ldi r17, 4
1:
	clr N_REG
	ldi r16, 4
2:
	movw r24, APTR_REG
	movw r22, BPTR_REG
	ldi r30, lo8(index_table_r3)
	ldi r31, hi8(index_table_r3)
	mov r0, M_REG
	lsl r0
	lsl r0
	add r0, N_REG
	add r30, r0
	adc r31, r1
	lpm r0, Z
	add r22, r0
	adc r23, r1
	mov r21, r16
	ldi r30, lo8(shift_table_3)
	ldi r31, hi8(shift_table_3)
	add r30, N_REG
	adc r31, r1
	lpm r20, Z
	mov r19, I_REG
	ldi r18, 2
	rcall md5_core_asm
	inc I_REG
	inc N_REG
	dec r16
	brne 2b
	inc M_REG
	dec r17
	brne 1b

	/* Round 4 */
	clr M_REG
	ldi r17, 4
1:
	clr N_REG
	ldi r16, 4
2:
	movw r24, APTR_REG
	movw r22, BPTR_REG
	ldi r30, lo8(index_table_r4)
	ldi r31, hi8(index_table_r4)
	mov r0, M_REG
	lsl r0
	lsl r0
	add r0, N_REG
	add r30, r0
	adc r31, r1
	lpm r0, Z
	add r22, r0
	adc r23, r1
	mov r21, r16
	ldi r30, lo8(shift_table_4)
	ldi r31, hi8(shift_table_4)
	add r30, N_REG
	adc r31, r1
	lpm r20, Z
	mov r19, I_REG
	ldi r18, 3
	rcall md5_core_asm
	inc I_REG
	inc N_REG
	dec r16
	brne 2b
	inc M_REG
	dec r17
	brne 1b

	pop r27
	pop r26 /* X now points to the context */
	movw r30, APTR_REG
	/* state->a[k] += a[k] for k = 0..3; r2 is free as scratch now that Z holds the pointer */
	ldi r16, 4
1:
	ld r0, X
	ld r2, Z+
	add r0, r2
	st X+, r0
	ld r0, X
	ld r2, Z+
	adc r0, r2
	st X+, r0
	ld r0, X
	ld r2, Z+
	adc r0, r2
	st X+, r0
	ld r0, X
	ld r2, Z+
	adc r0, r2
	st X+, r0
	dec r16
	brne 1b
	/* state->counter++ (32 bit, stop at the first byte that does not wrap to 0) */
	ld r0, X
	inc r0
	st X+, r0
	brne 2f
	ld r0, X
	inc r0
	st X+, r0
	brne 2f
	ld r0, X
	inc r0
	st X+, r0
	brne 2f
	ld r0, X
	inc r0
	st X+, r0
2:
	pop r29
	pop r28
	pop_range 2, 17
	stack_free 16
	ret

;###############################################################################
/*
void md5_lastBlock(md5_ctx_t *state, const void *block, uint16_t length_b){
	uint16_t l;
	uint8_t b[64];
	while (length_b >= 512){
		md5_nextBlock(state, block);
		length_b -= 512;
		block = ((uint8_t*)block) + 512/8;
	}
	memset(b, 0, 64);
	memcpy(b, block, length_b/8);
	/ * insert padding one * /
	l=length_b/8;
	if(length_b%8){
		uint8_t t;
		t = ((uint8_t*)block)[l];
		t |= (0x80>>(length_b%8));
		b[l]=t;
	}else{
		b[l]=0x80;
	}
	/ * insert length value * /
	if(l+sizeof(uint64_t) >= 512/8){
		md5_nextBlock(state, b);
		state->counter--;
		memset(b, 0, 64-8);
	}
	*((uint64_t*)&b[64-sizeof(uint64_t)]) = (state->counter * 512) + length_b;
	md5_nextBlock(state, b);
}
*/
; state_ptr : r24,r25
; block_ptr : r22,r23
; length_b  : r20,r21 (message length in bits)
/*
 * Register use inside md5_lastBlock:
 *   r12:r13 state pointer, r14:r15 block pointer (later: buffer
 *   pointer), r16:r17 remaining length in bits, r18 free bytes left in
 *   the 64-byte stack buffer, T flag = 1 if the length field still
 *   fits into the current block.
 * NOTE(review): relies on stack_alloc_large (avr-asm-macros.S) leaving
 * Z one byte below the allocated area - confirm against the macro.
 */
.global md5_lastBlock
md5_lastBlock:
	stack_alloc_large 64
	push_range 12, 17
	push r30
	push r31 /* md5_nextBlock overwrites Z, keep the buffer address */
	movw r16, r20 /* length_b */
	movw r14, r22 /* block_ptr */
	movw r12, r24 /* state_ptr */
	/* consume all complete 512-bit blocks first */
2:
	cpi r17, 2 /* hi8(512) */
	brlo 2f
1:
	movw r24, r12
	movw r22, r14
	rcall md5_nextBlock
	/* r18 does not survive md5_nextBlock, so the block size has to be
	 * loaded after the call and not once in front of the loop */
	ldi r18, 64
	add r14, r18
	adc r15, r1
	subi r17, 2
	rjmp 2b
2:
	pop r31
	pop r30
	adiw r30, 1 /* adjust Z to point to buffer */
	movw r26, r14
	movw r24, r16
	adiw r24, 7
	lsr r25
	ror r24
	lsr r25
	ror r24
	lsr r24 /* r24 = (length_b+7)/8, number of bytes to copy */
	ldi r18, 64
	sub r18, r24 /* r18 holds the number of bytes still free in the buffer */
	tst r24
4:
	breq 5f
	ld r0, X+
	st Z+, r0
	dec r24
	rjmp 4b
	/* Z points to the byte after msg in buffer, r0 holds the last byte copied */
5:
	/* append 1-bit */
	mov r20, r16
	ldi r19, 0x80
	andi r20, 0x07
	brne pad_partial_byte
	/* message ends on a byte boundary: the padding bit gets its own byte */
	st Z+, r19
	dec r18 /* account for the extra byte in the buffer */
	rjmp pad_bit_done
pad_partial_byte:
	/* message ends inside a byte: set bit (0x80 >> (length_b%8)) in the last copied byte */
1:
	lsr r19
	dec r20
	brne 1b
	or r0, r19
	st -Z, r0
	adiw r30, 1
pad_bit_done:
	clt
	cpi r18, 8
	brmi 2f
	set /* T = 1: the 8-byte length field fits into this block */
2:
	tst r18
	breq 2f
1:
	/* fill remaining buffer with zeros */
	st Z+, r1
	dec r18
	brne 1b
2:
	sbiw r30, 63
	sbiw r30, 1 /* Z -= 64 in two steps, sbiw takes at most 63 */
	movw r14, r30 /* r14:r15 now points to buffer */
	brts load_counter
	/* counter does not fit, finalize this block */
	movw r24, r12
	movw r22, r14
	rcall md5_nextBlock
	movw r30, r14
	ldi r20, 64-8
3:
	st Z+, r1
	dec r20
	brne 3b
load_counter:
	movw r26, r12 /* X points to state */
	adiw r26, 16
	ld r19, X+
	ld r20, X+
	ld r21, X+
	ld r22, X+
	brts post_counter_decrement /* do not decrement because counter fits */
counter_decrement:
	/* the extra padding block above was counted, take it out again */
	subi r19, 1
	sbci r20, 0
	sbci r21, 0
	sbci r22, 0
post_counter_decrement:
	/* total bits = counter*512 + length_b; counter*2 is built one byte up (r19..r23) */
	clr r18
	clr r23
	lsl r19
	rol r20
	rol r21
	rol r22
	rol r23
	mov r18, r16 /* r16:r17 length_b */
	add r19, r17
	adc r20, r1
	adc r21, r1
	adc r22, r1
	adc r23, r1
	movw r30, r14
	adiw r30, 64-8
	st Z+, r18
	st Z+, r19
	st Z+, r20
	st Z+, r21
	st Z+, r22
	st Z+, r23
	st Z+, r1
	st Z, r1
	sbiw r30, 63 /* back to the start of the buffer */
;	sbiw r30, 1
	movw r24, r12
	movw r22, r30
	rcall md5_nextBlock
md5_lastBlock_exit:
	pop_range 12, 17
	stack_free_large 64
	ret

;###############################################################################
/*
 * md5_ctx2hash: copies 16 bytes from the address in r22:r23 to the
 * address in r24:r25.
 * modifies: r0, r22, X(r26,r27), Z(r30,r31)
 */
.global md5_ctx2hash
md5_ctx2hash:
	movw r26, r24
	movw r30, r22
	ldi r22, 16
1:
	ld r0, Z+
	st X+, r0
	dec r22
	brne 1b
	ret
;###############################################################################
/*
 * md5: hash a complete message in one call.
 *
 * in:   r24:r25  destination for the 16-byte digest
 *       r22:r23  message pointer
 *       r18-r21  message length in bits (32 bit, r18 = lowest byte)
 *
 * A 20-byte context is kept on the stack. Whole 64-byte blocks are fed
 * to md5_nextBlock for as long as the upper 16 bits of the length are
 * non-zero; whatever remains then fits into 16 bits and is handed to
 * md5_lastBlock.
 *
 * Register use across the calls:
 *   r8:r9    context pointer
 *   r10:r11  destination pointer
 *   r12:r13  current message pointer
 *   r14-r17  remaining length in bits
 */
.global md5
md5:
	stack_alloc 20
	push_range 8, 17
	adiw r30, 1              /* Z = first byte of the context */
	movw r8, r30             /* ctx */
	movw r16, r20            /* length (high) */
	movw r14, r18            /* length (low) */
	movw r12, r22            /* msg */
	movw r10, r24            /* dest */
	movw r24, r8
	rcall md5_init
md5_full_blocks:
	/* leave the loop once the length fits into 16 bits */
	mov r0, r16
	or r0, r17
	breq md5_tail
	movw r22, r12
	movw r24, r8
	rcall md5_nextBlock
	/* msg += 64 bytes */
	ldi r22, 64
	add r12, r22
	adc r13, r1
	/* length -= 512 bits (512 = 2 << 8, so start at the second byte) */
	subi r15, 2
	sbci r16, 0
	sbci r17, 0
	rjmp md5_full_blocks
md5_tail:
	movw r20, r14
	movw r22, r12
	movw r24, r8
	rcall md5_lastBlock
	/* copy the digest out of the context */
	movw r22, r8
	movw r24, r10
	rcall md5_ctx2hash
	pop_range 8, 17
	stack_free 20
	ret