/* md5-asm.S */
/*
    This file is part of the AVR-Crypto-Lib.
    Copyright (C) 2008  Daniel Otte (daniel.otte@rub.de)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
/*
 * Author:  Daniel Otte
 * License: GPLv3 or later
 * Date:    2008-11-15
*/


#include "avr-asm-macros.S"

;###########################################################	
; T table: the 64 additive 32-bit step constants of MD5 (this is not an S-box).
; md5_core_asm reads entry i with lpm from T_table + 4*i.
; Every constant is written as two 16-bit halfwords, low halfword first,
; so the pair 0xa478, 0xd76a is the 32-bit value 0xd76aa478.

T_table:
.hword	0xa478, 0xd76a, 0xb756, 0xe8c7, 0x70db, 0x2420, 0xceee, 0xc1bd, 0x0faf, 0xf57c 
.hword	0xc62a, 0x4787, 0x4613, 0xa830, 0x9501, 0xfd46, 0x98d8, 0x6980, 0xf7af, 0x8b44 
.hword	0x5bb1, 0xffff, 0xd7be, 0x895c, 0x1122, 0x6b90, 0x7193, 0xfd98, 0x438e, 0xa679 
.hword	0x0821, 0x49b4, 0x2562, 0xf61e, 0xb340, 0xc040, 0x5a51, 0x265e, 0xc7aa, 0xe9b6 
.hword	0x105d, 0xd62f, 0x1453, 0x0244, 0xe681, 0xd8a1, 0xfbc8, 0xe7d3, 0xcde6, 0x21e1 
.hword	0x07d6, 0xc337, 0x0d87, 0xf4d5, 0x14ed, 0x455a, 0xe905, 0xa9e3, 0xa3f8, 0xfcef 
.hword	0x02d9, 0x676f, 0x4c8a, 0x8d2a, 0x3942, 0xfffa, 0xf681, 0x8771, 0x6122, 0x6d9d 
.hword	0x380c, 0xfde5, 0xea44, 0xa4be, 0xcfa9, 0x4bde, 0x4b60, 0xf6bb, 0xbc70, 0xbebf 
.hword	0x7ec6, 0x289b, 0x27fa, 0xeaa1, 0x3085, 0xd4ef, 0x1d05, 0x0488, 0xd039, 0xd9d4 
.hword	0x99e5, 0xe6db, 0x7cf8, 0x1fa2, 0x5665, 0xc4ac, 0x2244, 0xf429, 0xff97, 0x432a 
.hword	0x23a7, 0xab94, 0xa039, 0xfc93, 0x59c3, 0x655b, 0xcc92, 0x8f0c, 0xf47d, 0xffef 
.hword	0x5dd1, 0x8584, 0x7e4f, 0x6fa8, 0xe6e0, 0xfe2c, 0x4314, 0xa301, 0x11a1, 0x4e08 
.hword	0x7e82, 0xf753, 0xf235, 0xbd3a, 0xd2bb, 0x2ad7, 0xd391, 0xeb86


/* Selects the unrolled md5_init variant below. Leave this undefined to build
 * the smaller, table-driven variant instead. */
#define MD5_init_fast

.global md5_init 
#ifndef MD5_init_fast
;###########################################################	
;void md5_init(md5_ctx_t *state)
; Table-driven variant: copies the 20 bytes of md5_init_vector (four 32-bit
; chaining values followed by a zero 32-bit block counter) from flash
; into *state.
; param1: (r24,r25) 16-bit pointer to md5_ctx_t struct in ram
; modifies: r0, r24, X(r26,r27), Z(r30,r31)
; size = 9 code words + 10 table words = 19 WORDS = 38 Bytes
md5_init:
	movw r26, r24 ; (24,25) --> (26,27) load X with param1
	ldi r30, lo8(md5_init_vector)
	ldi r31, hi8(md5_init_vector)
	ldi r24, 16+4 ; 16 state bytes + 4 counter bytes
md5_init_vloop:	
	lpm r0, Z+ 
	st X+, r0
	dec r24
	brne md5_init_vloop
	ret
	
; Initial chaining values 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476,
; each written as two halfwords (low halfword first), then the zero counter.
md5_init_vector:
.hword 0x2301, 0x6745
.hword 0xAB89, 0xEFCD 
.hword 0xDCFE, 0x98BA 
.hword 0x5476, 0x1032 
.hword 0x0000, 0x0000

#else
;###########################################################	
.global md5_init_fast 
;void md5_init(md5_ctx_t *state)
; Unrolled variant: stores the four initial 32-bit chaining values byte by
; byte (least significant byte first), then four zero bytes for the block
; counter that md5_nextBlock and md5_lastBlock access at offset 16.
; The zero bytes are taken from r1, so r1 must hold zero on entry.
; param1: (r24,r25) 16-bit pointer to md5_ctx_t struct in ram
; modifies: r24, X(r26,r27)
; cycles = 1+16*3+4*2+4 = 1+48+12 = 61
; size = 1+16*2+4+1 WORDS = 38 WORDS = 76 Bytes
md5_init:
md5_init_fast:
	movw r26, r24 ; X = state
	; a[0] = 0x67452301
	ldi r24, 0x01
	st X+, r24
	ldi r24, 0x23
	st X+, r24
	ldi r24, 0x45
	st X+, r24
	ldi r24, 0x67
	st X+, r24
	; a[1] = 0xEFCDAB89
	ldi r24, 0x89
	st X+, r24
	ldi r24, 0xAB
	st X+, r24
	ldi r24, 0xCD
	st X+, r24
	ldi r24, 0xEF
	st X+, r24
	; a[2] = 0x98BADCFE
	ldi r24, 0xFE
	st X+, r24
	ldi r24, 0xDC
	st X+, r24
	ldi r24, 0xBA
	st X+, r24
	ldi r24, 0x98
	st X+, r24
	; a[3] = 0x10325476
	ldi r24, 0x76
	st X+, r24
	ldi r24, 0x54
	st X+, r24
	ldi r24, 0x32
	st X+, r24
	ldi r24, 0x10
	st X+, r24
	; counter = 0
	st X+, r1
	st X+, r1
	st X+, r1
	st X+, r1
	ret
#endif
;###########################################################	

/*
 * MD5 round 1 function. C reference:
 *   static uint32_t md5_F(uint32_t x, uint32_t y, uint32_t z){
 *       return ((x&y)|((~x)&z));
 *   }
 */
; in:   x = r22-r25, y = r18-r21, z = r14-r17
; out:  r22-r25 = F(x,y,z); r18-r21 are overwritten with x&y
; Not a subroutine: reached through jump_table with ijmp, left with a jump
; to md5_core_F_exit. Each byte lane is independent, so the four lanes are
; handled one after another.
md5_F:
	; lane 0
	and r18, r22
	com r22
	and r22, r14
	or  r22, r18
	; lane 1
	and r19, r23
	com r23
	and r23, r15
	or  r23, r19
	; lane 2
	and r20, r24
	com r24
	and r24, r16
	or  r24, r20
	; lane 3
	and r21, r25
	com r25
	and r25, r17
	or  r25, r21
	rjmp md5_core_F_exit
	
/*
 * MD5 round 2 function. C reference:
 *   static uint32_t md5_G(uint32_t x, uint32_t y, uint32_t z){
 *       return ((x&z)|((~z)&y));
 *   }
 */
; in:   x = r22-r25, y = r18-r21, z = r14-r17
; out:  r22-r25 = G(x,y,z); r14-r17 are left complemented and
;       r18-r21 are overwritten with (~z)&y
; Not a subroutine: reached through jump_table with ijmp, left with a jump
; to md5_core_F_exit. The four byte lanes are handled one after another.
md5_G:
	; lane 0
	and r22, r14
	com r14
	and r18, r14
	or  r22, r18
	; lane 1
	and r23, r15
	com r15
	and r19, r15
	or  r23, r19
	; lane 2
	and r24, r16
	com r16
	and r20, r16
	or  r24, r20
	; lane 3
	and r25, r17
	com r17
	and r21, r17
	or  r25, r21
	rjmp md5_core_F_exit
/*
 * MD5 round 3 function. C reference:
 *   static uint32_t md5_H(uint32_t x, uint32_t y, uint32_t z){
 *       return (x^y^z);
 *   }
 */
; in:   x = r22-r25, y = r18-r21, z = r14-r17
; out:  r22-r25 = H(x,y,z); y and z are left unchanged
; Not a subroutine: reached through jump_table with ijmp, left with a jump
; to md5_core_F_exit.
md5_H:
	; x ^= y
	eor r22, r18
	eor r23, r19
	eor r24, r20
	eor r25, r21
	; x ^= z
	eor r22, r14
	eor r23, r15
	eor r24, r16
	eor r25, r17
	rjmp md5_core_F_exit
/*
 * MD5 round 4 function. C reference:
 *   static uint32_t md5_I(uint32_t x, uint32_t y, uint32_t z){
 *       return (y ^ (x | (~z)));
 *   }
 */

; Dispatch table used by md5_core_asm: ijmp goes to jump_table + fi (word
; address). Entries 0..2 are one-word rjmp instructions. Entry 3 is the first
; instruction of md5_I itself, so md5_I has to stay directly behind the
; third rjmp and no instruction may be inserted in between.
jump_table:
	rjmp md5_F
	rjmp md5_G
	rjmp md5_H

; in:   x = r22-r25, y = r18-r21, z = r14-r17
; out:  r22-r25 = I(x,y,z); r14-r17 are left complemented
; Left with a jump to md5_core_F_exit, like the other round functions.
md5_I:
	; lane 0
	com r14
	or  r22, r14
	eor r22, r18
	; lane 1
	com r15
	or  r23, r15
	eor r23, r19
	; lane 2
	com r16
	or  r24, r16
	eor r24, r20
	; lane 3
	com r17
	or  r25, r17
	eor r25, r21
	rjmp md5_core_F_exit

; Operand offset table for md5_core_asm, one 4-byte row per value of as.
; md5_core_asm selects the row with ((as-1)&3)*4 and reads it with lpm.
; Every entry is a byte offset into the four 32-bit words a[0..3]:
;   AS_SAVE0 = offset of a[as&3]      (word that is read and then updated)
;   AS_SAVE1 = offset of a[(as+3)&3]  (loaded as z)
;   AS_SAVE2 = offset of a[(as+1)&3]  (loaded as x, added again after rotate)
;   AS_SAVE3 = offset of a[(as+2)&3]  (loaded as y)
as_table:
;     (as+0)&3  (as+3)&3  (as+1)&3  (as+2)&3
;                  Z         X         Y
;     AS_SAVE0  AS_SAVE1  AS_SAVE2  AS_SAVE3 
.byte   1*4,      0*4,      2*4,      3*4    ;as=1
.byte   2*4,      1*4,      3*4,      0*4    ;as=2
.byte   3*4,      2*4,      0*4,      1*4    ;as=3
.byte   0*4,      3*4,      1*4,      2*4    ;as=4

;###########################################################	
.global md5_core
; void md5_core(uint32_t *a, void *block, uint8_t as, uint8_t s,
;               uint8_t i, uint8_t fi)
; C-callable entry point for a single MD5 step.
; in (avr-gcc argument registers):
;   a = r24:r25, block = r22:r23, as = r20, s = r18, i = r16, fi = r14
; The byte arguments are repacked into the layout md5_core_asm expects
; (as = r21, s = r20, i = r19, fi = r18) before it is called.
; md5_core_asm only restores r4-r8, r16 and r17, but it also overwrites
; r9-r15 and Y (r28:r29). The avr-gcc ABI treats r2-r17 and r28:r29 as
; call-saved, so they are saved and restored here for C callers, while
; the internal callers of md5_core_asm keep the cheaper direct entry.
md5_core:
	push_range 9, 15
	push r28
	push r29
	; the order matters: every source register is read before it is overwritten
	mov r21, r20
	mov r20, r18
	mov r19, r16
	mov r18, r14
	rcall md5_core_asm
	pop r29
	pop r28
	pop_range 9, 15
	ret
/*
void md5_core(uint32_t *a, void *block, uint8_t as, uint8_t s, uint8_t i, uint8_t fi){
	uint32_t t;
	md5_func_t *funcs[]={md5_F, md5_G, md5_H, md5_I};
	as &= 0x3;
	/ * a = b + ((a + F(b,c,d) + X[k] + T[i]) <<< s). * /
	t = a[as] + funcs[fi](a[(as+1)&3], a[(as+2)&3], a[(as+3)&3]) + *((uint32_t*)block) + md5_T[i] ;
	a[as]=a[(as+1)&3] + ROTL32(t, s);
}
*/
; Register interface of md5_core_asm (register numbers):
; a:     r24-r25
; block: r22-r23
; as:    r21
; s:     r20
; i:     r19
; fi:    r18
P_A0 = 24
P_A1 = 25
P_B0 = 22
P_B1 = 23
P_AS = 21
P_S  = 20
P_I  = 19
P_FI = 18

; Operand registers handed to md5_F / md5_G / md5_H / md5_I:
; x: r22-r25
; y: r18-r21
; z: r14-r17


; Working registers inside md5_core_asm.
; AS_SAVE0..3 hold the four byte offsets read from as_table.
AS_SAVE0  =  4
AS_SAVE1  =  5
AS_SAVE2  =  6
AS_SAVE3  =  7
; copies of fi and s, kept while r18-r21 are reused for the y operand
FI_SAVE   =  8
S_SAVE    =  9
; 32-bit accumulator: T[i] + *block + a[as&3]
ACCU0     = 10
ACCU1     = 11
ACCU2     = 12
ACCU3     = 13
; x operand, later the result of the round function
ARG_X0    = 22
ARG_X1    = 23
ARG_X2    = 24
ARG_X3    = 25
; y operand
ARG_Y0    = 18
ARG_Y1    = 19
ARG_Y2    = 20
ARG_Y3    = 21
; z operand
ARG_Z0    = 14
ARG_Z1    = 15
ARG_Z2    = 16
ARG_Z3    = 17


; md5_core_asm: performs one MD5 step on the four-word state a[0..3]:
;   a[as&3] = a[(as+1)&3] + ROTL32(a[as&3] + f(x,y,z) + *block + T[i], s)
; with x = a[(as+1)&3], y = a[(as+2)&3], z = a[(as+3)&3] and f selected by fi
; (0 = md5_F, 1 = md5_G, 2 = md5_H, 3 = md5_I).
; in:  r24:r25 = a, r22:r23 = block, r21 = as, r20 = s, r19 = i, r18 = fi
;      r1 must be zero on entry (it is zero again on exit)
; saved and restored here: r4-r8, r16, r17
; overwritten and NOT restored: r0, r9-r15, r18-r27, Y (r28:r29), Z (r30:r31)
md5_core_asm:
	push r16
	push r17
	push_range 4, 8
	ldi r30, lo8(T_table)
	ldi r31, hi8(T_table)
	/* Z = T_table + 4*i; bits shifted out of P_I are collected in r1 */
	lsl P_I
	rol r1
	lsl P_I
	rol r1
	add r30, P_I
	adc r31, r1
	clr r1            ; r1 is the zero register again
	mov FI_SAVE, r18  ; keep fi, r18 is reused for the y operand below
	/* loading T[i] into ACCU */	
	lpm ACCU0, Z+	
	lpm ACCU1, Z+	
	lpm ACCU2, Z+	
	lpm ACCU3, Z
	/* add *block to ACCU */
	movw r30, P_B0
	ld r0, Z+
	add ACCU0, r0
	ld r0, Z+
	adc ACCU1, r0
	ld r0, Z+
	adc ACCU2, r0
	ld r0, Z+
	adc ACCU3, r0
	/* add a[as+0&3] to ACCU */
	ldi r30, lo8(as_table)
	ldi r31, hi8(as_table)
	/* row index = (as-1)&3, four bytes per row */
	dec P_AS
	andi P_AS, 0x03
	lsl P_AS
	lsl P_AS
	add r30, r21
	adc r31, r1       ; Z points to the correct row in as_table
	lpm AS_SAVE0, Z+
	lpm AS_SAVE1, Z+
	lpm AS_SAVE2, Z+
	lpm AS_SAVE3, Z
	movw r26, r24     ; X points to a[0]
	add r26, AS_SAVE0
	adc r27, r1       ; X points at a[as&3]
	ld r0, X+
	add ACCU0, r0
	ld r0, X+
	adc ACCU1, r0
	ld r0, X+
	adc ACCU2, r0
	ld r0, X+
	adc ACCU3, r0
	mov S_SAVE, r20   ; keep s, r20 is reused for the y operand below

	movw r28, r24     ; Y keeps the base address of a[] for the rest of the step
	/* loading z value */
	movw r26, r28
	add r26, AS_SAVE1
	adc r27, r1
	ld ARG_Z0, X+
	ld ARG_Z1, X+
	ld ARG_Z2, X+
	ld ARG_Z3, X

	/* loading x value */
	movw r26, r28	
	add r26, AS_SAVE2
	adc r27, r1
	ld ARG_X0, X+
	ld ARG_X1, X+
	ld ARG_X2, X+
	ld ARG_X3, X

	/* loading y value */
	movw r26, r28
	add r26, AS_SAVE3
	adc r27, r1
	ldi r30, pm_lo8(jump_table)
	ldi r31, pm_hi8(jump_table)
	add r30, FI_SAVE
	adc r31, r1    ; Z points to the correct entry in our jump table
	ld ARG_Y0, X+
	ld ARG_Y1, X+
	ld ARG_Y2, X+
	ld ARG_Y3, X

	ijmp /* calls the function pointed by Z */
	/* the round functions do not return, they jump to md5_core_F_exit */
md5_core_F_exit:		

	/* add ACCU to result of f() */
	add r22, ACCU0
	adc r23, ACCU1
	adc r24, ACCU2
	adc r25, ACCU3

	/* rotate */
	mov r20, S_SAVE
	/* rotate left by whole bytes while at least 8 bit positions remain */
rotl32:
	cpi r20, 8
	brlo bitrotl
	mov r21, r25
	mov r25, r24
	mov r24, r23
	mov r23, r22
	mov r22, r21
	subi r20, 8
	rjmp rotl32
	/* remaining 0..7 positions: r21 is a copy of the top byte, so the bits
	   shifted out of it enter r22 at the bottom */
bitrotl:
	mov r21, r25
bitrotl_loop:	
	tst r20
	breq fixrotl
bitrotl_loop2:	
	lsl r21
	rol r22
	rol r23
	rol r24
	rol r25
	dec r20
	brne bitrotl_loop2
fixrotl:

	/* add a[(as+1)&3]  */
	movw r26, r28
	add r26, AS_SAVE2
	adc r27, r1
	ld r0, X+
	add r22, r0
	ld r0, X+
	adc r23, r0
	ld r0, X+
	adc r24, r0
	ld r0, X
	adc r25, r0

	/* store result */
	movw r26, r28
	add r26, AS_SAVE0
	adc r27, r1
	st X+, r22
	st X+, r23
	st X+, r24
	st X , r25	
md5_core_exit:
	pop_range 4, 8
	pop r17
	pop r16
	ret

;###################################################################
/*
void md5_nextBlock(md5_ctx_t *state, void *block){
	uint32_t	a[4];
	uint8_t		m,n,i=0;

	a[0]=state->a[0];
	a[1]=state->a[1];
	a[2]=state->a[2];
	a[3]=state->a[3];
	
	/ * round 1 * /
	uint8_t s1t[]={7,12,17,22}; // 1,-1   1,4   2,-1   3,-2
	for(m=0;m<4;++m){
		for(n=0;n<4;++n){
			md5_core(a, &(((uint32_t*)block)[m*4+n]), 4-n, s1t[n],i++,0);
		}
	}
	/ * round 2 * /
	uint8_t s2t[]={5,9,14,20}; // 1,-3   1,1   2,-2   2,4
	for(m=0;m<4;++m){
		for(n=0;n<4;++n){
			md5_core(a, &(((uint32_t*)block)[(1+m*4+n*5)&0xf]), 4-n, s2t[n],i++,1);
		}
	}
	/ * round 3 * /
	uint8_t s3t[]={4,11,16,23}; // 0,4   1,3   2,0   3,-1
	for(m=0;m<4;++m){
		for(n=0;n<4;++n){
			md5_core(a, &(((uint32_t*)block)[(5-m*4+n*3)&0xf]), 4-n, s3t[n],i++,2);
		}
	}
	/ * round 4 * /
	uint8_t s4t[]={6,10,15,21}; // 1,-2   1,2   2,-1   3,-3
	for(m=0;m<4;++m){
		for(n=0;n<4;++n){
			md5_core(a, &(((uint32_t*)block)[(0-m*4+n*7)&0xf]), 4-n, s4t[n],i++,3);
		}
	}
	state->a[0] += a[0];
	state->a[1] += a[1];
	state->a[2] += a[2];
	state->a[3] += a[3];
	state->counter++;
}
*/

; Rotate amounts per round, indexed by n (0..3). md5_nextBlock reads them
; with lpm and passes them as s.
shift_table_1:  .byte  7,12,17,22
shift_table_2:  .byte  5, 9,14,20
shift_table_3:  .byte  4,11,16,23
shift_table_4:  .byte  6,10,15,21

; Message word schedule for rounds 2 to 4, indexed by m*4+n.
; Every entry is already a byte offset into the 64-byte block, that is
; 4 * (word index given by the formula in the comment).
index_table_r2:
;(1+m*4+n*5)&0xf:
        .byte 0x04, 0x18, 0x2c, 0x00 
        .byte 0x14, 0x28, 0x3c, 0x10 
        .byte 0x24, 0x38, 0x0c, 0x20 
        .byte 0x34, 0x08, 0x1c, 0x30 

index_table_r3:
;(5-m*4+n*3)&0xf:
        .byte 0x14, 0x20, 0x2c, 0x38 
        .byte 0x04, 0x10, 0x1c, 0x28 
        .byte 0x34, 0x00, 0x0c, 0x18 
        .byte 0x24, 0x30, 0x3c, 0x08 

index_table_r4:
;(0-m*4+n*7)&0xf:
        .byte 0x00, 0x1c, 0x38, 0x14 
        .byte 0x30, 0x0c, 0x28, 0x04 
        .byte 0x20, 0x3c, 0x18, 0x34 
        .byte 0x10, 0x2c, 0x08, 0x24

/*
 * void md5_nextBlock(md5_ctx_t *state, void *block)
 * Processes one 64-byte block: copies the four state words to a 16-byte
 * work area on the stack, runs the 4 x 16 steps through md5_core_asm, adds
 * the work area back onto the state and increments the 32-bit block counter
 * stored behind the state words.
 *   in: r24:r25 = state, r22:r23 = block; r1 must be zero
 *   r2-r17 and r28:r29 are saved on entry and restored on exit.
 *
 * Register roles during the rounds:
 *   APTR_REG (r2:r3) address of the work copy a[0..3]
 *   BPTR_REG (r4:r5) address of the block
 *   N_REG    (r6)    inner index n, counts 0..3
 *   M_REG    (r7)    outer index m, counts 0..3
 *   I_REG    (r8)    step number i, counts 0..63
 *   r16              inner down counter 4..1, also passed as the as argument
 *   r17              outer down counter 4..1
 * r4-r8 are the same registers md5_core_asm uses for AS_SAVE0..3 and FI_SAVE;
 * md5_core_asm pushes and pops r4-r8, r16 and r17, so these values survive
 * the call.
 */
APTR_REG = 2
BPTR_REG = 4
N_REG = 6
M_REG = 7
I_REG = 8
.global md5_nextBlock
md5_nextBlock:
	/* stack_alloc comes from avr-asm-macros.S (not shown here); it is assumed
	   to leave Z one byte below the allocated area, see the adiw below */
	stack_alloc 16
	push_range 2, 17
	push r28
	push r29
	/* state pointer, popped again into X after the four rounds */
	push r24
	push r25
	adiw r30, 1 /* Z now points to the beginning of the allocated memory */
	movw r2, r30
	movw r4, r22
	movw r26, r24
	ldi r20, 16
1:
	ld r0, X+
	st Z+, r0
	dec r20
	brne 1b
	/* state now copied to stack memory */
	clr I_REG	
	/* Round 1 */
	clr M_REG
	ldi r17, 4
1:
	clr N_REG	
	ldi r16, 4
2:
	movw r24, APTR_REG
	movw r22, BPTR_REG 
	/* block offset = (m*4+n)*4: round 1 uses the words in order */
	mov r0, M_REG
	lsl r0
	lsl r0
	add r0, N_REG
	lsl r0
	lsl r0
	add r22, r0
	adc r23, r1
	mov r21, r16	
	ldi r30, lo8(shift_table_1)
	ldi r31, hi8(shift_table_1)
	add r30, N_REG
	adc r31, r1
	lpm r20, Z
	mov r19, I_REG
	ldi r18, 0 /* fi = 0: md5_F */
	rcall md5_core_asm
	inc I_REG
	inc N_REG
	dec r16
	brne 2b
	inc M_REG
	dec r17
	brne 1b
	
	/* Round 2 */
	clr M_REG
	ldi r17, 4
1:
	clr N_REG	
	ldi r16, 4
2:
	movw r24, APTR_REG
	movw r22, BPTR_REG 
	/* block offset = index_table_r2[m*4+n] */
	ldi r30, lo8(index_table_r2)
	ldi r31, hi8(index_table_r2)
	mov r0, M_REG
	lsl r0
	lsl r0
	add r0, N_REG
	add r30, r0
	adc r31, r1
	lpm r0, Z	
	add r22, r0
	adc r23, r1
	mov r21, r16	
	ldi r30, lo8(shift_table_2)
	ldi r31, hi8(shift_table_2)
	add r30, N_REG
	adc r31, r1
	lpm r20, Z
	mov r19, I_REG
	ldi r18, 1 /* fi = 1: md5_G */
	rcall md5_core_asm
	inc I_REG
	inc N_REG
	dec r16
	brne 2b
	inc M_REG
	dec r17
	brne 1b

	/* Round 3 */
	clr M_REG
	ldi r17, 4
1:
	clr N_REG	
	ldi r16, 4
2:
	movw r24, APTR_REG
	movw r22, BPTR_REG 
	/* block offset = index_table_r3[m*4+n] */
	ldi r30, lo8(index_table_r3)
	ldi r31, hi8(index_table_r3)
	mov r0, M_REG
	lsl r0
	lsl r0
	add r0, N_REG
	add r30, r0
	adc r31, r1
	lpm r0, Z	
	add r22, r0
	adc r23, r1
	mov r21, r16	
	ldi r30, lo8(shift_table_3)
	ldi r31, hi8(shift_table_3)
	add r30, N_REG
	adc r31, r1
	lpm r20, Z
	mov r19, I_REG
	ldi r18, 2 /* fi = 2: md5_H */
	rcall md5_core_asm
	inc I_REG
	inc N_REG
	dec r16
	brne 2b
	inc M_REG
	dec r17
	brne 1b

	/* Round 4 */
	clr M_REG
	ldi r17, 4
1:
	clr N_REG	
	ldi r16, 4
2:
	movw r24, APTR_REG
	movw r22, BPTR_REG 
	/* block offset = index_table_r4[m*4+n] */
	ldi r30, lo8(index_table_r4)
	ldi r31, hi8(index_table_r4)
	mov r0, M_REG
	lsl r0
	lsl r0
	add r0, N_REG
	add r30, r0
	adc r31, r1
	lpm r0, Z	
	add r22, r0
	adc r23, r1
	mov r21, r16	
	ldi r30, lo8(shift_table_4)
	ldi r31, hi8(shift_table_4)
	add r30, N_REG
	adc r31, r1
	lpm r20, Z
	mov r19, I_REG
	ldi r18, 3 /* fi = 3: md5_I */
	rcall md5_core_asm
	inc I_REG
	inc N_REG
	dec r16
	brne 2b
	inc M_REG
	dec r17
	brne 1b


	/* pops the state pointer pushed as r24/r25 at entry */
	pop r27
	pop r26 /* X now points to the context */
	movw r30, APTR_REG
	/* state->a[k] += a[k] for k = 0..3; r2 is free as scratch because
	   APTR_REG has just been copied to Z */
	ldi r16, 4
1:
	ld r0, X
	ld r2, Z+
	add r0, r2  /* add (not adc): every 32-bit word starts a new carry chain */
	st X+, r0	
	ld r0, X
	ld r2, Z+
	adc r0, r2
	st X+, r0	
	ld r0, X
	ld r2, Z+
	adc r0, r2
	st X+, r0	
	ld r0, X
	ld r2, Z+
	adc r0, r2
	st X+, r0	
	dec r16
	brne 1b

	/* state->counter++ : X now points behind the four state words; the
	   32-bit counter is incremented byte by byte and the chain stops at the
	   first byte that does not wrap to zero (st leaves the flags untouched) */
	ld r0, X
	inc r0
	st X+, r0	
	brne 2f
	ld r0, X
	inc r0
	st X+, r0	
	brne 2f
	ld r0, X
	inc r0
	st X+, r0	
	brne 2f	
	ld r0, X
	inc r0
	st X+, r0	
2:			

	pop r29
	pop r28
	pop_range 2, 17
	stack_free 16
	ret

;###############################################################################
/*
void md5_lastBlock(md5_ctx_t *state, const void *block, uint16_t length_b){
	uint16_t l;
	uint8_t b[64];
	while (length_b >= 512){
		md5_nextBlock(state, block);
		length_b -= 512;
		block = ((uint8_t*)block) + 512/8;
	}
	memset(b, 0, 64);
	memcpy(b, block, length_b/8);
	/ * insert padding one * /
	l=length_b/8;
	if(length_b%8){
		uint8_t t;
		t = ((uint8_t*)block)[l];
		t |= (0x80>>(length_b%8));
		b[l]=t;
	}else{
		b[l]=0x80;
	}
	/ * insert length value * /
	if(l+sizeof(uint64_t) >= 512/8){
		md5_nextBlock(state, b);
		state->counter--;
		memset(b, 0, 64-8);
	}
	*((uint64_t*)&b[64-sizeof(uint64_t)]) = (state->counter * 512) + length_b;
	md5_nextBlock(state, b);
}
*/
; void md5_lastBlock(md5_ctx_t *state, const void *block, uint16_t length_b)
; Hashes the tail of a message: full 512-bit blocks are passed to
; md5_nextBlock first, the remaining bits are copied into a 64-byte stack
; buffer, padded with a single 1 bit and zeros, and the 64-bit message length
; (counter * 512 + remaining bits) is stored in the last eight bytes. If the
; length field does not fit behind the padding, one extra block is processed.
; state_ptr : r24,r25
; block_ptr : r22,r23
; length_b  : r20,r21   (length in bits)
; r1 must be zero on entry. r12-r17 are saved and restored.
; Register roles: r12:r13 = state, r14:r15 = block (later the buffer),
;                 r16:r17 = length_b, T flag = length field fits in this block
.global md5_lastBlock
md5_lastBlock:
	stack_alloc_large 64
	push_range 12, 17
	/* keep the buffer pointer delivered by stack_alloc_large in Z across
	   the md5_nextBlock calls below */
	push r30
	push r31
	movw r16, r20 /* length_b  */
	movw r14, r22 /* block_ptr */
	movw r12, r24 /* state_ptr */
2:
	cpi r17, 2 /* hi8(512) */
	brlo 2f
	movw r24, r12
	movw r22, r14
	rcall md5_nextBlock
	/* r18 is call-clobbered and md5_nextBlock overwrites it, so the block
	   size has to be loaded after the call, not once in front of the loop */
	ldi r18, 64
	add r14, r18  /* block_ptr += 64 */
	adc r15, r1
	subi r17, 2   /* length_b -= 512 */
	rjmp 2b
2:
	pop r31
	pop r30

	adiw r30, 1 /* adjust Z to point to buffer */
	movw r26, r14
	movw r24, r16
	adiw r24, 7

	/* (length_b + 7) / 8; length_b is below 512 here, so the result fits
	   into r24 after the first two 16-bit shifts */
	lsr r25
	ror r24
	lsr r25
	ror r24
	lsr r24 /* r24 now holds how many bytes are to copy */
	ldi r18, 64
	sub r18, r24 /* r18 = number of bytes still free in the buffer */
	tst r24
4:
	breq 5f
	ld r0, X+
	st Z+, r0
	dec r24
	rjmp 4b /* Z points to the byte after msg in buffer */
5:	/* append 1-bit */
	mov r20, r16
	ldi r19, 0x80
	andi r20, 0x07
	brne md5_lastBlock_partial_byte
	/* message ends on a byte boundary: the padding bit gets its own byte */
	st Z+, r19
	dec r18 /* 'allocate' another byte in buffer */
	rjmp md5_lastBlock_bit_appended
md5_lastBlock_partial_byte:
	/* r19 = 0x80 >> (length_b % 8); r0 still holds the last copied byte */
1:
	lsr r19
	dec r20
	brne 1b
	or r0, r19
	st -Z, r0
	adiw r30, 1
md5_lastBlock_bit_appended:
	clt
	cpi r18, 8
	brmi 2f     /* r18 is at most 64, so the N flag means r18 < 8 */
	set         /* store in t if the counter will also fit in this block (1 if fit)*/
2:
	tst r18
	breq 2f
1: /* fill remaining buffer with zeros */
	st Z+, r1
	dec r18
	brne 1b
2:
	/* Z is now at buffer + 64, step back to the start of the buffer */
	sbiw r30, 63
	sbiw r30,  1
	movw r14, r30 /* r14:r15 now points to buffer */
	brts load_counter
	/* counter does not fit, finalize this block */
	movw r24, r12
	movw r22, r14
	rcall md5_nextBlock
	/* clear the first 56 bytes for the extra block; the last 8 bytes are
	   overwritten with the length below */
	movw r30, r14
	ldi r20, 64-8
3:
	st Z+, r1
	dec r20
	brne 3b

load_counter:
	movw r26, r12 /* X points to state */
	adiw r26, 16  /* the block counter follows the four state words */
	ld r19, X+
	ld r20, X+
	ld r21, X+
	ld r22, X+
	/* relies on the T flag still being valid after the md5_nextBlock call
	   in the path above */
	brts post_counter_decrement	/* do not decrement because counter fits */
counter_decrement:
	/* the extra md5_nextBlock call above counted a block that carries
	   padding only */
	subi r19, 1
	sbci r20, 0
	sbci r21, 0
	sbci r22, 0
post_counter_decrement:
	/* r18..r23 = counter * 512: one byte position (r18 = 0) plus one bit */
	clr r18
	clr r23
	lsl r19
	rol r20
	rol r21
	rol r22
	rol r23
	/* add the remaining length_b (below 512) */
	mov r18, r16 /* r16:r17 length_b */
	add r19, r17
	adc r20, r1
	adc r21, r1
	adc r22, r1
	adc r23, r1
	movw r30, r14
	adiw r30, 64-8
	st Z+, r18
	st Z+, r19
	st Z+, r20
	st Z+, r21
	st Z+, r22
	st Z+, r23
	st Z+, r1
	st Z, r1

	sbiw r30, 63 /* Z is at buffer + 63, back to the start of the buffer */
	movw r24, r12
	movw r22, r30
	rcall md5_nextBlock
md5_lastBlock_exit:
	pop_range 12, 17
	stack_free_large 64
	ret


;###############################################################################


.global md5_ctx2hash
; md5_ctx2hash: copies the first 16 bytes of the context (the four 32-bit
; state words) to the destination buffer.
; in:       r24:r25 = destination, r22:r23 = context
; modifies: r0, r22, X(r26,r27), Z(r30,r31)
md5_ctx2hash:
	movw r30, r22  ; Z = context, fetched before r22 becomes the byte counter
	movw r26, r24  ; X = destination
	ldi r22, 16
md5_ctx2hash_copy:
	ld r0, Z+
	st X+, r0
	dec r22
	brne md5_ctx2hash_copy
	ret


;###############################################################################


.global md5
/*
 * md5: hashes a complete message in one call.
 *   in: r24:r25 = dest (receives 16 bytes via md5_ctx2hash)
 *       r22:r23 = msg
 *       r18-r21 = message length in bits, 32 bit, r18 = least significant byte
 * A 20-byte context is allocated on the stack. As long as the upper 16 bits
 * of the length are non-zero, one 64-byte block is hashed with md5_nextBlock
 * and 512 is subtracted; the remaining 16-bit length is then handed to
 * md5_lastBlock.
 * r8-r17 are saved and restored. Register roles:
 *   r8:r9 = ctx, r10:r11 = dest, r12:r13 = msg, r14-r17 = remaining length
 */
md5:
	/* stack_alloc comes from avr-asm-macros.S (not shown here); it is assumed
	   to leave Z one byte below the allocated area, hence the adiw below */
	stack_alloc 20
	push_range  8, 17
	adiw r30, 1
	movw  r8, r30 /* ctx           */
	movw r10, r24 /* dest          */
	movw r12, r22 /* msg           */
	movw r14, r18 /* length (low)  */
	movw r16, r20 /* length (high) */
	movw r24, r30
	rcall md5_init
1:
	/* loop while the upper 16 bits of the length are non-zero */
	tst r16
	brne next_round
	tst r17
	breq last_round
next_round:
	movw r24,  r8
	movw r22, r12
	rcall md5_nextBlock
	/* msg += 64 */
	ldi r22, 64
	add r12, r22
	adc r13, r1
	/* length -= 512: subtract 2 from the second byte and propagate the borrow */
	ldi r22, 2
	sub r15, r22
	sbci r16, 0
	sbci r17, 0
	rjmp 1b
last_round:		
	movw r24, r8
	movw r22, r12
	movw r20, r14 /* remaining length, now below 65536 bits */
	rcall md5_lastBlock
	movw r24, r10
	movw r22,  r8
	rcall md5_ctx2hash
	pop_range  8, 17
	stack_free 20
	ret