/* keccak-asm.S */
/*
    This file is part of the AVR-Crypto-Lib.
    Copyright (C) 2012  Daniel Otte (daniel.otte@rub.de)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
/**
 * \file     keccak-asm.S
 * \email    daniel.otte@rub.de
 * \author   Daniel Otte
 * \date     2012-12-16
 * \license  GPLv3 or later
 *
 */

.nolist
#include "avr-asm-macros.S"
.list

/* register number that is used as the constant-zero register in this file */
.equ __zero_reg__, 1

/*
 * Byte offsets of the members of the hashing context:
 *
 * typedef struct{
 * 	uint64_t a[5][5];   // ctx_a
 * 	uint16_t r;         // ctx_r
 * 	uint8_t  bs;        // ctx_bs
 * } keccak_ctx_t;
 *
 * Each offset is derived from the previous member, so the chain stays
 * consistent when a member size changes.
 */
.equ ctx_a,  0
.equ ctx_r,  ctx_a + 8 * 5 * 5
.equ ctx_bs, ctx_r + 2

	.section .text

/*
 * rho_pi_idx_table
 * 25 byte offsets, one per lane, in the order the lanes are emitted below
 * (row y = 0..4, and inside each row x = 0..4). Entry 5 * y + x is
 *     (((2 * x + 3 * y) % 5) * 5 + y) * 8
 * The rho/pi step of keccak_f1600 reads one entry per lane and adds it to
 * the base address of its scratch state to find where the rotated lane is
 * stored.
 */
	.global rho_pi_idx_table

/* emits the five table entries of row y */
.macro rho_pi_idx_row y
	.irp x, 0, 1, 2, 3, 4
		.byte (((2 * \x + 3 * \y) % 5) * 5 + \y) * 8
	.endr
.endm

rho_pi_idx_table:
	rho_pi_idx_row 0
	rho_pi_idx_row 1
	rho_pi_idx_row 2
	rho_pi_idx_row 3
	rho_pi_idx_row 4

/*
#define ROT_BIT(a) (( (a) <= 4) ? ((a) << 1) : (0x01 | ((8 - (a)) << 1)))
#define ROT_CODE(a) ((((a) / 8 + ((((a) % 8) > 4) ? 1 : 0)) << 4) | ROT_BIT(((a) % 8)))

const uint8_t keccak_rotate_codes[5][5] PROGMEM = {
        { ROT_CODE( 0), ROT_CODE( 1), ROT_CODE(62), ROT_CODE(28), ROT_CODE(27) },
        { ROT_CODE(36), ROT_CODE(44), ROT_CODE( 6), ROT_CODE(55), ROT_CODE(20) },
        { ROT_CODE( 3), ROT_CODE(10), ROT_CODE(43), ROT_CODE(25), ROT_CODE(39) },
        { ROT_CODE(41), ROT_CODE(45), ROT_CODE(15), ROT_CODE(21), ROT_CODE( 8) },
        { ROT_CODE(18), ROT_CODE( 2), ROT_CODE(61), ROT_CODE(56), ROT_CODE(14) }
};
*/

/*
 * rot_code amount
 * Emits one rotation code byte for a left rotation by `amount` bits
 * (0..63), the assembler counterpart of the ROT_CODE() macro:
 *   high nibble  = number of whole-byte left rotations (index into
 *                  byte_rot_jmp_table, only the low three bits are used)
 *   low nibble   = (n << 1) | dir, n single-bit rotations,
 *                  dir = 0 rotate left, dir = 1 rotate right
 * Remainders above four bits are done as one extra byte to the left and
 * (8 - remainder) bits back to the right.
 */
.macro rot_code amount
	.if ((\amount) & 7) <= 4
		.byte (((\amount) >> 3) << 4) | (((\amount) & 7) << 1)
	.else
		.byte ((((\amount) >> 3) + 1) << 4) | 0x01 | ((8 - ((\amount) & 7)) << 1)
	.endif
.endm

/* one code per lane, same order as the lanes are stored in the state */
keccak_rotate_codes:
	.irp amount,  0,  1, 62, 28, 27
		rot_code \amount
	.endr
	.irp amount, 36, 44,  6, 55, 20
		rot_code \amount
	.endr
	.irp amount,  3, 10, 43, 25, 39
		rot_code \amount
	.endr
	.irp amount, 41, 45, 15, 21,  8
		rot_code \amount
	.endr
	.irp amount, 18,  2, 61, 56, 14
		rot_code \amount
	.endr

/*
 * keccak_rc_comp
 * The 24 round constants, each compressed into one byte. The iota step of
 * keccak_f1600 expands a byte c as follows:
 *   bit 6 of c  -> bit 63 of the lane (byte 7, mask 0x80)
 *   bit 5 of c  -> bit 31 of the lane (byte 3, mask 0x80)
 *   bit 4 of c  -> bit 15 of the lane (byte 1, mask 0x80)
 *   c & 0x8f    -> XORed into byte 0 of the lane
 * The comment on each line gives the expanded 64-bit constant.
 */
keccak_rc_comp:
	.byte 0x01	/* round  0: 0x0000000000000001 */
	.byte 0x92	/* round  1: 0x0000000000008082 */
	.byte 0xda	/* round  2: 0x800000000000808A */
	.byte 0x70	/* round  3: 0x8000000080008000 */
	.byte 0x9b	/* round  4: 0x000000000000808B */
	.byte 0x21	/* round  5: 0x0000000080000001 */
	.byte 0xf1	/* round  6: 0x8000000080008081 */
	.byte 0x59	/* round  7: 0x8000000000008009 */
	.byte 0x8a	/* round  8: 0x000000000000008A */
	.byte 0x88	/* round  9: 0x0000000000000088 */
	.byte 0x39	/* round 10: 0x0000000080008009 */
	.byte 0x2a	/* round 11: 0x000000008000000A */
	.byte 0xbb	/* round 12: 0x000000008000808B */
	.byte 0xcb	/* round 13: 0x800000000000008B */
	.byte 0xd9	/* round 14: 0x8000000000008089 */
	.byte 0x53	/* round 15: 0x8000000000008003 */
	.byte 0x52	/* round 16: 0x8000000000008002 */
	.byte 0xc0	/* round 17: 0x8000000000000080 */
	.byte 0x1a	/* round 18: 0x000000000000800A */
	.byte 0x6a	/* round 19: 0x800000008000000A */
	.byte 0xf1	/* round 20: 0x8000000080008081 */
	.byte 0xd0	/* round 21: 0x8000000000008080 */
	.byte 0x21	/* round 22: 0x0000000080000001 */
	.byte 0x78	/* round 23: 0x8000000080008008 */

	/* re-align the location counter before the code that follows */
	.align 2

/*
 * rotate64_1bit_left
 * Rotates the 64-bit value in r25:r18 (r18 = least significant byte) left
 * by one bit. Changes the T flag and the status flags, nothing else.
 */
rotate64_1bit_left:
	bst r25, 7		; T = bit 63, the bit that wraps around
	.irp idx, 18, 19, 20, 21, 22, 23, 24, 25
		rol r\idx	; shift the chain upwards through carry
	.endr
	bld r18, 0		; bit 0 = old bit 63, replaces the stale carry-in
	ret

/*
 * rotate64_1bit_right
 * Rotates the 64-bit value in r25:r18 (r18 = least significant byte) right
 * by one bit. Changes the T flag and the status flags, nothing else.
 */
rotate64_1bit_right:
	bst r18, 0		; T = bit 0, the bit that wraps around
	.irp idx, 25, 24, 23, 22, 21, 20, 19, 18
		ror r\idx	; shift the chain downwards through carry
	.endr
	bld r25, 7		; bit 63 = old bit 0, replaces the stale carry-in
	ret

/*
 * rotate64_1byte_left
 * Rotates the 64-bit value in r25:r18 (r18 = least significant byte) left
 * by 8 bits: every byte moves one register up, the old r25 wraps around
 * into r18. Clobbers r0.
 */
rotate64_1byte_left:
	mov r0, r25 ; park the top byte, it is overwritten first
	mov r25, r24
	mov r24, r23
	mov r23, r22
	mov r22, r21
	mov r21, r20
	mov r20, r19
	mov r19, r18
	mov r18, r0 ; old top byte becomes the new bottom byte
	ret

/*
 * rotate64_2byte_left
 * Rotates the 64-bit value in r25:r18 (r18 = least significant byte) left
 * by 16 bits using register-pair moves.
 * Clobbers r1:r0. r1 is __zero_reg__ in this file, so the caller has to
 * clear it again (keccak_f1600 does so right after its icall).
 */
rotate64_2byte_left:
	movw r0, r24 ; r1:r0 = top word, it wraps around to the bottom
	movw r24, r22
	movw r22, r20
	movw r20, r18
	movw r18, r0
	ret

/*
 * rotate64_3byte_left
 * Rotates the 64-bit value in r25:r18 (r18 = least significant byte) left
 * by 24 bits, i.e. register r(n) receives the old content of the register
 * three positions below it (mod 8). Because 3 and 8 are coprime the whole
 * permutation is a single cycle, walked here with r0 as the only
 * temporary. Clobbers r0.
 */
rotate64_3byte_left:
	mov r0, r25 ; start of the cycle: park old r25
	mov r25, r22
	mov r22, r19
	mov r19, r24
	mov r24, r21
	mov r21, r18
	mov r18, r23
	mov r23, r20
	mov r20, r0 ; end of the cycle: old r25 lands in r20
	ret

/*
 * rotate64_4byte_left
 * Rotates the 64-bit value in r25:r18 (r18 = least significant byte) by
 * 32 bits, which is a plain swap of the upper and lower four bytes.
 * Clobbers r1:r0. r1 is __zero_reg__ in this file, so the caller has to
 * clear it again (keccak_f1600 does so right after its icall).
 */
rotate64_4byte_left:
	movw r0, r24 ; swap r25:r24 with r21:r20
	movw r24, r20
	movw r20, r0
	movw r0, r22 ; swap r23:r22 with r19:r18
	movw r22, r18
	movw r18, r0
	ret

/*
 * rotate64_5byte_left
 * Rotates the 64-bit value in r25:r18 (r18 = least significant byte) left
 * by 40 bits, i.e. register r(n) receives the old content of the register
 * five positions below it (mod 8). The permutation is a single cycle,
 * walked here with r0 as the only temporary. Clobbers r0.
 */
rotate64_5byte_left:
	mov r0, r25 ; start of the cycle: park old r25
	mov r25, r20
	mov r20, r23
	mov r23, r18
	mov r18, r21
	mov r21, r24
	mov r24, r19
	mov r19, r22
	mov r22, r0 ; end of the cycle: old r25 lands in r22
	ret

/*
 * rotate64_6byte_left
 * Rotates the 64-bit value in r25:r18 (r18 = least significant byte) left
 * by 48 bits, implemented as a 16-bit rotation to the right with
 * register-pair moves.
 * Clobbers r1:r0. r1 is __zero_reg__ in this file, so the caller has to
 * clear it again (keccak_f1600 does so right after its icall).
 */
rotate64_6byte_left:
	movw r0, r18 ; r1:r0 = bottom word, it wraps around to the top
	movw r18, r20
	movw r20, r22
	movw r22, r24
	movw r24, r0
	ret

/*
 * rotate64_7byte_left
 * Rotates the 64-bit value in r25:r18 (r18 = least significant byte) left
 * by 56 bits, implemented as an 8-bit rotation to the right: every byte
 * moves one register down, the old r18 wraps around into r25.
 * Clobbers r0.
 * There is deliberately no ret here: execution falls through into the ret
 * that forms the first entry of byte_rot_jmp_table below.
 */
rotate64_7byte_left:
	mov r0, r18 ; park the bottom byte, it is overwritten first
	mov r18, r19
	mov r19, r20
	mov r20, r21
	mov r21, r22
	mov r22, r23
	mov r23, r24
	mov r24, r25
	mov r25, r0 ; old bottom byte becomes the new top byte

/*
 * byte_rot_jmp_table
 * Dispatch table for whole-byte rotations. keccak_f1600 adds the byte count
 * taken from the rotation code to the word address of this label and
 * performs an icall, so every entry has to stay exactly one instruction
 * word long and the order must not change.
 * Entry 0 (rotate by zero bytes) is a bare ret; it also terminates
 * rotate64_7byte_left above.
 */
byte_rot_jmp_table:
	ret
	rjmp rotate64_1byte_left
	rjmp rotate64_2byte_left
	rjmp rotate64_3byte_left
	rjmp rotate64_4byte_left
	rjmp rotate64_5byte_left
	rjmp rotate64_6byte_left
	rjmp rotate64_7byte_left


/*
	void keccak_theta (uint64_t *a, uint64_t *b){
	// uint64_t b[5][5];
		for(i = 0; i < 5; ++i){
			b[i][0] = a[0][i] ^ a[1][i] ^ a[2][i] ^ a[3][i] ^ a[4][i];
	  	}
  	}
*/

/*********************************************
 * theta_2a
 *********************************************
	XORs one 64-bit value into the same column of all five rows of the
	state (rows are 40 bytes apart).

	input:
		r24:r25 = address of the column lane in row 0 of a
		X       = address of the 64-bit value to XOR in
	output:
		the eight bytes at X have been XORed into the lane at
		r24:r25 + 40 * row for row = 0..4
		r20     = 0
		r21     = trashed
		r22     = trashed
		r24:r25 = input + 8 (same row, next column)
		X       = input + 8
		Z       = input r24:r25 + 7 + 4 * 40
*/
theta_2a:
	ldi r20, 8			; eight bytes per lane
.Ltheta_2a_next_byte:
	movw ZL, r24			; Z walks down the column, starting in row 0
	ld  r21, X+			; byte to merge into all five rows
	.rept 4				; rows 0..3, stepping one row down each time
		ld  r22, Z
		eor r22, r21
		st  Z, r22
		adiw ZL, 40
	.endr
	ld  r22, Z			; row 4, no step afterwards
	eor r22, r21
	st  Z, r22
	adiw r24, 1
	dec r20
	brne .Ltheta_2a_next_byte
	ret

/*********************************************
 * theta_2b
 *********************************************
	XORs a 64-bit value, rotated left by one bit, into the same column
	of all five rows of the state (rows are 40 bytes apart).
	The rotation is done on the fly: the carry between two bytes is kept
	in bit 0 of __zero_reg__, byte 0 is held back in r19 until the carry
	out of byte 7 is known.

	input:
		r24:r25 = address of byte 1 (!) of the column lane in row 0
		X       = address of the 64-bit value
	output:
		rol64(value at X, 1) has been XORed into the lane in rows 0..4
		r19     = trashed
		r20     = 0
		r21     = trashed
		r22     = trashed
		r24:r25 = input + 8 (byte 1 of the next column)
		X       = input + 8
		Z       = address of byte 0 of the lane in row 4
		__zero_reg__ = 0 again
*/

/* XOR register \src into the byte at Z in rows 0..4; Z ends up in row 4 */
.macro theta_2b_xor_rows src
	.rept 4
		ld  r22, Z
		eor r22, \src
		st  Z, r22
		adiw ZL, 40
	.endr
	ld  r22, Z
	eor r22, \src
	st  Z, r22
.endm

theta_2b:
	ldi r20, 7			; bytes 1..7 are handled in the loop
	ld r19, X+			; byte 0: shift now, store last
	lsl r19
	rol __zero_reg__		; bit 0 = carry out of byte 0
.Ltheta_2b_next_byte:
	movw ZL, r24
	ld  r21, X+
	ror __zero_reg__		; carry = bit shifted out of the previous byte
	rol r21
	rol __zero_reg__		; bit 0 = carry out of this byte
	theta_2b_xor_rows r21
	adiw r24, 1
	dec r20
	brne .Ltheta_2b_next_byte
	add r19, __zero_reg__		; bit 63 wraps around into bit 0
	sbiw r24, 8			; back to byte 0 of the lane
	movw ZL, r24
	theta_2b_xor_rows r19
	adiw r24, 9			; byte 1 of the next column
	clr __zero_reg__
	ret

;	a[i][j] =  b[i][j] ^ ((~(b[i][(j + 1) % 5])) & (b[i][(j + 2) % 5]));

/*********************************************
 * chi_step
 *********************************************
	One lane of the chi step, processed byte by byte.

	input:
		Y = a; uint8_t *a   lane that is updated
		X = b; uint8_t *b   lane that is complemented
		Z = c; uint8_t *c   lane that is ANDed in
	output:
		a[0..7] ^= ~b[0..7] & c[0..7]
		X += 8
		Y += 8
		Z += 8
		r16 = 0
		r21, r22, r23 trashed
*/
chi_step:
	ldi r16, 8			; eight bytes per lane
.Lchi_step_next_byte:
	ld r22, X+
	com r22				; ~b
	ld r23, Z+
	and r22, r23			; ~b & c
	ld r21, Y
	eor r21, r22			; a ^ (~b & c)
	st Y+, r21
	dec r16
	brne .Lchi_step_next_byte
	ret

	.global keccak_nextBlock
	.func keccak_nextBlock
/*
 * keccak_nextBlock(ctx, block)
 * Absorbs one full block: XORs ctx->bs bytes from block into the start of
 * the state and then runs the permutation.
 *
 * input:
 *	r24:r25 = ctx   (the state a is the first member, so this is also a)
 *	r22:r23 = block
 *
 * There is no ret and no jump at the end: execution falls through into
 * keccak_f1600, which has to follow directly and receives ctx in r24:r25.
 */
keccak_nextBlock:
	movw XL, r24			; X = ctx->a
	movw ZL, r24
	subi ZL, lo8(-ctx_bs)		; Z = &ctx->bs (offset too large for adiw)
	sbci ZH, hi8(-ctx_bs)
	ld r20, Z			; r20 = number of bytes to absorb
	movw ZL, r22			; Z = block
.Lnext_block_xor:
	ld r23, Z+
	ld r22, X
	eor r22, r23
	st X+, r22
	dec r20
	brne .Lnext_block_xor
	.endfunc

	.global keccak_f1600
	.func keccak_f1600
/*********************************************
 * keccak_f1600
 *********************************************
	Runs 24 rounds (theta, rho & pi, chi, iota) in place on a 200 byte
	state. keccak_nextBlock falls through into this entry point.

	input:
		r24:r25 = a ; uint64_t a[5][5]
	scratch memory:
		b ; 200 bytes obtained with stack_alloc_large
	register roles inside the round loop:
		r9    = round counter, 0..23
		r2:r3 = rho & pi: read pointer into keccak_rotate_codes
		        chi and later: base address of a
		r4:r5 = rho & pi: read pointer into rho_pi_idx_table
		        chi and later: base address of b
		r6:r7 = base address of b during rho & pi (B_REG_L / B_REG_H)
		r8    = lane counter during rho & pi
	saved on entry and restored on exit: r2-r9, r16, r28, r29
	push_range, pop_range, stack_alloc_large and stack_free_large3 are
	macros from avr-asm-macros.S.
*/
keccak_f1600:
	push_range 2, 9
	push r16
	push_range 28, 29

	; NOTE(review): presumably reserves the 200 byte scratch state b and
	; leaves the new stack pointer in X; the adiw then moves X to the first
	; byte of b - confirm against avr-asm-macros.S
	stack_alloc_large 200, r26, r27
	adiw XL, 1

	clr r9 ; round counter
5:
	/* -- theta, part 1: b[i][0] = a[0][i] ^ a[1][i] ^ a[2][i] ^ a[3][i] ^ a[4][i] -- */
	; here: r24:r25 = a, X = b
	movw r30, r24 ; Z = a

	ldi r19, 5 ; five columns
10:
	ldi r20, 8 ; eight bytes per lane
20:
	ld  r22, Z ; row 0
	adiw ZL, 40
	ld  r21, Z ; row 1
	eor r22, r21
	adiw ZL, 40
	ld  r21, Z ; row 2
	eor r22, r21
	adiw ZL, 40
	ld  r21, Z ; row 3
	eor r22, r21
	adiw ZL, 40
	ld  r21, Z ; row 4
	eor r22, r21
	adiw r24, 1 ; next byte of row 0, runs on into the next column
	movw r30, r24
	st X+, r22
	dec r20
	brne 20b

	adiw XL, 8 * 4 ; skip the rest of the row in b: next parity goes to b[i + 1][0]
	dec r19
	brne 10b
	; here: r24:r25 = a + 40, X = b + 200
/*
	-- theta, part 2 --
	for(i = 0; i < 5; ++i){
		for(j = 0; j < 5; ++j){
			a[j][i] ^= b[(4 + i) % 5][0];
		}
	}

	theta_2a advances r24:r25 and X by 8 each, so only X has to be moved
	to the wanted parity lane between the calls.
*/
/* a[0..4][0]{0..7} ^= b[4][0]{0..7} */
	sbiw XL, 5 * 8 ; X = b[4][0]
	sbiw r24, 40 ; r24:r25 = a
	rcall theta_2a
/* a[0..4][1]{0..7} ^= b[0][0]{0..7} */
	subi XL, lo8(4 * 5 * 8 + 8)
	sbci XH, hi8(4 * 5 * 8 + 8)
	rcall theta_2a
/* a[0..4][2]{0..7} ^= b[1][0]{0..7} */
	adiw XL, 4 * 8
	rcall theta_2a
/* a[0..4][3]{0..7} ^= b[2][0]{0..7} */
	adiw XL, 4 * 8
	rcall theta_2a
/* a[0..4][4]{0..7} ^= b[3][0]{0..7} */
	adiw XL, 4 * 8
	rcall theta_2a
/*
	-- theta, part 3 --
	for(i = 0; i < 5; ++i){
        for(j = 0; j < 5; ++j){
            a[j][i] ^= rotate64_1bit_left(b[(i + 1) % 5][0]);
        }
    }

	theta_2b expects r24:r25 to point at byte 1 of the lane, hence the
	"- 1" in the pointer adjustment below.
*/
/* a[0..4][0]{0..7} ^= rol(b[1][0]{0..7}) */
	subi r24, lo8(5 * 8 - 1) ; r24:r25 = a + 1
	sbci r25, hi8(5 * 8 - 1)
	subi XL, lo8(2 * 5 * 8 + 8) ; X = b[1][0]
	sbci XH, hi8(2 * 5 * 8 + 8)
	rcall theta_2b
/* a[0..4][1]{0..7} ^= rol(b[2][0]{0..7}) */
	adiw XL, 4 * 8
	rcall theta_2b
/* a[0..4][2]{0..7} ^= rol(b[3][0]{0..7}) */
	adiw XL, 4 * 8
	rcall theta_2b
/* a[0..4][3]{0..7} ^= rol(b[4][0]{0..7}) */
	adiw XL, 4 * 8
	rcall theta_2b
/* a[0..4][4]{0..7} ^= rol(b[0][0]{0..7}) */
	subi XL, lo8(4 * 5 * 8 + 8)
	sbci XH, hi8(4 * 5 * 8 + 8)
	rcall theta_2b
	; here: r24:r25 = a + 41, X = b + 8
/*
   -- rho & pi --
	for(i = 0; i < 5; ++i){
		for(j = 0; j < 5; ++j){
			b[(2 * i + 3 * j) % 5][j] =
              rotate64left_code(a[j][i], pgm_read_byte(&(keccak_rotate_codes[i][j])));
		}
	}

   -- or --

	const uint8_t *rot_code = (const uint8_t*)keccak_rotate_codes;
    const uint8_t *idx_idx = (const uint8_t*)rho_pi_idx_table;
    uint64_t *a_tmp = (uint64_t*)a;
	for(i = 0; i < 25; ++i){
		    *((uint64_t*)(((uint8_t*)b) + pgm_read_byte(idx_idx++))) =
                rotate64left_code(*a_tmp++, pgm_read_byte(rot_code++));

	}

   The assembly below implements the second, linear form.
*/

.equ B_REG_L, 6
.equ B_REG_H, 7

	ldi r18, lo8(keccak_rotate_codes)
	ldi r19, hi8(keccak_rotate_codes)
	movw r2, r18 ; r2:r3 = rot_code
	ldi r18, lo8(rho_pi_idx_table)
	ldi r19, hi8(rho_pi_idx_table)
	movw r4, r18 ; r4:r5 = idx_idx
	ldi r16, 25
	mov r8, r16 ; r8 = lane counter

	sbiw r24, 5 * 8 + 1 ; back to a
	movw YL, r24 ; Y = a_tmp
	sbiw XL, 8 ; back to b
	movw B_REG_L, XL ; r6:r7 = b

10:
	; load the next lane into r25:r18, r18 = least significant byte
	ld r18, Y+
	ld r19, Y+
	ld r20, Y+
	ld r21, Y+
	ld r22, Y+
	ld r23, Y+
	ld r24, Y+
	ld r25, Y+
	movw ZL, r2
	lpm r16, Z+ ; r16 = rotation code of this lane
	movw r2, ZL
rotate64left_code:
	; whole bytes first: bits 6..4 of the code index byte_rot_jmp_table
	ldi r30, pm_lo8(byte_rot_jmp_table)
	ldi r31, pm_hi8(byte_rot_jmp_table)
	mov r0, r16 ; keep the code while r16 is used for the table index
	andi r16, 0x70
	swap r16
	add r30, r16
	adc r31, r1
	mov r16, r0
	andi r16, 0x0f ; low nibble = (bit count << 1) | direction
	icall
	clr r1 ; the movw based byte rotations overwrite r1:r0, restore the zero register
rotate64_nbit_autodir:
	lsr r16 ; carry = direction, r16 = number of single bit rotations
	brcc rotate64_nbit_left
rotate64_nbit_right:
	ldi r30, pm_lo8(rotate64_1bit_right)
	ldi r31, pm_hi8(rotate64_1bit_right)
	rjmp icall_r16_times
rotate64_nbit_left:
	ldi r30, pm_lo8(rotate64_1bit_left)
	ldi r31, pm_hi8(rotate64_1bit_left)
icall_r16_times:
1:	dec r16
	brmi 2f ; counter went below zero: all rotations done
	icall
	rjmp 1b
2:
	movw ZL, r4
	lpm r16, Z+ ; r16 = byte offset of the destination lane in b
	movw r4, ZL
	movw XL, B_REG_L
	add XL, r16
	adc XH, __zero_reg__
	st X+, r18
	st X+, r19
	st X+, r20
	st X+, r21
	st X+, r22
	st X+, r23
	st X+, r24
	st X+, r25

	dec r8
	brne 10b
	; here: Y = a + 200, X = b + (last table entry, 32) + 8
/*
	-- chi --
	for(i = 0; i < 5; ++i){
        a[i][0] ^= ((~(b[i][1])) & (b[i][2]));
        a[i][1] ^= ((~(b[i][2])) & (b[i][3]));
        a[i][2] ^= ((~(b[i][3])) & (b[i][4]));
        a[i][3] ^= ((~(b[i][4])) & (b[i][0]));
        a[i][4] ^= ((~(b[i][0])) & (b[i][1]));

	}
	a is first overwritten with a copy of b, so that the XOR above yields
	b[i][j] ^ (~b[i][j + 1] & b[i][j + 2]).
*/
	; memcpy(a, b, 200)
	; X points at b + 32 + 8 = b + 40 = b[1][0] and has to point to b[0][0]
	ldi r16, 200 / 8
	sbiw XL, 5 * 8
	movw ZL, XL ; Z = b, kept for the chi loop
	subi YL, lo8(5 * 5 * 8) ; Y = a
	sbci YH, hi8(5 * 5 * 8)
	movw r2, YL ; r2:r3 = a
10:
	.rept 8
	ld r22, X+
	st Y+, r22
	.endr
	dec r16
	brne 10b

	; Z points at b
	movw XL, ZL
	movw r4, ZL ; r4:r5 = b, needed again at the start of the next round
	adiw XL, 8 ; X = b[0][1]
	adiw ZL, 16 ; Z = b[0][2]
	movw YL, r2 ; Y = a[0][0]
	ldi r18, 5 ; five rows
10:
	; every chi_step advances X, Y and Z by one lane; Z and then X are
	; wrapped back to the start of the row when they run past its end
	rcall chi_step ; a[i][0] ^= ~b[i][1] & b[i][2]
	rcall chi_step ; a[i][1] ^= ~b[i][2] & b[i][3]
	rcall chi_step ; a[i][2] ^= ~b[i][3] & b[i][4]
	sbiw ZL, 5 * 8
	rcall chi_step ; a[i][3] ^= ~b[i][4] & b[i][0]
	sbiw XL, 5 * 8
	rcall chi_step ; a[i][4] ^= ~b[i][0] & b[i][1]
	adiw XL, 5 * 8 ; X = b[i + 1][1]
	adiw ZL, 5 * 8 ; Z = b[i + 1][2]
	dec r18
	brne 10b

	/* -- iota -- */
	; a[0][0] ^= round constant, expanded from one byte of keccak_rc_comp:
	;   bit 6 -> byte 7, mask 0x80 (bit 63)
	;   bit 5 -> byte 3, mask 0x80 (bit 31)
	;   bit 4 -> byte 1, mask 0x80 (bit 15)
	;   bits 7 and 3..0 go into byte 0 unchanged
	ldi r30, lo8(keccak_rc_comp)
	ldi r31, hi8(keccak_rc_comp)
	add r30, r9 ; index by round number
	adc r31, __zero_reg__
	lpm r20, Z+
	movw YL, r2 ; Y = a
	ldi r21, 0x80
	bst r20, 6
	brtc 10f
	ldd r22, Y+7
	eor r22, r21
	std Y+7, r22
10:
	bst r20, 5
	brtc 10f
	ldd r22, Y+3
	eor r22, r21
	std Y+3, r22
10:
	bst r20, 4
	brtc 10f
	ldd r22, Y+1
	eor r22, r21
	std Y+1, r22
10:
	andi r20, 0x8f
	ld r22, Y
	eor r22, r20
	st Y, r22

	inc r9
	mov r16, r9 ; cpi needs an upper register
	cpi r16, 24
	breq 20f ; all 24 rounds done
	movw r24, YL ; restore the entry conditions of the round loop:
	movw r26, r4 ; r24:r25 = a, X = b
	rjmp 5b
20:

	; NOTE(review): presumably releases the 200 byte scratch state again -
	; confirm against avr-asm-macros.S
	stack_free_large3 200

	pop_range 28, 29
	pop r16
	pop_range 2, 9
	ret
	.endfunc


	.global keccak224_ctx2hash
	.func keccak224_ctx2hash
/*
 * keccak224_ctx2hash
 * input: r24:r25 = dest, r22:r23 = ctx
 * Rearranges the arguments for keccak_ctx2hash(dest, 224, ctx) and
 * continues there.
 */
keccak224_ctx2hash:
	movw r20, r22			; ctx becomes the third argument
	ldi r23, hi8(224)
	ldi r22, lo8(224)		; r23:r22 = output length in bits
	rjmp keccak_ctx2hash
	.endfunc

	.global keccak384_ctx2hash
	.func keccak384_ctx2hash
/*
 * keccak384_ctx2hash
 * input: r24:r25 = dest, r22:r23 = ctx
 * Rearranges the arguments for keccak_ctx2hash(dest, 384, ctx) and
 * continues there.
 */
keccak384_ctx2hash:
	movw r20, r22			; ctx becomes the third argument
	ldi r23, hi8(384)
	ldi r22, lo8(384)		; r23:r22 = output length in bits
	rjmp keccak_ctx2hash
	.endfunc

	.global keccak512_ctx2hash
	.func keccak512_ctx2hash
/*
 * keccak512_ctx2hash
 * input: r24:r25 = dest, r22:r23 = ctx
 * Rearranges the arguments for keccak_ctx2hash(dest, 512, ctx) and
 * continues there.
 */
keccak512_ctx2hash:
	movw r20, r22			; ctx becomes the third argument
	ldi r23, hi8(512)
	ldi r22, lo8(512)		; r23:r22 = output length in bits
	rjmp keccak_ctx2hash
	.endfunc

	.global keccak256_ctx2hash
	.func keccak256_ctx2hash
/*
 * keccak256_ctx2hash
 * input: r24:r25 = dest, r22:r23 = ctx
 * Rearranges the arguments for keccak_ctx2hash(dest, 256, ctx).
 * There is no jump at the end: execution falls through into
 * keccak_ctx2hash, which has to follow directly.
 */
keccak256_ctx2hash:
	movw r20, r22			; ctx becomes the third argument
	ldi r23, hi8(256)
	ldi r22, lo8(256)		; r23:r22 = output length in bits
	.endfunc

/*
void keccak_ctx2hash(void *dest, uint16_t length_b, keccak_ctx_t *ctx){
	while(length_b>=ctx->r){
		memcpy(dest, ctx->a, ctx->bs);
		dest = (uint8_t*)dest + ctx->bs;
		length_b -= ctx->r;
		keccak_f1600(ctx->a);
	}
	memcpy(dest, ctx->a, (length_b+7)/8);
}
*/
	.global keccak_ctx2hash
	.func keccak_ctx2hash
/*********************************************
 * keccak_ctx2hash
 *********************************************
	Copies length_b bits (rounded up to whole bytes) of output from the
	state to dest and permutes the state between blocks when more than
	one block is requested. keccak256_ctx2hash falls through into this
	entry point.

	input:
		r24:r25 = dest     ; void *dest
		r22:r23 = length_b ; uint16_t, output length in bits
		r20:r21 = ctx      ; keccak_ctx_t *ctx
	register roles:
		r2:r3 = ctx->r  (rate in bits)
		r4:r5 = ctx
		r6:r7 = dest, advanced after every full block
		r8:r9 = remaining length_b
		r10   = ctx->bs (block size in bytes)
	r2-r10 are saved and restored with push_range / pop_range (macros
	from avr-asm-macros.S); r22, r24, r25, X and Z are clobbered.

	A full block is only emitted while length_b > ctx->r. When exactly
	ctx->r bits remain they are copied by the tail loop, which writes
	the same bytes but skips one permutation of the state.
*/
keccak_ctx2hash:
	push_range 2, 10
	movw r4, r20
	movw r6, r24
	movw ZL, r20
	movw r8, r22
	subi ZL, lo8(-ctx_r)
	sbci ZH, hi8(-ctx_r)
	ld r2, Z+
	ld r3, Z+
	; Z has advanced from ctx_r to ctx_bs: bs follows r directly in
	; keccak_ctx_t, so the block size is read without any further offset
	ld r10, Z ; load blocksize (in bytes)
10:
	; length_b = (r9:r8) ; r = (r3:r2) ; (H:L)
	cp  r2, r8
	cpc r3, r9
	brsh 40f ; r >= length_b: what is left fits into the current block
	movw XL, r4
	movw ZL, r6
	mov r24, r10
20:
	; copy one full block of bs bytes
	ld r22, X+
	st Z+, r22
	dec r24
	brne 20b
	movw r6, ZL ; dest += bs
	sub r8, r2 ; length_b -= r
	sbc r9, r3
	movw r24, r4
	rcall keccak_f1600
	rjmp 10b
40:
	; tail: copy (length_b + 7) / 8 bytes
	movw XL, r4
	movw ZL, r6
	movw r24, r8
	adiw r24, 7
	lsr r25
	ror r24
	lsr r25
	ror r24
	lsr r25
	ror r24
	adiw r24, 0 ; set the Z flag from the 16 bit byte count
	breq 99f ; nothing left to copy
10:
	ld r22, X+
	st Z+, r22
	sbiw r24, 1
	brne 10b
99:
	pop_range 2, 10
	ret
	.endfunc


	.global keccak224_init
	.func keccak224_init
/*
 * keccak224_init
 * input: r24:r25 = ctx
 * Initialises ctx with a rate of 1600 - 2 * 224 = 1152 bits by entering
 * keccak_init at keccak_init_1 (X = ctx, r24:r25 = rate).
 */
keccak224_init:
	movw XL, r24			; X = ctx
	ldi r25, hi8(1600 - 2 * 224)
	ldi r24, lo8(1600 - 2 * 224)	; r25:r24 = rate in bits
	rjmp keccak_init_1
	.endfunc

	.global keccak384_init
	.func keccak384_init
/*
 * keccak384_init
 * input: r24:r25 = ctx
 * Initialises ctx with a rate of 1600 - 2 * 384 = 832 bits by entering
 * keccak_init at keccak_init_1 (X = ctx, r24:r25 = rate).
 */
keccak384_init:
	movw XL, r24			; X = ctx
	ldi r25, hi8(1600 - 2 * 384)
	ldi r24, lo8(1600 - 2 * 384)	; r25:r24 = rate in bits
	rjmp keccak_init_1
	.endfunc

	.global keccak512_init
	.func keccak512_init
/*
 * keccak512_init
 * input: r24:r25 = ctx
 * Initialises ctx with a rate of 1600 - 2 * 512 = 576 bits by entering
 * keccak_init at keccak_init_1 (X = ctx, r24:r25 = rate).
 */
keccak512_init:
	movw XL, r24			; X = ctx
	ldi r25, hi8(1600 - 2 * 512)
	ldi r24, lo8(1600 - 2 * 512)	; r25:r24 = rate in bits
	rjmp keccak_init_1
	.endfunc

	.global keccak256_init
	.func keccak256_init
/*
 * keccak256_init
 * input: r24:r25 = ctx
 * Sets up the arguments of keccak_init for a rate of
 * 1600 - 2 * 256 = 1088 bits and falls through into keccak_init, which has
 * to follow directly.
 */
keccak256_init:
	movw r22, r24			; ctx becomes the second argument
	ldi r25, hi8(1600 - 2 * 256)
	ldi r24, lo8(1600 - 2 * 256)	; r25:r24 = rate in bits
	.endfunc
/*
void keccak_init(uint16_t r, keccak_ctx_t *ctx){
	memset(ctx->a, 0x00, 5 * 5 * 8);
	ctx->r = r;
	ctx->bs = (uint8_t)(r / 8);
}

	input:
		r24:r25 = r   ; rate in bits
		r22:r23 = ctx
	keccak_init_1 is the entry used by the fixed-size init functions; it
	expects ctx in X instead of r22:r23.
	clobbers r22, r24, r25 and X
*/
	.global keccak_init
	.func keccak_init
keccak_init:
	movw XL, r22
keccak_init_1:
	ldi r22, 5 * 5 * 8		; size of the state in bytes
.Lkeccak_init_clear:
	st X+, __zero_reg__
	dec r22
	brne .Lkeccak_init_clear
	st X+, r24			; ctx->r, low byte first
	st X+, r25
	.rept 3				; r25:r24 = r / 8
		lsr r25
		ror r24
	.endr
	st X+, r24			; ctx->bs
	ret
	.endfunc

/*
void keccak_lastBlock(keccak_ctx_t *ctx, const void *block, uint16_t length_b){
    uint8_t length_B;
    uint8_t t;
    while(length_b >= ctx->r){
        keccak_nextBlock(ctx, block);
        block = (uint8_t*)block + ctx->bs;
        length_b -=  ctx->r;
    }
    length_B = length_b / 8;
    memxor(ctx->a, block, length_B);
    / * append 1 * /
    if(length_b & 7){
        / * we have some single bits * /
        t = ((uint8_t*)block)[length_B] >> (8 - (length_b & 7));
        t |= 0x01 << (length_b & 7);
    }else{
        t = 0x01;
    }
    ctx->a[length_B] ^= t;
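    if(length_b == ctx->r - 1){
        keccak_f1600(ctx->a);
    }
    / * set the last bit of the block and run the final permutation * /
    ((uint8_t*)ctx->a)[ctx->bs - 1] ^= 0x80;
    keccak_f1600(ctx->a);
}

*/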
/* register numbers used by keccak_lastBlock, all saved and restored there */
.set length_b_l,  2 ; r3:r2 = remaining message length in bits
.set length_b_h,  3
.set pbs,     10 ; r10 = ctx->bs, block size in bytes
.set pr_l,     8 ; r9:r8 = ctx->r, rate in bits
.set pr_h,      9
.set ctx_l,       6 ; r7:r6 = ctx
.set ctx_h,       7

	.global keccak_lastBlock
	.func keccak_lastBlock
/*********************************************
 * keccak_lastBlock
 *********************************************
	Absorbs the rest of the message, applies the padding and runs the
	final permutation.

	input:
		r24:r25 = ctx      ; keccak_ctx_t *ctx
		r22:r23 = block    ; const void *block
		r20:r21 = length_b ; uint16_t, length of block in bits
	register roles (besides the .set names above):
		r5:r4 = read pointer into block
	r2-r10 are saved and restored with push_range / pop_range (macros
	from avr-asm-macros.S).
	The function does not return itself, it leaves through a jump to
	keccak_f1600 with r24:r25 = ctx.
*/
keccak_lastBlock:
	push_range 2, 10
	movw r2, r20 ; length_b
	movw r4, r22 ; block
	movw r6, r24 ; ctx
	movw XL, r24
	subi XL, lo8(-ctx_r)
	sbci XH, hi8(-ctx_r)
	ld  pr_l, X+
	ld  pr_h, X+
	ld  pbs, X
10:
	; while(length_b >= ctx->r): absorb full blocks
	cp  length_b_l, pr_l
	cpc length_b_h, pr_h
	brlo 20f
	movw r24, ctx_l
	movw r22, r4
	rcall keccak_nextBlock
	add r4, pbs ; block += ctx->bs
	adc r5, __zero_reg__
	sub length_b_l, pr_l ; length_b -= ctx->r
	sbc length_b_h, pr_h
	rjmp 10b
20:
	; XOR the remaining whole bytes into the state
	movw ZL, ctx_l
	movw XL, r4
	movw r22, length_b_l
	lsr r23
	ror r22
	lsr r23
	ror r22
	lsr r23
	ror r22 ; r22 = length_B = length_b / 8, Z flag set if it is zero
	mov r23, r22 ; mov leaves the flags alone, the breq still tests the ror result
	breq 20f
10:
	ld r25, X+
	ld r24, Z
	eor r24, r25
	st Z+, r24
	dec r23
	brne 10b
20:
	; build the byte t that carries the trailing message bits (if any)
	; and the first padding bit
	ldi r25, 1 ; t = 0x01 when there are no trailing bits
	mov r18, length_b_l
	andi r18, 7
	breq 30f
	/* we have trailing bits */
	mov r19, r18 ; r19 = length_b & 7
	ld r24, X+
	subi r18, 8
	neg r18 ; r18 = 8 - (length_b & 7)
10:
	lsr r24 ; block[length_B] >> (8 - (length_b & 7))
	dec r18
	brne 10b
10:
	lsl r25 ; 0x01 << (length_b & 7)
	dec r19
	brne 10b
	or r25, r24
30:
	; Z points at byte length_B of the state
	ld r24, Z
	eor r24, r25
	st Z, r24

	; if(length_b == ctx->r - 1) the bit just added filled the block, so
	; the closing padding bit needs a block of its own
	movw r24, pr_l
	sbiw r24, 1
	cp  length_b_l, r24
	cpc length_b_h, r25
	brne 20f
	movw r24, ctx_l
	rcall keccak_f1600
20:
	; set the most significant bit of the last byte of the block
	; (state byte bs - 1) and run the final permutation
	movw XL, ctx_l
	dec pbs
	add XL, pbs
	adc XH, __zero_reg__
	ld r24, X
	ldi r25, 0x80
	eor r24, r25
	st X, r24
	movw r24, ctx_l ; argument for keccak_f1600, set up before r6/r7 are restored
	pop_range 2, 10
	rjmp keccak_f1600 ; tail call, keccak_f1600 returns to our caller
	.endfunc