.equ __zero_reg__, 1
-.global rho_pi_idx_table
+/*
+typedef struct{
+ uint64_t a[5][5];
+ uint16_t r, c;
+ uint8_t d, bs;
+} keccak_ctx_t;
+*/
	; Field offsets into keccak_ctx_t (mirrors the C typedef above).
	.struct 0
ctx_a:					; uint64_t a[5][5] -- 200-byte sponge state, at offset 0
	.struct ctx_a + 8 * 5 * 5
ctx_r:					; uint16_t r -- rate in bits (compared against length_b below)
	.struct ctx_r + 2
ctx_c:					; uint16_t c -- capacity in bits
	.struct ctx_c + 2
ctx_d:					; uint8_t d -- semantics not visible in this chunk; TODO(review) confirm
	.struct ctx_d + 1
ctx_bs:					; uint8_t bs -- block size in BYTES (used as byte counter in keccak_nextBlock)
+
+ .section .text
+
+ .global rho_pi_idx_table
rho_pi_idx_table:
.irp i, 0, 1, 2, 3, 4
.irp j, 0, 1, 2, 3, 4
.endr
.endr
-.align 2
+/*
+#define ROT_BIT(a) (( (a) <= 4) ? ((a) << 1) : (0x01 | ((8 - (a)) << 1)))
+#define ROT_CODE(a) ((((a) / 8 + ((((a) % 8) > 4) ? 1 : 0)) << 4) | ROT_BIT(((a) % 8)))
+
+const uint8_t keccak_rotate_codes[5][5] PROGMEM = {
+ { ROT_CODE( 0), ROT_CODE( 1), ROT_CODE(62), ROT_CODE(28), ROT_CODE(27) },
+ { ROT_CODE(36), ROT_CODE(44), ROT_CODE( 6), ROT_CODE(55), ROT_CODE(20) },
+ { ROT_CODE( 3), ROT_CODE(10), ROT_CODE(43), ROT_CODE(25), ROT_CODE(39) },
+ { ROT_CODE(41), ROT_CODE(45), ROT_CODE(15), ROT_CODE(21), ROT_CODE( 8) },
+ { ROT_CODE(18), ROT_CODE( 2), ROT_CODE(61), ROT_CODE(56), ROT_CODE(14) }
+};
+*/
+
keccak_rotate_codes:
; Precomputed ROT_CODE(r) bytes for the rho rotation offsets (see the C
; macros above): high nibble = whole-byte rotate count (rounded toward the
; nearer byte boundary), low nibble = ROT_BIT: bits 3..1 = bit-rotate count,
; bit 0 = direction flag (set = rotate right).  E.g. ROT_CODE(62) = 0x85:
; 8 byte-rotates, then 2 bits back to the right.
.byte 0x00, 0x02, 0x85, 0x38, 0x36	; row 0: offsets  0,  1, 62, 28, 27
.byte 0x48, 0x58, 0x15, 0x73, 0x28	; row 1: offsets 36, 44,  6, 55, 20
.byte 0x06, 0x14, 0x56, 0x32, 0x53	; row 2: offsets  3, 10, 43, 25, 39
.byte 0x52, 0x67, 0x23, 0x37, 0x10	; row 3: offsets 41, 45, 15, 21,  8
.byte 0x24, 0x04, 0x87, 0x70, 0x25	; row 4: offsets 18,  2, 61, 56, 14
+
keccak_rc_comp:
; Compressed iota round constants: 24 bytes, presumably one byte per round
; for the 24 rounds of Keccak-f[1600], expanded at run time.
; NOTE(review): the expansion code is not visible in this chunk -- confirm
; the packing scheme against the iota step in keccak_f1600 before relying
; on this description.
.byte 0x01, 0x92, 0xda, 0x70
.byte 0x9b, 0x21, 0xf1, 0x59
.byte 0x8a, 0x88, 0x39, 0x2a
.byte 0xbb, 0xcb, 0xd9, 0x53
.byte 0x52, 0xc0, 0x1a, 0x6a
.byte 0xf1, 0xd0, 0x21, 0x78
+
+ .align 2
-.global rotate64_1bit_left
rotate64_1bit_left:
bst r25, 7
rol r18
bld r18, 0
ret
-.global rotate64_1bit_right
rotate64_1bit_right:
bst r18, 0
ror r25
bld r25, 7
ret
-.global rotate64_nbit_autodir
-rotate64_nbit_autodir:
- lsr r16
- brcc rotate64_nbit_left
-.global rotate64_nbit_right
-rotate64_nbit_right:
- ldi r30, pm_lo8(rotate64_1bit_right)
- ldi r31, pm_hi8(rotate64_1bit_right)
- rjmp icall_r16_times
-.global rotate64_nbit_left
-rotate64_nbit_left:
- ldi r30, pm_lo8(rotate64_1bit_left)
- ldi r31, pm_hi8(rotate64_1bit_left)
-icall_r16_times:
-1: dec r16
- brmi 2f
- icall
- rjmp 1b
-2:
- ret
-
rotate64_1byte_left:
mov r0, r25
mov r25, r24
mov r23, r24
mov r24, r25
mov r25, r0
- ret
-
byte_rot_jmp_table:
ret
rjmp rotate64_6byte_left
rjmp rotate64_7byte_left
-.global rotate64left_code
-rotate64left_code:
- ldi r30, pm_lo8(byte_rot_jmp_table)
- ldi r31, pm_hi8(byte_rot_jmp_table)
- mov r0, r16
- andi r16, 0x70
- swap r16
- add r30, r16
- adc r31, r1
- mov r16, r0
- andi r16, 0x0f
- icall
- clr r1
- rjmp rotate64_nbit_autodir
-
/*
void keccak_theta (uint64_t *a, uint64_t *b){
brne 10b
ret
-.global keccak_f1600
	;-----------------------------------------------------------------
	; void keccak_nextBlock(keccak_ctx_t *ctx, const void *block)
	; avr-gcc ABI: r25:24 = ctx, r23:22 = block
	; Absorbs one block: XORs ctx->bs bytes of *block into the state
	; ctx->a (state is at offset 0, so ctx == &ctx->a).
	; NOTE: there is deliberately no ret -- execution falls through into
	; keccak_f1600 below with r25:24 still holding ctx, so the
	; permutation runs right after the XOR. TODO(review): confirm
	; keccak_f1600 stays immediately after this function in the layout.
	;-----------------------------------------------------------------
	.global keccak_nextBlock
	.func keccak_nextBlock
keccak_nextBlock:
	movw ZL, r24			; Z = ctx
	subi ZL, lo8(-ctx_bs)		; Z = &ctx->bs (add offset via subtract-negative idiom)
	sbci ZH, hi8(-ctx_bs)
	ld r20, Z			; r20 = bs, byte counter
	movw XL, r24			; X = ctx->a (state)
	movw ZL, r22			; Z = input block
10:
	ld r22, X			; no post-increment: read-modify-write of the state byte
	ld r23, Z+
	eor r22, r23			; a[i] ^= block[i]
	st X+, r22
	dec r20
	brne 10b
	.endfunc
+
+ .global keccak_f1600
+ .func keccak_f1600
keccak_f1600:
push_range 2, 9
push r16
; ret
/*
- rho & pi
+ -- rho & pi --
for(i = 0; i < 5; ++i){
for(j = 0; j < 5; ++j){
b[(2 * i + 3 * j) % 5][j] =
movw ZL, r2
lpm r16, Z+
movw r2, ZL
- rcall rotate64left_code
+rotate64left_code:
+ ldi r30, pm_lo8(byte_rot_jmp_table)
+ ldi r31, pm_hi8(byte_rot_jmp_table)
+ mov r0, r16
+ andi r16, 0x70
+ swap r16
+ add r30, r16
+ adc r31, r1
+ mov r16, r0
+ andi r16, 0x0f
+ icall
+ clr r1
+rotate64_nbit_autodir:
+ lsr r16
+ brcc rotate64_nbit_left
+rotate64_nbit_right:
+ ldi r30, pm_lo8(rotate64_1bit_right)
+ ldi r31, pm_hi8(rotate64_1bit_right)
+ rjmp icall_r16_times
+rotate64_nbit_left:
+ ldi r30, pm_lo8(rotate64_1bit_left)
+ ldi r31, pm_hi8(rotate64_1bit_left)
+icall_r16_times:
+1: dec r16
+ brmi 2f
+ icall
+ rjmp 1b
+2:
movw ZL, r4
lpm r16, Z+
movw r4, ZL
pop_range 28, 29
pop r16
pop_range 2, 9
+ ret
+ .endfunc
+/*
+void keccak_ctx2hash(void* dest, uint16_t length_b, keccak_ctx_t* ctx){
+ while(length_b>=ctx->r){
+ memcpy(dest, ctx->a, ctx->bs);
+ dest = (uint8_t*)dest + ctx->bs;
+ length_b -= ctx->r;
+ keccak_f1600(ctx->a);
+ }
+ memcpy(dest, ctx->a, (length_b+7)/8);
+}
+*/
+ .global keccak_ctx2hash
+ .func keccak_ctx2hash
+keccak_ctx2hash:
+ push_range 2, 10
+ movw r4, r20
+ movw r6, r24
+ movw ZL, r24
+ movw r8, r22
+ subi ZL, lo8(-ctx_r)
+ subi ZH, hi8(-ctx_r)
+ ld r2, Z+
+ ld r3, Z+
+ ldd r10, Z+3 ; load blocksize (in bytes)
+10:
+ cp r8, r2
+ cpc r9, r3
+ brcc 40f
+ movw XL, r4
+ movw ZL, r6
+ mov r24, r10
+20:
+ ld r22, X+
+ st Z+, r22
+ dec r24
+ brne 20b
+ movw r6, ZL
+ sub r8, r2
+ sbc r9, r3
+ movw r24, r4
+ rcall keccak_f1600
+ rjmp 10b
+40:
+ movw XL, r4
+ movw ZL, r6
+ movw r24, r8
+ adiw r24, 7
+ lsr r25
+ ror r24
+ lsr r25
+ ror r24
+ lsr r25
+ ror r24
+ adiw r24, 0
+ breq 99f
+10:
+ ld r22, X+
+ st Z+, r22
+ sbiw r24, 1
+ brne 10b
+99:
+ pop_range 2, 10
ret
+ .endfunc
+