X-Git-Url: https://git.cryptolib.org/avr-crypto-lib.git?a=blobdiff_plain;f=keccak%2Fkeccak-asm.S;h=7b7c6cf3144bb4374831c4cb5dbb1ecb9f66d420;hb=628319e6c3018268ef1c307976a0e81e4dc549b8;hp=357c8bd028fd36d6a1baa56358c42978ff22157c;hpb=e7925dc3d2f76b73f54df0e22f69f789715eff8a;p=avr-crypto-lib.git diff --git a/keccak/keccak-asm.S b/keccak/keccak-asm.S index 357c8bd..7b7c6cf 100644 --- a/keccak/keccak-asm.S +++ b/keccak/keccak-asm.S @@ -31,7 +31,23 @@ .equ __zero_reg__, 1 -.global rho_pi_idx_table +/* +typedef struct{ + uint64_t a[5][5]; + uint16_t r, c; + uint8_t d, bs; +} keccak_ctx_t; +*/ + .struct 0 +ctx_a: + .struct ctx_a + 8 * 5 * 5 +ctx_r: + .struct ctx_r + 2 +ctx_bs: + + .section .text + + .global rho_pi_idx_table rho_pi_idx_table: .irp i, 0, 1, 2, 3, 4 .irp j, 0, 1, 2, 3, 4 @@ -39,7 +55,145 @@ rho_pi_idx_table: .endr .endr -.align 2 +/* +#define ROT_BIT(a) (( (a) <= 4) ? ((a) << 1) : (0x01 | ((8 - (a)) << 1))) +#define ROT_CODE(a) ((((a) / 8 + ((((a) % 8) > 4) ? 1 : 0)) << 4) | ROT_BIT(((a) % 8))) + +const uint8_t keccak_rotate_codes[5][5] PROGMEM = { + { ROT_CODE( 0), ROT_CODE( 1), ROT_CODE(62), ROT_CODE(28), ROT_CODE(27) }, + { ROT_CODE(36), ROT_CODE(44), ROT_CODE( 6), ROT_CODE(55), ROT_CODE(20) }, + { ROT_CODE( 3), ROT_CODE(10), ROT_CODE(43), ROT_CODE(25), ROT_CODE(39) }, + { ROT_CODE(41), ROT_CODE(45), ROT_CODE(15), ROT_CODE(21), ROT_CODE( 8) }, + { ROT_CODE(18), ROT_CODE( 2), ROT_CODE(61), ROT_CODE(56), ROT_CODE(14) } +}; +*/ + +keccak_rotate_codes: +.byte 0x00, 0x02, 0x85, 0x38, 0x36 +.byte 0x48, 0x58, 0x15, 0x73, 0x28 +.byte 0x06, 0x14, 0x56, 0x32, 0x53 +.byte 0x52, 0x67, 0x23, 0x37, 0x10 +.byte 0x24, 0x04, 0x87, 0x70, 0x25 + +keccak_rc_comp: +.byte 0x01, 0x92, 0xda, 0x70 +.byte 0x9b, 0x21, 0xf1, 0x59 +.byte 0x8a, 0x88, 0x39, 0x2a +.byte 0xbb, 0xcb, 0xd9, 0x53 +.byte 0x52, 0xc0, 0x1a, 0x6a +.byte 0xf1, 0xd0, 0x21, 0x78 + + .align 2 + +rotate64_1bit_left: + bst r25, 7 + rol r18 + rol r19 + rol r20 + rol r21 + rol r22 + rol r23 + rol r24 + rol r25 + bld r18, 0 + ret + +rotate64_1bit_right: + bst r18, 0 + ror r25 + ror r24 + ror r23 + ror r22 + ror r21 + ror r20 + ror r19 + ror r18 + bld r25, 7 + ret + +rotate64_1byte_left: + mov r0, r25 + mov r25, r24 + mov r24, r23 + mov r23, r22 + mov r22, r21 + mov r21, r20 + mov r20, r19 + mov r19, r18 + mov r18, r0 + ret + +rotate64_2byte_left: + movw r0, r24 + movw r24, r22 + movw r22, r20 + movw r20, r18 + movw r18, r0 + ret + +rotate64_3byte_left: + mov r0, r25 + mov r25, r22 + mov r22, r19 + mov r19, r24 + mov r24, r21 + mov r21, r18 + mov r18, r23 + mov r23, r20 + mov r20, r0 + ret + +rotate64_4byte_left: + movw r0, r24 + movw r24, r20 + movw r20, r0 + movw r0, r22 + movw r22, r18 + movw r18, r0 + ret + +rotate64_5byte_left: + mov r0, r25 + mov r25, r20 + mov r20, r23 + mov r23, r18 + mov r18, r21 + mov r21, r24 + mov r24, r19 + mov r19, r22 + mov r22, r0 + ret + +rotate64_6byte_left: + movw r0, r18 + movw r18, r20 + movw r20, r22 + movw r22, r24 + movw r24, r0 + ret + +rotate64_7byte_left: + mov r0, r18 + mov r18, r19 + mov r19, r20 + mov r20, r21 + mov r21, r22 + mov r22, r23 + mov r23, r24 + mov r24, r25 + mov r25, r0 + +byte_rot_jmp_table: + ret + rjmp rotate64_1byte_left + rjmp rotate64_2byte_left + rjmp rotate64_3byte_left + rjmp rotate64_4byte_left + rjmp rotate64_5byte_left + rjmp rotate64_6byte_left + rjmp rotate64_7byte_left + + /* void keccak_theta (uint64_t *a, uint64_t *b){ // uint64_t b[5][5]; @@ -166,7 +320,26 @@ chi_step: brne 10b ret -.global keccak_f1600 + .global keccak_nextBlock + .func keccak_nextBlock +keccak_nextBlock: + movw ZL, r24 + subi ZL, lo8(-ctx_bs) + sbci ZH, hi8(-ctx_bs) + ld r20, Z + movw XL, r24 + movw ZL, r22 +10: + ld r22, X + ld r23, Z+ + eor r22, r23 + st X+, r22 + dec r20 + brne 10b + .endfunc + + .global keccak_f1600 + .func keccak_f1600 keccak_f1600: push_range 2, 9 push r16 @@ -259,7 +432,7 @@ keccak_f1600: ; ret /* - rho & pi + -- rho & pi -- for(i = 0; i < 5; ++i){ for(j = 0; j < 5; ++j){ b[(2 * i + 3 * j) % 5][j] = @@ -309,7 +482,34 @@ keccak_f1600: movw ZL, r2 lpm r16, Z+ movw r2, ZL - call rotate64left_code +rotate64left_code: + ldi r30, pm_lo8(byte_rot_jmp_table) + ldi r31, pm_hi8(byte_rot_jmp_table) + mov r0, r16 + andi r16, 0x70 + swap r16 + add r30, r16 + adc r31, r1 + mov r16, r0 + andi r16, 0x0f + icall + clr r1 +rotate64_nbit_autodir: + lsr r16 + brcc rotate64_nbit_left +rotate64_nbit_right: + ldi r30, pm_lo8(rotate64_1bit_right) + ldi r31, pm_hi8(rotate64_1bit_right) + rjmp icall_r16_times +rotate64_nbit_left: + ldi r30, pm_lo8(rotate64_1bit_left) + ldi r31, pm_hi8(rotate64_1bit_left) +icall_r16_times: +1: dec r16 + brmi 2f + icall + rjmp 1b +2: movw ZL, r4 lpm r16, Z+ movw r4, ZL @@ -340,15 +540,17 @@ keccak_f1600: */ ; memcpy(a, b, 200) ; X points at b + 32 + 8 = b + 40 = b[1][0] has to point to b[0][0] - ldi r16, 200 + ldi r16, 200 / 8 sbiw XL, 5 * 8 movw ZL, XL subi YL, lo8(5 * 5 * 8) sbci YH, hi8(5 * 5 * 8) movw r2, YL 10: + .rept 8 ld r22, X+ st Y+, r22 + .endr dec r16 brne 10b @@ -417,5 +619,73 @@ keccak_f1600: pop_range 28, 29 pop r16 pop_range 2, 9 + ret + .endfunc +/* +void keccak_ctx2hash(void* dest, uint16_t length_b, keccak_ctx_t* ctx){ + while(length_b>=ctx->r){ + memcpy(dest, ctx->a, ctx->bs); + dest = (uint8_t*)dest + ctx->bs; + length_b -= ctx->r; + keccak_f1600(ctx->a); + } + memcpy(dest, ctx->a, (length_b+7)/8); +} +*/ +; .global keccak_ctx2hash +; .func keccak_ctx2hash +;keccak_ctx2hash: + push_range 2, 10 + movw r4, r20 + movw r6, r24 + movw ZL, r24 + movw r8, r22 + subi ZL, lo8(-ctx_r) + sbci ZH, hi8(-ctx_r) + ld r2, Z+ + ld r3, Z+ + ldd r10, Z+3 ; load blocksize (in bytes) +10: + ; length_b = (r9:r8) ; r = (r3:r2) ; (H:L) + cp r2, r8 + cpc r3, r9 + rjmp 40f + brsh 40f + movw XL, r4 + movw ZL, r6 + mov r24, r10 +20: + ld r22, X+ + st Z+, r22 + dec r24 + brne 20b + movw r6, ZL + sub r8, r2 + sbc r9, r3 + movw r24, r4 + rcall keccak_f1600 + rjmp 10b +40: + movw XL, r4 + movw ZL, r6 + movw r24, r8 + adiw r24, 7 + lsr r25 + ror r24 + lsr r25 + ror r24 + lsr r25 + ror r24 + adiw r24, 0 + breq 99f +10: + ld r22, X+ + st Z+, r22 + sbiw r24, 1 + brne 10b +99: + pop_range 2, 10 ret +; .endfunc +