X-Git-Url: https://git.cryptolib.org/?a=blobdiff_plain;f=keccak%2Fkeccak-asm.S;h=3b3a48818fdc84a47547b64d61e6b00ad1eca3a9;hb=29a44972ae3749a6a273d936f2e15327ecae8a94;hp=77dc9ce10bc99f92f67f5b7e43ddcf0155c5532c;hpb=4128060fcc4585ab8f40d01486baa4248b499160;p=avr-crypto-lib.git

diff --git a/keccak/keccak-asm.S b/keccak/keccak-asm.S
index 77dc9ce..3b3a488 100644
--- a/keccak/keccak-asm.S
+++ b/keccak/keccak-asm.S
@@ -31,7 +31,27 @@
 
 .equ __zero_reg__, 1
 
-.global rho_pi_idx_table
+/*
+typedef struct{
+	uint64_t a[5][5];
+	uint16_t r, c;
+	uint8_t  d, bs;
+} keccak_ctx_t;
+*/
+	.struct 0                      ; byte offsets into keccak_ctx_t (see typedef above)
+ctx_a:                             ; offset 0: state a[5][5], 8 * 5 * 5 = 200 bytes
+	.struct ctx_a + 8 * 5 * 5
+ctx_r:                             ; offset 200: r (2 bytes)
+	.struct ctx_r + 2
+ctx_c:                             ; offset 202: c (2 bytes)
+	.struct ctx_c + 2
+ctx_d:                             ; offset 204: d (1 byte)
+	.struct ctx_d + 1
+ctx_bs:                            ; offset 205: bs (1 byte), read by keccak_nextBlock
+
+	.section .text                 ; .struct switched to the absolute section; return to code
+
+	.global rho_pi_idx_table
 rho_pi_idx_table:
 	.irp i, 0, 1, 2, 3, 4
 		.irp j, 0, 1, 2, 3, 4
@@ -39,7 +59,145 @@ rho_pi_idx_table:
 		.endr
 	.endr
 
-.align 2
+/*
+#define ROT_BIT(a) (( (a) <= 4) ? ((a) << 1) : (0x01 | ((8 - (a)) << 1)))
+#define ROT_CODE(a) ((((a) / 8 + ((((a) % 8) > 4) ? 
1 : 0)) << 4) | ROT_BIT(((a) % 8)))
+
+const uint8_t keccak_rotate_codes[5][5] PROGMEM = {
+	{ ROT_CODE( 0), ROT_CODE( 1), ROT_CODE(62), ROT_CODE(28), ROT_CODE(27) },
+	{ ROT_CODE(36), ROT_CODE(44), ROT_CODE( 6), ROT_CODE(55), ROT_CODE(20) },
+	{ ROT_CODE( 3), ROT_CODE(10), ROT_CODE(43), ROT_CODE(25), ROT_CODE(39) },
+	{ ROT_CODE(41), ROT_CODE(45), ROT_CODE(15), ROT_CODE(21), ROT_CODE( 8) },
+	{ ROT_CODE(18), ROT_CODE( 2), ROT_CODE(61), ROT_CODE(56), ROT_CODE(14) }
+};
+*/
+
+keccak_rotate_codes:               ; ROT_CODE values from the C table above, precomputed
+.byte 0x00, 0x02, 0x85, 0x38, 0x36 ; bits 7..4: byte-rotate count, bits 3..1: bit count,
+.byte 0x48, 0x58, 0x15, 0x73, 0x28 ; bit 0: bit direction (0 = left, 1 = right)
+.byte 0x06, 0x14, 0x56, 0x32, 0x53
+.byte 0x52, 0x67, 0x23, 0x37, 0x10
+.byte 0x24, 0x04, 0x87, 0x70, 0x25
+
+keccak_rc_comp:                    ; 24 one-byte entries, indexed by r9 in the iota code
+.byte 0x01, 0x92, 0xda, 0x70       ; iota expands an entry as: bit 6 -> 0x80 into lane byte 7,
+.byte 0x9b, 0x21, 0xf1, 0x59       ; bit 5 -> 0x80 into byte 3, bit 4 -> 0x80 into byte 1,
+.byte 0x8a, 0x88, 0x39, 0x2a       ; (entry & 0x8f) is XORed into byte 0
+.byte 0xbb, 0xcb, 0xd9, 0x53
+.byte 0x52, 0xc0, 0x1a, 0x6a
+.byte 0xf1, 0xd0, 0x21, 0x78
+
+	.align 2                       ; the two tables above total 49 bytes; realign before code
+
+rotate64_1bit_left:                ; rotate the 64-bit value r25:r18 (r18 = low byte) left by 1 bit
+	bst r25, 7                     ; T = bit 63, the bit that wraps around
+	rol r18                        ; incoming carry is arbitrary; bit 0 is repaired by bld below
+	rol r19
+	rol r20
+	rol r21
+	rol r22
+	rol r23
+	rol r24
+	rol r25
+	bld r18, 0                     ; bit 0 = old bit 63
+	ret
+
+rotate64_1bit_right:               ; rotate r25:r18 right by 1 bit
+	bst r18, 0                     ; T = bit 0, the bit that wraps around
+	ror r25                        ; incoming carry is arbitrary; bit 63 is repaired by bld below
+	ror r24
+	ror r23
+	ror r22
+	ror r21
+	ror r20
+	ror r19
+	ror r18
+	bld r25, 7                     ; bit 63 = old bit 0
+	ret
+
+rotate64_1byte_left:               ; rotate r25:r18 left by 8 bits; clobbers r0
+	mov r0, r25
+	mov r25, r24
+	mov r24, r23
+	mov r23, r22
+	mov r22, r21
+	mov r21, r20
+	mov r20, r19
+	mov r19, r18
+	mov r18, r0
+	ret
+
+rotate64_2byte_left:               ; rotate r25:r18 left by 16 bits; clobbers r1:r0
+	movw r0, r24
+	movw r24, r22
+	movw r22, r20
+	movw r20, r18
+	movw r18, r0
+	ret
+
+rotate64_3byte_left:               ; rotate r25:r18 left by 24 bits; clobbers r0
+	mov r0, r25                    ; one 8-element cycle: each register takes the byte 3 below it
+	mov r25, r22
+	mov r22, r19
+	mov r19, r24
+	mov r24, r21
+	mov r21, r18
+	mov r18, r23
+	mov r23, r20
+	mov r20, r0
+	ret
+
+rotate64_4byte_left:               ; rotate r25:r18 left by 32 bits (swap halves); clobbers r1:r0
+	movw r0, r24
+	movw r24, r20
+	movw r20, r0
+	movw r0, r22
+	movw r22, r18
+	movw r18, r0
+	ret
+
+rotate64_5byte_left:               ; rotate r25:r18 left by 40 bits; clobbers r0
+	mov r0, r25                    ; one 8-element cycle: each register takes the byte 5 below it
+	mov r25, r20
+	mov r20, r23
+	mov r23, r18
+	mov r18, r21
+	mov r21, r24
+	mov r24, r19
+	mov r19, r22
+	mov r22, r0
+	ret
+
+rotate64_6byte_left:               ; rotate r25:r18 left by 48 bits; clobbers r1:r0
+	movw r0, r18
+	movw r18, r20
+	movw r20, r22
+	movw r22, r24
+	movw r24, r0
+	ret
+
+rotate64_7byte_left:              ; rotate r25:r18 (r18 = low byte) left by 56 bits; clobbers r0
+	mov r0, r18
+	mov r18, r19
+	mov r19, r20
+	mov r20, r21
+	mov r21, r22
+	mov r22, r23
+	mov r23, r24
+	mov r24, r25
+	mov r25, r0                    ; no ret here: falls through into the ret at table entry 0
+
+byte_rot_jmp_table:               ; one-word entries, entered via icall with Z = table + n
+	ret                            ; n = 0: no byte rotation (also ends rotate64_7byte_left)
+	rjmp rotate64_1byte_left
+	rjmp rotate64_2byte_left
+	rjmp rotate64_3byte_left
+	rjmp rotate64_4byte_left
+	rjmp rotate64_5byte_left
+	rjmp rotate64_6byte_left
+	rjmp rotate64_7byte_left
+
+
 /*
 void keccak_theta (uint64_t *a, uint64_t *b){
 	// uint64_t b[5][5];
@@ -166,14 +324,35 @@ chi_step:
 	brne 10b
 	ret
 
-.global keccak_theta
-keccak_theta:
-	push_range 2, 8
+	.global keccak_nextBlock
+	.func keccak_nextBlock
+keccak_nextBlock:                  ; in: r25:r24 = ctx, r23:r22 = block; XOR ctx->bs block bytes into ctx->a
+	movw ZL, r24
+	subi ZL, lo8(-ctx_bs)          ; Z = ctx + ctx_bs, low byte (subtracting the negated offset adds it)
+	sbci ZH, hi8(-ctx_bs)          ; high byte with borrow; must act on ZH, not ZL again
+	ld r20, Z                      ; r20 = ctx->bs, the byte count for the loop below
+	movw XL, r24                   ; X = ctx->a (ctx_a is offset 0)
+	movw ZL, r22                   ; Z = block
+10:
+	ld r22, X
+	ld r23, Z+
+	eor r22, r23
+	st X+, r22                     ; a[i] ^= block[i]
+	dec r20                        ; NOTE(review): bs == 0 would run this loop 256 times
+	brne 10b                       ; done: fall through into keccak_f1600, r25:r24 still = ctx
+
+	.global keccak_f1600
+keccak_f1600:                      ; in: r25:r24 = state a; runs the round loop starting at 5:
+	push_range 2, 9                ; r9 joins the saved range: it holds the round counter
 	push r16
 	push_range 28, 29
 
+	stack_alloc_large 200, r26, r27 ; 200-byte scratch on the stack; presumably replaces the former b argument
+	adiw XL, 1                     ; NOTE(review): assumes the macro leaves X one below the buffer - confirm
+
+	clr r9                         ; round counter = 0
+5:                                 ; top of the round loop (the iota code jumps back here)
 	movw r30, r24 ; Z = a
-	movw r26, r22 ; X = b
 
 	ldi r19, 5
 10:
@@ -255,7 +434,7 @@ keccak_theta:
 ; ret
 
 /*
-	rho & pi
+	-- rho & pi --
 	for(i = 0; i < 5; ++i){
 		for(j = 0; j < 5; ++j){
 			b[(2 * i + 3 * j) % 5][j] =
@@ -305,7 +484,34 @@ keccak_theta:
 	movw ZL, r2
 	lpm r16, Z+
 	movw r2, ZL
-	call rotate64left_code
+rotate64left_code:                 ; inlined: rotate r25:r18 as described by the code byte in r16
+	ldi r30, pm_lo8(byte_rot_jmp_table)
+	ldi r31, pm_hi8(byte_rot_jmp_table)
+	mov r0, r16                    ; keep the full code while the byte count is extracted
+	andi r16, 0x70
+	swap r16                       ; r16 = code bits 6..4 = byte-rotate count 0..7
+	add r30, r16
+	adc r31, r1                    ; relies on r1 being zero at this point
+	mov r16, r0
+	andi r16, 0x0f                 ; r16 = bit count and direction bit
+	icall                          ; run the selected byte rotation
+	clr r1                         ; the movw-based byte rotations overwrite r1; restore zero
+rotate64_nbit_autodir:
+	lsr r16                        ; C = direction bit, r16 = number of single-bit steps
+	brcc rotate64_nbit_left
+rotate64_nbit_right:
+	ldi r30, pm_lo8(rotate64_1bit_right)
+	ldi r31, pm_hi8(rotate64_1bit_right)
+	rjmp icall_r16_times
+rotate64_nbit_left:
+	ldi r30, pm_lo8(rotate64_1bit_left)
+	ldi r31, pm_hi8(rotate64_1bit_left)
+icall_r16_times:                   ; call the routine at Z r16 times
+1:	dec r16
+	brmi 2f                        ; counter went below zero: all steps done
+	icall
+	rjmp 1b
+2:
 	movw ZL, r4
 	lpm r16, Z+
 	movw r4, ZL
@@ -350,6 +556,7 @@ keccak_theta:
 
 ; Z points at b
 	movw XL, ZL
+	movw r4, ZL                    ; keep the address of b in r5:r4; the round loop reloads X from it
 	adiw XL, 8
 	adiw ZL, 16
 	movw YL, r2
@@ -366,8 +573,51 @@ keccak_theta:
 	adiw 
ZL, 5 * 8
 	dec r18
 	brne 10b
+
+	/* -- iota -- */
+	ldi r30, lo8(keccak_rc_comp)   ; byte address, as needed by lpm
+	ldi r31, hi8(keccak_rc_comp)
+	add r30, r9                    ; index the table by the round counter
+	adc r31, __zero_reg__
+	lpm r20, Z+                    ; r20 = compressed round constant for this round
+	movw YL, r2                    ; NOTE(review): r3:r2 presumably points at lane a[0][0] - confirm
+	ldi r21, 0x80
+	bst r20, 6                     ; code bit 6 set: flip the top bit of lane byte 7
+	brtc 10f
+	ldd r22, Y+7
+	eor r22, r21
+	std Y+7, r22
+10:
+	bst r20, 5                     ; code bit 5 set: flip the top bit of lane byte 3
+	brtc 10f
+	ldd r22, Y+3
+	eor r22, r21
+	std Y+3, r22
+10:
+	bst r20, 4                     ; code bit 4 set: flip the top bit of lane byte 1
+	brtc 10f
+	ldd r22, Y+1
+	eor r22, r21
+	std Y+1, r22
+10:
+	andi r20, 0x8f                 ; remaining code bits 7 and 3..0 go straight into lane byte 0
+	ld r22, Y
+	eor r22, r20
+	st Y, r22
+
+	inc r9                         ; next round
+	mov r16, r9                    ; cpi needs a register in r16..r31
+	cpi r16, 24
+	breq 20f                       ; all 24 rounds done
+	movw r24, YL                   ; r25:r24 = Y, read as the state pointer at label 5
+	movw r26, r4                   ; X = r5:r4, saved earlier in this patch as the address of b
+	rjmp 5b
+20:
+
+	stack_free_large3 200          ; presumably releases the 200 bytes taken by stack_alloc_large - confirm
+
 	pop_range 28, 29
 	pop r16
-	pop_range 2, 8
+	pop_range 2, 9                 ; mirrors push_range 2, 9 in the prologue
 	ret
 