X-Git-Url: https://git.cryptolib.org/?p=avr-crypto-lib.git;a=blobdiff_plain;f=keccak%2Fkeccak-asm.S;h=60a78ae232b8358fa37da25db194bbc9e44b2c41;hp=95d8aa6289ff484ec781862a4f1d2208e0245f16;hb=4b5da1dc27a791b5c448274a3db09cd035b33493;hpb=7a138263197006d74661fbe446f33d6014777245 diff --git a/keccak/keccak-asm.S b/keccak/keccak-asm.S index 95d8aa6..60a78ae 100644 --- a/keccak/keccak-asm.S +++ b/keccak/keccak-asm.S @@ -1,7 +1,7 @@ /* keccac-asm.S */ /* This file is part of the AVR-Crypto-Lib. - Copyright (C) 2012 Daniel Otte (daniel.otte@rub.de) + Copyright (C) 2006-2015 Daniel Otte (bg@nerilex.org) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,7 +18,7 @@ */ /** * \file keccak-asm.S - * \email daniel.otte@rub.de + * \email bg@nerilex.org * \author Daniel Otte * \date 2012-12-16 * \license GPLv3 or later @@ -31,7 +31,23 @@ .equ __zero_reg__, 1 -.global rho_pi_idx_table +/* +typedef struct{ + uint64_t a[5][5]; + uint16_t r; + uint8_t bs; +} keccak_ctx_t; +*/ + .struct 0 +ctx_a: + .struct ctx_a + 8 * 5 * 5 +ctx_r: + .struct ctx_r + 2 +ctx_bs: + + .section .text + + .global rho_pi_idx_table rho_pi_idx_table: .irp i, 0, 1, 2, 3, 4 .irp j, 0, 1, 2, 3, 4 @@ -39,7 +55,145 @@ rho_pi_idx_table: .endr .endr -.align 2 +/* +#define ROT_BIT(a) (( (a) <= 4) ? ((a) << 1) : (0x01 | ((8 - (a)) << 1))) +#define ROT_CODE(a) ((((a) / 8 + ((((a) % 8) > 4) ? 1 : 0)) << 4) | ROT_BIT(((a) % 8))) + +const uint8_t keccak_rotate_codes[5][5] PROGMEM = { + { ROT_CODE( 0), ROT_CODE( 1), ROT_CODE(62), ROT_CODE(28), ROT_CODE(27) }, + { ROT_CODE(36), ROT_CODE(44), ROT_CODE( 6), ROT_CODE(55), ROT_CODE(20) }, + { ROT_CODE( 3), ROT_CODE(10), ROT_CODE(43), ROT_CODE(25), ROT_CODE(39) }, + { ROT_CODE(41), ROT_CODE(45), ROT_CODE(15), ROT_CODE(21), ROT_CODE( 8) }, + { ROT_CODE(18), ROT_CODE( 2), ROT_CODE(61), ROT_CODE(56), ROT_CODE(14) } +}; +*/ + +keccak_rotate_codes: +.byte 0x00, 0x02, 0x85, 0x38, 0x36 +.byte 0x48, 0x58, 0x15, 0x73, 0x28 +.byte 0x06, 0x14, 0x56, 0x32, 0x53 +.byte 0x52, 0x67, 0x23, 0x37, 0x10 +.byte 0x24, 0x04, 0x87, 0x70, 0x25 + +keccak_rc_comp: +.byte 0x01, 0x92, 0xda, 0x70 +.byte 0x9b, 0x21, 0xf1, 0x59 +.byte 0x8a, 0x88, 0x39, 0x2a +.byte 0xbb, 0xcb, 0xd9, 0x53 +.byte 0x52, 0xc0, 0x1a, 0x6a +.byte 0xf1, 0xd0, 0x21, 0x78 + + .align 2 + +rotate64_1bit_left: + bst r25, 7 + rol r18 + rol r19 + rol r20 + rol r21 + rol r22 + rol r23 + rol r24 + rol r25 + bld r18, 0 + ret + +rotate64_1bit_right: + bst r18, 0 + ror r25 + ror r24 + ror r23 + ror r22 + ror r21 + ror r20 + ror r19 + ror r18 + bld r25, 7 + ret + +rotate64_1byte_left: + mov r0, r25 + mov r25, r24 + mov r24, r23 + mov r23, r22 + mov r22, r21 + mov r21, r20 + mov r20, r19 + mov r19, r18 + mov r18, r0 + ret + +rotate64_2byte_left: + movw r0, r24 + movw r24, r22 + movw r22, r20 + movw r20, r18 + movw r18, r0 + ret + +rotate64_3byte_left: + mov r0, r25 + mov r25, r22 + mov r22, r19 + mov r19, r24 + mov r24, r21 + mov r21, r18 + mov r18, r23 + mov r23, r20 + mov r20, r0 + ret + +rotate64_4byte_left: + movw r0, r24 + movw r24, r20 + movw r20, r0 + movw r0, r22 + movw r22, r18 + movw r18, r0 + ret + +rotate64_5byte_left: + mov r0, r25 + mov r25, r20 + mov r20, r23 + mov r23, r18 + mov r18, r21 + mov r21, r24 + mov r24, r19 + mov r19, r22 + mov r22, r0 + ret + +rotate64_6byte_left: + movw r0, r18 + movw r18, r20 + movw r20, r22 + movw r22, r24 + movw r24, r0 + ret + +rotate64_7byte_left: + mov r0, r18 + mov r18, r19 + mov r19, r20 + mov r20, r21 + mov r21, r22 + mov r22, r23 + mov r23, r24 + mov r24, r25 + mov r25, r0 + +byte_rot_jmp_table: + ret + rjmp rotate64_1byte_left + rjmp rotate64_2byte_left + rjmp rotate64_3byte_left + rjmp rotate64_4byte_left + rjmp rotate64_5byte_left + rjmp rotate64_6byte_left + rjmp rotate64_7byte_left + + /* void keccak_theta (uint64_t *a, uint64_t *b){ // uint64_t b[5][5]; @@ -135,16 +289,68 @@ theta_2b: clr __zero_reg__ ret +; a[i][j] = b[i][j] ^ ((~(b[i][(j + 1) % 5])) & (b[i][(j + 2) % 5])); + +/********************************************* + * chi_step + ********************************************* + input: + Y = a; uint8t *a; + X = b; uint8t *b; + Z = c; uint8t *c; + output: + a[0..7] ^= ~b[0..7] & c[0..7] + X += 8 + Y += 8 + Z += 8 + r16 = 0 + trash r21, r22, r23 +*/ +chi_step: + ldi r16, 8 +10: + ld r21, Y + ld r22, X+ + ld r23, Z+ + com r22 + and r22, r23 + eor r21, r22 + st Y+, r21 + dec r16 + brne 10b + ret -.global keccak_theta -keccak_theta: - push_range 2, 8 + .global keccak_nextBlock + .func keccak_nextBlock +keccak_nextBlock: + movw ZL, r24 + subi ZL, lo8(-ctx_bs) + sbci ZH, hi8(-ctx_bs) + ld r20, Z + movw XL, r24 + movw ZL, r22 +10: + ld r22, X + ld r23, Z+ + eor r22, r23 + st X+, r22 + dec r20 + brne 10b + .endfunc + + .global keccak_f1600 + .func keccak_f1600 +keccak_f1600: + push_range 2, 9 push r16 push_range 28, 29 + stack_alloc_large 200, r26, r27 + adiw XL, 1 + + clr r9 +5: movw r30, r24 ; Z = a - movw r26, r22 ; X = b - push_range 22, 25 ldi r19, 5 10: @@ -223,10 +429,8 @@ keccak_theta: subi XL, lo8(4 * 5 * 8 + 8) sbci XH, hi8(4 * 5 * 8 + 8) rcall theta_2b - -; ret /* - rho & pi + -- rho & pi -- for(i = 0; i < 5; ++i){ for(j = 0; j < 5; ++j){ b[(2 * i + 3 * j) % 5][j] = @@ -236,8 +440,8 @@ keccak_theta: -- or -- - const uint8_t* rot_code = (const uint8_t*)keccak_rotate_codes; - const uint8_t* idx_idx = (const uint8_t*)rho_pi_idx_table; + const uint8_t *rot_code = (const uint8_t*)keccak_rotate_codes; + const uint8_t *idx_idx = (const uint8_t*)rho_pi_idx_table; uint64_t *a_tmp = (uint64_t*)a; for(i = 0; i < 25; ++i){ *((uint64_t*)(((uint8_t*)b) + pgm_read_byte(idx_idx++))) = @@ -259,10 +463,10 @@ keccak_theta: ldi r16, 25 mov r8, r16 - pop YH - pop YL - pop B_REG_H - pop B_REG_L + sbiw r24, 5 * 8 + 1 + movw YL, r24 + sbiw XL, 8 + movw B_REG_L, XL 10: ld r18, Y+ @@ -276,7 +480,34 @@ keccak_theta: movw ZL, r2 lpm r16, Z+ movw r2, ZL - call rotate64left_code +rotate64left_code: + ldi r30, pm_lo8(byte_rot_jmp_table) + ldi r31, pm_hi8(byte_rot_jmp_table) + mov r0, r16 + andi r16, 0x70 + swap r16 + add r30, r16 + adc r31, r1 + mov r16, r0 + andi r16, 0x0f + icall + clr r1 +rotate64_nbit_autodir: + lsr r16 + brcc rotate64_nbit_left +rotate64_nbit_right: + ldi r30, pm_lo8(rotate64_1bit_right) + ldi r31, pm_hi8(rotate64_1bit_right) + rjmp icall_r16_times +rotate64_nbit_left: + ldi r30, pm_lo8(rotate64_1bit_left) + ldi r31, pm_hi8(rotate64_1bit_left) +icall_r16_times: +1: dec r16 + brmi 2f + icall + rjmp 1b +2: movw ZL, r4 lpm r16, Z+ movw r4, ZL @@ -294,9 +525,386 @@ keccak_theta: dec r8 brne 10b +/* + -- chi -- + for(i = 0; i < 5; ++i){ + a[i][0] ^= ((~(b[i][1])) & (b[i][2])); + a[i][1] ^= ((~(b[i][2])) & (b[i][3])); + a[i][2] ^= ((~(b[i][3])) & (b[i][4])); + a[i][3] ^= ((~(b[i][4])) & (b[i][0])); + a[i][4] ^= ((~(b[i][0])) & (b[i][1])); + + } +*/ + ; memcpy(a, b, 200) + ; X points at b + 32 + 8 = b + 40 = b[1][0] has to point to b[0][0] + ldi r16, 200 / 8 + sbiw XL, 5 * 8 + movw ZL, XL + subi YL, lo8(5 * 5 * 8) + sbci YH, hi8(5 * 5 * 8) + movw r2, YL +10: + .rept 8 + ld r22, X+ + st Y+, r22 + .endr + dec r16 + brne 10b + + ; Z points at b + movw XL, ZL + movw r4, ZL + adiw XL, 8 + adiw ZL, 16 + movw YL, r2 + ldi r18, 5 +10: + rcall chi_step + rcall chi_step + rcall chi_step + sbiw ZL, 5 * 8 + rcall chi_step + sbiw XL, 5 * 8 + rcall chi_step + adiw XL, 5 * 8 + adiw ZL, 5 * 8 + dec r18 + brne 10b + + /* -- iota -- */ + ldi r30, lo8(keccak_rc_comp) + ldi r31, hi8(keccak_rc_comp) + add r30, r9 + adc r31, __zero_reg__ + lpm r20, Z+ + movw YL, r2 + ldi r21, 0x80 + bst r20, 6 + brtc 10f + ldd r22, Y+7 + eor r22, r21 + std Y+7, r22 +10: + bst r20, 5 + brtc 10f + ldd r22, Y+3 + eor r22, r21 + std Y+3, r22 +10: + bst r20, 4 + brtc 10f + ldd r22, Y+1 + eor r22, r21 + std Y+1, r22 +10: + andi r20, 0x8f + ld r22, Y + eor r22, r20 + st Y, r22 + + inc r9 + mov r16, r9 + cpi r16, 24 + breq 20f + movw r24, YL + movw r26, r4 + rjmp 5b +20: + + stack_free_large3 200 pop_range 28, 29 pop r16 - pop_range 2, 8 + pop_range 2, 9 + ret + .endfunc + + + .global keccak224_ctx2hash + .func keccak224_ctx2hash +keccak224_ctx2hash: + movw r20, r22 + ldi r22, lo8(224) + ldi r23, hi8(224) + rjmp keccak_ctx2hash + .endfunc + + .global keccak384_ctx2hash + .func keccak384_ctx2hash +keccak384_ctx2hash: + movw r20, r22 + ldi r22, lo8(384) + ldi r23, hi8(384) + rjmp keccak_ctx2hash + .endfunc + .global keccak512_ctx2hash + .func keccak512_ctx2hash +keccak512_ctx2hash: + movw r20, r22 + ldi r22, lo8(512) + ldi r23, hi8(512) + rjmp keccak_ctx2hash + .endfunc + + .global keccak256_ctx2hash + .func keccak256_ctx2hash +keccak256_ctx2hash: + movw r20, r22 + ldi r22, lo8(256) + ldi r23, hi8(256) + .endfunc + +/* +void keccak_ctx2hash(void *dest, uint16_t length_b, keccak_ctx_t *ctx){ + while(length_b>=ctx->r){ + memcpy(dest, ctx->a, ctx->bs); + dest = (uint8_t*)dest + ctx->bs; + length_b -= ctx->r; + keccak_f1600(ctx->a); + } + memcpy(dest, ctx->a, (length_b+7)/8); +} +*/ + .global keccak_ctx2hash + .func keccak_ctx2hash +keccak_ctx2hash: + push_range 2, 10 + movw r4, r20 + movw r6, r24 + movw ZL, r20 + movw r8, r22 + subi ZL, lo8(-ctx_r) + sbci ZH, hi8(-ctx_r) + ld r2, Z+ + ld r3, Z+ + ldd r10, Z+3 ; load blocksize (in bytes) +10: + ; length_b = (r9:r8) ; r = (r3:r2) ; (H:L) + cp r2, r8 + cpc r3, r9 + brsh 40f + movw XL, r4 + movw ZL, r6 + mov r24, r10 +20: + ld r22, X+ + st Z+, r22 + dec r24 + brne 20b + movw r6, ZL + sub r8, r2 + sbc r9, r3 + movw r24, r4 + rcall keccak_f1600 + rjmp 10b +40: + movw XL, r4 + movw ZL, r6 + movw r24, r8 + adiw r24, 7 + lsr r25 + ror r24 + lsr r25 + ror r24 + lsr r25 + ror r24 + adiw r24, 0 + breq 99f +10: + ld r22, X+ + st Z+, r22 + sbiw r24, 1 + brne 10b +99: + pop_range 2, 10 ret + .endfunc + + + .global keccak224_init + .func keccak224_init +keccak224_init: + movw XL, r24 + ldi r24, lo8(1152) + ldi r25, hi8(1152) + rjmp keccak_init_1 + .endfunc + + .global keccak384_init + .func keccak384_init +keccak384_init: + movw XL, r24 + ldi r24, lo8( 832) + ldi r25, hi8( 832) + rjmp keccak_init_1 + .endfunc + + .global keccak512_init + .func keccak512_init +keccak512_init: + movw XL, r24 + ldi r24, lo8( 576) + ldi r25, hi8( 576) + rjmp keccak_init_1 + .endfunc + + .global keccak256_init + .func keccak256_init +keccak256_init: + movw r22, r24 + ldi r24, lo8(1088) + ldi r25, hi8(1088) + .endfunc +/* +void keccak_init(uint16_t r, keccak_ctx_t *ctx){ + memset(ctx->a, 0x00, 5 * 5 * 8); + ctx->r = r; + ctx->bs = (uint8_t)(r / 8); +} +*/ + .global keccak_init + .func keccak_init +keccak_init: + movw XL, r22 +keccak_init_1: + ldi r22, 200 +10: + st X+, __zero_reg__ + dec r22 + brne 10b + st X+, r24 + st X+, r25 + lsr r25 + ror r24 + lsr r25 + ror r24 + lsr r25 + ror r24 + st X+, r24 + ret + .endfunc + +/* +void keccak_lastBlock(keccak_ctx_t *ctx, const void *block, uint16_t length_b){ + uint8_t length_B; + uint8_t t; + while(length_b >= ctx->r){ + keccak_nextBlock(ctx, block); + block = (uint8_t*)block + ctx->bs; + length_b -= ctx->r; + } + length_B = length_b / 8; + memxor(ctx->a, block, length_B); + / * append 1 * / + if(length_b & 7){ + / * we have some single bits * / + t = ((uint8_t*)block)[length_B] >> (8 - (length_b & 7)); + t |= 0x01 << (length_b & 7); + }else{ + t = 0x01; + } + ctx->a[length_B] ^= t; + if(length_b == ctx->r - 1){ + keccak_f1600(ctx->a); + } + +*/ +.set length_b_l, 2 +.set length_b_h, 3 +.set pbs, 10 +.set pr_l, 8 +.set pr_h, 9 +.set ctx_l, 6 +.set ctx_h, 7 + + .global keccak_lastBlock + .func keccak_lastBlock +keccak_lastBlock: + push_range 2, 10 + movw r2, r20 + movw r4, r22 + movw r6, r24 + movw XL, r24 + subi XL, lo8(-ctx_r) + sbci XH, hi8(-ctx_r) + ld pr_l, X+ + ld pr_h, X+ + ld pbs, X +10: + cp length_b_l, pr_l + cpc length_b_h, pr_h + brlo 20f + movw r24, ctx_l + movw r22, r4 + rcall keccak_nextBlock + add r4, pbs + adc r5, __zero_reg__ + sub length_b_l, pr_l + sbc length_b_h, pr_h + rjmp 10b +20: + movw ZL, ctx_l + movw XL, r4 + movw r22, length_b_l + lsr r23 + ror r22 + lsr r23 + ror r22 + lsr r23 + ror r22 + mov r23, r22 + breq 20f +10: + ld r25, X+ + ld r24, Z + eor r24, r25 + st Z+, r24 + dec r23 + brne 10b +20: + ldi r25, 1 + mov r18, length_b_l + andi r18, 7 + breq 30f + /* we have trailing bits */ + mov r19, r18 + ld r24, X+ + subi r18, 8 + neg r18 +10: + lsr r24 + dec r18 + brne 10b +10: + lsl r25 + dec r19 + brne 10b + or r25, r24 +30: + ld r24, Z + eor r24, r25 + st Z, r24 + + movw r24, pr_l + sbiw r24, 1 + cp length_b_l, r24 + cpc length_b_h, r25 + brne 20f + movw r24, ctx_l + rcall keccak_f1600 +20: + movw XL, ctx_l + dec pbs + add XL, pbs + adc XH, __zero_reg__ + ld r24, X + ldi r25, 0x80 + eor r24, r25 + st X, r24 + movw r24, ctx_l + pop_range 2, 10 + rjmp keccak_f1600 + .endfunc +