]> git.cryptolib.org Git - avr-crypto-lib.git/blobdiff - keccak/keccak-asm.S
[keccak-asm] removing unnecessary c and d fields from context
[avr-crypto-lib.git] / keccak / keccak-asm.S
index 9d613f95042bdec0006cb4f1fb2d6ae8183fe195..7b7c6cf3144bb4374831c4cb5dbb1ecb9f66d420 100644 (file)
 
 .equ __zero_reg__, 1
 
-.global rho_pi_idx_table
+/*
+typedef struct{
+       uint64_t a[5][5];  // state words, byte offset ctx_a = 0
+       uint16_t r;        // byte offset ctx_r = 200 (no c field in the layout defined below)
+       uint8_t  bs;       // byte offset ctx_bs = 202 (no d field in the layout defined below)
+} keccak_ctx_t;
+*/
+       .struct 0                        ; absolute offsets into the context, no storage is emitted
+ctx_a:                                  ; 0: start of the 5 * 5 * 8 = 200 byte state array
+       .struct ctx_a + 8 * 5 * 5
+ctx_r:                                  ; 200: 16-bit field r (presumably the rate in bits - TODO confirm)
+       .struct ctx_r + 2
+ctx_bs:                                 ; 202: 8-bit field bs (block size in bytes)
+
+       .section .text
+
+       .global rho_pi_idx_table
 rho_pi_idx_table:
        .irp i, 0, 1, 2, 3, 4
                .irp j, 0, 1, 2, 3, 4
@@ -39,9 +55,36 @@ rho_pi_idx_table:
                .endr
        .endr
 
-.align 2
+/*
+#define ROT_BIT(a) (( (a) <= 4) ? ((a) << 1) : (0x01 | ((8 - (a)) << 1)))
+#define ROT_CODE(a) ((((a) / 8 + ((((a) % 8) > 4) ? 1 : 0)) << 4) | ROT_BIT(((a) % 8)))
+
+const uint8_t keccak_rotate_codes[5][5] PROGMEM = {
+        { ROT_CODE( 0), ROT_CODE( 1), ROT_CODE(62), ROT_CODE(28), ROT_CODE(27) },
+        { ROT_CODE(36), ROT_CODE(44), ROT_CODE( 6), ROT_CODE(55), ROT_CODE(20) },
+        { ROT_CODE( 3), ROT_CODE(10), ROT_CODE(43), ROT_CODE(25), ROT_CODE(39) },
+        { ROT_CODE(41), ROT_CODE(45), ROT_CODE(15), ROT_CODE(21), ROT_CODE( 8) },
+        { ROT_CODE(18), ROT_CODE( 2), ROT_CODE(61), ROT_CODE(56), ROT_CODE(14) }
+};
+*/
+
+keccak_rotate_codes:                    ; 5x5 ROT_CODE bytes: bits 6..4 = whole-byte left rotations (bit 7 is masked off, 8 == 0), bits 3..1 = number of 1-bit steps, bit 0 = 1 -> the 1-bit steps go right instead of left
+.byte  0x00, 0x02, 0x85, 0x38, 0x36     ; rotate left by  0,  1, 62, 28, 27
+.byte  0x48, 0x58, 0x15, 0x73, 0x28     ; rotate left by 36, 44,  6, 55, 20
+.byte  0x06, 0x14, 0x56, 0x32, 0x53     ; rotate left by  3, 10, 43, 25, 39
+.byte  0x52, 0x67, 0x23, 0x37, 0x10     ; rotate left by 41, 45, 15, 21,  8
+.byte  0x24, 0x04, 0x87, 0x70, 0x25     ; rotate left by 18,  2, 61, 56, 14
+
+keccak_rc_comp:                         ; 24 one-byte entries; NOTE(review): presumably the round constants in a compressed one-byte-per-round form - the decoding code is not visible here, confirm
+.byte  0x01, 0x92, 0xda, 0x70           ; entries  0.. 3
+.byte  0x9b, 0x21, 0xf1, 0x59           ; entries  4.. 7
+.byte  0x8a, 0x88, 0x39, 0x2a           ; entries  8..11
+.byte  0xbb, 0xcb, 0xd9, 0x53           ; entries 12..15
+.byte  0x52, 0xc0, 0x1a, 0x6a           ; entries 16..19
+.byte  0xf1, 0xd0, 0x21, 0x78           ; entries 20..23
+
+       .align 2
 
-.global rotate64_1bit_left
 rotate64_1bit_left:
        bst r25, 7
        rol r18
@@ -55,7 +98,6 @@ rotate64_1bit_left:
        bld r18, 0
        ret
 
-.global rotate64_1bit_right
 rotate64_1bit_right:
        bst r18, 0
        ror r25
@@ -69,27 +111,6 @@ rotate64_1bit_right:
        bld r25, 7
        ret
 
-.global rotate64_nbit_autodir
-rotate64_nbit_autodir:
-       lsr r16
-       brcc rotate64_nbit_left
-.global rotate64_nbit_right
-rotate64_nbit_right:
-       ldi r30, pm_lo8(rotate64_1bit_right)
-       ldi r31, pm_hi8(rotate64_1bit_right)
-       rjmp icall_r16_times
-.global rotate64_nbit_left
-rotate64_nbit_left:
-       ldi r30, pm_lo8(rotate64_1bit_left)
-       ldi r31, pm_hi8(rotate64_1bit_left)
-icall_r16_times:
-1:     dec r16
-       brmi 2f
-       icall
-       rjmp 1b
-2:
-    ret
-
 rotate64_1byte_left:
        mov r0, r25
        mov r25, r24
@@ -161,8 +182,6 @@ rotate64_7byte_left:
        mov r23, r24
        mov r24, r25
        mov r25, r0
-       ret
-
 
 byte_rot_jmp_table:
        ret
@@ -174,21 +193,6 @@ byte_rot_jmp_table:
        rjmp rotate64_6byte_left
        rjmp rotate64_7byte_left
 
-.global rotate64left_code
-rotate64left_code:
-       ldi r30, pm_lo8(byte_rot_jmp_table)
-       ldi r31, pm_hi8(byte_rot_jmp_table)
-       mov r0, r16
-       andi r16, 0x70
-       swap r16
-       add r30, r16
-       adc r31, r1
-       mov r16, r0
-       andi r16, 0x0f
-       icall
-       clr r1
-       rjmp rotate64_nbit_autodir
-
 
 /*
        void keccak_theta (uint64_t *a, uint64_t *b){
@@ -316,7 +320,26 @@ chi_step:
        brne 10b
        ret
 
-.global keccak_f1600
+       .global keccak_nextBlock
+       .func keccak_nextBlock
+keccak_nextBlock:                       ; in: r25:r24 = pointer to the context, r23:r22 = pointer to the input block; XORs bs block bytes into the start of the context
+       movw ZL, r24
+       subi ZL, lo8(-ctx_bs)            ; Z = context + ctx_bs (subtracting the negated offset adds it)
+       sbci ZH, hi8(-ctx_bs)
+       ld r20, Z                        ; r20 = bs = number of bytes to absorb; a stored 0 would make the loop below run 256 times
+       movw XL, r24                     ; X = start of the context (ctx_a is at offset 0)
+       movw ZL, r22                     ; Z = input block
+10:
+       ld r22, X                        ; state byte (X is advanced by the store below)
+       ld r23, Z+                       ; block byte
+       eor r22, r23
+       st X+, r22                       ; state byte ^= block byte
+       dec r20
+       brne 10b
+       .endfunc                         ; NOTE(review): no ret - execution falls through into keccak_f1600 with r25:r24 still holding the context/state pointer; presumably intentional, confirm
+
+       .global keccak_f1600
+       .func keccak_f1600
 keccak_f1600:
        push_range 2, 9
        push r16
@@ -409,7 +432,7 @@ keccak_f1600:
 
 ;      ret
 /*
-  rho & pi
+   -- rho & pi --
        for(i = 0; i < 5; ++i){
                for(j = 0; j < 5; ++j){
                        b[(2 * i + 3 * j) % 5][j] =
@@ -459,7 +482,34 @@ keccak_f1600:
        movw ZL, r2
        lpm r16, Z+
        movw r2, ZL
-       rcall rotate64left_code
+rotate64left_code:
+       ldi r30, pm_lo8(byte_rot_jmp_table)
+       ldi r31, pm_hi8(byte_rot_jmp_table)
+       mov r0, r16
+       andi r16, 0x70
+       swap r16
+       add r30, r16
+       adc r31, r1
+       mov r16, r0
+       andi r16, 0x0f
+       icall
+       clr r1
+rotate64_nbit_autodir:
+       lsr r16
+       brcc rotate64_nbit_left
+rotate64_nbit_right:
+       ldi r30, pm_lo8(rotate64_1bit_right)
+       ldi r31, pm_hi8(rotate64_1bit_right)
+       rjmp icall_r16_times
+rotate64_nbit_left:
+       ldi r30, pm_lo8(rotate64_1bit_left)
+       ldi r31, pm_hi8(rotate64_1bit_left)
+icall_r16_times:
+1:     dec r16
+       brmi 2f
+       icall
+       rjmp 1b
+2:
        movw ZL, r4
        lpm r16, Z+
        movw r4, ZL
@@ -490,15 +540,17 @@ keccak_f1600:
 */
        ; memcpy(a, b, 200)
        ; X points at b + 32 + 8 = b + 40 = b[1][0] has to point to b[0][0]
-       ldi r16, 200
+       ldi r16, 200 / 8
        sbiw XL, 5 * 8
        movw ZL, XL
        subi YL, lo8(5 * 5 * 8)
        sbci YH, hi8(5 * 5 * 8)
        movw r2, YL
 10:
+       .rept 8
        ld r22, X+
        st Y+, r22
+       .endr
        dec r16
        brne 10b
 
@@ -567,5 +619,73 @@ keccak_f1600:
        pop_range 28, 29
        pop r16
        pop_range 2, 9
+       ret
+       .endfunc
 
+/*
+void keccak_ctx2hash(void* dest, uint16_t length_b, keccak_ctx_t* ctx){
+       while(length_b>=ctx->r){
+               memcpy(dest, ctx->a, ctx->bs);
+               dest = (uint8_t*)dest + ctx->bs;
+               length_b -= ctx->r;
+               keccak_f1600(ctx->a);
+       }
+       memcpy(dest, ctx->a, (length_b+7)/8);
+}
+*/
+;      .global keccak_ctx2hash          ; entry point is still disabled; enable these three lines together
+;      .func keccak_ctx2hash            ; in: r25:r24 = dest, r23:r22 = length_b (bits), r21:r20 = ctx
+;keccak_ctx2hash:                       ; copies bs bytes per squeezed block to dest, then the remaining (length_b+7)/8 bytes
+       push_range 2, 10
+       movw r4, r20                     ; r5:r4 = ctx (start of the state, ctx_a is at offset 0)
+       movw r6, r24                     ; r7:r6 = dest write pointer
+       movw ZL, r20                     ; r and bs live in the context, so address them through ctx (not dest)
+       movw r8, r22                     ; r9:r8 = length_b, remaining output length in bits
+       subi ZL, lo8(-ctx_r)             ; Z = ctx + ctx_r
+       sbci ZH, hi8(-ctx_r)
+       ld r2, Z+                        ; r3:r2 = ctx->r
+       ld r3, Z+
+       ld r10, Z ; load blocksize (in bytes); Z is already at ctx_bs = ctx_r + 2 after the two post-increments
+10:
+       ; length_b = (r9:r8) ; r = (r3:r2) ; (H:L)
+       cp  r8, r2                       ; unsigned 16-bit compare length_b - r
+       cpc r9, r3
+       ; while(length_b >= r): leave the loop as soon as length_b < r
+       brlo 40f
+       movw XL, r4                      ; X = state
+       movw ZL, r6                      ; Z = dest
+       mov r24, r10                     ; r24 = bs bytes to copy
+20:
+       ld r22, X+
+       st Z+, r22
+       dec r24
+       brne 20b
+       movw r6, ZL                      ; dest += bs
+       sub r8, r2                       ; length_b -= r
+       sbc r9, r3
+       movw r24, r4                     ; argument for keccak_f1600: pointer to the state
+       rcall keccak_f1600
+       rjmp 10b
+40:
+       movw XL, r4                      ; X = state
+       movw ZL, r6                      ; Z = dest
+       movw r24, r8
+       adiw r24, 7                      ; r25:r24 = (length_b + 7) / 8 = remaining bytes, rounded up
+       lsr r25
+       ror r24
+       lsr r25
+       ror r24
+       lsr r25
+       ror r24
+       adiw r24, 0                      ; set the Z flag from the full 16-bit count
+       breq 99f                         ; nothing left to copy
+10:
+       ld r22, X+
+       st Z+, r22
+       sbiw r24, 1
+       brne 10b
+99:
+       pop_range 2, 10
        ret
+;      .endfunc
+