adding CS-Cipher

[avr-crypto-lib.git] / cscipher / cscipher_tiny_asm.S
diff --git a/cscipher/cscipher_tiny_asm.S b/cscipher/cscipher_tiny_asm.S

new file mode 100644 (file)

index 0000000..bba78f9
--- /dev/null
+++ b/cscipher/cscipher_tiny_asm.S
@@ -0,0 +1,399 @@
+/* cscipher_tiny_asm.S */
+/*
+    This file is part of the AVR-Crypto-Lib.
+    Copyright (C) 2006-2010 Daniel Otte (daniel.otte@rub.de)
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "avr-asm-macros.S"
+
+/*
+uint8_t p(uint8_t a){
+       a ^= pgm_read_byte(fg_table+(a&0xf))&0xf0;
+       a ^= pgm_read_byte(fg_table+(a>>4)) &0x0f;
+       a ^= pgm_read_byte(fg_table+(a&0xf))&0xf0;
+       return a;
+}
+*/
+
+/* 16-entry lookup table, read from program memory (lpm) by p().
+   p() indexes it with a 4-bit value and keeps either only the high
+   nibble (mask 0xF0) or only the low nibble (mask 0x0F) of the entry. */
+fg_table:
+.byte  0xfa, 0xd6, 0xb0, 0xb2, 0x7b, 0x5e, 0x71, 0x78
+.byte  0xed, 0xd4, 0xa5, 0xb3, 0xef, 0xdc, 0xe7, 0xf9
+
+/*
+ * p_step: one table step of p().
+ *   r24 ^= fg_table[nibble] & \mask
+ * where nibble is the high nibble of r24 if \use_high_nibble is
+ * non-zero and the low nibble otherwise.
+ * X must hold the address of fg_table; r1 must be zero.
+ * Clobbers r25 and Z.
+ */
+.macro p_step use_high_nibble, mask
+       movw r30, r26           /* Z = fg_table */
+       mov r25, r24
+.if \use_high_nibble
+       swap r25
+.endif
+       andi r25, 0x0F          /* r25 = table index 0..15 */
+       add r30, r25
+       adc r31, r1
+       lpm r25, Z
+       andi r25, \mask
+       eor r24, r25
+.endm
+
+/*
+ * uint8_t p(uint8_t a)
+ *   in:  r24 = a
+ *   out: r24 = result, r25 = 0
+ * Clobbers r26, r27 (left holding the address of fg_table), r30, r31.
+ * Expects r1 == 0.
+ */
+.global p
+p:
+       ldi r26, lo8(fg_table)
+       ldi r27, hi8(fg_table)
+       p_step 0, 0xF0          /* a ^= fg_table[a & 0xf] & 0xf0 */
+       p_step 1, 0x0F          /* a ^= fg_table[a >> 4]  & 0x0f */
+       p_step 0, 0xF0          /* a ^= fg_table[a & 0xf] & 0xf0 */
+       clr r25
+       ret
+
+/* Key schedule constants in program memory: 9 groups of 8 bytes.
+   cscipher_init xors the group at offset 8*CNT into its tmp buffer
+   (via memxor_P) during main-loop pass CNT = 0..8. */
+ks_const:
+.byte  0x29,0x0d,0x61,0x40,0x9c,0xeb,0x9e,0x8f
+.byte  0x1f,0x85,0x5f,0x58,0x5b,0x01,0x39,0x86
+.byte  0x97,0x2e,0xd7,0xd6,0x35,0xae,0x17,0x16
+.byte  0x21,0xb6,0x69,0x4e,0xa5,0x72,0x87,0x08
+.byte  0x3c,0x18,0xe6,0xe7,0xfa,0xad,0xb8,0x89
+.byte  0xb7,0x00,0xf7,0x6f,0x73,0x84,0x11,0x63
+.byte  0x3f,0x96,0x7f,0x6e,0xbf,0x14,0x9d,0xac
+.byte  0xa4,0x0e,0x7e,0xf6,0x20,0x4a,0x62,0x30
+.byte  0x03,0xc5,0x4b,0x5a,0x46,0xa3,0x44,0x65
+
+/*
+ * cscipher_init: expands a 16-byte key into 9 round keys of 8 bytes each.
+ *   in:  r25:r24 = pointer to the key (16 bytes are read)
+ *        r23:r22 = pointer to the context (9*8 = 72 bytes are written)
+ * (presumably void cscipher_init(const void* key, cscipher_ctx_t* ctx);
+ *  TODO confirm against the C header)
+ *
+ * Stack frame addressed through Y: tmp_key[16] followed by tmp[8].
+ * r17 (CNT) and Y are saved on entry and restored on exit.
+ * Expects r1 == 0.
+ *
+ * NOTE(review): CTX_0/CTX_1 (r18/r19) stay live across `call memxor_P`.
+ * This only works if memxor_P leaves r18/r19 untouched, which is not
+ * visible in this file -- confirm against the memxor_P implementation.
+ */
+CTX_0 = 18
+CTX_1 = 19
+CNT   = 17
+.global cscipher_init
+cscipher_init:
+       push CNT
+       push_range 28, 29
+       stack_alloc 24, 28, 29
+       adiw r28, 1             /* Y = tmp_key (assumes stack_alloc left Y = SP; TODO confirm) */
+       movw r30, r24           /* Z = key */
+       movw CTX_0, r22         /* CTX = output position in the context */
+       /* copy key to local tmp_key */
+       ldi r22, 16
+10: ld r23, Z+
+       st Y+, r23
+       dec r22
+       brne 10b
+       sbiw r28, 16            /* Y = tmp_key */
+       ldi CNT, 0xff           /* becomes 0 on the first inc below */
+10: /* main loop */
+       /* runs for CNT = 0..8; the label 10 is reused here, the rjmp 10b at
+          the bottom binds to this (nearest preceding) definition */
+       inc CNT
+       /* copy part of tmp_key to tmp */
+       /* even CNT: tmp = tmp_key[0..7], odd CNT: tmp = tmp_key[8..15] */
+       ldi r23, 8
+11:    ldd r22, Y+0
+       sbrc CNT, 0
+       ldd r22, Y+8
+       std Y+16, r22
+       adiw r28, 1
+       dec r23
+       brne 11b
+       adiw r28, 8 /* Y points at tmp */
+       /* xor ks constant into tmp */
+       /* memxor_P(tmp, ks_const + 8*CNT, 8) */
+       movw r24, r28
+       ldi r22, lo8(ks_const)
+       ldi r23, hi8(ks_const)
+       mov r21, CNT
+       swap r21                /* CNT * 16 (CNT <= 8) */
+       lsr r21                 /* CNT * 8 */
+       add r22, r21
+       adc r23, r1
+       clr r21
+       ldi r20, 8              /* r21:r20 = length 8 */
+       call memxor_P
+       /* do P transformation */
+       /* tmp[k] = p(tmp[k]) for k = 0..7; p leaves r22 untouched */
+       ldi r22, 8
+20:    ld r24, Y
+       rcall p
+       st Y+, r24
+       dec r22
+       brne 20b
+       sbiw r28, 8 /* Y points at tmp */
+       /* X = the half of tmp_key that was NOT copied to tmp:
+          tmp_key+8 for even CNT, tmp_key+0 for odd CNT */
+       movw r26, r28
+       sbiw r26, 8
+       sbrc CNT, 0
+       sbiw r26, 8
+       /* do T transformation */
+       /* For each of 8 output bytes: shift all 8 tmp bytes left by one
+          and collect the bits shifted out into r21 (bit from tmp[0] ends
+          up as MSB). rol r24 also shifts the previous carry into bit 0 of
+          the tmp byte; such a bit cannot reach bit 7 within these 8
+          passes, so it is never collected. */
+       movw r30, CTX_0
+       ldi r22, 8
+30:    ldi r23, 8
+35:    ld r24, Y
+       rol r24
+       rol r21
+       st Y+, r24
+       dec r23
+       brne 35b
+       sbiw r28, 8 /* Y points at tmp */
+       ld r24, X
+       eor r21, r24
+       st X+, r21              /* update the other half of tmp_key */
+       st Z+, r21              /* and emit the same byte as round key byte */
+       dec r22
+       brne 30b
+       sbiw r28, 16 /* Y points at tmp_key (again) */
+       movw CTX_0, r30         /* remember where the next round key goes */
+       sbrs CNT, 3             /* done once CNT == 8 */
+       rjmp 10b
+       stack_free 24
+       pop_range 28, 29
+       pop CNT
+       ret
+
+
+/* Two 8-byte round constants in program memory (offsets 0 and 8).
+   cscipher_enc reads them with lpm; cscipher_dec passes them to memxor_P. */
+round_const:
+.byte  0xb7, 0xe1, 0x51, 0x62, 0x8a, 0xed, 0x2a, 0x6a
+.byte  0xbf, 0x71, 0x58, 0x80, 0x9c, 0xf4, 0xf3, 0xc7
+
+/*
+void cscipher_enc(void* buffer, const cscipher_ctx_t* ctx){
+       uint8_t i,j,k;
+       uint8_t tmp[8];
+       for(i=0; i<8; ++i){
+               for(j=0; j<3; ++j){
+                       if(j==0){
+                               memxor(buffer, ctx->keys[i], 8);
+                       }else{
+                               memxor_P(buffer, round_const+((j==1)?0:8), 8);
+                       }
+                       for(k=0; k<4; ++k){
+                               ((uint16_t*)tmp)[k] = m(((uint16_t*)buffer)[k]);
+                       }
+                       for(k=0; k<4; ++k){
+                               ((uint8_t*)buffer)[k]   = tmp[2*k];
+                               ((uint8_t*)buffer)[k+4] = tmp[2*k+1];
+                       }
+               }
+       }
+       memxor(buffer, ctx->keys[8], 8);
+}
+*/
+/*
+ * void cscipher_enc(void* buffer, const cscipher_ctx_t* ctx)
+ *
+ * Encrypts the 8-byte block at buffer in place.
+ *   in:  r25:r24 = buffer
+ *        r23:r22 = ctx (round keys, 8 bytes each, 9 keys are used)
+ * r2..r17 and Y are saved on entry and restored before leaving.
+ * Expects r1 == 0. Ends with a tail call to memxor.
+ *
+ * The results of the m transformation are stored through X into the
+ * registers TMP_0..TMP_7, i.e. X is loaded with data address TMP_0 (2).
+ * NOTE(review): this needs a core whose register file is mapped into
+ * data space at 0x00..0x1f -- confirm for the targeted devices.
+ */
+TMP_0 =  2
+TMP_1 =  3
+TMP_2 =  4
+TMP_3 =  5
+TMP_4 =  6
+TMP_5 =  7
+TMP_6 =  8
+TMP_7 =  9
+CTX_0 = 10
+CTX_1 = 11
+CNT_0 = 16
+CNT_1 = 17
+DST_0 = 12
+DST_1 = 13
+SRC_0 = 14
+SRC_1 = 15
+.global cscipher_enc
+cscipher_enc:
+       push_range 2, 17
+       push_range 28, 29
+       movw r28, r24           /* Y = buffer */
+       movw CTX_0, r22         /* CTX = current round key */
+       ldi CNT_0, 8            /* 8 rounds */
+       /* main loop */
+10: ldi CNT_1, 2               /* sub rounds: CNT_1 = 2, 1, 0 */
+       clt                     /* T clear: xor material is read from RAM */
+       /* sub loop */
+20: ldi r27, 0
+       ldi r26, TMP_0
+       movw DST_0, r26         /* DST = data address of TMP_0 */
+       ldi r30, lo8(round_const)
+       ldi r31, hi8(round_const)
+       sbrs CNT_1, 0
+       adiw r30, 8             /* CNT_1 == 0: second round constant */
+       sbrc CNT_1, 1
+       movw r30, CTX_0         /* CNT_1 == 2: round key instead */
+       movw SRC_0, r30
+       ldi r21, 4              /* four 16-bit words per block */
+       /* xor and m transformation */
+25:    ld r24, Y+
+       ld r25, Y+
+       movw r30, SRC_0
+       brts 30f
+       ld r22, Z+              /* round key bytes from RAM */
+       ld r23, Z+
+       rjmp 35f
+30:    lpm r22, Z+             /* round constant bytes from flash */
+       lpm r23, Z+
+35:
+       movw SRC_0, r30
+       eor r24, r22
+       eor r25, r23
+
+       movw r22, r24           /* r22 = xl, r23 = xr */
+       mov r25, r22
+       /* rotate xl left by one bit: lsl (not rol) so that the result
+          does not depend on the carry left by earlier instructions */
+       lsl r25
+       adc r25, r1
+       mov r22, r25
+       andi r22, 0x55
+       eor r22, r24
+       eor r22, r23            /* r22 = (rotl(xl) & 0x55) ^ xl ^ xr */
+       eor r23, r25            /* r23 = rotl(xl) ^ xr */
+       mov r24, r23
+       rcall p
+       mov r23, r24            /* r23 = p(rotl(xl) ^ xr) */
+       mov r24, r22
+       rcall p                 /* r24 = p((rotl(xl) & 0x55) ^ xl ^ xr) */
+
+       movw r26, DST_0         /* p clobbered X, reload the store pointer */
+       st X+, r24
+       st X+, r23
+       movw DST_0, r26
+       dec r21
+       brne 25b
+       sbrc CNT_1, 1
+       movw CTX_0, SRC_0       /* key consumed: CTX = next round key */
+       sbiw r28, 8             /* Y = buffer again */
+       /* buffer[k] = tmp[2*k], buffer[k+4] = tmp[2*k+1] */
+       std Y+0, TMP_0
+       std Y+4, TMP_1
+       std Y+1, TMP_2
+       std Y+5, TMP_3
+       std Y+2, TMP_4
+       std Y+6, TMP_5
+       std Y+3, TMP_6
+       std Y+7, TMP_7
+       set                     /* remaining sub rounds read from flash */
+       dec CNT_1
+       brpl 20b
+
+       dec CNT_0
+       brne 10b
+
+       /* final key addition: memxor(buffer, CTX, 8) */
+       movw r24, r28
+       movw r22, CTX_0
+       clr r21
+       ldi r20, 8
+
+       pop_range 28, 29
+       pop_range 2, 17
+       /* tail call; jmp instead of rjmp because memxor is an external
+          symbol that may be linked outside the +-4 KiB range of rjmp
+          (the rest of this file already uses call for it) */
+       jmp memxor
+
+/*
+void cscipher_dec(void* buffer, const cscipher_ctx_t* ctx){
+       uint8_t i=7,j,k;
+       uint8_t tmp[8];
+       memxor(buffer, ctx->keys[8], 8);
+       do{
+               for(j=0; j<3; ++j){
+                       for(k=0; k<4; ++k){
+                               tmp[2*k]   = ((uint8_t*)buffer)[k];
+                               tmp[2*k+1] = ((uint8_t*)buffer)[4+k];
+                       }
+                       for(k=0; k<4; ++k){
+                               ((uint16_t*)buffer)[k] = m_inv(((uint16_t*)tmp)[k]);
+                       }
+                       if(j==2){
+                               memxor(buffer, ctx->keys[i], 8);
+                       }else{
+                               memxor_P(buffer, round_const+((j==1)?0:8), 8);
+                       }
+
+               }
+       }while(i--);
+}
+
+*/
+/*
+ * void cscipher_dec(void* buffer, const cscipher_ctx_t* ctx)
+ *
+ * Decrypts the 8-byte block at buffer in place.
+ *   in:  r25:r24 = buffer
+ *        r23:r22 = ctx (round keys, 8 bytes each; the key at offset 64
+ *                  is used first, then the keys at offsets 56 down to 0)
+ * r2..r17 and Y are saved on entry and restored on exit.
+ * Expects r1 == 0.
+ *
+ * Uses the TMP_x, CTX_x, CNT_x and DST_x register assignments defined
+ * for cscipher_enc. TMP_0..TMP_7 are written through X (data address 2).
+ * NOTE(review): this needs a core whose register file is mapped into
+ * data space at 0x00..0x1f -- confirm for the targeted devices.
+ */
+.global cscipher_dec
+cscipher_dec:
+       push_range 2, 17
+       push_range 28, 29
+       movw r28, r24           /* Y = buffer */
+       movw r26, r22
+       adiw r26, 7*8
+       adiw r26, 8             /* X = ctx + 64: the ninth round key */
+       movw CTX_0, r26
+       movw r22, r26
+       clr r21
+       ldi r20, 8
+       call memxor             /* memxor(buffer, ctx + 64, 8); r24 is still buffer */
+       ldi CNT_0, 7            /* rounds: CNT_0 = 7 .. 0 */
+10:
+       ldi CNT_1, 3            /* sub rounds: CNT_1 = 3, 2, 1 */
+20:
+       clr r27
+       ldi r26, TMP_0
+       movw DST_0, r26         /* DST = data address of TMP_0 */
+       ldi r21, 4              /* four byte pairs per block */
+30:
+       /* undo the byte interleave: pair k is buffer[k], buffer[k+4] */
+       ldd r23, Y+4
+       ld  r24, Y+
+/* m_inv transformation */
+       rcall p                 /* p leaves r23 untouched */
+       mov r22, r24            /* r22 = p(buffer[k]) */
+       mov r24, r23
+       rcall p                 /* r24 = p(buffer[k+4]) */
+       eor r22, r24
+       mov r25, r24
+       mov r24, r22
+       /* rotate left by one bit: lsl (not rol) so that the result does
+          not depend on the carry left by earlier instructions */
+       lsl r24
+       adc r24, r1
+       andi r24, 0xaa
+       eor r24, r22            /* r24 = first output byte of the pair */
+       mov r22, r24
+       lsl r22                 /* rotate left by one bit, see above */
+       adc r22, r1
+       eor r25, r22            /* r25 = second output byte of the pair */
+
+       movw r26, DST_0         /* p clobbered X, reload the store pointer */
+       st X+, r24
+       st X+, r25
+       movw DST_0, r26
+       dec r21
+       brne 30b
+       sbiw r28, 4             /* Y = buffer again */
+       std Y+0, TMP_0
+       std Y+1, TMP_1
+       std Y+2, TMP_2
+       std Y+3, TMP_3
+       std Y+4, TMP_4
+       std Y+5, TMP_5
+       std Y+6, TMP_6
+       std Y+7, TMP_7
+       /* common arguments for memxor / memxor_P: dest = buffer, length = 8 */
+       movw r24, r28
+       clr r21
+       ldi r20, 8
+       sbrc CNT_1, 1
+       rjmp 40f                /* CNT_1 == 3 or 2: round constant */
+       /* CNT_1 == 1: step back to the previous round key and xor it in */
+       movw r26, CTX_0
+       sbiw r26, 8
+       movw CTX_0, r26
+       movw r22, r26
+       call memxor
+       rjmp 45f
+40:
+       ldi r26, lo8(round_const)
+       ldi r27, hi8(round_const)
+       sbrc CNT_1, 0
+       adiw r26, 8             /* CNT_1 == 3: second round constant */
+       movw r22, r26
+       call memxor_P
+45:
+
+       dec CNT_1
+       brne 20b
+       dec CNT_0
+       brpl 10b
+       pop_range 28, 29
+       pop_range 2, 17
+       ret