--- /dev/null
+/* cscipher_tiny_asm.S */
+/*
+ This file is part of the AVR-Crypto-Lib.
+ Copyright (C) 2006-2010 Daniel Otte (daniel.otte@rub.de)
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "avr-asm-macros.S"
+
+/*
+uint8_t p(uint8_t a){
+ a ^= pgm_read_byte(fg_table+(a&0xf))&0xf0;
+ a ^= pgm_read_byte(fg_table+(a>>4)) &0x0f;
+ a ^= pgm_read_byte(fg_table+(a&0xf))&0xf0;
+ return a;
+}
+*/
+
+/*
+ * 16-byte lookup table stored in program memory and read by p via lpm.
+ * p indexes it with a 4-bit value and keeps either the high nibble (mask
+ * 0xF0) or the low nibble (mask 0x0F) of the byte it reads.
+ */
+fg_table:
+.byte 0xfa, 0xd6, 0xb0, 0xb2, 0x7b, 0x5e, 0x71, 0x78
+.byte 0xed, 0xd4, 0xa5, 0xb3, 0xef, 0xdc, 0xe7, 0xf9
+
+/*
+ * p - byte substitution built from three fg_table lookups.
+ *
+ * in:  r24 = a
+ * out: r24 = transformed byte, r25 = 0
+ *
+ * Steps (same order as the C reference in the comment preceding fg_table):
+ *   1. a ^= fg_table[a & 0x0f] & 0xf0
+ *   2. a ^= fg_table[a >> 4]   & 0x0f
+ *   3. a ^= fg_table[a & 0x0f] & 0xf0
+ *
+ * Overwrites r25, r26:r27 (X, used as a backup of the table address) and
+ * r30:r31 (Z, used as the lpm pointer). No other registers are written.
+ * r1 is used as the zero operand of adc; it is assumed to hold 0
+ * (avr-gcc convention) - TODO confirm for non-C callers.
+ * The last instruction that writes the carry flag is the final
+ * "adc r31, r1"; andi/eor/clr leave carry untouched.
+ */
+.global p
+p:
+ ldi r30, lo8(fg_table)
+ ldi r31, hi8(fg_table)
+ /* keep the table base in X so Z can be restored before each lookup */
+ movw r26, r30
+ /* step 1: index with the low nibble, fold the entry's high nibble in */
+ mov r25, r24
+ andi r25, 0x0F
+ add r30, r25
+ adc r31, r1
+ lpm r25, Z
+ andi r25, 0xF0
+ eor r24, r25
+
+ /* step 2: index with the (updated) high nibble, fold the low nibble in */
+ movw r30, r26
+ mov r25, r24
+ swap r25
+ andi r25, 0x0F
+ add r30, r25
+ adc r31, r1
+ lpm r25, Z
+ andi r25, 0x0F
+ eor r24, r25
+
+ /* step 3: same as step 1, using the low nibble produced by step 2 */
+ movw r30, r26
+ mov r25, r24
+ andi r25, 0x0F
+ add r30, r25
+ adc r31, r1
+ lpm r25, Z
+ andi r25, 0xF0
+ eor r24, r25
+ /* zero the upper byte of the r25:r24 pair before returning */
+ clr r25
+ ret
+
+/*
+ * Key-schedule constants, 72 bytes in program memory.
+ * cscipher_init passes ks_const + 8*CNT (CNT = 0..8) to memxor_P with a
+ * length of 8, i.e. the table is consumed as nine 8-byte groups.
+ */
+ks_const:
+.byte 0x29,0x0d,0x61,0x40,0x9c,0xeb,0x9e,0x8f
+.byte 0x1f,0x85,0x5f,0x58,0x5b,0x01,0x39,0x86
+.byte 0x97,0x2e,0xd7,0xd6,0x35,0xae,0x17,0x16
+.byte 0x21,0xb6,0x69,0x4e,0xa5,0x72,0x87,0x08
+.byte 0x3c,0x18,0xe6,0xe7,0xfa,0xad,0xb8,0x89
+.byte 0xb7,0x00,0xf7,0x6f,0x73,0x84,0x11,0x63
+.byte 0x3f,0x96,0x7f,0x6e,0xbf,0x14,0x9d,0xac
+.byte 0xa4,0x0e,0x7e,0xf6,0x20,0x4a,0x62,0x30
+.byte 0x03,0xc5,0x4b,0x5a,0x46,0xa3,0x44,0x65
+
+/*
+ * Register aliases for cscipher_init:
+ *   CTX_0:CTX_1 (r18:r19) - running write pointer into the output context
+ *   CNT         (r17)     - index of the 8-byte group being generated (0..8)
+ */
+CTX_0 = 18
+CTX_1 = 19
+CNT = 17
+/*
+ * cscipher_init - expand a 16-byte key into 9 * 8 = 72 bytes of round keys.
+ *
+ * in:  r25:r24 = address of the 16 key bytes (read through Z)
+ *      r23:r22 = address of the output area (72 bytes are written through Z)
+ *      presumably void cscipher_init(const void *key, cscipher_ctx_t *ctx)
+ *      - TODO confirm against the C header.
+ *
+ * Stack frame (24 bytes, addressed through Y):
+ *   Y+0  .. Y+15  tmp_key  working copy of the key
+ *   Y+16 .. Y+23  tmp      scratch for the current group
+ *
+ * For CNT = 0..8:
+ *   tmp = tmp_key half selected by CNT bit 0 (bit clear: bytes 0..7,
+ *         bit set: bytes 8..15)
+ *   tmp ^= ks_const[8*CNT .. 8*CNT+7]
+ *   every byte of tmp is passed through p
+ *   the 8x8 bit matrix in tmp is read column-wise (MSB column first); each
+ *   column byte is XORed into the other tmp_key half, written back there
+ *   and also appended to the output area.
+ *
+ * push_range, pop_range, stack_alloc and stack_free are macros expected
+ * from avr-asm-macros.S; their exact expansion is not visible here.
+ * NOTE(review): CTX_0:CTX_1 live in r18:r19, which the avr-gcc ABI treats
+ * as call-clobbered, yet they must survive "call memxor_P" - this relies on
+ * memxor_P not touching r18/r19; confirm against its implementation.
+ * NOTE(review): tmp_key and tmp are not wiped before the frame is
+ * released, so key-derived bytes stay on the stack.
+ */
+.global cscipher_init
+cscipher_init:
+ push CNT
+ push_range 28, 29
+ stack_alloc 24, 28, 29
+ /* presumably stack_alloc leaves Y one byte below the new area - TODO confirm */
+ adiw r28, 1
+ movw r30, r24
+ movw CTX_0, r22
+ /* copy key to local tmp_key */
+ ldi r22, 16
+10: ld r23, Z+
+ st Y+, r23
+ dec r22
+ brne 10b
+ /* rewind Y to the start of tmp_key */
+ sbiw r28, 16
+ /* start at 0xff so the inc at the top of the loop yields CNT = 0 */
+ ldi CNT, 0xff
+10: /* main loop */
+ inc CNT
+ /* copy part of tmp_key to tmp */
+ /* the load from Y+8 replaces the one from Y+0 only when CNT is odd */
+ ldi r23, 8
+11: ldd r22, Y+0
+ sbrc CNT, 0
+ ldd r22, Y+8
+ std Y+16, r22
+ adiw r28, 1
+ dec r23
+ brne 11b
+ adiw r28, 8 /* Y points at tmp */
+ /* xor ks constant into tmp */
+ /* arguments: r25:r24 = tmp, r23:r22 = ks_const + 8*CNT, r21:r20 = 8 */
+ movw r24, r28
+ ldi r22, lo8(ks_const)
+ ldi r23, hi8(ks_const)
+ /* swap + lsr turns CNT (0..8) into CNT * 8 */
+ mov r21, CNT
+ swap r21
+ lsr r21
+ add r22, r21
+ adc r23, r1
+ clr r21
+ ldi r20, 8
+ call memxor_P
+ /* do P transformation */
+ /* p leaves r22 alone, so it can serve as the loop counter */
+ ldi r22, 8
+20: ld r24, Y
+ rcall p
+ st Y+, r24
+ dec r22
+ brne 20b
+ sbiw r28, 8 /* Y points at tmp */
+ /* X = tmp_key half that was NOT copied into tmp: tmp_key+8 for even
+    CNT, tmp_key+0 for odd CNT */
+ movw r26, r28
+ sbiw r26, 8
+ sbrc CNT, 0
+ sbiw r26, 8
+ /* do T transformation */
+ /* outer loop (r22): one output byte per pass; inner loop (r23): shift
+    every tmp byte left by one and collect the shifted-out bits in r21,
+    the bit taken from tmp[0] ending up in bit 7 */
+ movw r30, CTX_0
+ ldi r22, 8
+30: ldi r23, 8
+35: ld r24, Y
+ rol r24
+ rol r21
+ st Y+, r24
+ dec r23
+ brne 35b
+ sbiw r28, 8 /* Y points at tmp */
+ /* fold the collected byte into tmp_key and emit it to the output area */
+ ld r24, X
+ eor r21, r24
+ st X+, r21
+ st Z+, r21
+ dec r22
+ brne 30b
+ sbiw r28, 16 /* Y points at tmp_key (again) */
+ /* remember where the next 8 output bytes go */
+ movw CTX_0, r30
+ /* bit 3 of CNT is first set at CNT = 8, i.e. after the ninth group */
+ sbrs CNT, 3
+ rjmp 10b
+ stack_free 24
+ pop_range 28, 29
+ pop CNT
+ ret
+
+
+/*
+ * Round constants, 16 bytes in program memory, used as two 8-byte groups
+ * (offset 0 and offset 8) by cscipher_enc and cscipher_dec.
+ */
+round_const:
+.byte 0xb7, 0xe1, 0x51, 0x62, 0x8a, 0xed, 0x2a, 0x6a
+.byte 0xbf, 0x71, 0x58, 0x80, 0x9c, 0xf4, 0xf3, 0xc7
+
+/*
+void cscipher_enc(void* buffer, const cscipher_ctx_t* ctx){
+ uint8_t i,j,k;
+ uint8_t tmp[8];
+ for(i=0; i<8; ++i){
+ for(j=0; j<3; ++j){
+ if(j==0){
+ memxor(buffer, ctx->keys[i], 8);
+ }else{
+ memxor_P(buffer, round_const+((j==1)?0:8), 8);
+ }
+ for(k=0; k<4; ++k){
+ ((uint16_t*)tmp)[k] = m(((uint16_t*)buffer)[k]);
+ }
+ for(k=0; k<4; ++k){
+ ((uint8_t*)buffer)[k] = tmp[2*k];
+ ((uint8_t*)buffer)[k+4] = tmp[2*k+1];
+ }
+ }
+ }
+ memxor(buffer, ctx->keys[8], 8);
+}
+*/
+/*
+ * Register aliases shared by cscipher_enc and cscipher_dec:
+ *   TMP_0..TMP_7 (r2..r9)   - the 8-byte tmp[] array of the C reference
+ *   CTX_0:CTX_1  (r10:r11)  - pointer to the current round key
+ *   DST_0:DST_1  (r12:r13)  - write pointer into tmp[] (saved copy of X)
+ *   SRC_0:SRC_1  (r14:r15)  - read pointer for the bytes XORed into the block
+ *   CNT_0        (r16)      - outer round counter
+ *   CNT_1        (r17)      - inner step counter
+ */
+TMP_0 = 2
+TMP_1 = 3
+TMP_2 = 4
+TMP_3 = 5
+TMP_4 = 6
+TMP_5 = 7
+TMP_6 = 8
+TMP_7 = 9
+CTX_0 = 10
+CTX_1 = 11
+CNT_0 = 16
+CNT_1 = 17
+DST_0 = 12
+DST_1 = 13
+SRC_0 = 14
+SRC_1 = 15
+/*
+ * cscipher_enc - encrypt one 8-byte block in place.
+ *
+ * in:  r25:r24 = buffer (8 bytes, read and written through Y)
+ *      r23:r22 = ctx (round keys, 8 bytes each, read through Z)
+ *
+ * Eight rounds (CNT_0 = 8..1) of three steps each (CNT_1 = 2, 1, 0):
+ *   CNT_1 = 2: XOR source is the current round key in RAM (T flag clear)
+ *   CNT_1 = 1: XOR source is round_const + 0 in flash     (T flag set)
+ *   CNT_1 = 0: XOR source is round_const + 8 in flash     (T flag set)
+ * Each step XORs the source into the block, runs the m transformation on
+ * the four byte pairs and writes the results back de-interleaved
+ * (tmp[2k] -> buffer[k], tmp[2k+1] -> buffer[k+4]).
+ * After the last round the function tail-calls memxor(buffer, CTX, 8),
+ * CTX having advanced by 8 bytes per round.
+ *
+ * tmp[] lives in r2..r9 and is written through X with X = TMP_0, i.e. the
+ * code relies on the register file being addressable at data addresses
+ * 0..31 - NOTE(review): confirm for the targeted AVR core.
+ * r1 is assumed to hold 0 (avr-gcc convention).
+ */
+.global cscipher_enc
+cscipher_enc:
+ push_range 2, 17
+ push_range 28, 29
+ movw r28, r24
+ movw CTX_0, r22
+ ldi CNT_0, 8
+ /* main loop */
+10: ldi CNT_1, 2
+ /* T clear: first step of the round reads its XOR source from RAM */
+ clt
+ /* sub loop */
+20: ldi r27, 0
+ ldi r26, TMP_0
+ movw DST_0, r26
+ /* choose the XOR source: round_const+8 unless CNT_1 bit 0 is set
+    (then round_const+0); overridden by the round key when bit 1 is set */
+ ldi r30, lo8(round_const)
+ ldi r31, hi8(round_const)
+ sbrs CNT_1, 0
+ adiw r30, 8
+ sbrc CNT_1, 1
+ movw r30, CTX_0
+ movw SRC_0, r30
+ ldi r21, 4
+ /* xor and m transformation */
+25: ld r24, Y+
+ ld r25, Y+
+ movw r30, SRC_0
+ brts 30f
+ /* source in RAM (round key) */
+ ld r22, Z+
+ ld r23, Z+
+ rjmp 35f
+ /* source in flash (round constant) */
+30: lpm r22, Z+
+ lpm r23, Z+
+35:
+ movw SRC_0, r30
+ eor r24, r22
+ eor r25, r23
+
+ /* m transformation on the pair (lo = r24, hi = r25):
+      rot    = lo rotated left by one bit
+      first  = p((rot & 0x55) ^ lo ^ hi)
+      second = p(hi ^ rot)                                    */
+ movw r22, r24
+ mov r25, r22
+ /* lsl + adc r1 is an 8-bit rotate left that does not depend on the
+    carry flag on entry (rol would shift the stale carry into bit 0) */
+ lsl r25
+ adc r25, r1
+ mov r22, r25
+ andi r22, 0x55
+ eor r22, r24
+ eor r22, r23
+ eor r23, r25
+ mov r24, r23
+ rcall p
+ mov r23, r24
+ mov r24, r22
+ rcall p
+
+ /* p overwrites X, so the tmp[] write pointer is kept in DST */
+ movw r26, DST_0
+ st X+, r24
+ st X+, r23
+ movw DST_0, r26
+ dec r21
+ brne 25b
+ /* after the round-key step, advance CTX to the next round key */
+ sbrc CNT_1, 1
+ movw CTX_0, SRC_0
+ /* rewind Y to the start of the block and de-interleave tmp[] into it */
+ sbiw r28, 8
+ std Y+0, TMP_0
+ std Y+4, TMP_1
+ std Y+1, TMP_2
+ std Y+5, TMP_3
+ std Y+2, TMP_4
+ std Y+6, TMP_5
+ std Y+3, TMP_6
+ std Y+7, TMP_7
+ /* T set: the remaining steps of this round read from flash */
+ set
+ dec CNT_1
+ brpl 20b
+
+ dec CNT_0
+ brne 10b
+
+ /* final key addition: memxor(buffer, CTX, 8) as a tail call */
+ movw r24, r28
+ movw r22, CTX_0
+ clr r21
+ ldi r20, 8
+
+ pop_range 28, 29
+ pop_range 2, 17
+ rjmp memxor
+
+/*
+void cscipher_dec(void* buffer, const cscipher_ctx_t* ctx){
+ uint8_t i=7,j,k;
+ uint8_t tmp[8];
+ memxor(buffer, ctx->keys[8], 8);
+ do{
+ for(j=0; j<3; ++j){
+ for(k=0; k<4; ++k){
+ tmp[2*k] = ((uint8_t*)buffer)[k];
+ tmp[2*k+1] = ((uint8_t*)buffer)[4+k];
+ }
+ for(k=0; k<4; ++k){
+ ((uint16_t*)buffer)[k] = m_inv(((uint16_t*)tmp)[k]);
+ }
+ if(j==2){
+ memxor(buffer, ctx->keys[i], 8);
+ }else{
+ memxor_P(buffer, round_const+((j==1)?0:8), 8);
+ }
+
+ }
+ }while(i--);
+}
+
+*/
+/*
+ * cscipher_dec - decrypt one 8-byte block in place.
+ *
+ * in:  r25:r24 = buffer (8 bytes, read and written through Y)
+ *      r23:r22 = ctx (round keys, 8 bytes each)
+ *
+ * Uses the TMP_x / CTX_x / CNT_x / DST_x register aliases defined ahead of
+ * cscipher_enc. tmp[] lives in r2..r9 and is written through X with
+ * X = TMP_0, i.e. the register file must be addressable at data addresses
+ * 0..31 - NOTE(review): confirm for the targeted AVR core.
+ * r1 is assumed to hold 0 (avr-gcc convention).
+ *
+ * Flow:
+ *   memxor(buffer, ctx + 64, 8)
+ *   for CNT_0 = 7..0, for CNT_1 = 3, 2, 1:
+ *     run m_inv on the pairs (buffer[k], buffer[k+4]), k = 0..3, and store
+ *     the results as buffer[2k], buffer[2k+1]
+ *     CNT_1 = 3: buffer ^= round_const + 8 (flash, memxor_P)
+ *     CNT_1 = 2: buffer ^= round_const + 0 (flash, memxor_P)
+ *     CNT_1 = 1: CTX -= 8; buffer ^= round key at CTX (RAM, memxor)
+ *
+ * CTX/DST/CNT sit in r10..r17 and must survive the calls to memxor and
+ * memxor_P; those registers are callee-saved under the avr-gcc ABI.
+ */
+.global cscipher_dec
+cscipher_dec:
+ push_range 2, 17
+ push_range 28, 29
+ movw r28, r24
+ /* CTX = ctx + 64; done as two adiw steps (7*8, then 8) */
+ movw r26, r22
+ adiw r26, 7*8
+ adiw r26, 8
+ movw CTX_0, r26
+ /* memxor(buffer, CTX, 8); r25:r24 still holds buffer */
+ movw r22, r26
+ clr r21
+ ldi r20, 8
+ call memxor
+ ldi CNT_0, 7
+10:
+ ldi CNT_1, 3
+20:
+ clr r27
+ ldi r26, TMP_0
+ movw DST_0, r26
+ ldi r21, 4
+30:
+ /* fetch the pair (buffer[k], buffer[k+4]) */
+ ldd r23, Y+4
+ ld r24, Y+
+/* m_inv transformation
+     a = p(buffer[k]), b = p(buffer[k+4])
+     t = a ^ b
+     first  = (rotl(t) & 0xaa) ^ t
+     second = b ^ rotl(first)                       */
+ rcall p
+ mov r22, r24
+ mov r24, r23
+ rcall p
+ eor r22, r24
+ mov r25, r24
+ mov r24, r22
+ /* lsl + adc r1 is an 8-bit rotate left that does not depend on the
+    carry flag on entry (rol would shift the stale carry into bit 0) */
+ lsl r24
+ adc r24, r1
+ andi r24, 0xaa
+ eor r24, r22
+ mov r22, r24
+ lsl r22
+ adc r22, r1
+ eor r25, r22
+
+ /* p overwrites X, so the tmp[] write pointer is kept in DST */
+ movw r26, DST_0
+ st X+, r24
+ st X+, r25
+ movw DST_0, r26
+ dec r21
+ brne 30b
+ /* Y advanced by 4; rewind and copy tmp[] back in order */
+ sbiw r28, 4
+ std Y+0, TMP_0
+ std Y+1, TMP_1
+ std Y+2, TMP_2
+ std Y+3, TMP_3
+ std Y+4, TMP_4
+ std Y+5, TMP_5
+ std Y+6, TMP_6
+ std Y+7, TMP_7
+ /* common arguments for the XOR below: r25:r24 = buffer, r21:r20 = 8 */
+ movw r24, r28
+ clr r21
+ ldi r20, 8
+ /* CNT_1 bit 1 set (3 or 2): round constant; clear (1): round key */
+ sbrc CNT_1, 1
+ rjmp 40f
+ /* step back to the previous round key and XOR it in */
+ movw r26, CTX_0
+ sbiw r26, 8
+ movw CTX_0, r26
+ movw r22, r26
+ call memxor
+ rjmp 45f
+40:
+ /* round_const + 8 when CNT_1 bit 0 is set (CNT_1 = 3), else + 0 */
+ ldi r26, lo8(round_const)
+ ldi r27, hi8(round_const)
+ sbrc CNT_1, 0
+ adiw r26, 8
+ movw r22, r26
+ call memxor_P
+45:
+
+ dec CNT_1
+ brne 20b
+ /* brpl keeps looping through CNT_0 = 0, giving eight rounds */
+ dec CNT_0
+ brpl 10b
+90:
+ pop_range 28, 29
+ pop_range 2, 17
+ ret