/* camellia-asm.S */
/*
    This file is part of the AVR-Crypto-Lib.
    Copyright (C) 2006-2015 Daniel Otte (bg@nerilex.org)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
/*
 * File:        camellia-asm.S
 * Author:      Daniel Otte
 * Date:        2006-11-10
 * License:     GPLv3 or later
 * Description: Implementation of the camellia block cipher algorithm.
 *
 * Conventions used throughout this file:
 *  - syntax: GNU as for AVR, preprocessed by cpp (.S)
 *  - r1 (NULLr) is expected to hold zero on entry of every routine
 *  - 64 bit values live in r18-r25 with r18 as the least significant byte
 *  - the s-box is read with lpm, i.e. it has to reside in program memory
 */

/* SWAP_R A, B: exchange the contents of registers A and B (three-xor swap,
 * no temporary register needed; A and B must be different registers) */
.macro SWAP_R A, B
	eor \A, \B
	eor \B, \A
	eor \A, \B
.endm

/* precall: save r0, r1, r18-r27 and r30-r31 on the stack and clear r1,
 * so that a C function can be called from any point (debugging aid) */
.macro precall
	/* push r0, r1, r18 - r27, r30 - r31 */
	push r0
	push r1
	push r18
	push r19
	push r20
	push r21
	push r22
	push r23
	push r24
	push r25
	push r26
	push r27
	push r30
	push r31
	clr r1
.endm

/* postcall: restore everything that precall saved (reverse order) */
.macro postcall
	pop r31
	pop r30
	pop r27
	pop r26
	pop r25
	pop r24
	pop r23
	pop r22
	pop r21
	pop r20
	pop r19
	pop r18
	pop r1
	pop r0
.endm

/* hexdump length: emit CR LF via uart_putc, then dump length bytes starting
 * at X via uart_hexdump in chunks of at most 16 bytes (the macro expands
 * itself recursively for the remainder). uart_putc and uart_hexdump are
 * external symbols. X is saved around the calls. */
.macro hexdump length
	push r27
	push r26
	ldi r25, '\r'
	mov r24, r25
	call uart_putc
	ldi r25, '\n'
	mov r24, r25
	call uart_putc
	pop r26
	pop r27
	movw r24, r26
.if \length > 16
	ldi r22, lo8(16)
	ldi r23, hi8(16)
	push r27
	push r26
	call uart_hexdump
	pop r26
	pop r27
	adiw r26, 16
	hexdump \length-16
.else
	ldi r22, lo8(\length)
	ldi r23, hi8(\length)
	call uart_hexdump
.endif
.endm

/* X points to Block */
/* dbg_hexdump length: hexdump wrapped in precall/postcall so that the
 * registers saved by precall survive the dump */
.macro dbg_hexdump length
	precall
	hexdump \length
	postcall
.endm

/* I/O addresses of the stack pointer and the status register */
SPL = 0x3D
SPH = 0x3E
SREG = 0x3F
/* number of the register that is expected to hold zero */
NULLr = 1

/* Substitution table indexed by the input byte; camellia_s1 returns the
 * table entry directly, camellia_s2..s4 derive their result from it. */
camellia_sbox:
.byte 112, 130,  44, 236, 179,  39, 192, 229, 228, 133,  87,  53, 234,  12, 174,  65
.byte  35, 239, 107, 147,  69,  25, 165,  33, 237,  14,  79,  78,  29, 101, 146, 189
.byte 134, 184, 175, 143, 124, 235,  31, 206,  62,  48, 220,  95,  94, 197,  11,  26
.byte 166, 225,  57, 202, 213,  71,  93,  61, 217,   1,  90, 214,  81,  86, 108,  77
.byte 139,  13, 154, 102, 251, 204, 176,  45, 116,  18,  43,  32, 240, 177, 132, 153
.byte 223,  76, 203, 194,  52, 126, 118,   5, 109, 183, 169,  49, 209,  23,   4, 215
.byte  20,  88,  58,  97, 222,  27,  17,  28,  50,  15, 156,  22,  83,  24, 242,  34
.byte 254,  68, 207, 178, 195, 181, 122, 145,  36,   8, 232, 168,  96, 252, 105,  80
.byte 170, 208, 160, 125, 161, 137,  98, 151,  84,  91,  30, 149, 224, 255, 100, 210
.byte  16, 196,   0,  72, 163, 247, 117, 219, 138,   3, 230, 218,   9,  63, 221, 148
.byte 135,  92, 131,   2, 205,  74, 144,  51, 115, 103, 246, 243, 157, 127, 191, 226
.byte  82, 155, 216,  38, 200,  55, 198,  59, 129, 150, 111,  75,  19, 190,  99,  46
.byte 233, 121, 167, 140, 159, 110, 188, 142,  41, 245, 249, 182,  47, 253, 180,  89
.byte 120, 152,   6, 106, 231,  70, 113, 186, 212,  37, 171,  66, 136, 162, 141, 250
.byte 114,   7, 185,  85, 248, 238, 172,  10,  54,  73,  42, 104,  60,  56, 241, 164
.byte  64,  40, 211, 123, 187, 201,  67, 193,  21, 227, 173, 244, 119, 199, 128, 158

//.global camellia_sigma
/*
camellia_sigma:
.quad 0xA09E667F3BCC908B
.quad 0xB67AE8584CAA73B2
.quad 0xC6EF372FE94F82BE
.quad 0x54FF53A5F1D36F1C
.quad 0x10E527FADE682D1D
.quad 0xB05688C2B3E6C1FD
*/

/* uint8_t camellia_s1(uint8_t b) */
/* in:  r24 = b
 * out: r25:r24 = camellia_sbox[b] (r25 cleared)
 * clobbers: r30, r31, flags; expects r1 == 0 */
.global camellia_s1
camellia_s1:
	ldi r30, lo8(camellia_sbox)
	ldi r31, hi8(camellia_sbox)
	add r30, r24
	adc r31, NULLr		; Z = &camellia_sbox[b]
	lpm r24, Z
	clr r25
	ret

/* uint8_t camellia_s2(uint8_t b)
 * out: r25:r24 = camellia_sbox[b] rotated left by one bit
 * clobbers: r30, r31, flags; expects r1 == 0 */
.global camellia_s2
camellia_s2:
	ldi r30, lo8(camellia_sbox)
	ldi r31, hi8(camellia_sbox)
	add r30, r24
	adc r31, NULLr
	lpm r24, Z
	lsl r24			; C = old bit 7
	adc r24, NULLr		; ... which re-enters as bit 0
	clr r25
	ret

/* uint8_t camellia_s3(uint8_t b)
 * out: r25:r24 = camellia_sbox[b] rotated right by one bit
 * clobbers: r30, r31, flags (including T); expects r1 == 0 */
.global camellia_s3
camellia_s3:
	ldi r30, lo8(camellia_sbox)
	ldi r31, hi8(camellia_sbox)
	add r30, r24
	adc r31, NULLr
	lpm r24, Z
	bst r24, 0		; T = old bit 0
	lsr r24
	bld r24, 7		; ... which re-enters as bit 7
	clr r25
	ret

/* uint8_t camellia_s4(uint8_t b)
 * out: r25:r24 = camellia_sbox[b rotated left by one bit]
 * clobbers: r30, r31, flags; expects r1 == 0 */
.global camellia_s4
camellia_s4:
	ldi r30, lo8(camellia_sbox)
	ldi r31, hi8(camellia_sbox)
	lsl r24
	adc r24, NULLr		; r24 = b rotated left by one
	add r30, r24
	adc r31, NULLr
	lpm r24, Z
	clr r25
	ret

.global camellia_s
/*
uint64_t camellia_s(uint64_t d){
	#define D ((uint8_t*)(&d))
	D[7] = camellia_s1(D[7]); // MSB
	D[6] = camellia_s2(D[6]);
	D[5] = camellia_s3(D[5]);
	D[4] = camellia_s4(D[4]);
	D[3] = camellia_s2(D[3]);
	D[2] = camellia_s3(D[2]);
	D[1] = camellia_s4(D[1]);
	D[0] = camellia_s1(D[0]); // LSB
	#undef D
	return d;
}*/
; parameters
; d: r18-r25 (r18 is LSB)
; result: r18-r25
; clobbers: r26, r27, r30, r31, flags (including T)
; r24 doubles as argument/result register of camellia_s1..s4, therefore
; the two upper bytes of d are parked in X while the others are processed
camellia_s:
	movw r26, r24 ; backup r24,r25 -> X
	clr r25
	rcall camellia_s2	; D[6], still in r24
	mov r26, r24
	mov r24, r27
	rcall camellia_s1	; D[7]
	mov r27, r24
	mov r24, r23
	rcall camellia_s3	; D[5]
	mov r23, r24
	mov r24, r22
	rcall camellia_s4	; D[4]
	mov r22, r24
	mov r24, r21
	rcall camellia_s2	; D[3]
	mov r21, r24
	mov r24, r20
	rcall camellia_s3	; D[2]
	mov r20, r24
	mov r24, r19
	rcall camellia_s4	; D[1]
	mov r19, r24
	mov r24, r18
	rcall camellia_s1	; D[0]
	mov r18, r24
	movw r24, r26		; put D[6], D[7] back in place
	ret

;##############################################################################
/* uint64_t camellia_p(uint64_t d) */
; param: r18-r25 (r18 is LSB)
; result: r18-r25
; clobbers: r26, r27, r30, r31, flags
; Two rounds of byte-wise xor mixing followed by an exchange of the upper
; and the lower 32 bit half of d.
z1 = 25
z2 = 24
z3 = 23
z4 = 22
z5 = 21
z6 = 20
z7 = 19
z8 = 18
.global camellia_p
camellia_p:
	eor z1, z6
	eor z2, z7
	eor z3, z8
	eor z4, z5
	eor z5, z3
	eor z6, z4
	eor z7, z1
	eor z8, z2
	;---------
	eor z1, z8
	eor z2, z5
	eor z3, z6
	eor z4, z7
	eor z5, z4
	eor z6, z1
	eor z7, z2
	eor z8, z3
	;---------
	movw r26, z8
	movw r30, z6 ; backup z5 to z8
	movw z8, z4
	movw z6, z2
	movw z4, r26
	movw z2, r30
	ret

;##############################################################################
/* uint64_t camellia_f(uint64_t x, uint64_t k) */
; param x: r18-r25
; param k: r10-r17 (not modified)
; result: r18-r25 = camellia_p(camellia_s(x xor k))
; clobbers: r26, r27, r30, r31, flags (including T)
.global camellia_f
camellia_f:
	eor r18, r10
	eor r19, r11
	eor r20, r12
	eor r21, r13
	eor r22, r14
	eor r23, r15
	eor r24, r16
	eor r25, r17
	rcall camellia_s
	rcall camellia_p
	ret

;##############################################################################
/* uint64_t camellia_fl(uint64_t x, uint64_t k) */
; param x: r18-r25 xl: r22-r25, xr: r18-r21
; param k: r10-r17 kl: r14-r17, kr: r10-r13
; result: r18-r25
;   xr ^= rol32(xl & kl, 1)
;   xl ^= (xr | kr)
; clobbers: r0, r26, r27, r30, r31, flags
; k is NOT modified: r10-r17 are call-saved registers in the avr-gcc ABI
; (even when they carry an argument), so all intermediate values are
; computed in the call-used scratch registers r26, r27, r30, r31.
kl1 = 14
kl2 = 15
kl3 = 16
kl4 = 17
kr1 = 10
kr2 = 11
kr3 = 12
kr4 = 13
xr1 = 18
xr2 = 19
xr3 = 20
xr4 = 21
xl1 = 22
xl2 = 23
xl3 = 24
xl4 = 25
.global camellia_fl
camellia_fl:
	movw r26, kl1		; r27:r26 = kl2:kl1
	movw r30, kl3		; r31:r30 = kl4:kl3
	and r26, xl1
	and r27, xl2
	and r30, xl3
	and r31, xl4		; r31:r30:r27:r26 = xl & kl
	mov r0, r31
	lsl r0			; C = bit 31 of (xl & kl), r0 is discarded
	rol r26
	rol r27
	rol r30
	rol r31			; 32 bit rotate left by one
	eor xr1, r26
	eor xr2, r27
	eor xr3, r30
	eor xr4, r31
	// that was part one
	movw r26, kr1		; r27:r26 = kr2:kr1
	movw r30, kr3		; r31:r30 = kr4:kr3
	or r26, xr1
	or r27, xr2
	or r30, xr3
	or r31, xr4		; r31:r30:r27:r26 = xr | kr
	eor xl1, r26
	eor xl2, r27
	eor xl3, r30
	eor xl4, r31
	ret

;##############################################################################
/* uint64_t camellia_fl_inv(uint64_t y, uint64_t k) */
; param y: r18-r25 yl: r22-r25, yr: r18-r21
; param k: r10-r17 kl: r14-r17, kr: r10-r13
; result: r18-r25
;   yl ^= (yr | kr)
;   yr ^= rol32(yl & kl, 1)
; clobbers: r0, r26, r27, r30, r31, flags
; k is NOT modified (r10-r17 are call-saved in the avr-gcc ABI); the
; intermediate values are computed in r26, r27, r30, r31.
kl1 = 14
kl2 = 15
kl3 = 16
kl4 = 17
kr1 = 10
kr2 = 11
kr3 = 12
kr4 = 13
yr1 = 18
yr2 = 19
yr3 = 20
yr4 = 21
yl1 = 22
yl2 = 23
yl3 = 24
yl4 = 25
.global camellia_fl_inv
camellia_fl_inv:
	movw r26, kr1		; r27:r26 = kr2:kr1
	movw r30, kr3		; r31:r30 = kr4:kr3
	or r26, yr1
	or r27, yr2
	or r30, yr3
	or r31, yr4		; r31:r30:r27:r26 = yr | kr
	eor yl1, r26
	eor yl2, r27
	eor yl3, r30
	eor yl4, r31
	// the first one is done
	movw r26, kl1		; r27:r26 = kl2:kl1
	movw r30, kl3		; r31:r30 = kl4:kl3
	and r26, yl1
	and r27, yl2
	and r30, yl3
	and r31, yl4		; r31:r30:r27:r26 = yl & kl
	mov r0, r31
	lsl r0			; C = bit 31 of (yl & kl), r0 is discarded
	rol r26
	rol r27
	rol r30
	rol r31			; 32 bit rotate left by one
	eor yr1, r26
	eor yr2, r27
	eor yr3, r30
	eor yr4, r31
	ret

;##############################################################################
; param s: r24-r25
; param q: r22 (ignored, r22 is reused as block counter)
; Rotates each of the two consecutive 16 byte blocks at s (byte 0 is the
; least significant one) left by 15 bits, in place. This is done as a one
; bit rotate to the right combined with a move of every byte two positions
; up.
; clobbers: r18-r22, r30, r31, flags
B1 = 18
B2 = 19
.global camellia128_keyop_rot15
camellia128_keyop_rot15:
	movw r30, r24 ; Z points at LSB of kl ;-- 0
	ldi r22, 2		; two 128 bit blocks
2:	adiw r30, 15 ;-- 15
	ld r21, Z
	ld r20, -Z ;-- 14
	movw B1, r20 ; store Backup of the 2 MSB of kl
	ror r20			; only the carry is wanted: C = bit 0 of byte 14
	ldi r21, 14
1:	ld r20, -Z ;-- 13..0
	ror r20
	std Z+2, r20 ;-- (15..2)
	dec r21			; dec leaves the carry untouched
	brne 1b
	ror B2			; old byte 15 receives bit 0 of old byte 0
	ror B1			; old byte 14 receives bit 0 of old byte 15
	st Z+, B1 ;-- 1
	st Z, B2
	adiw r30, 15 ;-- 16
	dec r22
	brne 2b
	ret

;##############################################################################
; param s: r24-r25
; param q: r22 (ignored)
; Rotates each of the two consecutive 16 byte blocks at s (byte 0 is the
; least significant one) left by 17 bits, in place.
; The block is rotated left by one bit while it is copied to r8-r23; the
; copy goes through X = 0x0008, which requires a core whose register file
; is visible in the data address space (classic AVR). The registers are
; then written back two byte positions higher.
; saves/restores r8-r17; clobbers: r0, r18-r23, r26, r27, r30, r31, flags
; (including T, which marks the second pass); r1 is zero again on return
.global camellia128_keyop_rot17
camellia128_keyop_rot17:
	push r8
	push r9
	push r10
	push r11
	push r12
	push r13
	push r14
	push r15
	push r16
	push r17
	clt			; T clear: first block
	movw r30, r24
	clr r27
2:	ldi r26, 8		; X = 0x0008 -> r8
	mov r1, r26
	lsl r1 ; r1=16
	;push r1
	; load 128bit value
	ldd r0, Z+15
	rol r0			; only the carry is wanted: C = bit 7 of byte 15
1:	ld r0, Z+
	rol r0
	st X+, r0		; rotated byte i ends up in r(8+i)
	dec r1			; dec leaves the carry untouched
	brne 1b
	st -Z, 21		; byte 15 <- rotated byte 13
	st -Z, 20
	st -Z, 19
	st -Z, 18
	st -Z, 17
	st -Z, 16
	st -Z, 15
	st -Z, 14
	;--
	st -Z, 13
	st -Z, 12
	st -Z, 11
	st -Z, 10
	st -Z, 9
	st -Z, 8		; byte 2 <- rotated byte 0
	st -Z, 23		; byte 1 <- rotated byte 15
	st -Z, 22		; byte 0 <- rotated byte 14
	brts 2f			; second block done -> leave
	set
	adiw r30, 16		; advance to the second block
	rjmp 2b
2:	pop r17
	pop r16
	pop r15
	pop r14
	pop r13
	pop r12
	pop r11
	pop r10
	pop r9
	pop r8
	ret

;##############################################################################
; param s: r24-r25
; param q: r22
; Dispatcher: q == 1 selects the 17 bit rotation, every other value the
; 15 bit rotation.
.global camellia128_keyop
camellia128_keyop:
	cpi r22, 1
	breq camellia128_keyop_rot17
	rjmp camellia128_keyop_rot15

;##############################################################################
; param s: r24-r25
; param q: r22 (ignored, r22 is reused as block counter)
; Inverse of camellia128_keyop_rot15: rotates each of the two consecutive
; 16 byte blocks at s right by 15 bits, in place (one bit rotate to the
; left combined with a move of every byte two positions down).
; clobbers: r18-r22, r26, r27, r30, r31, flags
B1 = 18
B2 = 19
.global camellia128_keyop_inv_rot15
camellia128_keyop_inv_rot15:
	movw r30, r24 ; Z points at LSB of kl ;-- 0
	movw r26, r24 ; X also
	ldi r22, 2		; two 128 bit blocks
2:	;-- 0
	ld r20, Z+ ;-- 0/1
	ld r21, Z+ ;-- 1/2
	movw B1, r20 ; store Backup of the 2 LSB of kl
	rol r21			; only the carry is wanted: C = bit 7 of byte 1
	ldi r20, 14
1:	ld r21, Z+ ;-- 2/14..3/16
	rol r21
	st X+, r21 ;-- (0..13)/(1..14)
	dec r20			; dec leaves the carry untouched
	brne 1b
	rol B1			; old byte 0 receives bit 7 of old byte 15
	rol B2			; old byte 1 receives bit 7 of old byte 0
	st X+, B1 ;-- 14/15
	st X+, B2 ;-- 15/16
	dec r22
	brne 2b
	ret

;##############################################################################
; param s: r24-r25
; param q: r22 (ignored)
; Inverse of camellia128_keyop_rot17: rotates each of the two consecutive
; 16 byte blocks at s right by 17 bits, in place.
; The block is read from the top byte downwards and rotated right by one
; bit while it is copied to r8-r23 (through X = 0x0008, i.e. the register
; file has to be visible in the data address space), then written back two
; byte positions lower.
; saves/restores r8-r17; clobbers: r0, r18-r23, r26, r27, r30, r31, flags
; (including T, which marks the second pass); r1 is zero again on return
.global camellia128_keyop_inv_rot17
camellia128_keyop_inv_rot17:
	push r8
	push r9
	push r10
	push r11
	push r12
	push r13
	push r14
	push r15
	push r16
	push r17
	clt			; T clear: first block
	movw r30, r24
	clr r27
2:	ldi r26, 8		; X = 0x0008 -> r8
	mov r1, r26
	lsl r1 ; r1=16
	; load 128bit value
	ld r0, Z
	adiw r30, 16
	ror r0			; only the carry is wanted: C = bit 0 of byte 0
1:	ld r0, -Z
	ror r0
	st X+, r0		; rotated byte (15-i) ends up in r(8+i)
	dec r1			; dec leaves the carry untouched
	brne 1b
	st Z+, 21		; byte 0 <- rotated byte 2
	st Z+, 20
	st Z+, 19
	st Z+, 18
	st Z+, 17
	st Z+, 16
	st Z+, 15
	st Z+, 14
	;--
	st Z+, 13
	st Z+, 12
	st Z+, 11
	st Z+, 10
	st Z+, 9
	st Z+, 8		; byte 13 <- rotated byte 15
	st Z+, 23		; byte 14 <- rotated byte 0
	st Z+, 22		; byte 15 <- rotated byte 1
	brts 2f			; second block done -> leave
	set
;	adiw r30, 16		; not needed, Z already points at the second block
	rjmp 2b
2:	pop r17
	pop r16
	pop r15
	pop r14
	pop r13
	pop r12
	pop r11
	pop r10
	pop r9
	pop r8
	ret

;##############################################################################
; param s: r24-r25
; param q: r22
; Dispatcher: q == 1 selects the inverse 17 bit rotation, every other value
; the inverse 15 bit rotation.
.global camellia128_keyop_inv
camellia128_keyop_inv:
	cpi r22, 1
	breq camellia128_keyop_inv_rot17
	rjmp camellia128_keyop_inv_rot15

;##############################################################################
; param p: r24-r25 pointer to data
; param l: r22 length of word
/* void change_endian(void *p, uint8_t l)
 * Reverses the order of the l bytes at p in place.
 * in:  r25:r24 = p, r22 = l
 * clobbers: r20, r21, r22, r26, r27, r30, r31, flags; expects r1 == 0
 * For l < 2 there is nothing to swap and the buffer is left untouched
 * (without the guard the 8 bit pair counter would wrap around and 256
 * swaps would run over foreign memory).
 */
.global change_endian
change_endian:
	movw r26, r24		; X = p, walks upwards
	movw r30, r24
	add r30, r22
	adc r31, r1		; Z = p + l, walks downwards (pre-decrement)
	lsr r22			; r22 = l / 2 = number of byte pairs
	breq 2f			; l < 2: nothing to do
1:	ld r20, X
	ld r21, -Z
	st X+, r21
	st Z, r20
	dec r22
	brne 1b
2:	ret

;##############################################################################

#define SEL_KA 1
#define SEL_KL 0

/* bits of the roundop parameter of camellia_6rounds */
#define KEY_POSTC1   0x00
#define KEY_POSTC2   0x01
#define KEY_INC2     0x02

#define KEY_DIR      0x04
#define KEY_DIR_NORM 0x00
#define KEY_DIR_INV  0x04

#define KEY_AMMOUNT  0x08
#define KEY_ROL17    0x08
#define KEY_ROL15    0x00
/*
void camellia_6rounds(camellia128_ctx_t *s, uint64_t *bl, uint64_t *br, uint8_t roundop, uint8_t keychoice){
	uint8_t i;
	uint64_t *k[4];
	k[0] = &(s->kll);
	k[1] = &(s->klr);
	k[2] = &(s->kal);
	k[3] = &(s->kar);
	for(i=0; i<3; ++i){ / * each cycle * /
		br[0] ^= camellia_f(bl[0],*(k[(keychoice&1)*2+((roundop&KEY_DIR)?1:0)]));
		keychoice >>= 1;

		if((i == 1) && (roundop&KEY_INC2)){
			((roundop&KEY_DIR)?camellia128_keyop_inv:camellia128_keyop)(s,(roundop&KEY_AMMOUNT)?1:-1);
		}

		bl[0] ^= camellia_f(br[0],*(k[(keychoice&1)*2+((roundop&KEY_DIR)?0:1)]));
		keychoice >>= 1;

		/ * check if we should do some keyop * /
		if((i == (roundop&1)) && (!(roundop&KEY_INC2)) ){
			((roundop&KEY_DIR)?camellia128_keyop_inv:camellia128_keyop)(s,(roundop&KEY_AMMOUNT)?1:-1);
			/ * isn't it nice what we can do in C?! * /
		}
	}
}
*/
; param s: r24-r25
; param bl: r22-r23
; param br: r20-r21
; param roundop: r18
; param keychoice: r16
;
; Assembly version of the C reference above. Six half rounds are executed
; in one loop; after each of them the roles of bl and br are exchanged by
; swapping the two saved pointers. loop_cnt counts down from 6 and
; keyop_time holds the value of loop_cnt after whose half round the key
; rotation is triggered:
;   KEY_INC2 set       -> 4 (after the 3rd half round)
;   KEY_POSTC1 (bit 0 clear) -> 5 (after the 2nd half round)
;   KEY_POSTC2 (bit 0 set)   -> 3 (after the 4th half round)
; The key for a half round is read from s + 16*(keychoice & 1), plus 8
; depending on KEY_DIR and on the parity of loop_cnt.
; The result of f() is picked up from r18-r25 through Z = 0x0012, which
; requires a core whose register file is visible in the data address
; space (classic AVR).
; saves/restores r7-r17; clobbers: r0, r18-r27, r30, r31, flags;
; r1 is zero again on return
s1 = 24
s2 = 25
bl1 = 22
bl2 = 23
br1 = 20
br2 = 21
xro = 18
kc = 16
xro_sec = 17
br1_sec = 10
br2_sec = 11
bl1_sec = 12
bl2_sec = 13
s1_sec = 14
t = 9
loop_cnt = 8
keyop_time = 7

.global camellia_6rounds
camellia_6rounds:
	push r17
	push r16
	push r15
	push r14
	push r13
	push r12
	push r11
	push r10
	push r9
	push r8
	push r7
	ldi r17, 6
	mov loop_cnt, r17
	mov xro_sec, xro	; the argument registers get reused below,
	movw br1_sec, br1	; so keep copies of roundop, br, bl and s
	movw bl1_sec, bl1
	movw s1_sec, s1
	clr keyop_time
	inc keyop_time
	sec
	rol keyop_time // keyop_time == 3
	SBRC xro, 1 // KEY_INC2
	 rjmp 1f
	SBRS xro, 0 // KEY_POSTC1
	 inc keyop_time
	SBRS xro, 0 // KEY_POSTC1
	 inc keyop_time
	rjmp 2f
1:	inc keyop_time
2:
main_loop:
	/* now we load the key to r18-r25 */
	movw r26, s1_sec
	SBRC kc, 0 /* select between KA and KL */
	 adiw r26, 16
	SBRC xro_sec, 2 // KEY_DIR
	 rjmp 2f
	SBRS loop_cnt, 0 /* enc */
	 adiw r26, 8
	rjmp 3f
2:	SBRC loop_cnt, 0 /* dec */
	 adiw r26, 8
	rjmp 3f
3:
	lsr kc
	ld r18, X+
	ld r19, X+
	ld r20, X+
	ld r21, X+
	ld r22, X+
	ld r23, X+
	ld r24, X+
	ld r25, X+
	/* now we xor bl in */
	movw r26, bl1_sec
	ld r0, X+
	eor r18, r0
	ld r0, X+
	eor r19, r0
	ld r0, X+
	eor r20, r0
	ld r0, X+
	eor r21, r0
	ld r0, X+
	eor r22, r0
	ld r0, X+
	eor r23, r0
	ld r0, X+
	eor r24, r0
	ld r0, X+
	eor r25, r0
	/* f(x,k) = p(s(x xor k)) ; xor is done */
	call camellia_s;
	call camellia_p;

//	in r26, SPL
//	in r27, SPH
//	sbiw r26, 9
//	dbg_hexdump 10
	/* now we have to xor the result into br */
	clr r31
	ldi r30, 18		; Z = 0x0012 -> r18, the result of f()
	movw r26, br1_sec
;	ldi r1, 8 ;-- this won't work
	clr r1
	sec
	ror r1			; r1 = 0x80
	swap r1			; r1 = 0x08, byte counter
1:	ld r0, X
	ld t, Z+
	eor r0, t
	st X+, r0
	dec r1
	brne 1b			; leaves r1 == 0 again

	/* check for keyop */
	cp loop_cnt, keyop_time
	brne 3f
	movw s1, s1_sec
	ldi r22, 1
	SBRS xro_sec, 3 // KEY_ROL17
	 neg r22		; 15 bit rotation: pass -1 instead of 1
	SBRS xro_sec, 2 // KEY_DIR
	 rjmp 2f
	rcall camellia128_keyop_inv
	rjmp 3f
2:	rcall camellia128_keyop
3:	/* loop back */
	SWAP_R br1_sec, bl1_sec
	SWAP_R br2_sec, bl2_sec
	dec loop_cnt
	breq 2f
	rjmp main_loop
2:
	pop r7
	pop r8
	pop r9
	pop r10
	pop r11
	pop r12
	pop r13
	pop r14
	pop r15
	pop r16
	pop r17
	ret

;##############################################################################
/* C reference of the key setup; the assembly draft that follows it is
   disabled (it is enclosed in a comment and is not assembled). */
/*
void camellia128_init(camellia128_ctx_t *s, uint8_t *key){
	uint8_t i;
	s->kll = 0; //((uint64_t*)key)[0];

	/ * load the key, endian-adjusted, to kll,klr * /
	for(i=0; i<8; ++i){
		s->kll <<= 8;
		s->kll |= *key++;
	}
	for(i=0; i<8; ++i){
		s->klr <<= 8;
		s->klr |= *key++;
	}

	s->kal = s->kll;
	s->kar = s->klr;

	s->kar ^= camellia_f(s->kal, camellia_sigma[0]);
	s->kal ^= camellia_f(s->kar, camellia_sigma[1]);

	s->kal ^= s->kll;
	s->kar ^= s->klr;

	s->kar ^= camellia_f(s->kal, camellia_sigma[2]);
	s->kal ^= camellia_f(s->kar, camellia_sigma[3]);
	/ * * /
//	uart_putstr("\n\r----------------init finished--------------------");
}
*/
/*
X64_xor_in:
	ld r0, X+
	eor r18, r0
	ld r0, X+
	eor r19, r0
	ld r0, X+
	eor r20, r0
	ld r0, X+
	eor r21, r0
	ld r0, X+
	eor r22, r0
	ld r0, X+
	eor r23, r0
	ld r0, X+
	eor r24, r0
	ld r0, X+
	eor r25, r0
	ret

X64_load:
	ld r18, X+
	ld r19, X+
	ld r20, X+
	ld r21, X+
	ld r22, X+
	ld r23, X+
	ld r24, X+
	ld r25, X+
	ret

Y64_load_xor_store:
	ld r0, Y
	eor r18, r0
	st Y+, r18
	ld r0, Y
	eor r19, r0
	st Y+, r19
	ld r0, Y
	eor r20, r0
	st Y+, r20
	ld r0, Y
	eor r21, r0
	st Y+, r21
	ld r0, Y
	eor r22, r0
	st Y+, r22
	ld r0, Y
	eor r23, r0
	st Y+, r23
	ld r0, Y
	eor r24, r0
	st Y+, r24
	ld r0, Y
	eor r25, r0
	st Y+, r25
	ret

; param s: r24-r25
; param *k: r22-r23
//.global camellia128_init
camellia128_init:
	push r29
	push r28
	movw r30, r24 ; Z is statepointer
	movw r26, r22 ; X is keypointer
	clr r29
	ldi r28, 18
//	/ * load key into kl, ka and kal to r18:r25 * /
	adiw r26, 128/8 ;-- 16
	ldi r16, (128/8)-1
1:	ld r17, -X
	std Z+(128/8), r17
	st Z+, r17
	sbrs r16, 3
	 st Y+, r17 ; this should only be done the last 8 rounds 0<=r16<=7
	dec r16
	brpl 1b
//	/ * step 1 * /
	ldi r26, lo8(camellia_sigma)
	ldi r27, hi8(camellia_sigma)
	rcall X64_xor_in
	rcall camellia_s
	rcall camellia_p
//	/ * f(x,k) is done * /
	sbiw r30, 128/8
	movw r28, r30 ; Z&Y point on kar now
	call Y64_load_xor_store

//	/ * step 2 now * /
	rcall X64_xor_in
	rcall camellia_s
	rcall camellia_p
//	/ * f(x,k) is done * /
	rcall Y64_load_xor_store

//	/ * now the xor part (kl and kr) * /
	sbiw r30, 128/8 ; Z points to klr
	ldi r16, 128/8
1:	ld r0, Z+
	ldd r1, Z+(128/8)-1
	eor r0, r1
	std Z+(128/8)-1, r0
	dec r16
	brne 1b

//	/ * now s->kar ^= camellia_f(s->kal, camellia_sigma[2]); * /
	rcall X64_load ; load sigma[2]
	movw r26, r28 ; X&Y point at kal
	rcall X64_xor_in
	rcall camellia_s
	rcall camellia_p
	sbiw r28, 128/8/2 ; Y points at kar
	rcall Y64_load_xor_store

//	/ * now s->kal ^= camellia_f(s->kar, camellia_sigma[3]); * /
	sbiw r26, 128/8 ;
	rcall X64_load ; load kar
	ldi r26, lo8(camellia_sigma+3*8)
	ldi r27, hi8(camellia_sigma+3*8)
	rcall X64_xor_in ; xor sigma[3] in
	rcall camellia_s
	rcall camellia_p
	rcall Y64_load_xor_store

	pop r28
	pop r29
	ret

//*/