1 /* bmw_small-tinyasm.S */
3 This file is part of the AVR-Crypto-Lib.
4 Copyright (C) 2009 Daniel Otte (daniel.otte@rub.de)
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>.
21 * File: bmw_small-tinyasm.S
24 * License: GPLv3 or later
25 * Description: implementation of BlueMidnightWish
29 #include "avr-asm-macros.S"
31 /******************************************************************************/
33 param a: r22:r23:r24:r25
47 /******************************************************************************/
49 param a: r22:r23:r24:r25
72 /******************************************************************************/
74 param a: r22:r23:r24:r25
103 /******************************************************************************/
121 param x: r22:r23:r24:r25
128 ldi r30, lo8(s_table)
129 ldi r31, hi8(s_table)
165 /******************************************************************************/
167 param dest: r26:r27 (X)
168 param src: r30:r31 (Z)
184 /******************************************************************************/
203 rjmp store32_to_Y;50f
218 /******************************************************************************/
250 /******************************************************************************/
264 .byte 5*4,7*4,10*4,13*4,14*4
265 ; .byte 0 ; just for alignment
302 /* calculate W and store it in Q */
306 /* load initial index */
307 ldi r30, lo8(f0_indextable-1)
308 ldi r31, hi8(f0_indextable-1)
312 /* load values from hacktable */
313 ldi r30, lo8(f0_hacktable-2)
314 ldi r31, hi8(f0_hacktable-2)
345 ldi r30, lo8(f0_s_table)
346 ldi r31, hi8(f0_s_table)
379 /******************************************************************************/
382 .long 0x55555550, 0x5aaaaaa5, 0x5ffffffa, 0x6555554f /* 32-bit constants: entry = j*0x05555555 (mod 2^32), here j = 16..19 */
383 .long 0x6aaaaaa4, 0x6ffffff9, 0x7555554e, 0x7aaaaaa3 /* j = 20..23 */
384 .long 0x7ffffff8, 0x8555554d, 0x8aaaaaa2, 0x8ffffff7 /* j = 24..27 */
385 .long 0x9555554c, 0x9aaaaaa1, 0x9ffffff6, 0xa555554b /* j = 28..31; NOTE(review): presumably the k_lut read by addelment below - confirm against the label above */
387 /*******************************************************************************
388 * uint32_t addelment(uint8_t j, const uint32_t* m, const uint32_t* h){
390 * r = pgm_read_dword(k_lut+j);
391 * r += rotl_addel(((uint32_t*)m)[j&0xf], j+0);
392 * r += rotl_addel(((uint32_t*)m)[(j+3)&0xf], j+3);
393 * r -= rotl_addel(((uint32_t*)m)[(j+10)&0xf], j+10);
394 * r ^= ((uint32_t*)h)[(j+7)&0xf];
474 ldi r30, lo8(const_lut)
475 ldi r31, hi8(const_lut)
484 rcall load_rotate_add_M
487 rcall load_rotate_add_M
491 rcall load_rotate_add_M
510 /******************************************************************************/
549 /******************************************************************************/
558 .byte 0,3,0,7,0,13,0,16,0,19,0,23,0,27
563 ldi r30, lo8(expand2_rot_table)
564 ldi r31, hi8(expand2_rot_table)
584 /******************************************************************************/
590 /* for calling expand1/2
624 /******************************************************************************/
631 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
632 .byte 0x2B, 0x64, 0x66, 0x03, 0x51, 0x55, 0x87, 0x55
634 .byte (2<<1), (7<<1), (4<<1), (3<<1), (4<<1)+1, (6<<1)+1, (6<<1)
635 .byte 0 ; just for alignment
664 10: rcall load32_from_X
686 ;--- /* calc first half of h0..h15 */
695 ldi r30, lo8(f2_1_shift_table-1)
696 ldi r31, hi8(f2_1_shift_table-1)
713 25: rcall shiftleft32
714 26: rcall eor32_to_acc
724 27: rcall shiftright32
725 28: rcall eor32_to_acc
737 sbiw r26, 4*8 /* X points to q[24] */
740 sbiw r28, 33 /* Y points to q[0] */
742 sbiw r30, 1 /* Z points to h0 */
753 sbiw r26, 9*4 /* X points to q[23] */
754 rcall load_acc_from_X
762 sbiw r26, 8*4 /* X points to q[16] */
766 ldi r30, lo8(f2_2_shift_table-1)
767 ldi r31, hi8(f2_2_shift_table-1)
771 rcall load_acc_from_X
778 20: rcall shiftright32
789 sbiw r30, 8*4 /* Z points to h8 */
791 sbiw r26, 4*4 /* X points to h4 */
811 /******************************************************************************/
831 .global bmw_small_nextBlock
832 .global bmw224_nextBlock
833 .global bmw256_nextBlock
839 stack_alloc_large 32*4, r28, r29
841 ; push_range 28, 29 /* push Q */
842 ; push_range 22, 25 /* push M & H */
843 /* increment counter */
848 rcall load_acc_from_X
880 stack_free_large3 32*4
885 /******************************************************************************/
904 /******************************************************************************/
911 .global bmw_small_lastBlock
912 .global bmw224_lastBlock
913 .global bmw256_lastBlock
917 /* while(length_b >= BMW_SMALL_BLOCKSIZE){
918 bmw_small_nextBlock(ctx, block);
919 length_b -= BMW_SMALL_BLOCKSIZE;
920 block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
933 rcall bmw_small_nextBlock
948 /* memset(pctx.buffer, 0, 64);
949 memcpy(pctx.buffer, block, (length_b+7)/8);
950 pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
961 /* copy (#r24) bytes to stack buffer */
966 301: /* calculate the appended byte */
986 /* if(length_b+1>64*8-64){ ; i.e. length_b > 64*7-1 = 447; max(length_b)=511
987 bmw_small_nextBlock(ctx, pctx.buffer);
988 memset(pctx.buffer, 0, 64-8);
998 rcall bmw_small_nextBlock
1005 rcall load32_from_Z_stub
1011 /* *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
1012 bmw_small_nextBlock(ctx, pctx.buffer);
1015 rcall load32_from_Z_stub
1041 rcall bmw_small_nextBlock
1042 /* memset(pctx.buffer, 0xaa, 64);
1044 pctx.buffer[i*4] = i+0xa0;
1058 /* bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
1059 memcpy(ctx->h, pctx.buffer, 64);
1063 rcall bmw_small_nextBlock
1079 /*******************************************************************************
1080 * void bmw224_ctx2hash(void* dest, const bmw224_ctx_t* ctx){
1081 * memcpy(dest, &(ctx->h[9]), 224/8);
1084 * param dest: r24:r25
1085 * param ctx: r22:r23
1087 .global bmw224_ctx2hash
1095 /*******************************************************************************
1096 * void bmw256_ctx2hash(void* dest, const bmw256_ctx_t* ctx){
1097 * memcpy(dest, &(ctx->h[8]), 256/8);
1100 * param dest: r24:r25
1101 * param ctx: r22:r23
1103 .global bmw256_ctx2hash
1116 /*******************************************************************************
1117 * void bmw256(void* dest, const void* msg, uint32_t length_b){
1118 * bmw_small_ctx_t ctx;
1119 * bmw256_init(&ctx);
1120 * while(length_b>=BMW_SMALL_BLOCKSIZE){
1121 * bmw_small_nextBlock(&ctx, msg);
1122 * length_b -= BMW_SMALL_BLOCKSIZE;
1123 * msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B;
1125 * bmw_small_lastBlock(&ctx, msg, length_b);
1126 * bmw256_ctx2hash(dest, &ctx);
1129 * param dest: r24:r25
1130 * param msg: r22:r23
1131 * param length_b: r18:r21
1149 /*******************************************************************************
1150 * void bmw224(void* dest, const void* msg, uint32_t length_b){
1151 * bmw_small_ctx_t ctx;
1152 * bmw224_init(&ctx);
1153 * while(length_b>=BMW_SMALL_BLOCKSIZE){
1154 * bmw_small_nextBlock(&ctx, msg);
1155 * length_b -= BMW_SMALL_BLOCKSIZE;
1156 * msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B;
1158 * bmw_small_lastBlock(&ctx, msg, length_b);
1159 * bmw224_ctx2hash(dest, &ctx);
1162 * param dest: r24:r25
1163 * param msg: r22:r23
1164 * param length_b: r18:r21
1184 stack_alloc_large 64+4
1192 ldi r30, pm_lo8(init_lut)
1193 ldi r31, pm_hi8(init_lut)
1203 rcall bmw_small_nextBlock
1215 rcall bmw_small_lastBlock
1218 ldi r30, pm_lo8(c2h_lut)
1219 ldi r31, pm_hi8(c2h_lut)
1223 stack_free_large 64+4
1233 rjmp bmw224_ctx2hash
1234 rjmp bmw256_ctx2hash
1236 /*******************************************************************************
1237 * void bmw224_init(bmw224_ctx_t* ctx){
1239 * ctx->h[0] = 0x00010203;
1240 * for(i=1; i<16; ++i){
1241 * ctx->h[i] = ctx->h[i-1]+ 0x04040404;
1246 * param ctx: r24:r25
1289 /******************************************************************************/
1298 ldi r24, lo8(qdbg_str)
1299 ldi r25, hi8(qdbg_str)
1302 10: ldi r24, lo8(qdbg_str1)
1303 ldi r25, hi8(qdbg_str1)
1306 call cli_hexdump_byte
1307 ldi r24, lo8(qdbg_str2)
1308 ldi r25, hi8(qdbg_str2)
1313 call cli_hexdump_rev
1321 qdbg_str: .asciz "\r\nDBG Q: "
1322 qdbg_str1: .asciz "\r\n Q["
1323 qdbg_str2: .asciz "] = "
1334 ldi r24, lo8(Xdbg_str)
1335 ldi r25, hi8(Xdbg_str)
1342 10: ldi r24, lo8(Xdbg_str1)
1343 ldi r25, hi8(Xdbg_str1)
1350 call cli_hexdump_byte
1351 ldi r24, lo8(Xdbg_str2)
1352 ldi r25, hi8(Xdbg_str2)
1357 call cli_hexdump_rev
1367 Xdbg_str: .asciz "\r\nDBG "
1368 Xdbg_str1: .asciz "\r\n "
1369 Xdbg_str2: .asciz "] = "
1377 ldi r24, lo8(Xdbg_str)
1378 ldi r25, hi8(Xdbg_str)
1381 call cli_hexdump_byte
1383 call cli_hexdump_byte
1385 call cli_hexdump_byte
1387 call cli_hexdump_byte
1397 ldi r24, lo8(Xdbg_str)
1398 ldi r25, hi8(Xdbg_str)
1401 call cli_hexdump_byte
1403 call cli_hexdump_byte
1405 call cli_hexdump_byte
1407 call cli_hexdump_byte