3 This file is part of the AVR-Crypto-Lib.
4 Copyright (C) 2009 Daniel Otte (daniel.otte@rub.de)
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>.
21 * File: bmw_small-asm.S
24 * License: GPLv3 or later
25 * Description: implementation of BlueMidnightWish
29 #include "avr-asm-macros.S"
51 ; .byte 0x21 ; 17 unused but necessary for padding
55 /*******************************************************************************
71 /*******************************************************************************
87 /*******************************************************************************
104 /*******************************************************************************
122 /*******************************************************************************
129 ldi r30, lo8(shiftcodetable_9)
130 ldi r31, hi8(shiftcodetable_9)
155 /*******************************************************************************
156 * uint32_t rotl_addel(uint32_t x, uint8_t v){
158 * r = ROTL32(x, (v&0xf)+1);
167 ldi r30, lo8(shiftcodetable_1)
168 ldi r31, hi8(shiftcodetable_1)
192 /******************************************************************************/
194 preg0 = 22 /* preg for processing register */
198 breg0 = 26 /* breg for backup register */
202 areg0 = 0 /* areg for accumulator register */
207 /*******************************************************************************
208 * uint32_t bmw_small_s0(uint32_t x){
241 /* now the trick, we simply can rotate the old value to the right by 17 */
242 movw breg0, preg0 /* first rotate by 16 */
256 /*******************************************************************************
257 * uint32_t bmw_small_s1(uint32_t x){
292 /*******************************************************************************
293 * uint32_t bmw_small_s2(uint32_t x){
334 /*******************************************************************************
335 * uint32_t bmw_small_s3(uint32_t x){
374 /*******************************************************************************
375 * uint32_t bmw_small_s4(uint32_t x){
390 /*******************************************************************************
391 * uint32_t bmw_small_s5(uint32_t x){
406 /*******************************************************************************
407 * uint32_t bmw_small_r1(uint32_t x){
418 /*******************************************************************************
419 * uint32_t bmw_small_r2(uint32_t x){
430 /*******************************************************************************
431 * uint32_t bmw_small_r3(uint32_t x){
446 /*******************************************************************************
447 * uint32_t bmw_small_r4(uint32_t x){
460 /*******************************************************************************
461 * uint32_t bmw_small_r5(uint32_t x){
475 /*******************************************************************************
476 * uint32_t bmw_small_r6(uint32_t x){
492 /*******************************************************************************
493 * uint32_t bmw_small_r7(uint32_t x){
504 /******************************************************************************/
507 .long 0x55555550, 0x5aaaaaa5, 0x5ffffffa, 0x6555554f
508 .long 0x6aaaaaa4, 0x6ffffff9, 0x7555554e, 0x7aaaaaa3
509 .long 0x7ffffff8, 0x8555554d, 0x8aaaaaa2, 0x8ffffff7
510 .long 0x9555554c, 0x9aaaaaa1, 0x9ffffff6, 0xa555554b
512 /*******************************************************************************
513 * uint32_t addelement(uint8_t j, const uint32_t* m, const uint32_t* h){
515 * r = pgm_read_dword(k_lut+j);
516 * r += rotl_addel(((uint32_t*)m)[j&0xf], j+0);
517 * r += rotl_addel(((uint32_t*)m)[(j+3)&0xf], j+3);
518 * r -= rotl_addel(((uint32_t*)m)[(j+10)&0xf], j+10);
519 * r ^= ((uint32_t*)h)[(j+7)&0xf];
543 ldi r30, lo8(const_lut)
544 ldi r31, hi8(const_lut)
627 /*******************************************************************************
628 * uint32_t bmw_small_expand1(uint8_t j, const void* m, const void* h, const uint32_t* q){
629 * uint32_t(*s[])(uint32_t) = {bmw_small_s1, bmw_small_s2, bmw_small_s3, bmw_small_s0};
632 * r = addelement(j, m, h);
635 * r += s[i%4](q[j+i]);
648 .global bmw_small_expand1
710 /*******************************************************************************
711 * uint32_t bmw_small_expand2(uint8_t j, const void* m, const void* h, const uint32_t* q){
712 * uint32_t(*rf[])(uint32_t) = {bmw_small_r1, bmw_small_r2, bmw_small_r3,
713 * bmw_small_r4, bmw_small_r5, bmw_small_r6,
717 * r = addelement(j, m, h);
718 * for(i=0; i<14; i+=2){
721 * for(i=0; i<14; i+=2){
722 * r += rf[i/2](q[j+i+1]);
724 * r += bmw_small_s4(q[j+14]);
725 * r += bmw_small_s5(q[j+15]);
747 .global bmw_small_expand2
762 ldi r30, pm_lo8(expand2_jumptable)
763 ldi r31, pm_hi8(expand2_jumptable)
783 /*******************************************************************************
784 * void bmw_small_f1(uint32_t* q, const void* m, const void* h){
786 * q[16] = bmw_small_expand1(0, m, h, q);
787 * q[17] = bmw_small_expand1(1, m, h, q);
788 * for(i=2; i<16; ++i){
789 * q[16+i] = bmw_small_expand2(i, m, h, q);
811 clr r25 /* not required */
813 rcall bmw_small_expand1
820 clr r25 /* not required */
824 rcall bmw_small_expand1
835 rcall bmw_small_expand2
848 /*******************************************************************************
849 * uint16_t hack_table[5] PROGMEM = { 0x0311, 0xDDB3, 0x2A79, 0x07AA, 0x51C2 };
850 * uint8_t offset_table[5] PROGMEM = { 4+16, 6+16, 9+16, 12+16, 13+16 };
852 * void bmw_small_f0(uint32_t* h, const void* m, uint32_t* q){
855 * uint32_t(*s[])(uint32_t)={ bmw_small_s0, bmw_small_s1, bmw_small_s2,
856 * bmw_small_s3, bmw_small_s4 };
857 * for(i=0; i<16; ++i){
858 * ((uint32_t*)h)[i] ^= ((uint32_t*)m)[i];
860 * dump_x(h, 16, 'T');
861 * memset(q, 0, 4*16);
865 * j=pgm_read_byte(offset_table+c);
866 * hack_reg=pgm_read_word(&(hack_table[c]));
877 * dump_x(q, 16, 'W');
878 * for(i=0; i<16; ++i){
879 * q[i] = s[i%5](q[i]);
881 * for(i=0; i<16; ++i){
882 * ((uint32_t*)h)[i] ^= ((uint32_t*)m)[i];
884 * for(i=0; i<16; ++i){
885 * q[i] += h[(i+1)&0xf];
964 /* h[i] ^= m[i]; q[i]= 0 */
1001 movw r26, h0 ; X = h
1004 movw r30, q0 ; Z = q
1006 movw r26, h0 ; X = h
1012 movw r26, h0 ; X = h
1015 movw r30, q0 ; Z = q
1017 movw r26, h0 ; X = h
1023 movw r26, h0 ; X = h
1026 movw r30, q0 ; Z = q
1028 movw r26, h0 ; X = h
1031 ;--------------- h[i] ^= m[i]
1041 ;--------------- q[i] = s[i%5](q[i])
1043 ldi r30, pm_lo8(f0_jumptable)
1044 ldi r31, pm_hi8(f0_jumptable)
1048 movw r28, q0 ; Y = q
1064 ;--------------- q[i] += h[(i+1)%16]
1111 /*******************************************************************************
1112 * void bmw_small_f2(uint32_t* h, const uint32_t* q, const void* m){
1113 * uint32_t xl=0, xh;
1115 * for(i=16;i<24;++i){
1119 * for(i=24;i<32;++i){
1122 * memcpy(h, m, 16*4);
1123 * h[0] ^= SHL32(xh, 5) ^ SHR32(q[16], 5);
1124 * h[5] ^= SHL32(xh, 6) ^ SHR32(q[21], 6);
1125 * h[3] ^= SHR32(xh, 1) ^ SHL32(q[19], 5);
1126 * h[4] ^= SHR32(xh, 3) ^ q[20];
1127 * h[6] ^= SHR32(xh, 4) ^ SHL32(q[22], 6);
1128 * h[2] ^= SHR32(xh, 5) ^ SHL32(q[18], 5);
1129 * h[1] ^= SHR32(xh, 7) ^ SHL32(q[17], 8);
1130 * h[7] ^= SHR32(xh,11) ^ SHL32(q[23], 2);
1131 * for(i=0; i<8; ++i){
1132 * h[i] += xl ^ q[24+i] ^ q[i];
1134 * for(i=0; i<8; ++i){
1135 * h[8+i] ^= xh ^ q[24+i];
1136 * h[8+i] += ROTL32(h[(4+i)%8],i+9);
1138 * h[11] += SHL32(xl, 4) ^ q[18] ^ q[11];
1139 * h[10] += SHL32(xl, 6) ^ q[17] ^ q[10];
1140 * h[ 8] += SHL32(xl, 8) ^ q[23] ^ q[ 8];
1141 * h[15] += SHR32(xl, 2) ^ q[22] ^ q[15];
1142 * h[12] += SHR32(xl, 3) ^ q[19] ^ q[12];
1143 * h[13] += SHR32(xl, 4) ^ q[20] ^ q[13];
1144 * h[ 9] += SHR32(xl, 6) ^ q[16] ^ q[ 9];
1145 * h[14] += SHR32(xl, 7) ^ q[21] ^ q[14];
1170 .macro modify_h_2 addr:req
1171 ldd r22, Y+\addr*4+0
1172 ldd r23, Y+\addr*4+1
1173 ldd r24, Y+\addr*4+2
1174 ldd r25, Y+\addr*4+3
1211 .global bmw_small_f2
1213 /* memcpy(h, m, 64) */
1226 /* for(i=16;i<24;++i){
1232 adiw r26, 1 ; X points at q[16]
1270 adiw r28, 60 ; Y points at q[15]
1271 /* h[0] ^= SHL32(xh, 5) ^ SHR32(q[16], 5); */
1298 /* h[5] ^= SHL32(xh, 6) ^ SHR32(q[21], 6); */
1325 /* h[3] ^= SHR32(xh, 1) ^ SHL32(q[19], 5); */
1354 /* h[4] ^= SHR32(xh, 3) ^ q[20]; */
1377 /* h[6] ^= SHR32(xh, 4) ^ SHL32(q[22], 6); */
1404 /* h[2] ^= SHR32(xh, 5) ^ SHL32(q[18], 5); */
1431 /* h[1] ^= SHR32(xh, 7) ^ SHL32(q[17], 8); */
1453 /* h[7] ^= SHR32(xh,11) ^ SHL32(q[23], 2); */
1478 /* for(i=0; i<8; ++i){
1479 * h[i] += xl ^ q[24+i] ^ q[i];
1521 /* for(i=0; i<8; ++i){
1522 h[8+i] ^= xh ^ q[24+i];
1523 h[8+i] += ROTL32(h[(4+i)%8],i+9);
1528 sbiw r28, 8*4 ; Y points at q[24]
1530 sbiw r26, 4*4 ; X points at h[4]
1569 sbiw r30, 4*8 ; adjust Z to point at h[8]
1571 sbiw r28, 1 ; adjust Y to point at q[16]
1573 sbiw r26, 7*4 ; adjust X to point at q[9]
1575 20: /* now we do the memxor stuff */
1584 sbiw r26, 4*8 ; X points at q[8]
1590 /* h[ 8] += SHL32(xl, 8) ^ q[23] ^ q[ 8]; */
1621 ; X points at q[9] but we won't need it anymore
1623 /* h[11] += SHL32(xl, 4) ^ q[11]; */
1629 /* h[10] += SHL32(xl, 6) ^ q[10]; */
1633 /* h[15] += SHR32(xl, 2) ^ q[15]; */
1639 /* h[12] += SHR32(xl, 3) ^ q[12]; */
1643 /* h[13] += SHR32(xl, 4) ^ q[13]; */
1647 /* h[ 9] += SHR32(xl, 6) ^ q[ 9]; */
1651 /* h[14] += SHR32(xl, 7) ^ q[14]; */
1669 ldi r30, lo8(hextable)
1670 ldi r31, hi8(hextable)
1678 ldi r30, lo8(hextable)
1679 ldi r31, hi8(hextable)
1690 .byte '0', '1', '2', '3', '4', '5', '6', '7'
1691 .byte '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
1701 /*******************************************************************************
1702 * void bmw_small_nextBlock(bmw_small_ctx_t* ctx, const void* block){
1704 * dump_x(block, 16, 'M');
1705 * bmw_small_f0(ctx->h, block, q);
1706 * dump_x(q, 16, 'Q');
1707 * bmw_small_f1(q, block, ctx->h);
1708 * dump_x(q, 32, 'Q');
1709 * bmw_small_f2(ctx->h, q, block);
1710 * ctx->counter += 1;
1714 * param ctx: r24:r25
1715 * param block: r22:r23
1723 .global bmw_small_nextBlock
1724 .global bmw224_nextBlock
1725 .global bmw256_nextBlock
1726 bmw_small_nextBlock:
1731 stack_alloc_large 32*4, 30, 31
1736 /* increment counter */
1752 /* call bmw_small_f0(ctx->h, block, q) */
1756 push_ q1, q0, b1, b0, h1, h0
1758 /* call bmw_small_f1(q, block, ctx->h) */
1759 pop_ 20, 21, 22, 23, 24, 25,
1760 push_ 21, 20, 25, 24, 23, 22
1762 /* call bmw_small_f2(ctx->h, q, block) */
1763 pop_ 20, 21, 22, 23, 24, 25,
1765 stack_free_large3 32*4
1771 /*******************************************************************************
1772 * void bmw224_init(bmw224_ctx_t* ctx){
1774 * ctx->h[0] = 0x00010203;
1775 * for(i=1; i<16; ++i){
1776 * ctx->h[i] = ctx->h[i-1]+ 0x04040404;
1781 * param ctx: r24:r25
1823 /*******************************************************************************
1824 * void bmw_small_lastBlock(bmw_small_ctx_t* ctx, const void* block, uint16_t length_b){
1826 * uint8_t buffer[64];
1829 * while(length_b >= BMW_SMALL_BLOCKSIZE){
1830 * bmw_small_nextBlock(ctx, block);
1831 * length_b -= BMW_SMALL_BLOCKSIZE;
1832 * block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
1834 * memset(pctx.buffer, 0, 64);
1835 * memcpy(pctx.buffer, block, (length_b+7)/8);
1836 * pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
1837 * if(length_b+1>64*8-64){
1838 * bmw_small_nextBlock(ctx, pctx.buffer);
1839 * memset(pctx.buffer, 0, 64-8);
1840 * ctx->counter -= 1;
1842 * *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
1843 * bmw_small_nextBlock(ctx, pctx.buffer);
1845 * memset(pctx.buffer, 0xaa, 64);
1846 * for(i=0; i<16;++i){
1847 * pctx.buffer[i*4] = i+0xa0;
1849 * bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
1850 * memcpy(ctx->h, pctx.buffer, 64);
1853 * param ctx: r24:r25
1854 * param block: r22:r23
1855 * param length_b: r20:r21
1866 .global bmw_small_lastBlock
1867 .global bmw224_lastBlock
1868 .global bmw256_lastBlock
1869 bmw_small_lastBlock:
1872 /* while(length_b >= BMW_SMALL_BLOCKSIZE){
1873 bmw_small_nextBlock(ctx, block);
1874 length_b -= BMW_SMALL_BLOCKSIZE;
1875 block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
1888 rcall bmw_small_nextBlock
1900 stack_alloc_large 68
1903 /* memset(pctx.buffer, 0, 64);
1904 memcpy(pctx.buffer, block, (length_b+7)/8);
1905 pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
1941 /* if(length_b+1>64*8-64){ ; = 64*7-1 = 447 max(length_b)=511
1942 bmw_small_nextBlock(ctx, pctx.buffer);
1943 memset(pctx.buffer, 0, 64-8);
1953 rcall bmw_small_nextBlock
1971 /* *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
1972 bmw_small_nextBlock(ctx, pctx.buffer);
2006 rcall bmw_small_nextBlock
2007 /* memset(pctx.buffer, 0xaa, 64);
2009 pctx.buffer[i*4] = i+0xa0;
2023 /* bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
2024 memcpy(ctx->h, pctx.buffer, 64);
2028 rcall bmw_small_nextBlock
2043 /*******************************************************************************
2044 * void bmw224_ctx2hash(void* dest, const bmw224_ctx_t* ctx){
2045 * memcpy(dest, &(ctx->h[9]), 224/8);
2048 * param dest: r24:r25
2049 * param ctx: r22:r23
2051 .global bmw224_ctx2hash
2059 /*******************************************************************************
2060 * void bmw256_ctx2hash(void* dest, const bmw256_ctx_t* ctx){
2061 * memcpy(dest, &(ctx->h[8]), 256/8);
2064 * param dest: r24:r25
2065 * param ctx: r22:r23
2067 .global bmw256_ctx2hash
2080 /*******************************************************************************
2081 * void bmw256(void* dest, const void* msg, uint32_t length_b){
2082 * bmw_small_ctx_t ctx;
2083 * bmw256_init(&ctx);
2084 * while(length_b>=BMW_SMALL_BLOCKSIZE){
2085 * bmw_small_nextBlock(&ctx, msg);
2086 * length_b -= BMW_SMALL_BLOCKSIZE;
2087 * msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B;
2089 * bmw_small_lastBlock(&ctx, msg, length_b);
2090 * bmw256_ctx2hash(dest, &ctx);
2093 * param dest: r24:r25
2094 * param msg: r22:r23
2095 * param length_b: r18:r21
2113 /*******************************************************************************
2114 * void bmw224(void* dest, const void* msg, uint32_t length_b){
2115 * bmw_small_ctx_t ctx;
2116 * bmw224_init(&ctx);
2117 * while(length_b>=BMW_SMALL_BLOCKSIZE){
2118 * bmw_small_nextBlock(&ctx, msg);
2119 * length_b -= BMW_SMALL_BLOCKSIZE;
2120 * msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B;
2122 * bmw_small_lastBlock(&ctx, msg, length_b);
2123 * bmw224_ctx2hash(dest, &ctx);
2126 * param dest: r24:r25
2127 * param msg: r22:r23
2128 * param length_b: r18:r21
2147 stack_alloc_large 64+4
2155 ldi r30, pm_lo8(init_lut)
2156 ldi r31, pm_hi8(init_lut)
2166 rcall bmw_small_nextBlock
2179 rcall bmw_small_lastBlock
2182 ldi r30, pm_lo8(c2h_lut)
2183 ldi r31, pm_hi8(c2h_lut)
2187 stack_free_large 64+4
2196 rjmp bmw224_ctx2hash
2197 rjmp bmw256_ctx2hash