3 This file is part of the AVR-Crypto-Lib.
4 Copyright (C) 2009 Daniel Otte (daniel.otte@rub.de)
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>.
21 * File: bmw_small-asm.S
24 * License: GPLv3 or later
25 * Description: implementation of BlueMidnightWish
29 #include "avr-asm-macros.S"
51 ; .byte 0x21 ; 17 unused but necessary for padding
55 /*******************************************************************************
71 /*******************************************************************************
87 /*******************************************************************************
104 /*******************************************************************************
122 /*******************************************************************************
129 ldi r30, lo8(shiftcodetable_9)
130 ldi r31, hi8(shiftcodetable_9)
155 /*******************************************************************************
156 * uint32_t rotl_addel(uint32_t x, uint8_t v){
158 * r = ROTL32(x, (v&0xf)+1);
167 ldi r30, lo8(shiftcodetable_1)
168 ldi r31, hi8(shiftcodetable_1)
192 /******************************************************************************/
194 preg0 = 22 /* preg for processing register */
198 breg0 = 26 /* breg for backup register */
202 areg0 = 0 /* areg for accumulator register */
207 /*******************************************************************************
208 * uint32_t bmw_small_s0(uint32_t x){
241 /* now the trick: we can simply rotate the old value to the right by 17 */
242 movw breg0, preg0 /* first rotate by 16 */
256 /*******************************************************************************
257 * uint32_t bmw_small_s1(uint32_t x){
292 /*******************************************************************************
293 * uint32_t bmw_small_s2(uint32_t x){
334 /*******************************************************************************
335 * uint32_t bmw_small_s3(uint32_t x){
374 /*******************************************************************************
375 * uint32_t bmw_small_s4(uint32_t x){
390 /*******************************************************************************
391 * uint32_t bmw_small_s5(uint32_t x){
406 /*******************************************************************************
407 * uint32_t bmw_small_r1(uint32_t x){
418 /*******************************************************************************
419 * uint32_t bmw_small_r2(uint32_t x){
430 /*******************************************************************************
431 * uint32_t bmw_small_r3(uint32_t x){
446 /*******************************************************************************
447 * uint32_t bmw_small_r4(uint32_t x){
460 /*******************************************************************************
461 * uint32_t bmw_small_r5(uint32_t x){
475 /*******************************************************************************
476 * uint32_t bmw_small_r6(uint32_t x){
492 /*******************************************************************************
493 * uint32_t bmw_small_r7(uint32_t x){
504 /******************************************************************************/
507 .long 0x55555550, 0x5aaaaaa5, 0x5ffffffa, 0x6555554f
508 .long 0x6aaaaaa4, 0x6ffffff9, 0x7555554e, 0x7aaaaaa3
509 .long 0x7ffffff8, 0x8555554d, 0x8aaaaaa2, 0x8ffffff7
510 .long 0x9555554c, 0x9aaaaaa1, 0x9ffffff6, 0xa555554b
512 /*******************************************************************************
513 * uint32_t addelement(uint8_t j, const uint32_t* m, const uint32_t* h){
515 * r = pgm_read_dword(k_lut+j);
516 * r += rotl_addel(((uint32_t*)m)[j&0xf], j+0);
517 * r += rotl_addel(((uint32_t*)m)[(j+3)&0xf], j+3);
518 * r -= rotl_addel(((uint32_t*)m)[(j+10)&0xf], j+10);
519 * r ^= ((uint32_t*)h)[(j+7)&0xf];
544 ldi r30, lo8(const_lut)
545 ldi r31, hi8(const_lut)
628 /*******************************************************************************
629 * uint32_t bmw_small_expand1(uint8_t j, const void* m, const void* h, const uint32_t* q){
630 * uint32_t(*s[])(uint32_t) = {bmw_small_s1, bmw_small_s2, bmw_small_s3, bmw_small_s0};
633 * r = addelement(j, m, h);
636 * r += s[i%4](q[j+i]);
649 .global bmw_small_expand1
711 /*******************************************************************************
712 * uint32_t bmw_small_expand2(uint8_t j, const void* m, const void* h, const uint32_t* q){
713 * uint32_t(*rf[])(uint32_t) = {bmw_small_r1, bmw_small_r2, bmw_small_r3,
714 * bmw_small_r4, bmw_small_r5, bmw_small_r6,
718 * r = addelement(j, m, h);
719 * for(i=0; i<14; i+=2){
722 * for(i=0; i<14; i+=2){
723 * r += rf[i/2](q[j+i+1]);
725 * r += bmw_small_s4(q[j+14]);
726 * r += bmw_small_s5(q[j+15]);
748 .global bmw_small_expand2
763 ldi r30, pm_lo8(expand2_jumptable)
764 ldi r31, pm_hi8(expand2_jumptable)
784 /*******************************************************************************
785 * void bmw_small_f1(uint32_t* q, const void* m, const void* h){
787 * q[16] = bmw_small_expand1(0, m, h, q);
788 * q[17] = bmw_small_expand1(1, m, h, q);
789 * for(i=2; i<16; ++i){
790 * q[16+i] = bmw_small_expand2(i, m, h, q);
812 clr r25 /* not required */
814 rcall bmw_small_expand1
821 clr r25 /* not required */
825 rcall bmw_small_expand1
836 rcall bmw_small_expand2
849 /*******************************************************************************
850 * uint16_t hack_table[5] PROGMEM = { 0x0311, 0xDDB3, 0x2A79, 0x07AA, 0x51C2 };
851 * uint8_t offset_table[5] PROGMEM = { 4+16, 6+16, 9+16, 12+16, 13+16 };
853 * void bmw_small_f0(uint32_t* h, const void* m, uint32_t* q){
856 * uint32_t(*s[])(uint32_t)={ bmw_small_s0, bmw_small_s1, bmw_small_s2,
857 * bmw_small_s3, bmw_small_s4 };
858 * for(i=0; i<16; ++i){
859 * ((uint32_t*)h)[i] ^= ((uint32_t*)m)[i];
861 * dump_x(h, 16, 'T');
862 * memset(q, 0, 4*16);
866 * j=pgm_read_byte(offset_table+c);
867 * hack_reg=pgm_read_word(&(hack_table[c]));
878 * dump_x(q, 16, 'W');
879 * for(i=0; i<16; ++i){
880 * q[i] = s[i%5](q[i]);
882 * for(i=0; i<16; ++i){
883 * ((uint32_t*)h)[i] ^= ((uint32_t*)m)[i];
885 * for(i=0; i<16; ++i){
886 * q[i] += h[(i+1)&0xf];
965 /* h[i] ^= m[i]; q[i]= 0 */
1002 movw r26, h0 ; X = h
1005 movw r30, q0 ; Z = q
1007 movw r26, h0 ; X = h
1013 movw r26, h0 ; X = h
1016 movw r30, q0 ; Z = q
1018 movw r26, h0 ; X = h
1024 movw r26, h0 ; X = h
1027 movw r30, q0 ; Z = q
1029 movw r26, h0 ; X = h
1032 ;--------------- h[i] ^= m[i]
1042 ;--------------- q[i] = s[i%5](q[i])
1044 ldi r30, pm_lo8(f0_jumptable)
1045 ldi r31, pm_hi8(f0_jumptable)
1049 movw r28, q0 ; Y = q
1065 ;--------------- q[i] += h[(i+1)%16]
1112 /*******************************************************************************
1113 * void bmw_small_f2(uint32_t* h, const uint32_t* q, const void* m){
1114 * uint32_t xl=0, xh;
1116 * for(i=16;i<24;++i){
1120 * for(i=24;i<32;++i){
1123 * memcpy(h, m, 16*4);
1124 * h[0] ^= SHL32(xh, 5) ^ SHR32(q[16], 5);
1125 * h[5] ^= SHL32(xh, 6) ^ SHR32(q[21], 6);
1126 * h[3] ^= SHR32(xh, 1) ^ SHL32(q[19], 5);
1127 * h[4] ^= SHR32(xh, 3) ^ q[20];
1128 * h[6] ^= SHR32(xh, 4) ^ SHL32(q[22], 6);
1129 * h[2] ^= SHR32(xh, 5) ^ SHL32(q[18], 5);
1130 * h[1] ^= SHR32(xh, 7) ^ SHL32(q[17], 8);
1131 * h[7] ^= SHR32(xh,11) ^ SHL32(q[23], 2);
1132 * for(i=0; i<8; ++i){
1133 * h[i] += xl ^ q[24+i] ^ q[i];
1135 * for(i=0; i<8; ++i){
1136 * h[8+i] ^= xh ^ q[24+i];
1137 * h[8+i] += ROTL32(h[(4+i)%8],i+9);
1139 * h[11] += SHL32(xl, 4) ^ q[18] ^ q[11];
1140 * h[10] += SHL32(xl, 6) ^ q[17] ^ q[10];
1141 * h[ 8] += SHL32(xl, 8) ^ q[23] ^ q[ 8];
1142 * h[15] += SHR32(xl, 2) ^ q[22] ^ q[15];
1143 * h[12] += SHR32(xl, 3) ^ q[19] ^ q[12];
1144 * h[13] += SHR32(xl, 4) ^ q[20] ^ q[13];
1145 * h[ 9] += SHR32(xl, 6) ^ q[16] ^ q[ 9];
1146 * h[14] += SHR32(xl, 7) ^ q[21] ^ q[14];
1171 .macro modify_h_2 addr:req
1172 ldd r22, Y+\addr*4+0
1173 ldd r23, Y+\addr*4+1
1174 ldd r24, Y+\addr*4+2
1175 ldd r25, Y+\addr*4+3
1212 .global bmw_small_f2
1214 /* memcpy(h, m, 64) */
1227 /* for(i=16;i<24;++i){
1233 adiw r26, 1 ; X points at q[16]
1271 adiw r28, 60 ; Y points at q[15]
1272 /* h[0] ^= SHL32(xh, 5) ^ SHR32(q[16], 5); */
1299 /* h[5] ^= SHL32(xh, 6) ^ SHR32(q[21], 6); */
1326 /* h[3] ^= SHR32(xh, 1) ^ SHL32(q[19], 5); */
1355 /* h[4] ^= SHR32(xh, 3) ^ q[20]; */
1378 /* h[6] ^= SHR32(xh, 4) ^ SHL32(q[22], 6); */
1405 /* h[2] ^= SHR32(xh, 5) ^ SHL32(q[18], 5); */
1432 /* h[1] ^= SHR32(xh, 7) ^ SHL32(q[17], 8); */
1454 /* h[7] ^= SHR32(xh,11) ^ SHL32(q[23], 2); */
1479 /* for(i=0; i<8; ++i){
1480 * h[i] += xl ^ q[24+i] ^ q[i];
1522 /* for(i=0; i<8; ++i){
1523 h[8+i] ^= xh ^ q[24+i];
1524 h[8+i] += ROTL32(h[(4+i)%8],i+9);
1529 sbiw r28, 8*4 ; Y points at q[24]
1531 sbiw r26, 4*4 ; X points at h[4]
1570 sbiw r30, 4*8 ; adjust Z to point at h[8]
1572 sbiw r28, 1 ; adjust Y to point at q[16]
1574 sbiw r26, 7*4 ; adjust X to point at q[9]
1576 20: /* now we do the memxor stuff */
1585 sbiw r26, 4*8 ; X points at q[8]
1591 /* h[ 8] += SHL32(xl, 8) ^ q[23] ^ q[ 8]; */
1622 ; X points at q[9] but we won't need it anymore
1624 /* h[11] += SHL32(xl, 4) ^ q[11]; */
1630 /* h[10] += SHL32(xl, 6) ^ q[10]; */
1634 /* h[15] += SHR32(xl, 2) ^ q[15]; */
1640 /* h[12] += SHR32(xl, 3) ^ q[12]; */
1644 /* h[13] += SHR32(xl, 4) ^ q[13]; */
1648 /* h[ 9] += SHR32(xl, 6) ^ q[ 9]; */
1652 /* h[14] += SHR32(xl, 7) ^ q[14]; */
1670 ldi r30, lo8(hextable)
1671 ldi r31, hi8(hextable)
1679 ldi r30, lo8(hextable)
1680 ldi r31, hi8(hextable)
1691 .byte '0', '1', '2', '3', '4', '5', '6', '7'
1692 .byte '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
1702 /*******************************************************************************
1703 * void bmw_small_nextBlock(bmw_small_ctx_t* ctx, const void* block){
1705 * dump_x(block, 16, 'M');
1706 * bmw_small_f0(ctx->h, block, q);
1707 * dump_x(q, 16, 'Q');
1708 * bmw_small_f1(q, block, ctx->h);
1709 * dump_x(q, 32, 'Q');
1710 * bmw_small_f2(ctx->h, q, block);
1711 * ctx->counter += 1;
1715 * param ctx: r24:r25
1716 * param block: r22:r23
1724 .global bmw_small_nextBlock
1725 .global bmw224_nextBlock
1726 .global bmw256_nextBlock
1727 bmw_small_nextBlock:
1732 stack_alloc_large 32*4, 30, 31
1737 /* increment counter */
1753 /* call bmw_small_f0(ctx->h, block, q) */
1757 push_ q1, q0, b1, b0, h1, h0
1759 /* call bmw_small_f1(q, block, ctx->h) */
1760 pop_ 20, 21, 22, 23, 24, 25,
1761 push_ 21, 20, 25, 24, 23, 22
1763 /* call bmw_small_f2(ctx->h, q, block) */
1764 pop_ 20, 21, 22, 23, 24, 25,
1766 stack_free_large3 32*4
1772 /*******************************************************************************
1773 * void bmw224_init(bmw224_ctx_t* ctx){
1775 * ctx->h[0] = 0x00010203;
1776 * for(i=1; i<16; ++i){
1777 * ctx->h[i] = ctx->h[i-1]+ 0x04040404;
1782 * param ctx: r24:r25
1824 /*******************************************************************************
1825 * void bmw_small_lastBlock(bmw_small_ctx_t* ctx, const void* block, uint16_t length_b){
1827 * uint8_t buffer[64];
1830 * while(length_b >= BMW_SMALL_BLOCKSIZE){
1831 * bmw_small_nextBlock(ctx, block);
1832 * length_b -= BMW_SMALL_BLOCKSIZE;
1833 * block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
1835 * memset(pctx.buffer, 0, 64);
1836 * memcpy(pctx.buffer, block, (length_b+7)/8);
1837 * pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
1838 * if(length_b+1>64*8-64){
1839 * bmw_small_nextBlock(ctx, pctx.buffer);
1840 * memset(pctx.buffer, 0, 64-8);
1841 * ctx->counter -= 1;
1843 * *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
1844 * bmw_small_nextBlock(ctx, pctx.buffer);
1846 * memset(pctx.buffer, 0xaa, 64);
1847 * for(i=0; i<16;++i){
1848 * pctx.buffer[i*4] = i+0xa0;
1850 * bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
1851 * memcpy(ctx->h, pctx.buffer, 64);
1854 * param ctx: r24:r25
1855 * param block: r22:r23
1856 * param length_b: r20:r21
1867 .global bmw_small_lastBlock
1868 .global bmw224_lastBlock
1869 .global bmw256_lastBlock
1870 bmw_small_lastBlock:
1873 /* while(length_b >= BMW_SMALL_BLOCKSIZE){
1874 bmw_small_nextBlock(ctx, block);
1875 length_b -= BMW_SMALL_BLOCKSIZE;
1876 block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
1889 rcall bmw_small_nextBlock
1901 stack_alloc_large 68
1904 /* memset(pctx.buffer, 0, 64);
1905 memcpy(pctx.buffer, block, (length_b+7)/8);
1906 pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
1942 /* if(length_b+1>64*8-64){ ; = 64*7-1 = 447 max(length_b)=511
1943 bmw_small_nextBlock(ctx, pctx.buffer);
1944 memset(pctx.buffer, 0, 64-8);
1954 rcall bmw_small_nextBlock
1972 /* *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
1973 bmw_small_nextBlock(ctx, pctx.buffer);
2007 rcall bmw_small_nextBlock
2008 /* memset(pctx.buffer, 0xaa, 64);
2010 pctx.buffer[i*4] = i+0xa0;
2024 /* bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
2025 memcpy(ctx->h, pctx.buffer, 64);
2029 rcall bmw_small_nextBlock
2044 /*******************************************************************************
2045 * void bmw224_ctx2hash(void* dest, const bmw224_ctx_t* ctx){
2046 * memcpy(dest, &(ctx->h[9]), 224/8);
2049 * param dest: r24:r25
2050 * param ctx: r22:r23
2052 .global bmw224_ctx2hash
2060 /*******************************************************************************
2061 * void bmw256_ctx2hash(void* dest, const bmw256_ctx_t* ctx){
2062 * memcpy(dest, &(ctx->h[8]), 256/8);
2065 * param dest: r24:r25
2066 * param ctx: r22:r23
2068 .global bmw256_ctx2hash
2081 /*******************************************************************************
2082 * void bmw256(void* dest, const void* msg, uint32_t length_b){
2083 * bmw_small_ctx_t ctx;
2084 * bmw256_init(&ctx);
2085 * while(length_b>=BMW_SMALL_BLOCKSIZE){
2086 * bmw_small_nextBlock(&ctx, msg);
2087 * length_b -= BMW_SMALL_BLOCKSIZE;
2088 * msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B;
2090 * bmw_small_lastBlock(&ctx, msg, length_b);
2091 * bmw256_ctx2hash(dest, &ctx);
2094 * param dest: r24:r25
2095 * param msg: r22:r23
2096 * param length_b: r18:r21
2114 /*******************************************************************************
2115 * void bmw224(void* dest, const void* msg, uint32_t length_b){
2116 * bmw_small_ctx_t ctx;
2117 * bmw224_init(&ctx);
2118 * while(length_b>=BMW_SMALL_BLOCKSIZE){
2119 * bmw_small_nextBlock(&ctx, msg);
2120 * length_b -= BMW_SMALL_BLOCKSIZE;
2121 * msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B;
2123 * bmw_small_lastBlock(&ctx, msg, length_b);
2124 * bmw224_ctx2hash(dest, &ctx);
2127 * param dest: r24:r25
2128 * param msg: r22:r23
2129 * param length_b: r18:r21
2148 stack_alloc_large 64+4
2156 ldi r30, pm_lo8(init_lut)
2157 ldi r31, pm_hi8(init_lut)
2167 rcall bmw_small_nextBlock
2180 rcall bmw_small_lastBlock
2183 ldi r30, pm_lo8(c2h_lut)
2184 ldi r31, pm_hi8(c2h_lut)
2188 stack_free_large 64+4
2197 rjmp bmw224_ctx2hash
2198 rjmp bmw256_ctx2hash