3 This file is part of the AVR-Crypto-Lib.
4 Copyright (C) 2009 Daniel Otte (daniel.otte@rub.de)
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>.
21 * File: bmw_small-asm.S
24 * License: GPLv3 or later
25 * Description: implementation of BlueMidnightWish
29 #include "avr-asm-macros.S"
50 .byte 0x21 ; 17 unused but necessary for padding
54 /*******************************************************************************
70 /*******************************************************************************
86 /*******************************************************************************
103 /*******************************************************************************
121 /*******************************************************************************
128 ldi r30, lo8(shiftcodetable_9)
129 ldi r31, hi8(shiftcodetable_9)
154 /*******************************************************************************
155 * uint32_t rotl_addel(uint32_t x, uint8_t v){
157 * r = ROTL32(x, (v&0xf)+1);
167 ldi r30, lo8(shiftcodetable)
168 ldi r31, hi8(shiftcodetable)
193 /******************************************************************************/
195 preg0 = 22 /* preg for processing register */
199 breg0 = 26 /* breg for backup register */
203 areg0 = 0 /* areg for accumulator register */
208 /*******************************************************************************
209 * uint32_t bmw_small_s0(uint32_t x){
242 /* now the trick, we simply can rotate the old value to the right by 17 */
243 movw breg0, preg0 /* first rotate by 16 */
257 /*******************************************************************************
258 * uint32_t bmw_small_s1(uint32_t x){
293 /*******************************************************************************
294 * uint32_t bmw_small_s2(uint32_t x){
335 /*******************************************************************************
336 * uint32_t bmw_small_s3(uint32_t x){
375 /*******************************************************************************
376 * uint32_t bmw_small_s4(uint32_t x){
391 /*******************************************************************************
392 * uint32_t bmw_small_s5(uint32_t x){
407 /*******************************************************************************
408 * uint32_t bmw_small_r1(uint32_t x){
419 /*******************************************************************************
420 * uint32_t bmw_small_r2(uint32_t x){
431 /*******************************************************************************
432 * uint32_t bmw_small_r3(uint32_t x){
447 /*******************************************************************************
448 * uint32_t bmw_small_r4(uint32_t x){
461 /*******************************************************************************
462 * uint32_t bmw_small_r5(uint32_t x){
476 /*******************************************************************************
477 * uint32_t bmw_small_r6(uint32_t x){
493 /*******************************************************************************
494 * uint32_t bmw_small_r7(uint32_t x){
505 /******************************************************************************/
/* Table of sixteen 32-bit constants. Entry i (i = 0..15) equals
 * (16 + i) * 0x05555555, e.g. 16 * 0x05555555 = 0x55555550 and
 * 31 * 0x05555555 = 0xa555554b; consecutive entries differ by 0x05555555.
 * NOTE(review): the label is outside this view; presumably this is the
 * const_lut loaded into Z by the addelement code and referred to as k_lut
 * in the pseudo-code (pgm_read_dword(k_lut+j)) -- confirm.
 */
508 .long 0x55555550, 0x5aaaaaa5, 0x5ffffffa, 0x6555554f
509 .long 0x6aaaaaa4, 0x6ffffff9, 0x7555554e, 0x7aaaaaa3
510 .long 0x7ffffff8, 0x8555554d, 0x8aaaaaa2, 0x8ffffff7
511 .long 0x9555554c, 0x9aaaaaa1, 0x9ffffff6, 0xa555554b
513 /*******************************************************************************
514 * uint32_t addelement(uint8_t j, const uint32_t* m, const uint32_t* h){
516 * r = pgm_read_dword(k_lut+j);
517 * r += rotl_addel(((uint32_t*)m)[j&0xf], j+0);
518 * r += rotl_addel(((uint32_t*)m)[(j+3)&0xf], j+3);
519 * r -= rotl_addel(((uint32_t*)m)[(j+10)&0xf], j+10);
520 * r ^= ((uint32_t*)h)[(j+7)&0xf];
545 ldi r30, lo8(const_lut)
546 ldi r31, hi8(const_lut)
629 /*******************************************************************************
630 * uint32_t bmw_small_expand1(uint8_t j, const void* m, const void* h, const uint32_t* q){
631 * uint32_t(*s[])(uint32_t) = {bmw_small_s1, bmw_small_s2, bmw_small_s3, bmw_small_s0};
634 * r = addelement(j, m, h);
637 * r += s[i%4](q[j+i]);
650 .global bmw_small_expand1
712 /*******************************************************************************
713 * uint32_t bmw_small_expand2(uint8_t j, const void* m, const void* h, const uint32_t* q){
714 * uint32_t(*rf[])(uint32_t) = {bmw_small_r1, bmw_small_r2, bmw_small_r3,
715 * bmw_small_r4, bmw_small_r5, bmw_small_r6,
719 * r = addelement(j, m, h);
720 * for(i=0; i<14; i+=2){
723 * for(i=0; i<14; i+=2){
724 * r += rf[i/2](q[j+i+1]);
726 * r += bmw_small_s4(q[j+14]);
727 * r += bmw_small_s5(q[j+15]);
749 .global bmw_small_expand2
764 ldi r30, pm_lo8(expand2_jumptable)
765 ldi r31, pm_hi8(expand2_jumptable)
785 /*******************************************************************************
786 * void bmw_small_f1(uint32_t* q, const void* m, const void* h){
788 * q[16] = bmw_small_expand1(0, m, h, q);
789 * q[17] = bmw_small_expand1(1, m, h, q);
790 * for(i=2; i<16; ++i){
791 * q[16+i] = bmw_small_expand2(i, m, h, q);
813 clr r25 /* not required */
815 rcall bmw_small_expand1
822 clr r25 /* not required */
826 rcall bmw_small_expand1
837 rcall bmw_small_expand2
850 /*******************************************************************************
851 * uint16_t hack_table[5] PROGMEM = { 0x0311, 0xDDB3, 0x2A79, 0x07AA, 0x51C2 };
852 * uint8_t offset_table[5] PROGMEM = { 4+16, 6+16, 9+16, 12+16, 13+16 };
854 * void bmw_small_f0(uint32_t* h, const void* m, uint32_t* q){
857 * uint32_t(*s[])(uint32_t)={ bmw_small_s0, bmw_small_s1, bmw_small_s2,
858 * bmw_small_s3, bmw_small_s4 };
859 * for(i=0; i<16; ++i){
860 * ((uint32_t*)h)[i] ^= ((uint32_t*)m)[i];
862 * dump_x(h, 16, 'T');
863 * memset(q, 0, 4*16);
867 * j=pgm_read_byte(offset_table+c);
868 * hack_reg=pgm_read_word(&(hack_table[c]));
879 * dump_x(q, 16, 'W');
880 * for(i=0; i<16; ++i){
881 * q[i] = s[i%5](q[i]);
883 * for(i=0; i<16; ++i){
884 * ((uint32_t*)h)[i] ^= ((uint32_t*)m)[i];
886 * for(i=0; i<16; ++i){
887 * q[i] += h[(i+1)&0xf];
966 /* h[i] ^= m[i]; q[i]= 0 */
1003 movw r26, h0 ; X = h
1006 movw r30, q0 ; Z = q
1008 movw r26, h0 ; X = h
1014 movw r26, h0 ; X = h
1017 movw r30, q0 ; Z = q
1019 movw r26, h0 ; X = h
1025 movw r26, h0 ; X = h
1028 movw r30, q0 ; Z = q
1030 movw r26, h0 ; X = h
1033 ;--------------- h[i] ^= m[i]
1043 ;--------------- q[i] = s[i%5](q[i])
1045 ldi r30, pm_lo8(f0_jumptable)
1046 ldi r31, pm_hi8(f0_jumptable)
1050 movw r28, q0 ; Y = q
1066 ;--------------- q[i] += h[(i+1)%16]
1113 /*******************************************************************************
1114 * void bmw_small_f2(uint32_t* h, const uint32_t* q, const void* m){
1115 * uint32_t xl=0, xh;
1117 * for(i=16;i<24;++i){
1121 * for(i=24;i<32;++i){
1124 * memcpy(h, m, 16*4);
1125 * h[0] ^= SHL32(xh, 5) ^ SHR32(q[16], 5);
1126 * h[5] ^= SHL32(xh, 6) ^ SHR32(q[21], 6);
1127 * h[3] ^= SHR32(xh, 1) ^ SHL32(q[19], 5);
1128 * h[4] ^= SHR32(xh, 3) ^ q[20];
1129 * h[6] ^= SHR32(xh, 4) ^ SHL32(q[22], 6);
1130 * h[2] ^= SHR32(xh, 5) ^ SHL32(q[18], 5);
1131 * h[1] ^= SHR32(xh, 7) ^ SHL32(q[17], 8);
1132 * h[7] ^= SHR32(xh,11) ^ SHL32(q[23], 2);
1133 * for(i=0; i<8; ++i){
1134 * h[i] += xl ^ q[24+i] ^ q[i];
1136 * for(i=0; i<8; ++i){
1137 * h[8+i] ^= xh ^ q[24+i];
1138 * h[8+i] += ROTL32(h[(4+i)%8],i+9);
1140 * h[11] += SHL32(xl, 4) ^ q[18] ^ q[11];
1141 * h[10] += SHL32(xl, 6) ^ q[17] ^ q[10];
1142 * h[ 8] += SHL32(xl, 8) ^ q[23] ^ q[ 8];
1143 * h[15] += SHR32(xl, 2) ^ q[22] ^ q[15];
1144 * h[12] += SHR32(xl, 3) ^ q[19] ^ q[12];
1145 * h[13] += SHR32(xl, 4) ^ q[20] ^ q[13];
1146 * h[ 9] += SHR32(xl, 6) ^ q[16] ^ q[ 9];
1147 * h[14] += SHR32(xl, 7) ^ q[21] ^ q[14];
1172 .macro modify_h_2 addr:req
1173 ldd r22, Y+\addr*4+0
1174 ldd r23, Y+\addr*4+1
1175 ldd r24, Y+\addr*4+2
1176 ldd r25, Y+\addr*4+3
1213 .global bmw_small_f2
1215 /* memcpy(h, m, 64) */
1228 /* for(i=16;i<24;++i){
1234 adiw r26, 1 ; X points at q[16]
1272 adiw r28, 60 ; Y points at q[15]
1273 /* h[0] ^= SHL32(xh, 5) ^ SHR32(q[16], 5); */
1300 /* h[5] ^= SHL32(xh, 6) ^ SHR32(q[21], 6); */
1327 /* h[3] ^= SHR32(xh, 1) ^ SHL32(q[19], 5); */
1356 /* h[4] ^= SHR32(xh, 3) ^ q[20]; */
1379 /* h[6] ^= SHR32(xh, 4) ^ SHL32(q[22], 6); */
1406 /* h[2] ^= SHR32(xh, 5) ^ SHL32(q[18], 5); */
1433 /* h[1] ^= SHR32(xh, 7) ^ SHL32(q[17], 8); */
1455 /* h[7] ^= SHR32(xh,11) ^ SHL32(q[23], 2); */
1480 /* for(i=0; i<8; ++i){
1481 * h[i] += xl ^ q[24+i] ^ q[i];
1523 /* for(i=0; i<8; ++i){
1524 h[8+i] ^= xh ^ q[24+i];
1525 h[8+i] += ROTL32(h[(4+i)%8],i+9);
1530 sbiw r28, 8*4 ; Y points at q[24]
1532 sbiw r26, 4*4 ; X points at h[4]
1571 sbiw r30, 4*8 ; adjust Z to point at h[8]
1573 sbiw r28, 1 ; adjust Y to point at q[16]
1575 sbiw r26, 7*4 ; adjust X to point at q[9]
1577 20: /* now we do the memxor stuff */
1586 sbiw r26, 4*8 ; X points at q[8]
1592 /* h[ 8] += SHL32(xl, 8) ^ q[23] ^ q[ 8]; */
1623 ; X points at q[9] but we won't need it anymore
1625 /* h[11] += SHL32(xl, 4) ^ q[11]; */
1631 /* h[10] += SHL32(xl, 6) ^ q[10]; */
1635 /* h[15] += SHR32(xl, 2) ^ q[15]; */
1641 /* h[12] += SHR32(xl, 3) ^ q[12]; */
1645 /* h[13] += SHR32(xl, 4) ^ q[13]; */
1649 /* h[ 9] += SHR32(xl, 6) ^ q[ 9]; */
1653 /* h[14] += SHR32(xl, 7) ^ q[14]; */
1669 ldi r30, lo8(hextable)
1670 ldi r31, hi8(hextable)
1678 ldi r30, lo8(hextable)
1679 ldi r31, hi8(hextable)
/* Sixteen ASCII characters '0'..'9', 'A'..'F' in ascending order, so the
 * byte at offset n (0..15) is the upper-case hexadecimal digit for n.
 * NOTE(review): the label is outside this view; presumably this is the
 * hextable whose address is loaded into Z (r31:r30) by the code above to
 * translate a nibble into a printable character -- confirm.
 */
1690 .byte '0', '1', '2', '3', '4', '5', '6', '7'
1691 .byte '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'