bmw/autogen_f1_neon_small.rb

   1 # autogen f1 function for BMW
   2 =begin
   3     This file is part of the ARM-Crypto-Lib.
   4     Copyright (C) 2006-2010  Daniel Otte (daniel.otte@rub.de)
   5
   6     This program is free software: you can redistribute it and/or modify
   7     it under the terms of the GNU General Public License as published by
   8     the Free Software Foundation, either version 3 of the License, or
   9     (at your option) any later version.
  10
  11     This program is distributed in the hope that it will be useful,
  12     but WITHOUT ANY WARRANTY; without even the implied warranty of
  13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14     GNU General Public License for more details.
  15
  16     You should have received a copy of the GNU General Public License
  17     along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 =end
  19
  20
  21 header = <<EOF
  22 /* BEGIN of automatic generated code */
  23 /*
  24     This file is part of the ARM-Crypto-Lib.
  25     Copyright (C) 2006-2010  Daniel Otte (daniel.otte@rub.de)
  26
  27     This program is free software: you can redistribute it and/or modify
  28     it under the terms of the GNU General Public License as published by
  29     the Free Software Foundation, either version 3 of the License, or
  30     (at your option) any later version.
  31
  32     This program is distributed in the hope that it will be useful,
  33     but WITHOUT ANY WARRANTY; without even the implied warranty of
  34     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  35     GNU General Public License for more details.
  36
  37     You should have received a copy of the GNU General Public License
  38     along with this program.  If not, see <http://www.gnu.org/licenses/>.
  39 */
  40
  41 static inline
  42 void bmw_small_f1(uint32_t* q, const uint32_t* m, const uint32_t* h){
  43   uint32_t even, odd;
  44   uint32x4_t qq16, qq20, qq24, qq28;
  45   uint32x4_t qm0, qm1, qm2;
  46   uint32x4_t qk={0x55555550UL, 0x5aaaaaa5UL, 0x5ffffffaUL, 0x6555554fUL};
  47   uint32x4_t qkadd={0x15555554UL, 0x15555554UL, 0x15555554UL, 0x15555554UL};
  48   uint32x2_t dtmp0;
  49   uint32x4x2_t q2tmp0, q2tmp1;
  50 EOF
  51
  52 footer = <<EOF
  53 }
  54
  55 /* END of automatic generated code */
  56
  57 EOF
  58
  59
  60 =begin
  61   uint32_t r;
  62   /* r = 0x05555555*(j+16); */
  63   r = (   ROTL32(((uint32_t*)m)[j],      ((j+0))+1  )
  64          + ROTL32(((uint32_t*)m)[(j+3)],  ((j+3))+1  )
  65          - ROTL32(((uint32_t*)m)[(j+10)], ((j+10))+1 )
  66          + k_lut[j]
  67        ) ^ ((uint32_t*)h)[(j+7)];
  68   r += S32_1(q[j+ 0]) + S32_2(q[j+ 1]) + S32_3(q[j+ 2]) + S32_0(q[j+ 3]) +
  69      S32_1(q[j+ 4]) + S32_2(q[j+ 5]) + S32_3(q[j+ 6]) + S32_0(q[j+ 7]) +
  70      S32_1(q[j+ 8]) + S32_2(q[j+ 9]) + S32_3(q[j+10]) + S32_0(q[j+11]) +
  71      S32_1(q[j+12]) + S32_2(q[j+13]) + S32_3(q[j+14]) + S32_0(q[j+15]);
  72 =end
  73 def gen_addElement(i)
  74   j = i-16;
  75   s  = sprintf("\n  /* addElement for q%d .. q%d */\n", i, i+3);
  76   s += sprintf("  qm0 = *((uint32x4_t*)&(m[%2d]));\n", j);
  77   s += sprintf("  qm1 = *((uint32x4_t*)&(m[%2d]));\n", j+3);
  78   s += sprintf("  qm2 = *((uint32x4_t*)&(m[%2d]));\n", j+10);
  79   s += sprintf("  qm0 = veorq_u32(vshlq_u32(qm0,(int32x4_t){%2d, %2d, %2d, %2d}),vshlq_u32(qm0,(int32x4_t){%2d, %2d, %2d, %2d}));\n",
  80                   (j+0)%16+1, (j+1)%16+1, (j+2)%16+1, (j+3)%16+1,
  81                   -(32-((j+0)%16+1)), -(32-((j+1)%16+1)), -(32-((j+2)%16+1)), -(32-((j+3)%16+1)))
  82   s += sprintf("  qm1 = veorq_u32(vshlq_u32(qm1,(int32x4_t){%2d, %2d, %2d, %2d}),vshlq_u32(qm1,(int32x4_t){%2d, %2d, %2d, %2d}));\n",
  83                   (j+3)%16+1, (j+4)%16+1, (j+5)%16+1, (j+6)%16+1,
  84                   -(32-((j+3)%16+1)), -(32-((j+4)%16+1)), -(32-((j+5)%16+1)), -(32-((j+6)%16+1)))
  85   s += sprintf("  qm2 = veorq_u32(vshlq_u32(qm2,(int32x4_t){%2d, %2d, %2d, %2d}),vshlq_u32(qm2,(int32x4_t){%2d, %2d, %2d, %2d}));\n",
  86                   (j+10)%16+1, (j+11)%16+1, (j+12)%16+1, (j+13)%16+1,
  87                   -(32-((j+10)%16+1)), -(32-((j+11)%16+1)), -(32-((j+12)%16+1)), -(32-((j+13)%16+1)))
  88   s += sprintf("  qq%d = veorq_u32(vaddq_u32(vaddq_u32(qm0, qm1),vsubq_u32(qk, qm2)), *((uint32x4_t*)&(h[%2d])));\n",
  89                   i, (j+7)%16)
  90   s += sprintf("  qk = vaddq_u32(qk, qkadd);\n");
  91   return s
  92 end
  93
  94 =begin
  95   r += S32_1(q[j+ 0]) + S32_2(q[j+ 1]) + S32_3(q[j+ 2]) + S32_0(q[j+ 3]) +
  96      S32_1(q[j+ 4]) + S32_2(q[j+ 5]) + S32_3(q[j+ 6]) + S32_0(q[j+ 7]) +
  97      S32_1(q[j+ 8]) + S32_2(q[j+ 9]) + S32_3(q[j+10]) + S32_0(q[j+11]) +
  98      S32_1(q[j+12]) + S32_2(q[j+13]) + S32_3(q[j+14]) + S32_0(q[j+15]);
  99
 100 #define S32_0(x) ( (SHR32((x),   1)) ^ \
 101                  (SHL32((x),   3)) ^ \
 102                  (ROTL32((x),  4)) ^ \
 103                  (ROTR32((x), 13)) )
 104
 105 #define S32_1(x) ( (SHR32((x),   1)) ^ \
 106                  (SHL32((x),   2)) ^ \
 107                  (ROTL32((x),  8)) ^ \
 108                  (ROTR32((x),  9)) )
 109
 110 #define S32_2(x) ( (SHR32((x),   2)) ^ \
 111                  (SHL32((x),   1)) ^ \
 112                  (ROTL32((x), 12)) ^ \
 113                  (ROTR32((x),  7)) )
 114
 115 #define S32_3(x) ( (SHR32((x),   2)) ^ \
 116                  (SHL32((x),   2)) ^ \
 117                  (ROTL32((x), 15)) ^ \
 118                  (ROTR32((x),  3)) )
 119 =end
 120
 121 def gen_expand_1(i)
 122   s  = sprintf("\n  /* expand1(%2d) */\n", i)
 123   s += sprintf("  qm0 = *((uint32x4_t*)&(q[%2d]));\n", i);
 124   s += sprintf("  qm1 = veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ -1, -2, -2, -1}),\n" \
 125                "                            vshlq_u32(qm0,(int32x4_t){  2,  1,  2,  3})),\n" \
 126                "                  veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){  8, 12, 15,  4}),\n" \
 127                "                                      vshlq_u32(qm0,(int32x4_t){-24,-20,-17,-28})),\n" \
 128                "                            veorq_u32(vshlq_u32(qm0,(int32x4_t){ 23, 25, 29, 19}), \n" \
 129                "                                      vshlq_u32(qm0,(int32x4_t){ -9, -7, -3,-13}))));\n")
 130   s += sprintf("  qm0 = *((uint32x4_t*)&(q[%2d]));\n", i+4);
 131   s += sprintf("  qm2 = veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ -1, -2, -2, -1}),\n" \
 132                "                            vshlq_u32(qm0,(int32x4_t){  2,  1,  2,  3})),\n" \
 133                "                  veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){  8, 12, 15,  4}),\n" \
 134                "                                      vshlq_u32(qm0,(int32x4_t){-24,-20,-17,-28})),\n" \
 135                "                            veorq_u32(vshlq_u32(qm0,(int32x4_t){ 23, 25, 29, 19}),\n" \
 136                "                                      vshlq_u32(qm0,(int32x4_t){ -9, -7, -3,-13}))));\n")
 137   s += sprintf("  qm2 = vaddq_u32(qm2, qm1);\n")
 138   s += sprintf("  qm0 = *((uint32x4_t*)&(q[%2d]));\n", i+8);
 139   s += sprintf("  qm1 = veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ -1, -2, -2, -1}),\n" \
 140                "                            vshlq_u32(qm0,(int32x4_t){  2,  1,  2,  3})),\n" \
 141                "                  veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){  8, 12, 15,  4}),\n" \
 142                "                                      vshlq_u32(qm0,(int32x4_t){-24,-20,-17,-28})),\n" \
 143                "                            veorq_u32(vshlq_u32(qm0,(int32x4_t){ 23, 25, 29, 19}),\n" \
 144                "                                      vshlq_u32(qm0,(int32x4_t){ -9, -7, -3,-13}))));\n")
 145   s += sprintf("  qm2 = vaddq_u32(qm2, qm1);\n")
 146   s += sprintf("  qm0 = *((uint32x4_t*)&(q[%2d]));\n", i+12);
 147   s += sprintf("  qm1 = veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ -1, -2, -2, -1}),\n" \
 148                "                            vshlq_u32(qm0,(int32x4_t){  2,  1,  2,  3})),\n" \
 149                "                  veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){  8, 12, 15,  4}),\n" \
 150                "                                      vshlq_u32(qm0,(int32x4_t){-24,-20,-17,-28})),\n" \
 151                "                            veorq_u32(vshlq_u32(qm0,(int32x4_t){ 23, 25, 29, 19}),\n" \
 152                "                                      vshlq_u32(qm0,(int32x4_t){ -9, -7, -3,-13}))));\n")
 153   s += sprintf("  qm2 = vaddq_u32(qm2, qm1);\n")
 154   s += sprintf("  dtmp0 = vadd_u32(vget_high_u32(qm2), vget_low_u32(qm2));\n")
 155   s += sprintf("  q[%2d] = vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1) + vgetq_lane_u32(qq%d, %2d);\n",
 156                  i+16, (i+16)&0xFC, i&3)
 157   return s
 158 end
 159
 160 =begin
 161      + ( even =  q[ 2] + q[ 4] + q[ 6]
 162                + q[ 8] + q[10] + q[12] + q[14] )
 163      + R32_1(q[ 3]) + R32_2(q[ 5]) + R32_3(q[ 7])
 164      + R32_4(q[ 9]) + R32_5(q[11]) + R32_6(q[13])
 165      + R32_7(q[15]) + S32_4(q[16]) + S32_5(q[17]);
 166      + ( odd  =  q[ 3] + q[ 5] + q[ 7]
 167                + q[ 9] + q[11] + q[13] + q[15] )
 168      + R32_1(q[ 4]) + R32_2(q[ 6]) + R32_3(q[ 8])
 169      + R32_4(q[10]) + R32_5(q[12]) + R32_6(q[14])
 170      + R32_7(q[16]) + S32_4(q[17]) + S32_5(q[18]);
 171
 172 #define S32_4(x) ( (SHR32((x),   1)) ^ (x))
 173 #define S32_5(x) ( (SHR32((x),   2)) ^ (x))
 174
 175 #define R32_1(x)   (ROTL32((x),  3))
 176 #define R32_2(x)   (ROTL32((x),  7))
 177 #define R32_3(x)   (ROTL32((x), 13))
 178 #define R32_4(x)   (ROTL32((x), 16))
 179 #define R32_5(x)   (ROTR32((x), 13))
 180 #define R32_6(x)   (ROTR32((x),  9))
 181 #define R32_7(x)   (ROTR32((x),  5))
 182 =end
 183
 184 def gen_expand_2(i, start)
 185   s  = sprintf("\n  /* expand2(%2d) */\n", i)
 186   s += sprintf("  q2tmp0 = vld2q_u32(&q[%2d]);\n", i)
 187   s += sprintf("  q2tmp1 = vld2q_u32(&q[%2d]);\n", i+8)
 188   if i-1<=start
 189     s += sprintf("  q2tmp1.val[0] = vsetq_lane_u32(0, q2tmp1.val[0], 3);\n")
 190     s += sprintf("  q2tmp0.val[0] = vaddq_u32(q2tmp0.val[0], q2tmp1.val[0]);\n")
 191     s += sprintf("  dtmp0 = vadd_u32(vget_high_u32(q2tmp0.val[0]), vget_low_u32(q2tmp0.val[0]));\n")
 192     s += sprintf("  %s = vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);\n",(i%2==0)?"even":"odd ")
 193   else
 194     s += sprintf("  %s += q[%2d] - q[%2d];\n",(i%2==0)?"even":"odd ", i+12, i-2)
 195   end
 196   s += sprintf("  q[%2d] = %s + ((q[%2d]>>1)|q[%2d]);\n", i+16, (i%2==0)?"even":"odd ", i+14, i+14)
 197   s += sprintf("  qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){  3,  7, 13, 16}),\n" \
 198                "                  vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));\n")
 199   s += sprintf("  qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27,  0}),\n" \
 200                "                  vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));\n")
 201   s += sprintf("  qm1 = vaddq_u32(qm1, qm0);\n")
 202   s += sprintf("  dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));\n")
 203   s += sprintf("  q[%2d] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);\n", i+16)
 204
 205   return s
 206 end
 207
 208
 209 puts header
 210 [16,20,24,28].each {|x| puts gen_addElement(x)}
 211 (0..1).each  {|x| puts gen_expand_1(x)}
 212 (2..15).each {|x| puts gen_expand_2(x, 2)}
 213 puts footer