/* BEGIN of automatically generated code */
/*
    This file is part of the ARM-Crypto-Lib.
    Copyright (C) 2006-2010 Daniel Otte (daniel.otte@rub.de)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
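
/*
 * bmw_small_f1 -- message expansion (f1) of BMW-224/256 ("BMW small"), written
 * with ARM NEON intrinsics. It derives the expanded quad-pipe words q[16..31]
 * from the first sixteen words q[0..15], the message block m and the chaining
 * value h: q[16] and q[17] via expand_1, q[18..31] via expand_2.
 * Note that m is read up to m[25] and h up to h[18], i.e. the caller is
 * expected to pass buffers in which the 16-word blocks wrap around (are
 * replicated past index 15).
 *
 * The two includes below are only needed when this generated fragment is
 * compiled on its own; the surrounding file normally provides them already.
 */
#include <stdint.h>
#include <arm_neon.h>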
void bmw_small_f1(uint32_t* q, const uint32_t* m, const uint32_t* h){
    uint32x4_t qq16, qq20, qq24, qq28;
    uint32x4_t qm0, qm1, qm2;
    uint32x4_t qk={0x55555550UL, 0x5aaaaaa5UL, 0x5ffffffaUL, 0x6555554fUL};
    uint32x4_t qkadd={0x15555554UL, 0x15555554UL, 0x15555554UL, 0x15555554UL};
    uint32x2_t dtmp0;
    uint32x4x2_t q2tmp0, q2tmp1;
    uint32_t even, odd;
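
    /*
     * The next four blocks precompute AddElement(j) for j = 0..15, four
     * values per NEON register:
     *   AddElement(j) = (r(j) + r(j+3) - r(j+10) + (j+16)*0x05555555) ^ h[(j+7) mod 16]
     *   with r(i) = rotl32(m[i mod 16], (i mod 16) + 1)
     * (indices are taken mod 16 in the specification; here m and h are simply
     * over-read, see the note above). Each rotation is built from two
     * vshlq_u32 calls, one with a positive and one with a negative shift
     * count, XORed together; qk holds the per-lane constants (j+16)*0x05555555
     * and is stepped by qkadd = 4*0x05555555 between blocks.
     */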
    /* addElement for q16 .. q19 */
    qm0 = *((uint32x4_t*)&(m[ 0]));
    qm1 = *((uint32x4_t*)&(m[ 3]));
    qm2 = *((uint32x4_t*)&(m[10]));
    qm0 = veorq_u32(vshlq_u32(qm0,(int32x4_t){ 1, 2, 3, 4}),vshlq_u32(qm0,(int32x4_t){-31, -30, -29, -28}));
    qm1 = veorq_u32(vshlq_u32(qm1,(int32x4_t){ 4, 5, 6, 7}),vshlq_u32(qm1,(int32x4_t){-28, -27, -26, -25}));
    qm2 = veorq_u32(vshlq_u32(qm2,(int32x4_t){11, 12, 13, 14}),vshlq_u32(qm2,(int32x4_t){-21, -20, -19, -18}));
    qq16 = veorq_u32(vaddq_u32(vaddq_u32(qm0, qm1),vsubq_u32(qk, qm2)), *((uint32x4_t*)&(h[ 7])));
    qk = vaddq_u32(qk, qkadd);

    /* addElement for q20 .. q23 */
    qm0 = *((uint32x4_t*)&(m[ 4]));
    qm1 = *((uint32x4_t*)&(m[ 7]));
    qm2 = *((uint32x4_t*)&(m[14]));
    qm0 = veorq_u32(vshlq_u32(qm0,(int32x4_t){ 5, 6, 7, 8}),vshlq_u32(qm0,(int32x4_t){-27, -26, -25, -24}));
    qm1 = veorq_u32(vshlq_u32(qm1,(int32x4_t){ 8, 9, 10, 11}),vshlq_u32(qm1,(int32x4_t){-24, -23, -22, -21}));
    qm2 = veorq_u32(vshlq_u32(qm2,(int32x4_t){15, 16, 1, 2}),vshlq_u32(qm2,(int32x4_t){-17, -16, -31, -30}));
    qq20 = veorq_u32(vaddq_u32(vaddq_u32(qm0, qm1),vsubq_u32(qk, qm2)), *((uint32x4_t*)&(h[11])));
    qk = vaddq_u32(qk, qkadd);

    /* addElement for q24 .. q27 */
    qm0 = *((uint32x4_t*)&(m[ 8]));
    qm1 = *((uint32x4_t*)&(m[11]));
    qm2 = *((uint32x4_t*)&(m[18]));
    qm0 = veorq_u32(vshlq_u32(qm0,(int32x4_t){ 9, 10, 11, 12}),vshlq_u32(qm0,(int32x4_t){-23, -22, -21, -20}));
    qm1 = veorq_u32(vshlq_u32(qm1,(int32x4_t){12, 13, 14, 15}),vshlq_u32(qm1,(int32x4_t){-20, -19, -18, -17}));
    qm2 = veorq_u32(vshlq_u32(qm2,(int32x4_t){ 3, 4, 5, 6}),vshlq_u32(qm2,(int32x4_t){-29, -28, -27, -26}));
    qq24 = veorq_u32(vaddq_u32(vaddq_u32(qm0, qm1),vsubq_u32(qk, qm2)), *((uint32x4_t*)&(h[15])));
    qk = vaddq_u32(qk, qkadd);

    /* addElement for q28 .. q31 */
    qm0 = *((uint32x4_t*)&(m[12]));
    qm1 = *((uint32x4_t*)&(m[15]));
    qm2 = *((uint32x4_t*)&(m[22]));
    qm0 = veorq_u32(vshlq_u32(qm0,(int32x4_t){13, 14, 15, 16}),vshlq_u32(qm0,(int32x4_t){-19, -18, -17, -16}));
    qm1 = veorq_u32(vshlq_u32(qm1,(int32x4_t){16, 1, 2, 3}),vshlq_u32(qm1,(int32x4_t){-16, -31, -30, -29}));
    qm2 = veorq_u32(vshlq_u32(qm2,(int32x4_t){ 7, 8, 9, 10}),vshlq_u32(qm2,(int32x4_t){-25, -24, -23, -22}));
    qq28 = veorq_u32(vaddq_u32(vaddq_u32(qm0, qm1),vsubq_u32(qk, qm2)), *((uint32x4_t*)&(h[ 3])));
    qk = vaddq_u32(qk, qkadd);
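
    /*
     * expand_1 for q[16] and q[17]: the sixteen preceding quad-pipe words are
     * run through the bijections s1, s2, s3, s0 (cyclically, one s-function
     * per NEON lane), the four partial vectors are accumulated, summed
     * horizontally, and the matching AddElement value (a lane of qq16) is
     * added. Each s-function is the XOR of two plain shifts and two
     * rotations, hence the six vshlq_u32 results combined per input vector.
     */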
    qm0 = *((uint32x4_t*)&(q[ 0]));
    qm1 = veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ -1, -2, -2, -1}),
                              vshlq_u32(qm0,(int32x4_t){ 2, 1, 2, 3})),
                    veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ 8, 12, 15, 4}),
                                        vshlq_u32(qm0,(int32x4_t){-24,-20,-17,-28})),
                              veorq_u32(vshlq_u32(qm0,(int32x4_t){ 23, 25, 29, 19}),
                                        vshlq_u32(qm0,(int32x4_t){ -9, -7, -3,-13}))));
    qm0 = *((uint32x4_t*)&(q[ 4]));
    qm2 = veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ -1, -2, -2, -1}),
                              vshlq_u32(qm0,(int32x4_t){ 2, 1, 2, 3})),
                    veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ 8, 12, 15, 4}),
                                        vshlq_u32(qm0,(int32x4_t){-24,-20,-17,-28})),
                              veorq_u32(vshlq_u32(qm0,(int32x4_t){ 23, 25, 29, 19}),
                                        vshlq_u32(qm0,(int32x4_t){ -9, -7, -3,-13}))));
    qm2 = vaddq_u32(qm2, qm1);
    qm0 = *((uint32x4_t*)&(q[ 8]));
    qm1 = veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ -1, -2, -2, -1}),
                              vshlq_u32(qm0,(int32x4_t){ 2, 1, 2, 3})),
                    veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ 8, 12, 15, 4}),
                                        vshlq_u32(qm0,(int32x4_t){-24,-20,-17,-28})),
                              veorq_u32(vshlq_u32(qm0,(int32x4_t){ 23, 25, 29, 19}),
                                        vshlq_u32(qm0,(int32x4_t){ -9, -7, -3,-13}))));
    qm2 = vaddq_u32(qm2, qm1);
    qm0 = *((uint32x4_t*)&(q[12]));
    qm1 = veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ -1, -2, -2, -1}),
                              vshlq_u32(qm0,(int32x4_t){ 2, 1, 2, 3})),
                    veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ 8, 12, 15, 4}),
                                        vshlq_u32(qm0,(int32x4_t){-24,-20,-17,-28})),
                              veorq_u32(vshlq_u32(qm0,(int32x4_t){ 23, 25, 29, 19}),
                                        vshlq_u32(qm0,(int32x4_t){ -9, -7, -3,-13}))));
    qm2 = vaddq_u32(qm2, qm1);
    dtmp0 = vadd_u32(vget_high_u32(qm2), vget_low_u32(qm2));
    q[16] = vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1) + vgetq_lane_u32(qq16, 0);
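
    /* q[17]: the same expand_1 pattern, shifted by one word (q[1..16]). */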
    qm0 = *((uint32x4_t*)&(q[ 1]));
    qm1 = veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ -1, -2, -2, -1}),
                              vshlq_u32(qm0,(int32x4_t){ 2, 1, 2, 3})),
                    veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ 8, 12, 15, 4}),
                                        vshlq_u32(qm0,(int32x4_t){-24,-20,-17,-28})),
                              veorq_u32(vshlq_u32(qm0,(int32x4_t){ 23, 25, 29, 19}),
                                        vshlq_u32(qm0,(int32x4_t){ -9, -7, -3,-13}))));
    qm0 = *((uint32x4_t*)&(q[ 5]));
    qm2 = veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ -1, -2, -2, -1}),
                              vshlq_u32(qm0,(int32x4_t){ 2, 1, 2, 3})),
                    veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ 8, 12, 15, 4}),
                                        vshlq_u32(qm0,(int32x4_t){-24,-20,-17,-28})),
                              veorq_u32(vshlq_u32(qm0,(int32x4_t){ 23, 25, 29, 19}),
                                        vshlq_u32(qm0,(int32x4_t){ -9, -7, -3,-13}))));
    qm2 = vaddq_u32(qm2, qm1);
    qm0 = *((uint32x4_t*)&(q[ 9]));
    qm1 = veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ -1, -2, -2, -1}),
                              vshlq_u32(qm0,(int32x4_t){ 2, 1, 2, 3})),
                    veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ 8, 12, 15, 4}),
                                        vshlq_u32(qm0,(int32x4_t){-24,-20,-17,-28})),
                              veorq_u32(vshlq_u32(qm0,(int32x4_t){ 23, 25, 29, 19}),
                                        vshlq_u32(qm0,(int32x4_t){ -9, -7, -3,-13}))));
    qm2 = vaddq_u32(qm2, qm1);
    qm0 = *((uint32x4_t*)&(q[13]));
    qm1 = veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ -1, -2, -2, -1}),
                              vshlq_u32(qm0,(int32x4_t){ 2, 1, 2, 3})),
                    veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ 8, 12, 15, 4}),
                                        vshlq_u32(qm0,(int32x4_t){-24,-20,-17,-28})),
                              veorq_u32(vshlq_u32(qm0,(int32x4_t){ 23, 25, 29, 19}),
                                        vshlq_u32(qm0,(int32x4_t){ -9, -7, -3,-13}))));
    qm2 = vaddq_u32(qm2, qm1);
    dtmp0 = vadd_u32(vget_high_u32(qm2), vget_low_u32(qm2));
    q[17] = vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1) + vgetq_lane_u32(qq16, 1);
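
    /*
     * expand_2 for q[18] .. q[31]: of the sixteen preceding q words, the
     * even-positioned ones are added as they are, the odd-positioned ones are
     * rotated by r1..r7 = 3, 7, 13, 16, 19, 23, 27, and the two newest words
     * enter as s4(x) = (x >> 1) ^ x (scalar, below) and s5(x) = (x >> 2) ^ x
     * (last vector lane, shift pair {0, -2}). The precomputed AddElement
     * lanes (qq16 lanes 2..3, then qq20, qq24, qq28) contribute one word
     * each. vld2q_u32 de-interleaves eight consecutive q words into an
     * "even" and an "odd" register per load.
     */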
    q2tmp0 = vld2q_u32(&q[ 2]);
    q2tmp1 = vld2q_u32(&q[10]);
    q2tmp1.val[0] = vsetq_lane_u32(0, q2tmp1.val[0], 3);
    q2tmp0.val[0] = vaddq_u32(q2tmp0.val[0], q2tmp1.val[0]);
    dtmp0 = vadd_u32(vget_high_u32(q2tmp0.val[0]), vget_low_u32(q2tmp0.val[0]));
    even = vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);
    q[18] = even + ((q[16]>>1)^q[16]) + vgetq_lane_u32(qq16, 2);
    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){ 3, 7, 13, 16}),
                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27, 0}),
                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
    qm1 = vaddq_u32(qm1, qm0);
    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
    q[18] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);

    q2tmp0 = vld2q_u32(&q[ 3]);
    q2tmp1 = vld2q_u32(&q[11]);
    q2tmp1.val[0] = vsetq_lane_u32(0, q2tmp1.val[0], 3);
    q2tmp0.val[0] = vaddq_u32(q2tmp0.val[0], q2tmp1.val[0]);
    dtmp0 = vadd_u32(vget_high_u32(q2tmp0.val[0]), vget_low_u32(q2tmp0.val[0]));
    odd = vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);
    q[19] = odd + ((q[17]>>1)^q[17]) + vgetq_lane_u32(qq16, 3);
    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){ 3, 7, 13, 16}),
                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27, 0}),
                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
    qm1 = vaddq_u32(qm1, qm0);
    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
    q[19] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);
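
    /*
     * From q[20] on, the running sums of the even- and odd-positioned words
     * are maintained incrementally: sliding the 16-word window by two drops
     * its oldest plain term and gains one new one (e.g. even += q[16] - q[2]
     * for q[20]), so only the rotated terms still need a vector pass.
     */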
    q2tmp0 = vld2q_u32(&q[ 4]);
    q2tmp1 = vld2q_u32(&q[12]);
    even += q[16] - q[ 2];
    q[20] = even + ((q[18]>>1)^q[18]) + vgetq_lane_u32(qq20, 0);
    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){ 3, 7, 13, 16}),
                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27, 0}),
                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
    qm1 = vaddq_u32(qm1, qm0);
    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
    q[20] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);

    q2tmp0 = vld2q_u32(&q[ 5]);
    q2tmp1 = vld2q_u32(&q[13]);
    odd += q[17] - q[ 3];
    q[21] = odd + ((q[19]>>1)^q[19]) + vgetq_lane_u32(qq20, 1);
    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){ 3, 7, 13, 16}),
                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27, 0}),
                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
    qm1 = vaddq_u32(qm1, qm0);
    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
    q[21] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);

    q2tmp0 = vld2q_u32(&q[ 6]);
    q2tmp1 = vld2q_u32(&q[14]);
    even += q[18] - q[ 4];
    q[22] = even + ((q[20]>>1)^q[20]) + vgetq_lane_u32(qq20, 2);
    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){ 3, 7, 13, 16}),
                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27, 0}),
                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
    qm1 = vaddq_u32(qm1, qm0);
    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
    q[22] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);

    q2tmp0 = vld2q_u32(&q[ 7]);
    q2tmp1 = vld2q_u32(&q[15]);
    odd += q[19] - q[ 5];
    q[23] = odd + ((q[21]>>1)^q[21]) + vgetq_lane_u32(qq20, 3);
    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){ 3, 7, 13, 16}),
                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27, 0}),
                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
    qm1 = vaddq_u32(qm1, qm0);
    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
    q[23] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);

    q2tmp0 = vld2q_u32(&q[ 8]);
    q2tmp1 = vld2q_u32(&q[16]);
    even += q[20] - q[ 6];
    q[24] = even + ((q[22]>>1)^q[22]) + vgetq_lane_u32(qq24, 0);
    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){ 3, 7, 13, 16}),
                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27, 0}),
                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
    qm1 = vaddq_u32(qm1, qm0);
    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
    q[24] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);

    q2tmp0 = vld2q_u32(&q[ 9]);
    q2tmp1 = vld2q_u32(&q[17]);
    odd += q[21] - q[ 7];
    q[25] = odd + ((q[23]>>1)^q[23]) + vgetq_lane_u32(qq24, 1);
    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){ 3, 7, 13, 16}),
                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27, 0}),
                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
    qm1 = vaddq_u32(qm1, qm0);
    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
    q[25] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);

    q2tmp0 = vld2q_u32(&q[10]);
    q2tmp1 = vld2q_u32(&q[18]);
    even += q[22] - q[ 8];
    q[26] = even + ((q[24]>>1)^q[24]) + vgetq_lane_u32(qq24, 2);
    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){ 3, 7, 13, 16}),
                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27, 0}),
                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
    qm1 = vaddq_u32(qm1, qm0);
    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
    q[26] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);

    q2tmp0 = vld2q_u32(&q[11]);
    q2tmp1 = vld2q_u32(&q[19]);
    odd += q[23] - q[ 9];
    q[27] = odd + ((q[25]>>1)^q[25]) + vgetq_lane_u32(qq24, 3);
    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){ 3, 7, 13, 16}),
                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27, 0}),
                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
    qm1 = vaddq_u32(qm1, qm0);
    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
    q[27] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);

    q2tmp0 = vld2q_u32(&q[12]);
    q2tmp1 = vld2q_u32(&q[20]);
    even += q[24] - q[10];
    q[28] = even + ((q[26]>>1)^q[26]) + vgetq_lane_u32(qq28, 0);
    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){ 3, 7, 13, 16}),
                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27, 0}),
                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
    qm1 = vaddq_u32(qm1, qm0);
    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
    q[28] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);

    q2tmp0 = vld2q_u32(&q[13]);
    q2tmp1 = vld2q_u32(&q[21]);
    odd += q[25] - q[11];
    q[29] = odd + ((q[27]>>1)^q[27]) + vgetq_lane_u32(qq28, 1);
    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){ 3, 7, 13, 16}),
                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27, 0}),
                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
    qm1 = vaddq_u32(qm1, qm0);
    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
    q[29] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);

    q2tmp0 = vld2q_u32(&q[14]);
    q2tmp1 = vld2q_u32(&q[22]);
    even += q[26] - q[12];
    q[30] = even + ((q[28]>>1)^q[28]) + vgetq_lane_u32(qq28, 2);
    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){ 3, 7, 13, 16}),
                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27, 0}),
                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
    qm1 = vaddq_u32(qm1, qm0);
    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
    q[30] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);

    q2tmp0 = vld2q_u32(&q[15]);
    q2tmp1 = vld2q_u32(&q[23]);
    odd += q[27] - q[13];
    q[31] = odd + ((q[29]>>1)^q[29]) + vgetq_lane_u32(qq28, 3);
    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){ 3, 7, 13, 16}),
                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27, 0}),
                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
    qm1 = vaddq_u32(qm1, qm0);
    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
    q[31] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);
}

/* END of automatically generated code */