X-Git-Url: https://git.cryptolib.org/?a=blobdiff_plain;ds=sidebyside;f=bmw%2Ff1_autogen_neon_small.i;fp=bmw%2Ff1_autogen_neon_small.i;h=b309e7e4f42e420a07e2d6b55496b4a744684209;hb=3a80fbe29e33b818ccebbaba7f8bbe48c5ccd173;hp=0000000000000000000000000000000000000000;hpb=2a4779378a7bf4322a0e6b2024284092135e8a3d;p=arm-crypto-lib.git

diff --git a/bmw/f1_autogen_neon_small.i b/bmw/f1_autogen_neon_small.i
new file mode 100644
index 0000000..b309e7e
--- /dev/null
+++ b/bmw/f1_autogen_neon_small.i
@@ -0,0 +1,330 @@
+/* BEGIN of automatic generated code */
+/*
+    This file is part of the ARM-Crypto-Lib.
+    Copyright (C) 2006-2010 Daniel Otte (daniel.otte@rub.de)
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+static inline
+void bmw_small_f1(uint32_t* q, const uint32_t* m, const uint32_t* h){
+    uint32_t even, odd;
+    uint32x4_t qq16, qq20, qq24, qq28;
+    uint32x4_t qm0, qm1, qm2;
+    uint32x4_t qk={0x55555550UL, 0x5aaaaaa5UL, 0x5ffffffaUL, 0x6555554fUL};
+    uint32x4_t qkadd={0x15555554UL, 0x15555554UL, 0x15555554UL, 0x15555554UL};
+    uint32x2_t dtmp0;
+    uint32x4x2_t q2tmp0, q2tmp1;
+
+    /* addElement for q16 .. q19 */
+    qm0 = *((uint32x4_t*)&(m[ 0]));
+    qm1 = *((uint32x4_t*)&(m[ 3]));
+    qm2 = *((uint32x4_t*)&(m[10]));
+    qm0 = veorq_u32(vshlq_u32(qm0,(int32x4_t){ 1, 2, 3, 4}),vshlq_u32(qm0,(int32x4_t){-31, -30, -29, -28}));
+    qm1 = veorq_u32(vshlq_u32(qm1,(int32x4_t){ 4, 5, 6, 7}),vshlq_u32(qm1,(int32x4_t){-28, -27, -26, -25}));
+    qm2 = veorq_u32(vshlq_u32(qm2,(int32x4_t){11, 12, 13, 14}),vshlq_u32(qm2,(int32x4_t){-21, -20, -19, -18}));
+    qq16 = veorq_u32(vaddq_u32(vaddq_u32(qm0, qm1),vsubq_u32(qk, qm2)), *((uint32x4_t*)&(h[ 7])));
+    qk = vaddq_u32(qk, qkadd);
+
+    /* addElement for q20 .. q23 */
+    qm0 = *((uint32x4_t*)&(m[ 4]));
+    qm1 = *((uint32x4_t*)&(m[ 7]));
+    qm2 = *((uint32x4_t*)&(m[14]));
+    qm0 = veorq_u32(vshlq_u32(qm0,(int32x4_t){ 5, 6, 7, 8}),vshlq_u32(qm0,(int32x4_t){-27, -26, -25, -24}));
+    qm1 = veorq_u32(vshlq_u32(qm1,(int32x4_t){ 8, 9, 10, 11}),vshlq_u32(qm1,(int32x4_t){-24, -23, -22, -21}));
+    qm2 = veorq_u32(vshlq_u32(qm2,(int32x4_t){15, 16, 1, 2}),vshlq_u32(qm2,(int32x4_t){-17, -16, -31, -30}));
+    qq20 = veorq_u32(vaddq_u32(vaddq_u32(qm0, qm1),vsubq_u32(qk, qm2)), *((uint32x4_t*)&(h[11])));
+    qk = vaddq_u32(qk, qkadd);
+
+    /* addElement for q24 .. q27 */
+    qm0 = *((uint32x4_t*)&(m[ 8]));
+    qm1 = *((uint32x4_t*)&(m[11]));
+    qm2 = *((uint32x4_t*)&(m[18]));
+    qm0 = veorq_u32(vshlq_u32(qm0,(int32x4_t){ 9, 10, 11, 12}),vshlq_u32(qm0,(int32x4_t){-23, -22, -21, -20}));
+    qm1 = veorq_u32(vshlq_u32(qm1,(int32x4_t){12, 13, 14, 15}),vshlq_u32(qm1,(int32x4_t){-20, -19, -18, -17}));
+    qm2 = veorq_u32(vshlq_u32(qm2,(int32x4_t){ 3, 4, 5, 6}),vshlq_u32(qm2,(int32x4_t){-29, -28, -27, -26}));
+    qq24 = veorq_u32(vaddq_u32(vaddq_u32(qm0, qm1),vsubq_u32(qk, qm2)), *((uint32x4_t*)&(h[15])));
+    qk = vaddq_u32(qk, qkadd);
+
+    /* addElement for q28 .. q31 */
+    qm0 = *((uint32x4_t*)&(m[12]));
+    qm1 = *((uint32x4_t*)&(m[15]));
+    qm2 = *((uint32x4_t*)&(m[22]));
+    qm0 = veorq_u32(vshlq_u32(qm0,(int32x4_t){13, 14, 15, 16}),vshlq_u32(qm0,(int32x4_t){-19, -18, -17, -16}));
+    qm1 = veorq_u32(vshlq_u32(qm1,(int32x4_t){16, 1, 2, 3}),vshlq_u32(qm1,(int32x4_t){-16, -31, -30, -29}));
+    qm2 = veorq_u32(vshlq_u32(qm2,(int32x4_t){ 7, 8, 9, 10}),vshlq_u32(qm2,(int32x4_t){-25, -24, -23, -22}));
+    qq28 = veorq_u32(vaddq_u32(vaddq_u32(qm0, qm1),vsubq_u32(qk, qm2)), *((uint32x4_t*)&(h[ 3])));
+    qk = vaddq_u32(qk, qkadd);
+
+    /* expand1( 0) */
+    qm0 = *((uint32x4_t*)&(q[ 0]));
+    qm1 = veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ -1, -2, -2, -1}),
+                              vshlq_u32(qm0,(int32x4_t){  2,  1,  2,  3})),
+                    veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){  8, 12, 15,  4}),
+                                        vshlq_u32(qm0,(int32x4_t){-24,-20,-17,-28})),
+                              veorq_u32(vshlq_u32(qm0,(int32x4_t){ 23, 25, 29, 19}),
+                                        vshlq_u32(qm0,(int32x4_t){ -9, -7, -3,-13}))));
+    qm0 = *((uint32x4_t*)&(q[ 4]));
+    qm2 = veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ -1, -2, -2, -1}),
+                              vshlq_u32(qm0,(int32x4_t){  2,  1,  2,  3})),
+                    veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){  8, 12, 15,  4}),
+                                        vshlq_u32(qm0,(int32x4_t){-24,-20,-17,-28})),
+                              veorq_u32(vshlq_u32(qm0,(int32x4_t){ 23, 25, 29, 19}),
+                                        vshlq_u32(qm0,(int32x4_t){ -9, -7, -3,-13}))));
+    qm2 = vaddq_u32(qm2, qm1);
+    qm0 = *((uint32x4_t*)&(q[ 8]));
+    qm1 = veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ -1, -2, -2, -1}),
+                              vshlq_u32(qm0,(int32x4_t){  2,  1,  2,  3})),
+                    veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){  8, 12, 15,  4}),
+                                        vshlq_u32(qm0,(int32x4_t){-24,-20,-17,-28})),
+                              veorq_u32(vshlq_u32(qm0,(int32x4_t){ 23, 25, 29, 19}),
+                                        vshlq_u32(qm0,(int32x4_t){ -9, -7, -3,-13}))));
+    qm2 = vaddq_u32(qm2, qm1);
+    qm0 = *((uint32x4_t*)&(q[12]));
+    qm1 = veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ -1, -2, -2, -1}),
+                              vshlq_u32(qm0,(int32x4_t){  2,  1,  2,  3})),
+                    veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){  8, 12, 15,  4}),
+                                        vshlq_u32(qm0,(int32x4_t){-24,-20,-17,-28})),
+                              veorq_u32(vshlq_u32(qm0,(int32x4_t){ 23, 25, 29, 19}),
+                                        vshlq_u32(qm0,(int32x4_t){ -9, -7, -3,-13}))));
+    qm2 = vaddq_u32(qm2, qm1);
+    dtmp0 = vadd_u32(vget_high_u32(qm2), vget_low_u32(qm2));
+    q[16] = vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1) + vgetq_lane_u32(qq16, 0);
+
+    /* expand1( 1) */
+    qm0 = *((uint32x4_t*)&(q[ 1]));
+    qm1 = veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ -1, -2, -2, -1}),
+                              vshlq_u32(qm0,(int32x4_t){  2,  1,  2,  3})),
+                    veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){  8, 12, 15,  4}),
+                                        vshlq_u32(qm0,(int32x4_t){-24,-20,-17,-28})),
+                              veorq_u32(vshlq_u32(qm0,(int32x4_t){ 23, 25, 29, 19}),
+                                        vshlq_u32(qm0,(int32x4_t){ -9, -7, -3,-13}))));
+    qm0 = *((uint32x4_t*)&(q[ 5]));
+    qm2 = veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ -1, -2, -2, -1}),
+                              vshlq_u32(qm0,(int32x4_t){  2,  1,  2,  3})),
+                    veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){  8, 12, 15,  4}),
+                                        vshlq_u32(qm0,(int32x4_t){-24,-20,-17,-28})),
+                              veorq_u32(vshlq_u32(qm0,(int32x4_t){ 23, 25, 29, 19}),
+                                        vshlq_u32(qm0,(int32x4_t){ -9, -7, -3,-13}))));
+    qm2 = vaddq_u32(qm2, qm1);
+    qm0 = *((uint32x4_t*)&(q[ 9]));
+    qm1 = veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ -1, -2, -2, -1}),
+                              vshlq_u32(qm0,(int32x4_t){  2,  1,  2,  3})),
+                    veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){  8, 12, 15,  4}),
+                                        vshlq_u32(qm0,(int32x4_t){-24,-20,-17,-28})),
+                              veorq_u32(vshlq_u32(qm0,(int32x4_t){ 23, 25, 29, 19}),
+                                        vshlq_u32(qm0,(int32x4_t){ -9, -7, -3,-13}))));
+    qm2 = vaddq_u32(qm2, qm1);
+    qm0 = *((uint32x4_t*)&(q[13]));
+    qm1 = veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){ -1, -2, -2, -1}),
+                              vshlq_u32(qm0,(int32x4_t){  2,  1,  2,  3})),
+                    veorq_u32(veorq_u32(vshlq_u32(qm0,(int32x4_t){  8, 12, 15,  4}),
+                                        vshlq_u32(qm0,(int32x4_t){-24,-20,-17,-28})),
+                              veorq_u32(vshlq_u32(qm0,(int32x4_t){ 23, 25, 29, 19}),
+                                        vshlq_u32(qm0,(int32x4_t){ -9, -7, -3,-13}))));
+    qm2 = vaddq_u32(qm2, qm1);
+    dtmp0 = vadd_u32(vget_high_u32(qm2), vget_low_u32(qm2));
+    q[17] = vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1) + vgetq_lane_u32(qq16, 1);
+
+    /* expand2( 2) */
+    q2tmp0 = vld2q_u32(&q[ 2]);
+    q2tmp1 = vld2q_u32(&q[10]);
+    q2tmp1.val[0] = vsetq_lane_u32(0, q2tmp1.val[0], 3);
+    q2tmp0.val[0] = vaddq_u32(q2tmp0.val[0], q2tmp1.val[0]);
+    dtmp0 = vadd_u32(vget_high_u32(q2tmp0.val[0]), vget_low_u32(q2tmp0.val[0]));
+    even = vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);
+    q[18] = even + ((q[16]>>1)^q[16]) + vgetq_lane_u32(qq16, 2);
+    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){  3,  7, 13, 16}),
+                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
+    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27,  0}),
+                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
+    qm1 = vaddq_u32(qm1, qm0);
+    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
+    q[18] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);
+
+    /* expand2( 3) */
+    q2tmp0 = vld2q_u32(&q[ 3]);
+    q2tmp1 = vld2q_u32(&q[11]);
+    q2tmp1.val[0] = vsetq_lane_u32(0, q2tmp1.val[0], 3);
+    q2tmp0.val[0] = vaddq_u32(q2tmp0.val[0], q2tmp1.val[0]);
+    dtmp0 = vadd_u32(vget_high_u32(q2tmp0.val[0]), vget_low_u32(q2tmp0.val[0]));
+    odd = vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);
+    q[19] = odd + ((q[17]>>1)^q[17]) + vgetq_lane_u32(qq16, 3);
+    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){  3,  7, 13, 16}),
+                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
+    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27,  0}),
+                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
+    qm1 = vaddq_u32(qm1, qm0);
+    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
+    q[19] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);
+
+    /* expand2( 4) */
+    q2tmp0 = vld2q_u32(&q[ 4]);
+    q2tmp1 = vld2q_u32(&q[12]);
+    even += q[16] - q[ 2];
+    q[20] = even + ((q[18]>>1)^q[18]) + vgetq_lane_u32(qq20, 0);
+    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){  3,  7, 13, 16}),
+                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
+    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27,  0}),
+                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
+    qm1 = vaddq_u32(qm1, qm0);
+    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
+    q[20] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);
+
+    /* expand2( 5) */
+    q2tmp0 = vld2q_u32(&q[ 5]);
+    q2tmp1 = vld2q_u32(&q[13]);
+    odd += q[17] - q[ 3];
+    q[21] = odd + ((q[19]>>1)^q[19]) + vgetq_lane_u32(qq20, 1);
+    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){  3,  7, 13, 16}),
+                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
+    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27,  0}),
+                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
+    qm1 = vaddq_u32(qm1, qm0);
+    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
+    q[21] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);
+
+    /* expand2( 6) */
+    q2tmp0 = vld2q_u32(&q[ 6]);
+    q2tmp1 = vld2q_u32(&q[14]);
+    even += q[18] - q[ 4];
+    q[22] = even + ((q[20]>>1)^q[20]) + vgetq_lane_u32(qq20, 2);
+    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){  3,  7, 13, 16}),
+                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
+    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27,  0}),
+                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
+    qm1 = vaddq_u32(qm1, qm0);
+    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
+    q[22] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);
+
+    /* expand2( 7) */
+    q2tmp0 = vld2q_u32(&q[ 7]);
+    q2tmp1 = vld2q_u32(&q[15]);
+    odd += q[19] - q[ 5];
+    q[23] = odd + ((q[21]>>1)^q[21]) + vgetq_lane_u32(qq20, 3);
+    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){  3,  7, 13, 16}),
+                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
+    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27,  0}),
+                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
+    qm1 = vaddq_u32(qm1, qm0);
+    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
+    q[23] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);
+
+    /* expand2( 8) */
+    q2tmp0 = vld2q_u32(&q[ 8]);
+    q2tmp1 = vld2q_u32(&q[16]);
+    even += q[20] - q[ 6];
+    q[24] = even + ((q[22]>>1)^q[22]) + vgetq_lane_u32(qq24, 0);
+    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){  3,  7, 13, 16}),
+                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
+    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27,  0}),
+                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
+    qm1 = vaddq_u32(qm1, qm0);
+    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
+    q[24] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);
+
+    /* expand2( 9) */
+    q2tmp0 = vld2q_u32(&q[ 9]);
+    q2tmp1 = vld2q_u32(&q[17]);
+    odd += q[21] - q[ 7];
+    q[25] = odd + ((q[23]>>1)^q[23]) + vgetq_lane_u32(qq24, 1);
+    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){  3,  7, 13, 16}),
+                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
+    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27,  0}),
+                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
+    qm1 = vaddq_u32(qm1, qm0);
+    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
+    q[25] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);
+
+    /* expand2(10) */
+    q2tmp0 = vld2q_u32(&q[10]);
+    q2tmp1 = vld2q_u32(&q[18]);
+    even += q[22] - q[ 8];
+    q[26] = even + ((q[24]>>1)^q[24]) + vgetq_lane_u32(qq24, 2);
+    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){  3,  7, 13, 16}),
+                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
+    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27,  0}),
+                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
+    qm1 = vaddq_u32(qm1, qm0);
+    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
+    q[26] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);
+
+    /* expand2(11) */
+    q2tmp0 = vld2q_u32(&q[11]);
+    q2tmp1 = vld2q_u32(&q[19]);
+    odd += q[23] - q[ 9];
+    q[27] = odd + ((q[25]>>1)^q[25]) + vgetq_lane_u32(qq24, 3);
+    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){  3,  7, 13, 16}),
+                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
+    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27,  0}),
+                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
+    qm1 = vaddq_u32(qm1, qm0);
+    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
+    q[27] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);
+
+    /* expand2(12) */
+    q2tmp0 = vld2q_u32(&q[12]);
+    q2tmp1 = vld2q_u32(&q[20]);
+    even += q[24] - q[10];
+    q[28] = even + ((q[26]>>1)^q[26]) + vgetq_lane_u32(qq28, 0);
+    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){  3,  7, 13, 16}),
+                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
+    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27,  0}),
+                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
+    qm1 = vaddq_u32(qm1, qm0);
+    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
+    q[28] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);
+
+    /* expand2(13) */
+    q2tmp0 = vld2q_u32(&q[13]);
+    q2tmp1 = vld2q_u32(&q[21]);
+    odd += q[25] - q[11];
+    q[29] = odd + ((q[27]>>1)^q[27]) + vgetq_lane_u32(qq28, 1);
+    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){  3,  7, 13, 16}),
+                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
+    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27,  0}),
+                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
+    qm1 = vaddq_u32(qm1, qm0);
+    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
+    q[29] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);
+
+    /* expand2(14) */
+    q2tmp0 = vld2q_u32(&q[14]);
+    q2tmp1 = vld2q_u32(&q[22]);
+    even += q[26] - q[12];
+    q[30] = even + ((q[28]>>1)^q[28]) + vgetq_lane_u32(qq28, 2);
+    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){  3,  7, 13, 16}),
+                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
+    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27,  0}),
+                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
+    qm1 = vaddq_u32(qm1, qm0);
+    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
+    q[30] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);
+
+    /* expand2(15) */
+    q2tmp0 = vld2q_u32(&q[15]);
+    q2tmp1 = vld2q_u32(&q[23]);
+    odd += q[27] - q[13];
+    q[31] = odd + ((q[29]>>1)^q[29]) + vgetq_lane_u32(qq28, 3);
+    qm0 = veorq_u32(vshlq_u32(q2tmp0.val[1],(int32x4_t){  3,  7, 13, 16}),
+                    vshlq_u32(q2tmp0.val[1],(int32x4_t){-29,-25,-19,-16}));
+    qm1 = veorq_u32(vshlq_u32(q2tmp1.val[1],(int32x4_t){ 19, 23, 27,  0}),
+                    vshlq_u32(q2tmp1.val[1],(int32x4_t){-13, -9, -5, -2}));
+    qm1 = vaddq_u32(qm1, qm0);
+    dtmp0 = vadd_u32(vget_high_u32(qm1), vget_low_u32(qm1));
+    q[31] += vget_lane_u32(dtmp0, 0) + vget_lane_u32(dtmp0, 1);
+}
+
+/* END of automatic generated code */
+
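For orientation, bmw_small_f1 computes the sixteen expanded quad-pipe words Q[16..31] of BMW-224/256 from Q[0..15], the 16-word message block m and the 16-word chaining value h: Q[16] and Q[17] use expand_1, Q[18..31] use expand_2, and every word folds in one AddElement lane (the qq16/qq20/qq24/qq28 vectors above). The plain-C sketch below spells those per-word formulas out, assuming the standard (tweaked) BMW-256 definitions; the helper names rotl32, s32_*, add_element, expand1, expand2 and bmw_small_f1_ref are illustrative and not part of the generated file. One caveat: the NEON code reads m[] and h[] at indices past 15 (e.g. m[22], h[15..18]), so its caller is presumably expected to supply buffers whose upper words mirror the lower ones; the sketch instead wraps every index modulo 16.

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, unsigned n){ return (x << n) | (x >> (32 - n)); }

/* the four s-functions applied lane-wise in expand1 (lane order s1, s2, s3, s0) */
static inline uint32_t s32_0(uint32_t x){ return (x >> 1) ^ (x << 3) ^ rotl32(x,  4) ^ rotl32(x, 19); }
static inline uint32_t s32_1(uint32_t x){ return (x >> 1) ^ (x << 2) ^ rotl32(x,  8) ^ rotl32(x, 23); }
static inline uint32_t s32_2(uint32_t x){ return (x >> 2) ^ (x << 1) ^ rotl32(x, 12) ^ rotl32(x, 25); }
static inline uint32_t s32_3(uint32_t x){ return (x >> 2) ^ (x << 2) ^ rotl32(x, 15) ^ rotl32(x, 29); }
static inline uint32_t s32_4(uint32_t x){ return (x >> 1) ^ x; }  /* the ((q[i]>>1)^q[i]) term */
static inline uint32_t s32_5(uint32_t x){ return (x >> 2) ^ x; }  /* the shift pair { 0, -2} lane */

/* AddElement(i): one lane of qq16/qq20/qq24/qq28 */
static uint32_t add_element(const uint32_t *m, const uint32_t *h, unsigned i){
    uint32_t r;
    r  = rotl32(m[ i       % 16], ( i       % 16) + 1);
    r += rotl32(m[(i +  3) % 16], ((i +  3) % 16) + 1);
    r -= rotl32(m[(i + 10) % 16], ((i + 10) % 16) + 1);
    r += (uint32_t)((i + 16) * 0x05555555UL);            /* 0x55555550, 0x5aaaaaa5, ... */
    return r ^ h[(i + 7) % 16];
}

/* Q[16+i] for i = 0, 1: s-functions over Q[i..i+15] plus AddElement(i) */
static uint32_t expand1(const uint32_t *q, const uint32_t *m, const uint32_t *h, unsigned i){
    uint32_t r = 0;
    for(unsigned k = 0; k < 16; k += 4){
        r += s32_1(q[i + k]) + s32_2(q[i + k + 1]) + s32_3(q[i + k + 2]) + s32_0(q[i + k + 3]);
    }
    return r + add_element(m, h, i);
}

/* Q[16+i] for i = 2 .. 15; the plain terms are what the running even/odd sums track */
static uint32_t expand2(const uint32_t *q, const uint32_t *m, const uint32_t *h, unsigned i){
    uint32_t r;
    r  = q[i     ] + rotl32(q[i +  1],  3);
    r += q[i +  2] + rotl32(q[i +  3],  7);
    r += q[i +  4] + rotl32(q[i +  5], 13);
    r += q[i +  6] + rotl32(q[i +  7], 16);
    r += q[i +  8] + rotl32(q[i +  9], 19);
    r += q[i + 10] + rotl32(q[i + 11], 23);
    r += q[i + 12] + rotl32(q[i + 13], 27);
    r += s32_4(q[i + 14]) + s32_5(q[i + 15]);
    return r + add_element(m, h, i);
}

/* reference f1: fills q[16..31] from q[0..15], m[0..15], h[0..15] */
static void bmw_small_f1_ref(uint32_t *q, const uint32_t *m, const uint32_t *h){
    q[16] = expand1(q, m, h, 0);
    q[17] = expand1(q, m, h, 1);
    for(unsigned i = 2; i < 16; i++){
        q[i + 16] = expand2(q, m, h, i);
    }
}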