+ a += rf[i/2](q[j+i+1]);
+ }
+#if TWEAK
+ a += bmw_large_s4(q[j+14]);
+ a += bmw_large_s5(q[j+15]);
+#else
+ a += bmw_large_s5(q[j+14]);
+ a += bmw_large_s4(q[j+15]);
+#endif
+#if TWEAK
+ /*
+ if(j==(22-16)){
+ uint64_t t;
+ cli_putstr_P(PSTR("\n+++++++++ expand_2 ++++++++++++"));
+ dump_x(&a, 1, 'a');
+ dump_x(&r, 1, 'r');
+ t=ROTL64(((uint64_t*)m)[j], ((j+ 0)&0xf)+1);
+ dump_x(&t, 1, '0');
+ t=ROTL64(((uint64_t*)m)[j], ((j+ 3)&0xf)+1);
+ dump_x(&t, 1, '0');
+ t=ROTL64(((uint64_t*)m)[j], ((j+ 0)&0xf)+1);
+ dump_x(&t, 1, '0');
+
+ }
+ */
+ a += ( ROTL64(((uint64_t*)m)[(j)&0xf], ((j+ 0)&0xf)+1)
+ + ROTL64(((uint64_t*)m)[(j+3)&0xf], ((j+ 3)&0xf)+1)
+ + r.v64
+ - ROTL64(((uint64_t*)m)[(j+10)&0xf],((j+10)&0xf)+1)
+ ) ^ ((uint64_t*)h)[(j+7)&0xf];
+#else
+ a += ((uint64_t*)m)[j&0xf];
+ a += ((uint64_t*)m)[(j+3)&0xf];
+ a -= ((uint64_t*)m)[(j+10)&0xf];
+ a += r.v64;
+#endif
+ return a;
+}
+
+#if F0_HACK==2
+/* to understand this implementation take a look at f0-opt-table.txt
+ *
+ * hack_table / offset_table drive the five accumulation passes (c = 4..0)
+ * of bmw_large_f0(). Both are declared PROGMEM and are read there with
+ * pgm_read_word() / pgm_read_byte() rather than by direct indexing.
+ *
+ * hack_table[c]:   16 sign bits for pass c, consumed LSB first while the
+ *                  destination index i runs from 15 down to 0;
+ *                  bit set -> q[i] -= h[j&15], bit clear -> q[i] += h[j&15].
+ * offset_table[c]: value of the source index j used for i == 15 in pass c;
+ *                  j is decremented once per step and reduced with & 15.
+ *                  NOTE(review): the +16 bias looks like it is there so the
+ *                  uint8_t j never counts below zero -- confirm.
+ */
+static uint16_t hack_table[5] PROGMEM = { 0x0311, 0xDDB3, 0x2A79, 0x07AA, 0x51C2 };
+static uint8_t offset_table[5] PROGMEM = { 4+16, 6+16, 9+16, 12+16, 13+16 };
+
+
+static
+void bmw_large_f0(uint64_t* q, const uint64_t* h, const void* m){
+ uint16_t hack_reg;
+ uint8_t i,j,c;
+ uint64_t(*s[])(uint64_t)={ bmw_large_s0, bmw_large_s1, bmw_large_s2,
+ bmw_large_s3, bmw_large_s4 };
+ for(i=0; i<16; ++i){
+ ((uint64_t*)h)[i] ^= ((uint64_t*)m)[i];
+ }
+ dump_x(h, 16, 'T');
+ memset(q, 0, 8*16);
+ c=4;
+ do{
+ i=15;
+ j=pgm_read_byte(offset_table+c);
+ hack_reg=pgm_read_word(&(hack_table[c]));
+ do{
+ if(hack_reg&1){
+ q[i]-= h[j&15];
+ }else{
+ q[i]+= h[j&15];
+ }
+ --j;
+ hack_reg>>= 1;
+ }while(i--!=0);
+ }while(c--!=0);
+ dump_x(q, 16, 'W');
+ for(i=0; i<16; ++i){
+ q[i] = s[i%5](q[i]);
+ }
+#if TWEAK
+ for(i=0; i<16; ++i){
+ ((uint64_t*)h)[i] ^= ((uint64_t*)m)[i];