+#if F0_HACK
+/*
+ * Lookup table for the table-driven bmw_large_f0, stored in program memory.
+ *
+ * 16 rows of 5 bytes; row j lists the five words of h that are combined
+ * into q[j]. Each byte encodes (index << 1) | sign, where index selects
+ * h[index] and sign == 1 means "subtract", sign == 0 means "add".
+ *
+ * Declared const: objects placed in PROGMEM live in read-only flash and
+ * recent avr-gcc versions refuse a non-const PROGMEM definition.
+ */
+static
+const uint8_t f0_lut[] PROGMEM ={
+     5<<1, ( 7<<1)+1, (10<<1)+0, (13<<1)+0, (14<<1)+0, /* q[ 0]: +h5  -h7  +h10 +h13 +h14 */
+     6<<1, ( 8<<1)+1, (11<<1)+0, (14<<1)+0, (15<<1)+1, /* q[ 1]: +h6  -h8  +h11 +h14 -h15 */
+     0<<1, ( 7<<1)+0, ( 9<<1)+0, (12<<1)+1, (15<<1)+0, /* q[ 2]: +h0  +h7  +h9  -h12 +h15 */
+     0<<1, ( 1<<1)+1, ( 8<<1)+0, (10<<1)+1, (13<<1)+0, /* q[ 3]: +h0  -h1  +h8  -h10 +h13 */
+     1<<1, ( 2<<1)+0, ( 9<<1)+0, (11<<1)+1, (14<<1)+1, /* q[ 4]: +h1  +h2  +h9  -h11 -h14 */
+     3<<1, ( 2<<1)+1, (10<<1)+0, (12<<1)+1, (15<<1)+0, /* q[ 5]: +h3  -h2  +h10 -h12 +h15 */
+     4<<1, ( 0<<1)+1, ( 3<<1)+1, (11<<1)+1, (13<<1)+0, /* q[ 6]: +h4  -h0  -h3  -h11 +h13 */
+     1<<1, ( 4<<1)+1, ( 5<<1)+1, (12<<1)+1, (14<<1)+1, /* q[ 7]: +h1  -h4  -h5  -h12 -h14 */
+     2<<1, ( 5<<1)+1, ( 6<<1)+1, (13<<1)+0, (15<<1)+1, /* q[ 8]: +h2  -h5  -h6  +h13 -h15 */
+     0<<1, ( 3<<1)+1, ( 6<<1)+0, ( 7<<1)+1, (14<<1)+0, /* q[ 9]: +h0  -h3  +h6  -h7  +h14 */
+     8<<1, ( 1<<1)+1, ( 4<<1)+1, ( 7<<1)+1, (15<<1)+0, /* q[10]: +h8  -h1  -h4  -h7  +h15 */
+     8<<1, ( 0<<1)+1, ( 2<<1)+1, ( 5<<1)+1, ( 9<<1)+0, /* q[11]: +h8  -h0  -h2  -h5  +h9  */
+     1<<1, ( 3<<1)+0, ( 6<<1)+1, ( 9<<1)+1, (10<<1)+0, /* q[12]: +h1  +h3  -h6  -h9  +h10 */
+     2<<1, ( 4<<1)+0, ( 7<<1)+0, (10<<1)+0, (11<<1)+0, /* q[13]: +h2  +h4  +h7  +h10 +h11 */
+     3<<1, ( 5<<1)+1, ( 8<<1)+0, (11<<1)+1, (12<<1)+1, /* q[14]: +h3  -h5  +h8  -h11 -h12 */
+    12<<1, ( 4<<1)+1, ( 6<<1)+1, ( 9<<1)+1, (13<<1)+0  /* q[15]: +h12 -h4  -h6  -h9  +h13 */
+};
+
+/**
+ * Table-driven f0 step (variant built when F0_HACK is non-zero).
+ *
+ * 1. XORs the 16 words read from m into h, in place.
+ * 2. For each j in 0..15, builds a 64-bit sum of five words of the updated
+ *    h, added or subtracted as encoded in row j of f0_lut (arithmetic is
+ *    unsigned, i.e. wraps modulo 2^64).
+ * 3. Replaces q[j] by s[j % 5](q[j]) with s = bmw_large_s0 .. bmw_large_s4.
+ *
+ * @param q  output buffer of 16 uint64_t; every element is overwritten
+ * @param h  buffer of 16 uint64_t; modified in place (h[i] ^= m[i])
+ * @param m  input block, read as 16 uint64_t
+ *           (NOTE(review): assumes m is suitably aligned for uint64_t access)
+ */
+void bmw_large_f0(uint64_t* q, uint64_t* h, const void* m){
+    uint8_t i, row, col, idx, v;
+    uint64_t acc;
+    const uint64_t* msg = (const uint64_t*)m;
+    uint64_t(*s[])(uint64_t)={ bmw_large_s0, bmw_large_s1, bmw_large_s2,
+                               bmw_large_s3, bmw_large_s4 };
+    for(i=0; i<16; ++i){
+        h[i] ^= msg[i];
+    }
+    dump_x(h, 16, 'T');
+    /*
+     * Each output word is accumulated from zero in a local and then stored.
+     * q holds 16 * 8 = 128 bytes; clearing only 4*16 = 64 bytes up front
+     * would leave q[8..15] starting from whatever the caller's buffer held.
+     */
+    idx = 0;
+    for(row=0; row<16; ++row){
+        acc = 0;
+        for(col=0; col<5; ++col){
+            /* table byte: bit 0 = sign (1 -> subtract), bits 7..1 = index into h */
+            v = pgm_read_byte(f0_lut+idx);
+            ++idx;
+            if(v&1){
+                acc -= h[v>>1];
+            }else{
+                acc += h[v>>1];
+            }
+        }
+        q[row] = acc;
+    }
+    dump_x(q, 16, 'W');
+    for(i=0; i<16; ++i){
+        q[i] = s[i%5](q[i]);
+    }
+}
+
+#else
+void bmw_large_f0(uint64_t* q, uint64_t* h, const void* m){