--- /dev/null
+/* mqq160-sign_P-asm.S */
+/*
+ This file is part of the AVR-Crypto-Lib.
+ Copyright (C) 2010 Daniel Otte (daniel.otte@rub.de)
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+/**
+ * \file mqq160-sign_P-asm.S
+ * \email daniel.otte@rub.de
+ * \author Daniel Otte
+ * \date 2010-03-21
+ * \license GPLv3 or later
+ *
+ */
+
+#include "avr-asm-macros.S"
+
+#if 0
+static void mqq_inv_affine_transformation(uint8_t* input_bytes, uint8_t* result, const mqq160_sign_key_t* key){
+ /* The matrix SInv is given as two permutations of 160 elements. */
+ uint8_t j, byteindex, bitindex, bitindex_d, byteindex_d, rp1, rp5;
+ uint8_t *r1_ptr, *r5_ptr;
+ uint8_t h1[20];
+
+ /* Initialize H1 and H2 = 0 */
+ memset(h1, 0, 20);
+ memset(result, 0, 20);
+
+ /*
+ Fill H1 (h1) with bits of InputBytes according to the RP1 permutation
+ and fill H2 (result) with bits of InputBytes according to the RP5 permutation
+ */
+ bitindex_d = 0x80;
+ byteindex_d = 0;
+ j=160;
+ r1_ptr = key->rp1;
+ r5_ptr = key->rp5;
+ do{
+ rp1 = pgm_read_byte(r1_ptr++);
+ rp5 = pgm_read_byte(r5_ptr++);
+ byteindex = rp1>>3;
+ bitindex = 0x80 >> (rp1&0x07);
+ if (input_bytes[byteindex] & bitindex){
+ h1[byteindex_d] ^= bitindex_d;
+ }
+
+ byteindex = rp5>>3;
+ bitindex = 0x80 >> (rp5&0x07);
+ if (input_bytes[byteindex] & bitindex){
+ result[byteindex_d] ^= bitindex_d;
+ }
+ bitindex_d >>= 1;
+ if(bitindex_d==0){
+ ++byteindex_d;
+ bitindex_d = 0x80;
+ }
+ }while(--j);
+
+ for (j=0; j<20; j++){
+ result[j] ^= h1[j] ^ h1[pgm_read_byte(j+mod20_table)]
+ ^ h1[pgm_read_byte(8+j+mod20_table)]
+ ^ h1[pgm_read_byte(12+j+mod20_table)];
+ }
+}
+#endif
+
+/*
+ * fetch_bit - copy one selected input bit into the T flag.
+ *
+ * in:   Z        program memory pointer to the next table entry; the entry
+ *                is a bit index (bit 0 = MSB of input byte 0)
+ *       r24:r25  RAM pointer to the input byte array
+ * out:  T        value of the selected input bit
+ *       Z        advanced by one entry
+ * clobbers: r0, r28, r29 (Y is NOT preserved) and the SREG flags
+ */
+fetch_bit:
+ lpm r0, Z+
+ /* build the bit mask: start at the MSB, move right (index & 7) times */
+ ldi r29, 0x80
+ mov r28, r0
+ andi r28, 0x07
+ breq 2f
+1: lsr r29
+ dec r28
+ brne 1b
+2: mov r28, r0
+ /* byte offset = index >> 3 */
+ .rept 3
+ lsr r28
+ .endr
+ /* park the mask in r0 because Y is needed as the byte pointer */
+ mov r0, r29
+ clr r29
+ add r28, r24
+ adc r29, r25
+ ld r28, Y
+ /* assume the bit is set, then take that back if the masked byte is 0 */
+ set
+ and r28, r0
+ brne 3f
+ clt
+3: ret
+
+/*
+ * Register aliases for mqq_inv_affine_transformation.
+ * Some aliases share a register: the tmp_* names are only used in the
+ * final mixing loop, where xrp5_*, inp_* and xres_* are no longer needed.
+ */
+xres_0 = 18 /* copy of the result pointer */
+xres_1 = 19
+h_0 = 20 /* address of h1[0] on the stack */
+h_1 = 21
+xrp5_0 = 22 /* pointer loaded from key offset 8/9 (rp5) */
+xrp5_1 = 23
+inp_0 = 24 /* input_bytes pointer, read by fetch_bit */
+inp_1 = 25
+tmp_0 = 22 /* mixing loop: accumulator for one result byte */
+tmp_1 = 23 /* mixing loop: h1 byte just loaded */
+tmp_2 = 24 /* mixing loop: running pointer h1 + j (with tmp_3) */
+tmp_3 = 25
+tmp_4 = 18 /* mixing loop: low nibble of the table byte */
+
+/*
+ * mqq_inv_affine_transformation(input_bytes, result, key)
+ *
+ * param input_bytes: r24:r25  20 bytes in RAM
+ * param result:      r22:r23  20 bytes in RAM, overwritten
+ * param key:         r20:r21  key struct in RAM; the pointers at offset
+ *                             6/7 (rp1) and 8/9 (rp5) are dereferenced
+ *                             with lpm, 160 entries each
+ *
+ * Pass 1 gathers 160 input bits selected through rp1 into h1[20] (stack).
+ * Pass 2 gathers 160 input bits selected through rp5 into result[20].
+ * Pass 3 computes result[j] ^= h1[j] ^ three further h1 bytes chosen by
+ * affine_mix_lut[j].
+ *
+ * clobbers: r0, r18-r31 and the T flag; r17 is saved and restored; r1 is
+ * used as a bit collector and cleared again before the mixing loop.
+ * Y (r28:r29) is NOT restored, the caller has to take care of it.
+ * NOTE(review): relies on stack_alloc/stack_free from avr-asm-macros.S
+ * (not visible here) leaving the new stack pointer in Z - confirm there.
+ */
+;.global mqq_inv_affine_transformation
+mqq_inv_affine_transformation:
+ push r17
+ stack_alloc 20
+ adiw r30, 1 /* first byte of the fresh stack area = h1[0] */
+ movw r26, r30 /* X walks over h1 */
+ movw r28, r20 /* Y = key struct; must be copied before h_0 is set */
+ movw h_0, r30
+ movw xres_0, r22 /* keep result pointer; r22:r23 becomes xrp5 */
+ ldd r30, Y+6 /* Z = rp1 table */
+ ldd r31, Y+7
+ ldd xrp5_0, Y+8
+ ldd xrp5_1, Y+9
+
+ /* pass 1: h1[k] = eight bits picked via rp1, first bit lands in bit 7 */
+ ldi r17, 20
+1:
+ .irp bitpos,7,6,5,4,3,2,1,0
+ rcall fetch_bit
+ bld r1, \bitpos
+ .endr
+ st X+, r1
+ dec r17
+ brne 1b
+
+ /* pass 2: same gathering via rp5, written straight into result */
+ movw r30, xrp5_0
+ movw r26, xres_0
+ ldi r17, 20
+2:
+ .irp bitpos,7,6,5,4,3,2,1,0
+ rcall fetch_bit
+ bld r1, \bitpos
+ .endr
+ st X+, r1
+ dec r17
+ brne 2b
+ clr r1 /* r1 serves as zero for the adc below */
+
+ /* pass 3: mix h1 into result */
+ sbiw r26, 20 /* X back at result[0] */
+ ldi r17, 20
+ ldi r30, lo8(affine_mix_lut)
+ ldi r31, hi8(affine_mix_lut)
+ movw tmp_2, h_0 /* running pointer, starts at h1[0] */
+3:
+ lpm r0, Z+ /* table byte for this j */
+ ld tmp_0, X /* result[j] */
+ movw r28, tmp_2
+ ld tmp_1, Y+ /* h1[j], running pointer moves on */
+ movw tmp_2, r28
+ eor tmp_0, tmp_1
+ movw r28, h_0
+ mov tmp_4, r0
+ andi tmp_4, 0x0f
+ add r28, tmp_4 /* Y = h1 + low nibble of the table byte */
+ adc r29, r1
+ ld tmp_1, Y
+ eor tmp_0, tmp_1
+ /* two more bytes: each one 4 further on, or 8 if the flag bit is set */
+ .irp flagbit,7,6
+ adiw r28, 4
+ sbrc r0, \flagbit
+ adiw r28, 4
+ ld tmp_1, Y
+ eor tmp_0, tmp_1
+ .endr
+ st X+, tmp_0
+ dec r17
+ brne 3b
+
+ stack_free 20
+ pop r17
+ ret
+
+/*
+ * Index table for the mixing loop of mqq_inv_affine_transformation, read
+ * with lpm (program memory), one byte per result byte j = 0..19:
+ *   bits 3..0  index of the first additional h1 byte
+ *   bit 7      the second additional byte lies 8 (set) or 4 (clear)
+ *              bytes after the first
+ *   bit 6      the third additional byte lies 8 (set) or 4 (clear)
+ *              bytes after the second
+ * Bits 5 and 4 are not evaluated by the loop.
+ * Example: 0x84 selects h1[4], h1[12] and h1[16].
+ */
+affine_mix_lut:
+ .byte 0x84, 0x85, 0x86, 0x87
+ .byte 0xC0, 0xC1, 0xC2, 0xC3
+ .byte 0x40, 0x41, 0x42, 0x43
+ .byte 0x44, 0x45, 0x46, 0x47
+ .byte 0x80, 0x81, 0x82, 0x83
+
+/******************************************************************************/
+
+/*
+ * Register aliases for mqq_q. tmp_0 .. tmp_3 are reassigned here and get
+ * different registers than in the code above; tmp_2 is not referenced in
+ * mqq_q itself.
+ */
+xres = 20
+tmp_0 = 23
+tmp_1 = 22
+tmp_2 = 21
+tmp_3 = 19
+/*
+ mqq_q - builds an 8x8 bit matrix plus right-hand side from key tables
+ selected by b1, solves it by elimination and returns the solution byte.
+
+ param i: r24 (only bit 0, i.e. odd/even, is evaluated)
+ param b1: r22
+ param b2: r20
+ param key: r18:r19 (key struct in RAM; the pointers stored at offset
+                     0/1, 2/3 and 4/5 are dereferenced with lpm)
+ param scratch: X (r26:r27) must point at 25 bytes of RAM supplied by the
+                caller: e[0..8] followed by eight 16-bit rows a[0..7]
+ return: r24
+
+ clobbers: r0, r19, r20, r22-r31 and the T flag; r1 is used as a loop
+ counter and is zero again on return. Y is not preserved (the push/pop
+ and the local stack_alloc are commented out).
+*/
+;.global mqq_q
+mqq_q:
+; push r28
+; push r29
+; stack_alloc 25, r26, r27
+; adiw r26, 1 /* X points to e[0] */
+ /* pick the 9-byte start vector: key offset 2/3 if i is odd, 4/5 if even */
+ movw r28, r18
+ sbrs r24, 0
+ adiw r28, 2
+ ldd r30, Y+2
+ ldd r31, Y+3
+ /* copy 9 bytes from program memory to e[0..8]; r28 is the counter */
+ ldi r28, 9
+10: lpm r0, Z+
+ st X+, r0
+ dec r28
+ brne 10b
+ sbiw r26, 9 /* adjust X to point at e[0] */
+;---
+ movw r28, r18
+ ld r30, Y+ /* Z points to a[0] in progmem (table from key offset 0/1, not the a[] rows in the scratch area) */
+ ld r31, Y
+ sbrs r24, 0
+ rjmp 40f
+ /*
+  i odd: for every bit of b1, MSB first, that is set, XOR the nine table
+  bytes Z, Z+9, ..., Z+72 into e[0..8]; Z moves on by 1 per bit of b1.
+  The loop stops as soon as the bits left in b1 are all zero.
+  r24 (i) is reused as scratch from here on.
+ */
+20:
+ sbrs r22, 7
+ rjmp 30f
+ ldi r25, 9
+ movw r28, r30 /* remember Z, it is restored after the inner loop */
+25: lpm r0, Z
+ adiw r30, 9
+ ld r24, X
+ eor r24, r0
+ st X+, r24
+ dec r25
+ brne 25b
+ movw r30, r28
+ sbiw r26, 9
+30:
+ adiw r30, 1
+ lsl r22
+ breq 60f
+ rjmp 20b
+ /*
+  i even: for every bit of b1, MSB first, that is set, XOR the nine
+  consecutive table bytes at Z into e[0..8]; Z moves on by 9 per bit.
+ */
+40:
+ sbrs r22, 7
+ rjmp 50f
+ ldi r25, 9
+ movw r28, r30 /* remember Z, it is restored after the inner loop */
+45: lpm r0, Z+
+ ld r24, X
+ eor r24, r0
+ st X+, r24
+ dec r25
+ brne 45b
+ movw r30, r28
+ sbiw r26, 9
+50:
+ adiw r30, 9
+ lsl r22
+ breq 60f
+ rjmp 40b
+60:
+;------ all inputs are consumed, X points at e[0]
+;------ So we finished with obtaining e0 .. e7 and e8
+ /* xres = b2 ^ e[8]: the value the linear system is solved for */
+ movw r28, r26
+ ldd r0, Y+8
+ eor xres, r0
+;---
+
+/*
+ We can look at the bits of e0 .. e7 as the columns of a matrix. We want
+ eight variables that hold the rows of that matrix. The variables need to
+ be 16 bits wide: the upper 8 bits receive the bits of e0 .. e7 and bit 0
+ of the lower byte receives one bit of xres (b2 ^ e8). After the
+ elimination the result is found in the least significant bits of
+ a[0] ... a[7].
+*/
+ adiw r28, 9 /* Y points at a[0] */
+ ldi r25, 8
+63:
+ /* row: shift the MSB out of each of e[0..7], e[0] ends up in bit 7 */
+ ldi r24, 8
+ clr tmp_0
+65: ld tmp_1, X
+ lsl tmp_1
+ st X+, tmp_1
+ rol tmp_0
+ dec r24
+ brne 65b
+;---
+ /* low byte of the row = current MSB of xres, high byte = matrix row */
+ clr tmp_1
+ lsl xres
+ rol tmp_1
+ st Y+, tmp_1
+ st Y+, tmp_0
+ sbiw r26, 8
+ dec r25
+ brne 63b
+;------- First we apply upper triangular transformation
+ sbiw r28, 16 /* Y points at a[0] */
+ movw r30, r28 /* Z points at a[0] */
+
+col = 25
+ ldi r24, 8
+ clr col
+70:
+ /* tmp_3 = 0x80 >> col, the column mask in the high byte; r1 counts */
+ mov r1, col
+ ldi tmp_3, 0x80
+ tst r1
+ breq 72f
+71: lsr tmp_3
+ dec r1
+ brne 71b
+72:
+ /*
+  pivot search: starting at a[col] (= Z) look for a row with the column
+  bit set; T records whether the search had to move past a[col].
+  NOTE(review): the search has no upper bound, it relies on such a row
+  existing at or below a[col].
+ */
+ clt
+ movw r28, r30 /* Y points at a[row]*/
+73: ldd tmp_0, Y+1
+ and tmp_0, tmp_3
+ brne 74f
+ set
+ adiw r28, 2
+ rjmp 73b
+74:
+ /* Y points at a[row] */
+ /* if T is set we have to permute [Y] and [Z] */
+ brtc 75f
+ ld tmp_0, Y
+ ld tmp_1, Z
+ st Y, tmp_1
+ st Z, tmp_0
+ ldd tmp_0, Y+1
+ ldd tmp_1, Z+1
+ std Y+1, tmp_1
+ std Z+1, tmp_0
+75: /* permutation done */
+ /* XOR a[col] into each of the 7-col rows below it that has the bit set */
+ /* r26 (low byte of X) is the row counter, X is not needed any more */
+ ldi r26, 7
+ sub r26, col
+ breq 78f
+ movw r28, r30
+76: adiw r28, 2
+ ldd tmp_0, Y+1
+ and tmp_0, tmp_3
+ breq 77f
+ ld tmp_0, Y
+ ld tmp_1, Z
+ eor tmp_0, tmp_1
+ st Y, tmp_0
+ ldd tmp_0, Y+1
+ ldd tmp_1, Z+1
+ eor tmp_0, tmp_1
+ std Y+1, tmp_0
+77:
+ dec r26
+ brne 76b
+78:
+ adiw r30, 2
+ inc col
+ dec r24
+ brne 70b
+79:
+;------ Then we eliminate 1s above the main diagonal
+
+ /* walk col from 7 down to 1; Z = a[col], tmp_3 = mask of column col */
+ ldi col, 7
+ ldi tmp_3, 1
+ sbiw r30, 2
+80:
+ movw r28, r30
+ mov r26, col /* col rows lie above a[col] */
+81:
+ sbiw r28, 2
+ ldd tmp_0, Y+1
+ and tmp_0, tmp_3
+ breq 82f
+ ld tmp_0, Y
+ ld tmp_1, Z
+ eor tmp_0, tmp_1
+ st Y, tmp_0
+ ldd tmp_0, Y+1
+ ldd tmp_1, Z+1
+ eor tmp_0, tmp_1
+ std Y+1, tmp_0
+82:
+ dec r26
+ brne 81b
+ sbiw r30, 2
+ lsl tmp_3
+ dec col
+ brne 80b
+89:
+;------ The result is in the Least Significant Bits of a[0] ... a[7]
+ /* Z should point at a[0] */
+ /* collect bit 0 of each low byte into r24, a[0] ends up in bit 7 */
+ ldi r25, 8
+ clr r24
+90:
+ ld tmp_0, Z
+ adiw r30, 2
+ lsr tmp_0
+ rol r24
+ dec r25
+ brne 90b
+mqq_q_exit:
+; stack_free 25
+; pop r29
+; pop r28
+ ret
+
+/******************************************************************************/
+
+/*
+ mqq160_sign_P - the only exported routine of this file. It copies the
+ 10-byte key struct from program memory to the stack, runs
+ mqq_inv_affine_transformation on the hash, derives the 20-byte buffer
+ r1[] with mqq_q, XORs a constant taken from the rp5 table into r1[] and
+ runs mqq_inv_affine_transformation again on r1[]. The result is written
+ to dest.
+
+ param dest: r24:r25 (20 bytes in RAM, written)
+ param hash: r22:r23 (20 bytes in RAM, read)
+ param key: r20:r21 (key struct in program memory, read with lpm)
+
+ Note: "r1[]" in the comments below is the 20-byte stack buffer of the C
+ code, not the register r1. The register r1 is used as zero in the adc
+ instructions; both helper routines leave it cleared.
+ NOTE(review): push_range, pop_range, stack_alloc and stack_free come from
+ avr-asm-macros.S, which is not visible here; presumably push_range 2, 11
+ saves r2..r11 and stack_alloc leaves the new stack pointer in the given
+ register pair - confirm there.
+*/
+
+/* registers that keep their value across the calls to the helpers */
+dest_0 = 2
+dest_1 = 3
+xr1_0 = 4
+xr1_1 = 5
+key_0 = 6
+key_1 = 7
+i = 8
+c = 9
+qstack_0 = 10
+qstack_1 = 11
+
+.global mqq160_sign_P
+mqq160_sign_P:
+ push_range 2, 11
+ push_range 28, 29
+ stack_alloc 10+20, r26, r27 /* r1[20] + key */
+ adiw r26, 1 /* X points to stack memory */
+ movw key_0, r26
+ /* load key structure */
+ /* 10 bytes are copied from program memory to the start of the area */
+ movw r30, r20
+ ldi r18, 10
+10: lpm r0, Z+
+ st X+, r0
+ dec r18
+ brne 10b
+ /* the 20 bytes that follow the key copy are r1[] */
+ movw xr1_0, r26
+ movw dest_0, r24
+ /* call to mqq_inv_affine_transformation(hash, dest, &key); */
+ movw r24, r22
+ movw r22, dest_0
+ movw r20, key_0
+ rcall mqq_inv_affine_transformation
+ /* r1[0]=((uint8_t*)dest)[0]; */
+
+ movw r26, dest_0
+ movw r30, xr1_0
+ ld r0, X
+ st Z, r0
+;----
+ /* c = 19 remaining iterations, i starts at 1 */
+ ldi r18, 19
+ mov c, r18
+ clr i
+ inc i
+ /* 25 bytes of scratch memory for mqq_q, handed over in X at each call */
+ stack_alloc 25, r28, r29
+ adiw r28, 1
+ movw qstack_0, r28
+ /* r1[i] = mqq_q(i, r1[i-1], dest[i], &key) for i = 1 .. 19 */
+20: mov r24, i
+ movw r26, xr1_0
+ add r26, i
+ adc r27, r1
+ sbiw r26, 1
+ ld r22, X
+ movw r26, dest_0
+ add r26, i
+ adc r27, r1
+ ld r20, X
+ movw r18, key_0
+ movw r26, qstack_0
+ rcall mqq_q
+ movw r26, xr1_0
+ add r26, i
+ adc r27, r1
+ st X, r24
+ inc i
+ dec c
+ brne 20b
+ stack_free 25
+;-----
+
+
+ /*
+  XOR a constant into r1[0..19]: Z = pointer from key offset 8/9 (rp5);
+  each constant byte is (low nibble of one table byte) << 4 combined with
+  the low nibble of the following table byte, so 40 table bytes are read.
+ */
+ movw r28, key_0
+ ldd r30, Y+8
+ ldd r31, Y+9
+ movw r26, xr1_0
+ ldi r18, 20
+30: lpm r20, Z+
+ swap r20
+ andi r20, 0xF0
+ lpm r21, Z+
+ andi r21, 0x0F
+ or r20, r21
+ ld r21, X
+ eor r21, r20
+ st X+, r21
+ dec r18
+ brne 30b
+;----
+
+ /* call to mqq_inv_affine_transformation(r1, dest, &key) */
+ movw r24, xr1_0
+ movw r22, dest_0
+ movw r20, key_0
+ rcall mqq_inv_affine_transformation
+ /* release r1[] and the key copy, then restore the saved registers */
+ stack_free 30
+ pop_range 28, 29
+ pop_range 2, 11
+ ret
+
+