sha256/sha256-asm.S

   1 /* sha256-asm.S */
   2 /*
   3     This file is part of the AVR-Crypto-Lib.
   4     Copyright (C) 2008  Daniel Otte (daniel.otte@rub.de)
   5
   6     This program is free software: you can redistribute it and/or modify
   7     it under the terms of the GNU General Public License as published by
   8     the Free Software Foundation, either version 3 of the License, or
   9     (at your option) any later version.
  10
  11     This program is distributed in the hope that it will be useful,
  12     but WITHOUT ANY WARRANTY; without even the implied warranty of
  13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14     GNU General Public License for more details.
  15
  16     You should have received a copy of the GNU General Public License
  17     along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 */
  19 /*
  20  * Author:      Daniel Otte
  21  *
  22  * License: GPLv3 or later
  23 */
   24 ; SHA-256 implementation in AVR assembler
   25 SHA256_BLOCK_BITS = 512 ; size of one message block in bits (64 bytes)
   26 SHA256_HASH_BITS = 256  ; size of the hash in bits (8 words of 32 bits)
  27
  28
   29 .macro precall
   30         /* push r0, r1, r18 - r27, r30 - r31, then clear r1 */
      /* postcall pops the same registers in the reverse order */
      /* NOTE(review): r1 is presumably cleared because the called code */
      /* expects r1 == 0 -- confirm against the compiler ABI in use */
   31         push r0
   32         push r1
   33         push r18
   34         push r19
   35         push r20
   36         push r21
   37         push r22
   38         push r23
   39         push r24
   40         push r25
   41         push r26
   42         push r27
   43         push r30
   44         push r31
   45         clr r1
   46 .endm
  47
   48 .macro postcall
      /* restores the registers saved by precall (exact reverse push order) */
   49         pop r31
   50         pop r30
   51         pop r27
   52         pop r26
   53         pop r25
   54         pop r24
   55         pop r23
   56         pop r22
   57         pop r21
   58         pop r20
   59         pop r19
   60         pop r18
   61         pop r1
   62         pop r0
   63 .endm
  64
  65
   66 .macro hexdump length
      /* debug helper: sends CR and LF through uart_putc, then hands the */
      /* address held in X (r27:r26) to uart_hexdump in r25:r24 together */
      /* with a byte count in r23:r22; at most 16 bytes per call */
      /* for more than 16 bytes X is advanced by 16 and the macro expands */
      /* itself again for the remaining bytes */
      /* X is saved around the calls; r22 - r25 are overwritten here */
      /* NOTE(review): uart_putc and uart_hexdump are defined elsewhere, */
      /* their argument convention is assumed from the register setup below */
   67         push r27
   68         push r26
   69         ldi r25, '\r'
   70         mov r24, r25
   71         call uart_putc
   72         ldi r25, '\n'
   73         mov r24, r25
   74         call uart_putc
   75         pop r26
   76         pop r27
   77         movw r24, r26
   78 .if \length > 16
   79         ldi r22, lo8(16)
   80         ldi r23, hi8(16)
   81         push r27
   82         push r26
   83         call uart_hexdump
   84         pop r26
   85         pop r27
   86         adiw r26, 16
   87         hexdump \length-16
   88 .else
   89         ldi r22, lo8(\length)
   90         ldi r23, hi8(\length)
   91         call uart_hexdump
   92 .endif
   93 .endm
  94
   95 /* dumps the given number of bytes starting at the address in X; */
      /* precall and postcall save and restore r0, r1, r18-r27, r30, r31 */
      /* around the dump (X included) */
   96 .macro dbg_hexdump length
   97         precall
   98         hexdump \length
   99         postcall
  100 .endm
 101
  102 .section .text
  103
  104 SPL = 0x3D      ; I/O address used with in/out for the stack pointer low byte
  105 SPH = 0x3E      ; I/O address used with in/out for the stack pointer high byte
  106 SREG = 0x3F     ; I/O address used with in/out for the status register
 107
 108
 109 ;
 110 ;sha256_ctx_t is:
 111 ;
 112 ; [h0][h1][h2][h3][h4][h5][h6][h7][length]
  113 ; each hn is 32 bits wide, length is 64 bits wide (40 bytes in total)
 114
 115 ;###########################################################
 116
 117 .global sha256_ctx2hash
 118 ; === sha256_ctx2hash ===
 119 ; this function converts a state into a normal hash (bytestring)
 120 ;  param1: the 16-bit destination pointer
 121 ;       given in r25,r24 (r25 is most significant)
 122 ;  param2: the 16-bit pointer to sha256_ctx structure
 123 ;       given in r23,r22
 124 sha256_ctx2hash:
 125         movw r26, r22
 126         movw r30, r24
 127         ldi r21, 8
 128         sbiw r26, 4
 129 1:
 130         ldi r20, 4
 131         adiw r26, 8
 132 2:
 133                 ld r0, -X
 134                 st Z+, r0
 135         dec r20
 136         brne 2b
 137
 138         dec r21
 139         brne 1b
 140
 141         ret
 142
 143 ;###########################################################
 144
  145 .global sha256
  146 ; === sha256 ===
  147 ; this function calculates SHA-256 hashes from messages in RAM
  148 ;  param1: the 16-bit hash destination pointer
  149 ;       given in r25,r24 (r25 is most significant)
  150 ;  param2: the 16-bit pointer to message
  151 ;       given in r23,r22
  152 ;  param3: 32-bit length value (length of message in bits)
  153 ;   given in r21,r20,r19,r18
      ; A 40 byte context (8*4 state bytes + 8 length bytes) lives on the stack
      ; for the duration of the call.
      ; register use after the prolog:
      ;   r11:r10:r9:r8  remaining message length in bits
      ;   r13:r12        pointer to the next unprocessed message byte
      ;   r17:r16        pointer to the context on the stack
  154 sha256:
  155 sha256_prolog:
  156         push r8
  157         push r9
  158         push r10
  159         push r11
  160         push r12
  161         push r13
  162         push r16
  163         push r17
  164         in r30, SPL
  165         in r31, SPH
  166         sbiw r30, 8*4+8         ; reserve 40 bytes for the context
  167         in r0, SREG
  168         cli                     ; no interrupts while SP is half updated
  169         out SPL, r30
  170         out SREG, r0            ; NOTE(review): SREG is restored before SPH is written;
  171         out SPH, r31            ; presumably safe because one more instruction runs first -- confirm
  172
  173         push r25                ; keep the destination pointer for sha256_ctx2hash
  174         push r24
  175         adiw r30, 1             ; Z = first byte of the reserved area
  176         movw r16, r30           ; r17:r16 = context pointer
  177         movw r8, r18            /* backup of length*/
  178         movw r10, r20
  179
  180         movw r12, r22   /* backup of msg-ptr */
  181
  182         movw r24, r16
  183         rcall sha256_init
  184         /* while length > 0xffff: hash one full block and subtract 512 bits */
  185 1:
  186         tst r11
  187         brne 2f
  188         tst r10
  189         breq 4f                 ; upper 16 bits are zero: the rest goes to sha256_lastBlock
  190 2:
  191         movw r24, r16
  192         movw r22, r12
  193         rcall sha256_nextBlock
  194         ldi r19, 64
  195         add r12, r19            ; msg-ptr += 64
  196         adc r13, r1             ; r1 is expected to hold zero here
  197         /* length -= 512 */
  198         ldi r19, 0x02
  199         sub r9, r19             ; 512 == 0x0200, so byte 0 (r8) is left alone
  200         sbc r10, r1
  201         sbc r11, r1
  202         rjmp 1b
  203
  204 4:
  205         movw r24, r16
  206         movw r22, r12
  207         movw r20, r8            ; remaining length (fits into 16 bits now)
  208         rcall sha256_lastBlock
  209
  210         pop r24                 ; destination pointer saved in the prolog
  211         pop r25
  212         movw r22, r16
  213         rcall sha256_ctx2hash
  214
  215 sha256_epilog:
  216         in r30, SPL
  217         in r31, SPH
  218         adiw r30, 8*4+8         ; release the 40 byte context
  219         in r0, SREG
  220         cli
  221         out SPL, r30
  222         out SREG, r0
  223         out SPH, r31
  224         pop r17
  225         pop r16
  226         pop r13
  227         pop r12
  228         pop r11
  229         pop r10
  230         pop r9
  231         pop r8
  232         ret
 233
 234 ;###########################################################
 235
 236
 237 ; block MUST NOT be larger than 64 bytes
 238
 239 .global sha256_lastBlock
 240 ; === sha256_lastBlock ===
 241 ; this function does padding & Co. for calculating SHA-256 hashes
 242 ;  param1: the 16-bit pointer to sha256_ctx structure
 243 ;       given in r25,r24 (r25 is most significant)
 244 ;  param2: an 16-bit pointer to 64 byte block to hash
 245 ;       given in r23,r22
 246 ;  param3: an 16-bit integer specifing length of block in bits
 247 ;       given in r21,r20
 248 sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1)
 249
 250
 251 sha256_lastBlock:
 252         cpi r21, 0x02
 253         brlo sha256_lastBlock_prolog
 254         push r25
 255         push r24
 256         push r23
 257         push r22
 258         push r21
 259         push r20
 260         rcall sha256_nextBlock
 261         pop r20
 262         pop r21
 263         pop r22
 264         pop r23
 265         pop r24
 266         pop r25
 267         subi r21, 0x02
 268         ldi r19, 64
 269         add r22, r19
 270         adc r23, r1
 271         rjmp sha256_lastBlock
 272 sha256_lastBlock_prolog:
 273         /* allocate space on stack */
 274         in r30, SPL
 275         in r31, SPH
 276         in r1, SREG
 277         subi r30, lo8(64)
 278         sbci r31, hi8(64)
 279         cli
 280         out SPL, r30
 281         out SPH, r31
 282         out SREG,r1
 283
 284         adiw r30, 1 /* SP points to next free byte on stack */
 285         mov r18, r20 /* r20 = LSB(length) */
 286         lsr r18
 287         lsr r18
 288         lsr r18
 289         bst r21, 0      /* may be we should explain this ... */
 290         bld r18, 5  /* now: r18 == length/8 (aka. length in bytes) */
 291
 292
 293         movw r26, r22 /* X points to begin of msg */
 294         tst r18
 295         breq sha256_lastBlock_post_copy
 296         mov r1, r18
 297 sha256_lastBlock_copy_loop:
 298         ld r0, X+
 299         st Z+, r0
 300         dec r1
 301         brne sha256_lastBlock_copy_loop
 302 sha256_lastBlock_post_copy:
 303 sha256_lastBlock_insert_stuffing_bit:
 304         ldi r19, 0x80
 305         mov r0,r19
 306         ldi r19, 0x07
 307         and r19, r20 /* if we are in bitmode */
 308         breq 2f /* no bitmode */
 309 1:
 310         lsr r0
 311         dec r19
 312         brne 1b
 313         ld r19, X
 314 /* maybe we should do some ANDing here, just for safety */
 315         or r0, r19
 316 2:
 317         st Z+, r0
 318         inc r18
 319
 320 /* checking stuff here */
 321         cpi r18, 64-8+1
 322         brsh 0f
 323         rjmp sha256_lastBlock_insert_zeros
 324 0:
 325         /* oh shit, we landed here */
 326         /* first we have to fill it up with zeros */
 327         ldi r19, 64
 328         sub r19, r18
 329         breq 2f
 330 1:
 331         st Z+, r1
 332         dec r19
 333         brne 1b
 334 2:
 335         sbiw r30, 63
 336         sbiw r30,  1
 337         movw r22, r30
 338
 339         push r31
 340         push r30
 341         push r25
 342         push r24
 343         push r21
 344         push r20
 345         rcall sha256_nextBlock
 346         pop r20
 347         pop r21
 348         pop r24
 349         pop r25
 350         pop r30
 351         pop r31
 352
 353         /* now we should subtract 512 from length */
 354         movw r26, r24
 355         adiw r26, 4*8+1 /* we can skip the lowest byte */
 356         ld r19, X
 357         subi r19, hi8(512)
 358         st X+, r19
 359         ldi r18, 6
 360 1:
 361         ld r19, X
 362         sbci r19, 0
 363         st X+, r19
 364         dec r18
 365         brne 1b
 366
 367 ;       clr r18 /* not neccessary ;-) */
 368         /* reset Z pointer to begin of block */
 369
 370 sha256_lastBlock_insert_zeros:
 371         ldi r19, 64-8
 372         sub r19, r18
 373         breq sha256_lastBlock_insert_length
 374         clr r1
 375 1:
 376         st Z+, r1       /* r1 is still zero */
 377         dec r19
 378         brne 1b
 379
 380 ;       rjmp sha256_lastBlock_epilog
 381 sha256_lastBlock_insert_length:
 382         movw r26, r24   /* X points to state */
 383         adiw r26, 8*4   /* X points to (state.length) */
 384         adiw r30, 8             /* Z points one after the last byte of block */
 385         ld r0, X+
 386         add r0, r20
 387         st -Z, r0
 388         ld r0, X+
 389         adc r0, r21
 390         st -Z, r0
 391         ldi r19, 6
 392 1:
 393         ld r0, X+
 394         adc r0, r1
 395         st -Z, r0
 396         dec r19
 397         brne 1b
 398
 399         sbiw r30, 64-8
 400         movw r22, r30
 401         rcall sha256_nextBlock
 402
 403 sha256_lastBlock_epilog:
 404         in r30, SPL
 405         in r31, SPH
 406         in r1, SREG
 407         adiw r30, 63 ; lo8(64)
 408         adiw r30,  1  ; hi8(64)
 409         cli
 410         out SPL, r30
 411         out SPH, r31
 412         out SREG,r1
 413         clr r1
 414         clr r0
 415         ret
 416
 417 /**/
 418 ;###########################################################
 419
  420 .global sha256_nextBlock
  421 ; === sha256_nextBlock ===
  422 ; this is the core function for calculating SHA-256 hashes
  423 ;  param1: the 16-bit pointer to sha256_ctx structure
  424 ;       given in r25,r24 (r25 is most significant)
  425 ;  param2: an 16-bit pointer to 64 byte block to hash
  426 ;       given in r23,r22
      ; The eight state words in the context are updated and 512 is added to
      ; the length field of the context.
      ; r4 - r17, r28 and r29 are saved and restored; r0, r18 - r27, r30 and r31
      ; are overwritten; r1 is used as loop counter and cleared before return.
  427 sha256_nextBlock_localSpace = (64+8)*4 ; 64 32-bit values for w array and 8 32-bit values for a array (total 288 byte)
  428
      ; register numbers of the 32-bit working values (1 = least significant byte)
  429 Bck1 = 12
  430 Bck2 = 13
  431 Bck3 = 14
  432 Bck4 = 15
  433 Func1 = 22
  434 Func2 = 23
  435 Func3 = 24
  436 Func4 = 25
  437 Accu1 = 16
  438 Accu2 = 17
  439 Accu3 = 18
  440 Accu4 = 19
  441 XAccu1 = 8
  442 XAccu2 = 9
  443 XAccu3 = 10
  444 XAccu4 = 11
  445 T1      = 4
  446 T2      = 5
  447 T3      = 6
  448 T4      = 7
  449 LoopC = 1
  450 /* byteorder: high number <--> high significance */
  451 sha256_nextBlock:
  452  ; first make some space for the local variables
  453         push r4 /* replace push & pop by mem ops? */
  454         push r5
  455         push r6
  456         push r7
  457         push r8
  458         push r9
  459         push r10
  460         push r11
  461         push r12
  462         push r13
  463         push r14
  464         push r15
  465         push r16
  466         push r17
  467         push r28
  468         push r29
  469         in r20, SPL
  470         in r21, SPH
  471         movw r18, r20                   ;backup SP
  472 ;       movw r26, r20                   ; X points to free space on stack
  473         movw r30, r22                   ; Z points to message
  474         subi r20, lo8(sha256_nextBlock_localSpace) ;sbiw can do only up to 63
  475         sbci r21, hi8(sha256_nextBlock_localSpace)
  476         movw r26, r20                   ; X points to free space on stack
  477         in r0, SREG
  478         cli ; we want to be uninterrupted while updating SP
  479         out SPL, r20
  480         out SPH, r21
  481         out SREG, r0
  482         push r18                ; old SP, popped again in the epilog
  483         push r19
  484         push r24
  485         push r25 /* param1 will be needed later */
  486  ; now we fill the w array with message (think about endianess)
  487         adiw r26, 1 ; X++
  488         ldi r20, 16
  489 sha256_nextBlock_wcpyloop:
  490         ld r23, Z+              ; message bytes come most significant first
  491         ld r22, Z+
  492         ld r19, Z+
  493         ld r18, Z+
  494         st X+, r18              ; w[] is stored least significant byte first
  495         st X+, r19
  496         st X+, r22
  497         st X+, r23
  498         dec r20
  499         brne sha256_nextBlock_wcpyloop
  500 /*      for (i=16; i<64; ++i){
  501                 w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16];
  502         } */
  503         /* r25,r24,r23,r22 (r21,r20) are function values
  504            r19,r18,r17,r16 are the accumulator
  505            r15,r14,r13,r12 are backup1
  506            r11,r10,r9 ,r8  are xor accu
  507            r1 is round counter                                                          */
  508
  509         ldi r20, 64-16
  510         mov LoopC, r20
  511 sha256_nextBlock_wcalcloop:
  512         movw r30, r26 ; cp X to Z
  513         sbiw r30, 63
  514         sbiw r30, 1             ; substract 64 = 16*4
  515         ld Accu1, Z+
  516         ld Accu2, Z+
  517         ld Accu3, Z+
  518         ld Accu4, Z+ /* w[i] = w[i-16] */
  519         ld Bck1, Z+
  520         ld Bck2, Z+
  521         ld Bck3, Z+
  522         ld Bck4, Z+ /* backup = w[i-15] */
  523         /* now sigma 0 */
  524         mov Func1, Bck2
  525         mov Func2, Bck3
  526         mov Func3, Bck4
  527         mov Func4, Bck1  /* prerotated by 8 */
  528         ldi r20, 1
  529         rcall bitrotl
  530         movw XAccu1, Func1
  531         movw XAccu3, Func3       /* store ROTR(w[i-15],7) in xor accu */
  532         movw Func1, Bck3
  533         movw Func3, Bck1 /* prerotated by 16 */
  534         ldi r20, 2
  535         rcall bitrotr
  536         eor XAccu1, Func1  /* xor ROTR(w[i-15], 18)*/
  537         eor XAccu2, Func2
  538         eor XAccu3, Func3
  539         eor XAccu4, Func4
  540         ldi Func2, 3             /* now shr3 */ /*we can destroy backup now*/
  541 sigma0_shr:
  542         lsr Bck4
  543         ror Bck3
  544         ror Bck2
  545         ror Bck1
  546         dec Func2
  547         brne sigma0_shr
  548         eor XAccu1, Bck1
  549         eor XAccu2, Bck2
  550         eor XAccu3, Bck3
  551         eor XAccu4, Bck4        /* xor SHR(w[i-15], 3)*/ /* xor accu == sigma0(w[i-15]) */
  552         add Accu1, XAccu1
  553         adc Accu2, XAccu2
  554         adc Accu3, XAccu3
  555         adc Accu4, XAccu4 /* finished with sigma0 */
  556         ldd Func1, Z+7*4  /* now accu += w[i-7] */
  557         ldd Func2, Z+7*4+1
  558         ldd Func3, Z+7*4+2
  559         ldd Func4, Z+7*4+3
  560         add Accu1, Func1
  561         adc Accu2, Func2
  562         adc Accu3, Func3
  563         adc Accu4, Func4
  564         ldd Bck1, Z+12*4 /* now backup = w[i-2]*/
  565         ldd Bck2, Z+12*4+1
  566         ldd Bck3, Z+12*4+2
  567         ldd Bck4, Z+12*4+3
  568         /* now sigma 1 */
  569         movw Func1, Bck3
  570         movw Func3, Bck1 /* prerotated by 16 */
  571         ldi r20, 1
  572         rcall bitrotr
  573         movw XAccu3, Func3
  574         movw XAccu1, Func1       /* store ROTR(w[i-2], 17) in xor accu */
  575 ;       movw Func1, Bck3
  576 ;       movw Func3, Bck1 /* prerotated by 16 */
  577         ldi r20, 2              ; Func still holds ROTR(w[i-2], 17): two more bits give 19
  578         rcall bitrotr
  579         eor XAccu1, Func1  /* xor ROTR(w[i-2], 19)*/
  580         eor XAccu2, Func2
  581         eor XAccu3, Func3
  582         eor XAccu4, Func4
  583         ldi Func2, 2     /* now shr10 (dirty trick, skipping a byte) */ /*we can destroy backup now*/
  584 sigma1_shr:
  585         lsr Bck4
  586         ror Bck3
  587         ror Bck2
  588         dec Func2
  589         brne sigma1_shr
  590         eor XAccu1, Bck2
  591         eor XAccu2, Bck3
  592         eor XAccu3, Bck4  /* xor SHR(w[i-2], 10)*/ /* xor accu == sigma1(w[i-2]) */
  593         add Accu1, XAccu1
  594         adc Accu2, XAccu2
  595         adc Accu3, XAccu3
  596         adc Accu4, XAccu4 /* finished with sigma1 */
  597         /* now store the new w[i] */
  598         st X+, Accu1
  599         st X+, Accu2
  600         st X+, Accu3
  601         st X+, Accu4
  602         dec LoopC
  603         breq 3f  ; skip if zero
  604         rjmp sha256_nextBlock_wcalcloop
  605 3:
  606         /* we are finished with w array X points one byte post w */
  607 /* init a array */
  608         pop r31                 ; fetch param1 (context pointer) into Z ...
  609         pop r30
  610         push r30                ; ... and keep it on the stack for update_state
  611         push r31
  612         ldi r25, 8*4 /* 8 32-bit values to copy from ctx to a array */
  613 init_a_array:
  614         ld r1, Z+
  615         st X+, r1
  616         dec r25
  617         brne init_a_array
  618
  619 /* now the real fun begins */
  620 /* for (i=0; i<64; ++i){
  621                         t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i];
  622                         t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]);
  623                         memmove(&(a[1]), &(a[0]), 7*4);         // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0];
  624                         a[4] += t1;
  625                         a[0] = t1 + t2;
  626                 } */
  627         /* Y points to a[0], Z (because lpm wants it) points to k[i], X points to w[i] */
  628         sbiw r26, 8*4  /* X still points at a[7]+1*/
  629         movw r28, r26
  630         ldi r30, lo8(sha256_kv)
  631         ldi r31, hi8(sha256_kv)
  632         dec r27  /* X - (64*4 == 256) */
  633         ldi r25, 64
  634         mov LoopC, r25
  635 sha256_main_loop:
  636         /* now calculate t1 */
  637          /*CH(x,y,z) = (x&y)^((~x)&z)*/
  638         ldd T1, Y+5*4
  639         ldd T2, Y+5*4+1
  640         ldd T3, Y+5*4+2
  641         ldd T4, Y+5*4+3 /* y in T */
  642         ldd Func1, Y+4*4
  643         ldd Func2, Y+4*4+1
  644         ldd Func3, Y+4*4+2
  645         ldd Func4, Y+4*4+3  /* x in Func */
  646         ldd Bck1, Y+6*4
  647         ldd Bck2, Y+6*4+1
  648         ldd Bck3, Y+6*4+2
  649         ldd Bck4, Y+6*4+3 /* z in Bck */
  650         and T1, Func1
  651         and T2, Func2
  652         and T3, Func3
  653         and T4, Func4
  654         com Func1
  655         com Func2
  656         com Func3
  657         com Func4
  658         and Bck1, Func1
  659         and Bck2, Func2
  660         and Bck3, Func3
  661         and Bck4, Func4
  662         eor T1, Bck1
  663         eor T2, Bck2
  664         eor T3, Bck3
  665         eor T4, Bck4 /* done, CH(x,y,z) is in T */
  666         /* now SIGMA1(a[4]) */
  667         ldd Bck4, Y+4*4         /* think about using it from Func reg above*/
  668         ldd Bck1, Y+4*4+1
  669         ldd Bck2, Y+4*4+2
  670         ldd Bck3, Y+4*4+3 /* load prerotate by 8-bit */
  671         movw Func1, Bck1
  672         movw Func3, Bck3
  673         ldi r20, 2
  674         rcall bitrotl           /* rotr(x,6) */
  675         movw XAccu1, Func1
  676         movw XAccu3, Func3
  677         movw Func1, Bck1
  678         movw Func3, Bck3
  679         ldi r20, 3
  680         rcall bitrotr   /* rotr(x,11) */
  681         eor XAccu1, Func1
  682         eor XAccu2, Func2
  683         eor XAccu3, Func3
  684         eor XAccu4, Func4
  685         movw Func1, Bck3 /* this prerotates further 16 bits*/
  686         movw Func3, Bck1 /* so we have now prerotated by 24 bits*/
  687         ldi r20, 1
  688         rcall bitrotr   /* rotr(x,25) */
  689         eor XAccu1, Func1
  690         eor XAccu2, Func2
  691         eor XAccu3, Func3
  692         eor XAccu4, Func4 /* finished with SIGMA1, add it to T */
  693         add T1, XAccu1
  694         adc T2, XAccu2
  695         adc T3, XAccu3
  696         adc T4, XAccu4
  697         /* now we have to add a[7], w[i] and k[i] */
  698         ldd XAccu1, Y+4*7
  699         ldd XAccu2, Y+4*7+1
  700         ldd XAccu3, Y+4*7+2
  701         ldd XAccu4, Y+4*7+3
  702         add T1, XAccu1
  703         adc T2, XAccu2
  704         adc T3, XAccu3
  705         adc T4, XAccu4 /* add a[7] */
  706         ld XAccu1, X+
  707         ld XAccu2, X+
  708         ld XAccu3, X+
  709         ld XAccu4, X+
  710         add T1, XAccu1
  711         adc T2, XAccu2
  712         adc T3, XAccu3
  713         adc T4, XAccu4 /* add w[i] */
  714         lpm XAccu1, Z+
  715         lpm XAccu2, Z+
  716         lpm XAccu3, Z+
  717         lpm XAccu4, Z+
  718         add T1, XAccu1
  719         adc T2, XAccu2
  720         adc T3, XAccu3
  721         adc T4, XAccu4 /* add k[i] */ /* finished with t1 */
  722         /*now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */ /*i did to much x86 asm, i always see 4 32bit regs*/
  723                 /* starting with MAJ(x,y,z) */
  724         ldd Func1, Y+4*0+0
  725         ldd Func2, Y+4*0+1
  726         ldd Func3, Y+4*0+2
  727         ldd Func4, Y+4*0+3 /* load x=a[0] */
  728         ldd XAccu1, Y+4*1+0
  729         ldd XAccu2, Y+4*1+1
  730         ldd XAccu3, Y+4*1+2
  731         ldd XAccu4, Y+4*1+3 /* load y=a[1] */
  732         and XAccu1, Func1
  733         and XAccu2, Func2
  734         and XAccu3, Func3
  735         and XAccu4, Func4       /* XAccu == (x & y) */
  736         ldd Bck1, Y+4*2+0
  737         ldd Bck2, Y+4*2+1
  738         ldd Bck3, Y+4*2+2
  739         ldd Bck4, Y+4*2+3 /* load z=a[2] */
  740         and Func1, Bck1
  741         and Func2, Bck2
  742         and Func3, Bck3
  743         and Func4, Bck4
  744         eor XAccu1, Func1
  745         eor XAccu2, Func2
  746         eor XAccu3, Func3
  747         eor XAccu4, Func4       /* XAccu == (x & y) ^ (x & z) */
  748         ldd Func1, Y+4*1+0
  749         ldd Func2, Y+4*1+1
  750         ldd Func3, Y+4*1+2
  751         ldd Func4, Y+4*1+3 /* load y=a[1] */
  752         and Func1, Bck1
  753         and Func2, Bck2
  754         and Func3, Bck3
  755         and Func4, Bck4
  756         eor XAccu1, Func1
  757         eor XAccu2, Func2
  758         eor XAccu3, Func3
  759         eor XAccu4, Func4       /* XAccu == Maj(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */
  760         /* SIGMA0(a[0]) */
  761         ldd Bck1, Y+4*0+0 /* we should combine this with above */
  762         ldd Bck2, Y+4*0+1
  763         ldd Bck3, Y+4*0+2
  764         ldd Bck4, Y+4*0+3
  765         movw Func1, Bck1
  766         movw Func3, Bck3
  767         ldi r20, 2
  768         rcall bitrotr
  769         movw Accu1, Func1
  770         movw Accu3, Func3 /* Accu = rotr(a[0], 2) */
  771         movw Func1, Bck3
  772         movw Func3, Bck1 /* prerotate by 16 bits */
  773         ldi r20, 3
  774         rcall bitrotl
  775         eor Accu1, Func1
  776         eor Accu2, Func2
  777         eor Accu3, Func3
  778         eor Accu4, Func4 /* Accu ^= rotr(a[0], 13) */
  779         mov Func1, Bck4
  780         mov Func2, Bck1
  781         mov Func3, Bck2
  782         mov Func4, Bck3  /* prerotate by 24 bits */
  783         ldi r20, 2
  784         rcall bitrotl
  785         eor Accu1, Func1
  786         eor Accu2, Func2
  787         eor Accu3, Func3
  788         eor Accu4, Func4 /* Accu ^= rotr(a[0], 22) */
  789         add Accu1, XAccu1 /* add previous result (MAJ)*/
  790         adc Accu2, XAccu2
  791         adc Accu3, XAccu3
  792         adc Accu4, XAccu4
  793         /* now we are finished with the computing stuff (t1 in T, t2 in Accu)*/
  794         /* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */
  795
  796         ldi r21, 7*4
  797         adiw r28, 7*4
  798 a_shift_loop:
  799         ld  r25, -Y /* warning: this is PREdecrement */
  800         std Y+4, r25
  801         dec r21
  802         brne a_shift_loop
  803
  804         ldd Bck1, Y+4*4+0       ; Y is back at a[0]; a[4] += t1
  805         ldd Bck2, Y+4*4+1
  806         ldd Bck3, Y+4*4+2
  807         ldd Bck4, Y+4*4+3
  808         add Bck1, T1
  809         adc Bck2, T2
  810         adc Bck3, T3
  811         adc Bck4, T4
  812         std Y+4*4+0, Bck1
  813         std Y+4*4+1, Bck2
  814         std Y+4*4+2, Bck3
  815         std Y+4*4+3, Bck4
  816         add Accu1, T1           ; a[0] = t1 + t2
  817         adc Accu2, T2
  818         adc Accu3, T3
  819         adc Accu4, T4
  820         std Y+4*0+0, Accu1
  821         std Y+4*0+1, Accu2
  822         std Y+4*0+2, Accu3
  823         std Y+4*0+3, Accu4 /* a array updated */
  824
  825
  826         dec LoopC
  827         breq update_state
  828         rjmp sha256_main_loop ;brne sha256_main_loop
  829 update_state:
  830         /* update state */
  831         /* pointers to state should still exist on the stack ;-) */
  832         pop r31
  833         pop r30
  834         ldi r21, 8
  835 update_state_loop:
  836         ldd Accu1, Z+0
  837         ldd Accu2, Z+1
  838         ldd Accu3, Z+2
  839         ldd Accu4, Z+3
  840         ld Func1, Y+
  841         ld Func2, Y+
  842         ld Func3, Y+
  843         ld Func4, Y+
  844         add Accu1, Func1        ; h[i] += a[i]
  845         adc Accu2, Func2
  846         adc Accu3, Func3
  847         adc Accu4, Func4
  848         st Z+, Accu1
  849         st Z+, Accu2
  850         st Z+, Accu3
  851         st Z+, Accu4
  852         dec r21
  853         brne update_state_loop
  854         /* now we just have to update the length */
  855         adiw r30, 1 /* since we add 512, we can simply skip the LSB */
  856         ldi r21, 2
  857         ldi r22, 6
  858         ld r20, Z
  859         add r20, r21
  860         st Z+, r20
  861         clr r21                 ; clr leaves the carry flag untouched
  862 sha256_nextBlock_fix_length:
  863         brcc sha256_nextBlock_epilog
  864         ld r20, Z
  865         adc r20, r21            ; add the carry to the next length byte
  866         st Z+, r20
  867         dec r22
  868         brne sha256_nextBlock_fix_length
  869
  870 ; EPILOG
  871 sha256_nextBlock_epilog:
  872 /* now we should clean up the stack */
  873
  874         pop r21                 ; old SP, pushed as r18/r19 in the prolog
  875         pop r20
  876         in r0, SREG
  877         cli ; we want to be uninterrupted while updating SP
  878         out SPL, r20
  879         out SPH, r21
  880         out SREG, r0
  881
  882         clr r1
  883         pop r29
  884         pop r28
  885         pop r17
  886         pop r16
  887         pop r15
  888         pop r14
  889         pop r13
  890         pop r12
  891         pop r11
  892         pop r10
  893         pop r9
  894         pop r8
  895         pop r7
  896         pop r6
  897         pop r5
  898         pop r4
  899         ret
 900
  901 sha256_kv: ; round-key-vector stored in ProgMem
      ; 64 constants of 32 bits each, every constant written as two 16-bit
      ; words with the low half first; sha256_nextBlock reads them byte by
      ; byte with lpm, four bytes per round
  902 .word   0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c
  903 .word   0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b
  904 .word   0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9
  905 .word   0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429
  906 .word   0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272
  907 .word   0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a
  908 .word   0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e
  909 .word   0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671
 910
 911
 912 ;###########################################################
 913
  914 .global sha256_init
  915 ;uint32_t sha256_init_vector[]={
  916 ;       0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
  917 ;       0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };
  918 ;
  919 ;void sha256_init(sha256_ctx_t *state){
  920 ;       state->length=0;
  921 ;       memcpy(state->h, sha256_init_vector, 8*4);
  922 ;}
  923 ; param1: (r25,r24) 16-bit pointer to sha256_ctx_t struct in ram
  924 ; modifies: X(r26,r27), Z(r30,r31), r22, r23
      ; copies 40 bytes (8 state words and the zeroed 64-bit length) from
      ; sha256_init_vector in program memory into the context
  925 sha256_init:
  926         movw r26, r24 ; (24,25) --> (26,27) load X with param1
  927         ldi r30, lo8((sha256_init_vector))
  928         ldi r31, hi8((sha256_init_vector))
  929         ldi r22, 32+8           ; bytes to copy: state plus length
  930 sha256_init_vloop:
  931         lpm r23, Z+
  932         st X+, r23
  933         dec r22
  934         brne sha256_init_vloop
  935         ret
 936
  937 sha256_init_vector:
      ; initial state words, each written as low 16-bit half, then high half
  938 .word 0xE667, 0x6A09
  939 .word 0xAE85, 0xBB67
  940 .word 0xF372, 0x3C6E
  941 .word 0xF53A, 0xA54F
  942 .word 0x527F, 0x510E
  943 .word 0x688C, 0x9B05
  944 .word 0xD9AB, 0x1F83
  945 .word 0xCD19, 0x5BE0
  946 .word 0x0000, 0x0000     ; initial length (64 bits) = 0
  947 .word 0x0000, 0x0000
 948
 949 ;###########################################################
 950
  951 .global rotl32
  952 ; === ROTL32 ===
  953 ; function that rotates a 32 bit word to the left
  954 ;  param1: the 32-bit word to rotate
  955 ;       given in r25,r24,r23,r22 (r25 is most significant)
  956 ;  param2: an 8-bit value telling how often to rotate
  957 ;       given in r20
  958 ;  result: the rotated word in r25,r24,r23,r22
  959 ; modifies: r20 (counted down to zero), r21
      ; bitrotl is a second entry point used by sha256_nextBlock; it does the
      ; bitwise part only and is reached from rotl32 with a count below 8
  959 rotl32:
  960         cpi r20, 8
  961         brlo bitrotl
  962         mov r21, r25            ; rotate left by one whole byte
  963         mov r25, r24
  964         mov r24, r23
  965         mov r23, r22
  966         mov r22, r21
  967         subi r20, 8
  968         rjmp rotl32
  969 bitrotl:
  970         clr r21                 ; r21 collects the bits shifted out at the top
  971         clc
  972 bitrotl_loop:
  973         tst r20
  974         breq fixrotl
  975         rol r22
  976         rol r23
  977         rol r24
  978         rol r25
  979         rol r21
  980         dec r20
  981         rjmp bitrotl_loop
  982 fixrotl:
  983         or r22, r21             ; put the collected bits back in at the bottom
  984         ret
 985
 986
 987 ;###########################################################
 988
  989 .global rotr32
  990 ; === ROTR32 ===
  991 ; function that rotates a 32 bit word to the right
  992 ;  param1: the 32-bit word to rotate
  993 ;       given in r25,r24,r23,r22 (r25 is most significant)
  994 ;  param2: an 8-bit value telling how often to rotate
  995 ;       given in r20
  996 ;  result: the rotated word in r25,r24,r23,r22
      ; modifies: r20 (counted down to zero), r21
      ; bitrotr is a second entry point used by sha256_nextBlock; it does the
      ; bitwise part only and is reached from rotr32 with a count below 8
  997 rotr32:
  998         cpi r20, 8
  999         brlo bitrotr
 1000         mov r21, r22            ; rotate right by one whole byte
 1001         mov r22, r23
 1002         mov r23, r24
 1003         mov r24, r25
 1004         mov r25, r21
 1005         subi r20, 8
 1006         rjmp rotr32
 1007 bitrotr:
 1008         clr r21                 ; r21 collects the bits shifted out at the bottom
 1009         clc
 1010 bitrotr_loop:
 1011         tst r20
 1012         breq fixrotr
 1013         ror r25
 1014         ror r24
 1015         ror r23
 1016         ror r22
 1017         ror r21
 1018         dec r20
 1019         rjmp bitrotr_loop
 1020 fixrotr:
 1021         or r25, r21             ; put the collected bits back in at the top
 1022         ret
1023
1024
1025 ;###########################################################
1026
1027 .global change_endian32
1028 ; === change_endian32 ===
1029 ; function that changes the endianess of a 32-bit word
1030 ;  param1: the 32-bit word
1031 ;       given in r25,r24,r23,22 (r25 is most significant)
1032 ;  modifys: r21, r22
1033 change_endian32:
1034         movw r20,  r22 ; (r22,r23) --> (r20,r21)
1035         mov r22, r25
1036         mov r23, r24
1037         mov r24, r21
1038         mov r25, r20
1039         ret
1040