shabea/sha256-asm.S

   1 /* sha256-asm.S */
   2 /*
   3     This file is part of the AVR-Crypto-Lib.
   4     Copyright (C) 2008  Daniel Otte (daniel.otte@rub.de)
   5
   6     This program is free software: you can redistribute it and/or modify
   7     it under the terms of the GNU General Public License as published by
   8     the Free Software Foundation, either version 3 of the License, or
   9     (at your option) any later version.
  10
  11     This program is distributed in the hope that it will be useful,
  12     but WITHOUT ANY WARRANTY; without even the implied warranty of
  13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14     GNU General Public License for more details.
  15
  16     You should have received a copy of the GNU General Public License
  17     along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 */
  19 /*
  20  * Author:      Daniel Otte
  21  *
  22  * License: GPLv3 or later
  23 */
  24 ; sha-256 implementation in assembler
  25 SHA256_BLOCK_BITS = 512
  26 SHA256_HASH_BITS = 256
  27
  28 .macro precall
  29         /* push r18 - r27, r30 - r31*/
  30         push r0
  31         push r1
  32         push r18
  33         push r19
  34         push r20
  35         push r21
  36         push r22
  37         push r23
  38         push r24
  39         push r25
  40         push r26
  41         push r27
  42         push r30
  43         push r31
  44         clr r1
  45 .endm
  46
  47 .macro postcall
  48         pop r31
  49         pop r30
  50         pop r27
  51         pop r26
  52         pop r25
  53         pop r24
  54         pop r23
  55         pop r22
  56         pop r21
  57         pop r20
  58         pop r19
  59         pop r18
  60         pop r1
  61         pop r0
  62 .endm
  63
  64
  65 .macro hexdump length
  66         push r27
  67         push r26
  68         ldi r25, '\r'
  69         mov r24, r25
  70         call uart_putc
  71         ldi r25, '\n'
  72         mov r24, r25
  73         call uart_putc
  74         pop r26
  75         pop r27
  76         movw r24, r26
  77 .if \length > 16
  78         ldi r22, lo8(16)
  79         ldi r23, hi8(16)
  80         push r27
  81         push r26
  82         call uart_hexdump
  83         pop r26
  84         pop r27
  85         adiw r26, 16
  86         hexdump \length-16
  87 .else
  88         ldi r22, lo8(\length)
  89         ldi r23, hi8(\length)
  90         call uart_hexdump
  91 .endif
  92 .endm
  93
  94 /* X points to Block */
  95 .macro dbg_hexdump length
  96         precall
  97         hexdump \length
  98         postcall
  99 .endm
 100
 101 .section .text
 102
 103 SPL = 0x3D
 104 SPH = 0x3E
 105 SREG = 0x3F
 106
 107
 108 ;
 109 ;sha256_ctx_t is:
 110 ;
 111 ; [h0][h1][h2][h3][h4][h5][h6][h7][length]
 112 ; hn is 32 bit large, length is 64 bit large
 113
 114 ;###########################################################
 115
 116 .global sha256_ctx2hash
 117 ; === sha256_ctx2hash ===
 118 ; this function converts a state into a normal hash (bytestring)
 119 ;  param1: the 16-bit destination pointer
 120 ;       given in r25,r24 (r25 is most significant)
 121 ;  param2: the 16-bit pointer to sha256_ctx structure
 122 ;       given in r23,r22
 123 sha256_ctx2hash:
 124         movw r26, r22
 125         movw r30, r24
 126         ldi r21, 8
 127         sbiw r26, 4
 128 1:
 129         ldi r20, 4
 130         adiw r26, 8
 131 2:
 132                 ld r0, -X
 133                 st Z+, r0
 134         dec r20
 135         brne 2b
 136
 137         dec r21
 138         brne 1b
 139
 140         ret
 141
 142 ;###########################################################
 143
 144 .global sha256
 145 ; === sha256 ===
 146 ; this function calculates SHA-256 hashes from messages in RAM
 147 ;  param1: the 16-bit hash destination pointer
 148 ;       given in r25,r24 (r25 is most significant)
 149 ;  param2: the 16-bit pointer to message
 150 ;       given in r23,r22
 151 ;  param3: 32-bit length value (length of message in bits)
 152 ;   given in r21,r20,r19,r18
 153 sha256:
 154 sha256_prolog:
 155         push r8
 156         push r9
 157         push r10
 158         push r11
 159         push r12
 160         push r13
 161         push r16
 162         push r17
 163         in r16, SPL
 164         in r17, SPH
 165         subi r16, 8*4+8
 166         sbci r17, 0
 167         in r0, SREG
 168         cli
 169         out SPL, r16
 170         out SPH, r17
 171         out SREG, r0
 172
 173         push r25
 174         push r24
 175         inc r16
 176         adc r17, r1
 177
 178         movw r8, r18            /* backup of length*/
 179         movw r10, r20
 180
 181         movw r12, r22   /* backup pf msg-ptr */
 182
 183         movw r24, r16
 184         rcall sha256_init
 185         /* if length >= 512 */
 186 1:
 187         tst r11
 188         brne 4f
 189         tst r10
 190         brne 4f
 191         mov r19, r9
 192         cpi r19, 0x02
 193         brlo 4f
 194
 195         movw r24, r16
 196         movw r22, r12
 197         rcall sha256_nextBlock
 198         ldi r19, 0x64
 199         add r22, r19
 200         adc r23, r1
 201         /* length -= 512 */
 202         ldi r19, 0x02
 203         sub r9, r19
 204         sbc r10, r1
 205         sbc r11, r1
 206         rjmp 1b
 207
 208 4:
 209         movw r24, r16
 210         movw r22, r12
 211         movw r20, r8
 212         rcall sha256_lastBlock
 213
 214         pop r24
 215         pop r25
 216         movw r22, r16
 217         rcall sha256_ctx2hash
 218
 219 sha256_epilog:
 220         in r30, SPL
 221         in r31, SPH
 222         adiw r30, 8*4+8
 223         in r0, SREG
 224         cli
 225         out SPL, r30
 226         out SPH, r31
 227         out SREG, r0
 228         pop r17
 229         pop r16
 230         pop r13
 231         pop r12
 232         pop r11
 233         pop r10
 234         pop r9
 235         pop r8
 236         ret
 237
 238 ;###########################################################
 239
 240
 241 ; block MUST NOT be larger than 64 bytes
 242
 243 .global sha256_lastBlock
 244 ; === sha256_lastBlock ===
 245 ; this function does padding & Co. for calculating SHA-256 hashes
 246 ;  param1: the 16-bit pointer to sha256_ctx structure
 247 ;       given in r25,r24 (r25 is most significant)
 248 ;  param2: an 16-bit pointer to 64 byte block to hash
 249 ;       given in r23,r22
 250 ;  param3: an 16-bit integer specifing length of block in bits
 251 ;       given in r21,r20
 252 sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1)
 253
 254
 255 sha256_lastBlock:
 256         cpi r21, 0x02
 257         brlo sha256_lastBlock_prolog
 258         push r25
 259         push r24
 260         push r23
 261         push r22
 262         push r21
 263         push r20
 264         rcall sha256_nextBlock
 265         pop r20
 266         pop r21
 267         pop r22
 268         pop r23
 269         pop r24
 270         pop r25
 271         subi r21, 0x02
 272         subi r23, -2
 273         rjmp sha256_lastBlock
 274 sha256_lastBlock_prolog:
 275         /* allocate space on stack */
 276         in r30, SPL
 277         in r31, SPH
 278         in r1, SREG
 279         subi r30, lo8(64)
 280         sbci r31, hi8(64)
 281         cli
 282         out SPL, r30
 283         out SPH, r31
 284         out SREG,r1
 285
 286         adiw r30, 1 /* SP points to next free byte on stack */
 287         mov r18, r20 /* r20 = LSB(length) */
 288         lsr r18
 289         lsr r18
 290         lsr r18
 291         bst r21, 0      /* may be we should explain this ... */
 292         bld r18, 5  /* now: r18 == length/8 (aka. length in bytes) */
 293
 294
 295         movw r26, r22 /* X points to begin of msg */
 296         tst r18
 297         breq sha256_lastBlock_post_copy
 298         mov r1, r18
 299 sha256_lastBlock_copy_loop:
 300         ld r0, X+
 301         st Z+, r0
 302         dec r1
 303         brne sha256_lastBlock_copy_loop
 304 sha256_lastBlock_post_copy:
 305 sha256_lastBlock_insert_stuffing_bit:
 306         ldi r19, 0x80
 307         mov r0,r19
 308         ldi r19, 0x07
 309         and r19, r20 /* if we are in bitmode */
 310         breq 2f /* no bitmode */
 311 1:
 312         lsr r0
 313         dec r19
 314         brne 1b
 315         ld r19, X
 316 /* maybe we should do some ANDing here, just for safety */
 317         or r0, r19
 318 2:
 319         st Z+, r0
 320         inc r18
 321
 322 /* checking stuff here */
 323         cpi r18, 64-8+1
 324         brsh 0f
 325         rjmp sha256_lastBlock_insert_zeros
 326 0:
 327         /* oh shit, we landed here */
 328         /* first we have to fill it up with zeros */
 329         ldi r19, 64
 330         sub r19, r18
 331         breq 2f
 332 1:
 333         st Z+, r1
 334         dec r19
 335         brne 1b
 336 2:
 337         sbiw r30, 63
 338         sbiw r30,  1
 339         movw r22, r30
 340
 341         push r31
 342         push r30
 343         push r25
 344         push r24
 345         push r21
 346         push r20
 347         rcall sha256_nextBlock
 348         pop r20
 349         pop r21
 350         pop r24
 351         pop r25
 352         pop r30
 353         pop r31
 354
 355         /* now we should subtract 512 from length */
 356         movw r26, r24
 357         adiw r26, 4*8+1 /* we can skip the lowest byte */
 358         ld r19, X
 359         subi r19, hi8(512)
 360         st X+, r19
 361         ldi r18, 6
 362 1:
 363         ld r19, X
 364         sbci r19, 0
 365         st X+, r19
 366         dec r18
 367         brne 1b
 368
 369 ;       clr r18 /* not neccessary ;-) */
 370         /* reset Z pointer to begin of block */
 371
 372 sha256_lastBlock_insert_zeros:
 373         ldi r19, 64-8
 374         sub r19, r18
 375         breq sha256_lastBlock_insert_length
 376         clr r1
 377 1:
 378         st Z+, r1       /* r1 is still zero */
 379         dec r19
 380         brne 1b
 381
 382 ;       rjmp sha256_lastBlock_epilog
 383 sha256_lastBlock_insert_length:
 384         movw r26, r24   /* X points to state */
 385         adiw r26, 8*4   /* X points to (state.length) */
 386         adiw r30, 8             /* Z points one after the last byte of block */
 387         ld r0, X+
 388         add r0, r20
 389         st -Z, r0
 390         ld r0, X+
 391         adc r0, r21
 392         st -Z, r0
 393         ldi r19, 6
 394 1:
 395         ld r0, X+
 396         adc r0, r1
 397         st -Z, r0
 398         dec r19
 399         brne 1b
 400
 401         sbiw r30, 64-8
 402         movw r22, r30
 403         rcall sha256_nextBlock
 404
 405 sha256_lastBlock_epilog:
 406         in r30, SPL
 407         in r31, SPH
 408         in r1, SREG
 409         adiw r30, 63 ; lo8(64)
 410         adiw r30,  1  ; hi8(64)
 411         cli
 412         out SPL, r30
 413         out SPH, r31
 414         out SREG,r1
 415         clr r1
 416         clr r0
 417         ret
 418
 419 /**/
 420 ;###########################################################
 421
 422 .global sha256_nextBlock
 423 ; === sha256_nextBlock ===
 424 ; this is the core function for calculating SHA-256 hashes
 425 ;  param1: the 16-bit pointer to sha256_ctx structure
 426 ;       given in r25,r24 (r25 is most significant)
 427 ;  param2: an 16-bit pointer to 64 byte block to hash
 428 ;       given in r23,r22
 429 sha256_nextBlock_localSpace = (64+8)*4 ; 64 32-bit values for w array and 8 32-bit values for a array (total 288 byte)
 430
 431 Bck1 = 12
 432 Bck2 = 13
 433 Bck3 = 14
 434 Bck4 = 15
 435 Func1 = 22
 436 Func2 = 23
 437 Func3 = 24
 438 Func4 = 25
 439 Accu1 = 16
 440 Accu2 = 17
 441 Accu3 = 18
 442 Accu4 = 19
 443 XAccu1 = 8
 444 XAccu2 = 9
 445 XAccu3 = 10
 446 XAccu4 = 11
 447 T1      = 4
 448 T2      = 5
 449 T3      = 6
 450 T4      = 7
 451 LoopC = 1
 452 /* byteorder: high number <--> high significance */
 453 sha256_nextBlock:
 454  ; initial, let's make some space ready for local vars
 455         push r4 /* replace push & pop by mem ops? */
 456         push r5
 457         push r6
 458         push r7
 459         push r8
 460         push r9
 461         push r10
 462         push r11
 463         push r12
 464         push r13
 465         push r14
 466         push r15
 467         push r16
 468         push r17
 469         push r28
 470         push r29
 471         in r20, SPL
 472         in r21, SPH
 473         movw r18, r20                   ;backup SP
 474 ;       movw r26, r20                   ; X points to free space on stack
 475         movw r30, r22                   ; Z points to message
 476         subi r20, lo8(sha256_nextBlock_localSpace) ;sbiw can do only up to 63
 477         sbci r21, hi8(sha256_nextBlock_localSpace)
 478         movw r26, r20                   ; X points to free space on stack
 479         in r0, SREG
 480         cli ; we want to be uninterrupted while updating SP
 481         out SPL, r20
 482         out SPH, r21
 483         out SREG, r0
 484         push r18
 485         push r19
 486         push r24
 487         push r25 /* param1 will be needed later */
 488  ; now we fill the w array with message (think about endianess)
 489         adiw r26, 1 ; X++
 490         ldi r20, 16
 491 sha256_nextBlock_wcpyloop:
 492         ld r23, Z+
 493         ld r22, Z+
 494         ld r19, Z+
 495         ld r18, Z+
 496         st X+, r18
 497         st X+, r19
 498         st X+, r22
 499         st X+, r23
 500         dec r20
 501         brne sha256_nextBlock_wcpyloop
 502 /*      for (i=16; i<64; ++i){
 503                 w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16];
 504         } */
 505         /* r25,r24,r23,r24 (r21,r20) are function values
 506            r19,r18,r17,r16 are the accumulator
 507            r15,r14,r13,rBck1 are backup1
 508            r11,r10,r9 ,r8  are xor accu
 509            r1 is round counter                                                          */
 510
 511         ldi r20, 64-16
 512         mov LoopC, r20
 513 sha256_nextBlock_wcalcloop:
 514         movw r30, r26 ; cp X to Z
 515         sbiw r30, 63
 516         sbiw r30, 1             ; substract 64 = 16*4
 517         ld Accu1, Z+
 518         ld Accu2, Z+
 519         ld Accu3, Z+
 520         ld Accu4, Z+ /* w[i] = w[i-16] */
 521         ld Bck1, Z+
 522         ld Bck2, Z+
 523         ld Bck3, Z+
 524         ld Bck4, Z+ /* backup = w[i-15] */
 525         /* now sigma 0 */
 526         mov Func1, Bck2
 527         mov Func2, Bck3
 528         mov Func3, Bck4
 529         mov Func4, Bck1  /* prerotated by 8 */
 530         ldi r20, 1
 531         rcall bitrotl
 532         movw XAccu1, Func1
 533         movw XAccu3, Func3       /* store ROTR(w[i-15],7) in xor accu */
 534         movw Func1, Bck3
 535         movw Func3, Bck1 /* prerotated by 16 */
 536         ldi r20, 2
 537         rcall bitrotr
 538         eor XAccu1, Func1  /* xor ROTR(w[i-15], 18)*/
 539         eor XAccu2, Func2
 540         eor XAccu3, Func3
 541         eor XAccu4, Func4
 542         ldi Func2, 3             /* now shr3 */ /*we can destroy backup now*/
 543 sigma0_shr:
 544         lsr Bck4
 545         ror Bck3
 546         ror Bck2
 547         ror Bck1
 548         dec Func2
 549         brne sigma0_shr
 550         eor XAccu1, Bck1
 551         eor XAccu2, Bck2
 552         eor XAccu3, Bck3
 553         eor XAccu4, Bck4        /* xor SHR(w[i-15], 3)*/ /* xor accu == sigma1(w[i-15]) */
 554         add Accu1, XAccu1
 555         adc Accu2, XAccu2
 556         adc Accu3, XAccu3
 557         adc Accu4, XAccu4 /* finished with sigma0 */
 558         ldd Func1, Z+7*4  /* now accu += w[i-7] */
 559         ldd Func2, Z+7*4+1
 560         ldd Func3, Z+7*4+2
 561         ldd Func4, Z+7*4+3
 562         add Accu1, Func1
 563         adc Accu2, Func2
 564         adc Accu3, Func3
 565         adc Accu4, Func4
 566         ldd Bck1, Z+12*4 /* now backup = w[i-2]*/
 567         ldd Bck2, Z+12*4+1
 568         ldd Bck3, Z+12*4+2
 569         ldd Bck4, Z+12*4+3
 570         /* now sigma 1 */
 571         movw Func1, Bck3
 572         movw Func3, Bck1 /* prerotated by 16 */
 573         ldi r20, 1
 574         rcall bitrotr
 575         movw XAccu3, Func3
 576         movw XAccu1, Func1       /* store in ROTR(w[i-2], 17) xor accu */
 577 ;       movw Func1, Bck3
 578 ;       movw Func3, Bck1 /* prerotated by 16 */
 579         ldi r20, 2
 580         rcall bitrotr
 581         eor XAccu1, Func1  /* xor ROTR(w[i-2], 19)*/
 582         eor XAccu2, Func2
 583         eor XAccu3, Func3
 584         eor XAccu4, Func4
 585         ldi Func2, 2     /* now shr10 (dirty trick, skipping a byte) */ /*we can destroy backup now*/
 586 sigma1_shr:
 587         lsr Bck4
 588         ror Bck3
 589         ror Bck2
 590         dec Func2
 591         brne sigma1_shr
 592         eor XAccu1, Bck2
 593         eor XAccu2, Bck3
 594         eor XAccu3, Bck4  /* xor SHR(w[i-2], 10)*/ /* xor accu == sigma1(w[i-15]) */
 595         add Accu1, XAccu1
 596         adc Accu2, XAccu2
 597         adc Accu3, XAccu3
 598         adc Accu4, XAccu4 /* finished with sigma0 */
 599         /* now let's store the shit */
 600         st X+, Accu1
 601         st X+, Accu2
 602         st X+, Accu3
 603         st X+, Accu4
 604         dec LoopC
 605         breq 3f  ; skip if zero
 606         rjmp sha256_nextBlock_wcalcloop
 607 3:
 608         /* we are finished with w array X points one byte post w */
 609 /* init a array */
 610         pop r31
 611         pop r30
 612         push r30
 613         push r31
 614         ldi r25, 8*4 /* 8 32-bit values to copy from ctx to a array */
 615 init_a_array:
 616         ld r1, Z+
 617         st X+, r1
 618         dec r25
 619         brne init_a_array
 620
 621 /* now the real fun begins */
 622 /* for (i=0; i<64; ++i){
 623                         t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i];
 624                         t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]);
 625                         memmove(&(a[1]), &(a[0]), 7*4);         // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0];
 626                         a[4] += t1;
 627                         a[0] = t1 + t2;
 628                 } */
 629         /* Y points to a[0], Z ('cause lpm wants it) points to k[i], X points to w[i] */
 630         sbiw r26, 8*4  /* X still points at a[7]+1*/
 631         movw r28, r26
 632         ldi r30, lo8(sha256_kv)
 633         ldi r31, hi8(sha256_kv)
 634         dec r27  /* X - (64*4 == 256) */
 635         ldi r25, 64
 636         mov LoopC, r25
 637 sha256_main_loop:
 638         /* now calculate t1 */
 639          /*CH(x,y,z) = (x&y)^((~x)&z)*/
 640         ldd T1, Y+5*4
 641         ldd T2, Y+5*4+1
 642         ldd T3, Y+5*4+2
 643         ldd T4, Y+5*4+3 /* y in T */
 644         ldd Func1, Y+4*4
 645         ldd Func2, Y+4*4+1
 646         ldd Func3, Y+4*4+2
 647         ldd Func4, Y+4*4+3  /* x in Func */
 648         ldd Bck1, Y+6*4
 649         ldd Bck2, Y+6*4+1
 650         ldd Bck3, Y+6*4+2
 651         ldd Bck4, Y+6*4+3 /* z in Bck */
 652         and T1, Func1
 653         and T2, Func2
 654         and T3, Func3
 655         and T4, Func4
 656         com Func1
 657         com Func2
 658         com Func3
 659         com Func4
 660         and Bck1, Func1
 661         and Bck2, Func2
 662         and Bck3, Func3
 663         and Bck4, Func4
 664         eor T1, Bck1
 665         eor T2, Bck2
 666         eor T3, Bck3
 667         eor T4, Bck4 /* done, CH(x,y,z) is in T */
 668         /* now SIGMA1(a[4]) */
 669         ldd Bck4, Y+4*4         /* think about using it from Func reg above*/
 670         ldd Bck1, Y+4*4+1
 671         ldd Bck2, Y+4*4+2
 672         ldd Bck3, Y+4*4+3 /* load prerotate by 8-bit */
 673         movw Func1, Bck1
 674         movw Func3, Bck3
 675         ldi r20, 2
 676         rcall bitrotl           /* rotr(x,6) */
 677         movw XAccu1, Func1
 678         movw XAccu3, Func3
 679         movw Func1, Bck1
 680         movw Func3, Bck3
 681         ldi r20, 3
 682         rcall bitrotr   /* rotr(x,11) */
 683         eor XAccu1, Func1
 684         eor XAccu2, Func2
 685         eor XAccu3, Func3
 686         eor XAccu4, Func4
 687         movw Func1, Bck3 /* this prerotates furteh 16 bits*/
 688         movw Func3, Bck1 /* so we have now prerotated by 24 bits*/
 689         ldi r20, 1
 690         rcall bitrotr   /* rotr(x,11) */
 691         eor XAccu1, Func1
 692         eor XAccu2, Func2
 693         eor XAccu3, Func3
 694         eor XAccu4, Func4 /* finished with SIGMA1, add it to T */
 695         add T1, XAccu1
 696         adc T2, XAccu2
 697         adc T3, XAccu3
 698         adc T4, XAccu4
 699         /* now we've to add a[7], w[i] and k[i] */
 700         ldd XAccu1, Y+4*7
 701         ldd XAccu2, Y+4*7+1
 702         ldd XAccu3, Y+4*7+2
 703         ldd XAccu4, Y+4*7+3
 704         add T1, XAccu1
 705         adc T2, XAccu2
 706         adc T3, XAccu3
 707         adc T4, XAccu4 /* add a[7] */
 708         ld XAccu1, X+
 709         ld XAccu2, X+
 710         ld XAccu3, X+
 711         ld XAccu4, X+
 712         add T1, XAccu1
 713         adc T2, XAccu2
 714         adc T3, XAccu3
 715         adc T4, XAccu4 /* add w[i] */
 716         lpm XAccu1, Z+
 717         lpm XAccu2, Z+
 718         lpm XAccu3, Z+
 719         lpm XAccu4, Z+
 720         add T1, XAccu1
 721         adc T2, XAccu2
 722         adc T3, XAccu3
 723         adc T4, XAccu4 /* add k[i] */ /* finished with t1 */
 724         /*now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */ /*i did to much x86 asm, i always see 4 32bit regs*/
 725                 /* starting with MAJ(x,y,z) */
 726         ldd Func1, Y+4*0+0
 727         ldd Func2, Y+4*0+1
 728         ldd Func3, Y+4*0+2
 729         ldd Func4, Y+4*0+3 /* load x=a[0] */
 730         ldd XAccu1, Y+4*1+0
 731         ldd XAccu2, Y+4*1+1
 732         ldd XAccu3, Y+4*1+2
 733         ldd XAccu4, Y+4*1+3 /* load y=a[1] */
 734         and XAccu1, Func1
 735         and XAccu2, Func2
 736         and XAccu3, Func3
 737         and XAccu4, Func4       /* XAccu == (x & y) */
 738         ldd Bck1, Y+4*2+0
 739         ldd Bck2, Y+4*2+1
 740         ldd Bck3, Y+4*2+2
 741         ldd Bck4, Y+4*2+3 /* load z=a[2] */
 742         and Func1, Bck1
 743         and Func2, Bck2
 744         and Func3, Bck3
 745         and Func4, Bck4
 746         eor XAccu1, Func1
 747         eor XAccu2, Func2
 748         eor XAccu3, Func3
 749         eor XAccu4, Func4       /* XAccu == (x & y) ^ (x & z) */
 750         ldd Func1, Y+4*1+0
 751         ldd Func2, Y+4*1+1
 752         ldd Func3, Y+4*1+2
 753         ldd Func4, Y+4*1+3 /* load y=a[1] */
 754         and Func1, Bck1
 755         and Func2, Bck2
 756         and Func3, Bck3
 757         and Func4, Bck4
 758         eor XAccu1, Func1
 759         eor XAccu2, Func2
 760         eor XAccu3, Func3
 761         eor XAccu4, Func4       /* XAccu == Maj(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */
 762         /* SIGMA0(a[0]) */
 763         ldd Bck1, Y+4*0+0 /* we should combine this with above */
 764         ldd Bck2, Y+4*0+1
 765         ldd Bck3, Y+4*0+2
 766         ldd Bck4, Y+4*0+3
 767         movw Func1, Bck1
 768         movw Func3, Bck3
 769         ldi r20, 2
 770         rcall bitrotr
 771         movw Accu1, Func1
 772         movw Accu3, Func3 /* Accu = shr(a[0], 2) */
 773         movw Func1, Bck3
 774         movw Func3, Bck1 /* prerotate by 16 bits */
 775         ldi r20, 3
 776         rcall bitrotl
 777         eor Accu1, Func1
 778         eor Accu2, Func2
 779         eor Accu3, Func3
 780         eor Accu4, Func4 /* Accu ^= shr(a[0], 13) */
 781         mov Func1, Bck4
 782         mov Func2, Bck1
 783         mov Func3, Bck2
 784         mov Func4, Bck3  /* prerotate by 24 bits */
 785         ldi r20, 2
 786         rcall bitrotl
 787         eor Accu1, Func1
 788         eor Accu2, Func2
 789         eor Accu3, Func3
 790         eor Accu4, Func4 /* Accu ^= shr(a[0], 22) */
 791         add Accu1, XAccu1 /* add previous result (MAJ)*/
 792         adc Accu2, XAccu2
 793         adc Accu3, XAccu3
 794         adc Accu4, XAccu4
 795         /* now we are finished with the computing stuff (t1 in T, t2 in Accu)*/
 796         /* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */
 797
 798         ldi r21, 7*4
 799         adiw r28, 7*4
 800 a_shift_loop:
 801         ld  r25, -Y /* warning: this is PREdecrement */
 802         std Y+4, r25
 803         dec r21
 804         brne a_shift_loop
 805
 806         ldd Bck1, Y+4*4+0
 807         ldd Bck2, Y+4*4+1
 808         ldd Bck3, Y+4*4+2
 809         ldd Bck4, Y+4*4+3
 810         add Bck1, T1
 811         adc Bck2, T2
 812         adc Bck3, T3
 813         adc Bck4, T4
 814         std Y+4*4+0, Bck1
 815         std Y+4*4+1, Bck2
 816         std Y+4*4+2, Bck3
 817         std Y+4*4+3, Bck4
 818         add Accu1, T1
 819         adc Accu2, T2
 820         adc Accu3, T3
 821         adc Accu4, T4
 822         std Y+4*0+0, Accu1
 823         std Y+4*0+1, Accu2
 824         std Y+4*0+2, Accu3
 825         std Y+4*0+3, Accu4 /* a array updated */
 826
 827
 828         dec LoopC
 829         breq update_state
 830         rjmp sha256_main_loop ;brne sha256_main_loop
 831 update_state:
 832         /* update state */
 833         /* pointers to state should still exist on the stack ;-) */
 834         pop r31
 835         pop r30
 836         ldi r21, 8
 837 update_state_loop:
 838         ldd Accu1, Z+0
 839         ldd Accu2, Z+1
 840         ldd Accu3, Z+2
 841         ldd Accu4, Z+3
 842         ld Func1, Y+
 843         ld Func2, Y+
 844         ld Func3, Y+
 845         ld Func4, Y+
 846         add Accu1, Func1
 847         adc Accu2, Func2
 848         adc Accu3, Func3
 849         adc Accu4, Func4
 850         st Z+, Accu1
 851         st Z+, Accu2
 852         st Z+, Accu3
 853         st Z+, Accu4
 854         dec r21
 855         brne update_state_loop
 856         /* now we just have to update the length */
 857         adiw r30, 1 /* since we add 512, we can simply skip the LSB */
 858         ldi r21, 2
 859         ldi r22, 6
 860         ld r20, Z
 861         add r20, r21
 862         st Z+, r20
 863         clr r21
 864 sha256_nextBlock_fix_length:
 865         brcc sha256_nextBlock_epilog
 866         ld r20, Z
 867         adc r20, r21
 868         st Z+, r20
 869         dec r22
 870         brne sha256_nextBlock_fix_length
 871
 872 ; EPILOG
 873 sha256_nextBlock_epilog:
 874 /* now we should clean up the stack */
 875
 876         pop r21
 877         pop r20
 878         in r0, SREG
 879         cli ; we want to be uninterrupted while updating SP
 880         out SPL, r20
 881         out SPH, r21
 882         out SREG, r0
 883
 884         clr r1
 885         pop r29
 886         pop r28
 887         pop r17
 888         pop r16
 889         pop r15
 890         pop r14
 891         pop r13
 892         pop r12
 893         pop r11
 894         pop r10
 895         pop r9
 896         pop r8
 897         pop r7
 898         pop r6
 899         pop r5
 900         pop r4
 901         ret
 902
 903 sha256_kv: ; round-key-vector stored in ProgMem
 904 .word   0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c
 905 .word   0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b
 906 .word   0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9
 907 .word   0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429
 908 .word   0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272
 909 .word   0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a
 910 .word   0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e
 911 .word   0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671
 912
 913
 914 ;###########################################################
 915
 916 .global sha256_init
 917 ;uint32_t sha256_init_vector[]={
 918 ;       0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
 919 ;       0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };
 920 ;
 921 ;void sha256_init(sha256_ctx_t *state){
 922 ;       state->length=0;
 923 ;       memcpy(state->h, sha256_init_vector, 8*4);
 924 ;}
 925 ; param1: (r23,r24) 16-bit pointer to sha256_ctx_t struct in ram
 926 ; modifys: Z(r30,r31), Func1, r22
 927 sha256_init:
 928         movw r26, r24 ; (24,25) --> (26,27) load X with param1
 929         ldi r30, lo8((sha256_init_vector))
 930         ldi r31, hi8((sha256_init_vector))
 931         ldi r22, 32+8
 932 sha256_init_vloop:
 933         lpm r23, Z+
 934         st X+, r23
 935         dec r22
 936         brne sha256_init_vloop
 937         ret
 938
 939 sha256_init_vector:
 940 .word 0xE667, 0x6A09
 941 .word 0xAE85, 0xBB67
 942 .word 0xF372, 0x3C6E
 943 .word 0xF53A, 0xA54F
 944 .word 0x527F, 0x510E
 945 .word 0x688C, 0x9B05
 946 .word 0xD9AB, 0x1F83
 947 .word 0xCD19, 0x5BE0
 948 .word 0x0000, 0x0000
 949 .word 0x0000, 0x0000
 950
 951 ;###########################################################
 952
 953 .global rotl32
 954 ; === ROTL32 ===
 955 ; function that rotates a 32 bit word to the left
 956 ;  param1: the 32-bit word to rotate
 957 ;       given in r25,r24,r23,r22 (r25 is most significant)
 958 ;  param2: an 8-bit value telling how often to rotate
 959 ;       given in r20
 960 ; modifys: r21, r22
 961 rotl32:
 962         cpi r20, 8
 963         brlo bitrotl
 964         mov r21, r25
 965         mov r25, r24
 966         mov r24, r23
 967         mov r23, r22
 968         mov r22, r21
 969         subi r20, 8
 970         rjmp rotl32
 971 bitrotl:
 972         clr r21
 973         clc
 974 bitrotl_loop:
 975         tst r20
 976         breq fixrotl
 977         rol r22
 978         rol r23
 979         rol r24
 980         rol r25
 981         rol r21
 982         dec r20
 983         rjmp bitrotl_loop
 984 fixrotl:
 985         or r22, r21
 986         ret
 987
 988
 989 ;###########################################################
 990
 991 .global rotr32
 992 ; === ROTR32 ===
 993 ; function that rotates a 32 bit word to the right
 994 ;  param1: the 32-bit word to rotate
 995 ;       given in r25,r24,r23,22 (r25 is most significant)
 996 ;  param2: an 8-bit value telling how often to rotate
 997 ;       given in r20
 998 ; modifys: r21, r22
 999 rotr32:
1000         cpi r20, 8
1001         brlo bitrotr
1002         mov r21, r22
1003         mov r22, r23
1004         mov r23, r24
1005         mov r24, r25
1006         mov r25, r21
1007         subi r20, 8
1008         rjmp rotr32
1009 bitrotr:
1010         clr r21
1011         clc
1012 bitrotr_loop:
1013         tst r20
1014         breq fixrotr
1015         ror r25
1016         ror r24
1017         ror r23
1018         ror r22
1019         ror r21
1020         dec r20
1021         rjmp bitrotr_loop
1022 fixrotr:
1023         or r25, r21
1024         ret
1025
1026
1027 ;###########################################################
1028
1029 .global change_endian32
1030 ; === change_endian32 ===
1031 ; function that changes the endianess of a 32-bit word
1032 ;  param1: the 32-bit word
1033 ;       given in r25,r24,r23,22 (r25 is most significant)
1034 ;  modifys: r21, r22
1035 change_endian32:
1036         movw r20,  r22 ; (r22,r23) --> (r20,r21)
1037         mov r22, r25
1038         mov r23, r24
1039         mov r24, r21
1040         mov r25, r20
1041         ret
1042