sha256/sha256-asm.S

   1 /* sha256-asm.S */
   2 /*
   3     This file is part of the AVR-Crypto-Lib.
   4     Copyright (C) 2008  Daniel Otte (daniel.otte@rub.de)
   5
   6     This program is free software: you can redistribute it and/or modify
   7     it under the terms of the GNU General Public License as published by
   8     the Free Software Foundation, either version 3 of the License, or
   9     (at your option) any later version.
  10
  11     This program is distributed in the hope that it will be useful,
  12     but WITHOUT ANY WARRANTY; without even the implied warranty of
  13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14     GNU General Public License for more details.
  15
  16     You should have received a copy of the GNU General Public License
  17     along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 */
  19 /*
  20  * Author:      Daniel Otte
  21  *
  22  * License: GPLv3 or later
  23 */
  24 ; sha-256 implementation in assembler
  25 SHA256_BLOCK_BITS = 512
  26 SHA256_HASH_BITS = 256
  27
  28
  29 .macro precall
  30         /* push r18 - r27, r30 - r31*/
  31         push r0
  32         push r1
  33         push r18
  34         push r19
  35         push r20
  36         push r21
  37         push r22
  38         push r23
  39         push r24
  40         push r25
  41         push r26
  42         push r27
  43         push r30
  44         push r31
  45         clr r1
  46 .endm
  47
  48 .macro postcall
  49         pop r31
  50         pop r30
  51         pop r27
  52         pop r26
  53         pop r25
  54         pop r24
  55         pop r23
  56         pop r22
  57         pop r21
  58         pop r20
  59         pop r19
  60         pop r18
  61         pop r1
  62         pop r0
  63 .endm
  64
  65
  66 .macro hexdump length
  67         push r27
  68         push r26
  69         ldi r25, '\r'
  70         mov r24, r25
  71         call uart_putc
  72         ldi r25, '\n'
  73         mov r24, r25
  74         call uart_putc
  75         pop r26
  76         pop r27
  77         movw r24, r26
  78 .if \length > 16
  79         ldi r22, lo8(16)
  80         ldi r23, hi8(16)
  81         push r27
  82         push r26
  83         call uart_hexdump
  84         pop r26
  85         pop r27
  86         adiw r26, 16
  87         hexdump \length-16
  88 .else
  89         ldi r22, lo8(\length)
  90         ldi r23, hi8(\length)
  91         call uart_hexdump
  92 .endif
  93 .endm
  94
  95 /* X points to Block */
  96 .macro dbg_hexdump length
  97         precall
  98         hexdump \length
  99         postcall
 100 .endm
 101
 102 .section .text
 103
 104 SPL = 0x3D
 105 SPH = 0x3E
 106 SREG = 0x3F
 107
 108
 109 ;
 110 ;sha256_ctx_t is:
 111 ;
 112 ; [h0][h1][h2][h3][h4][h5][h6][h7][length]
 113 ; hn is 32 bit large, length is 64 bit large
 114
 115 ;###########################################################
 116
 117 .global sha256_ctx2hash
 118 ; === sha256_ctx2hash ===
 119 ; this function converts a state into a normal hash (bytestring)
 120 ;  param1: the 16-bit destination pointer
 121 ;       given in r25,r24 (r25 is most significant)
 122 ;  param2: the 16-bit pointer to sha256_ctx structure
 123 ;       given in r23,r22
 124 sha256_ctx2hash:
 125         movw r26, r22
 126         movw r30, r24
 127         ldi r21, 8
 128         sbiw r26, 4
 129 1:
 130         ldi r20, 4
 131         adiw r26, 8
 132 2:
 133                 ld r0, -X
 134                 st Z+, r0
 135         dec r20
 136         brne 2b
 137
 138         dec r21
 139         brne 1b
 140
 141         ret
 142
 143 ;###########################################################
 144
 145 .global sha256
 146 ; === sha256 ===
 147 ; this function calculates SHA-256 hashes from messages in RAM
 148 ;  param1: the 16-bit hash destination pointer
 149 ;       given in r25,r24 (r25 is most significant)
 150 ;  param2: the 16-bit pointer to message
 151 ;       given in r23,r22
 152 ;  param3: 32-bit length value (length of message in bits)
 153 ;   given in r21,r20,r19,r18
 154 sha256:
 155 sha256_prolog:
 156         push r8
 157         push r9
 158         push r10
 159         push r11
 160         push r12
 161         push r13
 162         push r16
 163         push r17
 164         in r30, SPL
 165         in r31, SPH
 166         sbiw r30, 8*4+8
 167         in r0, SREG
 168         cli
 169         out SPL, r30
 170         out SREG, r0
 171         out SPH, r31
 172
 173         push r25
 174         push r24
 175         adiw r30, 1
 176         movw r16, r30
 177         movw r8, r18            /* backup of length*/
 178         movw r10, r20
 179
 180         movw r12, r22   /* backup pf msg-ptr */
 181
 182         movw r24, r16
 183         rcall sha256_init
 184         /* if length > 0xffff */
 185 1:
 186         tst r11
 187         brne 2f
 188         tst r10
 189         breq 4f
 190 2:
 191         movw r24, r16
 192         movw r22, r12
 193         rcall sha256_nextBlock
 194         ldi r19, 64
 195         add r12, r19
 196         adc r13, r1
 197         /* length -= 512 */
 198         ldi r19, 0x02
 199         sub r9, r19
 200         sbc r10, r1
 201         sbc r11, r1
 202         rjmp 1b
 203
 204 4:
 205         movw r24, r16
 206         movw r22, r12
 207         movw r20, r8
 208         rcall sha256_lastBlock
 209
 210         pop r24
 211         pop r25
 212         movw r22, r16
 213         rcall sha256_ctx2hash
 214
 215 sha256_epilog:
 216         in r30, SPL
 217         in r31, SPH
 218         adiw r30, 8*4+8
 219         in r0, SREG
 220         cli
 221         out SPL, r30
 222         out SREG, r0
 223         out SPH, r31
 224         pop r17
 225         pop r16
 226         pop r13
 227         pop r12
 228         pop r11
 229         pop r10
 230         pop r9
 231         pop r8
 232         ret
 233
 234 ;###########################################################
 235
 236
 237 ; block MUST NOT be larger than 64 bytes
 238
 239 .global sha256_lastBlock
 240 ; === sha256_lastBlock ===
 241 ; this function does padding & Co. for calculating SHA-256 hashes
 242 ;  param1: the 16-bit pointer to sha256_ctx structure
 243 ;       given in r25,r24 (r25 is most significant)
 244 ;  param2: an 16-bit pointer to 64 byte block to hash
 245 ;       given in r23,r22
 246 ;  param3: an 16-bit integer specifing length of block in bits
 247 ;       given in r21,r20
 248 sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1)
 249
 250
 251 sha256_lastBlock:
 252         cpi r21, 0x02
 253         brlo sha256_lastBlock_prolog
 254         push r25
 255         push r24
 256         push r23
 257         push r22
 258         push r21
 259         push r20
 260         rcall sha256_nextBlock
 261         pop r20
 262         pop r21
 263         pop r22
 264         pop r23
 265         pop r24
 266         pop r25
 267         subi r21, 0x02
 268         ldi r19, 64
 269         add r22, r19
 270         adc r23, r1
 271         rjmp sha256_lastBlock
 272 sha256_lastBlock_prolog:
 273         /* allocate space on stack */
 274         in r30, SPL
 275         in r31, SPH
 276         in r0, SREG
 277         subi r30, lo8(64)
 278         sbci r31, hi8(64)
 279         cli
 280         out SPL, r30
 281         out SREG,r0
 282         out SPH, r31
 283
 284         adiw r30, 1 /* SP points to next free byte on stack */
 285         mov r18, r20 /* r20 = LSB(length) */
 286         lsr r18
 287         lsr r18
 288         lsr r18
 289         bst r21, 0      /* may be we should explain this ... */
 290         bld r18, 5  /* now: r18 == length/8 (aka. length in bytes) */
 291
 292
 293         movw r26, r22 /* X points to begin of msg */
 294         tst r18
 295         breq sha256_lastBlock_post_copy
 296         mov r1, r18
 297 sha256_lastBlock_copy_loop:
 298         ld r0, X+
 299         st Z+, r0
 300         dec r1
 301         brne sha256_lastBlock_copy_loop
 302 sha256_lastBlock_post_copy:
 303 sha256_lastBlock_insert_stuffing_bit:
 304         ldi r19, 0x80
 305         mov r0,r19
 306         ldi r19, 0x07
 307         and r19, r20 /* if we are in bitmode */
 308         breq 2f /* no bitmode */
 309 1:
 310         lsr r0
 311         dec r19
 312         brne 1b
 313         ld r19, X
 314 /* maybe we should do some ANDing here, just for safety */
 315         or r0, r19
 316 2:
 317         st Z+, r0
 318         inc r18
 319
 320 /* checking stuff here */
 321         cpi r18, 64-8+1
 322         brsh 0f
 323         rjmp sha256_lastBlock_insert_zeros
 324 0:
 325         /* oh shit, we landed here */
 326         /* first we have to fill it up with zeros */
 327         ldi r19, 64
 328         sub r19, r18
 329         breq 2f
 330 1:
 331         st Z+, r1
 332         dec r19
 333         brne 1b
 334 2:
 335         sbiw r30, 63
 336         sbiw r30,  1
 337         movw r22, r30
 338
 339         push r31
 340         push r30
 341         push r25
 342         push r24
 343         push r21
 344         push r20
 345         rcall sha256_nextBlock
 346         pop r20
 347         pop r21
 348         pop r24
 349         pop r25
 350         pop r30
 351         pop r31
 352
 353         /* now we should subtract 512 from length */
 354         movw r26, r24
 355         adiw r26, 4*8+1 /* we can skip the lowest byte */
 356         ld r19, X
 357         subi r19, hi8(512)
 358         st X+, r19
 359         ldi r18, 6
 360 1:
 361         ld r19, X
 362         sbci r19, 0
 363         st X+, r19
 364         dec r18
 365         brne 1b
 366
 367 ;       clr r18 /* not neccessary ;-) */
 368         /* reset Z pointer to begin of block */
 369
 370 sha256_lastBlock_insert_zeros:
 371         ldi r19, 64-8
 372         sub r19, r18
 373         breq sha256_lastBlock_insert_length
 374         clr r1
 375 1:
 376         st Z+, r1       /* r1 is still zero */
 377         dec r19
 378         brne 1b
 379
 380 ;       rjmp sha256_lastBlock_epilog
 381 sha256_lastBlock_insert_length:
 382         movw r26, r24   /* X points to state */
 383         adiw r26, 8*4   /* X points to (state.length) */
 384         adiw r30, 8             /* Z points one after the last byte of block */
 385         ld r0, X+
 386         add r0, r20
 387         st -Z, r0
 388         ld r0, X+
 389         adc r0, r21
 390         st -Z, r0
 391         ldi r19, 6
 392 1:
 393         ld r0, X+
 394         adc r0, r1
 395         st -Z, r0
 396         dec r19
 397         brne 1b
 398
 399         sbiw r30, 64-8
 400         movw r22, r30
 401         rcall sha256_nextBlock
 402
 403 sha256_lastBlock_epilog:
 404         in r30, SPL
 405         in r31, SPH
 406         in r0, SREG
 407         adiw r30, 63 ; lo8(64)
 408         adiw r30,  1  ; hi8(64)
 409         cli
 410         out SPL, r30
 411         out SREG,r0
 412         out SPH, r31
 413         clr r1
 414         ret
 415
 416 /**/
 417 ;###########################################################
 418
 419 .global sha256_nextBlock
 420 ; === sha256_nextBlock ===
 421 ; this is the core function for calculating SHA-256 hashes
 422 ;  param1: the 16-bit pointer to sha256_ctx structure
 423 ;       given in r25,r24 (r25 is most significant)
 424 ;  param2: an 16-bit pointer to 64 byte block to hash
 425 ;       given in r23,r22
 426 sha256_nextBlock_localSpace = (64+8)*4 ; 64 32-bit values for w array and 8 32-bit values for a array (total 288 byte)
 427
 428 Bck1 = 12
 429 Bck2 = 13
 430 Bck3 = 14
 431 Bck4 = 15
 432 Func1 = 22
 433 Func2 = 23
 434 Func3 = 24
 435 Func4 = 25
 436 Accu1 = 16
 437 Accu2 = 17
 438 Accu3 = 18
 439 Accu4 = 19
 440 XAccu1 = 8
 441 XAccu2 = 9
 442 XAccu3 = 10
 443 XAccu4 = 11
 444 T1      = 4
 445 T2      = 5
 446 T3      = 6
 447 T4      = 7
 448 LoopC = 1
 449 /* byteorder: high number <--> high significance */
 450 sha256_nextBlock:
 451  ; initial, let's make some space ready for local vars
 452         push r4 /* replace push & pop by mem ops? */
 453         push r5
 454         push r6
 455         push r7
 456         push r8
 457         push r9
 458         push r10
 459         push r11
 460         push r12
 461         push r13
 462         push r14
 463         push r15
 464         push r16
 465         push r17
 466         push r28
 467         push r29
 468         in r20, SPL
 469         in r21, SPH
 470         movw r18, r20                   ;backup SP
 471 ;       movw r26, r20                   ; X points to free space on stack
 472         movw r30, r22                   ; Z points to message
 473         subi r20, lo8(sha256_nextBlock_localSpace) ;sbiw can do only up to 63
 474         sbci r21, hi8(sha256_nextBlock_localSpace)
 475         movw r26, r20                   ; X points to free space on stack
 476         in r0, SREG
 477         cli ; we want to be uninterrupted while updating SP
 478         out SPL, r20
 479         out SREG, r0
 480         out SPH, r21
 481         push r18
 482         push r19
 483         push r24
 484         push r25 /* param1 will be needed later */
 485  ; now we fill the w array with message (think about endianess)
 486         adiw r26, 1 ; X++
 487         ldi r20, 16
 488 sha256_nextBlock_wcpyloop:
 489         ld r23, Z+
 490         ld r22, Z+
 491         ld r19, Z+
 492         ld r18, Z+
 493         st X+, r18
 494         st X+, r19
 495         st X+, r22
 496         st X+, r23
 497         dec r20
 498         brne sha256_nextBlock_wcpyloop
 499 /*      for (i=16; i<64; ++i){
 500                 w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16];
 501         } */
 502         /* r25,r24,r23,r24 (r21,r20) are function values
 503            r19,r18,r17,r16 are the accumulator
 504            r15,r14,r13,rBck1 are backup1
 505            r11,r10,r9 ,r8  are xor accu
 506            r1 is round counter                                                          */
 507
 508         ldi r20, 64-16
 509         mov LoopC, r20
 510 sha256_nextBlock_wcalcloop:
 511         movw r30, r26 ; cp X to Z
 512         sbiw r30, 63
 513         sbiw r30, 1             ; substract 64 = 16*4
 514         ld Accu1, Z+
 515         ld Accu2, Z+
 516         ld Accu3, Z+
 517         ld Accu4, Z+ /* w[i] = w[i-16] */
 518         ld Bck1, Z+
 519         ld Bck2, Z+
 520         ld Bck3, Z+
 521         ld Bck4, Z+ /* backup = w[i-15] */
 522         /* now sigma 0 */
 523         mov Func1, Bck2
 524         mov Func2, Bck3
 525         mov Func3, Bck4
 526         mov Func4, Bck1  /* prerotated by 8 */
 527         ldi r20, 1
 528         rcall bitrotl
 529         movw XAccu1, Func1
 530         movw XAccu3, Func3       /* store ROTR(w[i-15],7) in xor accu */
 531         movw Func1, Bck3
 532         movw Func3, Bck1 /* prerotated by 16 */
 533         ldi r20, 2
 534         rcall bitrotr
 535         eor XAccu1, Func1  /* xor ROTR(w[i-15], 18)*/
 536         eor XAccu2, Func2
 537         eor XAccu3, Func3
 538         eor XAccu4, Func4
 539         ldi Func2, 3             /* now shr3 */ /*we can destroy backup now*/
 540 sigma0_shr:
 541         lsr Bck4
 542         ror Bck3
 543         ror Bck2
 544         ror Bck1
 545         dec Func2
 546         brne sigma0_shr
 547         eor XAccu1, Bck1
 548         eor XAccu2, Bck2
 549         eor XAccu3, Bck3
 550         eor XAccu4, Bck4        /* xor SHR(w[i-15], 3)*/ /* xor accu == sigma1(w[i-15]) */
 551         add Accu1, XAccu1
 552         adc Accu2, XAccu2
 553         adc Accu3, XAccu3
 554         adc Accu4, XAccu4 /* finished with sigma0 */
 555         ldd Func1, Z+7*4  /* now accu += w[i-7] */
 556         ldd Func2, Z+7*4+1
 557         ldd Func3, Z+7*4+2
 558         ldd Func4, Z+7*4+3
 559         add Accu1, Func1
 560         adc Accu2, Func2
 561         adc Accu3, Func3
 562         adc Accu4, Func4
 563         ldd Bck1, Z+12*4 /* now backup = w[i-2]*/
 564         ldd Bck2, Z+12*4+1
 565         ldd Bck3, Z+12*4+2
 566         ldd Bck4, Z+12*4+3
 567         /* now sigma 1 */
 568         movw Func1, Bck3
 569         movw Func3, Bck1 /* prerotated by 16 */
 570         ldi r20, 1
 571         rcall bitrotr
 572         movw XAccu3, Func3
 573         movw XAccu1, Func1       /* store in ROTR(w[i-2], 17) xor accu */
 574 ;       movw Func1, Bck3
 575 ;       movw Func3, Bck1 /* prerotated by 16 */
 576         ldi r20, 2
 577         rcall bitrotr
 578         eor XAccu1, Func1  /* xor ROTR(w[i-2], 19)*/
 579         eor XAccu2, Func2
 580         eor XAccu3, Func3
 581         eor XAccu4, Func4
 582         ldi Func2, 2     /* now shr10 (dirty trick, skipping a byte) */ /*we can destroy backup now*/
 583 sigma1_shr:
 584         lsr Bck4
 585         ror Bck3
 586         ror Bck2
 587         dec Func2
 588         brne sigma1_shr
 589         eor XAccu1, Bck2
 590         eor XAccu2, Bck3
 591         eor XAccu3, Bck4  /* xor SHR(w[i-2], 10)*/ /* xor accu == sigma1(w[i-15]) */
 592         add Accu1, XAccu1
 593         adc Accu2, XAccu2
 594         adc Accu3, XAccu3
 595         adc Accu4, XAccu4 /* finished with sigma0 */
 596         /* now let's store the shit */
 597         st X+, Accu1
 598         st X+, Accu2
 599         st X+, Accu3
 600         st X+, Accu4
 601         dec LoopC
 602         breq 3f  ; skip if zero
 603         rjmp sha256_nextBlock_wcalcloop
 604 3:
 605         /* we are finished with w array X points one byte post w */
 606 /* init a array */
 607         pop r31
 608         pop r30
 609         push r30
 610         push r31
 611         ldi r25, 8*4 /* 8 32-bit values to copy from ctx to a array */
 612 init_a_array:
 613         ld r1, Z+
 614         st X+, r1
 615         dec r25
 616         brne init_a_array
 617
 618 /* now the real fun begins */
 619 /* for (i=0; i<64; ++i){
 620                         t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i];
 621                         t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]);
 622                         memmove(&(a[1]), &(a[0]), 7*4);         // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0];
 623                         a[4] += t1;
 624                         a[0] = t1 + t2;
 625                 } */
 626         /* Y points to a[0], Z ('cause lpm wants it) points to k[i], X points to w[i] */
 627         sbiw r26, 8*4  /* X still points at a[7]+1*/
 628         movw r28, r26
 629         ldi r30, lo8(sha256_kv)
 630         ldi r31, hi8(sha256_kv)
 631         dec r27  /* X - (64*4 == 256) */
 632         ldi r25, 64
 633         mov LoopC, r25
 634 sha256_main_loop:
 635         /* now calculate t1 */
 636          /*CH(x,y,z) = (x&y)^((~x)&z)*/
 637         ldd T1, Y+5*4
 638         ldd T2, Y+5*4+1
 639         ldd T3, Y+5*4+2
 640         ldd T4, Y+5*4+3 /* y in T */
 641         ldd Func1, Y+4*4
 642         ldd Func2, Y+4*4+1
 643         ldd Func3, Y+4*4+2
 644         ldd Func4, Y+4*4+3  /* x in Func */
 645         ldd Bck1, Y+6*4
 646         ldd Bck2, Y+6*4+1
 647         ldd Bck3, Y+6*4+2
 648         ldd Bck4, Y+6*4+3 /* z in Bck */
 649         and T1, Func1
 650         and T2, Func2
 651         and T3, Func3
 652         and T4, Func4
 653         com Func1
 654         com Func2
 655         com Func3
 656         com Func4
 657         and Bck1, Func1
 658         and Bck2, Func2
 659         and Bck3, Func3
 660         and Bck4, Func4
 661         eor T1, Bck1
 662         eor T2, Bck2
 663         eor T3, Bck3
 664         eor T4, Bck4 /* done, CH(x,y,z) is in T */
 665         /* now SIGMA1(a[4]) */
 666         ldd Bck4, Y+4*4         /* think about using it from Func reg above*/
 667         ldd Bck1, Y+4*4+1
 668         ldd Bck2, Y+4*4+2
 669         ldd Bck3, Y+4*4+3 /* load prerotate by 8-bit */
 670         movw Func1, Bck1
 671         movw Func3, Bck3
 672         ldi r20, 2
 673         rcall bitrotl           /* rotr(x,6) */
 674         movw XAccu1, Func1
 675         movw XAccu3, Func3
 676         movw Func1, Bck1
 677         movw Func3, Bck3
 678         ldi r20, 3
 679         rcall bitrotr   /* rotr(x,11) */
 680         eor XAccu1, Func1
 681         eor XAccu2, Func2
 682         eor XAccu3, Func3
 683         eor XAccu4, Func4
 684         movw Func1, Bck3 /* this prerotates furteh 16 bits*/
 685         movw Func3, Bck1 /* so we have now prerotated by 24 bits*/
 686         ldi r20, 1
 687         rcall bitrotr   /* rotr(x,11) */
 688         eor XAccu1, Func1
 689         eor XAccu2, Func2
 690         eor XAccu3, Func3
 691         eor XAccu4, Func4 /* finished with SIGMA1, add it to T */
 692         add T1, XAccu1
 693         adc T2, XAccu2
 694         adc T3, XAccu3
 695         adc T4, XAccu4
 696         /* now we've to add a[7], w[i] and k[i] */
 697         ldd XAccu1, Y+4*7
 698         ldd XAccu2, Y+4*7+1
 699         ldd XAccu3, Y+4*7+2
 700         ldd XAccu4, Y+4*7+3
 701         add T1, XAccu1
 702         adc T2, XAccu2
 703         adc T3, XAccu3
 704         adc T4, XAccu4 /* add a[7] */
 705         ld XAccu1, X+
 706         ld XAccu2, X+
 707         ld XAccu3, X+
 708         ld XAccu4, X+
 709         add T1, XAccu1
 710         adc T2, XAccu2
 711         adc T3, XAccu3
 712         adc T4, XAccu4 /* add w[i] */
 713         lpm XAccu1, Z+
 714         lpm XAccu2, Z+
 715         lpm XAccu3, Z+
 716         lpm XAccu4, Z+
 717         add T1, XAccu1
 718         adc T2, XAccu2
 719         adc T3, XAccu3
 720         adc T4, XAccu4 /* add k[i] */ /* finished with t1 */
 721         /*now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */ /*i did to much x86 asm, i always see 4 32bit regs*/
 722                 /* starting with MAJ(x,y,z) */
 723         ldd Func1, Y+4*0+0
 724         ldd Func2, Y+4*0+1
 725         ldd Func3, Y+4*0+2
 726         ldd Func4, Y+4*0+3 /* load x=a[0] */
 727         ldd XAccu1, Y+4*1+0
 728         ldd XAccu2, Y+4*1+1
 729         ldd XAccu3, Y+4*1+2
 730         ldd XAccu4, Y+4*1+3 /* load y=a[1] */
 731         and XAccu1, Func1
 732         and XAccu2, Func2
 733         and XAccu3, Func3
 734         and XAccu4, Func4       /* XAccu == (x & y) */
 735         ldd Bck1, Y+4*2+0
 736         ldd Bck2, Y+4*2+1
 737         ldd Bck3, Y+4*2+2
 738         ldd Bck4, Y+4*2+3 /* load z=a[2] */
 739         and Func1, Bck1
 740         and Func2, Bck2
 741         and Func3, Bck3
 742         and Func4, Bck4
 743         eor XAccu1, Func1
 744         eor XAccu2, Func2
 745         eor XAccu3, Func3
 746         eor XAccu4, Func4       /* XAccu == (x & y) ^ (x & z) */
 747         ldd Func1, Y+4*1+0
 748         ldd Func2, Y+4*1+1
 749         ldd Func3, Y+4*1+2
 750         ldd Func4, Y+4*1+3 /* load y=a[1] */
 751         and Func1, Bck1
 752         and Func2, Bck2
 753         and Func3, Bck3
 754         and Func4, Bck4
 755         eor XAccu1, Func1
 756         eor XAccu2, Func2
 757         eor XAccu3, Func3
 758         eor XAccu4, Func4       /* XAccu == Maj(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */
 759         /* SIGMA0(a[0]) */
 760         ldd Bck1, Y+4*0+0 /* we should combine this with above */
 761         ldd Bck2, Y+4*0+1
 762         ldd Bck3, Y+4*0+2
 763         ldd Bck4, Y+4*0+3
 764         movw Func1, Bck1
 765         movw Func3, Bck3
 766         ldi r20, 2
 767         rcall bitrotr
 768         movw Accu1, Func1
 769         movw Accu3, Func3 /* Accu = shr(a[0], 2) */
 770         movw Func1, Bck3
 771         movw Func3, Bck1 /* prerotate by 16 bits */
 772         ldi r20, 3
 773         rcall bitrotl
 774         eor Accu1, Func1
 775         eor Accu2, Func2
 776         eor Accu3, Func3
 777         eor Accu4, Func4 /* Accu ^= shr(a[0], 13) */
 778         mov Func1, Bck4
 779         mov Func2, Bck1
 780         mov Func3, Bck2
 781         mov Func4, Bck3  /* prerotate by 24 bits */
 782         ldi r20, 2
 783         rcall bitrotl
 784         eor Accu1, Func1
 785         eor Accu2, Func2
 786         eor Accu3, Func3
 787         eor Accu4, Func4 /* Accu ^= shr(a[0], 22) */
 788         add Accu1, XAccu1 /* add previous result (MAJ)*/
 789         adc Accu2, XAccu2
 790         adc Accu3, XAccu3
 791         adc Accu4, XAccu4
 792         /* now we are finished with the computing stuff (t1 in T, t2 in Accu)*/
 793         /* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */
 794
 795         ldi r21, 7*4
 796         adiw r28, 7*4
 797 a_shift_loop:
 798         ld  r25, -Y /* warning: this is PREdecrement */
 799         std Y+4, r25
 800         dec r21
 801         brne a_shift_loop
 802
 803         ldd Bck1, Y+4*4+0
 804         ldd Bck2, Y+4*4+1
 805         ldd Bck3, Y+4*4+2
 806         ldd Bck4, Y+4*4+3
 807         add Bck1, T1
 808         adc Bck2, T2
 809         adc Bck3, T3
 810         adc Bck4, T4
 811         std Y+4*4+0, Bck1
 812         std Y+4*4+1, Bck2
 813         std Y+4*4+2, Bck3
 814         std Y+4*4+3, Bck4
 815         add Accu1, T1
 816         adc Accu2, T2
 817         adc Accu3, T3
 818         adc Accu4, T4
 819         std Y+4*0+0, Accu1
 820         std Y+4*0+1, Accu2
 821         std Y+4*0+2, Accu3
 822         std Y+4*0+3, Accu4 /* a array updated */
 823
 824
 825         dec LoopC
 826         breq update_state
 827         rjmp sha256_main_loop ;brne sha256_main_loop
 828 update_state:
 829         /* update state */
 830         /* pointers to state should still exist on the stack ;-) */
 831         pop r31
 832         pop r30
 833         ldi r21, 8
 834 update_state_loop:
 835         ldd Accu1, Z+0
 836         ldd Accu2, Z+1
 837         ldd Accu3, Z+2
 838         ldd Accu4, Z+3
 839         ld Func1, Y+
 840         ld Func2, Y+
 841         ld Func3, Y+
 842         ld Func4, Y+
 843         add Accu1, Func1
 844         adc Accu2, Func2
 845         adc Accu3, Func3
 846         adc Accu4, Func4
 847         st Z+, Accu1
 848         st Z+, Accu2
 849         st Z+, Accu3
 850         st Z+, Accu4
 851         dec r21
 852         brne update_state_loop
 853         /* now we just have to update the length */
 854         adiw r30, 1 /* since we add 512, we can simply skip the LSB */
 855         ldi r21, 2
 856         ldi r22, 6
 857         ld r20, Z
 858         add r20, r21
 859         st Z+, r20
 860         clr r21
 861 sha256_nextBlock_fix_length:
 862         brcc sha256_nextBlock_epilog
 863         ld r20, Z
 864         adc r20, r21
 865         st Z+, r20
 866         dec r22
 867         brne sha256_nextBlock_fix_length
 868
 869 ; EPILOG
 870 sha256_nextBlock_epilog:
 871 /* now we should clean up the stack */
 872
 873         pop r21
 874         pop r20
 875         in r0, SREG
 876         cli ; we want to be uninterrupted while updating SP
 877         out SPL, r20
 878         out SREG, r0
 879         out SPH, r21
 880         clr r1
 881         pop r29
 882         pop r28
 883         pop r17
 884         pop r16
 885         pop r15
 886         pop r14
 887         pop r13
 888         pop r12
 889         pop r11
 890         pop r10
 891         pop r9
 892         pop r8
 893         pop r7
 894         pop r6
 895         pop r5
 896         pop r4
 897         ret
 898
 899 sha256_kv: ; round-key-vector stored in ProgMem
 900 .word   0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c
 901 .word   0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b
 902 .word   0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9
 903 .word   0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429
 904 .word   0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272
 905 .word   0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a
 906 .word   0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e
 907 .word   0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671
 908
 909
 910 ;###########################################################
 911
 912 .global sha256_init
 913 ;uint32_t sha256_init_vector[]={
 914 ;       0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
 915 ;       0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };
 916 ;
 917 ;void sha256_init(sha256_ctx_t *state){
 918 ;       state->length=0;
 919 ;       memcpy(state->h, sha256_init_vector, 8*4);
 920 ;}
 921 ; param1: (r23,r24) 16-bit pointer to sha256_ctx_t struct in ram
 922 ; modifys: Z(r30,r31), Func1, r22
 923 sha256_init:
 924         movw r26, r24 ; (24,25) --> (26,27) load X with param1
 925         ldi r30, lo8((sha256_init_vector))
 926         ldi r31, hi8((sha256_init_vector))
 927         ldi r22, 32+8
 928 sha256_init_vloop:
 929         lpm r23, Z+
 930         st X+, r23
 931         dec r22
 932         brne sha256_init_vloop
 933         ret
 934
 935 sha256_init_vector:
 936 .word 0xE667, 0x6A09
 937 .word 0xAE85, 0xBB67
 938 .word 0xF372, 0x3C6E
 939 .word 0xF53A, 0xA54F
 940 .word 0x527F, 0x510E
 941 .word 0x688C, 0x9B05
 942 .word 0xD9AB, 0x1F83
 943 .word 0xCD19, 0x5BE0
 944 .word 0x0000, 0x0000
 945 .word 0x0000, 0x0000
 946
 947 ;###########################################################
 948
 949 .global rotl32
 950 ; === ROTL32 ===
 951 ; function that rotates a 32 bit word to the left
 952 ;  param1: the 32-bit word to rotate
 953 ;       given in r25,r24,r23,r22 (r25 is most significant)
 954 ;  param2: an 8-bit value telling how often to rotate
 955 ;       given in r20
 956 ; modifys: r21, r22
 957 rotl32:
 958         cpi r20, 8
 959         brlo bitrotl
 960         mov r21, r25
 961         mov r25, r24
 962         mov r24, r23
 963         mov r23, r22
 964         mov r22, r21
 965         subi r20, 8
 966         rjmp rotl32
 967 bitrotl:
 968         clr r21
 969         clc
 970 bitrotl_loop:
 971         tst r20
 972         breq fixrotl
 973 2:
 974         rol r22
 975         rol r23
 976         rol r24
 977         rol r25
 978         rol r21
 979         dec r20
 980         brne 2b
 981 fixrotl:
 982         or r22, r21
 983         ret
 984
 985
 986 ;###########################################################
 987
 988 .global rotr32
 989 ; === ROTR32 ===
 990 ; function that rotates a 32 bit word to the right
 991 ;  param1: the 32-bit word to rotate
 992 ;       given in r25,r24,r23,22 (r25 is most significant)
 993 ;  param2: an 8-bit value telling how often to rotate
 994 ;       given in r20
 995 ; modifys: r21, r22
 996 rotr32:
 997         cpi r20, 8
 998         brlo bitrotr
 999         mov r21, r22
1000         mov r22, r23
1001         mov r23, r24
1002         mov r24, r25
1003         mov r25, r21
1004         subi r20, 8
1005         rjmp rotr32
1006 bitrotr:
1007         clr r21
1008         clc
1009 bitrotr_loop:
1010         tst r20
1011         breq fixrotr
1012 2:
1013         ror r25
1014         ror r24
1015         ror r23
1016         ror r22
1017         ror r21
1018         dec r20
1019         brne 2b
1020 fixrotr:
1021         or r25, r21
1022         ret
1023
1024
1025 ;###########################################################
1026
1027 .global change_endian32
1028 ; === change_endian32 ===
1029 ; function that changes the endianess of a 32-bit word
1030 ;  param1: the 32-bit word
1031 ;       given in r25,r24,r23,22 (r25 is most significant)
1032 ;  modifys: r21, r22
1033 change_endian32:
1034         movw r20,  r22 ; (r22,r23) --> (r20,r21)
1035         mov r22, r25
1036         mov r23, r24
1037         mov r24, r21
1038         mov r25, r20
1039         ret
1040