sha256-asm.S

   1 /* sha256-asm.S */
   2 /*
   3     This file is part of the This file is part of the AVR-Crypto-Lib.
   4     Copyright (C) 2008  Daniel Otte (daniel.otte@rub.de)
   5
   6     This program is free software: you can redistribute it and/or modify
   7     it under the terms of the GNU General Public License as published by
   8     the Free Software Foundation, either version 3 of the License, or
   9     (at your option) any later version.
  10
  11     This program is distributed in the hope that it will be useful,
  12     but WITHOUT ANY WARRANTY; without even the implied warranty of
  13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14     GNU General Public License for more details.
  15
  16     You should have received a copy of the GNU General Public License
  17     along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 */
  19 /*
  20  * Author:      Daniel Otte
  21  *
  22  * License: GPLv3 or later
  23 */
  24 ; sha-256 implementation in assembler
  25 SHA256_BLOCK_BITS = 512
  26 SHA256_HASH_BITS = 256
  27
  28 .macro precall
  29         /* push r18 - r27, r30 - r31*/
  30         push r0
  31         push r1
  32         push r18
  33         push r19
  34         push r20
  35         push r21
  36         push r22
  37         push r23
  38         push r24
  39         push r25
  40         push r26
  41         push r27
  42         push r30
  43         push r31
  44         clr r1
  45 .endm
  46
  47 .macro postcall
  48         pop r31
  49         pop r30
  50         pop r27
  51         pop r26
  52         pop r25
  53         pop r24
  54         pop r23
  55         pop r22
  56         pop r21
  57         pop r20
  58         pop r19
  59         pop r18
  60         pop r1
  61         pop r0
  62 .endm
  63
  64
  65 .macro hexdump length
  66         push r27
  67         push r26
  68         ldi r25, '\r'
  69         mov r24, r25
  70         call uart_putc
  71         ldi r25, '\n'
  72         mov r24, r25
  73         call uart_putc
  74         pop r26
  75         pop r27
  76         movw r24, r26
  77 .if \length > 16
  78         ldi r22, lo8(16)
  79         ldi r23, hi8(16)
  80         push r27
  81         push r26
  82         call uart_hexdump
  83         pop r26
  84         pop r27
  85         adiw r26, 16
  86         hexdump \length-16
  87 .else
  88         ldi r22, lo8(\length)
  89         ldi r23, hi8(\length)
  90         call uart_hexdump
  91 .endif
  92 .endm
  93
  94 /* X points to Block */
  95 .macro dbg_hexdump length
  96         precall
  97         hexdump \length
  98         postcall
  99 .endm
 100
 101 .section .text
 102
 103 SPL = 0x3D
 104 SPH = 0x3E
 105 SREG = 0x3F
 106
 107
 108 ;
 109 ;sha256_ctx_t is:
 110 ;
 111 ; [h0][h1][h2][h3][h4][h5][h6][h7][length]
 112 ; hn is 32 bit large, length is 64 bit large
 113
 114 ;###########################################################
 115
 116 .global sha256_ctx2hash
 117 ; === sha256_ctx2hash ===
 118 ; this function converts a state into a normal hash (bytestring)
 119 ;  param1: the 16-bit destination pointer
 120 ;       given in r25,r24 (r25 is most significant)
 121 ;  param2: the 16-bit pointer to sha256_ctx structure
 122 ;       given in r23,r22
 123 sha256_ctx2hash:
 124         movw r26, r22
 125         movw r30, r24
 126         ldi r21, 8
 127         sbiw r26, 4
 128 1:
 129         ldi r20, 4
 130         adiw r26, 8
 131 2:
 132                 ld r0, -X
 133                 st Z+, r0
 134         dec r20
 135         brne 2b
 136
 137         dec r21
 138         brne 1b
 139
 140         ret
 141
 142 ;###########################################################
 143
 144 .global sha256
 145 ; === sha256 ===
 146 ; this function calculates SHA-256 hashes from messages in RAM
 147 ;  param1: the 16-bit hash destination pointer
 148 ;       given in r25,r24 (r25 is most significant)
 149 ;  param2: the 16-bit pointer to message
 150 ;       given in r23,r22
 151 ;  param3: 32-bit length value (length of message in bits)
 152 ;   given in r21,r20,r19,r18
 153 sha256:
 154 sha256_prolog:
 155         push r8
 156         push r9
 157         push r10
 158         push r11
 159         push r12
 160         push r13
 161         push r16
 162         push r17
 163         in r16, SPL
 164         in r17, SPH
 165         subi r16, 8*4+8
 166         sbci r17, 0
 167         in r0, SREG
 168         cli
 169         out SPL, r16
 170         out SPH, r17
 171         out SREG, r0
 172
 173         push r25
 174         push r24
 175         inc r16
 176         adc r17, r1
 177
 178         movw r8, r18            /* backup of length*/
 179         movw r10, r20
 180
 181         movw r12, r22   /* backup pf msg-ptr */
 182
 183         movw r24, r16
 184         rcall sha256_init
 185         /* if length >= 512 */
 186 1:
 187         tst r11
 188         brne 4f
 189         tst r10
 190         brne 4f
 191         mov r19, r9
 192         cpi r19, 0x02
 193         brlo 4f
 194
 195         movw r24, r16
 196         movw r22, r12
 197         rcall sha256_nextBlock
 198         ldi r19, 0x64
 199         add r22, r19
 200         adc r23, r1
 201         /* length -= 512 */
 202         ldi r19, 0x02
 203         sub r9, r19
 204         sbc r10, r1
 205         sbc r11, r1
 206         rjmp 1b
 207
 208 4:
 209         movw r24, r16
 210         movw r22, r12
 211         movw r20, r8
 212         rcall sha256_lastBlock
 213
 214         pop r24
 215         pop r25
 216         movw r22, r16
 217         rcall sha256_ctx2hash
 218
 219 sha256_epilog:
 220         in r30, SPL
 221         in r31, SPH
 222         adiw r30, 8*4+8
 223         in r0, SREG
 224         cli
 225         out SPL, r30
 226         out SPH, r31
 227         out SREG, r0
 228         pop r17
 229         pop r16
 230         pop r13
 231         pop r12
 232         pop r11
 233         pop r10
 234         pop r9
 235         pop r8
 236         ret
 237
 238 ;###########################################################
 239
 240
 241 ; block MUST NOT be larger than 64 bytes
 242
 243 .global sha256_lastBlock
 244 ; === sha256_lastBlock ===
 245 ; this function does padding & Co. for calculating SHA-256 hashes
 246 ;  param1: the 16-bit pointer to sha256_ctx structure
 247 ;       given in r25,r24 (r25 is most significant)
 248 ;  param2: an 16-bit pointer to 64 byte block to hash
 249 ;       given in r23,r22
 250 ;  param3: an 16-bit integer specifing length of block in bits
 251 ;       given in r21,r20
 252 sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1)
 253
 254
 255 sha256_lastBlock:
 256         tst r20
 257         brne sha256_lastBlock_prolog
 258         cpi r21, 0x02
 259         brne sha256_lastBlock_prolog
 260         push r25
 261         push r24
 262         push r23
 263         push r22
 264         rcall sha256_nextBlock
 265         pop r22
 266         pop r23
 267         pop r24
 268         pop r25
 269         clr r21
 270         clr r22
 271 sha256_lastBlock_prolog:
 272         /* allocate space on stack */
 273         in r30, SPL
 274         in r31, SPH
 275         in r1, SREG
 276         subi r30, lo8(64)
 277         sbci r31, hi8(64)
 278         cli
 279         out SPL, r30
 280         out SPH, r31
 281         out SREG,r1
 282
 283         adiw r30, 1 /* SP points to next free byte on stack */
 284         mov r18, r20 /* r20 = LSB(length) */
 285         lsr r18
 286         lsr r18
 287         lsr r18
 288         bst r21, 0      /* may be we should explain this ... */
 289         bld r18, 5  /* now: r18 == length/8 (aka. length in bytes) */
 290
 291
 292         movw r26, r22 /* X points to begin of msg */
 293         tst r18
 294         breq sha256_lastBlock_post_copy
 295         mov r1, r18
 296 sha256_lastBlock_copy_loop:
 297         ld r0, X+
 298         st Z+, r0
 299         dec r1
 300         brne sha256_lastBlock_copy_loop
 301 sha256_lastBlock_post_copy:
 302 sha256_lastBlock_insert_stuffing_bit:
 303         ldi r19, 0x80
 304         mov r0,r19
 305         ldi r19, 0x07
 306         and r19, r20 /* if we are in bitmode */
 307         breq 2f /* no bitmode */
 308 1:
 309         lsr r0
 310         dec r19
 311         brne 1b
 312         ld r19, X
 313 /* maybe we should do some ANDing here, just for safety */
 314         or r0, r19
 315 2:
 316         st Z+, r0
 317         inc r18
 318
 319 /* checking stuff here */
 320         cpi r18, 64-8+1
 321         brsh 0f
 322         rjmp sha256_lastBlock_insert_zeros
 323 0:
 324         /* oh shit, we landed here */
 325         /* first we have to fill it up with zeros */
 326         ldi r19, 64
 327         sub r19, r18
 328         breq 2f
 329 1:
 330         st Z+, r1
 331         dec r19
 332         brne 1b
 333 2:
 334         sbiw r30, 63
 335         sbiw r30,  1
 336         movw r22, r30
 337
 338         push r31
 339         push r30
 340         push r25
 341         push r24
 342         push r21
 343         push r20
 344         rcall sha256_nextBlock
 345         pop r20
 346         pop r21
 347         pop r24
 348         pop r25
 349         pop r30
 350         pop r31
 351
 352         /* now we should subtract 512 from length */
 353         movw r26, r24
 354         adiw r26, 4*8+1 /* we can skip the lowest byte */
 355         ld r19, X
 356         subi r19, hi8(512)
 357         st X+, r19
 358         ldi r18, 6
 359 1:
 360         ld r19, X
 361         sbci r19, 0
 362         st X+, r19
 363         dec r18
 364         brne 1b
 365
 366 ;       clr r18 /* not neccessary ;-) */
 367         /* reset Z pointer to begin of block */
 368
 369 sha256_lastBlock_insert_zeros:
 370         ldi r19, 64-8
 371         sub r19, r18
 372         breq sha256_lastBlock_insert_length
 373         clr r1
 374 1:
 375         st Z+, r1       /* r1 is still zero */
 376         dec r19
 377         brne 1b
 378
 379 ;       rjmp sha256_lastBlock_epilog
 380 sha256_lastBlock_insert_length:
 381         movw r26, r24   /* X points to state */
 382         adiw r26, 8*4   /* X points to (state.length) */
 383         adiw r30, 8             /* Z points one after the last byte of block */
 384         ld r0, X+
 385         add r0, r20
 386         st -Z, r0
 387         ld r0, X+
 388         adc r0, r21
 389         st -Z, r0
 390         ldi r19, 6
 391 1:
 392         ld r0, X+
 393         adc r0, r1
 394         st -Z, r0
 395         dec r19
 396         brne 1b
 397
 398         sbiw r30, 64-8
 399         movw r22, r30
 400         rcall sha256_nextBlock
 401
 402 sha256_lastBlock_epilog:
 403         in r30, SPL
 404         in r31, SPH
 405         in r1, SREG
 406         adiw r30, 63 ; lo8(64)
 407         adiw r30,  1  ; hi8(64)
 408         cli
 409         out SPL, r30
 410         out SPH, r31
 411         out SREG,r1
 412         clr r1
 413         clr r0
 414         ret
 415
 416 /**/
 417 ;###########################################################
 418
 419 .global sha256_nextBlock
 420 ; === sha256_nextBlock ===
 421 ; this is the core function for calculating SHA-256 hashes
 422 ;  param1: the 16-bit pointer to sha256_ctx structure
 423 ;       given in r25,r24 (r25 is most significant)
 424 ;  param2: an 16-bit pointer to 64 byte block to hash
 425 ;       given in r23,r22
 426 sha256_nextBlock_localSpace = (64+8)*4 ; 64 32-bit values for w array and 8 32-bit values for a array (total 288 byte)
 427
 428 Bck1 = 12
 429 Bck2 = 13
 430 Bck3 = 14
 431 Bck4 = 15
 432 Func1 = 22
 433 Func2 = 23
 434 Func3 = 24
 435 Func4 = 25
 436 Accu1 = 16
 437 Accu2 = 17
 438 Accu3 = 18
 439 Accu4 = 19
 440 XAccu1 = 8
 441 XAccu2 = 9
 442 XAccu3 = 10
 443 XAccu4 = 11
 444 T1      = 4
 445 T2      = 5
 446 T3      = 6
 447 T4      = 7
 448 LoopC = 1
 449 /* byteorder: high number <--> high significance */
 450 sha256_nextBlock:
 451  ; initial, let's make some space ready for local vars
 452         push r4 /* replace push & pop by mem ops? */
 453         push r5
 454         push r6
 455         push r7
 456         push r8
 457         push r9
 458         push r10
 459         push r11
 460         push r12
 461         push r13
 462         push r14
 463         push r15
 464         push r16
 465         push r17
 466         push r28
 467         push r29
 468         in r20, SPL
 469         in r21, SPH
 470         movw r18, r20                   ;backup SP
 471 ;       movw r26, r20                   ; X points to free space on stack
 472         movw r30, r22                   ; Z points to message
 473         subi r20, lo8(sha256_nextBlock_localSpace) ;sbiw can do only up to 63
 474         sbci r21, hi8(sha256_nextBlock_localSpace)
 475         movw r26, r20                   ; X points to free space on stack
 476         in r0, SREG
 477         cli ; we want to be uninterrupted while updating SP
 478         out SPL, r20
 479         out SPH, r21
 480         out SREG, r0
 481         push r18
 482         push r19
 483         push r24
 484         push r25 /* param1 will be needed later */
 485  ; now we fill the w array with message (think about endianess)
 486         adiw r26, 1 ; X++
 487         ldi r20, 16
 488 sha256_nextBlock_wcpyloop:
 489         ld r23, Z+
 490         ld r22, Z+
 491         ld r19, Z+
 492         ld r18, Z+
 493         st X+, r18
 494         st X+, r19
 495         st X+, r22
 496         st X+, r23
 497         dec r20
 498         brne sha256_nextBlock_wcpyloop
 499 /*      for (i=16; i<64; ++i){
 500                 w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16];
 501         } */
 502         /* r25,r24,r23,r24 (r21,r20) are function values
 503            r19,r18,r17,r16 are the accumulator
 504            r15,r14,r13,rBck1 are backup1
 505            r11,r10,r9 ,r8  are xor accu
 506            r1 is round counter                                                          */
 507
 508         ldi r20, 64-16
 509         mov LoopC, r20
 510 sha256_nextBlock_wcalcloop:
 511         movw r30, r26 ; cp X to Z
 512         sbiw r30, 63
 513         sbiw r30, 1             ; substract 64 = 16*4
 514         ld Accu1, Z+
 515         ld Accu2, Z+
 516         ld Accu3, Z+
 517         ld Accu4, Z+ /* w[i] = w[i-16] */
 518         ld Bck1, Z+
 519         ld Bck2, Z+
 520         ld Bck3, Z+
 521         ld Bck4, Z+ /* backup = w[i-15] */
 522         /* now sigma 0 */
 523         mov Func1, Bck2
 524         mov Func2, Bck3
 525         mov Func3, Bck4
 526         mov Func4, Bck1  /* prerotated by 8 */
 527         ldi r20, 1
 528         rcall bitrotl
 529         movw XAccu1, Func1
 530         movw XAccu3, Func3       /* store ROTR(w[i-15],7) in xor accu */
 531         movw Func1, Bck3
 532         movw Func3, Bck1 /* prerotated by 16 */
 533         ldi r20, 2
 534         rcall bitrotr
 535         eor XAccu1, Func1  /* xor ROTR(w[i-15], 18)*/
 536         eor XAccu2, Func2
 537         eor XAccu3, Func3
 538         eor XAccu4, Func4
 539         ldi Func2, 3             /* now shr3 */ /*we can destroy backup now*/
 540 sigma0_shr:
 541         lsr Bck4
 542         ror Bck3
 543         ror Bck2
 544         ror Bck1
 545         dec Func2
 546         brne sigma0_shr
 547         eor XAccu1, Bck1
 548         eor XAccu2, Bck2
 549         eor XAccu3, Bck3
 550         eor XAccu4, Bck4        /* xor SHR(w[i-15], 3)*/ /* xor accu == sigma1(w[i-15]) */
 551         add Accu1, XAccu1
 552         adc Accu2, XAccu2
 553         adc Accu3, XAccu3
 554         adc Accu4, XAccu4 /* finished with sigma0 */
 555         ldd Func1, Z+7*4  /* now accu += w[i-7] */
 556         ldd Func2, Z+7*4+1
 557         ldd Func3, Z+7*4+2
 558         ldd Func4, Z+7*4+3
 559         add Accu1, Func1
 560         adc Accu2, Func2
 561         adc Accu3, Func3
 562         adc Accu4, Func4
 563         ldd Bck1, Z+12*4 /* now backup = w[i-2]*/
 564         ldd Bck2, Z+12*4+1
 565         ldd Bck3, Z+12*4+2
 566         ldd Bck4, Z+12*4+3
 567         /* now sigma 1 */
 568         movw Func1, Bck3
 569         movw Func3, Bck1 /* prerotated by 16 */
 570         ldi r20, 1
 571         rcall bitrotr
 572         movw XAccu3, Func3
 573         movw XAccu1, Func1       /* store in ROTR(w[i-2], 17) xor accu */
 574 ;       movw Func1, Bck3
 575 ;       movw Func3, Bck1 /* prerotated by 16 */
 576         ldi r20, 2
 577         rcall bitrotr
 578         eor XAccu1, Func1  /* xor ROTR(w[i-2], 19)*/
 579         eor XAccu2, Func2
 580         eor XAccu3, Func3
 581         eor XAccu4, Func4
 582         ldi Func2, 2     /* now shr10 (dirty trick, skipping a byte) */ /*we can destroy backup now*/
 583 sigma1_shr:
 584         lsr Bck4
 585         ror Bck3
 586         ror Bck2
 587         dec Func2
 588         brne sigma1_shr
 589         eor XAccu1, Bck2
 590         eor XAccu2, Bck3
 591         eor XAccu3, Bck4  /* xor SHR(w[i-2], 10)*/ /* xor accu == sigma1(w[i-15]) */
 592         add Accu1, XAccu1
 593         adc Accu2, XAccu2
 594         adc Accu3, XAccu3
 595         adc Accu4, XAccu4 /* finished with sigma0 */
 596         /* now let's store the shit */
 597         st X+, Accu1
 598         st X+, Accu2
 599         st X+, Accu3
 600         st X+, Accu4
 601         dec LoopC
 602         breq 3f  ; skip if zero
 603         rjmp sha256_nextBlock_wcalcloop
 604 3:
 605         /* we are finished with w array X points one byte post w */
 606 /* init a array */
 607         pop r31
 608         pop r30
 609         push r30
 610         push r31
 611         ldi r25, 8*4 /* 8 32-bit values to copy from ctx to a array */
 612 init_a_array:
 613         ld r1, Z+
 614         st X+, r1
 615         dec r25
 616         brne init_a_array
 617
 618 /* now the real fun begins */
 619 /* for (i=0; i<64; ++i){
 620                         t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i];
 621                         t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]);
 622                         memmove(&(a[1]), &(a[0]), 7*4);         // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0];
 623                         a[4] += t1;
 624                         a[0] = t1 + t2;
 625                 } */
 626         /* Y points to a[0], Z ('cause lpm wants it) points to k[i], X points to w[i] */
 627         sbiw r26, 8*4  /* X still points at a[7]+1*/
 628         movw r28, r26
 629         ldi r30, lo8(sha256_kv)
 630         ldi r31, hi8(sha256_kv)
 631         dec r27  /* X - (64*4 == 256) */
 632         ldi r25, 64
 633         mov LoopC, r25
 634 sha256_main_loop:
 635         /* now calculate t1 */
 636          /*CH(x,y,z) = (x&y)^((~x)&z)*/
 637         ldd T1, Y+5*4
 638         ldd T2, Y+5*4+1
 639         ldd T3, Y+5*4+2
 640         ldd T4, Y+5*4+3 /* y in T */
 641         ldd Func1, Y+4*4
 642         ldd Func2, Y+4*4+1
 643         ldd Func3, Y+4*4+2
 644         ldd Func4, Y+4*4+3  /* x in Func */
 645         ldd Bck1, Y+6*4
 646         ldd Bck2, Y+6*4+1
 647         ldd Bck3, Y+6*4+2
 648         ldd Bck4, Y+6*4+3 /* z in Bck */
 649         and T1, Func1
 650         and T2, Func2
 651         and T3, Func3
 652         and T4, Func4
 653         com Func1
 654         com Func2
 655         com Func3
 656         com Func4
 657         and Bck1, Func1
 658         and Bck2, Func2
 659         and Bck3, Func3
 660         and Bck4, Func4
 661         eor T1, Bck1
 662         eor T2, Bck2
 663         eor T3, Bck3
 664         eor T4, Bck4 /* done, CH(x,y,z) is in T */
 665         /* now SIGMA1(a[4]) */
 666         ldd Bck4, Y+4*4         /* think about using it from Func reg above*/
 667         ldd Bck1, Y+4*4+1
 668         ldd Bck2, Y+4*4+2
 669         ldd Bck3, Y+4*4+3 /* load prerotate by 8-bit */
 670         movw Func1, Bck1
 671         movw Func3, Bck3
 672         ldi r20, 2
 673         rcall bitrotl           /* rotr(x,6) */
 674         movw XAccu1, Func1
 675         movw XAccu3, Func3
 676         movw Func1, Bck1
 677         movw Func3, Bck3
 678         ldi r20, 3
 679         rcall bitrotr   /* rotr(x,11) */
 680         eor XAccu1, Func1
 681         eor XAccu2, Func2
 682         eor XAccu3, Func3
 683         eor XAccu4, Func4
 684         movw Func1, Bck3 /* this prerotates furteh 16 bits*/
 685         movw Func3, Bck1 /* so we have now prerotated by 24 bits*/
 686         ldi r20, 1
 687         rcall bitrotr   /* rotr(x,11) */
 688         eor XAccu1, Func1
 689         eor XAccu2, Func2
 690         eor XAccu3, Func3
 691         eor XAccu4, Func4 /* finished with SIGMA1, add it to T */
 692         add T1, XAccu1
 693         adc T2, XAccu2
 694         adc T3, XAccu3
 695         adc T4, XAccu4
 696         /* now we've to add a[7], w[i] and k[i] */
 697         ldd XAccu1, Y+4*7
 698         ldd XAccu2, Y+4*7+1
 699         ldd XAccu3, Y+4*7+2
 700         ldd XAccu4, Y+4*7+3
 701         add T1, XAccu1
 702         adc T2, XAccu2
 703         adc T3, XAccu3
 704         adc T4, XAccu4 /* add a[7] */
 705         ld XAccu1, X+
 706         ld XAccu2, X+
 707         ld XAccu3, X+
 708         ld XAccu4, X+
 709         add T1, XAccu1
 710         adc T2, XAccu2
 711         adc T3, XAccu3
 712         adc T4, XAccu4 /* add w[i] */
 713         lpm XAccu1, Z+
 714         lpm XAccu2, Z+
 715         lpm XAccu3, Z+
 716         lpm XAccu4, Z+
 717         add T1, XAccu1
 718         adc T2, XAccu2
 719         adc T3, XAccu3
 720         adc T4, XAccu4 /* add k[i] */ /* finished with t1 */
 721         /*now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */ /*i did to much x86 asm, i always see 4 32bit regs*/
 722                 /* starting with MAJ(x,y,z) */
 723         ldd Func1, Y+4*0+0
 724         ldd Func2, Y+4*0+1
 725         ldd Func3, Y+4*0+2
 726         ldd Func4, Y+4*0+3 /* load x=a[0] */
 727         ldd XAccu1, Y+4*1+0
 728         ldd XAccu2, Y+4*1+1
 729         ldd XAccu3, Y+4*1+2
 730         ldd XAccu4, Y+4*1+3 /* load y=a[1] */
 731         and XAccu1, Func1
 732         and XAccu2, Func2
 733         and XAccu3, Func3
 734         and XAccu4, Func4       /* XAccu == (x & y) */
 735         ldd Bck1, Y+4*2+0
 736         ldd Bck2, Y+4*2+1
 737         ldd Bck3, Y+4*2+2
 738         ldd Bck4, Y+4*2+3 /* load z=a[2] */
 739         and Func1, Bck1
 740         and Func2, Bck2
 741         and Func3, Bck3
 742         and Func4, Bck4
 743         eor XAccu1, Func1
 744         eor XAccu2, Func2
 745         eor XAccu3, Func3
 746         eor XAccu4, Func4       /* XAccu == (x & y) ^ (x & z) */
 747         ldd Func1, Y+4*1+0
 748         ldd Func2, Y+4*1+1
 749         ldd Func3, Y+4*1+2
 750         ldd Func4, Y+4*1+3 /* load y=a[1] */
 751         and Func1, Bck1
 752         and Func2, Bck2
 753         and Func3, Bck3
 754         and Func4, Bck4
 755         eor XAccu1, Func1
 756         eor XAccu2, Func2
 757         eor XAccu3, Func3
 758         eor XAccu4, Func4       /* XAccu == Maj(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */
 759         /* SIGMA0(a[0]) */
 760         ldd Bck1, Y+4*0+0 /* we should combine this with above */
 761         ldd Bck2, Y+4*0+1
 762         ldd Bck3, Y+4*0+2
 763         ldd Bck4, Y+4*0+3
 764         movw Func1, Bck1
 765         movw Func3, Bck3
 766         ldi r20, 2
 767         rcall bitrotr
 768         movw Accu1, Func1
 769         movw Accu3, Func3 /* Accu = shr(a[0], 2) */
 770         movw Func1, Bck3
 771         movw Func3, Bck1 /* prerotate by 16 bits */
 772         ldi r20, 3
 773         rcall bitrotl
 774         eor Accu1, Func1
 775         eor Accu2, Func2
 776         eor Accu3, Func3
 777         eor Accu4, Func4 /* Accu ^= shr(a[0], 13) */
 778         mov Func1, Bck4
 779         mov Func2, Bck1
 780         mov Func3, Bck2
 781         mov Func4, Bck3  /* prerotate by 24 bits */
 782         ldi r20, 2
 783         rcall bitrotl
 784         eor Accu1, Func1
 785         eor Accu2, Func2
 786         eor Accu3, Func3
 787         eor Accu4, Func4 /* Accu ^= shr(a[0], 22) */
 788         add Accu1, XAccu1 /* add previous result (MAJ)*/
 789         adc Accu2, XAccu2
 790         adc Accu3, XAccu3
 791         adc Accu4, XAccu4
 792         /* now we are finished with the computing stuff (t1 in T, t2 in Accu)*/
 793         /* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */
 794
 795         ldi r21, 7*4
 796         adiw r28, 7*4
 797 a_shift_loop:
 798         ld  r25, -Y /* warning: this is PREdecrement */
 799         std Y+4, r25
 800         dec r21
 801         brne a_shift_loop
 802
 803         ldd Bck1, Y+4*4+0
 804         ldd Bck2, Y+4*4+1
 805         ldd Bck3, Y+4*4+2
 806         ldd Bck4, Y+4*4+3
 807         add Bck1, T1
 808         adc Bck2, T2
 809         adc Bck3, T3
 810         adc Bck4, T4
 811         std Y+4*4+0, Bck1
 812         std Y+4*4+1, Bck2
 813         std Y+4*4+2, Bck3
 814         std Y+4*4+3, Bck4
 815         add Accu1, T1
 816         adc Accu2, T2
 817         adc Accu3, T3
 818         adc Accu4, T4
 819         std Y+4*0+0, Accu1
 820         std Y+4*0+1, Accu2
 821         std Y+4*0+2, Accu3
 822         std Y+4*0+3, Accu4 /* a array updated */
 823
 824
 825         dec LoopC
 826         breq update_state
 827         rjmp sha256_main_loop ;brne sha256_main_loop
 828 update_state:
 829         /* update state */
 830         /* pointers to state should still exist on the stack ;-) */
 831         pop r31
 832         pop r30
 833         ldi r21, 8
 834 update_state_loop:
 835         ldd Accu1, Z+0
 836         ldd Accu2, Z+1
 837         ldd Accu3, Z+2
 838         ldd Accu4, Z+3
 839         ld Func1, Y+
 840         ld Func2, Y+
 841         ld Func3, Y+
 842         ld Func4, Y+
 843         add Accu1, Func1
 844         adc Accu2, Func2
 845         adc Accu3, Func3
 846         adc Accu4, Func4
 847         st Z+, Accu1
 848         st Z+, Accu2
 849         st Z+, Accu3
 850         st Z+, Accu4
 851         dec r21
 852         brne update_state_loop
 853         /* now we just have to update the length */
 854         adiw r30, 1 /* since we add 512, we can simply skip the LSB */
 855         ldi r21, 2
 856         ldi r22, 6
 857         ld r20, Z
 858         add r20, r21
 859         st Z+, r20
 860         clr r21
 861 sha256_nextBlock_fix_length:
 862         brcc sha256_nextBlock_epilog
 863         ld r20, Z
 864         adc r20, r21
 865         st Z+, r20
 866         dec r22
 867         brne sha256_nextBlock_fix_length
 868
 869 ; EPILOG
 870 sha256_nextBlock_epilog:
 871 /* now we should clean up the stack */
 872
 873         pop r21
 874         pop r20
 875         in r0, SREG
 876         cli ; we want to be uninterrupted while updating SP
 877         out SPL, r20
 878         out SPH, r21
 879         out SREG, r0
 880
 881         clr r1
 882         pop r29
 883         pop r28
 884         pop r17
 885         pop r16
 886         pop r15
 887         pop r14
 888         pop r13
 889         pop r12
 890         pop r11
 891         pop r10
 892         pop r9
 893         pop r8
 894         pop r7
 895         pop r6
 896         pop r5
 897         pop r4
 898         ret
 899
 900 sha256_kv: ; round-key-vector stored in ProgMem
 901 .word   0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c
 902 .word   0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b
 903 .word   0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9
 904 .word   0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429
 905 .word   0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272
 906 .word   0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a
 907 .word   0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e
 908 .word   0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671
 909
 910
 911 ;###########################################################
 912
 913 .global sha256_init
 914 ;uint32_t sha256_init_vector[]={
 915 ;       0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
 916 ;       0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };
 917 ;
 918 ;void sha256_init(sha256_ctx_t *state){
 919 ;       state->length=0;
 920 ;       memcpy(state->h, sha256_init_vector, 8*4);
 921 ;}
 922 ; param1: (r23,r24) 16-bit pointer to sha256_ctx_t struct in ram
 923 ; modifys: Z(r30,r31), Func1, r22
 924 sha256_init:
 925         movw r26, r24 ; (24,25) --> (26,27) load X with param1
 926         ldi r30, lo8((sha256_init_vector))
 927         ldi r31, hi8((sha256_init_vector))
 928         ldi r22, 32+8
 929 sha256_init_vloop:
 930         lpm r23, Z+
 931         st X+, r23
 932         dec r22
 933         brne sha256_init_vloop
 934         ret
 935
 936 sha256_init_vector:
 937 .word 0xE667, 0x6A09
 938 .word 0xAE85, 0xBB67
 939 .word 0xF372, 0x3C6E
 940 .word 0xF53A, 0xA54F
 941 .word 0x527F, 0x510E
 942 .word 0x688C, 0x9B05
 943 .word 0xD9AB, 0x1F83
 944 .word 0xCD19, 0x5BE0
 945 .word 0x0000, 0x0000
 946 .word 0x0000, 0x0000
 947
 948 ;###########################################################
 949
 950 .global rotl32
 951 ; === ROTL32 ===
 952 ; function that rotates a 32 bit word to the left
 953 ;  param1: the 32-bit word to rotate
 954 ;       given in r25,r24,r23,r22 (r25 is most significant)
 955 ;  param2: an 8-bit value telling how often to rotate
 956 ;       given in r20
 957 ; modifys: r21, r22
 958 rotl32:
 959         cpi r20, 8
 960         brlo bitrotl
 961         mov r21, r25
 962         mov r25, r24
 963         mov r24, r23
 964         mov r23, r22
 965         mov r22, r21
 966         subi r20, 8
 967         rjmp rotl32
 968 bitrotl:
 969         clr r21
 970         clc
 971 bitrotl_loop:
 972         tst r20
 973         breq fixrotl
 974         rol r22
 975         rol r23
 976         rol r24
 977         rol r25
 978         rol r21
 979         dec r20
 980         rjmp bitrotl_loop
 981 fixrotl:
 982         or r22, r21
 983         ret
 984
 985
 986 ;###########################################################
 987
 988 .global rotr32
 989 ; === ROTR32 ===
 990 ; function that rotates a 32 bit word to the right
 991 ;  param1: the 32-bit word to rotate
 992 ;       given in r25,r24,r23,22 (r25 is most significant)
 993 ;  param2: an 8-bit value telling how often to rotate
 994 ;       given in r20
 995 ; modifys: r21, r22
 996 rotr32:
 997         cpi r20, 8
 998         brlo bitrotr
 999         mov r21, r22
1000         mov r22, r23
1001         mov r23, r24
1002         mov r24, r25
1003         mov r25, r21
1004         subi r20, 8
1005         rjmp rotr32
1006 bitrotr:
1007         clr r21
1008         clc
1009 bitrotr_loop:
1010         tst r20
1011         breq fixrotr
1012         ror r25
1013         ror r24
1014         ror r23
1015         ror r22
1016         ror r21
1017         dec r20
1018         rjmp bitrotr_loop
1019 fixrotr:
1020         or r25, r21
1021         ret
1022
1023
1024 ;###########################################################
1025
1026 .global change_endian32
1027 ; === change_endian32 ===
1028 ; function that changes the endianess of a 32-bit word
1029 ;  param1: the 32-bit word
1030 ;       given in r25,r24,r23,22 (r25 is most significant)
1031 ;  modifys: r21, r22
1032 change_endian32:
1033         movw r20,  r22 ; (r22,r23) --> (r20,r21)
1034         mov r22, r25
1035         mov r23, r24
1036         mov r24, r21
1037         mov r25, r20
1038         ret
1039