/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// AES-NI optimized AES-GCM for x86_64
//
// Copyright 2024 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>
//
//------------------------------------------------------------------------------
//
// This file is dual-licensed, meaning that you can use it under your choice of
// either of the following two licenses:
//
// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
// of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// or
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
//------------------------------------------------------------------------------
//
// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
// support the original set of AES instructions, i.e. AES-NI. Two
// implementations are provided, one that uses AVX and one that doesn't. They
// are very similar, being generated by the same macros. The only difference is
// that the AVX implementation takes advantage of VEX-coded instructions in some
// places to avoid some 'movdqu' and 'movdqa' instructions. The AVX
// implementation does *not* use 256-bit vectors, as AES is not supported on
// 256-bit vectors until the VAES feature (which this file doesn't target).
//
// The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1
// for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems
// there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.)
//
// The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is
// more thoroughly commented. This file has the following notable changes:
//
//    - The vector length is fixed at 128-bit, i.e. xmm registers. This means
//      there is only one AES block (and GHASH block) per register.
//
//    - Without AVX512 / AVX10, only 16 SIMD registers are available instead of
//      32. We work around this by being much more careful about using
//      registers, relying heavily on loads to load values as they are needed.
//
//    - Masking is not available either. We work around this by implementing
//      partial block loads and stores using overlapping scalar loads and stores
//      combined with shifts and SSE4.1 insertion and extraction instructions.
//
//    - The main loop is organized differently due to the different design
//      constraints. First, with just one AES block per SIMD register, on some
//      CPUs 4 registers don't saturate the 'aesenc' throughput. We therefore
//      do an 8-register wide loop. Considering that and the fact that we have
//      just 16 SIMD registers to work with, it's not feasible to cache AES
//      round keys and GHASH key powers in registers across loop iterations.
//      That's not ideal, but also not actually that bad, since loads can run in
//      parallel with other instructions. Significantly, this also makes it
//      possible to roll up the inner loops, relying on hardware loop unrolling
//      instead of software loop unrolling, greatly reducing code size.
//
//    - We implement the GHASH multiplications in the main loop using Karatsuba
//      multiplication instead of schoolbook multiplication. This saves one
//      pclmulqdq instruction per block, at the cost of one 64-bit load, one
//      pshufd, and 0.25 pxors per block. (This is without the three-argument
//      XOR support that would be provided by AVX512 / AVX10, which would be
//      more beneficial to schoolbook than Karatsuba.)
//
//      As a rough approximation, we can assume that Karatsuba multiplication is
//      faster than schoolbook multiplication in this context if one pshufd and
//      0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit
//      load is "free" due to running in parallel with arithmetic instructions.)
//      This is true on AMD CPUs, including all that support pclmulqdq up to at
//      least Zen 3. It's also true on older Intel CPUs: Westmere through
//      Haswell on the Core side, and Silvermont through Goldmont Plus on the
//      low-power side. On some of these CPUs, pclmulqdq is quite slow, and the
//      benefit of Karatsuba should be substantial. On newer Intel CPUs,
//      schoolbook multiplication should be faster, but only marginally.
//
//      Not all these CPUs were available to be tested. However, benchmarks on
//      available CPUs suggest that this approximation is plausible. Switching
//      to Karatsuba showed negligible change (< 1%) on Intel Broadwell,
//      Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%.
//      Considering that and the fact that Karatsuba should be even more
//      beneficial on older Intel CPUs, it seems like the right choice here.
//
//      An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be
//      saved by using a multiplication-less reduction method. We don't do that
//      because it would require a large number of shift and xor instructions,
//      making it less worthwhile and likely harmful on newer CPUs.
//
//      It does make sense to sometimes use a different reduction optimization
//      that saves a pclmulqdq, though: precompute the hash key times x^64, and
//      multiply the low half of the data block by the hash key with the extra
//      factor of x^64. This eliminates one step of the reduction. However,
//      this is incompatible with Karatsuba multiplication. Therefore, for
//      multi-block processing we use Karatsuba multiplication with a regular
//      reduction. For single-block processing, we use the x^64 optimization.

#include <linux/linkage.h>

.section .rodata
.p2align 4
.Lbswap_mask:
	.octa	0x000102030405060708090a0b0c0d0e0f
.Lgfpoly:
	.quad	0xc200000000000000
.Lone:
	.quad	1
.Lgfpoly_and_internal_carrybit:
	.octa	0xc2000000000000010000000000000001
	// Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of
	// 'len' 0xff bytes and the rest zeroes.
.Lzeropad_mask:
	.octa	0xffffffffffffffffffffffffffffffff
	.octa	0

// Offsets in struct aes_gcm_key_aesni
#define OFFSETOF_AESKEYLEN	480
#define OFFSETOF_H_POWERS	496
#define OFFSETOF_H_POWERS_XORED	624
#define OFFSETOF_H_TIMES_X64	688

.text

// Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq. The fallback
// assumes that all operands are distinct and that any mem operand is aligned.
.macro	_vpclmulqdq	imm, src1, src2, dst
.if USE_AVX
	vpclmulqdq	\imm, \src1, \src2, \dst
.else
	movdqa		\src2, \dst
	pclmulqdq	\imm, \src1, \dst
.endif
.endm

// Do a vpshufb, or fall back to a movdqa and a pshufb. The fallback assumes
// that all operands are distinct and that any mem operand is aligned.
.macro	_vpshufb	src1, src2, dst
.if USE_AVX
	vpshufb		\src1, \src2, \dst
.else
	movdqa		\src2, \dst
	pshufb		\src1, \dst
.endif
.endm

// Do a vpand, or fall back to a movdqu and a pand. The fallback assumes that
// all operands are distinct.
.macro	_vpand	src1, src2, dst
.if USE_AVX
	vpand		\src1, \src2, \dst
.else
	movdqu		\src1, \dst
	pand		\src2, \dst
.endif
.endm

// XOR the unaligned memory operand \mem into the xmm register \reg. \tmp must
// be a temporary xmm register.
.macro	_xor_mem_to_reg	mem, reg, tmp
.if USE_AVX
	vpxor		\mem, \reg, \reg
.else
	movdqu		\mem, \tmp
	pxor		\tmp, \reg
.endif
.endm

// Test the unaligned memory operand \mem against the xmm register \reg. \tmp
// must be a temporary xmm register.
.macro	_test_mem	mem, reg, tmp
.if USE_AVX
	vptest		\mem, \reg
.else
	movdqu		\mem, \tmp
	ptest		\tmp, \reg
.endif
.endm

// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}.
.macro	_load_partial_block	src, dst, tmp64, tmp32
	sub		$8, %ecx		// LEN - 8
	jle		.Lle8\@

	// Load 9 <= LEN <= 15 bytes.
	movq		(\src), \dst		// Load first 8 bytes
	mov		(\src, %rcx), %rax	// Load last 8 bytes
	neg		%ecx
	shl		$3, %ecx
	shr		%cl, %rax		// Discard overlapping bytes
	pinsrq		$1, %rax, \dst
	jmp		.Ldone\@

.Lle8\@:
	add		$4, %ecx		// LEN - 4
	jl		.Llt4\@

	// Load 4 <= LEN <= 8 bytes.
	mov		(\src), %eax		// Load first 4 bytes
	mov		(\src, %rcx), \tmp32	// Load last 4 bytes
	jmp		.Lcombine\@

.Llt4\@:
	// Load 1 <= LEN <= 3 bytes.
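	// For example, if LEN == 3, then %ecx == -1 at this point: the first
	// byte is loaded directly below, the last two bytes are loaded from
	// offset LEN - 2 == 1, and .Lcombine\@ shifts them left by
	// 8*(LEN - 2) == 8 bits so that they land in byte positions 1-2 before
	// being OR'ed with the first byte. (For LEN == 2 the shift is 0 and
	// byte 0 just harmlessly overlaps itself.)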
	add		$2, %ecx		// LEN - 2
	movzbl		(\src), %eax		// Load first byte
	jl		.Lmovq\@
	movzwl		(\src, %rcx), \tmp32	// Load last 2 bytes
.Lcombine\@:
	shl		$3, %ecx
	shl		%cl, \tmp64
	or		\tmp64, %rax		// Combine the two parts
.Lmovq\@:
	movq		%rax, \dst
.Ldone\@:
.endm

// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
// Clobbers %rax, %rcx, and %rsi.
.macro	_store_partial_block	src, dst
	sub		$8, %ecx		// LEN - 8
	jl		.Llt8\@

	// Store 8 <= LEN <= 15 bytes.
	pextrq		$1, \src, %rax
	mov		%ecx, %esi
	shl		$3, %ecx
	ror		%cl, %rax
	mov		%rax, (\dst, %rsi)	// Store last LEN - 8 bytes
	movq		\src, (\dst)		// Store first 8 bytes
	jmp		.Ldone\@

.Llt8\@:
	add		$4, %ecx		// LEN - 4
	jl		.Llt4\@

	// Store 4 <= LEN <= 7 bytes.
	pextrd		$1, \src, %eax
	mov		%ecx, %esi
	shl		$3, %ecx
	ror		%cl, %eax
	mov		%eax, (\dst, %rsi)	// Store last LEN - 4 bytes
	movd		\src, (\dst)		// Store first 4 bytes
	jmp		.Ldone\@

.Llt4\@:
	// Store 1 <= LEN <= 3 bytes.
	pextrb		$0, \src, 0(\dst)
	cmp		$-2, %ecx	// LEN - 4 == -2, i.e. LEN == 2?
	jl		.Ldone\@
	pextrb		$1, \src, 1(\dst)
	je		.Ldone\@
	pextrb		$2, \src, 2(\dst)
.Ldone\@:
.endm

// Do one step of GHASH-multiplying \a by \b and storing the reduced product in
// \b. To complete all steps, this must be invoked with \i=0 through \i=9.
// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the
// .Lgfpoly constant, and \t0-\t1 must be temporary registers.
.macro	_ghash_mul_step	i, a, a_times_x64, b, gfpoly, t0, t1
	// MI = (a_L * b_H) + ((a*x^64)_L * b_L)
.if \i == 0
	_vpclmulqdq	$0x01, \a, \b, \t0
.elseif \i == 1
	_vpclmulqdq	$0x00, \a_times_x64, \b, \t1
.elseif \i == 2
	pxor		\t1, \t0
	// HI = (a_H * b_H) + ((a*x^64)_H * b_L)
.elseif \i == 3
	_vpclmulqdq	$0x11, \a, \b, \t1
.elseif \i == 4
	pclmulqdq	$0x10, \a_times_x64, \b
.elseif \i == 5
	pxor		\t1, \b
.elseif \i == 6
	// Fold MI into HI.
	pshufd		$0x4e, \t0, \t1		// Swap halves of MI
.elseif \i == 7
	pclmulqdq	$0x00, \gfpoly, \t0	// MI_L*(x^63 + x^62 + x^57)
.elseif \i == 8
	pxor		\t1, \b
.elseif \i == 9
	pxor		\t0, \b
.endif
.endm

// GHASH-multiply \a by \b and store the reduced product in \b.
// See _ghash_mul_step for details.
.macro	_ghash_mul	a, a_times_x64, b, gfpoly, t0, t1
.irp i, 0,1,2,3,4,5,6,7,8,9
	_ghash_mul_step	\i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
.endr
.endm

// GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi.
// This does Karatsuba multiplication and must be paired with _ghash_reduce. On
// the first call, \lo, \mi, and \hi must be zero. \a_xored must contain the
// two halves of \a XOR'd together, i.e. a_L + a_H. \b is clobbered.
.macro	_ghash_mul_noreduce	a, a_xored, b, lo, mi, hi, t0
	// LO += a_L * b_L
	_vpclmulqdq	$0x00, \a, \b, \t0
	pxor		\t0, \lo

	// b_L + b_H
	pshufd		$0x4e, \b, \t0
	pxor		\b, \t0

	// HI += a_H * b_H
	pclmulqdq	$0x11, \a, \b
	pxor		\b, \hi

	// MI += (a_L + a_H) * (b_L + b_H)
	pclmulqdq	$0x00, \a_xored, \t0
	pxor		\t0, \mi
.endm

// Reduce the product from \lo, \mi, and \hi, and store the result in \dst.
// This assumes that _ghash_mul_noreduce was used.
.macro	_ghash_reduce	lo, mi, hi, dst, t0
	movq		.Lgfpoly(%rip), \t0

	// MI += LO + HI (needed because we used Karatsuba multiplication)
	pxor		\lo, \mi
	pxor		\hi, \mi

	// Fold LO into MI.
	pshufd		$0x4e, \lo, \dst
	pclmulqdq	$0x00, \t0, \lo
	pxor		\dst, \mi
	pxor		\lo, \mi

	// Fold MI into HI.
	pshufd		$0x4e, \mi, \dst
	pclmulqdq	$0x00, \t0, \mi
	pxor		\hi, \dst
	pxor		\mi, \dst
.endm

// Do the first step of the GHASH update of a set of 8 ciphertext blocks.
//
// The whole GHASH update does:
//
//	GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 +
//		    blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1
//
// This macro just does the first step: it does the unreduced multiplication
// (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm
// registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the
// inner block counter in %rax, which is a value that counts up by 8 for each
// block in the set of 8 and is used later to index by 8*blknum and 16*blknum.
//
// To reduce the number of pclmulqdq instructions required, both this macro and
// _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook
// multiplication. See the file comment for more details about this choice.
//
// Both macros expect the ciphertext blocks blk[0-7] to be available at DST if
// encrypting, or SRC if decrypting. They also expect the precomputed hash key
// powers H^i and their XOR'd-together halves to be available in the struct
// pointed to by KEY. Both macros clobber TMP[0-2].
.macro	_ghash_update_begin_8x	enc

	// Initialize the inner block counter.
	xor		%eax, %eax

	// Load the highest hash key power, H^8.
	movdqa		OFFSETOF_H_POWERS(KEY), TMP0

	// Load the first ciphertext block and byte-reflect it.
.if \enc
	movdqu		(DST), TMP1
.else
	movdqu		(SRC), TMP1
.endif
	pshufb		BSWAP_MASK, TMP1

	// Add the GHASH accumulator to the ciphertext block to get the block
	// 'b' that needs to be multiplied with the hash key power 'a'.
	pxor		TMP1, GHASH_ACC

	// b_L + b_H
	pshufd		$0x4e, GHASH_ACC, MI
	pxor		GHASH_ACC, MI

	// LO = a_L * b_L
	_vpclmulqdq	$0x00, TMP0, GHASH_ACC, LO

	// HI = a_H * b_H
	pclmulqdq	$0x11, TMP0, GHASH_ACC

	// MI = (a_L + a_H) * (b_L + b_H)
	pclmulqdq	$0x00, OFFSETOF_H_POWERS_XORED(KEY), MI
.endm

// Continue the GHASH update of 8 ciphertext blocks as described above by doing
// an unreduced multiplication of the next ciphertext block by the next lowest
// key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI.
.macro	_ghash_update_continue_8x	enc
	add		$8, %eax

	// Load the next lowest key power.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), TMP0

	// Load the next ciphertext block and byte-reflect it.
.if \enc
	movdqu		(DST,%rax,2), TMP1
.else
	movdqu		(SRC,%rax,2), TMP1
.endif
	pshufb		BSWAP_MASK, TMP1

	// LO += a_L * b_L
	_vpclmulqdq	$0x00, TMP0, TMP1, TMP2
	pxor		TMP2, LO

	// b_L + b_H
	pshufd		$0x4e, TMP1, TMP2
	pxor		TMP1, TMP2

	// HI += a_H * b_H
	pclmulqdq	$0x11, TMP0, TMP1
	pxor		TMP1, GHASH_ACC

	// MI += (a_L + a_H) * (b_L + b_H)
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1
	pclmulqdq	$0x00, TMP1, TMP2
	pxor		TMP2, MI
.endm

// Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC. This is similar to
// _ghash_reduce, but it's hardcoded to use the registers of the main loop and
// it uses the same register for HI and the destination. It's also divided into
// two steps. TMP1 must be preserved across steps.
//
// One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of
// shuffling LO, XOR'ing LO into MI, and shuffling MI. However, this would
// increase the critical path length, and it seems to slightly hurt performance.
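//
// As a reminder of why step 0 starts by adding LO and HI into MI: Karatsuba
// multiplication leaves MI holding (a_L + a_H)*(b_L + b_H), and in GF(2)
//
//	(a_L + a_H)*(b_L + b_H) = a_L*b_L + a_H*b_H + (a_L*b_H + a_H*b_L),
//
// so the true middle term a_L*b_H + a_H*b_L is recovered as MI + LO + HI.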
.macro	_ghash_update_end_8x_step	i
.if \i == 0
	movq		.Lgfpoly(%rip), TMP1
	pxor		LO, MI
	pxor		GHASH_ACC, MI
	pshufd		$0x4e, LO, TMP2
	pclmulqdq	$0x00, TMP1, LO
	pxor		TMP2, MI
	pxor		LO, MI
.elseif \i == 1
	pshufd		$0x4e, MI, TMP2
	pclmulqdq	$0x00, TMP1, MI
	pxor		TMP2, GHASH_ACC
	pxor		MI, GHASH_ACC
.endif
.endm

// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key);
//
// Given the expanded AES key, derive the GHASH subkey and initialize the GHASH
// related fields in the key struct.
.macro	_aes_gcm_precompute

	// Function arguments
	.set	KEY,		%rdi

	// Additional local variables.
	// %xmm0-%xmm1 and %rax are used as temporaries.
	.set	RNDKEYLAST_PTR,	%rsi
	.set	H_CUR,		%xmm2
	.set	H_POW1,		%xmm3	// H^1
	.set	H_POW1_X64,	%xmm4	// H^1 * x^64
	.set	GFPOLY,		%xmm5

	// Encrypt an all-zeroes block to get the raw hash subkey.
	movl		OFFSETOF_AESKEYLEN(KEY), %eax
	lea		6*16(KEY,%rax,4), RNDKEYLAST_PTR
	movdqa		(KEY), H_POW1	// Zero-th round key XOR all-zeroes block
	lea		16(KEY), %rax
1:
	aesenc		(%rax), H_POW1
	add		$16, %rax
	cmp		%rax, RNDKEYLAST_PTR
	jne		1b
	aesenclast	(RNDKEYLAST_PTR), H_POW1

	// Preprocess the raw hash subkey as needed to operate on GHASH's
	// bit-reflected values directly: reflect its bytes, then multiply it by
	// x^-1 (using the backwards interpretation of polynomial coefficients
	// from the GCM spec) or equivalently x^1 (using the alternative,
	// natural interpretation of polynomial coefficients).
	pshufb		.Lbswap_mask(%rip), H_POW1
	movdqa		H_POW1, %xmm0
	pshufd		$0xd3, %xmm0, %xmm0
	psrad		$31, %xmm0
	paddq		H_POW1, H_POW1
	pand		.Lgfpoly_and_internal_carrybit(%rip), %xmm0
	pxor		%xmm0, H_POW1

	// Store H^1.
	movdqa		H_POW1, OFFSETOF_H_POWERS+7*16(KEY)

	// Compute and store H^1 * x^64.
	movq		.Lgfpoly(%rip), GFPOLY
	pshufd		$0x4e, H_POW1, %xmm0
	_vpclmulqdq	$0x00, H_POW1, GFPOLY, H_POW1_X64
	pxor		%xmm0, H_POW1_X64
	movdqa		H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY)

	// Compute and store the halves of H^1 XOR'd together.
	pxor		H_POW1, %xmm0
	movq		%xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY)

	// Compute and store the remaining key powers H^2 through H^8.
	movdqa		H_POW1, H_CUR
	mov		$6*8, %eax
.Lprecompute_next\@:
	// Compute H^i = H^{i-1} * H^1.
	_ghash_mul	H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1
	// Store H^i.
	movdqa		H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2)
	// Compute and store the halves of H^i XOR'd together.
	pshufd		$0x4e, H_CUR, %xmm0
	pxor		H_CUR, %xmm0
	movq		%xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax)
	sub		$8, %eax
	jge		.Lprecompute_next\@

	RET
.endm

// void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
//				 u8 ghash_acc[16], const u8 *aad, int aadlen);
//
// This function processes the AAD (Additional Authenticated Data) in GCM.
// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all
// zeroes. |aadlen| must be a multiple of 16, except on the last call where it
// can be any length. The caller must do any buffering needed to ensure this.
.macro	_aes_gcm_aad_update

	// Function arguments
	.set	KEY,		%rdi
	.set	GHASH_ACC_PTR,	%rsi
	.set	AAD,		%rdx
	.set	AADLEN,		%ecx
	// Note: _load_partial_block relies on AADLEN being in %ecx.

	// Additional local variables.
	// %rax, %r10, and %xmm0-%xmm1 are used as temporary registers.
	.set	BSWAP_MASK,	%xmm2
	.set	GHASH_ACC,	%xmm3
	.set	H_POW1,		%xmm4	// H^1
	.set	H_POW1_X64,	%xmm5	// H^1 * x^64
	.set	GFPOLY,		%xmm6

	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movdqu		(GHASH_ACC_PTR), GHASH_ACC
	movdqa		OFFSETOF_H_POWERS+7*16(KEY), H_POW1
	movdqa		OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
	movq		.Lgfpoly(%rip), GFPOLY

	// Process the AAD one full block at a time.
	sub		$16, AADLEN
	jl		.Laad_loop_1x_done\@
.Laad_loop_1x\@:
	movdqu		(AAD), %xmm0
	pshufb		BSWAP_MASK, %xmm0
	pxor		%xmm0, GHASH_ACC
	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
	add		$16, AAD
	sub		$16, AADLEN
	jge		.Laad_loop_1x\@
.Laad_loop_1x_done\@:

	// Check whether there is a partial block at the end.
	add		$16, AADLEN
	jz		.Laad_done\@

	// Process a partial block of length 1 <= AADLEN <= 15.
	// _load_partial_block assumes that %ecx contains AADLEN.
	_load_partial_block	AAD, %xmm0, %r10, %r10d
	pshufb		BSWAP_MASK, %xmm0
	pxor		%xmm0, GHASH_ACC
	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1

.Laad_done\@:
	movdqu		GHASH_ACC, (GHASH_ACC_PTR)
	RET
.endm

// Increment LE_CTR eight times to generate eight little-endian counter blocks,
// swap each to big-endian, and store them in AESDATA[0-7]. Also XOR them with
// the zero-th AES round key. Clobbers TMP0 and TMP1.
.macro	_ctr_begin_8x
	movq		.Lone(%rip), TMP0
	movdqa		(KEY), TMP1		// zero-th round key
.irp i, 0,1,2,3,4,5,6,7
	_vpshufb	BSWAP_MASK, LE_CTR, AESDATA\i
	pxor		TMP1, AESDATA\i
	paddd		TMP0, LE_CTR
.endr
.endm

// Do a non-last round of AES on AESDATA[0-7] using \round_key.
.macro	_aesenc_8x	round_key
.irp i, 0,1,2,3,4,5,6,7
	aesenc		\round_key, AESDATA\i
.endr
.endm

// Do the last round of AES on AESDATA[0-7] using \round_key.
.macro	_aesenclast_8x	round_key
.irp i, 0,1,2,3,4,5,6,7
	aesenclast	\round_key, AESDATA\i
.endr
.endm

// XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and
// store the result to DST. Clobbers TMP0.
.macro	_xor_data_8x
.irp i, 0,1,2,3,4,5,6,7
	_xor_mem_to_reg	\i*16(SRC), AESDATA\i, tmp=TMP0
.endr
.irp i, 0,1,2,3,4,5,6,7
	movdqu		AESDATA\i, \i*16(DST)
.endr
.endm

// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key,
//					  const u32 le_ctr[4], u8 ghash_acc[16],
//					  const u8 *src, u8 *dst, int datalen);
//
// This macro generates a GCM encryption or decryption update function with the
// above prototype (with \enc selecting which one).
//
// This function computes the next portion of the CTR keystream, XOR's it with
// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the
// next |datalen| ciphertext bytes.
//
// |datalen| must be a multiple of 16, except on the last call where it can be
// any length. The caller must do any buffering needed to ensure this. Both
// in-place and out-of-place en/decryption are supported.
//
// |le_ctr| must give the current counter in little-endian format. For a new
// message, the low word of the counter must be 2. This function loads the
// counter from |le_ctr| and increments the loaded counter as needed, but it
// does *not* store the updated counter back to |le_ctr|. The caller must
// update |le_ctr| if any more data segments follow. Internally, only the low
// 32-bit word of the counter is incremented, following the GCM standard.
.macro	_aes_gcm_update	enc

	// Function arguments
	.set	KEY,		%rdi
	.set	LE_CTR_PTR,	%rsi	// Note: overlaps with usage as temp reg
	.set	GHASH_ACC_PTR,	%rdx
	.set	SRC,		%rcx
	.set	DST,		%r8
	.set	DATALEN,	%r9d
	.set	DATALEN64,	%r9	// Zero-extend DATALEN before using!
	// Note: the code setting up for _load_partial_block assumes that SRC is
	// in %rcx (and that DATALEN is *not* in %rcx).

	// Additional local variables

	// %rax and %rsi are used as temporary registers. Note: %rsi overlaps
	// with LE_CTR_PTR, which is used only at the beginning.
	.set	AESKEYLEN,	%r10d	// AES key length in bytes
	.set	AESKEYLEN64,	%r10
	.set	RNDKEYLAST_PTR,	%r11	// Pointer to last AES round key

	// Put the most frequently used values in %xmm0-%xmm7 to reduce code
	// size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.)
	.set	TMP0,		%xmm0
	.set	TMP1,		%xmm1
	.set	TMP2,		%xmm2
	.set	LO,		%xmm3	// Low part of unreduced product
	.set	MI,		%xmm4	// Middle part of unreduced product
	.set	GHASH_ACC,	%xmm5	// GHASH accumulator; in main loop also
					// the high part of unreduced product
	.set	BSWAP_MASK,	%xmm6	// Shuffle mask for reflecting bytes
	.set	LE_CTR,		%xmm7	// Little-endian counter value
	.set	AESDATA0,	%xmm8
	.set	AESDATA1,	%xmm9
	.set	AESDATA2,	%xmm10
	.set	AESDATA3,	%xmm11
	.set	AESDATA4,	%xmm12
	.set	AESDATA5,	%xmm13
	.set	AESDATA6,	%xmm14
	.set	AESDATA7,	%xmm15

	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movdqu		(GHASH_ACC_PTR), GHASH_ACC
	movdqu		(LE_CTR_PTR), LE_CTR

	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
	lea		6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR

	// If there are at least 8*16 bytes of data, then continue into the main
	// loop, which processes 8*16 bytes of data per iteration.
	//
	// The main loop interleaves AES and GHASH to improve performance on
	// CPUs that can execute these instructions in parallel. When
	// decrypting, the GHASH input (the ciphertext) is immediately
	// available. When encrypting, we instead encrypt a set of 8 blocks
	// first and then GHASH those blocks while encrypting the next set of 8,
	// repeat that as needed, and finally GHASH the last set of 8 blocks.
	//
	// Code size optimization: Prefer adding or subtracting -8*16 over 8*16,
	// as this makes the immediate fit in a signed byte, saving 3 bytes.
	add		$-8*16, DATALEN
	jl		.Lcrypt_loop_8x_done\@
.if \enc
	// Encrypt the first 8 plaintext blocks.
	_ctr_begin_8x
	lea		16(KEY), %rsi
	.p2align 4
1:
	movdqa		(%rsi), TMP0
	_aesenc_8x	TMP0
	add		$16, %rsi
	cmp		%rsi, RNDKEYLAST_PTR
	jne		1b
	movdqa		(%rsi), TMP0
	_aesenclast_8x	TMP0
	_xor_data_8x

	// Don't increment DST until the ciphertext blocks have been hashed.
	sub		$-8*16, SRC
	add		$-8*16, DATALEN
	jl		.Lghash_last_ciphertext_8x\@
.endif

	.p2align 4
.Lcrypt_loop_8x\@:

	// Generate the next set of 8 counter blocks and start encrypting them.
	_ctr_begin_8x
	lea		16(KEY), %rsi

	// Do a round of AES, and start the GHASH update of 8 ciphertext blocks
	// by doing the unreduced multiplication for the first ciphertext block.
	movdqa		(%rsi), TMP0
	add		$16, %rsi
	_aesenc_8x	TMP0
	_ghash_update_begin_8x	\enc

	// Do 7 more rounds of AES, and continue the GHASH update by doing the
	// unreduced multiplication for the remaining ciphertext blocks.
	.p2align 4
1:
	movdqa		(%rsi), TMP0
	add		$16, %rsi
	_aesenc_8x	TMP0
	_ghash_update_continue_8x	\enc
	cmp		$7*8, %eax
	jne		1b

	// Do the remaining AES rounds.
	.p2align 4
1:
	movdqa		(%rsi), TMP0
	add		$16, %rsi
	_aesenc_8x	TMP0
	cmp		%rsi, RNDKEYLAST_PTR
	jne		1b

	// Do the GHASH reduction and the last round of AES.
	movdqa		(RNDKEYLAST_PTR), TMP0
	_ghash_update_end_8x_step	0
	_aesenclast_8x	TMP0
	_ghash_update_end_8x_step	1

	// XOR the data with the AES-CTR keystream blocks.
.if \enc
	sub		$-8*16, DST
.endif
	_xor_data_8x
	sub		$-8*16, SRC
.if !\enc
	sub		$-8*16, DST
.endif
	add		$-8*16, DATALEN
	jge		.Lcrypt_loop_8x\@

.if \enc
.Lghash_last_ciphertext_8x\@:
	// Update GHASH with the last set of 8 ciphertext blocks.
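	// This is the same _ghash_update_begin_8x / _ghash_update_continue_8x
	// sequence that the main loop uses, just with no AES rounds interleaved,
	// since all of the plaintext has already been encrypted at this point.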
	_ghash_update_begin_8x	\enc
	.p2align 4
1:
	_ghash_update_continue_8x	\enc
	cmp		$7*8, %eax
	jne		1b
	_ghash_update_end_8x_step	0
	_ghash_update_end_8x_step	1
	sub		$-8*16, DST
.endif

.Lcrypt_loop_8x_done\@:
	sub		$-8*16, DATALEN
	jz		.Ldone\@

	// Handle the remainder of length 1 <= DATALEN < 8*16 bytes. We keep
	// things simple and keep the code size down by just going one block at
	// a time, again taking advantage of hardware loop unrolling. Since
	// there are enough key powers available for all remaining data, we do
	// the GHASH multiplications unreduced, and only reduce at the very end.

	.set	HI,		TMP2
	.set	H_POW,		AESDATA0
	.set	H_POW_XORED,	AESDATA1
	.set	ONE,		AESDATA2

	movq		.Lone(%rip), ONE

	// Start collecting the unreduced GHASH intermediate value LO, MI, HI.
	pxor		LO, LO
	pxor		MI, MI
	pxor		HI, HI

	// Set up a block counter %rax to contain 8*(8-n), where n is the number
	// of blocks that remain, counting any partial block. This will be used
	// to access the key powers H^n through H^1.
	mov		DATALEN, %eax
	neg		%eax
	and		$~15, %eax
	sar		$1, %eax
	add		$64, %eax

	sub		$16, DATALEN
	jl		.Lcrypt_loop_1x_done\@

	// Process the data one full block at a time.
.Lcrypt_loop_1x\@:

	// Encrypt the next counter block.
	_vpshufb	BSWAP_MASK, LE_CTR, TMP0
	paddd		ONE, LE_CTR
	pxor		(KEY), TMP0
	lea		-6*16(RNDKEYLAST_PTR), %rsi	// Reduce code size
	cmp		$24, AESKEYLEN
	jl		128f	// AES-128?
	je		192f	// AES-192?
	// AES-256
	aesenc		-7*16(%rsi), TMP0
	aesenc		-6*16(%rsi), TMP0
192:
	aesenc		-5*16(%rsi), TMP0
	aesenc		-4*16(%rsi), TMP0
128:
.irp i, -3,-2,-1,0,1,2,3,4,5
	aesenc		\i*16(%rsi), TMP0
.endr
	aesenclast	(RNDKEYLAST_PTR), TMP0

	// Load the next key power H^i.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED

	// XOR the keystream block that was just generated in TMP0 with the next
	// source data block and store the resulting en/decrypted data to DST.
.if \enc
	_xor_mem_to_reg	(SRC), TMP0, tmp=TMP1
	movdqu		TMP0, (DST)
.else
	movdqu		(SRC), TMP1
	pxor		TMP1, TMP0
	movdqu		TMP0, (DST)
.endif

	// Update GHASH with the ciphertext block.
.if \enc
	pshufb		BSWAP_MASK, TMP0
	pxor		TMP0, GHASH_ACC
.else
	pshufb		BSWAP_MASK, TMP1
	pxor		TMP1, GHASH_ACC
.endif
	_ghash_mul_noreduce	H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
	pxor		GHASH_ACC, GHASH_ACC

	add		$8, %eax
	add		$16, SRC
	add		$16, DST
	sub		$16, DATALEN
	jge		.Lcrypt_loop_1x\@
.Lcrypt_loop_1x_done\@:

	// Check whether there is a partial block at the end.
	add		$16, DATALEN
	jz		.Lghash_reduce\@

	// Process a partial block of length 1 <= DATALEN <= 15.

	// Encrypt a counter block for the last time.
	pshufb		BSWAP_MASK, LE_CTR
	pxor		(KEY), LE_CTR
	lea		16(KEY), %rsi
1:
	aesenc		(%rsi), LE_CTR
	add		$16, %rsi
	cmp		%rsi, RNDKEYLAST_PTR
	jne		1b
	aesenclast	(RNDKEYLAST_PTR), LE_CTR

	// Load the lowest key power, H^1.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED

	// Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. SRC is
	// in %rcx, but _load_partial_block needs DATALEN in %rcx instead.
	// RNDKEYLAST_PTR is no longer needed, so reuse it for SRC.
	mov		SRC, RNDKEYLAST_PTR
	mov		DATALEN, %ecx
	_load_partial_block	RNDKEYLAST_PTR, TMP0, %rsi, %esi

	// XOR the keystream block that was just generated in LE_CTR with the
	// source data block and store the resulting en/decrypted data to DST.
	pxor		TMP0, LE_CTR
	mov		DATALEN, %ecx
	_store_partial_block	LE_CTR, DST

	// If encrypting, zero-pad the final ciphertext block for GHASH. (If
	// decrypting, this was already done by _load_partial_block.)
.if \enc
	lea		.Lzeropad_mask+16(%rip), %rax
	sub		DATALEN64, %rax
	_vpand		(%rax), LE_CTR, TMP0
.endif

	// Update GHASH with the final ciphertext block.
	pshufb		BSWAP_MASK, TMP0
	pxor		TMP0, GHASH_ACC
	_ghash_mul_noreduce	H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0

.Lghash_reduce\@:
	// Finally, do the GHASH reduction.
	_ghash_reduce	LO, MI, HI, GHASH_ACC, TMP0

.Ldone\@:
	// Store the updated GHASH accumulator back to memory.
	movdqu		GHASH_ACC, (GHASH_ACC_PTR)

	RET
.endm

// void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key,
//				   const u32 le_ctr[4], u8 ghash_acc[16],
//				   u64 total_aadlen, u64 total_datalen);
// bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key,
//				   const u32 le_ctr[4], const u8 ghash_acc[16],
//				   u64 total_aadlen, u64 total_datalen,
//				   const u8 tag[16], int taglen);
//
// This macro generates one of the above two functions (with \enc selecting
// which one). Both functions finish computing the GCM authentication tag by
// updating GHASH with the lengths block and encrypting the GHASH accumulator.
// |total_aadlen| and |total_datalen| must be the total length of the additional
// authenticated data and the en/decrypted data in bytes, respectively.
//
// The encryption function then stores the full-length (16-byte) computed
// authentication tag to |ghash_acc|. The decryption function instead loads the
// expected authentication tag (the one that was transmitted) from the 16-byte
// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
// computed tag in constant time, and returns true if and only if they match.
.macro	_aes_gcm_final	enc

	// Function arguments
	.set	KEY,		%rdi
	.set	LE_CTR_PTR,	%rsi
	.set	GHASH_ACC_PTR,	%rdx
	.set	TOTAL_AADLEN,	%rcx
	.set	TOTAL_DATALEN,	%r8
	.set	TAG,		%r9
	.set	TAGLEN,		%r10d	// Originally at 8(%rsp)
	.set	TAGLEN64,	%r10

	// Additional local variables.
	// %rax and %xmm0-%xmm2 are used as temporary registers.
	.set	AESKEYLEN,	%r11d
	.set	AESKEYLEN64,	%r11
	.set	BSWAP_MASK,	%xmm3
	.set	GHASH_ACC,	%xmm4
	.set	H_POW1,		%xmm5	// H^1
	.set	H_POW1_X64,	%xmm6	// H^1 * x^64
	.set	GFPOLY,		%xmm7

	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN

	// Set up a counter block with 1 in the low 32-bit word. This is the
	// counter that produces the ciphertext needed to encrypt the auth tag.
	movdqu		(LE_CTR_PTR), %xmm0
	mov		$1, %eax
	pinsrd		$0, %eax, %xmm0

	// Build the lengths block and XOR it into the GHASH accumulator.
	movq		TOTAL_DATALEN, GHASH_ACC
	pinsrq		$1, TOTAL_AADLEN, GHASH_ACC
	psllq		$3, GHASH_ACC	// Bytes to bits
	_xor_mem_to_reg	(GHASH_ACC_PTR), GHASH_ACC, %xmm1

	movdqa		OFFSETOF_H_POWERS+7*16(KEY), H_POW1
	movdqa		OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
	movq		.Lgfpoly(%rip), GFPOLY

	// Make %rax point to the 6th from last AES round key. (Using signed
	// byte offsets -7*16 through 6*16 decreases code size.)
	lea		(KEY,AESKEYLEN64,4), %rax

	// AES-encrypt the counter block and also multiply GHASH_ACC by H^1.
	// Interleave the AES and GHASH instructions to improve performance.
	pshufb		BSWAP_MASK, %xmm0
	pxor		(KEY), %xmm0
	cmp		$24, AESKEYLEN
	jl		128f	// AES-128?
	je		192f	// AES-192?
	// AES-256
	aesenc		-7*16(%rax), %xmm0
	aesenc		-6*16(%rax), %xmm0
192:
	aesenc		-5*16(%rax), %xmm0
	aesenc		-4*16(%rax), %xmm0
128:
.irp i, 0,1,2,3,4,5,6,7,8
	aesenc		(\i-3)*16(%rax), %xmm0
	_ghash_mul_step	\i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
.endr
	aesenclast	6*16(%rax), %xmm0
	_ghash_mul_step	9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2

	// Undo the byte reflection of the GHASH accumulator.
	pshufb		BSWAP_MASK, GHASH_ACC

	// Encrypt the GHASH accumulator.
	pxor		%xmm0, GHASH_ACC

.if \enc
	// Return the computed auth tag.
	movdqu		GHASH_ACC, (GHASH_ACC_PTR)
.else
	.set	ZEROPAD_MASK_PTR, TOTAL_AADLEN	// Reusing TOTAL_AADLEN!

	// Verify the auth tag in constant time by XOR'ing the transmitted and
	// computed auth tags together and using the ptest instruction to check
	// whether the first TAGLEN bytes of the result are zero.
	_xor_mem_to_reg	(TAG), GHASH_ACC, tmp=%xmm0
	movl		8(%rsp), TAGLEN
	lea		.Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR
	sub		TAGLEN64, ZEROPAD_MASK_PTR
	xor		%eax, %eax
	_test_mem	(ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0
	sete		%al
.endif
	RET
.endm

.set	USE_AVX, 0
SYM_FUNC_START(aes_gcm_precompute_aesni)
	_aes_gcm_precompute
SYM_FUNC_END(aes_gcm_precompute_aesni)
SYM_FUNC_START(aes_gcm_aad_update_aesni)
	_aes_gcm_aad_update
SYM_FUNC_END(aes_gcm_aad_update_aesni)
SYM_FUNC_START(aes_gcm_enc_update_aesni)
	_aes_gcm_update	1
SYM_FUNC_END(aes_gcm_enc_update_aesni)
SYM_FUNC_START(aes_gcm_dec_update_aesni)
	_aes_gcm_update	0
SYM_FUNC_END(aes_gcm_dec_update_aesni)
SYM_FUNC_START(aes_gcm_enc_final_aesni)
	_aes_gcm_final	1
SYM_FUNC_END(aes_gcm_enc_final_aesni)
SYM_FUNC_START(aes_gcm_dec_final_aesni)
	_aes_gcm_final	0
SYM_FUNC_END(aes_gcm_dec_final_aesni)

.set	USE_AVX, 1
SYM_FUNC_START(aes_gcm_precompute_aesni_avx)
	_aes_gcm_precompute
SYM_FUNC_END(aes_gcm_precompute_aesni_avx)
SYM_FUNC_START(aes_gcm_aad_update_aesni_avx)
	_aes_gcm_aad_update
SYM_FUNC_END(aes_gcm_aad_update_aesni_avx)
SYM_FUNC_START(aes_gcm_enc_update_aesni_avx)
	_aes_gcm_update	1
SYM_FUNC_END(aes_gcm_enc_update_aesni_avx)
SYM_FUNC_START(aes_gcm_dec_update_aesni_avx)
	_aes_gcm_update	0
SYM_FUNC_END(aes_gcm_dec_update_aesni_avx)
SYM_FUNC_START(aes_gcm_enc_final_aesni_avx)
	_aes_gcm_final	1
SYM_FUNC_END(aes_gcm_enc_final_aesni_avx)
SYM_FUNC_START(aes_gcm_dec_final_aesni_avx)
	_aes_gcm_final	0
SYM_FUNC_END(aes_gcm_dec_final_aesni_avx)