/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// AES-NI optimized AES-GCM for x86_64
//
// Copyright 2024 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>
//
//------------------------------------------------------------------------------
//
// This file is dual-licensed, meaning that you can use it under your choice of
// either of the following two licenses:
//
// Licensed under the Apache License 2.0 (the "License").  You may obtain a copy
// of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// or
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
//------------------------------------------------------------------------------
//
// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
// support the original set of AES instructions, i.e. AES-NI.  Two
// implementations are provided, one that uses AVX and one that doesn't.  They
// are very similar, being generated by the same macros.  The only difference is
// that the AVX implementation takes advantage of VEX-coded instructions in some
// places to avoid some 'movdqu' and 'movdqa' instructions.  The AVX
// implementation does *not* use 256-bit vectors, as AES is not supported on
// 256-bit vectors until the VAES feature (which this file doesn't target).
//
// The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1
// for the *_aesni functions or AVX for the *_aesni_avx ones.  (But it seems
// there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.)
//
// The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is
// more thoroughly commented.  This file has the following notable changes:
//
//    - The vector length is fixed at 128-bit, i.e. xmm registers.  This means
//      there is only one AES block (and GHASH block) per register.
//
//    - Without AVX512 / AVX10, only 16 SIMD registers are available instead of
//      32.  We work around this by being much more careful about using
//      registers, relying heavily on loads to load values as they are needed.
//
//    - Masking is not available either.  We work around this by implementing
//      partial block loads and stores using overlapping scalar loads and stores
//      combined with shifts and SSE4.1 insertion and extraction instructions.
//
//    - The main loop is organized differently due to the different design
//      constraints.  First, with just one AES block per SIMD register, on some
//      CPUs 4 registers don't saturate the 'aesenc' throughput.  We therefore
//      do an 8-register wide loop.  Considering that and the fact that we have
//      just 16 SIMD registers to work with, it's not feasible to cache AES
//      round keys and GHASH key powers in registers across loop iterations.
//      That's not ideal, but also not actually that bad, since loads can run in
//      parallel with other instructions.  Significantly, this also makes it
//      possible to roll up the inner loops, relying on hardware loop unrolling
//      instead of software loop unrolling, greatly reducing code size.
//
//    - We implement the GHASH multiplications in the main loop using Karatsuba
//      multiplication instead of schoolbook multiplication.  This saves one
//      pclmulqdq instruction per block, at the cost of one 64-bit load, one
//      pshufd, and 0.25 pxors per block.  (This is without the three-argument
//      XOR support that would be provided by AVX512 / AVX10, which would be
//      more beneficial to schoolbook than Karatsuba.)
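//
//      Concretely, the identity in play (an illustrative aside; all additions
//      are XORs of polynomials over GF(2)) is:
//
//          a = a_H*x^64 + a_L,  b = b_H*x^64 + b_L
//          LO = a_L * b_L
//          HI = a_H * b_H
//          MI = (a_L + a_H) * (b_L + b_H) + LO + HI = a_L*b_H + a_H*b_L
//          a * b = HI*x^128 + MI*x^64 + LO
//
//      so only three pclmulqdq products are needed per block; the "+ LO + HI"
//      part of MI is deferred and done with XORs (see _ghash_mul_noreduce and
//      _ghash_reduce below).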
//
//      As a rough approximation, we can assume that Karatsuba multiplication is
//      faster than schoolbook multiplication in this context if one pshufd and
//      0.25 pxors are cheaper than a pclmulqdq.  (We assume that the 64-bit
//      load is "free" due to running in parallel with arithmetic instructions.)
//      This is true on AMD CPUs, including all that support pclmulqdq up to at
//      least Zen 3.  It's also true on older Intel CPUs: Westmere through
//      Haswell on the Core side, and Silvermont through Goldmont Plus on the
//      low-power side.  On some of these CPUs, pclmulqdq is quite slow, and the
//      benefit of Karatsuba should be substantial.  On newer Intel CPUs,
//      schoolbook multiplication should be faster, but only marginally.
//
//      Not all these CPUs were available to be tested.  However, benchmarks on
//      available CPUs suggest that this approximation is plausible.  Switching
//      to Karatsuba showed negligible change (< 1%) on Intel Broadwell,
//      Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%.
//      Considering that and the fact that Karatsuba should be even more
//      beneficial on older Intel CPUs, it seems like the right choice here.
//
//      An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be
//      saved by using a multiplication-less reduction method.  We don't do that
//      because it would require a large number of shift and xor instructions,
//      making it less worthwhile and likely harmful on newer CPUs.
//
//      It does make sense to sometimes use a different reduction optimization
//      that saves a pclmulqdq, though: precompute the hash key times x^64, and
//      multiply the low half of the data block by the hash key with the extra
//      factor of x^64.  This eliminates one step of the reduction.  However,
//      this is incompatible with Karatsuba multiplication.  Therefore, for
//      multi-block processing we use Karatsuba multiplication with a regular
//      reduction.  For single-block processing, we use the x^64 optimization.

#include <linux/linkage.h>

.section .rodata
.p2align 4
.Lbswap_mask:
	.octa   0x000102030405060708090a0b0c0d0e0f
.Lgfpoly:
	.quad	0xc200000000000000
.Lone:
	.quad	1
.Lgfpoly_and_internal_carrybit:
	.octa	0xc2000000000000010000000000000001
	// Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of
	// 'len' 0xff bytes and the rest zeroes.
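	// For example, with len = 5 the load starts at '.Lzeropad_mask + 11'
	// and picks up the last five 0xff bytes followed by eleven zero bytes,
	// i.e. a mask whose low 5 byte lanes are 0xff.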
.Lzeropad_mask:
	.octa	0xffffffffffffffffffffffffffffffff
	.octa	0

// Offsets in struct aes_gcm_key_aesni
#define OFFSETOF_AESKEYLEN	480
#define OFFSETOF_H_POWERS	496
#define OFFSETOF_H_POWERS_XORED	624
#define OFFSETOF_H_TIMES_X64	688
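
// As laid out by the precompute function below: the hash key powers H^8
// through H^1 are stored in that order at OFFSETOF_H_POWERS (16 bytes each,
// so H^i is at offset (8-i)*16), the XOR'd-together 64-bit halves of each
// power follow in the same order at OFFSETOF_H_POWERS_XORED (8 bytes each),
// and H^1 * x^64 is stored at OFFSETOF_H_TIMES_X64.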

.text

// Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq.  The fallback
// assumes that all operands are distinct and that any mem operand is aligned.
.macro	_vpclmulqdq	imm, src1, src2, dst
.if USE_AVX
	vpclmulqdq	\imm, \src1, \src2, \dst
.else
	movdqa		\src2, \dst
	pclmulqdq	\imm, \src1, \dst
.endif
.endm

// Do a vpshufb, or fall back to a movdqa and a pshufb.  The fallback assumes
// that all operands are distinct and that any mem operand is aligned.
.macro	_vpshufb	src1, src2, dst
.if USE_AVX
	vpshufb		\src1, \src2, \dst
.else
	movdqa		\src2, \dst
	pshufb		\src1, \dst
.endif
.endm

// Do a vpand, or fall back to a movdqu and a pand.  The fallback assumes that
// all operands are distinct.
.macro	_vpand		src1, src2, dst
.if USE_AVX
	vpand		\src1, \src2, \dst
.else
	movdqu		\src1, \dst
	pand		\src2, \dst
.endif
.endm

// XOR the unaligned memory operand \mem into the xmm register \reg.  \tmp must
// be a temporary xmm register.
.macro	_xor_mem_to_reg	mem, reg, tmp
.if USE_AVX
	vpxor		\mem, \reg, \reg
.else
	movdqu		\mem, \tmp
	pxor		\tmp, \reg
.endif
.endm

// Test the unaligned memory operand \mem against the xmm register \reg.  \tmp
// must be a temporary xmm register.
.macro	_test_mem	mem, reg, tmp
.if USE_AVX
	vptest		\mem, \reg
.else
	movdqu		\mem, \tmp
	ptest		\tmp, \reg
.endif
.endm

// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
// and zeroize any remaining bytes.  Clobbers %rax, %rcx, and \tmp{64,32}.
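// For example, with %ecx = 13 the first branch is taken: bytes 0-7 of \src are
// loaded into the low half of \dst, bytes 5-12 are loaded into %rax, %rax is
// shifted right by 24 bits to discard the 3 bytes that overlap the first load,
// and the result is inserted into the high half of \dst, leaving the top 3
// bytes of \dst zeroed.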
.macro	_load_partial_block	src, dst, tmp64, tmp32
	sub		$8, %ecx		// LEN - 8
	jle		.Lle8\@

	// Load 9 <= LEN <= 15 bytes.
	movq		(\src), \dst		// Load first 8 bytes
	mov		(\src, %rcx), %rax	// Load last 8 bytes
	neg		%ecx
	shl		$3, %ecx
	shr		%cl, %rax		// Discard overlapping bytes
	pinsrq		$1, %rax, \dst
	jmp		.Ldone\@

.Lle8\@:
	add		$4, %ecx		// LEN - 4
	jl		.Llt4\@

	// Load 4 <= LEN <= 8 bytes.
	mov		(\src), %eax		// Load first 4 bytes
	mov		(\src, %rcx), \tmp32	// Load last 4 bytes
	jmp		.Lcombine\@

.Llt4\@:
	// Load 1 <= LEN <= 3 bytes.
	add		$2, %ecx		// LEN - 2
	movzbl		(\src), %eax		// Load first byte
	jl		.Lmovq\@
	movzwl		(\src, %rcx), \tmp32	// Load last 2 bytes
.Lcombine\@:
	shl		$3, %ecx
	shl		%cl, \tmp64
	or		\tmp64, %rax		// Combine the two parts
.Lmovq\@:
	movq		%rax, \dst
.Ldone\@:
.endm

// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
// Clobbers %rax, %rcx, and %rsi.
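// For example, with %ecx = 13: the high 8 bytes of \src are extracted and
// rotated so that data bytes 8-12 land at \dst + 8..12 when the 8-byte value
// is stored at \dst + 5; the first 8 bytes are then stored at \dst,
// overwriting the 3 bytes of garbage left by the overlapping store.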
.macro	_store_partial_block	src, dst
	sub		$8, %ecx		// LEN - 8
	jl		.Llt8\@

	// Store 8 <= LEN <= 15 bytes.
	pextrq		$1, \src, %rax
	mov		%ecx, %esi
	shl		$3, %ecx
	ror		%cl, %rax
	mov		%rax, (\dst, %rsi)	// Store last LEN - 8 bytes
	movq		\src, (\dst)		// Store first 8 bytes
	jmp		.Ldone\@

.Llt8\@:
	add		$4, %ecx		// LEN - 4
	jl		.Llt4\@

	// Store 4 <= LEN <= 7 bytes.
	pextrd		$1, \src, %eax
	mov		%ecx, %esi
	shl		$3, %ecx
	ror		%cl, %eax
	mov		%eax, (\dst, %rsi)	// Store last LEN - 4 bytes
	movd		\src, (\dst)		// Store first 4 bytes
	jmp		.Ldone\@

.Llt4\@:
	// Store 1 <= LEN <= 3 bytes.
	pextrb		$0, \src, 0(\dst)
	cmp		$-2, %ecx		// LEN - 4 == -2, i.e. LEN == 2?
	jl		.Ldone\@
	pextrb		$1, \src, 1(\dst)
	je		.Ldone\@
	pextrb		$2, \src, 2(\dst)
.Ldone\@:
.endm

// Do one step of GHASH-multiplying \a by \b and storing the reduced product in
// \b.  To complete all steps, this must be invoked with \i=0 through \i=9.
// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the
// .Lgfpoly constant, and \t0-\t1 must be temporary registers.
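// As a rough operation count: the precomputed \a_times_x64 lets the b_L
// contribution enter MI and HI directly, already carrying the x^64 factor, so
// \b's low half never needs its own reduction fold.  That makes 5 pclmulqdq
// per block (4 products + 1 fold) instead of the 6 that a schoolbook
// multiplication followed by a full two-step reduction would use.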
.macro	_ghash_mul_step	i, a, a_times_x64, b, gfpoly, t0, t1

	// MI = (a_L * b_H) + ((a*x^64)_L * b_L)
.if \i == 0
	_vpclmulqdq	$0x01, \a, \b, \t0
.elseif \i == 1
	_vpclmulqdq	$0x00, \a_times_x64, \b, \t1
.elseif \i == 2
	pxor		\t1, \t0

	// HI = (a_H * b_H) + ((a*x^64)_H * b_L)
.elseif \i == 3
	_vpclmulqdq	$0x11, \a, \b, \t1
.elseif \i == 4
	pclmulqdq	$0x10, \a_times_x64, \b
.elseif \i == 5
	pxor		\t1, \b
.elseif \i == 6

	// Fold MI into HI.
	pshufd		$0x4e, \t0, \t1		// Swap halves of MI
.elseif \i == 7
	pclmulqdq	$0x00, \gfpoly, \t0	// MI_L*(x^63 + x^62 + x^57)
.elseif \i == 8
	pxor		\t1, \b
.elseif \i == 9
	pxor		\t0, \b
.endif
.endm

// GHASH-multiply \a by \b and store the reduced product in \b.
// See _ghash_mul_step for details.
.macro	_ghash_mul	a, a_times_x64, b, gfpoly, t0, t1
.irp i, 0,1,2,3,4,5,6,7,8,9
	_ghash_mul_step	\i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
.endr
.endm

// GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi.
// This does Karatsuba multiplication and must be paired with _ghash_reduce.  On
// the first call, \lo, \mi, and \hi must be zero.  \a_xored must contain the
// two halves of \a XOR'd together, i.e. a_L + a_H.  \b is clobbered.
.macro	_ghash_mul_noreduce	a, a_xored, b, lo, mi, hi, t0

	// LO += a_L * b_L
	_vpclmulqdq	$0x00, \a, \b, \t0
	pxor		\t0, \lo

	// b_L + b_H
	pshufd		$0x4e, \b, \t0
	pxor		\b, \t0

	// HI += a_H * b_H
	pclmulqdq	$0x11, \a, \b
	pxor		\b, \hi

	// MI += (a_L + a_H) * (b_L + b_H)
	pclmulqdq	$0x00, \a_xored, \t0
	pxor		\t0, \mi
.endm

// Reduce the product from \lo, \mi, and \hi, and store the result in \dst.
// This assumes that _ghash_mul_noreduce was used.
.macro	_ghash_reduce	lo, mi, hi, dst, t0

	movq		.Lgfpoly(%rip), \t0

	// MI += LO + HI (needed because we used Karatsuba multiplication)
	pxor		\lo, \mi
	pxor		\hi, \mi

	// Fold LO into MI.
	pshufd		$0x4e, \lo, \dst
	pclmulqdq	$0x00, \t0, \lo
	pxor		\dst, \mi
	pxor		\lo, \mi

	// Fold MI into HI.
	pshufd		$0x4e, \mi, \dst
	pclmulqdq	$0x00, \t0, \mi
	pxor		\hi, \dst
	pxor		\mi, \dst
.endm

// Do the first step of the GHASH update of a set of 8 ciphertext blocks.
//
// The whole GHASH update does:
//
//	GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 +
//				blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1
//
// This macro just does the first step: it does the unreduced multiplication
// (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm
// registers LO, MI, and GHASH_ACC a.k.a. HI.  It also zero-initializes the
// inner block counter in %rax, which is a value that counts up by 8 for each
// block in the set of 8 and is used later to index by 8*blknum and 16*blknum.
//
// To reduce the number of pclmulqdq instructions required, both this macro and
// _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook
// multiplication.  See the file comment for more details about this choice.
//
// Both macros expect the ciphertext blocks blk[0-7] to be available at DST if
// encrypting, or SRC if decrypting.  They also expect the precomputed hash key
// powers H^i and their XOR'd-together halves to be available in the struct
// pointed to by KEY.  Both macros clobber TMP[0-2].
.macro	_ghash_update_begin_8x	enc

	// Initialize the inner block counter.
	xor		%eax, %eax

	// Load the highest hash key power, H^8.
	movdqa		OFFSETOF_H_POWERS(KEY), TMP0

	// Load the first ciphertext block and byte-reflect it.
.if \enc
	movdqu		(DST), TMP1
.else
	movdqu		(SRC), TMP1
.endif
	pshufb		BSWAP_MASK, TMP1

	// Add the GHASH accumulator to the ciphertext block to get the block
	// 'b' that needs to be multiplied with the hash key power 'a'.
	pxor		TMP1, GHASH_ACC

	// b_L + b_H
	pshufd		$0x4e, GHASH_ACC, MI
	pxor		GHASH_ACC, MI

	// LO = a_L * b_L
	_vpclmulqdq	$0x00, TMP0, GHASH_ACC, LO

	// HI = a_H * b_H
	pclmulqdq	$0x11, TMP0, GHASH_ACC

	// MI = (a_L + a_H) * (b_L + b_H)
	pclmulqdq	$0x00, OFFSETOF_H_POWERS_XORED(KEY), MI
.endm

// Continue the GHASH update of 8 ciphertext blocks as described above by doing
// an unreduced multiplication of the next ciphertext block by the next lowest
// key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI.
.macro	_ghash_update_continue_8x enc
	add		$8, %eax

	// Load the next lowest key power.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), TMP0

	// Load the next ciphertext block and byte-reflect it.
.if \enc
	movdqu		(DST,%rax,2), TMP1
.else
	movdqu		(SRC,%rax,2), TMP1
.endif
	pshufb		BSWAP_MASK, TMP1

	// LO += a_L * b_L
	_vpclmulqdq	$0x00, TMP0, TMP1, TMP2
	pxor		TMP2, LO

	// b_L + b_H
	pshufd		$0x4e, TMP1, TMP2
	pxor		TMP1, TMP2

	// HI += a_H * b_H
	pclmulqdq	$0x11, TMP0, TMP1
	pxor		TMP1, GHASH_ACC

	// MI += (a_L + a_H) * (b_L + b_H)
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1
	pclmulqdq	$0x00, TMP1, TMP2
	pxor		TMP2, MI
.endm

// Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC.  This is similar to
// _ghash_reduce, but it's hardcoded to use the registers of the main loop and
// it uses the same register for HI and the destination.  It's also divided into
// two steps.  TMP1 must be preserved across steps.
//
// One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of
// shuffling LO, XOR'ing LO into MI, and shuffling MI.  However, this would
// increase the critical path length, and it seems to slightly hurt performance.
.macro	_ghash_update_end_8x_step	i
.if \i == 0
	movq		.Lgfpoly(%rip), TMP1
	pxor		LO, MI
	pxor		GHASH_ACC, MI
	pshufd		$0x4e, LO, TMP2
	pclmulqdq	$0x00, TMP1, LO
	pxor		TMP2, MI
	pxor		LO, MI
.elseif \i == 1
	pshufd		$0x4e, MI, TMP2
	pclmulqdq	$0x00, TMP1, MI
	pxor		TMP2, GHASH_ACC
	pxor		MI, GHASH_ACC
.endif
.endm

// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key);
//
// Given the expanded AES key, derive the GHASH subkey and initialize the GHASH
// related fields in the key struct.
.macro	_aes_gcm_precompute

	// Function arguments
	.set	KEY,		%rdi

	// Additional local variables.
	// %xmm0-%xmm1 and %rax are used as temporaries.
	.set	RNDKEYLAST_PTR,	%rsi
	.set	H_CUR,		%xmm2
	.set	H_POW1,		%xmm3	// H^1
	.set	H_POW1_X64,	%xmm4	// H^1 * x^64
	.set	GFPOLY,		%xmm5

	// Encrypt an all-zeroes block to get the raw hash subkey.
	movl		OFFSETOF_AESKEYLEN(KEY), %eax
	lea		6*16(KEY,%rax,4), RNDKEYLAST_PTR
	movdqa		(KEY), H_POW1  // Zero-th round key XOR all-zeroes block
	lea		16(KEY), %rax
1:
	aesenc		(%rax), H_POW1
	add		$16, %rax
	cmp		%rax, RNDKEYLAST_PTR
	jne		1b
	aesenclast	(RNDKEYLAST_PTR), H_POW1

	// Preprocess the raw hash subkey as needed to operate on GHASH's
	// bit-reflected values directly: reflect its bytes, then multiply it by
	// x^-1 (using the backwards interpretation of polynomial coefficients
	// from the GCM spec) or equivalently x^1 (using the alternative,
	// natural interpretation of polynomial coefficients).
	pshufb		.Lbswap_mask(%rip), H_POW1
	movdqa		H_POW1, %xmm0
	pshufd		$0xd3, %xmm0, %xmm0
	psrad		$31, %xmm0
	paddq		H_POW1, H_POW1
	pand		.Lgfpoly_and_internal_carrybit(%rip), %xmm0
	pxor		%xmm0, H_POW1

	// Store H^1.
	movdqa		H_POW1, OFFSETOF_H_POWERS+7*16(KEY)

	// Compute and store H^1 * x^64.
	movq		.Lgfpoly(%rip), GFPOLY
	pshufd		$0x4e, H_POW1, %xmm0
	_vpclmulqdq	$0x00, H_POW1, GFPOLY, H_POW1_X64
	pxor		%xmm0, H_POW1_X64
	movdqa		H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY)

	// Compute and store the halves of H^1 XOR'd together.
	pxor		H_POW1, %xmm0
	movq		%xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY)

	// Compute and store the remaining key powers H^2 through H^8.
	movdqa		H_POW1, H_CUR
	mov		$6*8, %eax
.Lprecompute_next\@:
	// Compute H^i = H^{i-1} * H^1.
	_ghash_mul	H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1
	// Store H^i.
	movdqa		H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2)
	// Compute and store the halves of H^i XOR'd together.
	pshufd		$0x4e, H_CUR, %xmm0
	pxor		H_CUR, %xmm0
	movq		%xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax)
	sub		$8, %eax
	jge		.Lprecompute_next\@

	RET
.endm

// void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
//				 u8 ghash_acc[16], const u8 *aad, int aadlen);
//
// This function processes the AAD (Additional Authenticated Data) in GCM.
// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
// data given by |aad| and |aadlen|.  On the first call, |ghash_acc| must be all
// zeroes.  |aadlen| must be a multiple of 16, except on the last call where it
// can be any length.  The caller must do any buffering needed to ensure this.
.macro	_aes_gcm_aad_update

	// Function arguments
	.set	KEY,		%rdi
	.set	GHASH_ACC_PTR,	%rsi
	.set	AAD,		%rdx
	.set	AADLEN,		%ecx
	// Note: _load_partial_block relies on AADLEN being in %ecx.

	// Additional local variables.
	// %rax, %r10, and %xmm0-%xmm1 are used as temporary registers.
	.set	BSWAP_MASK,	%xmm2
	.set	GHASH_ACC,	%xmm3
	.set	H_POW1,		%xmm4	// H^1
	.set	H_POW1_X64,	%xmm5	// H^1 * x^64
	.set	GFPOLY,		%xmm6

	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movdqu		(GHASH_ACC_PTR), GHASH_ACC
	movdqa		OFFSETOF_H_POWERS+7*16(KEY), H_POW1
	movdqa		OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
	movq		.Lgfpoly(%rip), GFPOLY

	// Process the AAD one full block at a time.
	sub		$16, AADLEN
	jl		.Laad_loop_1x_done\@
.Laad_loop_1x\@:
	movdqu		(AAD), %xmm0
	pshufb		BSWAP_MASK, %xmm0
	pxor		%xmm0, GHASH_ACC
	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
	add		$16, AAD
	sub		$16, AADLEN
	jge		.Laad_loop_1x\@
.Laad_loop_1x_done\@:
	// Check whether there is a partial block at the end.
	add		$16, AADLEN
	jz		.Laad_done\@

	// Process a partial block of length 1 <= AADLEN <= 15.
	// _load_partial_block assumes that %ecx contains AADLEN.
	_load_partial_block	AAD, %xmm0, %r10, %r10d
	pshufb		BSWAP_MASK, %xmm0
	pxor		%xmm0, GHASH_ACC
	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1

.Laad_done\@:
	movdqu		GHASH_ACC, (GHASH_ACC_PTR)
	RET
.endm

// Increment LE_CTR eight times to generate eight little-endian counter blocks,
// swap each to big-endian, and store them in AESDATA[0-7].  Also XOR them with
// the zero-th AES round key.  Clobbers TMP0 and TMP1.
.macro	_ctr_begin_8x
	movq		.Lone(%rip), TMP0
	movdqa		(KEY), TMP1		// zero-th round key
.irp i, 0,1,2,3,4,5,6,7
	_vpshufb	BSWAP_MASK, LE_CTR, AESDATA\i
	pxor		TMP1, AESDATA\i
	paddd		TMP0, LE_CTR
.endr
.endm

// Do a non-last round of AES on AESDATA[0-7] using \round_key.
.macro	_aesenc_8x	round_key
.irp i, 0,1,2,3,4,5,6,7
	aesenc		\round_key, AESDATA\i
.endr
.endm

// Do the last round of AES on AESDATA[0-7] using \round_key.
.macro	_aesenclast_8x	round_key
.irp i, 0,1,2,3,4,5,6,7
	aesenclast	\round_key, AESDATA\i
.endr
.endm

// XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and
// store the result to DST.  Clobbers TMP0.
.macro	_xor_data_8x
.irp i, 0,1,2,3,4,5,6,7
	_xor_mem_to_reg	\i*16(SRC), AESDATA\i, tmp=TMP0
.endr
.irp i, 0,1,2,3,4,5,6,7
	movdqu		AESDATA\i, \i*16(DST)
.endr
.endm

// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key,
//					  const u32 le_ctr[4], u8 ghash_acc[16],
//					  const u8 *src, u8 *dst, int datalen);
//
// This macro generates a GCM encryption or decryption update function with the
// above prototype (with \enc selecting which one).
//
// This function computes the next portion of the CTR keystream, XOR's it with
// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
// data to |dst|.  It also updates the GHASH accumulator |ghash_acc| using the
// next |datalen| ciphertext bytes.
//
// |datalen| must be a multiple of 16, except on the last call where it can be
// any length.  The caller must do any buffering needed to ensure this.  Both
// in-place and out-of-place en/decryption are supported.
//
// |le_ctr| must give the current counter in little-endian format.  For a new
// message, the low word of the counter must be 2.  This function loads the
// counter from |le_ctr| and increments the loaded counter as needed, but it
// does *not* store the updated counter back to |le_ctr|.  The caller must
// update |le_ctr| if any more data segments follow.  Internally, only the low
// 32-bit word of the counter is incremented, following the GCM standard.
.macro	_aes_gcm_update	enc

	// Function arguments
	.set	KEY,		%rdi
	.set	LE_CTR_PTR,	%rsi	// Note: overlaps with usage as temp reg
	.set	GHASH_ACC_PTR,	%rdx
	.set	SRC,		%rcx
	.set	DST,		%r8
	.set	DATALEN,	%r9d
	.set	DATALEN64,	%r9	// Zero-extend DATALEN before using!
	// Note: the code setting up for _load_partial_block assumes that SRC is
	// in %rcx (and that DATALEN is *not* in %rcx).

	// Additional local variables

	// %rax and %rsi are used as temporary registers.  Note: %rsi overlaps
	// with LE_CTR_PTR, which is used only at the beginning.

	.set	AESKEYLEN,	%r10d	// AES key length in bytes
	.set	AESKEYLEN64,	%r10
	.set	RNDKEYLAST_PTR,	%r11	// Pointer to last AES round key

	// Put the most frequently used values in %xmm0-%xmm7 to reduce code
	// size.  (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.)
	.set	TMP0,		%xmm0
	.set	TMP1,		%xmm1
	.set	TMP2,		%xmm2
	.set	LO,		%xmm3	// Low part of unreduced product
	.set	MI,		%xmm4	// Middle part of unreduced product
	.set	GHASH_ACC,	%xmm5	// GHASH accumulator; in main loop also
					// the high part of unreduced product
	.set	BSWAP_MASK,	%xmm6	// Shuffle mask for reflecting bytes
	.set	LE_CTR,		%xmm7	// Little-endian counter value
	.set	AESDATA0,	%xmm8
	.set	AESDATA1,	%xmm9
	.set	AESDATA2,	%xmm10
	.set	AESDATA3,	%xmm11
	.set	AESDATA4,	%xmm12
	.set	AESDATA5,	%xmm13
	.set	AESDATA6,	%xmm14
	.set	AESDATA7,	%xmm15

	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movdqu		(GHASH_ACC_PTR), GHASH_ACC
	movdqu		(LE_CTR_PTR), LE_CTR

	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
	lea		6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR

	// If there are at least 8*16 bytes of data, then continue into the main
	// loop, which processes 8*16 bytes of data per iteration.
	//
	// The main loop interleaves AES and GHASH to improve performance on
	// CPUs that can execute these instructions in parallel.  When
	// decrypting, the GHASH input (the ciphertext) is immediately
	// available.  When encrypting, we instead encrypt a set of 8 blocks
	// first and then GHASH those blocks while encrypting the next set of 8,
	// repeat that as needed, and finally GHASH the last set of 8 blocks.
	//
	// Code size optimization: Prefer adding or subtracting -8*16 over 8*16,
	// as this makes the immediate fit in a signed byte, saving 3 bytes.
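	// (-8*16 = -128 fits in a sign-extended imm8, whereas +128 would need
	// a 4-byte imm32.)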
	add		$-8*16, DATALEN
	jl		.Lcrypt_loop_8x_done\@
.if \enc
	// Encrypt the first 8 plaintext blocks.
	_ctr_begin_8x
	lea		16(KEY), %rsi
	.p2align 4
1:
	movdqa		(%rsi), TMP0
	_aesenc_8x	TMP0
	add		$16, %rsi
	cmp		%rsi, RNDKEYLAST_PTR
	jne		1b
	movdqa		(%rsi), TMP0
	_aesenclast_8x	TMP0
	_xor_data_8x
	// Don't increment DST until the ciphertext blocks have been hashed.
	sub		$-8*16, SRC
	add		$-8*16, DATALEN
	jl		.Lghash_last_ciphertext_8x\@
.endif

	.p2align 4
.Lcrypt_loop_8x\@:

	// Generate the next set of 8 counter blocks and start encrypting them.
	_ctr_begin_8x
	lea		16(KEY), %rsi

	// Do a round of AES, and start the GHASH update of 8 ciphertext blocks
	// by doing the unreduced multiplication for the first ciphertext block.
	movdqa		(%rsi), TMP0
	add		$16, %rsi
	_aesenc_8x	TMP0
	_ghash_update_begin_8x \enc

	// Do 7 more rounds of AES, and continue the GHASH update by doing the
	// unreduced multiplication for the remaining ciphertext blocks.
	.p2align 4
1:
	movdqa		(%rsi), TMP0
	add		$16, %rsi
	_aesenc_8x	TMP0
	_ghash_update_continue_8x \enc
	cmp		$7*8, %eax
	jne		1b

	// Do the remaining AES rounds.
	.p2align 4
1:
	movdqa		(%rsi), TMP0
	add		$16, %rsi
	_aesenc_8x	TMP0
	cmp		%rsi, RNDKEYLAST_PTR
	jne		1b

	// Do the GHASH reduction and the last round of AES.
	movdqa		(RNDKEYLAST_PTR), TMP0
	_ghash_update_end_8x_step	0
	_aesenclast_8x	TMP0
	_ghash_update_end_8x_step	1

	// XOR the data with the AES-CTR keystream blocks.
.if \enc
	sub		$-8*16, DST
.endif
	_xor_data_8x
	sub		$-8*16, SRC
.if !\enc
	sub		$-8*16, DST
.endif
	add		$-8*16, DATALEN
	jge		.Lcrypt_loop_8x\@

.if \enc
.Lghash_last_ciphertext_8x\@:
	// Update GHASH with the last set of 8 ciphertext blocks.
	_ghash_update_begin_8x		\enc
	.p2align 4
1:
	_ghash_update_continue_8x	\enc
	cmp		$7*8, %eax
	jne		1b
	_ghash_update_end_8x_step	0
	_ghash_update_end_8x_step	1
	sub		$-8*16, DST
.endif

.Lcrypt_loop_8x_done\@:

	sub		$-8*16, DATALEN
	jz		.Ldone\@

	// Handle the remainder of length 1 <= DATALEN < 8*16 bytes.  We keep
	// things simple and keep the code size down by just going one block at
	// a time, again taking advantage of hardware loop unrolling.  Since
	// there are enough key powers available for all remaining data, we do
	// the GHASH multiplications unreduced, and only reduce at the very end.

	.set	HI,		TMP2
	.set	H_POW,		AESDATA0
	.set	H_POW_XORED,	AESDATA1
	.set	ONE,		AESDATA2

	movq		.Lone(%rip), ONE

	// Start collecting the unreduced GHASH intermediate value LO, MI, HI.
	pxor		LO, LO
	pxor		MI, MI
	pxor		HI, HI

	// Set up a block counter %rax to contain 8*(8-n), where n is the number
	// of blocks that remain, counting any partial block.  This will be used
	// to access the key powers H^n through H^1.
	mov		DATALEN, %eax
	neg		%eax
	and		$~15, %eax
	sar		$1, %eax
	add		$64, %eax
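	// For example, DATALEN = 40 gives n = 3 remaining blocks and %eax = 40,
	// so OFFSETOF_H_POWERS(KEY,%rax,2) points at H^3 as expected.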

	sub		$16, DATALEN
	jl		.Lcrypt_loop_1x_done\@

	// Process the data one full block at a time.
.Lcrypt_loop_1x\@:

	// Encrypt the next counter block.
	_vpshufb	BSWAP_MASK, LE_CTR, TMP0
	paddd		ONE, LE_CTR
	pxor		(KEY), TMP0
	lea		-6*16(RNDKEYLAST_PTR), %rsi	// Reduce code size
	cmp		$24, AESKEYLEN
	jl		128f	// AES-128?
	je		192f	// AES-192?
	// AES-256
	aesenc		-7*16(%rsi), TMP0
	aesenc		-6*16(%rsi), TMP0
192:
	aesenc		-5*16(%rsi), TMP0
	aesenc		-4*16(%rsi), TMP0
128:
.irp i, -3,-2,-1,0,1,2,3,4,5
	aesenc		\i*16(%rsi), TMP0
.endr
	aesenclast	(RNDKEYLAST_PTR), TMP0

	// Load the next key power H^i.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED

	// XOR the keystream block that was just generated in TMP0 with the next
	// source data block and store the resulting en/decrypted data to DST.
.if \enc
	_xor_mem_to_reg	(SRC), TMP0, tmp=TMP1
	movdqu		TMP0, (DST)
.else
	movdqu		(SRC), TMP1
	pxor		TMP1, TMP0
	movdqu		TMP0, (DST)
.endif

	// Update GHASH with the ciphertext block.
.if \enc
	pshufb		BSWAP_MASK, TMP0
	pxor		TMP0, GHASH_ACC
.else
	pshufb		BSWAP_MASK, TMP1
	pxor		TMP1, GHASH_ACC
.endif
	_ghash_mul_noreduce	H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
	pxor		GHASH_ACC, GHASH_ACC

	add		$8, %eax
	add		$16, SRC
	add		$16, DST
	sub		$16, DATALEN
	jge		.Lcrypt_loop_1x\@
.Lcrypt_loop_1x_done\@:
	// Check whether there is a partial block at the end.
	add		$16, DATALEN
	jz		.Lghash_reduce\@

	// Process a partial block of length 1 <= DATALEN <= 15.

	// Encrypt a counter block for the last time.
	pshufb		BSWAP_MASK, LE_CTR
	pxor		(KEY), LE_CTR
	lea		16(KEY), %rsi
1:
	aesenc		(%rsi), LE_CTR
	add		$16, %rsi
	cmp		%rsi, RNDKEYLAST_PTR
	jne		1b
	aesenclast	(RNDKEYLAST_PTR), LE_CTR

	// Load the lowest key power, H^1.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED

	// Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC.  SRC is
	// in %rcx, but _load_partial_block needs DATALEN in %rcx instead.
	// RNDKEYLAST_PTR is no longer needed, so reuse it for SRC.
	mov		SRC, RNDKEYLAST_PTR
	mov		DATALEN, %ecx
	_load_partial_block	RNDKEYLAST_PTR, TMP0, %rsi, %esi

	// XOR the keystream block that was just generated in LE_CTR with the
	// source data block and store the resulting en/decrypted data to DST.
	pxor		TMP0, LE_CTR
	mov		DATALEN, %ecx
	_store_partial_block	LE_CTR, DST

	// If encrypting, zero-pad the final ciphertext block for GHASH.  (If
	// decrypting, this was already done by _load_partial_block.)
.if \enc
	lea		.Lzeropad_mask+16(%rip), %rax
	sub		DATALEN64, %rax
	_vpand		(%rax), LE_CTR, TMP0
.endif

	// Update GHASH with the final ciphertext block.
	pshufb		BSWAP_MASK, TMP0
	pxor		TMP0, GHASH_ACC
	_ghash_mul_noreduce	H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0

.Lghash_reduce\@:
	// Finally, do the GHASH reduction.
	_ghash_reduce	LO, MI, HI, GHASH_ACC, TMP0

.Ldone\@:
	// Store the updated GHASH accumulator back to memory.
	movdqu		GHASH_ACC, (GHASH_ACC_PTR)

	RET
.endm

// void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key,
//				   const u32 le_ctr[4], u8 ghash_acc[16],
//				   u64 total_aadlen, u64 total_datalen);
// bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key,
//				   const u32 le_ctr[4], const u8 ghash_acc[16],
//				   u64 total_aadlen, u64 total_datalen,
//				   const u8 tag[16], int taglen);
//
// This macro generates one of the above two functions (with \enc selecting
// which one).  Both functions finish computing the GCM authentication tag by
// updating GHASH with the lengths block and encrypting the GHASH accumulator.
// |total_aadlen| and |total_datalen| must be the total length of the additional
// authenticated data and the en/decrypted data in bytes, respectively.
//
// The encryption function then stores the full-length (16-byte) computed
// authentication tag to |ghash_acc|.  The decryption function instead loads the
// expected authentication tag (the one that was transmitted) from the 16-byte
// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
// computed tag in constant time, and returns true if and only if they match.
.macro	_aes_gcm_final	enc

	// Function arguments
	.set	KEY,		%rdi
	.set	LE_CTR_PTR,	%rsi
	.set	GHASH_ACC_PTR,	%rdx
	.set	TOTAL_AADLEN,	%rcx
	.set	TOTAL_DATALEN,	%r8
	.set	TAG,		%r9
	.set	TAGLEN,		%r10d	// Originally at 8(%rsp)
	.set	TAGLEN64,	%r10

	// Additional local variables.
	// %rax and %xmm0-%xmm2 are used as temporary registers.
	.set	AESKEYLEN,	%r11d
	.set	AESKEYLEN64,	%r11
	.set	BSWAP_MASK,	%xmm3
	.set	GHASH_ACC,	%xmm4
	.set	H_POW1,		%xmm5	// H^1
	.set	H_POW1_X64,	%xmm6	// H^1 * x^64
	.set	GFPOLY,		%xmm7

	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN

	// Set up a counter block with 1 in the low 32-bit word.  This is the
	// counter that produces the ciphertext needed to encrypt the auth tag.
	movdqu		(LE_CTR_PTR), %xmm0
	mov		$1, %eax
	pinsrd		$0, %eax, %xmm0

	// Build the lengths block and XOR it into the GHASH accumulator.
	movq		TOTAL_DATALEN, GHASH_ACC
	pinsrq		$1, TOTAL_AADLEN, GHASH_ACC
	psllq		$3, GHASH_ACC	// Bytes to bits
	_xor_mem_to_reg	(GHASH_ACC_PTR), GHASH_ACC, %xmm1

	movdqa		OFFSETOF_H_POWERS+7*16(KEY), H_POW1
	movdqa		OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
	movq		.Lgfpoly(%rip), GFPOLY

	// Make %rax point to the 6th from last AES round key.  (Using signed
	// byte offsets -7*16 through 6*16 decreases code size.)
	lea		(KEY,AESKEYLEN64,4), %rax

	// AES-encrypt the counter block and also multiply GHASH_ACC by H^1.
	// Interleave the AES and GHASH instructions to improve performance.
	pshufb		BSWAP_MASK, %xmm0
	pxor		(KEY), %xmm0
	cmp		$24, AESKEYLEN
	jl		128f	// AES-128?
	je		192f	// AES-192?
	// AES-256
	aesenc		-7*16(%rax), %xmm0
	aesenc		-6*16(%rax), %xmm0
192:
	aesenc		-5*16(%rax), %xmm0
	aesenc		-4*16(%rax), %xmm0
128:
.irp i, 0,1,2,3,4,5,6,7,8
	aesenc		(\i-3)*16(%rax), %xmm0
	_ghash_mul_step	\i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
.endr
	aesenclast	6*16(%rax), %xmm0
	_ghash_mul_step	9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2

	// Undo the byte reflection of the GHASH accumulator.
	pshufb		BSWAP_MASK, GHASH_ACC

	// Encrypt the GHASH accumulator.
	pxor		%xmm0, GHASH_ACC

.if \enc
	// Return the computed auth tag.
	movdqu		GHASH_ACC, (GHASH_ACC_PTR)
.else
	.set		ZEROPAD_MASK_PTR, TOTAL_AADLEN // Reusing TOTAL_AADLEN!

	// Verify the auth tag in constant time by XOR'ing the transmitted and
	// computed auth tags together and using the ptest instruction to check
	// whether the first TAGLEN bytes of the result are zero.
	_xor_mem_to_reg	(TAG), GHASH_ACC, tmp=%xmm0
	movl		8(%rsp), TAGLEN
	lea		.Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR
	sub		TAGLEN64, ZEROPAD_MASK_PTR
	xor		%eax, %eax
	_test_mem	(ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0
	sete		%al
.endif
	RET
.endm

.set	USE_AVX, 0
SYM_FUNC_START(aes_gcm_precompute_aesni)
	_aes_gcm_precompute
SYM_FUNC_END(aes_gcm_precompute_aesni)
SYM_FUNC_START(aes_gcm_aad_update_aesni)
	_aes_gcm_aad_update
SYM_FUNC_END(aes_gcm_aad_update_aesni)
SYM_FUNC_START(aes_gcm_enc_update_aesni)
	_aes_gcm_update	1
SYM_FUNC_END(aes_gcm_enc_update_aesni)
SYM_FUNC_START(aes_gcm_dec_update_aesni)
	_aes_gcm_update	0
SYM_FUNC_END(aes_gcm_dec_update_aesni)
SYM_FUNC_START(aes_gcm_enc_final_aesni)
	_aes_gcm_final	1
SYM_FUNC_END(aes_gcm_enc_final_aesni)
SYM_FUNC_START(aes_gcm_dec_final_aesni)
	_aes_gcm_final	0
SYM_FUNC_END(aes_gcm_dec_final_aesni)

.set	USE_AVX, 1
SYM_FUNC_START(aes_gcm_precompute_aesni_avx)
	_aes_gcm_precompute
SYM_FUNC_END(aes_gcm_precompute_aesni_avx)
SYM_FUNC_START(aes_gcm_aad_update_aesni_avx)
	_aes_gcm_aad_update
SYM_FUNC_END(aes_gcm_aad_update_aesni_avx)
SYM_FUNC_START(aes_gcm_enc_update_aesni_avx)
	_aes_gcm_update	1
SYM_FUNC_END(aes_gcm_enc_update_aesni_avx)
SYM_FUNC_START(aes_gcm_dec_update_aesni_avx)
	_aes_gcm_update	0
SYM_FUNC_END(aes_gcm_dec_update_aesni_avx)
SYM_FUNC_START(aes_gcm_enc_final_aesni_avx)
	_aes_gcm_final	1
SYM_FUNC_END(aes_gcm_enc_final_aesni_avx)
SYM_FUNC_START(aes_gcm_dec_final_aesni_avx)
	_aes_gcm_final	0
SYM_FUNC_END(aes_gcm_dec_final_aesni_avx)