/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

#include <linux/linkage.h>

/*
 * Eight 32-bit initialisation constants, stored as two 16-byte rows.
 * IV+0x00 seeds the third state row; IV+0x10 is XORed with the 16 bytes
 * loaded from offset 0x20 of the state to form the fourth row.
 */
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F

/* pshufb mask: rotates every 32-bit lane right by 16 bits. */
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302

/* pshufb mask: rotates every 32-bit lane right by 8 bits. */
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201

/*
 * Message word indices for the SSSE3 code: 10 rounds x 16 byte-sized
 * indices (160 bytes).  Each row is laid out in the order in which the
 * round loop gathers the words, four indices per quarter-step.
 */
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12

#ifdef CONFIG_AS_AVX512
/*
 * vpermi2d index vectors for the AVX-512 code: 10 rounds x 16 dword
 * indices (640 bytes).  The round loop permutes the message registers in
 * place, so each row is applied to the output of the previous round's
 * permutation rather than to the original message block.
 */
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.long  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.long 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.long 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.long  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.long  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.long  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
#endif /* CONFIG_AS_AVX512 */

.text

/*
 * blake2s_compress_ssse3 - compress consecutive 64-byte blocks (SSSE3).
 *
 * Register usage, as read by the code below:
 *   %rdi  state pointer; 48 bytes at offsets 0x00, 0x10 and 0x20 are
 *         loaded on entry and stored back on exit
 *   %rsi  input pointer; 64 bytes are consumed per block
 *   %rdx  number of blocks; 0 returns immediately without touching state
 *   %rcx  value added to the low 64 bits of the 16 bytes at state+0x20
 *         before each block (presumably the byte-counter increment;
 *         confirm against the C prototype)
 *
 * Clobbers %rax, %rcx, %rdx, %rsi, %r8, %xmm0-%xmm8 and %xmm10-%xmm15.
 */
SYM_FUNC_START(blake2s_compress_ssse3)
	testq		%rdx,%rdx
	je		.Lendofloop
	movdqu		(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqa		ROT16(%rip),%xmm12
	movdqa		ROR328(%rip),%xmm13
	movdqu		0x20(%rdi),%xmm14
	movq		%rcx,%xmm15		/* upper 64 bits are zeroed */
	leaq		SIGMA+0xa0(%rip),%r8	/* end of table: 10 rounds * 16 */
	jmp		.Lbeginofloop
	.align		32
.Lbeginofloop:
	/* Save the chaining value for the final feed-forward XOR. */
	movdqa		%xmm0,%xmm10
	movdqa		%xmm1,%xmm11
	paddq		%xmm15,%xmm14
	movdqa		IV(%rip),%xmm2
	movdqa		%xmm14,%xmm3
	pxor		IV+0x10(%rip),%xmm3
	leaq		SIGMA(%rip),%rcx
.Lroundloop:
	/* Column step, first half: gather 4 words, rotates by 16 and 12. */
	movzbl		(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0x1(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x2(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x3(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	/* Column step, second half: gather 4 words, rotates by 8 and 7. */
	movzbl		0x4(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x5(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x6(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0x7(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	/* Rotate the lanes of three rows so the next step mixes diagonals. */
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2
	/* Diagonal step, first half. */
	movzbl		0x8(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x9(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xa(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xb(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	/* Diagonal step, second half. */
	movzbl		0xc(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xd(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xe(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0xf(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	/* Undo the lane rotation applied before the diagonal step. */
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2
	addq		$0x10,%rcx
	cmpq		%r8,%rcx
	jnz		.Lroundloop
	/* Fold the four rows into two and XOR in the saved chaining value. */
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$0x40,%rsi
	decq		%rdx
	jnz		.Lbeginofloop
	movdqu		%xmm0,(%rdi)
	movdqu		%xmm1,0x10(%rdi)
	movdqu		%xmm14,0x20(%rdi)
.Lendofloop:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)

#ifdef CONFIG_AS_AVX512
/*
 * blake2s_compress_avx512 - compress consecutive 64-byte blocks using
 * vprord / vpermi2d.
 *
 * Register usage matches blake2s_compress_ssse3:
 *   %rdi  state pointer (48 bytes loaded on entry, stored back on exit)
 *   %rsi  input pointer; 64 bytes are consumed per block
 *   %rdx  number of blocks; 0 returns immediately without touching state
 *   %rcx  value added to the low 64 bits of the 16 bytes at state+0x20
 *         before each block
 *
 * Clobbers %rax, %rcx, %rdx, %rsi, %xmm0-%xmm5, %ymm6-%ymm9,
 * %xmm10, %xmm11, %xmm14 and %xmm15.
 */
SYM_FUNC_START(blake2s_compress_avx512)
	/*
	 * Match the SSSE3 entry point: with a zero block count the loop
	 * below would still process one block and then wrap %rdx around.
	 * No vector register has been touched yet, so vzeroupper can be
	 * skipped on this path.
	 */
	testq		%rdx,%rdx
	je		.Lblake2s_compress_avx512_end
	vmovdqu		(%rdi),%xmm0
	vmovdqu		0x10(%rdi),%xmm1
	vmovdqu		0x20(%rdi),%xmm4
	vmovq		%rcx,%xmm5		/* upper 64 bits are zeroed */
	vmovdqa		IV(%rip),%xmm14
	vmovdqa		IV+16(%rip),%xmm15
	jmp		.Lblake2s_compress_avx512_mainloop
	.align		32
.Lblake2s_compress_avx512_mainloop:
	/* Save the chaining value for the final feed-forward XOR. */
	vmovdqa		%xmm0,%xmm10
	vmovdqa		%xmm1,%xmm11
	vpaddq		%xmm5,%xmm4,%xmm4
	vmovdqa		%xmm14,%xmm2
	vpxor		%xmm15,%xmm4,%xmm3
	/* Whole 64-byte message block lives in %ymm6:%ymm7. */
	vmovdqu		(%rsi),%ymm6
	vmovdqu		0x20(%rsi),%ymm7
	addq		$0x40,%rsi
	leaq		SIGMA2(%rip),%rax
	movb		$0xa,%cl		/* 10 rounds; %rcx already copied */
.Lblake2s_compress_avx512_roundloop:
	addq		$0x40,%rax
	vmovdqa		-0x40(%rax),%ymm8
	vmovdqa		-0x20(%rax),%ymm9
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	/* Keep the permuted message; the next SIGMA2 row is relative to it. */
	vmovdqa		%ymm8,%ymm6
	vmovdqa		%ymm9,%ymm7
	/* Column step: low then high half of %ymm8. */
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm8,%xmm8
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Rotate the lanes of three rows so the next step mixes diagonals. */
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2
	/* Diagonal step: low then high half of %ymm9. */
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm9,%xmm9
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	/* Undo the lane rotation applied before the diagonal step. */
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lblake2s_compress_avx512_roundloop
	/* Fold the four rows into two and XOR in the saved chaining value. */
	vpxor		%xmm10,%xmm0,%xmm0
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1
	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop
	vmovdqu		%xmm0,(%rdi)
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)
	vzeroupper
.Lblake2s_compress_avx512_end:
	RET
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */