chacha20poly1305_amd64.s

   1// Code generated by command: go run chacha20poly1305_amd64_asm.go -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305. DO NOT EDIT.
   2
   3//go:build gc && !purego
   4
   5#include "textflag.h"
   6
   7// func polyHashADInternal<>()
   8TEXT polyHashADInternal<>(SB), NOSPLIT, $0
   9	// Hack: Must declare #define macros inside of a function due to Avo constraints
  10	// ROL rotates the uint32s in register R left by N bits, using temporary T.
  11	#define ROL(N, R, T) \
  12		MOVO R, T; \
  13		PSLLL $(N), T; \
  14		PSRLL $(32-(N)), R; \
  15		PXOR T, R
  16
  17	// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
  18	#ifdef GOAMD64_v2
  19		#define ROL8(R, T) PSHUFB rol8<>(SB), R
  20	#else
  21		#define ROL8(R, T) ROL(8, R, T)
  22	#endif
  23
  24	// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
  25	#ifdef GOAMD64_v2
  26		#define ROL16(R, T) PSHUFB rol16<>(SB), R
  27	#else
  28		#define ROL16(R, T) ROL(16, R, T)
  29	#endif
	// Poly1305 hash of the additional data (AD).
	// Register contract (set up by the callers in this file):
	//   CX        = pointer to the AD bytes
	//   R9        = AD length in bytes
	//   (BP),8(BP) = clamped Poly1305 "r" key half (stored by the caller)
	// Output: R10:R11:R12 = 130-bit Poly1305 accumulator h (low 64, high 64,
	// top bits). Clobbers AX, DX, R8, R13, R14, R15, and advances CX/R9.
	// Start with h = 0.
  30	XORQ  R10, R10
  31	XORQ  R11, R11
  32	XORQ  R12, R12
	// Fast path for the common 13-byte AD (e.g. a TLS-style header):
	// absorb it as one padded block and return immediately.
  33	CMPQ  R9, $0x0d
  34	JNE   hashADLoop
	// Load 13 bytes: 8 from offset 0, then 8 from offset 5 shifted right
	// by 24 bits so only bytes 8..12 remain in R11 (upper bytes zero).
  35	MOVQ  (CX), R10
  36	MOVQ  5(CX), R11
  37	SHRQ  $0x18, R11
	// AD is zero-padded to 16 bytes (RFC 8439), so the pad "1" bit of the
	// Poly1305 block goes at bit 128, i.e. into the top accumulator limb.
  38	MOVQ  $0x00000001, R12
	// h = h * r mod 2^130-5: schoolbook 64x64->128 multiplies via MULQ;
	// the small top limb (R12, at most a few bits) is folded in with IMULQ.
  39	MOVQ  (BP), AX
  40	MOVQ  AX, R15
  41	MULQ  R10
  42	MOVQ  AX, R13
  43	MOVQ  DX, R14
  44	MOVQ  (BP), AX
  45	MULQ  R11
  46	IMULQ R12, R15
  47	ADDQ  AX, R14
  48	ADCQ  DX, R15
  49	MOVQ  8(BP), AX
  50	MOVQ  AX, R8
  51	MULQ  R10
  52	ADDQ  AX, R14
  53	ADCQ  $0x00, DX
  54	MOVQ  DX, R10
  55	MOVQ  8(BP), AX
  56	MULQ  R11
  57	ADDQ  AX, R15
  58	ADCQ  $0x00, DX
  59	IMULQ R12, R8
  60	ADDQ  R10, R15
  61	ADCQ  DX, R8
	// Partial reduction: keep the low 130 bits of the product in h
	// (R12 = bits 128..129) and, since 2^130 = 5 (mod p), fold the
	// overflow c back in as c*4 + c — the masked (&~3) limbs give c*4
	// and the 2-bit right shifts give c.
  62	MOVQ  R13, R10
  63	MOVQ  R14, R11
  64	MOVQ  R15, R12
  65	ANDQ  $0x03, R12
  66	MOVQ  R15, R13
  67	ANDQ  $-4, R13
  68	MOVQ  R8, R14
  69	SHRQ  $0x02, R8, R15
  70	SHRQ  $0x02, R8
  71	ADDQ  R13, R10
  72	ADCQ  R14, R11
  73	ADCQ  $0x00, R12
  74	ADDQ  R15, R10
  75	ADCQ  R8, R11
  76	ADCQ  $0x00, R12
  77	RET
  78
  79hashADLoop:
  80	// Hash in 16 byte chunks
	// General path: absorb one full 16-byte block per iteration
	// (add block + 2^128 pad bit into h, then h *= r mod 2^130-5).
  81	CMPQ  R9, $0x10
  82	JB    hashADTail
  83	ADDQ  (CX), R10
  84	ADCQ  8(CX), R11
  85	ADCQ  $0x01, R12
  86	LEAQ  16(CX), CX
  87	SUBQ  $0x10, R9
	// Same multiply-and-reduce sequence as the fast path above.
  88	MOVQ  (BP), AX
  89	MOVQ  AX, R15
  90	MULQ  R10
  91	MOVQ  AX, R13
  92	MOVQ  DX, R14
  93	MOVQ  (BP), AX
  94	MULQ  R11
  95	IMULQ R12, R15
  96	ADDQ  AX, R14
  97	ADCQ  DX, R15
  98	MOVQ  8(BP), AX
  99	MOVQ  AX, R8
 100	MULQ  R10
 101	ADDQ  AX, R14
 102	ADCQ  $0x00, DX
 103	MOVQ  DX, R10
 104	MOVQ  8(BP), AX
 105	MULQ  R11
 106	ADDQ  AX, R15
 107	ADCQ  $0x00, DX
 108	IMULQ R12, R8
 109	ADDQ  R10, R15
 110	ADCQ  DX, R8
 111	MOVQ  R13, R10
 112	MOVQ  R14, R11
 113	MOVQ  R15, R12
 114	ANDQ  $0x03, R12
 115	MOVQ  R15, R13
 116	ANDQ  $-4, R13
 117	MOVQ  R8, R14
 118	SHRQ  $0x02, R8, R15
 119	SHRQ  $0x02, R8
 120	ADDQ  R13, R10
 121	ADCQ  R14, R11
 122	ADCQ  $0x00, R12
 123	ADDQ  R15, R10
 124	ADCQ  R8, R11
 125	ADCQ  $0x00, R12
 126	JMP   hashADLoop
 127
 128hashADTail:
 129	CMPQ R9, $0x00
 130	JE   hashADDone
 131
 132	// Hash last < 16 byte tail
	// Build the final partial block in R13:R14 by reading the tail
	// bytes from the end towards the start, shifting the 128-bit
	// value left 8 bits per byte; the bytes above the tail stay zero,
	// matching RFC 8439's zero-padding of the AD to 16 bytes.
 133	XORQ R13, R13
 134	XORQ R14, R14
 135	XORQ R15, R15
 136	ADDQ R9, CX
 137
 138hashADTailLoop:
	// 128-bit shift left by 8 across R13:R14, then OR in (via XOR,
	// low byte of R13 is zero here) the next byte from the end.
 139	SHLQ  $0x08, R13, R14
 140	SHLQ  $0x08, R13
 141	MOVB  -1(CX), R15
 142	XORQ  R15, R13
 143	DECQ  CX
 144	DECQ  R9
 145	JNE   hashADTailLoop
	// Absorb the padded tail block (pad "1" at bit 128, as above)
	// and do one final multiply-and-reduce by r.
 146	ADDQ  R13, R10
 147	ADCQ  R14, R11
 148	ADCQ  $0x01, R12
 149	MOVQ  (BP), AX
 150	MOVQ  AX, R15
 151	MULQ  R10
 152	MOVQ  AX, R13
 153	MOVQ  DX, R14
 154	MOVQ  (BP), AX
 155	MULQ  R11
 156	IMULQ R12, R15
 157	ADDQ  AX, R14
 158	ADCQ  DX, R15
 159	MOVQ  8(BP), AX
 160	MOVQ  AX, R8
 161	MULQ  R10
 162	ADDQ  AX, R14
 163	ADCQ  $0x00, DX
 164	MOVQ  DX, R10
 165	MOVQ  8(BP), AX
 166	MULQ  R11
 167	ADDQ  AX, R15
 168	ADCQ  $0x00, DX
 169	IMULQ R12, R8
 170	ADDQ  R10, R15
 171	ADCQ  DX, R8
 172	MOVQ  R13, R10
 173	MOVQ  R14, R11
 174	MOVQ  R15, R12
 175	ANDQ  $0x03, R12
 176	MOVQ  R15, R13
 177	ANDQ  $-4, R13
 178	MOVQ  R8, R14
 179	SHRQ  $0x02, R8, R15
 180	SHRQ  $0x02, R8
 181	ADDQ  R13, R10
 182	ADCQ  R14, R11
 183	ADCQ  $0x00, R12
 184	ADDQ  R15, R10
 185	ADCQ  R8, R11
 186	ADCQ  $0x00, R12
 187
 188hashADDone:
 189	RET
 190
 191// func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool
 192// Requires: AVX, AVX2, BMI2, CMOV, SSE2
 193TEXT chacha20Poly1305Open(SB), $288-97
 194	// For aligned stack access
 195	MOVQ SP, BP
 196	ADDQ $0x20, BP
 197	ANDQ $-32, BP
 198	MOVQ dst_base+0(FP), DI
 199	MOVQ key_base+24(FP), R8
 200	MOVQ src_base+48(FP), SI
 201	MOVQ src_len+56(FP), BX
 202	MOVQ ad_base+72(FP), CX
 203
 204	// Check for AVX2 support
 205	CMPB useAVX2+0(SB), $0x01
 206	JE   chacha20Poly1305Open_AVX2
 207
 208	// Special optimization, for very short buffers
 209	CMPQ BX, $0x80
 210	JBE  openSSE128
 211
 212	// For long buffers, prepare the poly key first
 213	MOVOU chacha20Constants<>+0(SB), X0
 214	MOVOU 16(R8), X3
 215	MOVOU 32(R8), X6
 216	MOVOU 48(R8), X9
 217	MOVO  X9, X13
 218
 219	// Store state on stack for future use
 220	MOVO X3, 32(BP)
 221	MOVO X6, 48(BP)
 222	MOVO X9, 128(BP)
 223	MOVQ $0x0000000a, R9
 224
 225openSSEPreparePolyKey:
 226	PADDD X3, X0
 227	PXOR  X0, X9
 228	ROL16(X9, X12)
 229	PADDD X9, X6
 230	PXOR  X6, X3
 231	MOVO  X3, X12
 232	PSLLL $0x0c, X12
 233	PSRLL $0x14, X3
 234	PXOR  X12, X3
 235	PADDD X3, X0
 236	PXOR  X0, X9
 237	ROL8(X9, X12)
 238	PADDD X9, X6
 239	PXOR  X6, X3
 240	MOVO  X3, X12
 241	PSLLL $0x07, X12
 242	PSRLL $0x19, X3
 243	PXOR  X12, X3
 244	BYTE  $0x66
 245	BYTE  $0x0f
 246	BYTE  $0x3a
 247	BYTE  $0x0f
 248	BYTE  $0xdb
 249	BYTE  $0x04
 250	BYTE  $0x66
 251	BYTE  $0x0f
 252	BYTE  $0x3a
 253	BYTE  $0x0f
 254	BYTE  $0xf6
 255	BYTE  $0x08
 256	BYTE  $0x66
 257	BYTE  $0x45
 258	BYTE  $0x0f
 259	BYTE  $0x3a
 260	BYTE  $0x0f
 261	BYTE  $0xc9
 262	BYTE  $0x0c
 263	PADDD X3, X0
 264	PXOR  X0, X9
 265	ROL16(X9, X12)
 266	PADDD X9, X6
 267	PXOR  X6, X3
 268	MOVO  X3, X12
 269	PSLLL $0x0c, X12
 270	PSRLL $0x14, X3
 271	PXOR  X12, X3
 272	PADDD X3, X0
 273	PXOR  X0, X9
 274	ROL8(X9, X12)
 275	PADDD X9, X6
 276	PXOR  X6, X3
 277	MOVO  X3, X12
 278	PSLLL $0x07, X12
 279	PSRLL $0x19, X3
 280	PXOR  X12, X3
 281	BYTE  $0x66
 282	BYTE  $0x0f
 283	BYTE  $0x3a
 284	BYTE  $0x0f
 285	BYTE  $0xdb
 286	BYTE  $0x0c
 287	BYTE  $0x66
 288	BYTE  $0x0f
 289	BYTE  $0x3a
 290	BYTE  $0x0f
 291	BYTE  $0xf6
 292	BYTE  $0x08
 293	BYTE  $0x66
 294	BYTE  $0x45
 295	BYTE  $0x0f
 296	BYTE  $0x3a
 297	BYTE  $0x0f
 298	BYTE  $0xc9
 299	BYTE  $0x04
 300	DECQ  R9
 301	JNE   openSSEPreparePolyKey
 302
 303	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
 304	PADDL chacha20Constants<>+0(SB), X0
 305	PADDL 32(BP), X3
 306
 307	// Clamp and store the key
 308	PAND polyClampMask<>+0(SB), X0
 309	MOVO X0, (BP)
 310	MOVO X3, 16(BP)
 311
 312	// Hash AAD
 313	MOVQ ad_len+80(FP), R9
 314	CALL polyHashADInternal<>(SB)
 315
 316openSSEMainLoop:
 317	CMPQ BX, $0x00000100
 318	JB   openSSEMainLoopDone
 319
 320	// Load state, increment counter blocks
 321	MOVO  chacha20Constants<>+0(SB), X0
 322	MOVO  32(BP), X3
 323	MOVO  48(BP), X6
 324	MOVO  128(BP), X9
 325	PADDL sseIncMask<>+0(SB), X9
 326	MOVO  X0, X1
 327	MOVO  X3, X4
 328	MOVO  X6, X7
 329	MOVO  X9, X10
 330	PADDL sseIncMask<>+0(SB), X10
 331	MOVO  X1, X2
 332	MOVO  X4, X5
 333	MOVO  X7, X8
 334	MOVO  X10, X11
 335	PADDL sseIncMask<>+0(SB), X11
 336	MOVO  X2, X12
 337	MOVO  X5, X13
 338	MOVO  X8, X14
 339	MOVO  X11, X15
 340	PADDL sseIncMask<>+0(SB), X15
 341
 342	// Store counters
 343	MOVO X9, 80(BP)
 344	MOVO X10, 96(BP)
 345	MOVO X11, 112(BP)
 346	MOVO X15, 128(BP)
 347
 348	// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash
 349	// 2 blocks, and for the remaining 4 only 1 block - for a total of 16
 350	MOVQ $0x00000004, CX
 351	MOVQ SI, R9
 352
 353openSSEInternalLoop:
 354	MOVO  X14, 64(BP)
 355	PADDD X3, X0
 356	PXOR  X0, X9
 357	ROL16(X9, X14)
 358	PADDD X9, X6
 359	PXOR  X6, X3
 360	MOVO  X3, X14
 361	PSLLL $0x0c, X14
 362	PSRLL $0x14, X3
 363	PXOR  X14, X3
 364	PADDD X3, X0
 365	PXOR  X0, X9
 366	ROL8(X9, X14)
 367	PADDD X9, X6
 368	PXOR  X6, X3
 369	MOVO  X3, X14
 370	PSLLL $0x07, X14
 371	PSRLL $0x19, X3
 372	PXOR  X14, X3
 373	PADDD X4, X1
 374	PXOR  X1, X10
 375	ROL16(X10, X14)
 376	PADDD X10, X7
 377	PXOR  X7, X4
 378	MOVO  X4, X14
 379	PSLLL $0x0c, X14
 380	PSRLL $0x14, X4
 381	PXOR  X14, X4
 382	PADDD X4, X1
 383	PXOR  X1, X10
 384	ROL8(X10, X14)
 385	PADDD X10, X7
 386	PXOR  X7, X4
 387	MOVO  X4, X14
 388	PSLLL $0x07, X14
 389	PSRLL $0x19, X4
 390	PXOR  X14, X4
 391	PADDD X5, X2
 392	PXOR  X2, X11
 393	ROL16(X11, X14)
 394	PADDD X11, X8
 395	PXOR  X8, X5
 396	MOVO  X5, X14
 397	PSLLL $0x0c, X14
 398	PSRLL $0x14, X5
 399	PXOR  X14, X5
 400	PADDD X5, X2
 401	PXOR  X2, X11
 402	ROL8(X11, X14)
 403	PADDD X11, X8
 404	PXOR  X8, X5
 405	MOVO  X5, X14
 406	PSLLL $0x07, X14
 407	PSRLL $0x19, X5
 408	PXOR  X14, X5
 409	MOVO  64(BP), X14
 410	MOVO  X7, 64(BP)
 411	PADDD X13, X12
 412	PXOR  X12, X15
 413	ROL16(X15, X7)
 414	PADDD X15, X14
 415	PXOR  X14, X13
 416	MOVO  X13, X7
 417	PSLLL $0x0c, X7
 418	PSRLL $0x14, X13
 419	PXOR  X7, X13
 420	PADDD X13, X12
 421	PXOR  X12, X15
 422	ROL8(X15, X7)
 423	PADDD X15, X14
 424	PXOR  X14, X13
 425	MOVO  X13, X7
 426	PSLLL $0x07, X7
 427	PSRLL $0x19, X13
 428	PXOR  X7, X13
 429	MOVO  64(BP), X7
 430	ADDQ  (R9), R10
 431	ADCQ  8(R9), R11
 432	ADCQ  $0x01, R12
 433	BYTE  $0x66
 434	BYTE  $0x0f
 435	BYTE  $0x3a
 436	BYTE  $0x0f
 437	BYTE  $0xdb
 438	BYTE  $0x04
 439	BYTE  $0x66
 440	BYTE  $0x0f
 441	BYTE  $0x3a
 442	BYTE  $0x0f
 443	BYTE  $0xe4
 444	BYTE  $0x04
 445	BYTE  $0x66
 446	BYTE  $0x0f
 447	BYTE  $0x3a
 448	BYTE  $0x0f
 449	BYTE  $0xed
 450	BYTE  $0x04
 451	BYTE  $0x66
 452	BYTE  $0x45
 453	BYTE  $0x0f
 454	BYTE  $0x3a
 455	BYTE  $0x0f
 456	BYTE  $0xed
 457	BYTE  $0x04
 458	BYTE  $0x66
 459	BYTE  $0x0f
 460	BYTE  $0x3a
 461	BYTE  $0x0f
 462	BYTE  $0xf6
 463	BYTE  $0x08
 464	BYTE  $0x66
 465	BYTE  $0x0f
 466	BYTE  $0x3a
 467	BYTE  $0x0f
 468	BYTE  $0xff
 469	BYTE  $0x08
 470	BYTE  $0x66
 471	BYTE  $0x45
 472	BYTE  $0x0f
 473	BYTE  $0x3a
 474	BYTE  $0x0f
 475	BYTE  $0xc0
 476	BYTE  $0x08
 477	BYTE  $0x66
 478	BYTE  $0x45
 479	BYTE  $0x0f
 480	BYTE  $0x3a
 481	BYTE  $0x0f
 482	BYTE  $0xf6
 483	BYTE  $0x08
 484	BYTE  $0x66
 485	BYTE  $0x45
 486	BYTE  $0x0f
 487	BYTE  $0x3a
 488	BYTE  $0x0f
 489	BYTE  $0xc9
 490	BYTE  $0x0c
 491	BYTE  $0x66
 492	BYTE  $0x45
 493	BYTE  $0x0f
 494	BYTE  $0x3a
 495	BYTE  $0x0f
 496	BYTE  $0xd2
 497	BYTE  $0x0c
 498	BYTE  $0x66
 499	BYTE  $0x45
 500	BYTE  $0x0f
 501	BYTE  $0x3a
 502	BYTE  $0x0f
 503	BYTE  $0xdb
 504	BYTE  $0x0c
 505	BYTE  $0x66
 506	BYTE  $0x45
 507	BYTE  $0x0f
 508	BYTE  $0x3a
 509	BYTE  $0x0f
 510	BYTE  $0xff
 511	BYTE  $0x0c
 512	MOVQ  (BP), AX
 513	MOVQ  AX, R15
 514	MULQ  R10
 515	MOVQ  AX, R13
 516	MOVQ  DX, R14
 517	MOVQ  (BP), AX
 518	MULQ  R11
 519	IMULQ R12, R15
 520	ADDQ  AX, R14
 521	ADCQ  DX, R15
 522	MOVQ  8(BP), AX
 523	MOVQ  AX, R8
 524	MULQ  R10
 525	ADDQ  AX, R14
 526	ADCQ  $0x00, DX
 527	MOVQ  DX, R10
 528	MOVQ  8(BP), AX
 529	MULQ  R11
 530	ADDQ  AX, R15
 531	ADCQ  $0x00, DX
 532	LEAQ  16(R9), R9
 533	MOVO  X14, 64(BP)
 534	PADDD X3, X0
 535	PXOR  X0, X9
 536	ROL16(X9, X14)
 537	PADDD X9, X6
 538	PXOR  X6, X3
 539	MOVO  X3, X14
 540	PSLLL $0x0c, X14
 541	PSRLL $0x14, X3
 542	PXOR  X14, X3
 543	PADDD X3, X0
 544	PXOR  X0, X9
 545	ROL8(X9, X14)
 546	PADDD X9, X6
 547	PXOR  X6, X3
 548	MOVO  X3, X14
 549	PSLLL $0x07, X14
 550	PSRLL $0x19, X3
 551	PXOR  X14, X3
 552	PADDD X4, X1
 553	PXOR  X1, X10
 554	ROL16(X10, X14)
 555	PADDD X10, X7
 556	PXOR  X7, X4
 557	MOVO  X4, X14
 558	PSLLL $0x0c, X14
 559	PSRLL $0x14, X4
 560	PXOR  X14, X4
 561	PADDD X4, X1
 562	PXOR  X1, X10
 563	ROL8(X10, X14)
 564	PADDD X10, X7
 565	PXOR  X7, X4
 566	MOVO  X4, X14
 567	PSLLL $0x07, X14
 568	PSRLL $0x19, X4
 569	PXOR  X14, X4
 570	PADDD X5, X2
 571	PXOR  X2, X11
 572	ROL16(X11, X14)
 573	PADDD X11, X8
 574	PXOR  X8, X5
 575	MOVO  X5, X14
 576	PSLLL $0x0c, X14
 577	PSRLL $0x14, X5
 578	PXOR  X14, X5
 579	PADDD X5, X2
 580	PXOR  X2, X11
 581	ROL8(X11, X14)
 582	PADDD X11, X8
 583	PXOR  X8, X5
 584	MOVO  X5, X14
 585	PSLLL $0x07, X14
 586	PSRLL $0x19, X5
 587	PXOR  X14, X5
 588	MOVO  64(BP), X14
 589	MOVO  X7, 64(BP)
 590	IMULQ R12, R8
 591	ADDQ  R10, R15
 592	ADCQ  DX, R8
 593	PADDD X13, X12
 594	PXOR  X12, X15
 595	ROL16(X15, X7)
 596	PADDD X15, X14
 597	PXOR  X14, X13
 598	MOVO  X13, X7
 599	PSLLL $0x0c, X7
 600	PSRLL $0x14, X13
 601	PXOR  X7, X13
 602	PADDD X13, X12
 603	PXOR  X12, X15
 604	ROL8(X15, X7)
 605	PADDD X15, X14
 606	PXOR  X14, X13
 607	MOVO  X13, X7
 608	PSLLL $0x07, X7
 609	PSRLL $0x19, X13
 610	PXOR  X7, X13
 611	MOVO  64(BP), X7
 612	MOVQ  R13, R10
 613	MOVQ  R14, R11
 614	MOVQ  R15, R12
 615	ANDQ  $0x03, R12
 616	MOVQ  R15, R13
 617	ANDQ  $-4, R13
 618	MOVQ  R8, R14
 619	SHRQ  $0x02, R8, R15
 620	SHRQ  $0x02, R8
 621	ADDQ  R13, R10
 622	ADCQ  R14, R11
 623	ADCQ  $0x00, R12
 624	ADDQ  R15, R10
 625	ADCQ  R8, R11
 626	ADCQ  $0x00, R12
 627	BYTE  $0x66
 628	BYTE  $0x0f
 629	BYTE  $0x3a
 630	BYTE  $0x0f
 631	BYTE  $0xdb
 632	BYTE  $0x0c
 633	BYTE  $0x66
 634	BYTE  $0x0f
 635	BYTE  $0x3a
 636	BYTE  $0x0f
 637	BYTE  $0xe4
 638	BYTE  $0x0c
 639	BYTE  $0x66
 640	BYTE  $0x0f
 641	BYTE  $0x3a
 642	BYTE  $0x0f
 643	BYTE  $0xed
 644	BYTE  $0x0c
 645	BYTE  $0x66
 646	BYTE  $0x45
 647	BYTE  $0x0f
 648	BYTE  $0x3a
 649	BYTE  $0x0f
 650	BYTE  $0xed
 651	BYTE  $0x0c
 652	BYTE  $0x66
 653	BYTE  $0x0f
 654	BYTE  $0x3a
 655	BYTE  $0x0f
 656	BYTE  $0xf6
 657	BYTE  $0x08
 658	BYTE  $0x66
 659	BYTE  $0x0f
 660	BYTE  $0x3a
 661	BYTE  $0x0f
 662	BYTE  $0xff
 663	BYTE  $0x08
 664	BYTE  $0x66
 665	BYTE  $0x45
 666	BYTE  $0x0f
 667	BYTE  $0x3a
 668	BYTE  $0x0f
 669	BYTE  $0xc0
 670	BYTE  $0x08
 671	BYTE  $0x66
 672	BYTE  $0x45
 673	BYTE  $0x0f
 674	BYTE  $0x3a
 675	BYTE  $0x0f
 676	BYTE  $0xf6
 677	BYTE  $0x08
 678	BYTE  $0x66
 679	BYTE  $0x45
 680	BYTE  $0x0f
 681	BYTE  $0x3a
 682	BYTE  $0x0f
 683	BYTE  $0xc9
 684	BYTE  $0x04
 685	BYTE  $0x66
 686	BYTE  $0x45
 687	BYTE  $0x0f
 688	BYTE  $0x3a
 689	BYTE  $0x0f
 690	BYTE  $0xd2
 691	BYTE  $0x04
 692	BYTE  $0x66
 693	BYTE  $0x45
 694	BYTE  $0x0f
 695	BYTE  $0x3a
 696	BYTE  $0x0f
 697	BYTE  $0xdb
 698	BYTE  $0x04
 699	BYTE  $0x66
 700	BYTE  $0x45
 701	BYTE  $0x0f
 702	BYTE  $0x3a
 703	BYTE  $0x0f
 704	BYTE  $0xff
 705	BYTE  $0x04
 706	DECQ  CX
 707	JGE   openSSEInternalLoop
 708	ADDQ  (R9), R10
 709	ADCQ  8(R9), R11
 710	ADCQ  $0x01, R12
 711	MOVQ  (BP), AX
 712	MOVQ  AX, R15
 713	MULQ  R10
 714	MOVQ  AX, R13
 715	MOVQ  DX, R14
 716	MOVQ  (BP), AX
 717	MULQ  R11
 718	IMULQ R12, R15
 719	ADDQ  AX, R14
 720	ADCQ  DX, R15
 721	MOVQ  8(BP), AX
 722	MOVQ  AX, R8
 723	MULQ  R10
 724	ADDQ  AX, R14
 725	ADCQ  $0x00, DX
 726	MOVQ  DX, R10
 727	MOVQ  8(BP), AX
 728	MULQ  R11
 729	ADDQ  AX, R15
 730	ADCQ  $0x00, DX
 731	IMULQ R12, R8
 732	ADDQ  R10, R15
 733	ADCQ  DX, R8
 734	MOVQ  R13, R10
 735	MOVQ  R14, R11
 736	MOVQ  R15, R12
 737	ANDQ  $0x03, R12
 738	MOVQ  R15, R13
 739	ANDQ  $-4, R13
 740	MOVQ  R8, R14
 741	SHRQ  $0x02, R8, R15
 742	SHRQ  $0x02, R8
 743	ADDQ  R13, R10
 744	ADCQ  R14, R11
 745	ADCQ  $0x00, R12
 746	ADDQ  R15, R10
 747	ADCQ  R8, R11
 748	ADCQ  $0x00, R12
 749	LEAQ  16(R9), R9
 750	CMPQ  CX, $-6
 751	JG    openSSEInternalLoop
 752
 753	// Add in the state
 754	PADDD chacha20Constants<>+0(SB), X0
 755	PADDD chacha20Constants<>+0(SB), X1
 756	PADDD chacha20Constants<>+0(SB), X2
 757	PADDD chacha20Constants<>+0(SB), X12
 758	PADDD 32(BP), X3
 759	PADDD 32(BP), X4
 760	PADDD 32(BP), X5
 761	PADDD 32(BP), X13
 762	PADDD 48(BP), X6
 763	PADDD 48(BP), X7
 764	PADDD 48(BP), X8
 765	PADDD 48(BP), X14
 766	PADDD 80(BP), X9
 767	PADDD 96(BP), X10
 768	PADDD 112(BP), X11
 769	PADDD 128(BP), X15
 770
 771	// Load - xor - store
 772	MOVO  X15, 64(BP)
 773	MOVOU (SI), X15
 774	PXOR  X15, X0
 775	MOVOU X0, (DI)
 776	MOVOU 16(SI), X15
 777	PXOR  X15, X3
 778	MOVOU X3, 16(DI)
 779	MOVOU 32(SI), X15
 780	PXOR  X15, X6
 781	MOVOU X6, 32(DI)
 782	MOVOU 48(SI), X15
 783	PXOR  X15, X9
 784	MOVOU X9, 48(DI)
 785	MOVOU 64(SI), X9
 786	PXOR  X9, X1
 787	MOVOU X1, 64(DI)
 788	MOVOU 80(SI), X9
 789	PXOR  X9, X4
 790	MOVOU X4, 80(DI)
 791	MOVOU 96(SI), X9
 792	PXOR  X9, X7
 793	MOVOU X7, 96(DI)
 794	MOVOU 112(SI), X9
 795	PXOR  X9, X10
 796	MOVOU X10, 112(DI)
 797	MOVOU 128(SI), X9
 798	PXOR  X9, X2
 799	MOVOU X2, 128(DI)
 800	MOVOU 144(SI), X9
 801	PXOR  X9, X5
 802	MOVOU X5, 144(DI)
 803	MOVOU 160(SI), X9
 804	PXOR  X9, X8
 805	MOVOU X8, 160(DI)
 806	MOVOU 176(SI), X9
 807	PXOR  X9, X11
 808	MOVOU X11, 176(DI)
 809	MOVOU 192(SI), X9
 810	PXOR  X9, X12
 811	MOVOU X12, 192(DI)
 812	MOVOU 208(SI), X9
 813	PXOR  X9, X13
 814	MOVOU X13, 208(DI)
 815	MOVOU 224(SI), X9
 816	PXOR  X9, X14
 817	MOVOU X14, 224(DI)
 818	MOVOU 240(SI), X9
 819	PXOR  64(BP), X9
 820	MOVOU X9, 240(DI)
 821	LEAQ  256(SI), SI
 822	LEAQ  256(DI), DI
 823	SUBQ  $0x00000100, BX
 824	JMP   openSSEMainLoop
 825
 826openSSEMainLoopDone:
 827	// Handle the various tail sizes efficiently
 828	TESTQ BX, BX
 829	JE    openSSEFinalize
 830	CMPQ  BX, $0x40
 831	JBE   openSSETail64
 832	CMPQ  BX, $0x80
 833	JBE   openSSETail128
 834	CMPQ  BX, $0xc0
 835	JBE   openSSETail192
 836	JMP   openSSETail256
 837
 838openSSEFinalize:
 839	// Hash in the PT, AAD lengths
 840	ADDQ  ad_len+80(FP), R10
 841	ADCQ  src_len+56(FP), R11
 842	ADCQ  $0x01, R12
 843	MOVQ  (BP), AX
 844	MOVQ  AX, R15
 845	MULQ  R10
 846	MOVQ  AX, R13
 847	MOVQ  DX, R14
 848	MOVQ  (BP), AX
 849	MULQ  R11
 850	IMULQ R12, R15
 851	ADDQ  AX, R14
 852	ADCQ  DX, R15
 853	MOVQ  8(BP), AX
 854	MOVQ  AX, R8
 855	MULQ  R10
 856	ADDQ  AX, R14
 857	ADCQ  $0x00, DX
 858	MOVQ  DX, R10
 859	MOVQ  8(BP), AX
 860	MULQ  R11
 861	ADDQ  AX, R15
 862	ADCQ  $0x00, DX
 863	IMULQ R12, R8
 864	ADDQ  R10, R15
 865	ADCQ  DX, R8
 866	MOVQ  R13, R10
 867	MOVQ  R14, R11
 868	MOVQ  R15, R12
 869	ANDQ  $0x03, R12
 870	MOVQ  R15, R13
 871	ANDQ  $-4, R13
 872	MOVQ  R8, R14
 873	SHRQ  $0x02, R8, R15
 874	SHRQ  $0x02, R8
 875	ADDQ  R13, R10
 876	ADCQ  R14, R11
 877	ADCQ  $0x00, R12
 878	ADDQ  R15, R10
 879	ADCQ  R8, R11
 880	ADCQ  $0x00, R12
 881
 882	// Final reduce
 883	MOVQ    R10, R13
 884	MOVQ    R11, R14
 885	MOVQ    R12, R15
 886	SUBQ    $-5, R10
 887	SBBQ    $-1, R11
 888	SBBQ    $0x03, R12
 889	CMOVQCS R13, R10
 890	CMOVQCS R14, R11
 891	CMOVQCS R15, R12
 892
 893	// Add in the "s" part of the key
 894	ADDQ 16(BP), R10
 895	ADCQ 24(BP), R11
 896
 897	// Finally, constant time compare to the tag at the end of the message
 898	XORQ    AX, AX
 899	MOVQ    $0x00000001, DX
 900	XORQ    (SI), R10
 901	XORQ    8(SI), R11
 902	ORQ     R11, R10
 903	CMOVQEQ DX, AX
 904
 905	// Return true iff tags are equal
 906	MOVB AX, ret+96(FP)
 907	RET
 908
 909openSSE128:
 910	MOVOU chacha20Constants<>+0(SB), X0
 911	MOVOU 16(R8), X3
 912	MOVOU 32(R8), X6
 913	MOVOU 48(R8), X9
 914	MOVO  X0, X1
 915	MOVO  X3, X4
 916	MOVO  X6, X7
 917	MOVO  X9, X10
 918	PADDL sseIncMask<>+0(SB), X10
 919	MOVO  X1, X2
 920	MOVO  X4, X5
 921	MOVO  X7, X8
 922	MOVO  X10, X11
 923	PADDL sseIncMask<>+0(SB), X11
 924	MOVO  X3, X13
 925	MOVO  X6, X14
 926	MOVO  X10, X15
 927	MOVQ  $0x0000000a, R9
 928
 929openSSE128InnerCipherLoop:
 930	PADDD X3, X0
 931	PXOR  X0, X9
 932	ROL16(X9, X12)
 933	PADDD X9, X6
 934	PXOR  X6, X3
 935	MOVO  X3, X12
 936	PSLLL $0x0c, X12
 937	PSRLL $0x14, X3
 938	PXOR  X12, X3
 939	PADDD X3, X0
 940	PXOR  X0, X9
 941	ROL8(X9, X12)
 942	PADDD X9, X6
 943	PXOR  X6, X3
 944	MOVO  X3, X12
 945	PSLLL $0x07, X12
 946	PSRLL $0x19, X3
 947	PXOR  X12, X3
 948	PADDD X4, X1
 949	PXOR  X1, X10
 950	ROL16(X10, X12)
 951	PADDD X10, X7
 952	PXOR  X7, X4
 953	MOVO  X4, X12
 954	PSLLL $0x0c, X12
 955	PSRLL $0x14, X4
 956	PXOR  X12, X4
 957	PADDD X4, X1
 958	PXOR  X1, X10
 959	ROL8(X10, X12)
 960	PADDD X10, X7
 961	PXOR  X7, X4
 962	MOVO  X4, X12
 963	PSLLL $0x07, X12
 964	PSRLL $0x19, X4
 965	PXOR  X12, X4
 966	PADDD X5, X2
 967	PXOR  X2, X11
 968	ROL16(X11, X12)
 969	PADDD X11, X8
 970	PXOR  X8, X5
 971	MOVO  X5, X12
 972	PSLLL $0x0c, X12
 973	PSRLL $0x14, X5
 974	PXOR  X12, X5
 975	PADDD X5, X2
 976	PXOR  X2, X11
 977	ROL8(X11, X12)
 978	PADDD X11, X8
 979	PXOR  X8, X5
 980	MOVO  X5, X12
 981	PSLLL $0x07, X12
 982	PSRLL $0x19, X5
 983	PXOR  X12, X5
 984	BYTE  $0x66
 985	BYTE  $0x0f
 986	BYTE  $0x3a
 987	BYTE  $0x0f
 988	BYTE  $0xdb
 989	BYTE  $0x04
 990	BYTE  $0x66
 991	BYTE  $0x0f
 992	BYTE  $0x3a
 993	BYTE  $0x0f
 994	BYTE  $0xe4
 995	BYTE  $0x04
 996	BYTE  $0x66
 997	BYTE  $0x0f
 998	BYTE  $0x3a
 999	BYTE  $0x0f
1000	BYTE  $0xed
1001	BYTE  $0x04
1002	BYTE  $0x66
1003	BYTE  $0x0f
1004	BYTE  $0x3a
1005	BYTE  $0x0f
1006	BYTE  $0xf6
1007	BYTE  $0x08
1008	BYTE  $0x66
1009	BYTE  $0x0f
1010	BYTE  $0x3a
1011	BYTE  $0x0f
1012	BYTE  $0xff
1013	BYTE  $0x08
1014	BYTE  $0x66
1015	BYTE  $0x45
1016	BYTE  $0x0f
1017	BYTE  $0x3a
1018	BYTE  $0x0f
1019	BYTE  $0xc0
1020	BYTE  $0x08
1021	BYTE  $0x66
1022	BYTE  $0x45
1023	BYTE  $0x0f
1024	BYTE  $0x3a
1025	BYTE  $0x0f
1026	BYTE  $0xc9
1027	BYTE  $0x0c
1028	BYTE  $0x66
1029	BYTE  $0x45
1030	BYTE  $0x0f
1031	BYTE  $0x3a
1032	BYTE  $0x0f
1033	BYTE  $0xd2
1034	BYTE  $0x0c
1035	BYTE  $0x66
1036	BYTE  $0x45
1037	BYTE  $0x0f
1038	BYTE  $0x3a
1039	BYTE  $0x0f
1040	BYTE  $0xdb
1041	BYTE  $0x0c
1042	PADDD X3, X0
1043	PXOR  X0, X9
1044	ROL16(X9, X12)
1045	PADDD X9, X6
1046	PXOR  X6, X3
1047	MOVO  X3, X12
1048	PSLLL $0x0c, X12
1049	PSRLL $0x14, X3
1050	PXOR  X12, X3
1051	PADDD X3, X0
1052	PXOR  X0, X9
1053	ROL8(X9, X12)
1054	PADDD X9, X6
1055	PXOR  X6, X3
1056	MOVO  X3, X12
1057	PSLLL $0x07, X12
1058	PSRLL $0x19, X3
1059	PXOR  X12, X3
1060	PADDD X4, X1
1061	PXOR  X1, X10
1062	ROL16(X10, X12)
1063	PADDD X10, X7
1064	PXOR  X7, X4
1065	MOVO  X4, X12
1066	PSLLL $0x0c, X12
1067	PSRLL $0x14, X4
1068	PXOR  X12, X4
1069	PADDD X4, X1
1070	PXOR  X1, X10
1071	ROL8(X10, X12)
1072	PADDD X10, X7
1073	PXOR  X7, X4
1074	MOVO  X4, X12
1075	PSLLL $0x07, X12
1076	PSRLL $0x19, X4
1077	PXOR  X12, X4
1078	PADDD X5, X2
1079	PXOR  X2, X11
1080	ROL16(X11, X12)
1081	PADDD X11, X8
1082	PXOR  X8, X5
1083	MOVO  X5, X12
1084	PSLLL $0x0c, X12
1085	PSRLL $0x14, X5
1086	PXOR  X12, X5
1087	PADDD X5, X2
1088	PXOR  X2, X11
1089	ROL8(X11, X12)
1090	PADDD X11, X8
1091	PXOR  X8, X5
1092	MOVO  X5, X12
1093	PSLLL $0x07, X12
1094	PSRLL $0x19, X5
1095	PXOR  X12, X5
1096	BYTE  $0x66
1097	BYTE  $0x0f
1098	BYTE  $0x3a
1099	BYTE  $0x0f
1100	BYTE  $0xdb
1101	BYTE  $0x0c
1102	BYTE  $0x66
1103	BYTE  $0x0f
1104	BYTE  $0x3a
1105	BYTE  $0x0f
1106	BYTE  $0xe4
1107	BYTE  $0x0c
1108	BYTE  $0x66
1109	BYTE  $0x0f
1110	BYTE  $0x3a
1111	BYTE  $0x0f
1112	BYTE  $0xed
1113	BYTE  $0x0c
1114	BYTE  $0x66
1115	BYTE  $0x0f
1116	BYTE  $0x3a
1117	BYTE  $0x0f
1118	BYTE  $0xf6
1119	BYTE  $0x08
1120	BYTE  $0x66
1121	BYTE  $0x0f
1122	BYTE  $0x3a
1123	BYTE  $0x0f
1124	BYTE  $0xff
1125	BYTE  $0x08
1126	BYTE  $0x66
1127	BYTE  $0x45
1128	BYTE  $0x0f
1129	BYTE  $0x3a
1130	BYTE  $0x0f
1131	BYTE  $0xc0
1132	BYTE  $0x08
1133	BYTE  $0x66
1134	BYTE  $0x45
1135	BYTE  $0x0f
1136	BYTE  $0x3a
1137	BYTE  $0x0f
1138	BYTE  $0xc9
1139	BYTE  $0x04
1140	BYTE  $0x66
1141	BYTE  $0x45
1142	BYTE  $0x0f
1143	BYTE  $0x3a
1144	BYTE  $0x0f
1145	BYTE  $0xd2
1146	BYTE  $0x04
1147	BYTE  $0x66
1148	BYTE  $0x45
1149	BYTE  $0x0f
1150	BYTE  $0x3a
1151	BYTE  $0x0f
1152	BYTE  $0xdb
1153	BYTE  $0x04
1154	DECQ  R9
1155	JNE   openSSE128InnerCipherLoop
1156
1157	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
1158	PADDL chacha20Constants<>+0(SB), X0
1159	PADDL chacha20Constants<>+0(SB), X1
1160	PADDL chacha20Constants<>+0(SB), X2
1161	PADDL X13, X3
1162	PADDL X13, X4
1163	PADDL X13, X5
1164	PADDL X14, X7
1165	PADDL X14, X8
1166	PADDL X15, X10
1167	PADDL sseIncMask<>+0(SB), X15
1168	PADDL X15, X11
1169
1170	// Clamp and store the key
1171	PAND  polyClampMask<>+0(SB), X0
1172	MOVOU X0, (BP)
1173	MOVOU X3, 16(BP)
1174
1175	// Hash
1176	MOVQ ad_len+80(FP), R9
1177	CALL polyHashADInternal<>(SB)
1178
1179openSSE128Open:
1180	CMPQ BX, $0x10
1181	JB   openSSETail16
1182	SUBQ $0x10, BX
1183
1184	// Load for hashing
1185	ADDQ (SI), R10
1186	ADCQ 8(SI), R11
1187	ADCQ $0x01, R12
1188
1189	// Load for decryption
1190	MOVOU (SI), X12
1191	PXOR  X12, X1
1192	MOVOU X1, (DI)
1193	LEAQ  16(SI), SI
1194	LEAQ  16(DI), DI
1195	MOVQ  (BP), AX
1196	MOVQ  AX, R15
1197	MULQ  R10
1198	MOVQ  AX, R13
1199	MOVQ  DX, R14
1200	MOVQ  (BP), AX
1201	MULQ  R11
1202	IMULQ R12, R15
1203	ADDQ  AX, R14
1204	ADCQ  DX, R15
1205	MOVQ  8(BP), AX
1206	MOVQ  AX, R8
1207	MULQ  R10
1208	ADDQ  AX, R14
1209	ADCQ  $0x00, DX
1210	MOVQ  DX, R10
1211	MOVQ  8(BP), AX
1212	MULQ  R11
1213	ADDQ  AX, R15
1214	ADCQ  $0x00, DX
1215	IMULQ R12, R8
1216	ADDQ  R10, R15
1217	ADCQ  DX, R8
1218	MOVQ  R13, R10
1219	MOVQ  R14, R11
1220	MOVQ  R15, R12
1221	ANDQ  $0x03, R12
1222	MOVQ  R15, R13
1223	ANDQ  $-4, R13
1224	MOVQ  R8, R14
1225	SHRQ  $0x02, R8, R15
1226	SHRQ  $0x02, R8
1227	ADDQ  R13, R10
1228	ADCQ  R14, R11
1229	ADCQ  $0x00, R12
1230	ADDQ  R15, R10
1231	ADCQ  R8, R11
1232	ADCQ  $0x00, R12
1233
1234	// Shift the stream "left"
1235	MOVO X4, X1
1236	MOVO X7, X4
1237	MOVO X10, X7
1238	MOVO X2, X10
1239	MOVO X5, X2
1240	MOVO X8, X5
1241	MOVO X11, X8
1242	JMP  openSSE128Open
1243
1244openSSETail16:
1245	TESTQ BX, BX
1246	JE    openSSEFinalize
1247
1248	// We can safely load the CT from the end, because it is padded with the MAC
1249	MOVQ  BX, R9
1250	SHLQ  $0x04, R9
1251	LEAQ  andMask<>+0(SB), R13
1252	MOVOU (SI), X12
1253	ADDQ  BX, SI
1254	PAND  -16(R13)(R9*1), X12
1255	MOVO  X12, 64(BP)
1256	MOVQ  X12, R13
1257	MOVQ  72(BP), R14
1258	PXOR  X1, X12
1259
1260	// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
1261openSSETail16Store:
1262	MOVQ   X12, R8
1263	MOVB   R8, (DI)
1264	PSRLDQ $0x01, X12
1265	INCQ   DI
1266	DECQ   BX
1267	JNE    openSSETail16Store
1268	ADDQ   R13, R10
1269	ADCQ   R14, R11
1270	ADCQ   $0x01, R12
1271	MOVQ   (BP), AX
1272	MOVQ   AX, R15
1273	MULQ   R10
1274	MOVQ   AX, R13
1275	MOVQ   DX, R14
1276	MOVQ   (BP), AX
1277	MULQ   R11
1278	IMULQ  R12, R15
1279	ADDQ   AX, R14
1280	ADCQ   DX, R15
1281	MOVQ   8(BP), AX
1282	MOVQ   AX, R8
1283	MULQ   R10
1284	ADDQ   AX, R14
1285	ADCQ   $0x00, DX
1286	MOVQ   DX, R10
1287	MOVQ   8(BP), AX
1288	MULQ   R11
1289	ADDQ   AX, R15
1290	ADCQ   $0x00, DX
1291	IMULQ  R12, R8
1292	ADDQ   R10, R15
1293	ADCQ   DX, R8
1294	MOVQ   R13, R10
1295	MOVQ   R14, R11
1296	MOVQ   R15, R12
1297	ANDQ   $0x03, R12
1298	MOVQ   R15, R13
1299	ANDQ   $-4, R13
1300	MOVQ   R8, R14
1301	SHRQ   $0x02, R8, R15
1302	SHRQ   $0x02, R8
1303	ADDQ   R13, R10
1304	ADCQ   R14, R11
1305	ADCQ   $0x00, R12
1306	ADDQ   R15, R10
1307	ADCQ   R8, R11
1308	ADCQ   $0x00, R12
1309	JMP    openSSEFinalize
1310
1311openSSETail64:
1312	MOVO  chacha20Constants<>+0(SB), X0
1313	MOVO  32(BP), X3
1314	MOVO  48(BP), X6
1315	MOVO  128(BP), X9
1316	PADDL sseIncMask<>+0(SB), X9
1317	MOVO  X9, 80(BP)
1318	XORQ  R9, R9
1319	MOVQ  BX, CX
1320	CMPQ  CX, $0x10
1321	JB    openSSETail64LoopB
1322
1323openSSETail64LoopA:
1324	ADDQ  (SI)(R9*1), R10
1325	ADCQ  8(SI)(R9*1), R11
1326	ADCQ  $0x01, R12
1327	MOVQ  (BP), AX
1328	MOVQ  AX, R15
1329	MULQ  R10
1330	MOVQ  AX, R13
1331	MOVQ  DX, R14
1332	MOVQ  (BP), AX
1333	MULQ  R11
1334	IMULQ R12, R15
1335	ADDQ  AX, R14
1336	ADCQ  DX, R15
1337	MOVQ  8(BP), AX
1338	MOVQ  AX, R8
1339	MULQ  R10
1340	ADDQ  AX, R14
1341	ADCQ  $0x00, DX
1342	MOVQ  DX, R10
1343	MOVQ  8(BP), AX
1344	MULQ  R11
1345	ADDQ  AX, R15
1346	ADCQ  $0x00, DX
1347	IMULQ R12, R8
1348	ADDQ  R10, R15
1349	ADCQ  DX, R8
1350	MOVQ  R13, R10
1351	MOVQ  R14, R11
1352	MOVQ  R15, R12
1353	ANDQ  $0x03, R12
1354	MOVQ  R15, R13
1355	ANDQ  $-4, R13
1356	MOVQ  R8, R14
1357	SHRQ  $0x02, R8, R15
1358	SHRQ  $0x02, R8
1359	ADDQ  R13, R10
1360	ADCQ  R14, R11
1361	ADCQ  $0x00, R12
1362	ADDQ  R15, R10
1363	ADCQ  R8, R11
1364	ADCQ  $0x00, R12
1365	SUBQ  $0x10, CX
1366
1367openSSETail64LoopB:
1368	ADDQ  $0x10, R9
1369	PADDD X3, X0
1370	PXOR  X0, X9
1371	ROL16(X9, X12)
1372	PADDD X9, X6
1373	PXOR  X6, X3
1374	MOVO  X3, X12
1375	PSLLL $0x0c, X12
1376	PSRLL $0x14, X3
1377	PXOR  X12, X3
1378	PADDD X3, X0
1379	PXOR  X0, X9
1380	ROL8(X9, X12)
1381	PADDD X9, X6
1382	PXOR  X6, X3
1383	MOVO  X3, X12
1384	PSLLL $0x07, X12
1385	PSRLL $0x19, X3
1386	PXOR  X12, X3
1387	BYTE  $0x66
1388	BYTE  $0x0f
1389	BYTE  $0x3a
1390	BYTE  $0x0f
1391	BYTE  $0xdb
1392	BYTE  $0x04
1393	BYTE  $0x66
1394	BYTE  $0x0f
1395	BYTE  $0x3a
1396	BYTE  $0x0f
1397	BYTE  $0xf6
1398	BYTE  $0x08
1399	BYTE  $0x66
1400	BYTE  $0x45
1401	BYTE  $0x0f
1402	BYTE  $0x3a
1403	BYTE  $0x0f
1404	BYTE  $0xc9
1405	BYTE  $0x0c
1406	PADDD X3, X0
1407	PXOR  X0, X9
1408	ROL16(X9, X12)
1409	PADDD X9, X6
1410	PXOR  X6, X3
1411	MOVO  X3, X12
1412	PSLLL $0x0c, X12
1413	PSRLL $0x14, X3
1414	PXOR  X12, X3
1415	PADDD X3, X0
1416	PXOR  X0, X9
1417	ROL8(X9, X12)
1418	PADDD X9, X6
1419	PXOR  X6, X3
1420	MOVO  X3, X12
1421	PSLLL $0x07, X12
1422	PSRLL $0x19, X3
1423	PXOR  X12, X3
1424	BYTE  $0x66
1425	BYTE  $0x0f
1426	BYTE  $0x3a
1427	BYTE  $0x0f
1428	BYTE  $0xdb
1429	BYTE  $0x0c
1430	BYTE  $0x66
1431	BYTE  $0x0f
1432	BYTE  $0x3a
1433	BYTE  $0x0f
1434	BYTE  $0xf6
1435	BYTE  $0x08
1436	BYTE  $0x66
1437	BYTE  $0x45
1438	BYTE  $0x0f
1439	BYTE  $0x3a
1440	BYTE  $0x0f
1441	BYTE  $0xc9
1442	BYTE  $0x04
1443	CMPQ  CX, $0x10
1444	JAE   openSSETail64LoopA
1445	CMPQ  R9, $0xa0
1446	JNE   openSSETail64LoopB
1447	PADDL chacha20Constants<>+0(SB), X0
1448	PADDL 32(BP), X3
1449	PADDL 48(BP), X6
1450	PADDL 80(BP), X9
1451
1452openSSETail64DecLoop:
1453	CMPQ  BX, $0x10
1454	JB    openSSETail64DecLoopDone
1455	SUBQ  $0x10, BX
1456	MOVOU (SI), X12
1457	PXOR  X12, X0
1458	MOVOU X0, (DI)
1459	LEAQ  16(SI), SI
1460	LEAQ  16(DI), DI
1461	MOVO  X3, X0
1462	MOVO  X6, X3
1463	MOVO  X9, X6
1464	JMP   openSSETail64DecLoop
1465
1466openSSETail64DecLoopDone:
1467	MOVO X0, X1
1468	JMP  openSSETail16
1469
1470openSSETail128:
1471	MOVO  chacha20Constants<>+0(SB), X1
1472	MOVO  32(BP), X4
1473	MOVO  48(BP), X7
1474	MOVO  128(BP), X10
1475	PADDL sseIncMask<>+0(SB), X10
1476	MOVO  X10, 80(BP)
1477	MOVO  X1, X0
1478	MOVO  X4, X3
1479	MOVO  X7, X6
1480	MOVO  X10, X9
1481	PADDL sseIncMask<>+0(SB), X9
1482	MOVO  X9, 96(BP)
1483	XORQ  R9, R9
1484	MOVQ  BX, CX
1485	ANDQ  $-16, CX
1486
1487openSSETail128LoopA:
1488	ADDQ  (SI)(R9*1), R10
1489	ADCQ  8(SI)(R9*1), R11
1490	ADCQ  $0x01, R12
1491	MOVQ  (BP), AX
1492	MOVQ  AX, R15
1493	MULQ  R10
1494	MOVQ  AX, R13
1495	MOVQ  DX, R14
1496	MOVQ  (BP), AX
1497	MULQ  R11
1498	IMULQ R12, R15
1499	ADDQ  AX, R14
1500	ADCQ  DX, R15
1501	MOVQ  8(BP), AX
1502	MOVQ  AX, R8
1503	MULQ  R10
1504	ADDQ  AX, R14
1505	ADCQ  $0x00, DX
1506	MOVQ  DX, R10
1507	MOVQ  8(BP), AX
1508	MULQ  R11
1509	ADDQ  AX, R15
1510	ADCQ  $0x00, DX
1511	IMULQ R12, R8
1512	ADDQ  R10, R15
1513	ADCQ  DX, R8
1514	MOVQ  R13, R10
1515	MOVQ  R14, R11
1516	MOVQ  R15, R12
1517	ANDQ  $0x03, R12
1518	MOVQ  R15, R13
1519	ANDQ  $-4, R13
1520	MOVQ  R8, R14
1521	SHRQ  $0x02, R8, R15
1522	SHRQ  $0x02, R8
1523	ADDQ  R13, R10
1524	ADCQ  R14, R11
1525	ADCQ  $0x00, R12
1526	ADDQ  R15, R10
1527	ADCQ  R8, R11
1528	ADCQ  $0x00, R12
1529
1530openSSETail128LoopB:
1531	ADDQ  $0x10, R9
1532	PADDD X3, X0
1533	PXOR  X0, X9
1534	ROL16(X9, X12)
1535	PADDD X9, X6
1536	PXOR  X6, X3
1537	MOVO  X3, X12
1538	PSLLL $0x0c, X12
1539	PSRLL $0x14, X3
1540	PXOR  X12, X3
1541	PADDD X3, X0
1542	PXOR  X0, X9
1543	ROL8(X9, X12)
1544	PADDD X9, X6
1545	PXOR  X6, X3
1546	MOVO  X3, X12
1547	PSLLL $0x07, X12
1548	PSRLL $0x19, X3
1549	PXOR  X12, X3
1550	PADDD X4, X1
1551	PXOR  X1, X10
1552	ROL16(X10, X12)
1553	PADDD X10, X7
1554	PXOR  X7, X4
1555	MOVO  X4, X12
1556	PSLLL $0x0c, X12
1557	PSRLL $0x14, X4
1558	PXOR  X12, X4
1559	PADDD X4, X1
1560	PXOR  X1, X10
1561	ROL8(X10, X12)
1562	PADDD X10, X7
1563	PXOR  X7, X4
1564	MOVO  X4, X12
1565	PSLLL $0x07, X12
1566	PSRLL $0x19, X4
1567	PXOR  X12, X4
1568	BYTE  $0x66
1569	BYTE  $0x0f
1570	BYTE  $0x3a
1571	BYTE  $0x0f
1572	BYTE  $0xdb
1573	BYTE  $0x04
1574	BYTE  $0x66
1575	BYTE  $0x0f
1576	BYTE  $0x3a
1577	BYTE  $0x0f
1578	BYTE  $0xf6
1579	BYTE  $0x08
1580	BYTE  $0x66
1581	BYTE  $0x45
1582	BYTE  $0x0f
1583	BYTE  $0x3a
1584	BYTE  $0x0f
1585	BYTE  $0xc9
1586	BYTE  $0x0c
1587	BYTE  $0x66
1588	BYTE  $0x0f
1589	BYTE  $0x3a
1590	BYTE  $0x0f
1591	BYTE  $0xe4
1592	BYTE  $0x04
1593	BYTE  $0x66
1594	BYTE  $0x0f
1595	BYTE  $0x3a
1596	BYTE  $0x0f
1597	BYTE  $0xff
1598	BYTE  $0x08
1599	BYTE  $0x66
1600	BYTE  $0x45
1601	BYTE  $0x0f
1602	BYTE  $0x3a
1603	BYTE  $0x0f
1604	BYTE  $0xd2
1605	BYTE  $0x0c
1606	PADDD X3, X0
1607	PXOR  X0, X9
1608	ROL16(X9, X12)
1609	PADDD X9, X6
1610	PXOR  X6, X3
1611	MOVO  X3, X12
1612	PSLLL $0x0c, X12
1613	PSRLL $0x14, X3
1614	PXOR  X12, X3
1615	PADDD X3, X0
1616	PXOR  X0, X9
1617	ROL8(X9, X12)
1618	PADDD X9, X6
1619	PXOR  X6, X3
1620	MOVO  X3, X12
1621	PSLLL $0x07, X12
1622	PSRLL $0x19, X3
1623	PXOR  X12, X3
1624	PADDD X4, X1
1625	PXOR  X1, X10
1626	ROL16(X10, X12)
1627	PADDD X10, X7
1628	PXOR  X7, X4
1629	MOVO  X4, X12
1630	PSLLL $0x0c, X12
1631	PSRLL $0x14, X4
1632	PXOR  X12, X4
1633	PADDD X4, X1
1634	PXOR  X1, X10
1635	ROL8(X10, X12)
1636	PADDD X10, X7
1637	PXOR  X7, X4
1638	MOVO  X4, X12
1639	PSLLL $0x07, X12
1640	PSRLL $0x19, X4
1641	PXOR  X12, X4
1642	BYTE  $0x66
1643	BYTE  $0x0f
1644	BYTE  $0x3a
1645	BYTE  $0x0f
1646	BYTE  $0xdb
1647	BYTE  $0x0c
1648	BYTE  $0x66
1649	BYTE  $0x0f
1650	BYTE  $0x3a
1651	BYTE  $0x0f
1652	BYTE  $0xf6
1653	BYTE  $0x08
1654	BYTE  $0x66
1655	BYTE  $0x45
1656	BYTE  $0x0f
1657	BYTE  $0x3a
1658	BYTE  $0x0f
1659	BYTE  $0xc9
1660	BYTE  $0x04
1661	BYTE  $0x66
1662	BYTE  $0x0f
1663	BYTE  $0x3a
1664	BYTE  $0x0f
1665	BYTE  $0xe4
1666	BYTE  $0x0c
1667	BYTE  $0x66
1668	BYTE  $0x0f
1669	BYTE  $0x3a
1670	BYTE  $0x0f
1671	BYTE  $0xff
1672	BYTE  $0x08
1673	BYTE  $0x66
1674	BYTE  $0x45
1675	BYTE  $0x0f
1676	BYTE  $0x3a
1677	BYTE  $0x0f
1678	BYTE  $0xd2
1679	BYTE  $0x04
1680	CMPQ  R9, CX
1681	JB    openSSETail128LoopA
1682	CMPQ  R9, $0xa0
1683	JNE   openSSETail128LoopB
1684	PADDL chacha20Constants<>+0(SB), X0
1685	PADDL chacha20Constants<>+0(SB), X1
1686	PADDL 32(BP), X3
1687	PADDL 32(BP), X4
1688	PADDL 48(BP), X6
1689	PADDL 48(BP), X7
1690	PADDL 96(BP), X9
1691	PADDL 80(BP), X10
1692	MOVOU (SI), X12
1693	MOVOU 16(SI), X13
1694	MOVOU 32(SI), X14
1695	MOVOU 48(SI), X15
1696	PXOR  X12, X1
1697	PXOR  X13, X4
1698	PXOR  X14, X7
1699	PXOR  X15, X10
1700	MOVOU X1, (DI)
1701	MOVOU X4, 16(DI)
1702	MOVOU X7, 32(DI)
1703	MOVOU X10, 48(DI)
1704	SUBQ  $0x40, BX
1705	LEAQ  64(SI), SI
1706	LEAQ  64(DI), DI
1707	JMP   openSSETail64DecLoop
1708
1709openSSETail192:
1710	MOVO    chacha20Constants<>+0(SB), X2
1711	MOVO    32(BP), X5
1712	MOVO    48(BP), X8
1713	MOVO    128(BP), X11
1714	PADDL   sseIncMask<>+0(SB), X11
1715	MOVO    X11, 80(BP)
1716	MOVO    X2, X1
1717	MOVO    X5, X4
1718	MOVO    X8, X7
1719	MOVO    X11, X10
1720	PADDL   sseIncMask<>+0(SB), X10
1721	MOVO    X10, 96(BP)
1722	MOVO    X1, X0
1723	MOVO    X4, X3
1724	MOVO    X7, X6
1725	MOVO    X10, X9
1726	PADDL   sseIncMask<>+0(SB), X9
1727	MOVO    X9, 112(BP)
1728	MOVQ    BX, CX
1729	MOVQ    $0x000000a0, R9
1730	CMPQ    CX, $0xa0
1731	CMOVQGT R9, CX
1732	ANDQ    $-16, CX
1733	XORQ    R9, R9
1734
1735openSSLTail192LoopA:
1736	ADDQ  (SI)(R9*1), R10
1737	ADCQ  8(SI)(R9*1), R11
1738	ADCQ  $0x01, R12
1739	MOVQ  (BP), AX
1740	MOVQ  AX, R15
1741	MULQ  R10
1742	MOVQ  AX, R13
1743	MOVQ  DX, R14
1744	MOVQ  (BP), AX
1745	MULQ  R11
1746	IMULQ R12, R15
1747	ADDQ  AX, R14
1748	ADCQ  DX, R15
1749	MOVQ  8(BP), AX
1750	MOVQ  AX, R8
1751	MULQ  R10
1752	ADDQ  AX, R14
1753	ADCQ  $0x00, DX
1754	MOVQ  DX, R10
1755	MOVQ  8(BP), AX
1756	MULQ  R11
1757	ADDQ  AX, R15
1758	ADCQ  $0x00, DX
1759	IMULQ R12, R8
1760	ADDQ  R10, R15
1761	ADCQ  DX, R8
1762	MOVQ  R13, R10
1763	MOVQ  R14, R11
1764	MOVQ  R15, R12
1765	ANDQ  $0x03, R12
1766	MOVQ  R15, R13
1767	ANDQ  $-4, R13
1768	MOVQ  R8, R14
1769	SHRQ  $0x02, R8, R15
1770	SHRQ  $0x02, R8
1771	ADDQ  R13, R10
1772	ADCQ  R14, R11
1773	ADCQ  $0x00, R12
1774	ADDQ  R15, R10
1775	ADCQ  R8, R11
1776	ADCQ  $0x00, R12
1777
1778openSSLTail192LoopB:
1779	ADDQ  $0x10, R9
1780	PADDD X3, X0
1781	PXOR  X0, X9
1782	ROL16(X9, X12)
1783	PADDD X9, X6
1784	PXOR  X6, X3
1785	MOVO  X3, X12
1786	PSLLL $0x0c, X12
1787	PSRLL $0x14, X3
1788	PXOR  X12, X3
1789	PADDD X3, X0
1790	PXOR  X0, X9
1791	ROL8(X9, X12)
1792	PADDD X9, X6
1793	PXOR  X6, X3
1794	MOVO  X3, X12
1795	PSLLL $0x07, X12
1796	PSRLL $0x19, X3
1797	PXOR  X12, X3
1798	PADDD X4, X1
1799	PXOR  X1, X10
1800	ROL16(X10, X12)
1801	PADDD X10, X7
1802	PXOR  X7, X4
1803	MOVO  X4, X12
1804	PSLLL $0x0c, X12
1805	PSRLL $0x14, X4
1806	PXOR  X12, X4
1807	PADDD X4, X1
1808	PXOR  X1, X10
1809	ROL8(X10, X12)
1810	PADDD X10, X7
1811	PXOR  X7, X4
1812	MOVO  X4, X12
1813	PSLLL $0x07, X12
1814	PSRLL $0x19, X4
1815	PXOR  X12, X4
1816	PADDD X5, X2
1817	PXOR  X2, X11
1818	ROL16(X11, X12)
1819	PADDD X11, X8
1820	PXOR  X8, X5
1821	MOVO  X5, X12
1822	PSLLL $0x0c, X12
1823	PSRLL $0x14, X5
1824	PXOR  X12, X5
1825	PADDD X5, X2
1826	PXOR  X2, X11
1827	ROL8(X11, X12)
1828	PADDD X11, X8
1829	PXOR  X8, X5
1830	MOVO  X5, X12
1831	PSLLL $0x07, X12
1832	PSRLL $0x19, X5
1833	PXOR  X12, X5
1834	BYTE  $0x66
1835	BYTE  $0x0f
1836	BYTE  $0x3a
1837	BYTE  $0x0f
1838	BYTE  $0xdb
1839	BYTE  $0x04
1840	BYTE  $0x66
1841	BYTE  $0x0f
1842	BYTE  $0x3a
1843	BYTE  $0x0f
1844	BYTE  $0xf6
1845	BYTE  $0x08
1846	BYTE  $0x66
1847	BYTE  $0x45
1848	BYTE  $0x0f
1849	BYTE  $0x3a
1850	BYTE  $0x0f
1851	BYTE  $0xc9
1852	BYTE  $0x0c
1853	BYTE  $0x66
1854	BYTE  $0x0f
1855	BYTE  $0x3a
1856	BYTE  $0x0f
1857	BYTE  $0xe4
1858	BYTE  $0x04
1859	BYTE  $0x66
1860	BYTE  $0x0f
1861	BYTE  $0x3a
1862	BYTE  $0x0f
1863	BYTE  $0xff
1864	BYTE  $0x08
1865	BYTE  $0x66
1866	BYTE  $0x45
1867	BYTE  $0x0f
1868	BYTE  $0x3a
1869	BYTE  $0x0f
1870	BYTE  $0xd2
1871	BYTE  $0x0c
1872	BYTE  $0x66
1873	BYTE  $0x0f
1874	BYTE  $0x3a
1875	BYTE  $0x0f
1876	BYTE  $0xed
1877	BYTE  $0x04
1878	BYTE  $0x66
1879	BYTE  $0x45
1880	BYTE  $0x0f
1881	BYTE  $0x3a
1882	BYTE  $0x0f
1883	BYTE  $0xc0
1884	BYTE  $0x08
1885	BYTE  $0x66
1886	BYTE  $0x45
1887	BYTE  $0x0f
1888	BYTE  $0x3a
1889	BYTE  $0x0f
1890	BYTE  $0xdb
1891	BYTE  $0x0c
1892	PADDD X3, X0
1893	PXOR  X0, X9
1894	ROL16(X9, X12)
1895	PADDD X9, X6
1896	PXOR  X6, X3
1897	MOVO  X3, X12
1898	PSLLL $0x0c, X12
1899	PSRLL $0x14, X3
1900	PXOR  X12, X3
1901	PADDD X3, X0
1902	PXOR  X0, X9
1903	ROL8(X9, X12)
1904	PADDD X9, X6
1905	PXOR  X6, X3
1906	MOVO  X3, X12
1907	PSLLL $0x07, X12
1908	PSRLL $0x19, X3
1909	PXOR  X12, X3
1910	PADDD X4, X1
1911	PXOR  X1, X10
1912	ROL16(X10, X12)
1913	PADDD X10, X7
1914	PXOR  X7, X4
1915	MOVO  X4, X12
1916	PSLLL $0x0c, X12
1917	PSRLL $0x14, X4
1918	PXOR  X12, X4
1919	PADDD X4, X1
1920	PXOR  X1, X10
1921	ROL8(X10, X12)
1922	PADDD X10, X7
1923	PXOR  X7, X4
1924	MOVO  X4, X12
1925	PSLLL $0x07, X12
1926	PSRLL $0x19, X4
1927	PXOR  X12, X4
1928	PADDD X5, X2
1929	PXOR  X2, X11
1930	ROL16(X11, X12)
1931	PADDD X11, X8
1932	PXOR  X8, X5
1933	MOVO  X5, X12
1934	PSLLL $0x0c, X12
1935	PSRLL $0x14, X5
1936	PXOR  X12, X5
1937	PADDD X5, X2
1938	PXOR  X2, X11
1939	ROL8(X11, X12)
1940	PADDD X11, X8
1941	PXOR  X8, X5
1942	MOVO  X5, X12
1943	PSLLL $0x07, X12
1944	PSRLL $0x19, X5
1945	PXOR  X12, X5
1946	BYTE  $0x66
1947	BYTE  $0x0f
1948	BYTE  $0x3a
1949	BYTE  $0x0f
1950	BYTE  $0xdb
1951	BYTE  $0x0c
1952	BYTE  $0x66
1953	BYTE  $0x0f
1954	BYTE  $0x3a
1955	BYTE  $0x0f
1956	BYTE  $0xf6
1957	BYTE  $0x08
1958	BYTE  $0x66
1959	BYTE  $0x45
1960	BYTE  $0x0f
1961	BYTE  $0x3a
1962	BYTE  $0x0f
1963	BYTE  $0xc9
1964	BYTE  $0x04
1965	BYTE  $0x66
1966	BYTE  $0x0f
1967	BYTE  $0x3a
1968	BYTE  $0x0f
1969	BYTE  $0xe4
1970	BYTE  $0x0c
1971	BYTE  $0x66
1972	BYTE  $0x0f
1973	BYTE  $0x3a
1974	BYTE  $0x0f
1975	BYTE  $0xff
1976	BYTE  $0x08
1977	BYTE  $0x66
1978	BYTE  $0x45
1979	BYTE  $0x0f
1980	BYTE  $0x3a
1981	BYTE  $0x0f
1982	BYTE  $0xd2
1983	BYTE  $0x04
1984	BYTE  $0x66
1985	BYTE  $0x0f
1986	BYTE  $0x3a
1987	BYTE  $0x0f
1988	BYTE  $0xed
1989	BYTE  $0x0c
1990	BYTE  $0x66
1991	BYTE  $0x45
1992	BYTE  $0x0f
1993	BYTE  $0x3a
1994	BYTE  $0x0f
1995	BYTE  $0xc0
1996	BYTE  $0x08
1997	BYTE  $0x66
1998	BYTE  $0x45
1999	BYTE  $0x0f
2000	BYTE  $0x3a
2001	BYTE  $0x0f
2002	BYTE  $0xdb
2003	BYTE  $0x04
2004	CMPQ  R9, CX
2005	JB    openSSLTail192LoopA
2006	CMPQ  R9, $0xa0
2007	JNE   openSSLTail192LoopB
2008	CMPQ  BX, $0xb0
2009	JB    openSSLTail192Store
2010	ADDQ  160(SI), R10
2011	ADCQ  168(SI), R11
2012	ADCQ  $0x01, R12
2013	MOVQ  (BP), AX
2014	MOVQ  AX, R15
2015	MULQ  R10
2016	MOVQ  AX, R13
2017	MOVQ  DX, R14
2018	MOVQ  (BP), AX
2019	MULQ  R11
2020	IMULQ R12, R15
2021	ADDQ  AX, R14
2022	ADCQ  DX, R15
2023	MOVQ  8(BP), AX
2024	MOVQ  AX, R8
2025	MULQ  R10
2026	ADDQ  AX, R14
2027	ADCQ  $0x00, DX
2028	MOVQ  DX, R10
2029	MOVQ  8(BP), AX
2030	MULQ  R11
2031	ADDQ  AX, R15
2032	ADCQ  $0x00, DX
2033	IMULQ R12, R8
2034	ADDQ  R10, R15
2035	ADCQ  DX, R8
2036	MOVQ  R13, R10
2037	MOVQ  R14, R11
2038	MOVQ  R15, R12
2039	ANDQ  $0x03, R12
2040	MOVQ  R15, R13
2041	ANDQ  $-4, R13
2042	MOVQ  R8, R14
2043	SHRQ  $0x02, R8, R15
2044	SHRQ  $0x02, R8
2045	ADDQ  R13, R10
2046	ADCQ  R14, R11
2047	ADCQ  $0x00, R12
2048	ADDQ  R15, R10
2049	ADCQ  R8, R11
2050	ADCQ  $0x00, R12
2051	CMPQ  BX, $0xc0
2052	JB    openSSLTail192Store
2053	ADDQ  176(SI), R10
2054	ADCQ  184(SI), R11
2055	ADCQ  $0x01, R12
2056	MOVQ  (BP), AX
2057	MOVQ  AX, R15
2058	MULQ  R10
2059	MOVQ  AX, R13
2060	MOVQ  DX, R14
2061	MOVQ  (BP), AX
2062	MULQ  R11
2063	IMULQ R12, R15
2064	ADDQ  AX, R14
2065	ADCQ  DX, R15
2066	MOVQ  8(BP), AX
2067	MOVQ  AX, R8
2068	MULQ  R10
2069	ADDQ  AX, R14
2070	ADCQ  $0x00, DX
2071	MOVQ  DX, R10
2072	MOVQ  8(BP), AX
2073	MULQ  R11
2074	ADDQ  AX, R15
2075	ADCQ  $0x00, DX
2076	IMULQ R12, R8
2077	ADDQ  R10, R15
2078	ADCQ  DX, R8
2079	MOVQ  R13, R10
2080	MOVQ  R14, R11
2081	MOVQ  R15, R12
2082	ANDQ  $0x03, R12
2083	MOVQ  R15, R13
2084	ANDQ  $-4, R13
2085	MOVQ  R8, R14
2086	SHRQ  $0x02, R8, R15
2087	SHRQ  $0x02, R8
2088	ADDQ  R13, R10
2089	ADCQ  R14, R11
2090	ADCQ  $0x00, R12
2091	ADDQ  R15, R10
2092	ADCQ  R8, R11
2093	ADCQ  $0x00, R12
2094
2095openSSLTail192Store:
2096	PADDL chacha20Constants<>+0(SB), X0
2097	PADDL chacha20Constants<>+0(SB), X1
2098	PADDL chacha20Constants<>+0(SB), X2
2099	PADDL 32(BP), X3
2100	PADDL 32(BP), X4
2101	PADDL 32(BP), X5
2102	PADDL 48(BP), X6
2103	PADDL 48(BP), X7
2104	PADDL 48(BP), X8
2105	PADDL 112(BP), X9
2106	PADDL 96(BP), X10
2107	PADDL 80(BP), X11
2108	MOVOU (SI), X12
2109	MOVOU 16(SI), X13
2110	MOVOU 32(SI), X14
2111	MOVOU 48(SI), X15
2112	PXOR  X12, X2
2113	PXOR  X13, X5
2114	PXOR  X14, X8
2115	PXOR  X15, X11
2116	MOVOU X2, (DI)
2117	MOVOU X5, 16(DI)
2118	MOVOU X8, 32(DI)
2119	MOVOU X11, 48(DI)
2120	MOVOU 64(SI), X12
2121	MOVOU 80(SI), X13
2122	MOVOU 96(SI), X14
2123	MOVOU 112(SI), X15
2124	PXOR  X12, X1
2125	PXOR  X13, X4
2126	PXOR  X14, X7
2127	PXOR  X15, X10
2128	MOVOU X1, 64(DI)
2129	MOVOU X4, 80(DI)
2130	MOVOU X7, 96(DI)
2131	MOVOU X10, 112(DI)
2132	SUBQ  $0x80, BX
2133	LEAQ  128(SI), SI
2134	LEAQ  128(DI), DI
2135	JMP   openSSETail64DecLoop
2136
2137openSSETail256:
2138	MOVO  chacha20Constants<>+0(SB), X0
2139	MOVO  32(BP), X3
2140	MOVO  48(BP), X6
2141	MOVO  128(BP), X9
2142	PADDL sseIncMask<>+0(SB), X9
2143	MOVO  X0, X1
2144	MOVO  X3, X4
2145	MOVO  X6, X7
2146	MOVO  X9, X10
2147	PADDL sseIncMask<>+0(SB), X10
2148	MOVO  X1, X2
2149	MOVO  X4, X5
2150	MOVO  X7, X8
2151	MOVO  X10, X11
2152	PADDL sseIncMask<>+0(SB), X11
2153	MOVO  X2, X12
2154	MOVO  X5, X13
2155	MOVO  X8, X14
2156	MOVO  X11, X15
2157	PADDL sseIncMask<>+0(SB), X15
2158
2159	// Store counters
2160	MOVO X9, 80(BP)
2161	MOVO X10, 96(BP)
2162	MOVO X11, 112(BP)
2163	MOVO X15, 128(BP)
2164	XORQ R9, R9
2165
2166openSSETail256Loop:
2167	ADDQ  (SI)(R9*1), R10
2168	ADCQ  8(SI)(R9*1), R11
2169	ADCQ  $0x01, R12
2170	MOVO  X14, 64(BP)
2171	PADDD X3, X0
2172	PXOR  X0, X9
2173	ROL16(X9, X14)
2174	PADDD X9, X6
2175	PXOR  X6, X3
2176	MOVO  X3, X14
2177	PSLLL $0x0c, X14
2178	PSRLL $0x14, X3
2179	PXOR  X14, X3
2180	PADDD X3, X0
2181	PXOR  X0, X9
2182	ROL8(X9, X14)
2183	PADDD X9, X6
2184	PXOR  X6, X3
2185	MOVO  X3, X14
2186	PSLLL $0x07, X14
2187	PSRLL $0x19, X3
2188	PXOR  X14, X3
2189	PADDD X4, X1
2190	PXOR  X1, X10
2191	ROL16(X10, X14)
2192	PADDD X10, X7
2193	PXOR  X7, X4
2194	MOVO  X4, X14
2195	PSLLL $0x0c, X14
2196	PSRLL $0x14, X4
2197	PXOR  X14, X4
2198	PADDD X4, X1
2199	PXOR  X1, X10
2200	ROL8(X10, X14)
2201	PADDD X10, X7
2202	PXOR  X7, X4
2203	MOVO  X4, X14
2204	PSLLL $0x07, X14
2205	PSRLL $0x19, X4
2206	PXOR  X14, X4
2207	PADDD X5, X2
2208	PXOR  X2, X11
2209	ROL16(X11, X14)
2210	PADDD X11, X8
2211	PXOR  X8, X5
2212	MOVO  X5, X14
2213	PSLLL $0x0c, X14
2214	PSRLL $0x14, X5
2215	PXOR  X14, X5
2216	PADDD X5, X2
2217	PXOR  X2, X11
2218	ROL8(X11, X14)
2219	PADDD X11, X8
2220	PXOR  X8, X5
2221	MOVO  X5, X14
2222	PSLLL $0x07, X14
2223	PSRLL $0x19, X5
2224	PXOR  X14, X5
2225	MOVO  64(BP), X14
2226	MOVO  X7, 64(BP)
2227	PADDD X13, X12
2228	PXOR  X12, X15
2229	ROL16(X15, X7)
2230	PADDD X15, X14
2231	PXOR  X14, X13
2232	MOVO  X13, X7
2233	PSLLL $0x0c, X7
2234	PSRLL $0x14, X13
2235	PXOR  X7, X13
2236	PADDD X13, X12
2237	PXOR  X12, X15
2238	ROL8(X15, X7)
2239	PADDD X15, X14
2240	PXOR  X14, X13
2241	MOVO  X13, X7
2242	PSLLL $0x07, X7
2243	PSRLL $0x19, X13
2244	PXOR  X7, X13
2245	MOVO  64(BP), X7
2246	BYTE  $0x66
2247	BYTE  $0x0f
2248	BYTE  $0x3a
2249	BYTE  $0x0f
2250	BYTE  $0xdb
2251	BYTE  $0x04
2252	BYTE  $0x66
2253	BYTE  $0x0f
2254	BYTE  $0x3a
2255	BYTE  $0x0f
2256	BYTE  $0xe4
2257	BYTE  $0x04
2258	BYTE  $0x66
2259	BYTE  $0x0f
2260	BYTE  $0x3a
2261	BYTE  $0x0f
2262	BYTE  $0xed
2263	BYTE  $0x04
2264	BYTE  $0x66
2265	BYTE  $0x45
2266	BYTE  $0x0f
2267	BYTE  $0x3a
2268	BYTE  $0x0f
2269	BYTE  $0xed
2270	BYTE  $0x04
2271	BYTE  $0x66
2272	BYTE  $0x0f
2273	BYTE  $0x3a
2274	BYTE  $0x0f
2275	BYTE  $0xf6
2276	BYTE  $0x08
2277	BYTE  $0x66
2278	BYTE  $0x0f
2279	BYTE  $0x3a
2280	BYTE  $0x0f
2281	BYTE  $0xff
2282	BYTE  $0x08
2283	BYTE  $0x66
2284	BYTE  $0x45
2285	BYTE  $0x0f
2286	BYTE  $0x3a
2287	BYTE  $0x0f
2288	BYTE  $0xc0
2289	BYTE  $0x08
2290	BYTE  $0x66
2291	BYTE  $0x45
2292	BYTE  $0x0f
2293	BYTE  $0x3a
2294	BYTE  $0x0f
2295	BYTE  $0xf6
2296	BYTE  $0x08
2297	BYTE  $0x66
2298	BYTE  $0x45
2299	BYTE  $0x0f
2300	BYTE  $0x3a
2301	BYTE  $0x0f
2302	BYTE  $0xc9
2303	BYTE  $0x0c
2304	BYTE  $0x66
2305	BYTE  $0x45
2306	BYTE  $0x0f
2307	BYTE  $0x3a
2308	BYTE  $0x0f
2309	BYTE  $0xd2
2310	BYTE  $0x0c
2311	BYTE  $0x66
2312	BYTE  $0x45
2313	BYTE  $0x0f
2314	BYTE  $0x3a
2315	BYTE  $0x0f
2316	BYTE  $0xdb
2317	BYTE  $0x0c
2318	BYTE  $0x66
2319	BYTE  $0x45
2320	BYTE  $0x0f
2321	BYTE  $0x3a
2322	BYTE  $0x0f
2323	BYTE  $0xff
2324	BYTE  $0x0c
2325	MOVQ  (BP), AX
2326	MOVQ  AX, R15
2327	MULQ  R10
2328	MOVQ  AX, R13
2329	MOVQ  DX, R14
2330	MOVQ  (BP), AX
2331	MULQ  R11
2332	IMULQ R12, R15
2333	ADDQ  AX, R14
2334	ADCQ  DX, R15
2335	MOVQ  8(BP), AX
2336	MOVQ  AX, R8
2337	MULQ  R10
2338	ADDQ  AX, R14
2339	ADCQ  $0x00, DX
2340	MOVQ  DX, R10
2341	MOVQ  8(BP), AX
2342	MULQ  R11
2343	ADDQ  AX, R15
2344	ADCQ  $0x00, DX
2345	MOVO  X14, 64(BP)
2346	PADDD X3, X0
2347	PXOR  X0, X9
2348	ROL16(X9, X14)
2349	PADDD X9, X6
2350	PXOR  X6, X3
2351	MOVO  X3, X14
2352	PSLLL $0x0c, X14
2353	PSRLL $0x14, X3
2354	PXOR  X14, X3
2355	PADDD X3, X0
2356	PXOR  X0, X9
2357	ROL8(X9, X14)
2358	PADDD X9, X6
2359	PXOR  X6, X3
2360	MOVO  X3, X14
2361	PSLLL $0x07, X14
2362	PSRLL $0x19, X3
2363	PXOR  X14, X3
2364	PADDD X4, X1
2365	PXOR  X1, X10
2366	ROL16(X10, X14)
2367	PADDD X10, X7
2368	PXOR  X7, X4
2369	MOVO  X4, X14
2370	PSLLL $0x0c, X14
2371	PSRLL $0x14, X4
2372	PXOR  X14, X4
2373	PADDD X4, X1
2374	PXOR  X1, X10
2375	ROL8(X10, X14)
2376	PADDD X10, X7
2377	PXOR  X7, X4
2378	MOVO  X4, X14
2379	PSLLL $0x07, X14
2380	PSRLL $0x19, X4
2381	PXOR  X14, X4
2382	PADDD X5, X2
2383	PXOR  X2, X11
2384	ROL16(X11, X14)
2385	PADDD X11, X8
2386	PXOR  X8, X5
2387	MOVO  X5, X14
2388	PSLLL $0x0c, X14
2389	PSRLL $0x14, X5
2390	PXOR  X14, X5
2391	PADDD X5, X2
2392	PXOR  X2, X11
2393	ROL8(X11, X14)
2394	PADDD X11, X8
2395	PXOR  X8, X5
2396	MOVO  X5, X14
2397	PSLLL $0x07, X14
2398	PSRLL $0x19, X5
2399	PXOR  X14, X5
2400	MOVO  64(BP), X14
2401	MOVO  X7, 64(BP)
2402	PADDD X13, X12
2403	PXOR  X12, X15
2404	ROL16(X15, X7)
2405	PADDD X15, X14
2406	PXOR  X14, X13
2407	MOVO  X13, X7
2408	PSLLL $0x0c, X7
2409	PSRLL $0x14, X13
2410	PXOR  X7, X13
2411	PADDD X13, X12
2412	PXOR  X12, X15
2413	ROL8(X15, X7)
2414	PADDD X15, X14
2415	PXOR  X14, X13
2416	MOVO  X13, X7
2417	PSLLL $0x07, X7
2418	PSRLL $0x19, X13
2419	PXOR  X7, X13
2420	MOVO  64(BP), X7
2421	IMULQ R12, R8
2422	ADDQ  R10, R15
2423	ADCQ  DX, R8
2424	MOVQ  R13, R10
2425	MOVQ  R14, R11
2426	MOVQ  R15, R12
2427	ANDQ  $0x03, R12
2428	MOVQ  R15, R13
2429	ANDQ  $-4, R13
2430	MOVQ  R8, R14
2431	SHRQ  $0x02, R8, R15
2432	SHRQ  $0x02, R8
2433	ADDQ  R13, R10
2434	ADCQ  R14, R11
2435	ADCQ  $0x00, R12
2436	ADDQ  R15, R10
2437	ADCQ  R8, R11
2438	ADCQ  $0x00, R12
2439	BYTE  $0x66
2440	BYTE  $0x0f
2441	BYTE  $0x3a
2442	BYTE  $0x0f
2443	BYTE  $0xdb
2444	BYTE  $0x0c
2445	BYTE  $0x66
2446	BYTE  $0x0f
2447	BYTE  $0x3a
2448	BYTE  $0x0f
2449	BYTE  $0xe4
2450	BYTE  $0x0c
2451	BYTE  $0x66
2452	BYTE  $0x0f
2453	BYTE  $0x3a
2454	BYTE  $0x0f
2455	BYTE  $0xed
2456	BYTE  $0x0c
2457	BYTE  $0x66
2458	BYTE  $0x45
2459	BYTE  $0x0f
2460	BYTE  $0x3a
2461	BYTE  $0x0f
2462	BYTE  $0xed
2463	BYTE  $0x0c
2464	BYTE  $0x66
2465	BYTE  $0x0f
2466	BYTE  $0x3a
2467	BYTE  $0x0f
2468	BYTE  $0xf6
2469	BYTE  $0x08
2470	BYTE  $0x66
2471	BYTE  $0x0f
2472	BYTE  $0x3a
2473	BYTE  $0x0f
2474	BYTE  $0xff
2475	BYTE  $0x08
2476	BYTE  $0x66
2477	BYTE  $0x45
2478	BYTE  $0x0f
2479	BYTE  $0x3a
2480	BYTE  $0x0f
2481	BYTE  $0xc0
2482	BYTE  $0x08
2483	BYTE  $0x66
2484	BYTE  $0x45
2485	BYTE  $0x0f
2486	BYTE  $0x3a
2487	BYTE  $0x0f
2488	BYTE  $0xf6
2489	BYTE  $0x08
2490	BYTE  $0x66
2491	BYTE  $0x45
2492	BYTE  $0x0f
2493	BYTE  $0x3a
2494	BYTE  $0x0f
2495	BYTE  $0xc9
2496	BYTE  $0x04
2497	BYTE  $0x66
2498	BYTE  $0x45
2499	BYTE  $0x0f
2500	BYTE  $0x3a
2501	BYTE  $0x0f
2502	BYTE  $0xd2
2503	BYTE  $0x04
2504	BYTE  $0x66
2505	BYTE  $0x45
2506	BYTE  $0x0f
2507	BYTE  $0x3a
2508	BYTE  $0x0f
2509	BYTE  $0xdb
2510	BYTE  $0x04
2511	BYTE  $0x66
2512	BYTE  $0x45
2513	BYTE  $0x0f
2514	BYTE  $0x3a
2515	BYTE  $0x0f
2516	BYTE  $0xff
2517	BYTE  $0x04
2518	ADDQ  $0x10, R9
2519	CMPQ  R9, $0xa0
2520	JB    openSSETail256Loop
2521	MOVQ  BX, CX
2522	ANDQ  $-16, CX
2523
2524openSSETail256HashLoop:
2525	ADDQ  (SI)(R9*1), R10
2526	ADCQ  8(SI)(R9*1), R11
2527	ADCQ  $0x01, R12
2528	MOVQ  (BP), AX
2529	MOVQ  AX, R15
2530	MULQ  R10
2531	MOVQ  AX, R13
2532	MOVQ  DX, R14
2533	MOVQ  (BP), AX
2534	MULQ  R11
2535	IMULQ R12, R15
2536	ADDQ  AX, R14
2537	ADCQ  DX, R15
2538	MOVQ  8(BP), AX
2539	MOVQ  AX, R8
2540	MULQ  R10
2541	ADDQ  AX, R14
2542	ADCQ  $0x00, DX
2543	MOVQ  DX, R10
2544	MOVQ  8(BP), AX
2545	MULQ  R11
2546	ADDQ  AX, R15
2547	ADCQ  $0x00, DX
2548	IMULQ R12, R8
2549	ADDQ  R10, R15
2550	ADCQ  DX, R8
2551	MOVQ  R13, R10
2552	MOVQ  R14, R11
2553	MOVQ  R15, R12
2554	ANDQ  $0x03, R12
2555	MOVQ  R15, R13
2556	ANDQ  $-4, R13
2557	MOVQ  R8, R14
2558	SHRQ  $0x02, R8, R15
2559	SHRQ  $0x02, R8
2560	ADDQ  R13, R10
2561	ADCQ  R14, R11
2562	ADCQ  $0x00, R12
2563	ADDQ  R15, R10
2564	ADCQ  R8, R11
2565	ADCQ  $0x00, R12
2566	ADDQ  $0x10, R9
2567	CMPQ  R9, CX
2568	JB    openSSETail256HashLoop
2569
2570	// Add in the state
2571	PADDD chacha20Constants<>+0(SB), X0
2572	PADDD chacha20Constants<>+0(SB), X1
2573	PADDD chacha20Constants<>+0(SB), X2
2574	PADDD chacha20Constants<>+0(SB), X12
2575	PADDD 32(BP), X3
2576	PADDD 32(BP), X4
2577	PADDD 32(BP), X5
2578	PADDD 32(BP), X13
2579	PADDD 48(BP), X6
2580	PADDD 48(BP), X7
2581	PADDD 48(BP), X8
2582	PADDD 48(BP), X14
2583	PADDD 80(BP), X9
2584	PADDD 96(BP), X10
2585	PADDD 112(BP), X11
2586	PADDD 128(BP), X15
2587	MOVO  X15, 64(BP)
2588
2589	// Load - xor - store
2590	MOVOU (SI), X15
2591	PXOR  X15, X0
2592	MOVOU 16(SI), X15
2593	PXOR  X15, X3
2594	MOVOU 32(SI), X15
2595	PXOR  X15, X6
2596	MOVOU 48(SI), X15
2597	PXOR  X15, X9
2598	MOVOU X0, (DI)
2599	MOVOU X3, 16(DI)
2600	MOVOU X6, 32(DI)
2601	MOVOU X9, 48(DI)
2602	MOVOU 64(SI), X0
2603	MOVOU 80(SI), X3
2604	MOVOU 96(SI), X6
2605	MOVOU 112(SI), X9
2606	PXOR  X0, X1
2607	PXOR  X3, X4
2608	PXOR  X6, X7
2609	PXOR  X9, X10
2610	MOVOU X1, 64(DI)
2611	MOVOU X4, 80(DI)
2612	MOVOU X7, 96(DI)
2613	MOVOU X10, 112(DI)
2614	MOVOU 128(SI), X0
2615	MOVOU 144(SI), X3
2616	MOVOU 160(SI), X6
2617	MOVOU 176(SI), X9
2618	PXOR  X0, X2
2619	PXOR  X3, X5
2620	PXOR  X6, X8
2621	PXOR  X9, X11
2622	MOVOU X2, 128(DI)
2623	MOVOU X5, 144(DI)
2624	MOVOU X8, 160(DI)
2625	MOVOU X11, 176(DI)
2626	LEAQ  192(SI), SI
2627	LEAQ  192(DI), DI
2628	SUBQ  $0xc0, BX
2629	MOVO  X12, X0
2630	MOVO  X13, X3
2631	MOVO  X14, X6
2632	MOVO  64(BP), X9
2633	JMP   openSSETail64DecLoop
2634
2635chacha20Poly1305Open_AVX2:
2636	VZEROUPPER
2637	VMOVDQU chacha20Constants<>+0(SB), Y0
2638	BYTE    $0xc4
2639	BYTE    $0x42
2640	BYTE    $0x7d
2641	BYTE    $0x5a
2642	BYTE    $0x70
2643	BYTE    $0x10
2644	BYTE    $0xc4
2645	BYTE    $0x42
2646	BYTE    $0x7d
2647	BYTE    $0x5a
2648	BYTE    $0x60
2649	BYTE    $0x20
2650	BYTE    $0xc4
2651	BYTE    $0xc2
2652	BYTE    $0x7d
2653	BYTE    $0x5a
2654	BYTE    $0x60
2655	BYTE    $0x30
2656	VPADDD  avx2InitMask<>+0(SB), Y4, Y4
2657
2658	// Special optimization, for very short buffers
2659	CMPQ BX, $0xc0
2660	JBE  openAVX2192
2661	CMPQ BX, $0x00000140
2662	JBE  openAVX2320
2663
2664	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
2665	VMOVDQA Y14, 32(BP)
2666	VMOVDQA Y12, 64(BP)
2667	VMOVDQA Y4, 192(BP)
2668	MOVQ    $0x0000000a, R9
2669
2670openAVX2PreparePolyKey:
2671	VPADDD     Y14, Y0, Y0
2672	VPXOR      Y0, Y4, Y4
2673	VPSHUFB    rol16<>+0(SB), Y4, Y4
2674	VPADDD     Y4, Y12, Y12
2675	VPXOR      Y12, Y14, Y14
2676	VPSLLD     $0x0c, Y14, Y3
2677	VPSRLD     $0x14, Y14, Y14
2678	VPXOR      Y3, Y14, Y14
2679	VPADDD     Y14, Y0, Y0
2680	VPXOR      Y0, Y4, Y4
2681	VPSHUFB    rol8<>+0(SB), Y4, Y4
2682	VPADDD     Y4, Y12, Y12
2683	VPXOR      Y12, Y14, Y14
2684	VPSLLD     $0x07, Y14, Y3
2685	VPSRLD     $0x19, Y14, Y14
2686	VPXOR      Y3, Y14, Y14
2687	VPALIGNR   $0x04, Y14, Y14, Y14
2688	VPALIGNR   $0x08, Y12, Y12, Y12
2689	VPALIGNR   $0x0c, Y4, Y4, Y4
2690	VPADDD     Y14, Y0, Y0
2691	VPXOR      Y0, Y4, Y4
2692	VPSHUFB    rol16<>+0(SB), Y4, Y4
2693	VPADDD     Y4, Y12, Y12
2694	VPXOR      Y12, Y14, Y14
2695	VPSLLD     $0x0c, Y14, Y3
2696	VPSRLD     $0x14, Y14, Y14
2697	VPXOR      Y3, Y14, Y14
2698	VPADDD     Y14, Y0, Y0
2699	VPXOR      Y0, Y4, Y4
2700	VPSHUFB    rol8<>+0(SB), Y4, Y4
2701	VPADDD     Y4, Y12, Y12
2702	VPXOR      Y12, Y14, Y14
2703	VPSLLD     $0x07, Y14, Y3
2704	VPSRLD     $0x19, Y14, Y14
2705	VPXOR      Y3, Y14, Y14
2706	VPALIGNR   $0x0c, Y14, Y14, Y14
2707	VPALIGNR   $0x08, Y12, Y12, Y12
2708	VPALIGNR   $0x04, Y4, Y4, Y4
2709	DECQ       R9
2710	JNE        openAVX2PreparePolyKey
2711	VPADDD     chacha20Constants<>+0(SB), Y0, Y0
2712	VPADDD     32(BP), Y14, Y14
2713	VPADDD     64(BP), Y12, Y12
2714	VPADDD     192(BP), Y4, Y4
2715	VPERM2I128 $0x02, Y0, Y14, Y3
2716
2717	// Clamp and store poly key
2718	VPAND   polyClampMask<>+0(SB), Y3, Y3
2719	VMOVDQA Y3, (BP)
2720
2721	// Stream for the first 64 bytes
2722	VPERM2I128 $0x13, Y0, Y14, Y0
2723	VPERM2I128 $0x13, Y12, Y4, Y14
2724
2725	// Hash AD + first 64 bytes
2726	MOVQ ad_len+80(FP), R9
2727	CALL polyHashADInternal<>(SB)
2728	XORQ CX, CX
2729
2730openAVX2InitialHash64:
2731	ADDQ  (SI)(CX*1), R10
2732	ADCQ  8(SI)(CX*1), R11
2733	ADCQ  $0x01, R12
2734	MOVQ  (BP), DX
2735	MOVQ  DX, R15
2736	MULXQ R10, R13, R14
2737	IMULQ R12, R15
2738	MULXQ R11, AX, DX
2739	ADDQ  AX, R14
2740	ADCQ  DX, R15
2741	MOVQ  8(BP), DX
2742	MULXQ R10, R10, AX
2743	ADDQ  R10, R14
2744	MULXQ R11, R11, R8
2745	ADCQ  R11, R15
2746	ADCQ  $0x00, R8
2747	IMULQ R12, DX
2748	ADDQ  AX, R15
2749	ADCQ  DX, R8
2750	MOVQ  R13, R10
2751	MOVQ  R14, R11
2752	MOVQ  R15, R12
2753	ANDQ  $0x03, R12
2754	MOVQ  R15, R13
2755	ANDQ  $-4, R13
2756	MOVQ  R8, R14
2757	SHRQ  $0x02, R8, R15
2758	SHRQ  $0x02, R8
2759	ADDQ  R13, R10
2760	ADCQ  R14, R11
2761	ADCQ  $0x00, R12
2762	ADDQ  R15, R10
2763	ADCQ  R8, R11
2764	ADCQ  $0x00, R12
2765	ADDQ  $0x10, CX
2766	CMPQ  CX, $0x40
2767	JNE   openAVX2InitialHash64
2768
2769	// Decrypt the first 64 bytes
2770	VPXOR   (SI), Y0, Y0
2771	VPXOR   32(SI), Y14, Y14
2772	VMOVDQU Y0, (DI)
2773	VMOVDQU Y14, 32(DI)
2774	LEAQ    64(SI), SI
2775	LEAQ    64(DI), DI
2776	SUBQ    $0x40, BX
2777
2778openAVX2MainLoop:
2779	CMPQ BX, $0x00000200
2780	JB   openAVX2MainLoopDone
2781
2782	// Load state, increment counter blocks, store the incremented counters
2783	VMOVDQU chacha20Constants<>+0(SB), Y0
2784	VMOVDQA Y0, Y5
2785	VMOVDQA Y0, Y6
2786	VMOVDQA Y0, Y7
2787	VMOVDQA 32(BP), Y14
2788	VMOVDQA Y14, Y9
2789	VMOVDQA Y14, Y10
2790	VMOVDQA Y14, Y11
2791	VMOVDQA 64(BP), Y12
2792	VMOVDQA Y12, Y13
2793	VMOVDQA Y12, Y8
2794	VMOVDQA Y12, Y15
2795	VMOVDQA 192(BP), Y4
2796	VPADDD  avx2IncMask<>+0(SB), Y4, Y4
2797	VPADDD  avx2IncMask<>+0(SB), Y4, Y1
2798	VPADDD  avx2IncMask<>+0(SB), Y1, Y2
2799	VPADDD  avx2IncMask<>+0(SB), Y2, Y3
2800	VMOVDQA Y4, 96(BP)
2801	VMOVDQA Y1, 128(BP)
2802	VMOVDQA Y2, 160(BP)
2803	VMOVDQA Y3, 192(BP)
2804	XORQ    CX, CX
2805
2806openAVX2InternalLoop:
2807	ADDQ     (SI)(CX*1), R10
2808	ADCQ     8(SI)(CX*1), R11
2809	ADCQ     $0x01, R12
2810	VPADDD   Y14, Y0, Y0
2811	VPADDD   Y9, Y5, Y5
2812	VPADDD   Y10, Y6, Y6
2813	VPADDD   Y11, Y7, Y7
2814	MOVQ     (BP), DX
2815	MOVQ     DX, R15
2816	MULXQ    R10, R13, R14
2817	IMULQ    R12, R15
2818	MULXQ    R11, AX, DX
2819	ADDQ     AX, R14
2820	ADCQ     DX, R15
2821	VPXOR    Y0, Y4, Y4
2822	VPXOR    Y5, Y1, Y1
2823	VPXOR    Y6, Y2, Y2
2824	VPXOR    Y7, Y3, Y3
2825	VPSHUFB  rol16<>+0(SB), Y4, Y4
2826	VPSHUFB  rol16<>+0(SB), Y1, Y1
2827	VPSHUFB  rol16<>+0(SB), Y2, Y2
2828	VPSHUFB  rol16<>+0(SB), Y3, Y3
2829	MOVQ     8(BP), DX
2830	MULXQ    R10, R10, AX
2831	ADDQ     R10, R14
2832	MULXQ    R11, R11, R8
2833	ADCQ     R11, R15
2834	ADCQ     $0x00, R8
2835	VPADDD   Y4, Y12, Y12
2836	VPADDD   Y1, Y13, Y13
2837	VPADDD   Y2, Y8, Y8
2838	VPADDD   Y3, Y15, Y15
2839	VPXOR    Y12, Y14, Y14
2840	VPXOR    Y13, Y9, Y9
2841	VPXOR    Y8, Y10, Y10
2842	VPXOR    Y15, Y11, Y11
2843	IMULQ    R12, DX
2844	ADDQ     AX, R15
2845	ADCQ     DX, R8
2846	VMOVDQA  Y15, 224(BP)
2847	VPSLLD   $0x0c, Y14, Y15
2848	VPSRLD   $0x14, Y14, Y14
2849	VPXOR    Y15, Y14, Y14
2850	VPSLLD   $0x0c, Y9, Y15
2851	VPSRLD   $0x14, Y9, Y9
2852	VPXOR    Y15, Y9, Y9
2853	VPSLLD   $0x0c, Y10, Y15
2854	VPSRLD   $0x14, Y10, Y10
2855	VPXOR    Y15, Y10, Y10
2856	VPSLLD   $0x0c, Y11, Y15
2857	VPSRLD   $0x14, Y11, Y11
2858	VPXOR    Y15, Y11, Y11
2859	VMOVDQA  224(BP), Y15
2860	MOVQ     R13, R10
2861	MOVQ     R14, R11
2862	MOVQ     R15, R12
2863	ANDQ     $0x03, R12
2864	MOVQ     R15, R13
2865	ANDQ     $-4, R13
2866	MOVQ     R8, R14
2867	SHRQ     $0x02, R8, R15
2868	SHRQ     $0x02, R8
2869	ADDQ     R13, R10
2870	ADCQ     R14, R11
2871	ADCQ     $0x00, R12
2872	ADDQ     R15, R10
2873	ADCQ     R8, R11
2874	ADCQ     $0x00, R12
2875	VPADDD   Y14, Y0, Y0
2876	VPADDD   Y9, Y5, Y5
2877	VPADDD   Y10, Y6, Y6
2878	VPADDD   Y11, Y7, Y7
2879	VPXOR    Y0, Y4, Y4
2880	VPXOR    Y5, Y1, Y1
2881	VPXOR    Y6, Y2, Y2
2882	VPXOR    Y7, Y3, Y3
2883	VPSHUFB  rol8<>+0(SB), Y4, Y4
2884	VPSHUFB  rol8<>+0(SB), Y1, Y1
2885	VPSHUFB  rol8<>+0(SB), Y2, Y2
2886	VPSHUFB  rol8<>+0(SB), Y3, Y3
2887	ADDQ     16(SI)(CX*1), R10
2888	ADCQ     24(SI)(CX*1), R11
2889	ADCQ     $0x01, R12
2890	VPADDD   Y4, Y12, Y12
2891	VPADDD   Y1, Y13, Y13
2892	VPADDD   Y2, Y8, Y8
2893	VPADDD   Y3, Y15, Y15
2894	MOVQ     (BP), DX
2895	MOVQ     DX, R15
2896	MULXQ    R10, R13, R14
2897	IMULQ    R12, R15
2898	MULXQ    R11, AX, DX
2899	ADDQ     AX, R14
2900	ADCQ     DX, R15
2901	VPXOR    Y12, Y14, Y14
2902	VPXOR    Y13, Y9, Y9
2903	VPXOR    Y8, Y10, Y10
2904	VPXOR    Y15, Y11, Y11
2905	VMOVDQA  Y15, 224(BP)
2906	VPSLLD   $0x07, Y14, Y15
2907	VPSRLD   $0x19, Y14, Y14
2908	VPXOR    Y15, Y14, Y14
2909	VPSLLD   $0x07, Y9, Y15
2910	VPSRLD   $0x19, Y9, Y9
2911	VPXOR    Y15, Y9, Y9
2912	VPSLLD   $0x07, Y10, Y15
2913	VPSRLD   $0x19, Y10, Y10
2914	VPXOR    Y15, Y10, Y10
2915	VPSLLD   $0x07, Y11, Y15
2916	VPSRLD   $0x19, Y11, Y11
2917	VPXOR    Y15, Y11, Y11
2918	VMOVDQA  224(BP), Y15
2919	MOVQ     8(BP), DX
2920	MULXQ    R10, R10, AX
2921	ADDQ     R10, R14
2922	MULXQ    R11, R11, R8
2923	ADCQ     R11, R15
2924	ADCQ     $0x00, R8
2925	VPALIGNR $0x04, Y14, Y14, Y14
2926	VPALIGNR $0x04, Y9, Y9, Y9
2927	VPALIGNR $0x04, Y10, Y10, Y10
2928	VPALIGNR $0x04, Y11, Y11, Y11
2929	VPALIGNR $0x08, Y12, Y12, Y12
2930	VPALIGNR $0x08, Y13, Y13, Y13
2931	VPALIGNR $0x08, Y8, Y8, Y8
2932	VPALIGNR $0x08, Y15, Y15, Y15
2933	VPALIGNR $0x0c, Y4, Y4, Y4
2934	VPALIGNR $0x0c, Y1, Y1, Y1
2935	VPALIGNR $0x0c, Y2, Y2, Y2
2936	VPALIGNR $0x0c, Y3, Y3, Y3
2937	VPADDD   Y14, Y0, Y0
2938	VPADDD   Y9, Y5, Y5
2939	VPADDD   Y10, Y6, Y6
2940	VPADDD   Y11, Y7, Y7
2941	IMULQ    R12, DX
2942	ADDQ     AX, R15
2943	ADCQ     DX, R8
2944	VPXOR    Y0, Y4, Y4
2945	VPXOR    Y5, Y1, Y1
2946	VPXOR    Y6, Y2, Y2
2947	VPXOR    Y7, Y3, Y3
2948	VPSHUFB  rol16<>+0(SB), Y4, Y4
2949	VPSHUFB  rol16<>+0(SB), Y1, Y1
2950	VPSHUFB  rol16<>+0(SB), Y2, Y2
2951	VPSHUFB  rol16<>+0(SB), Y3, Y3
2952	MOVQ     R13, R10
2953	MOVQ     R14, R11
2954	MOVQ     R15, R12
2955	ANDQ     $0x03, R12
2956	MOVQ     R15, R13
2957	ANDQ     $-4, R13
2958	MOVQ     R8, R14
2959	SHRQ     $0x02, R8, R15
2960	SHRQ     $0x02, R8
2961	ADDQ     R13, R10
2962	ADCQ     R14, R11
2963	ADCQ     $0x00, R12
2964	ADDQ     R15, R10
2965	ADCQ     R8, R11
2966	ADCQ     $0x00, R12
2967	VPADDD   Y4, Y12, Y12
2968	VPADDD   Y1, Y13, Y13
2969	VPADDD   Y2, Y8, Y8
2970	VPADDD   Y3, Y15, Y15
2971	VPXOR    Y12, Y14, Y14
2972	VPXOR    Y13, Y9, Y9
2973	VPXOR    Y8, Y10, Y10
2974	VPXOR    Y15, Y11, Y11
2975	ADDQ     32(SI)(CX*1), R10
2976	ADCQ     40(SI)(CX*1), R11
2977	ADCQ     $0x01, R12
2978	LEAQ     48(CX), CX
2979	VMOVDQA  Y15, 224(BP)
2980	VPSLLD   $0x0c, Y14, Y15
2981	VPSRLD   $0x14, Y14, Y14
2982	VPXOR    Y15, Y14, Y14
2983	VPSLLD   $0x0c, Y9, Y15
2984	VPSRLD   $0x14, Y9, Y9
2985	VPXOR    Y15, Y9, Y9
2986	VPSLLD   $0x0c, Y10, Y15
2987	VPSRLD   $0x14, Y10, Y10
2988	VPXOR    Y15, Y10, Y10
2989	VPSLLD   $0x0c, Y11, Y15
2990	VPSRLD   $0x14, Y11, Y11
2991	VPXOR    Y15, Y11, Y11
2992	VMOVDQA  224(BP), Y15
2993	MOVQ     (BP), DX
2994	MOVQ     DX, R15
2995	MULXQ    R10, R13, R14
2996	IMULQ    R12, R15
2997	MULXQ    R11, AX, DX
2998	ADDQ     AX, R14
2999	ADCQ     DX, R15
3000	VPADDD   Y14, Y0, Y0
3001	VPADDD   Y9, Y5, Y5
3002	VPADDD   Y10, Y6, Y6
3003	VPADDD   Y11, Y7, Y7
3004	VPXOR    Y0, Y4, Y4
3005	VPXOR    Y5, Y1, Y1
3006	VPXOR    Y6, Y2, Y2
3007	VPXOR    Y7, Y3, Y3
3008	MOVQ     8(BP), DX
3009	MULXQ    R10, R10, AX
3010	ADDQ     R10, R14
3011	MULXQ    R11, R11, R8
3012	ADCQ     R11, R15
3013	ADCQ     $0x00, R8
3014	VPSHUFB  rol8<>+0(SB), Y4, Y4
3015	VPSHUFB  rol8<>+0(SB), Y1, Y1
3016	VPSHUFB  rol8<>+0(SB), Y2, Y2
3017	VPSHUFB  rol8<>+0(SB), Y3, Y3
3018	VPADDD   Y4, Y12, Y12
3019	VPADDD   Y1, Y13, Y13
3020	VPADDD   Y2, Y8, Y8
3021	VPADDD   Y3, Y15, Y15
3022	IMULQ    R12, DX
3023	ADDQ     AX, R15
3024	ADCQ     DX, R8
3025	VPXOR    Y12, Y14, Y14
3026	VPXOR    Y13, Y9, Y9
3027	VPXOR    Y8, Y10, Y10
3028	VPXOR    Y15, Y11, Y11
3029	VMOVDQA  Y15, 224(BP)
3030	VPSLLD   $0x07, Y14, Y15
3031	VPSRLD   $0x19, Y14, Y14
3032	VPXOR    Y15, Y14, Y14
3033	VPSLLD   $0x07, Y9, Y15
3034	VPSRLD   $0x19, Y9, Y9
3035	VPXOR    Y15, Y9, Y9
3036	VPSLLD   $0x07, Y10, Y15
3037	VPSRLD   $0x19, Y10, Y10
3038	VPXOR    Y15, Y10, Y10
3039	VPSLLD   $0x07, Y11, Y15
3040	VPSRLD   $0x19, Y11, Y11
3041	VPXOR    Y15, Y11, Y11
3042	VMOVDQA  224(BP), Y15
3043	MOVQ     R13, R10
3044	MOVQ     R14, R11
3045	MOVQ     R15, R12
3046	ANDQ     $0x03, R12
3047	MOVQ     R15, R13
3048	ANDQ     $-4, R13
3049	MOVQ     R8, R14
3050	SHRQ     $0x02, R8, R15
3051	SHRQ     $0x02, R8
3052	ADDQ     R13, R10
3053	ADCQ     R14, R11
3054	ADCQ     $0x00, R12
3055	ADDQ     R15, R10
3056	ADCQ     R8, R11
3057	ADCQ     $0x00, R12
3058	VPALIGNR $0x0c, Y14, Y14, Y14
3059	VPALIGNR $0x0c, Y9, Y9, Y9
3060	VPALIGNR $0x0c, Y10, Y10, Y10
3061	VPALIGNR $0x0c, Y11, Y11, Y11
3062	VPALIGNR $0x08, Y12, Y12, Y12
3063	VPALIGNR $0x08, Y13, Y13, Y13
3064	VPALIGNR $0x08, Y8, Y8, Y8
3065	VPALIGNR $0x08, Y15, Y15, Y15
3066	VPALIGNR $0x04, Y4, Y4, Y4
3067	VPALIGNR $0x04, Y1, Y1, Y1
3068	VPALIGNR $0x04, Y2, Y2, Y2
3069	VPALIGNR $0x04, Y3, Y3, Y3
3070	CMPQ     CX, $0x000001e0
3071	JNE      openAVX2InternalLoop
3072	VPADDD   chacha20Constants<>+0(SB), Y0, Y0
3073	VPADDD   chacha20Constants<>+0(SB), Y5, Y5
3074	VPADDD   chacha20Constants<>+0(SB), Y6, Y6
3075	VPADDD   chacha20Constants<>+0(SB), Y7, Y7
3076	VPADDD   32(BP), Y14, Y14
3077	VPADDD   32(BP), Y9, Y9
3078	VPADDD   32(BP), Y10, Y10
3079	VPADDD   32(BP), Y11, Y11
3080	VPADDD   64(BP), Y12, Y12
3081	VPADDD   64(BP), Y13, Y13
3082	VPADDD   64(BP), Y8, Y8
3083	VPADDD   64(BP), Y15, Y15
3084	VPADDD   96(BP), Y4, Y4
3085	VPADDD   128(BP), Y1, Y1
3086	VPADDD   160(BP), Y2, Y2
3087	VPADDD   192(BP), Y3, Y3
3088	VMOVDQA  Y15, 224(BP)
3089
3090	// We only hashed 480 of the 512 bytes available - hash 16 of the remaining 32 bytes (480..495) here
3091	ADDQ       480(SI), R10
3092	ADCQ       488(SI), R11
3093	ADCQ       $0x01, R12
3094	MOVQ       (BP), DX
3095	MOVQ       DX, R15
3096	MULXQ      R10, R13, R14
3097	IMULQ      R12, R15
3098	MULXQ      R11, AX, DX
3099	ADDQ       AX, R14
3100	ADCQ       DX, R15
3101	MOVQ       8(BP), DX
3102	MULXQ      R10, R10, AX
3103	ADDQ       R10, R14
3104	MULXQ      R11, R11, R8
3105	ADCQ       R11, R15
3106	ADCQ       $0x00, R8
3107	IMULQ      R12, DX
3108	ADDQ       AX, R15
3109	ADCQ       DX, R8
3110	MOVQ       R13, R10
3111	MOVQ       R14, R11
3112	MOVQ       R15, R12
3113	ANDQ       $0x03, R12
3114	MOVQ       R15, R13
3115	ANDQ       $-4, R13
3116	MOVQ       R8, R14
3117	SHRQ       $0x02, R8, R15
3118	SHRQ       $0x02, R8
3119	ADDQ       R13, R10
3120	ADCQ       R14, R11
3121	ADCQ       $0x00, R12
3122	ADDQ       R15, R10
3123	ADCQ       R8, R11
3124	ADCQ       $0x00, R12
3125	VPERM2I128 $0x02, Y0, Y14, Y15
3126	VPERM2I128 $0x13, Y0, Y14, Y14
3127	VPERM2I128 $0x02, Y12, Y4, Y0
3128	VPERM2I128 $0x13, Y12, Y4, Y12
3129	VPXOR      (SI), Y15, Y15
3130	VPXOR      32(SI), Y0, Y0
3131	VPXOR      64(SI), Y14, Y14
3132	VPXOR      96(SI), Y12, Y12
3133	VMOVDQU    Y15, (DI)
3134	VMOVDQU    Y0, 32(DI)
3135	VMOVDQU    Y14, 64(DI)
3136	VMOVDQU    Y12, 96(DI)
3137	VPERM2I128 $0x02, Y5, Y9, Y0
3138	VPERM2I128 $0x02, Y13, Y1, Y14
3139	VPERM2I128 $0x13, Y5, Y9, Y12
3140	VPERM2I128 $0x13, Y13, Y1, Y4
3141	VPXOR      128(SI), Y0, Y0
3142	VPXOR      160(SI), Y14, Y14
3143	VPXOR      192(SI), Y12, Y12
3144	VPXOR      224(SI), Y4, Y4
3145	VMOVDQU    Y0, 128(DI)
3146	VMOVDQU    Y14, 160(DI)
3147	VMOVDQU    Y12, 192(DI)
3148	VMOVDQU    Y4, 224(DI)
3149
3150	// ... and hash the final 16 bytes (496..511) of this 512-byte block here
3151	ADDQ       496(SI), R10
3152	ADCQ       504(SI), R11
3153	ADCQ       $0x01, R12
3154	MOVQ       (BP), DX
3155	MOVQ       DX, R15
3156	MULXQ      R10, R13, R14
3157	IMULQ      R12, R15
3158	MULXQ      R11, AX, DX
3159	ADDQ       AX, R14
3160	ADCQ       DX, R15
3161	MOVQ       8(BP), DX
3162	MULXQ      R10, R10, AX
3163	ADDQ       R10, R14
3164	MULXQ      R11, R11, R8
3165	ADCQ       R11, R15
3166	ADCQ       $0x00, R8
3167	IMULQ      R12, DX
3168	ADDQ       AX, R15
3169	ADCQ       DX, R8
3170	MOVQ       R13, R10
3171	MOVQ       R14, R11
3172	MOVQ       R15, R12
3173	ANDQ       $0x03, R12
3174	MOVQ       R15, R13
3175	ANDQ       $-4, R13
3176	MOVQ       R8, R14
3177	SHRQ       $0x02, R8, R15
3178	SHRQ       $0x02, R8
3179	ADDQ       R13, R10
3180	ADCQ       R14, R11
3181	ADCQ       $0x00, R12
3182	ADDQ       R15, R10
3183	ADCQ       R8, R11
3184	ADCQ       $0x00, R12
3185	VPERM2I128 $0x02, Y6, Y10, Y0
3186	VPERM2I128 $0x02, Y8, Y2, Y14
3187	VPERM2I128 $0x13, Y6, Y10, Y12
3188	VPERM2I128 $0x13, Y8, Y2, Y4
3189	VPXOR      256(SI), Y0, Y0
3190	VPXOR      288(SI), Y14, Y14
3191	VPXOR      320(SI), Y12, Y12
3192	VPXOR      352(SI), Y4, Y4
3193	VMOVDQU    Y0, 256(DI)
3194	VMOVDQU    Y14, 288(DI)
3195	VMOVDQU    Y12, 320(DI)
3196	VMOVDQU    Y4, 352(DI)
3197	VPERM2I128 $0x02, Y7, Y11, Y0
3198	VPERM2I128 $0x02, 224(BP), Y3, Y14
3199	VPERM2I128 $0x13, Y7, Y11, Y12
3200	VPERM2I128 $0x13, 224(BP), Y3, Y4
3201	VPXOR      384(SI), Y0, Y0
3202	VPXOR      416(SI), Y14, Y14
3203	VPXOR      448(SI), Y12, Y12
3204	VPXOR      480(SI), Y4, Y4
3205	VMOVDQU    Y0, 384(DI)
3206	VMOVDQU    Y14, 416(DI)
3207	VMOVDQU    Y12, 448(DI)
3208	VMOVDQU    Y4, 480(DI)
3209	LEAQ       512(SI), SI
3210	LEAQ       512(DI), DI
3211	SUBQ       $0x00000200, BX
3212	JMP        openAVX2MainLoop
3213
3214openAVX2MainLoopDone:
3215	// Handle the various tail sizes efficiently
3216	TESTQ BX, BX
3217	JE    openSSEFinalize
3218	CMPQ  BX, $0x80
3219	JBE   openAVX2Tail128
3220	CMPQ  BX, $0x00000100
3221	JBE   openAVX2Tail256
3222	CMPQ  BX, $0x00000180
3223	JBE   openAVX2Tail384
3224	JMP   openAVX2Tail512
3225
3226openAVX2192:
3227	VMOVDQA Y0, Y5
3228	VMOVDQA Y14, Y9
3229	VMOVDQA Y12, Y13
3230	VPADDD  avx2IncMask<>+0(SB), Y4, Y1
3231	VMOVDQA Y0, Y6
3232	VMOVDQA Y14, Y10
3233	VMOVDQA Y12, Y8
3234	VMOVDQA Y4, Y2
3235	VMOVDQA Y1, Y15
3236	MOVQ    $0x0000000a, R9
3237
3238openAVX2192InnerCipherLoop:
3239	VPADDD     Y14, Y0, Y0
3240	VPXOR      Y0, Y4, Y4
3241	VPSHUFB    rol16<>+0(SB), Y4, Y4
3242	VPADDD     Y4, Y12, Y12
3243	VPXOR      Y12, Y14, Y14
3244	VPSLLD     $0x0c, Y14, Y3
3245	VPSRLD     $0x14, Y14, Y14
3246	VPXOR      Y3, Y14, Y14
3247	VPADDD     Y14, Y0, Y0
3248	VPXOR      Y0, Y4, Y4
3249	VPSHUFB    rol8<>+0(SB), Y4, Y4
3250	VPADDD     Y4, Y12, Y12
3251	VPXOR      Y12, Y14, Y14
3252	VPSLLD     $0x07, Y14, Y3
3253	VPSRLD     $0x19, Y14, Y14
3254	VPXOR      Y3, Y14, Y14
3255	VPADDD     Y9, Y5, Y5
3256	VPXOR      Y5, Y1, Y1
3257	VPSHUFB    rol16<>+0(SB), Y1, Y1
3258	VPADDD     Y1, Y13, Y13
3259	VPXOR      Y13, Y9, Y9
3260	VPSLLD     $0x0c, Y9, Y3
3261	VPSRLD     $0x14, Y9, Y9
3262	VPXOR      Y3, Y9, Y9
3263	VPADDD     Y9, Y5, Y5
3264	VPXOR      Y5, Y1, Y1
3265	VPSHUFB    rol8<>+0(SB), Y1, Y1
3266	VPADDD     Y1, Y13, Y13
3267	VPXOR      Y13, Y9, Y9
3268	VPSLLD     $0x07, Y9, Y3
3269	VPSRLD     $0x19, Y9, Y9
3270	VPXOR      Y3, Y9, Y9
3271	VPALIGNR   $0x04, Y14, Y14, Y14
3272	VPALIGNR   $0x04, Y9, Y9, Y9
3273	VPALIGNR   $0x08, Y12, Y12, Y12
3274	VPALIGNR   $0x08, Y13, Y13, Y13
3275	VPALIGNR   $0x0c, Y4, Y4, Y4
3276	VPALIGNR   $0x0c, Y1, Y1, Y1
3277	VPADDD     Y14, Y0, Y0
3278	VPXOR      Y0, Y4, Y4
3279	VPSHUFB    rol16<>+0(SB), Y4, Y4
3280	VPADDD     Y4, Y12, Y12
3281	VPXOR      Y12, Y14, Y14
3282	VPSLLD     $0x0c, Y14, Y3
3283	VPSRLD     $0x14, Y14, Y14
3284	VPXOR      Y3, Y14, Y14
3285	VPADDD     Y14, Y0, Y0
3286	VPXOR      Y0, Y4, Y4
3287	VPSHUFB    rol8<>+0(SB), Y4, Y4
3288	VPADDD     Y4, Y12, Y12
3289	VPXOR      Y12, Y14, Y14
3290	VPSLLD     $0x07, Y14, Y3
3291	VPSRLD     $0x19, Y14, Y14
3292	VPXOR      Y3, Y14, Y14
3293	VPADDD     Y9, Y5, Y5
3294	VPXOR      Y5, Y1, Y1
3295	VPSHUFB    rol16<>+0(SB), Y1, Y1
3296	VPADDD     Y1, Y13, Y13
3297	VPXOR      Y13, Y9, Y9
3298	VPSLLD     $0x0c, Y9, Y3
3299	VPSRLD     $0x14, Y9, Y9
3300	VPXOR      Y3, Y9, Y9
3301	VPADDD     Y9, Y5, Y5
3302	VPXOR      Y5, Y1, Y1
3303	VPSHUFB    rol8<>+0(SB), Y1, Y1
3304	VPADDD     Y1, Y13, Y13
3305	VPXOR      Y13, Y9, Y9
3306	VPSLLD     $0x07, Y9, Y3
3307	VPSRLD     $0x19, Y9, Y9
3308	VPXOR      Y3, Y9, Y9
3309	VPALIGNR   $0x0c, Y14, Y14, Y14
3310	VPALIGNR   $0x0c, Y9, Y9, Y9
3311	VPALIGNR   $0x08, Y12, Y12, Y12
3312	VPALIGNR   $0x08, Y13, Y13, Y13
3313	VPALIGNR   $0x04, Y4, Y4, Y4
3314	VPALIGNR   $0x04, Y1, Y1, Y1
3315	DECQ       R9
3316	JNE        openAVX2192InnerCipherLoop
3317	VPADDD     Y6, Y0, Y0
3318	VPADDD     Y6, Y5, Y5
3319	VPADDD     Y10, Y14, Y14
3320	VPADDD     Y10, Y9, Y9
3321	VPADDD     Y8, Y12, Y12
3322	VPADDD     Y8, Y13, Y13
3323	VPADDD     Y2, Y4, Y4
3324	VPADDD     Y15, Y1, Y1
3325	VPERM2I128 $0x02, Y0, Y14, Y3
3326
3327	// Clamp and store poly key
3328	VPAND   polyClampMask<>+0(SB), Y3, Y3
3329	VMOVDQA Y3, (BP)
3330
3331	// Stream for up to 192 bytes
3332	VPERM2I128 $0x13, Y0, Y14, Y0
3333	VPERM2I128 $0x13, Y12, Y4, Y14
3334	VPERM2I128 $0x02, Y5, Y9, Y12
3335	VPERM2I128 $0x02, Y13, Y1, Y4
3336	VPERM2I128 $0x13, Y5, Y9, Y5
3337	VPERM2I128 $0x13, Y13, Y1, Y9
3338
3339openAVX2ShortOpen:
3340	// Hash
3341	MOVQ ad_len+80(FP), R9
3342	CALL polyHashADInternal<>(SB)
3343
3344openAVX2ShortOpenLoop:
3345	CMPQ BX, $0x20
3346	JB   openAVX2ShortTail32
3347	SUBQ $0x20, BX
3348
3349	// Load for hashing
3350	ADDQ  (SI), R10
3351	ADCQ  8(SI), R11
3352	ADCQ  $0x01, R12
3353	MOVQ  (BP), DX
3354	MOVQ  DX, R15
3355	MULXQ R10, R13, R14
3356	IMULQ R12, R15
3357	MULXQ R11, AX, DX
3358	ADDQ  AX, R14
3359	ADCQ  DX, R15
3360	MOVQ  8(BP), DX
3361	MULXQ R10, R10, AX
3362	ADDQ  R10, R14
3363	MULXQ R11, R11, R8
3364	ADCQ  R11, R15
3365	ADCQ  $0x00, R8
3366	IMULQ R12, DX
3367	ADDQ  AX, R15
3368	ADCQ  DX, R8
3369	MOVQ  R13, R10
3370	MOVQ  R14, R11
3371	MOVQ  R15, R12
3372	ANDQ  $0x03, R12
3373	MOVQ  R15, R13
3374	ANDQ  $-4, R13
3375	MOVQ  R8, R14
3376	SHRQ  $0x02, R8, R15
3377	SHRQ  $0x02, R8
3378	ADDQ  R13, R10
3379	ADCQ  R14, R11
3380	ADCQ  $0x00, R12
3381	ADDQ  R15, R10
3382	ADCQ  R8, R11
3383	ADCQ  $0x00, R12
3384	ADDQ  16(SI), R10
3385	ADCQ  24(SI), R11
3386	ADCQ  $0x01, R12
3387	MOVQ  (BP), DX
3388	MOVQ  DX, R15
3389	MULXQ R10, R13, R14
3390	IMULQ R12, R15
3391	MULXQ R11, AX, DX
3392	ADDQ  AX, R14
3393	ADCQ  DX, R15
3394	MOVQ  8(BP), DX
3395	MULXQ R10, R10, AX
3396	ADDQ  R10, R14
3397	MULXQ R11, R11, R8
3398	ADCQ  R11, R15
3399	ADCQ  $0x00, R8
3400	IMULQ R12, DX
3401	ADDQ  AX, R15
3402	ADCQ  DX, R8
3403	MOVQ  R13, R10
3404	MOVQ  R14, R11
3405	MOVQ  R15, R12
3406	ANDQ  $0x03, R12
3407	MOVQ  R15, R13
3408	ANDQ  $-4, R13
3409	MOVQ  R8, R14
3410	SHRQ  $0x02, R8, R15
3411	SHRQ  $0x02, R8
3412	ADDQ  R13, R10
3413	ADCQ  R14, R11
3414	ADCQ  $0x00, R12
3415	ADDQ  R15, R10
3416	ADCQ  R8, R11
3417	ADCQ  $0x00, R12
3418
3419	// Load for decryption
3420	VPXOR   (SI), Y0, Y0
3421	VMOVDQU Y0, (DI)
3422	LEAQ    32(SI), SI
3423	LEAQ    32(DI), DI
3424
3425	// Shift stream left
3426	VMOVDQA Y14, Y0
3427	VMOVDQA Y12, Y14
3428	VMOVDQA Y4, Y12
3429	VMOVDQA Y5, Y4
3430	VMOVDQA Y9, Y5
3431	VMOVDQA Y13, Y9
3432	VMOVDQA Y1, Y13
3433	VMOVDQA Y6, Y1
3434	VMOVDQA Y10, Y6
3435	JMP     openAVX2ShortOpenLoop
3436
3437openAVX2ShortTail32:
3438	CMPQ    BX, $0x10
3439	VMOVDQA X0, X1
3440	JB      openAVX2ShortDone
3441	SUBQ    $0x10, BX
3442
3443	// Load for hashing
3444	ADDQ  (SI), R10
3445	ADCQ  8(SI), R11
3446	ADCQ  $0x01, R12
3447	MOVQ  (BP), DX
3448	MOVQ  DX, R15
3449	MULXQ R10, R13, R14
3450	IMULQ R12, R15
3451	MULXQ R11, AX, DX
3452	ADDQ  AX, R14
3453	ADCQ  DX, R15
3454	MOVQ  8(BP), DX
3455	MULXQ R10, R10, AX
3456	ADDQ  R10, R14
3457	MULXQ R11, R11, R8
3458	ADCQ  R11, R15
3459	ADCQ  $0x00, R8
3460	IMULQ R12, DX
3461	ADDQ  AX, R15
3462	ADCQ  DX, R8
3463	MOVQ  R13, R10
3464	MOVQ  R14, R11
3465	MOVQ  R15, R12
3466	ANDQ  $0x03, R12
3467	MOVQ  R15, R13
3468	ANDQ  $-4, R13
3469	MOVQ  R8, R14
3470	SHRQ  $0x02, R8, R15
3471	SHRQ  $0x02, R8
3472	ADDQ  R13, R10
3473	ADCQ  R14, R11
3474	ADCQ  $0x00, R12
3475	ADDQ  R15, R10
3476	ADCQ  R8, R11
3477	ADCQ  $0x00, R12
3478
3479	// Load for decryption
3480	VPXOR      (SI), X0, X12
3481	VMOVDQU    X12, (DI)
3482	LEAQ       16(SI), SI
3483	LEAQ       16(DI), DI
3484	VPERM2I128 $0x11, Y0, Y0, Y0
3485	VMOVDQA    X0, X1
3486
3487openAVX2ShortDone:
3488	VZEROUPPER
3489	JMP openSSETail16
3490
3491openAVX2320:
3492	VMOVDQA Y0, Y5
3493	VMOVDQA Y14, Y9
3494	VMOVDQA Y12, Y13
3495	VPADDD  avx2IncMask<>+0(SB), Y4, Y1
3496	VMOVDQA Y0, Y6
3497	VMOVDQA Y14, Y10
3498	VMOVDQA Y12, Y8
3499	VPADDD  avx2IncMask<>+0(SB), Y1, Y2
3500	VMOVDQA Y14, Y7
3501	VMOVDQA Y12, Y11
3502	VMOVDQA Y4, Y15
3503	MOVQ    $0x0000000a, R9
3504
3505openAVX2320InnerCipherLoop:
3506	VPADDD   Y14, Y0, Y0
3507	VPXOR    Y0, Y4, Y4
3508	VPSHUFB  rol16<>+0(SB), Y4, Y4
3509	VPADDD   Y4, Y12, Y12
3510	VPXOR    Y12, Y14, Y14
3511	VPSLLD   $0x0c, Y14, Y3
3512	VPSRLD   $0x14, Y14, Y14
3513	VPXOR    Y3, Y14, Y14
3514	VPADDD   Y14, Y0, Y0
3515	VPXOR    Y0, Y4, Y4
3516	VPSHUFB  rol8<>+0(SB), Y4, Y4
3517	VPADDD   Y4, Y12, Y12
3518	VPXOR    Y12, Y14, Y14
3519	VPSLLD   $0x07, Y14, Y3
3520	VPSRLD   $0x19, Y14, Y14
3521	VPXOR    Y3, Y14, Y14
3522	VPADDD   Y9, Y5, Y5
3523	VPXOR    Y5, Y1, Y1
3524	VPSHUFB  rol16<>+0(SB), Y1, Y1
3525	VPADDD   Y1, Y13, Y13
3526	VPXOR    Y13, Y9, Y9
3527	VPSLLD   $0x0c, Y9, Y3
3528	VPSRLD   $0x14, Y9, Y9
3529	VPXOR    Y3, Y9, Y9
3530	VPADDD   Y9, Y5, Y5
3531	VPXOR    Y5, Y1, Y1
3532	VPSHUFB  rol8<>+0(SB), Y1, Y1
3533	VPADDD   Y1, Y13, Y13
3534	VPXOR    Y13, Y9, Y9
3535	VPSLLD   $0x07, Y9, Y3
3536	VPSRLD   $0x19, Y9, Y9
3537	VPXOR    Y3, Y9, Y9
3538	VPADDD   Y10, Y6, Y6
3539	VPXOR    Y6, Y2, Y2
3540	VPSHUFB  rol16<>+0(SB), Y2, Y2
3541	VPADDD   Y2, Y8, Y8
3542	VPXOR    Y8, Y10, Y10
3543	VPSLLD   $0x0c, Y10, Y3
3544	VPSRLD   $0x14, Y10, Y10
3545	VPXOR    Y3, Y10, Y10
3546	VPADDD   Y10, Y6, Y6
3547	VPXOR    Y6, Y2, Y2
3548	VPSHUFB  rol8<>+0(SB), Y2, Y2
3549	VPADDD   Y2, Y8, Y8
3550	VPXOR    Y8, Y10, Y10
3551	VPSLLD   $0x07, Y10, Y3
3552	VPSRLD   $0x19, Y10, Y10
3553	VPXOR    Y3, Y10, Y10
3554	VPALIGNR $0x04, Y14, Y14, Y14
3555	VPALIGNR $0x04, Y9, Y9, Y9
3556	VPALIGNR $0x04, Y10, Y10, Y10
3557	VPALIGNR $0x08, Y12, Y12, Y12
3558	VPALIGNR $0x08, Y13, Y13, Y13
3559	VPALIGNR $0x08, Y8, Y8, Y8
3560	VPALIGNR $0x0c, Y4, Y4, Y4
3561	VPALIGNR $0x0c, Y1, Y1, Y1
3562	VPALIGNR $0x0c, Y2, Y2, Y2
3563	VPADDD   Y14, Y0, Y0
3564	VPXOR    Y0, Y4, Y4
3565	VPSHUFB  rol16<>+0(SB), Y4, Y4
3566	VPADDD   Y4, Y12, Y12
3567	VPXOR    Y12, Y14, Y14
3568	VPSLLD   $0x0c, Y14, Y3
3569	VPSRLD   $0x14, Y14, Y14
3570	VPXOR    Y3, Y14, Y14
3571	VPADDD   Y14, Y0, Y0
3572	VPXOR    Y0, Y4, Y4
3573	VPSHUFB  rol8<>+0(SB), Y4, Y4
3574	VPADDD   Y4, Y12, Y12
3575	VPXOR    Y12, Y14, Y14
3576	VPSLLD   $0x07, Y14, Y3
3577	VPSRLD   $0x19, Y14, Y14
3578	VPXOR    Y3, Y14, Y14
3579	VPADDD   Y9, Y5, Y5
3580	VPXOR    Y5, Y1, Y1
3581	VPSHUFB  rol16<>+0(SB), Y1, Y1
3582	VPADDD   Y1, Y13, Y13
3583	VPXOR    Y13, Y9, Y9
3584	VPSLLD   $0x0c, Y9, Y3
3585	VPSRLD   $0x14, Y9, Y9
3586	VPXOR    Y3, Y9, Y9
3587	VPADDD   Y9, Y5, Y5
3588	VPXOR    Y5, Y1, Y1
3589	VPSHUFB  rol8<>+0(SB), Y1, Y1
3590	VPADDD   Y1, Y13, Y13
3591	VPXOR    Y13, Y9, Y9
3592	VPSLLD   $0x07, Y9, Y3
3593	VPSRLD   $0x19, Y9, Y9
3594	VPXOR    Y3, Y9, Y9
3595	VPADDD   Y10, Y6, Y6
3596	VPXOR    Y6, Y2, Y2
3597	VPSHUFB  rol16<>+0(SB), Y2, Y2
3598	VPADDD   Y2, Y8, Y8
3599	VPXOR    Y8, Y10, Y10
3600	VPSLLD   $0x0c, Y10, Y3
3601	VPSRLD   $0x14, Y10, Y10
3602	VPXOR    Y3, Y10, Y10
3603	VPADDD   Y10, Y6, Y6
3604	VPXOR    Y6, Y2, Y2
3605	VPSHUFB  rol8<>+0(SB), Y2, Y2
3606	VPADDD   Y2, Y8, Y8
3607	VPXOR    Y8, Y10, Y10
3608	VPSLLD   $0x07, Y10, Y3
3609	VPSRLD   $0x19, Y10, Y10
3610	VPXOR    Y3, Y10, Y10
3611	VPALIGNR $0x0c, Y14, Y14, Y14
3612	VPALIGNR $0x0c, Y9, Y9, Y9
3613	VPALIGNR $0x0c, Y10, Y10, Y10
3614	VPALIGNR $0x08, Y12, Y12, Y12
3615	VPALIGNR $0x08, Y13, Y13, Y13
3616	VPALIGNR $0x08, Y8, Y8, Y8
3617	VPALIGNR $0x04, Y4, Y4, Y4
3618	VPALIGNR $0x04, Y1, Y1, Y1
3619	VPALIGNR $0x04, Y2, Y2, Y2
3620	DECQ     R9
3621	JNE      openAVX2320InnerCipherLoop
3622	VMOVDQA  chacha20Constants<>+0(SB), Y3
3623	VPADDD   Y3, Y0, Y0
3624	VPADDD   Y3, Y5, Y5
3625	VPADDD   Y3, Y6, Y6
3626	VPADDD   Y7, Y14, Y14
3627	VPADDD   Y7, Y9, Y9
3628	VPADDD   Y7, Y10, Y10
3629	VPADDD   Y11, Y12, Y12
3630	VPADDD   Y11, Y13, Y13
3631	VPADDD   Y11, Y8, Y8
3632	VMOVDQA  avx2IncMask<>+0(SB), Y3
3633	VPADDD   Y15, Y4, Y4
3634	VPADDD   Y3, Y15, Y15
3635	VPADDD   Y15, Y1, Y1
3636	VPADDD   Y3, Y15, Y15
3637	VPADDD   Y15, Y2, Y2
3638
3639	// Clamp and store poly key
3640	VPERM2I128 $0x02, Y0, Y14, Y3
3641	VPAND      polyClampMask<>+0(SB), Y3, Y3
3642	VMOVDQA    Y3, (BP)
3643
3644	// Stream for up to 320 bytes
3645	VPERM2I128 $0x13, Y0, Y14, Y0
3646	VPERM2I128 $0x13, Y12, Y4, Y14
3647	VPERM2I128 $0x02, Y5, Y9, Y12
3648	VPERM2I128 $0x02, Y13, Y1, Y4
3649	VPERM2I128 $0x13, Y5, Y9, Y5
3650	VPERM2I128 $0x13, Y13, Y1, Y9
3651	VPERM2I128 $0x02, Y6, Y10, Y13
3652	VPERM2I128 $0x02, Y8, Y2, Y1
3653	VPERM2I128 $0x13, Y6, Y10, Y6
3654	VPERM2I128 $0x13, Y8, Y2, Y10
3655	JMP        openAVX2ShortOpen
3656
3657openAVX2Tail128:
3658	// Need to decrypt up to 128 bytes - prepare two blocks
3659	VMOVDQA chacha20Constants<>+0(SB), Y5
3660	VMOVDQA 32(BP), Y9
3661	VMOVDQA 64(BP), Y13
3662	VMOVDQA 192(BP), Y1
3663	VPADDD  avx2IncMask<>+0(SB), Y1, Y1
3664	VMOVDQA Y1, Y4
3665	XORQ    R9, R9
3666	MOVQ    BX, CX
3667	ANDQ    $-16, CX
3668	TESTQ   CX, CX
3669	JE      openAVX2Tail128LoopB
3670
3671openAVX2Tail128LoopA:
3672	ADDQ  (SI)(R9*1), R10
3673	ADCQ  8(SI)(R9*1), R11
3674	ADCQ  $0x01, R12
3675	MOVQ  (BP), DX
3676	MOVQ  DX, R15
3677	MULXQ R10, R13, R14
3678	IMULQ R12, R15
3679	MULXQ R11, AX, DX
3680	ADDQ  AX, R14
3681	ADCQ  DX, R15
3682	MOVQ  8(BP), DX
3683	MULXQ R10, R10, AX
3684	ADDQ  R10, R14
3685	MULXQ R11, R11, R8
3686	ADCQ  R11, R15
3687	ADCQ  $0x00, R8
3688	IMULQ R12, DX
3689	ADDQ  AX, R15
3690	ADCQ  DX, R8
3691	MOVQ  R13, R10
3692	MOVQ  R14, R11
3693	MOVQ  R15, R12
3694	ANDQ  $0x03, R12
3695	MOVQ  R15, R13
3696	ANDQ  $-4, R13
3697	MOVQ  R8, R14
3698	SHRQ  $0x02, R8, R15
3699	SHRQ  $0x02, R8
3700	ADDQ  R13, R10
3701	ADCQ  R14, R11
3702	ADCQ  $0x00, R12
3703	ADDQ  R15, R10
3704	ADCQ  R8, R11
3705	ADCQ  $0x00, R12
3706
3707openAVX2Tail128LoopB:
3708	ADDQ       $0x10, R9
3709	VPADDD     Y9, Y5, Y5
3710	VPXOR      Y5, Y1, Y1
3711	VPSHUFB    rol16<>+0(SB), Y1, Y1
3712	VPADDD     Y1, Y13, Y13
3713	VPXOR      Y13, Y9, Y9
3714	VPSLLD     $0x0c, Y9, Y3
3715	VPSRLD     $0x14, Y9, Y9
3716	VPXOR      Y3, Y9, Y9
3717	VPADDD     Y9, Y5, Y5
3718	VPXOR      Y5, Y1, Y1
3719	VPSHUFB    rol8<>+0(SB), Y1, Y1
3720	VPADDD     Y1, Y13, Y13
3721	VPXOR      Y13, Y9, Y9
3722	VPSLLD     $0x07, Y9, Y3
3723	VPSRLD     $0x19, Y9, Y9
3724	VPXOR      Y3, Y9, Y9
3725	VPALIGNR   $0x04, Y9, Y9, Y9
3726	VPALIGNR   $0x08, Y13, Y13, Y13
3727	VPALIGNR   $0x0c, Y1, Y1, Y1
3728	VPADDD     Y9, Y5, Y5
3729	VPXOR      Y5, Y1, Y1
3730	VPSHUFB    rol16<>+0(SB), Y1, Y1
3731	VPADDD     Y1, Y13, Y13
3732	VPXOR      Y13, Y9, Y9
3733	VPSLLD     $0x0c, Y9, Y3
3734	VPSRLD     $0x14, Y9, Y9
3735	VPXOR      Y3, Y9, Y9
3736	VPADDD     Y9, Y5, Y5
3737	VPXOR      Y5, Y1, Y1
3738	VPSHUFB    rol8<>+0(SB), Y1, Y1
3739	VPADDD     Y1, Y13, Y13
3740	VPXOR      Y13, Y9, Y9
3741	VPSLLD     $0x07, Y9, Y3
3742	VPSRLD     $0x19, Y9, Y9
3743	VPXOR      Y3, Y9, Y9
3744	VPALIGNR   $0x0c, Y9, Y9, Y9
3745	VPALIGNR   $0x08, Y13, Y13, Y13
3746	VPALIGNR   $0x04, Y1, Y1, Y1
3747	CMPQ       R9, CX
3748	JB         openAVX2Tail128LoopA
3749	CMPQ       R9, $0xa0
3750	JNE        openAVX2Tail128LoopB
3751	VPADDD     chacha20Constants<>+0(SB), Y5, Y5
3752	VPADDD     32(BP), Y9, Y9
3753	VPADDD     64(BP), Y13, Y13
3754	VPADDD     Y4, Y1, Y1
3755	VPERM2I128 $0x02, Y5, Y9, Y0
3756	VPERM2I128 $0x02, Y13, Y1, Y14
3757	VPERM2I128 $0x13, Y5, Y9, Y12
3758	VPERM2I128 $0x13, Y13, Y1, Y4
3759
3760openAVX2TailLoop:
3761	CMPQ BX, $0x20
3762	JB   openAVX2Tail
3763	SUBQ $0x20, BX
3764
3765	// Load for decryption
3766	VPXOR   (SI), Y0, Y0
3767	VMOVDQU Y0, (DI)
3768	LEAQ    32(SI), SI
3769	LEAQ    32(DI), DI
3770	VMOVDQA Y14, Y0
3771	VMOVDQA Y12, Y14
3772	VMOVDQA Y4, Y12
3773	JMP     openAVX2TailLoop
3774
3775openAVX2Tail:
3776	CMPQ    BX, $0x10
3777	VMOVDQA X0, X1
3778	JB      openAVX2TailDone
3779	SUBQ    $0x10, BX
3780
3781	// Load for decryption
3782	VPXOR      (SI), X0, X12
3783	VMOVDQU    X12, (DI)
3784	LEAQ       16(SI), SI
3785	LEAQ       16(DI), DI
3786	VPERM2I128 $0x11, Y0, Y0, Y0
3787	VMOVDQA    X0, X1
3788
3789openAVX2TailDone:
3790	VZEROUPPER
3791	JMP openSSETail16
3792
3793openAVX2Tail256:
3794	VMOVDQA chacha20Constants<>+0(SB), Y0
3795	VMOVDQA Y0, Y5
3796	VMOVDQA 32(BP), Y14
3797	VMOVDQA Y14, Y9
3798	VMOVDQA 64(BP), Y12
3799	VMOVDQA Y12, Y13
3800	VMOVDQA 192(BP), Y4
3801	VPADDD  avx2IncMask<>+0(SB), Y4, Y4
3802	VPADDD  avx2IncMask<>+0(SB), Y4, Y1
3803	VMOVDQA Y4, Y7
3804	VMOVDQA Y1, Y11
3805
3806	// Compute the number of iterations that will hash data
3807	MOVQ    BX, 224(BP)
3808	MOVQ    BX, CX
3809	SUBQ    $0x80, CX
3810	SHRQ    $0x04, CX
3811	MOVQ    $0x0000000a, R9
3812	CMPQ    CX, $0x0a
3813	CMOVQGT R9, CX
3814	MOVQ    SI, BX
3815	XORQ    R9, R9
3816
3817openAVX2Tail256LoopA:
3818	ADDQ  (BX), R10
3819	ADCQ  8(BX), R11
3820	ADCQ  $0x01, R12
3821	MOVQ  (BP), DX
3822	MOVQ  DX, R15
3823	MULXQ R10, R13, R14
3824	IMULQ R12, R15
3825	MULXQ R11, AX, DX
3826	ADDQ  AX, R14
3827	ADCQ  DX, R15
3828	MOVQ  8(BP), DX
3829	MULXQ R10, R10, AX
3830	ADDQ  R10, R14
3831	MULXQ R11, R11, R8
3832	ADCQ  R11, R15
3833	ADCQ  $0x00, R8
3834	IMULQ R12, DX
3835	ADDQ  AX, R15
3836	ADCQ  DX, R8
3837	MOVQ  R13, R10
3838	MOVQ  R14, R11
3839	MOVQ  R15, R12
3840	ANDQ  $0x03, R12
3841	MOVQ  R15, R13
3842	ANDQ  $-4, R13
3843	MOVQ  R8, R14
3844	SHRQ  $0x02, R8, R15
3845	SHRQ  $0x02, R8
3846	ADDQ  R13, R10
3847	ADCQ  R14, R11
3848	ADCQ  $0x00, R12
3849	ADDQ  R15, R10
3850	ADCQ  R8, R11
3851	ADCQ  $0x00, R12
3852	LEAQ  16(BX), BX
3853
3854openAVX2Tail256LoopB:
3855	VPADDD   Y14, Y0, Y0
3856	VPXOR    Y0, Y4, Y4
3857	VPSHUFB  rol16<>+0(SB), Y4, Y4
3858	VPADDD   Y4, Y12, Y12
3859	VPXOR    Y12, Y14, Y14
3860	VPSLLD   $0x0c, Y14, Y3
3861	VPSRLD   $0x14, Y14, Y14
3862	VPXOR    Y3, Y14, Y14
3863	VPADDD   Y14, Y0, Y0
3864	VPXOR    Y0, Y4, Y4
3865	VPSHUFB  rol8<>+0(SB), Y4, Y4
3866	VPADDD   Y4, Y12, Y12
3867	VPXOR    Y12, Y14, Y14
3868	VPSLLD   $0x07, Y14, Y3
3869	VPSRLD   $0x19, Y14, Y14
3870	VPXOR    Y3, Y14, Y14
3871	VPADDD   Y9, Y5, Y5
3872	VPXOR    Y5, Y1, Y1
3873	VPSHUFB  rol16<>+0(SB), Y1, Y1
3874	VPADDD   Y1, Y13, Y13
3875	VPXOR    Y13, Y9, Y9
3876	VPSLLD   $0x0c, Y9, Y3
3877	VPSRLD   $0x14, Y9, Y9
3878	VPXOR    Y3, Y9, Y9
3879	VPADDD   Y9, Y5, Y5
3880	VPXOR    Y5, Y1, Y1
3881	VPSHUFB  rol8<>+0(SB), Y1, Y1
3882	VPADDD   Y1, Y13, Y13
3883	VPXOR    Y13, Y9, Y9
3884	VPSLLD   $0x07, Y9, Y3
3885	VPSRLD   $0x19, Y9, Y9
3886	VPXOR    Y3, Y9, Y9
3887	VPALIGNR $0x04, Y14, Y14, Y14
3888	VPALIGNR $0x04, Y9, Y9, Y9
3889	VPALIGNR $0x08, Y12, Y12, Y12
3890	VPALIGNR $0x08, Y13, Y13, Y13
3891	VPALIGNR $0x0c, Y4, Y4, Y4
3892	VPALIGNR $0x0c, Y1, Y1, Y1
3893	INCQ     R9
3894	VPADDD   Y14, Y0, Y0
3895	VPXOR    Y0, Y4, Y4
3896	VPSHUFB  rol16<>+0(SB), Y4, Y4
3897	VPADDD   Y4, Y12, Y12
3898	VPXOR    Y12, Y14, Y14
3899	VPSLLD   $0x0c, Y14, Y3
3900	VPSRLD   $0x14, Y14, Y14
3901	VPXOR    Y3, Y14, Y14
3902	VPADDD   Y14, Y0, Y0
3903	VPXOR    Y0, Y4, Y4
3904	VPSHUFB  rol8<>+0(SB), Y4, Y4
3905	VPADDD   Y4, Y12, Y12
3906	VPXOR    Y12, Y14, Y14
3907	VPSLLD   $0x07, Y14, Y3
3908	VPSRLD   $0x19, Y14, Y14
3909	VPXOR    Y3, Y14, Y14
3910	VPADDD   Y9, Y5, Y5
3911	VPXOR    Y5, Y1, Y1
3912	VPSHUFB  rol16<>+0(SB), Y1, Y1
3913	VPADDD   Y1, Y13, Y13
3914	VPXOR    Y13, Y9, Y9
3915	VPSLLD   $0x0c, Y9, Y3
3916	VPSRLD   $0x14, Y9, Y9
3917	VPXOR    Y3, Y9, Y9
3918	VPADDD   Y9, Y5, Y5
3919	VPXOR    Y5, Y1, Y1
3920	VPSHUFB  rol8<>+0(SB), Y1, Y1
3921	VPADDD   Y1, Y13, Y13
3922	VPXOR    Y13, Y9, Y9
3923	VPSLLD   $0x07, Y9, Y3
3924	VPSRLD   $0x19, Y9, Y9
3925	VPXOR    Y3, Y9, Y9
3926	VPALIGNR $0x0c, Y14, Y14, Y14
3927	VPALIGNR $0x0c, Y9, Y9, Y9
3928	VPALIGNR $0x08, Y12, Y12, Y12
3929	VPALIGNR $0x08, Y13, Y13, Y13
3930	VPALIGNR $0x04, Y4, Y4, Y4
3931	VPALIGNR $0x04, Y1, Y1, Y1
3932	CMPQ     R9, CX
3933	JB       openAVX2Tail256LoopA
3934	CMPQ     R9, $0x0a
3935	JNE      openAVX2Tail256LoopB
3936	MOVQ     BX, R9
3937	SUBQ     SI, BX
3938	MOVQ     BX, CX
3939	MOVQ     224(BP), BX
3940
3941openAVX2Tail256Hash:
3942	ADDQ  $0x10, CX
3943	CMPQ  CX, BX
3944	JGT   openAVX2Tail256HashEnd
3945	ADDQ  (R9), R10
3946	ADCQ  8(R9), R11
3947	ADCQ  $0x01, R12
3948	MOVQ  (BP), DX
3949	MOVQ  DX, R15
3950	MULXQ R10, R13, R14
3951	IMULQ R12, R15
3952	MULXQ R11, AX, DX
3953	ADDQ  AX, R14
3954	ADCQ  DX, R15
3955	MOVQ  8(BP), DX
3956	MULXQ R10, R10, AX
3957	ADDQ  R10, R14
3958	MULXQ R11, R11, R8
3959	ADCQ  R11, R15
3960	ADCQ  $0x00, R8
3961	IMULQ R12, DX
3962	ADDQ  AX, R15
3963	ADCQ  DX, R8
3964	MOVQ  R13, R10
3965	MOVQ  R14, R11
3966	MOVQ  R15, R12
3967	ANDQ  $0x03, R12
3968	MOVQ  R15, R13
3969	ANDQ  $-4, R13
3970	MOVQ  R8, R14
3971	SHRQ  $0x02, R8, R15
3972	SHRQ  $0x02, R8
3973	ADDQ  R13, R10
3974	ADCQ  R14, R11
3975	ADCQ  $0x00, R12
3976	ADDQ  R15, R10
3977	ADCQ  R8, R11
3978	ADCQ  $0x00, R12
3979	LEAQ  16(R9), R9
3980	JMP   openAVX2Tail256Hash
3981
3982openAVX2Tail256HashEnd:
3983	VPADDD     chacha20Constants<>+0(SB), Y0, Y0
3984	VPADDD     chacha20Constants<>+0(SB), Y5, Y5
3985	VPADDD     32(BP), Y14, Y14
3986	VPADDD     32(BP), Y9, Y9
3987	VPADDD     64(BP), Y12, Y12
3988	VPADDD     64(BP), Y13, Y13
3989	VPADDD     Y7, Y4, Y4
3990	VPADDD     Y11, Y1, Y1
3991	VPERM2I128 $0x02, Y0, Y14, Y6
3992	VPERM2I128 $0x02, Y12, Y4, Y10
3993	VPERM2I128 $0x13, Y0, Y14, Y8
3994	VPERM2I128 $0x13, Y12, Y4, Y2
3995	VPERM2I128 $0x02, Y5, Y9, Y0
3996	VPERM2I128 $0x02, Y13, Y1, Y14
3997	VPERM2I128 $0x13, Y5, Y9, Y12
3998	VPERM2I128 $0x13, Y13, Y1, Y4
3999	VPXOR      (SI), Y6, Y6
4000	VPXOR      32(SI), Y10, Y10
4001	VPXOR      64(SI), Y8, Y8
4002	VPXOR      96(SI), Y2, Y2
4003	VMOVDQU    Y6, (DI)
4004	VMOVDQU    Y10, 32(DI)
4005	VMOVDQU    Y8, 64(DI)
4006	VMOVDQU    Y2, 96(DI)
4007	LEAQ       128(SI), SI
4008	LEAQ       128(DI), DI
4009	SUBQ       $0x80, BX
4010	JMP        openAVX2TailLoop
4011
4012openAVX2Tail384:
4013	// Need to decrypt up to 384 bytes - prepare six blocks
4014	VMOVDQA chacha20Constants<>+0(SB), Y0
4015	VMOVDQA Y0, Y5
4016	VMOVDQA Y0, Y6
4017	VMOVDQA 32(BP), Y14
4018	VMOVDQA Y14, Y9
4019	VMOVDQA Y14, Y10
4020	VMOVDQA 64(BP), Y12
4021	VMOVDQA Y12, Y13
4022	VMOVDQA Y12, Y8
4023	VMOVDQA 192(BP), Y4
4024	VPADDD  avx2IncMask<>+0(SB), Y4, Y4
4025	VPADDD  avx2IncMask<>+0(SB), Y4, Y1
4026	VPADDD  avx2IncMask<>+0(SB), Y1, Y2
4027	VMOVDQA Y4, 96(BP)
4028	VMOVDQA Y1, 128(BP)
4029	VMOVDQA Y2, 160(BP)
4030
4031	// Compute the number of iterations that will hash two blocks of data
4032	MOVQ    BX, 224(BP)
4033	MOVQ    BX, CX
4034	SUBQ    $0x00000100, CX
4035	SHRQ    $0x04, CX
4036	ADDQ    $0x06, CX
4037	MOVQ    $0x0000000a, R9
4038	CMPQ    CX, $0x0a
4039	CMOVQGT R9, CX
4040	MOVQ    SI, BX
4041	XORQ    R9, R9
4042
4043openAVX2Tail384LoopB:
4044	ADDQ  (BX), R10
4045	ADCQ  8(BX), R11
4046	ADCQ  $0x01, R12
4047	MOVQ  (BP), DX
4048	MOVQ  DX, R15
4049	MULXQ R10, R13, R14
4050	IMULQ R12, R15
4051	MULXQ R11, AX, DX
4052	ADDQ  AX, R14
4053	ADCQ  DX, R15
4054	MOVQ  8(BP), DX
4055	MULXQ R10, R10, AX
4056	ADDQ  R10, R14
4057	MULXQ R11, R11, R8
4058	ADCQ  R11, R15
4059	ADCQ  $0x00, R8
4060	IMULQ R12, DX
4061	ADDQ  AX, R15
4062	ADCQ  DX, R8
4063	MOVQ  R13, R10
4064	MOVQ  R14, R11
4065	MOVQ  R15, R12
4066	ANDQ  $0x03, R12
4067	MOVQ  R15, R13
4068	ANDQ  $-4, R13
4069	MOVQ  R8, R14
4070	SHRQ  $0x02, R8, R15
4071	SHRQ  $0x02, R8
4072	ADDQ  R13, R10
4073	ADCQ  R14, R11
4074	ADCQ  $0x00, R12
4075	ADDQ  R15, R10
4076	ADCQ  R8, R11
4077	ADCQ  $0x00, R12
4078	LEAQ  16(BX), BX
4079
4080openAVX2Tail384LoopA:
4081	VPADDD   Y14, Y0, Y0
4082	VPXOR    Y0, Y4, Y4
4083	VPSHUFB  rol16<>+0(SB), Y4, Y4
4084	VPADDD   Y4, Y12, Y12
4085	VPXOR    Y12, Y14, Y14
4086	VPSLLD   $0x0c, Y14, Y3
4087	VPSRLD   $0x14, Y14, Y14
4088	VPXOR    Y3, Y14, Y14
4089	VPADDD   Y14, Y0, Y0
4090	VPXOR    Y0, Y4, Y4
4091	VPSHUFB  rol8<>+0(SB), Y4, Y4
4092	VPADDD   Y4, Y12, Y12
4093	VPXOR    Y12, Y14, Y14
4094	VPSLLD   $0x07, Y14, Y3
4095	VPSRLD   $0x19, Y14, Y14
4096	VPXOR    Y3, Y14, Y14
4097	VPADDD   Y9, Y5, Y5
4098	VPXOR    Y5, Y1, Y1
4099	VPSHUFB  rol16<>+0(SB), Y1, Y1
4100	VPADDD   Y1, Y13, Y13
4101	VPXOR    Y13, Y9, Y9
4102	VPSLLD   $0x0c, Y9, Y3
4103	VPSRLD   $0x14, Y9, Y9
4104	VPXOR    Y3, Y9, Y9
4105	VPADDD   Y9, Y5, Y5
4106	VPXOR    Y5, Y1, Y1
4107	VPSHUFB  rol8<>+0(SB), Y1, Y1
4108	VPADDD   Y1, Y13, Y13
4109	VPXOR    Y13, Y9, Y9
4110	VPSLLD   $0x07, Y9, Y3
4111	VPSRLD   $0x19, Y9, Y9
4112	VPXOR    Y3, Y9, Y9
4113	VPADDD   Y10, Y6, Y6
4114	VPXOR    Y6, Y2, Y2
4115	VPSHUFB  rol16<>+0(SB), Y2, Y2
4116	VPADDD   Y2, Y8, Y8
4117	VPXOR    Y8, Y10, Y10
4118	VPSLLD   $0x0c, Y10, Y3
4119	VPSRLD   $0x14, Y10, Y10
4120	VPXOR    Y3, Y10, Y10
4121	VPADDD   Y10, Y6, Y6
4122	VPXOR    Y6, Y2, Y2
4123	VPSHUFB  rol8<>+0(SB), Y2, Y2
4124	VPADDD   Y2, Y8, Y8
4125	VPXOR    Y8, Y10, Y10
4126	VPSLLD   $0x07, Y10, Y3
4127	VPSRLD   $0x19, Y10, Y10
4128	VPXOR    Y3, Y10, Y10
4129	VPALIGNR $0x04, Y14, Y14, Y14
4130	VPALIGNR $0x04, Y9, Y9, Y9
4131	VPALIGNR $0x04, Y10, Y10, Y10
4132	VPALIGNR $0x08, Y12, Y12, Y12
4133	VPALIGNR $0x08, Y13, Y13, Y13
4134	VPALIGNR $0x08, Y8, Y8, Y8
4135	VPALIGNR $0x0c, Y4, Y4, Y4
4136	VPALIGNR $0x0c, Y1, Y1, Y1
4137	VPALIGNR $0x0c, Y2, Y2, Y2
4138	ADDQ     (BX), R10
4139	ADCQ     8(BX), R11
4140	ADCQ     $0x01, R12
4141	MOVQ     (BP), DX
4142	MOVQ     DX, R15
4143	MULXQ    R10, R13, R14
4144	IMULQ    R12, R15
4145	MULXQ    R11, AX, DX
4146	ADDQ     AX, R14
4147	ADCQ     DX, R15
4148	MOVQ     8(BP), DX
4149	MULXQ    R10, R10, AX
4150	ADDQ     R10, R14
4151	MULXQ    R11, R11, R8
4152	ADCQ     R11, R15
4153	ADCQ     $0x00, R8
4154	IMULQ    R12, DX
4155	ADDQ     AX, R15
4156	ADCQ     DX, R8
4157	MOVQ     R13, R10
4158	MOVQ     R14, R11
4159	MOVQ     R15, R12
4160	ANDQ     $0x03, R12
4161	MOVQ     R15, R13
4162	ANDQ     $-4, R13
4163	MOVQ     R8, R14
4164	SHRQ     $0x02, R8, R15
4165	SHRQ     $0x02, R8
4166	ADDQ     R13, R10
4167	ADCQ     R14, R11
4168	ADCQ     $0x00, R12
4169	ADDQ     R15, R10
4170	ADCQ     R8, R11
4171	ADCQ     $0x00, R12
4172	LEAQ     16(BX), BX
4173	INCQ     R9
4174	VPADDD   Y14, Y0, Y0
4175	VPXOR    Y0, Y4, Y4
4176	VPSHUFB  rol16<>+0(SB), Y4, Y4
4177	VPADDD   Y4, Y12, Y12
4178	VPXOR    Y12, Y14, Y14
4179	VPSLLD   $0x0c, Y14, Y3
4180	VPSRLD   $0x14, Y14, Y14
4181	VPXOR    Y3, Y14, Y14
4182	VPADDD   Y14, Y0, Y0
4183	VPXOR    Y0, Y4, Y4
4184	VPSHUFB  rol8<>+0(SB), Y4, Y4
4185	VPADDD   Y4, Y12, Y12
4186	VPXOR    Y12, Y14, Y14
4187	VPSLLD   $0x07, Y14, Y3
4188	VPSRLD   $0x19, Y14, Y14
4189	VPXOR    Y3, Y14, Y14
4190	VPADDD   Y9, Y5, Y5
4191	VPXOR    Y5, Y1, Y1
4192	VPSHUFB  rol16<>+0(SB), Y1, Y1
4193	VPADDD   Y1, Y13, Y13
4194	VPXOR    Y13, Y9, Y9
4195	VPSLLD   $0x0c, Y9, Y3
4196	VPSRLD   $0x14, Y9, Y9
4197	VPXOR    Y3, Y9, Y9
4198	VPADDD   Y9, Y5, Y5
4199	VPXOR    Y5, Y1, Y1
4200	VPSHUFB  rol8<>+0(SB), Y1, Y1
4201	VPADDD   Y1, Y13, Y13
4202	VPXOR    Y13, Y9, Y9
4203	VPSLLD   $0x07, Y9, Y3
4204	VPSRLD   $0x19, Y9, Y9
4205	VPXOR    Y3, Y9, Y9
4206	VPADDD   Y10, Y6, Y6
4207	VPXOR    Y6, Y2, Y2
4208	VPSHUFB  rol16<>+0(SB), Y2, Y2
4209	VPADDD   Y2, Y8, Y8
4210	VPXOR    Y8, Y10, Y10
4211	VPSLLD   $0x0c, Y10, Y3
4212	VPSRLD   $0x14, Y10, Y10
4213	VPXOR    Y3, Y10, Y10
4214	VPADDD   Y10, Y6, Y6
4215	VPXOR    Y6, Y2, Y2
4216	VPSHUFB  rol8<>+0(SB), Y2, Y2
4217	VPADDD   Y2, Y8, Y8
4218	VPXOR    Y8, Y10, Y10
4219	VPSLLD   $0x07, Y10, Y3
4220	VPSRLD   $0x19, Y10, Y10
4221	VPXOR    Y3, Y10, Y10
4222	VPALIGNR $0x0c, Y14, Y14, Y14
4223	VPALIGNR $0x0c, Y9, Y9, Y9
4224	VPALIGNR $0x0c, Y10, Y10, Y10
4225	VPALIGNR $0x08, Y12, Y12, Y12
4226	VPALIGNR $0x08, Y13, Y13, Y13
4227	VPALIGNR $0x08, Y8, Y8, Y8
4228	VPALIGNR $0x04, Y4, Y4, Y4
4229	VPALIGNR $0x04, Y1, Y1, Y1
4230	VPALIGNR $0x04, Y2, Y2, Y2
4231	CMPQ     R9, CX
4232	JB       openAVX2Tail384LoopB
4233	CMPQ     R9, $0x0a
4234	JNE      openAVX2Tail384LoopA
4235	MOVQ     BX, R9
4236	SUBQ     SI, BX
4237	MOVQ     BX, CX
4238	MOVQ     224(BP), BX
4239
4240openAVX2Tail384Hash:
4241	ADDQ  $0x10, CX
4242	CMPQ  CX, BX
4243	JGT   openAVX2Tail384HashEnd
4244	ADDQ  (R9), R10
4245	ADCQ  8(R9), R11
4246	ADCQ  $0x01, R12
4247	MOVQ  (BP), DX
4248	MOVQ  DX, R15
4249	MULXQ R10, R13, R14
4250	IMULQ R12, R15
4251	MULXQ R11, AX, DX
4252	ADDQ  AX, R14
4253	ADCQ  DX, R15
4254	MOVQ  8(BP), DX
4255	MULXQ R10, R10, AX
4256	ADDQ  R10, R14
4257	MULXQ R11, R11, R8
4258	ADCQ  R11, R15
4259	ADCQ  $0x00, R8
4260	IMULQ R12, DX
4261	ADDQ  AX, R15
4262	ADCQ  DX, R8
4263	MOVQ  R13, R10
4264	MOVQ  R14, R11
4265	MOVQ  R15, R12
4266	ANDQ  $0x03, R12
4267	MOVQ  R15, R13
4268	ANDQ  $-4, R13
4269	MOVQ  R8, R14
4270	SHRQ  $0x02, R8, R15
4271	SHRQ  $0x02, R8
4272	ADDQ  R13, R10
4273	ADCQ  R14, R11
4274	ADCQ  $0x00, R12
4275	ADDQ  R15, R10
4276	ADCQ  R8, R11
4277	ADCQ  $0x00, R12
4278	LEAQ  16(R9), R9
4279	JMP   openAVX2Tail384Hash
4280
4281openAVX2Tail384HashEnd:
4282	VPADDD     chacha20Constants<>+0(SB), Y0, Y0
4283	VPADDD     chacha20Constants<>+0(SB), Y5, Y5
4284	VPADDD     chacha20Constants<>+0(SB), Y6, Y6
4285	VPADDD     32(BP), Y14, Y14
4286	VPADDD     32(BP), Y9, Y9
4287	VPADDD     32(BP), Y10, Y10
4288	VPADDD     64(BP), Y12, Y12
4289	VPADDD     64(BP), Y13, Y13
4290	VPADDD     64(BP), Y8, Y8
4291	VPADDD     96(BP), Y4, Y4
4292	VPADDD     128(BP), Y1, Y1
4293	VPADDD     160(BP), Y2, Y2
4294	VPERM2I128 $0x02, Y0, Y14, Y3
4295	VPERM2I128 $0x02, Y12, Y4, Y7
4296	VPERM2I128 $0x13, Y0, Y14, Y11
4297	VPERM2I128 $0x13, Y12, Y4, Y15
4298	VPXOR      (SI), Y3, Y3
4299	VPXOR      32(SI), Y7, Y7
4300	VPXOR      64(SI), Y11, Y11
4301	VPXOR      96(SI), Y15, Y15
4302	VMOVDQU    Y3, (DI)
4303	VMOVDQU    Y7, 32(DI)
4304	VMOVDQU    Y11, 64(DI)
4305	VMOVDQU    Y15, 96(DI)
4306	VPERM2I128 $0x02, Y5, Y9, Y3
4307	VPERM2I128 $0x02, Y13, Y1, Y7
4308	VPERM2I128 $0x13, Y5, Y9, Y11
4309	VPERM2I128 $0x13, Y13, Y1, Y15
4310	VPXOR      128(SI), Y3, Y3
4311	VPXOR      160(SI), Y7, Y7
4312	VPXOR      192(SI), Y11, Y11
4313	VPXOR      224(SI), Y15, Y15
4314	VMOVDQU    Y3, 128(DI)
4315	VMOVDQU    Y7, 160(DI)
4316	VMOVDQU    Y11, 192(DI)
4317	VMOVDQU    Y15, 224(DI)
4318	VPERM2I128 $0x02, Y6, Y10, Y0
4319	VPERM2I128 $0x02, Y8, Y2, Y14
4320	VPERM2I128 $0x13, Y6, Y10, Y12
4321	VPERM2I128 $0x13, Y8, Y2, Y4
4322	LEAQ       256(SI), SI
4323	LEAQ       256(DI), DI
4324	SUBQ       $0x00000100, BX
4325	JMP        openAVX2TailLoop
4326
4327openAVX2Tail512:
4328	VMOVDQU chacha20Constants<>+0(SB), Y0
4329	VMOVDQA Y0, Y5
4330	VMOVDQA Y0, Y6
4331	VMOVDQA Y0, Y7
4332	VMOVDQA 32(BP), Y14
4333	VMOVDQA Y14, Y9
4334	VMOVDQA Y14, Y10
4335	VMOVDQA Y14, Y11
4336	VMOVDQA 64(BP), Y12
4337	VMOVDQA Y12, Y13
4338	VMOVDQA Y12, Y8
4339	VMOVDQA Y12, Y15
4340	VMOVDQA 192(BP), Y4
4341	VPADDD  avx2IncMask<>+0(SB), Y4, Y4
4342	VPADDD  avx2IncMask<>+0(SB), Y4, Y1
4343	VPADDD  avx2IncMask<>+0(SB), Y1, Y2
4344	VPADDD  avx2IncMask<>+0(SB), Y2, Y3
4345	VMOVDQA Y4, 96(BP)
4346	VMOVDQA Y1, 128(BP)
4347	VMOVDQA Y2, 160(BP)
4348	VMOVDQA Y3, 192(BP)
4349	XORQ    CX, CX
4350	MOVQ    SI, R9
4351
4352openAVX2Tail512LoopB:
4353	ADDQ  (R9), R10
4354	ADCQ  8(R9), R11
4355	ADCQ  $0x01, R12
4356	MOVQ  (BP), DX
4357	MOVQ  DX, R15
4358	MULXQ R10, R13, R14
4359	IMULQ R12, R15
4360	MULXQ R11, AX, DX
4361	ADDQ  AX, R14
4362	ADCQ  DX, R15
4363	MOVQ  8(BP), DX
4364	MULXQ R10, R10, AX
4365	ADDQ  R10, R14
4366	MULXQ R11, R11, R8
4367	ADCQ  R11, R15
4368	ADCQ  $0x00, R8
4369	IMULQ R12, DX
4370	ADDQ  AX, R15
4371	ADCQ  DX, R8
4372	MOVQ  R13, R10
4373	MOVQ  R14, R11
4374	MOVQ  R15, R12
4375	ANDQ  $0x03, R12
4376	MOVQ  R15, R13
4377	ANDQ  $-4, R13
4378	MOVQ  R8, R14
4379	SHRQ  $0x02, R8, R15
4380	SHRQ  $0x02, R8
4381	ADDQ  R13, R10
4382	ADCQ  R14, R11
4383	ADCQ  $0x00, R12
4384	ADDQ  R15, R10
4385	ADCQ  R8, R11
4386	ADCQ  $0x00, R12
4387	LEAQ  16(R9), R9
4388
4389openAVX2Tail512LoopA:
4390	VPADDD   Y14, Y0, Y0
4391	VPADDD   Y9, Y5, Y5
4392	VPADDD   Y10, Y6, Y6
4393	VPADDD   Y11, Y7, Y7
4394	VPXOR    Y0, Y4, Y4
4395	VPXOR    Y5, Y1, Y1
4396	VPXOR    Y6, Y2, Y2
4397	VPXOR    Y7, Y3, Y3
4398	VPSHUFB  rol16<>+0(SB), Y4, Y4
4399	VPSHUFB  rol16<>+0(SB), Y1, Y1
4400	VPSHUFB  rol16<>+0(SB), Y2, Y2
4401	VPSHUFB  rol16<>+0(SB), Y3, Y3
4402	VPADDD   Y4, Y12, Y12
4403	VPADDD   Y1, Y13, Y13
4404	VPADDD   Y2, Y8, Y8
4405	VPADDD   Y3, Y15, Y15
4406	VPXOR    Y12, Y14, Y14
4407	VPXOR    Y13, Y9, Y9
4408	VPXOR    Y8, Y10, Y10
4409	VPXOR    Y15, Y11, Y11
4410	VMOVDQA  Y15, 224(BP)
4411	VPSLLD   $0x0c, Y14, Y15
4412	VPSRLD   $0x14, Y14, Y14
4413	VPXOR    Y15, Y14, Y14
4414	VPSLLD   $0x0c, Y9, Y15
4415	VPSRLD   $0x14, Y9, Y9
4416	VPXOR    Y15, Y9, Y9
4417	VPSLLD   $0x0c, Y10, Y15
4418	VPSRLD   $0x14, Y10, Y10
4419	VPXOR    Y15, Y10, Y10
4420	VPSLLD   $0x0c, Y11, Y15
4421	VPSRLD   $0x14, Y11, Y11
4422	VPXOR    Y15, Y11, Y11
4423	VMOVDQA  224(BP), Y15
4424	ADDQ     (R9), R10
4425	ADCQ     8(R9), R11
4426	ADCQ     $0x01, R12
4427	MOVQ     (BP), DX
4428	MOVQ     DX, R15
4429	MULXQ    R10, R13, R14
4430	IMULQ    R12, R15
4431	MULXQ    R11, AX, DX
4432	ADDQ     AX, R14
4433	ADCQ     DX, R15
4434	MOVQ     8(BP), DX
4435	MULXQ    R10, R10, AX
4436	ADDQ     R10, R14
4437	MULXQ    R11, R11, R8
4438	ADCQ     R11, R15
4439	ADCQ     $0x00, R8
4440	IMULQ    R12, DX
4441	ADDQ     AX, R15
4442	ADCQ     DX, R8
4443	MOVQ     R13, R10
4444	MOVQ     R14, R11
4445	MOVQ     R15, R12
4446	ANDQ     $0x03, R12
4447	MOVQ     R15, R13
4448	ANDQ     $-4, R13
4449	MOVQ     R8, R14
4450	SHRQ     $0x02, R8, R15
4451	SHRQ     $0x02, R8
4452	ADDQ     R13, R10
4453	ADCQ     R14, R11
4454	ADCQ     $0x00, R12
4455	ADDQ     R15, R10
4456	ADCQ     R8, R11
4457	ADCQ     $0x00, R12
4458	VPADDD   Y14, Y0, Y0
4459	VPADDD   Y9, Y5, Y5
4460	VPADDD   Y10, Y6, Y6
4461	VPADDD   Y11, Y7, Y7
4462	VPXOR    Y0, Y4, Y4
4463	VPXOR    Y5, Y1, Y1
4464	VPXOR    Y6, Y2, Y2
4465	VPXOR    Y7, Y3, Y3
4466	VPSHUFB  rol8<>+0(SB), Y4, Y4
4467	VPSHUFB  rol8<>+0(SB), Y1, Y1
4468	VPSHUFB  rol8<>+0(SB), Y2, Y2
4469	VPSHUFB  rol8<>+0(SB), Y3, Y3
4470	VPADDD   Y4, Y12, Y12
4471	VPADDD   Y1, Y13, Y13
4472	VPADDD   Y2, Y8, Y8
4473	VPADDD   Y3, Y15, Y15
4474	VPXOR    Y12, Y14, Y14
4475	VPXOR    Y13, Y9, Y9
4476	VPXOR    Y8, Y10, Y10
4477	VPXOR    Y15, Y11, Y11
4478	VMOVDQA  Y15, 224(BP)
4479	VPSLLD   $0x07, Y14, Y15
4480	VPSRLD   $0x19, Y14, Y14
4481	VPXOR    Y15, Y14, Y14
4482	VPSLLD   $0x07, Y9, Y15
4483	VPSRLD   $0x19, Y9, Y9
4484	VPXOR    Y15, Y9, Y9
4485	VPSLLD   $0x07, Y10, Y15
4486	VPSRLD   $0x19, Y10, Y10
4487	VPXOR    Y15, Y10, Y10
4488	VPSLLD   $0x07, Y11, Y15
4489	VPSRLD   $0x19, Y11, Y11
4490	VPXOR    Y15, Y11, Y11
4491	VMOVDQA  224(BP), Y15
4492	VPALIGNR $0x04, Y14, Y14, Y14
4493	VPALIGNR $0x04, Y9, Y9, Y9
4494	VPALIGNR $0x04, Y10, Y10, Y10
4495	VPALIGNR $0x04, Y11, Y11, Y11
4496	VPALIGNR $0x08, Y12, Y12, Y12
4497	VPALIGNR $0x08, Y13, Y13, Y13
4498	VPALIGNR $0x08, Y8, Y8, Y8
4499	VPALIGNR $0x08, Y15, Y15, Y15
4500	VPALIGNR $0x0c, Y4, Y4, Y4
4501	VPALIGNR $0x0c, Y1, Y1, Y1
4502	VPALIGNR $0x0c, Y2, Y2, Y2
4503	VPALIGNR $0x0c, Y3, Y3, Y3
4504	VPADDD   Y14, Y0, Y0
4505	VPADDD   Y9, Y5, Y5
4506	VPADDD   Y10, Y6, Y6
4507	VPADDD   Y11, Y7, Y7
4508	VPXOR    Y0, Y4, Y4
4509	VPXOR    Y5, Y1, Y1
4510	VPXOR    Y6, Y2, Y2
4511	VPXOR    Y7, Y3, Y3
4512	VPSHUFB  rol16<>+0(SB), Y4, Y4
4513	VPSHUFB  rol16<>+0(SB), Y1, Y1
4514	VPSHUFB  rol16<>+0(SB), Y2, Y2
4515	VPSHUFB  rol16<>+0(SB), Y3, Y3
4516	VPADDD   Y4, Y12, Y12
4517	VPADDD   Y1, Y13, Y13
4518	VPADDD   Y2, Y8, Y8
4519	VPADDD   Y3, Y15, Y15
4520	VPXOR    Y12, Y14, Y14
4521	VPXOR    Y13, Y9, Y9
4522	VPXOR    Y8, Y10, Y10
4523	VPXOR    Y15, Y11, Y11
4524	ADDQ     16(R9), R10
4525	ADCQ     24(R9), R11
4526	ADCQ     $0x01, R12
4527	MOVQ     (BP), DX
4528	MOVQ     DX, R15
4529	MULXQ    R10, R13, R14
4530	IMULQ    R12, R15
4531	MULXQ    R11, AX, DX
4532	ADDQ     AX, R14
4533	ADCQ     DX, R15
4534	MOVQ     8(BP), DX
4535	MULXQ    R10, R10, AX
4536	ADDQ     R10, R14
4537	MULXQ    R11, R11, R8
4538	ADCQ     R11, R15
4539	ADCQ     $0x00, R8
4540	IMULQ    R12, DX
4541	ADDQ     AX, R15
4542	ADCQ     DX, R8
4543	MOVQ     R13, R10
4544	MOVQ     R14, R11
4545	MOVQ     R15, R12
4546	ANDQ     $0x03, R12
4547	MOVQ     R15, R13
4548	ANDQ     $-4, R13
4549	MOVQ     R8, R14
4550	SHRQ     $0x02, R8, R15
4551	SHRQ     $0x02, R8
4552	ADDQ     R13, R10
4553	ADCQ     R14, R11
4554	ADCQ     $0x00, R12
4555	ADDQ     R15, R10
4556	ADCQ     R8, R11
4557	ADCQ     $0x00, R12
4558	LEAQ     32(R9), R9
4559	VMOVDQA  Y15, 224(BP)
4560	VPSLLD   $0x0c, Y14, Y15
4561	VPSRLD   $0x14, Y14, Y14
4562	VPXOR    Y15, Y14, Y14
4563	VPSLLD   $0x0c, Y9, Y15
4564	VPSRLD   $0x14, Y9, Y9
4565	VPXOR    Y15, Y9, Y9
4566	VPSLLD   $0x0c, Y10, Y15
4567	VPSRLD   $0x14, Y10, Y10
4568	VPXOR    Y15, Y10, Y10
4569	VPSLLD   $0x0c, Y11, Y15
4570	VPSRLD   $0x14, Y11, Y11
4571	VPXOR    Y15, Y11, Y11
4572	VMOVDQA  224(BP), Y15
4573	VPADDD   Y14, Y0, Y0
4574	VPADDD   Y9, Y5, Y5
4575	VPADDD   Y10, Y6, Y6
4576	VPADDD   Y11, Y7, Y7
4577	VPXOR    Y0, Y4, Y4
4578	VPXOR    Y5, Y1, Y1
4579	VPXOR    Y6, Y2, Y2
4580	VPXOR    Y7, Y3, Y3
4581	VPSHUFB  rol8<>+0(SB), Y4, Y4
4582	VPSHUFB  rol8<>+0(SB), Y1, Y1
4583	VPSHUFB  rol8<>+0(SB), Y2, Y2
4584	VPSHUFB  rol8<>+0(SB), Y3, Y3
4585	VPADDD   Y4, Y12, Y12
4586	VPADDD   Y1, Y13, Y13
4587	VPADDD   Y2, Y8, Y8
4588	VPADDD   Y3, Y15, Y15
4589	VPXOR    Y12, Y14, Y14
4590	VPXOR    Y13, Y9, Y9
4591	VPXOR    Y8, Y10, Y10
4592	VPXOR    Y15, Y11, Y11
4593	VMOVDQA  Y15, 224(BP)
4594	VPSLLD   $0x07, Y14, Y15
4595	VPSRLD   $0x19, Y14, Y14
4596	VPXOR    Y15, Y14, Y14
4597	VPSLLD   $0x07, Y9, Y15
4598	VPSRLD   $0x19, Y9, Y9
4599	VPXOR    Y15, Y9, Y9
4600	VPSLLD   $0x07, Y10, Y15
4601	VPSRLD   $0x19, Y10, Y10
4602	VPXOR    Y15, Y10, Y10
4603	VPSLLD   $0x07, Y11, Y15
4604	VPSRLD   $0x19, Y11, Y11
4605	VPXOR    Y15, Y11, Y11
4606	VMOVDQA  224(BP), Y15
4607	VPALIGNR $0x0c, Y14, Y14, Y14
4608	VPALIGNR $0x0c, Y9, Y9, Y9
4609	VPALIGNR $0x0c, Y10, Y10, Y10
4610	VPALIGNR $0x0c, Y11, Y11, Y11
4611	VPALIGNR $0x08, Y12, Y12, Y12
4612	VPALIGNR $0x08, Y13, Y13, Y13
4613	VPALIGNR $0x08, Y8, Y8, Y8
4614	VPALIGNR $0x08, Y15, Y15, Y15
4615	VPALIGNR $0x04, Y4, Y4, Y4
4616	VPALIGNR $0x04, Y1, Y1, Y1
4617	VPALIGNR $0x04, Y2, Y2, Y2
4618	VPALIGNR $0x04, Y3, Y3, Y3
4619	INCQ     CX
4620	CMPQ     CX, $0x04
4621	JLT      openAVX2Tail512LoopB
4622	CMPQ     CX, $0x0a
4623	JNE      openAVX2Tail512LoopA
4624	MOVQ     BX, CX
4625	SUBQ     $0x00000180, CX
4626	ANDQ     $-16, CX
4627
4628openAVX2Tail512HashLoop:
4629	TESTQ CX, CX
4630	JE    openAVX2Tail512HashEnd
4631	ADDQ  (R9), R10
4632	ADCQ  8(R9), R11
4633	ADCQ  $0x01, R12
4634	MOVQ  (BP), DX
4635	MOVQ  DX, R15
4636	MULXQ R10, R13, R14
4637	IMULQ R12, R15
4638	MULXQ R11, AX, DX
4639	ADDQ  AX, R14
4640	ADCQ  DX, R15
4641	MOVQ  8(BP), DX
4642	MULXQ R10, R10, AX
4643	ADDQ  R10, R14
4644	MULXQ R11, R11, R8
4645	ADCQ  R11, R15
4646	ADCQ  $0x00, R8
4647	IMULQ R12, DX
4648	ADDQ  AX, R15
4649	ADCQ  DX, R8
4650	MOVQ  R13, R10
4651	MOVQ  R14, R11
4652	MOVQ  R15, R12
4653	ANDQ  $0x03, R12
4654	MOVQ  R15, R13
4655	ANDQ  $-4, R13
4656	MOVQ  R8, R14
4657	SHRQ  $0x02, R8, R15
4658	SHRQ  $0x02, R8
4659	ADDQ  R13, R10
4660	ADCQ  R14, R11
4661	ADCQ  $0x00, R12
4662	ADDQ  R15, R10
4663	ADCQ  R8, R11
4664	ADCQ  $0x00, R12
4665	LEAQ  16(R9), R9
4666	SUBQ  $0x10, CX
4667	JMP   openAVX2Tail512HashLoop
4668
4669openAVX2Tail512HashEnd:
4670	VPADDD     chacha20Constants<>+0(SB), Y0, Y0
4671	VPADDD     chacha20Constants<>+0(SB), Y5, Y5
4672	VPADDD     chacha20Constants<>+0(SB), Y6, Y6
4673	VPADDD     chacha20Constants<>+0(SB), Y7, Y7
4674	VPADDD     32(BP), Y14, Y14
4675	VPADDD     32(BP), Y9, Y9
4676	VPADDD     32(BP), Y10, Y10
4677	VPADDD     32(BP), Y11, Y11
4678	VPADDD     64(BP), Y12, Y12
4679	VPADDD     64(BP), Y13, Y13
4680	VPADDD     64(BP), Y8, Y8
4681	VPADDD     64(BP), Y15, Y15
4682	VPADDD     96(BP), Y4, Y4
4683	VPADDD     128(BP), Y1, Y1
4684	VPADDD     160(BP), Y2, Y2
4685	VPADDD     192(BP), Y3, Y3
4686	VMOVDQA    Y15, 224(BP)
4687	VPERM2I128 $0x02, Y0, Y14, Y15
4688	VPERM2I128 $0x13, Y0, Y14, Y14
4689	VPERM2I128 $0x02, Y12, Y4, Y0
4690	VPERM2I128 $0x13, Y12, Y4, Y12
4691	VPXOR      (SI), Y15, Y15
4692	VPXOR      32(SI), Y0, Y0
4693	VPXOR      64(SI), Y14, Y14
4694	VPXOR      96(SI), Y12, Y12
4695	VMOVDQU    Y15, (DI)
4696	VMOVDQU    Y0, 32(DI)
4697	VMOVDQU    Y14, 64(DI)
4698	VMOVDQU    Y12, 96(DI)
4699	VPERM2I128 $0x02, Y5, Y9, Y0
4700	VPERM2I128 $0x02, Y13, Y1, Y14
4701	VPERM2I128 $0x13, Y5, Y9, Y12
4702	VPERM2I128 $0x13, Y13, Y1, Y4
4703	VPXOR      128(SI), Y0, Y0
4704	VPXOR      160(SI), Y14, Y14
4705	VPXOR      192(SI), Y12, Y12
4706	VPXOR      224(SI), Y4, Y4
4707	VMOVDQU    Y0, 128(DI)
4708	VMOVDQU    Y14, 160(DI)
4709	VMOVDQU    Y12, 192(DI)
4710	VMOVDQU    Y4, 224(DI)
4711	VPERM2I128 $0x02, Y6, Y10, Y0
4712	VPERM2I128 $0x02, Y8, Y2, Y14
4713	VPERM2I128 $0x13, Y6, Y10, Y12
4714	VPERM2I128 $0x13, Y8, Y2, Y4
4715	VPXOR      256(SI), Y0, Y0
4716	VPXOR      288(SI), Y14, Y14
4717	VPXOR      320(SI), Y12, Y12
4718	VPXOR      352(SI), Y4, Y4
4719	VMOVDQU    Y0, 256(DI)
4720	VMOVDQU    Y14, 288(DI)
4721	VMOVDQU    Y12, 320(DI)
4722	VMOVDQU    Y4, 352(DI)
4723	VPERM2I128 $0x02, Y7, Y11, Y0
4724	VPERM2I128 $0x02, 224(BP), Y3, Y14
4725	VPERM2I128 $0x13, Y7, Y11, Y12
4726	VPERM2I128 $0x13, 224(BP), Y3, Y4
4727	LEAQ       384(SI), SI
4728	LEAQ       384(DI), DI
4729	SUBQ       $0x00000180, BX
4730	JMP        openAVX2TailLoop
4731
// ChaCha20 "sigma" constant ("expand 32-byte k" as four little-endian
// uint32s). Duplicated across both 128-bit lanes so a single 256-bit
// (AVX2) load yields the constant row for two ChaCha blocks at once;
// SSE code paths load only the first 16 bytes.
DATA chacha20Constants<>+0(SB)/4, $0x61707865
DATA chacha20Constants<>+4(SB)/4, $0x3320646e
DATA chacha20Constants<>+8(SB)/4, $0x79622d32
DATA chacha20Constants<>+12(SB)/4, $0x6b206574
DATA chacha20Constants<>+16(SB)/4, $0x61707865
DATA chacha20Constants<>+20(SB)/4, $0x3320646e
DATA chacha20Constants<>+24(SB)/4, $0x79622d32
DATA chacha20Constants<>+28(SB)/4, $0x6b206574
GLOBL chacha20Constants<>(SB), RODATA|NOPTR, $32
4741
// Poly1305 key clamp: the low 16 bytes mask the "r" half of the one-time
// key per RFC 8439 (clearing the top 4 bits of each 32-bit limb and the
// low 2 bits of the upper three limbs); the high 16 bytes are all-ones so
// the "s" half passes through unchanged when the mask is applied with a
// single 256-bit (or paired 128-bit) AND.
DATA polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff
DATA polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc
DATA polyClampMask<>+16(SB)/8, $0xffffffffffffffff
DATA polyClampMask<>+24(SB)/8, $0xffffffffffffffff
GLOBL polyClampMask<>(SB), RODATA|NOPTR, $32
4747
// Counter increment for the SSE paths: added with PADDL to a 128-bit
// counter row, it bumps the low 32-bit block counter by 1 and leaves the
// nonce words untouched.
DATA sseIncMask<>+0(SB)/8, $0x0000000000000001
DATA sseIncMask<>+8(SB)/8, $0x0000000000000000
GLOBL sseIncMask<>(SB), RODATA|NOPTR, $16
4751
// Table of fifteen 16-byte masks, one per 16-byte slot; entry i (at
// offset i*16) keeps the low i+1 bytes of a block and zeroes the rest.
// Used to mask the final partial block of plaintext/AAD when its length
// is not a multiple of 16.
DATA andMask<>+0(SB)/8, $0x00000000000000ff
DATA andMask<>+8(SB)/8, $0x0000000000000000
DATA andMask<>+16(SB)/8, $0x000000000000ffff
DATA andMask<>+24(SB)/8, $0x0000000000000000
DATA andMask<>+32(SB)/8, $0x0000000000ffffff
DATA andMask<>+40(SB)/8, $0x0000000000000000
DATA andMask<>+48(SB)/8, $0x00000000ffffffff
DATA andMask<>+56(SB)/8, $0x0000000000000000
DATA andMask<>+64(SB)/8, $0x000000ffffffffff
DATA andMask<>+72(SB)/8, $0x0000000000000000
DATA andMask<>+80(SB)/8, $0x0000ffffffffffff
DATA andMask<>+88(SB)/8, $0x0000000000000000
DATA andMask<>+96(SB)/8, $0x00ffffffffffffff
DATA andMask<>+104(SB)/8, $0x0000000000000000
DATA andMask<>+112(SB)/8, $0xffffffffffffffff
DATA andMask<>+120(SB)/8, $0x0000000000000000
DATA andMask<>+128(SB)/8, $0xffffffffffffffff
DATA andMask<>+136(SB)/8, $0x00000000000000ff
DATA andMask<>+144(SB)/8, $0xffffffffffffffff
DATA andMask<>+152(SB)/8, $0x000000000000ffff
DATA andMask<>+160(SB)/8, $0xffffffffffffffff
DATA andMask<>+168(SB)/8, $0x0000000000ffffff
DATA andMask<>+176(SB)/8, $0xffffffffffffffff
DATA andMask<>+184(SB)/8, $0x00000000ffffffff
DATA andMask<>+192(SB)/8, $0xffffffffffffffff
DATA andMask<>+200(SB)/8, $0x000000ffffffffff
DATA andMask<>+208(SB)/8, $0xffffffffffffffff
DATA andMask<>+216(SB)/8, $0x0000ffffffffffff
DATA andMask<>+224(SB)/8, $0xffffffffffffffff
DATA andMask<>+232(SB)/8, $0x00ffffffffffffff
GLOBL andMask<>(SB), RODATA|NOPTR, $240
4783
// One-time AVX2 counter setup: added to a 256-bit counter row holding two
// copies of the initial state, it leaves the low 128-bit lane's block
// counter at n and bumps the high lane's to n+1, so the two lanes encrypt
// consecutive blocks.
DATA avx2InitMask<>+0(SB)/8, $0x0000000000000000
DATA avx2InitMask<>+8(SB)/8, $0x0000000000000000
DATA avx2InitMask<>+16(SB)/8, $0x0000000000000001
DATA avx2InitMask<>+24(SB)/8, $0x0000000000000000
GLOBL avx2InitMask<>(SB), RODATA|NOPTR, $32
4789
// PSHUFB/VPSHUFB control mask that rotates each 32-bit word left by 16
// bits (per-dword byte order 2,3,0,1), replicated across both 128-bit
// lanes. A byte shuffle is faster than the shift/shift/xor rotate for
// the ChaCha quarter-round's 16-bit rotation.
DATA rol16<>+0(SB)/8, $0x0504070601000302
DATA rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a
DATA rol16<>+16(SB)/8, $0x0504070601000302
DATA rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a
GLOBL rol16<>(SB), RODATA|NOPTR, $32
4795
// PSHUFB/VPSHUFB control mask that rotates each 32-bit word left by 8
// bits (per-dword byte order 3,0,1,2), replicated across both 128-bit
// lanes; used for the ChaCha quarter-round's 8-bit rotation.
DATA rol8<>+0(SB)/8, $0x0605040702010003
DATA rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b
DATA rol8<>+16(SB)/8, $0x0605040702010003
DATA rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b
GLOBL rol8<>(SB), RODATA|NOPTR, $32
4801
// Steady-state AVX2 counter increment: added with VPADDD to a 256-bit
// counter row, it advances the block counter in BOTH 128-bit lanes by 2,
// since each AVX2 register processes two ChaCha blocks per iteration.
DATA avx2IncMask<>+0(SB)/8, $0x0000000000000002
DATA avx2IncMask<>+8(SB)/8, $0x0000000000000000
DATA avx2IncMask<>+16(SB)/8, $0x0000000000000002
DATA avx2IncMask<>+24(SB)/8, $0x0000000000000000
GLOBL avx2IncMask<>(SB), RODATA|NOPTR, $32
4807
4808// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte)
4809// Requires: AVX, AVX2, BMI2, CMOV, SSE2
4810TEXT chacha20Poly1305Seal(SB), $288-96
4811	MOVQ SP, BP
4812	ADDQ $0x20, BP
4813	ANDQ $-32, BP
4814	MOVQ dst_base+0(FP), DI
4815	MOVQ key_base+24(FP), R8
4816	MOVQ src_base+48(FP), SI
4817	MOVQ src_len+56(FP), BX
4818	MOVQ ad_base+72(FP), CX
4819	CMPB useAVX2+0(SB), $0x01
4820	JE   chacha20Poly1305Seal_AVX2
4821
4822	// Special optimization, for very short buffers
4823	CMPQ BX, $0x80
4824	JBE  sealSSE128
4825
4826	// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
4827	MOVOU chacha20Constants<>+0(SB), X0
4828	MOVOU 16(R8), X3
4829	MOVOU 32(R8), X6
4830	MOVOU 48(R8), X9
4831
4832	// Store state on stack for future use
4833	MOVO X3, 32(BP)
4834	MOVO X6, 48(BP)
4835
4836	// Load state, increment counter blocks
4837	MOVO  X0, X1
4838	MOVO  X3, X4
4839	MOVO  X6, X7
4840	MOVO  X9, X10
4841	PADDL sseIncMask<>+0(SB), X10
4842	MOVO  X1, X2
4843	MOVO  X4, X5
4844	MOVO  X7, X8
4845	MOVO  X10, X11
4846	PADDL sseIncMask<>+0(SB), X11
4847	MOVO  X2, X12
4848	MOVO  X5, X13
4849	MOVO  X8, X14
4850	MOVO  X11, X15
4851	PADDL sseIncMask<>+0(SB), X15
4852
4853	// Store counters
4854	MOVO X9, 80(BP)
4855	MOVO X10, 96(BP)
4856	MOVO X11, 112(BP)
4857	MOVO X15, 128(BP)
4858	MOVQ $0x0000000a, R9
4859
4860sealSSEIntroLoop:
4861	MOVO  X14, 64(BP)
4862	PADDD X3, X0
4863	PXOR  X0, X9
4864	ROL16(X9, X14)
4865	PADDD X9, X6
4866	PXOR  X6, X3
4867	MOVO  X3, X14
4868	PSLLL $0x0c, X14
4869	PSRLL $0x14, X3
4870	PXOR  X14, X3
4871	PADDD X3, X0
4872	PXOR  X0, X9
4873	ROL8(X9, X14)
4874	PADDD X9, X6
4875	PXOR  X6, X3
4876	MOVO  X3, X14
4877	PSLLL $0x07, X14
4878	PSRLL $0x19, X3
4879	PXOR  X14, X3
4880	PADDD X4, X1
4881	PXOR  X1, X10
4882	ROL16(X10, X14)
4883	PADDD X10, X7
4884	PXOR  X7, X4
4885	MOVO  X4, X14
4886	PSLLL $0x0c, X14
4887	PSRLL $0x14, X4
4888	PXOR  X14, X4
4889	PADDD X4, X1
4890	PXOR  X1, X10
4891	ROL8(X10, X14)
4892	PADDD X10, X7
4893	PXOR  X7, X4
4894	MOVO  X4, X14
4895	PSLLL $0x07, X14
4896	PSRLL $0x19, X4
4897	PXOR  X14, X4
4898	PADDD X5, X2
4899	PXOR  X2, X11
4900	ROL16(X11, X14)
4901	PADDD X11, X8
4902	PXOR  X8, X5
4903	MOVO  X5, X14
4904	PSLLL $0x0c, X14
4905	PSRLL $0x14, X5
4906	PXOR  X14, X5
4907	PADDD X5, X2
4908	PXOR  X2, X11
4909	ROL8(X11, X14)
4910	PADDD X11, X8
4911	PXOR  X8, X5
4912	MOVO  X5, X14
4913	PSLLL $0x07, X14
4914	PSRLL $0x19, X5
4915	PXOR  X14, X5
4916	MOVO  64(BP), X14
4917	MOVO  X7, 64(BP)
4918	PADDD X13, X12
4919	PXOR  X12, X15
4920	ROL16(X15, X7)
4921	PADDD X15, X14
4922	PXOR  X14, X13
4923	MOVO  X13, X7
4924	PSLLL $0x0c, X7
4925	PSRLL $0x14, X13
4926	PXOR  X7, X13
4927	PADDD X13, X12
4928	PXOR  X12, X15
4929	ROL8(X15, X7)
4930	PADDD X15, X14
4931	PXOR  X14, X13
4932	MOVO  X13, X7
4933	PSLLL $0x07, X7
4934	PSRLL $0x19, X13
4935	PXOR  X7, X13
4936	MOVO  64(BP), X7
4937	BYTE  $0x66
4938	BYTE  $0x0f
4939	BYTE  $0x3a
4940	BYTE  $0x0f
4941	BYTE  $0xdb
4942	BYTE  $0x04
4943	BYTE  $0x66
4944	BYTE  $0x0f
4945	BYTE  $0x3a
4946	BYTE  $0x0f
4947	BYTE  $0xe4
4948	BYTE  $0x04
4949	BYTE  $0x66
4950	BYTE  $0x0f
4951	BYTE  $0x3a
4952	BYTE  $0x0f
4953	BYTE  $0xed
4954	BYTE  $0x04
4955	BYTE  $0x66
4956	BYTE  $0x45
4957	BYTE  $0x0f
4958	BYTE  $0x3a
4959	BYTE  $0x0f
4960	BYTE  $0xed
4961	BYTE  $0x04
4962	BYTE  $0x66
4963	BYTE  $0x0f
4964	BYTE  $0x3a
4965	BYTE  $0x0f
4966	BYTE  $0xf6
4967	BYTE  $0x08
4968	BYTE  $0x66
4969	BYTE  $0x0f
4970	BYTE  $0x3a
4971	BYTE  $0x0f
4972	BYTE  $0xff
4973	BYTE  $0x08
4974	BYTE  $0x66
4975	BYTE  $0x45
4976	BYTE  $0x0f
4977	BYTE  $0x3a
4978	BYTE  $0x0f
4979	BYTE  $0xc0
4980	BYTE  $0x08
4981	BYTE  $0x66
4982	BYTE  $0x45
4983	BYTE  $0x0f
4984	BYTE  $0x3a
4985	BYTE  $0x0f
4986	BYTE  $0xf6
4987	BYTE  $0x08
4988	BYTE  $0x66
4989	BYTE  $0x45
4990	BYTE  $0x0f
4991	BYTE  $0x3a
4992	BYTE  $0x0f
4993	BYTE  $0xc9
4994	BYTE  $0x0c
4995	BYTE  $0x66
4996	BYTE  $0x45
4997	BYTE  $0x0f
4998	BYTE  $0x3a
4999	BYTE  $0x0f
5000	BYTE  $0xd2
5001	BYTE  $0x0c
5002	BYTE  $0x66
5003	BYTE  $0x45
5004	BYTE  $0x0f
5005	BYTE  $0x3a
5006	BYTE  $0x0f
5007	BYTE  $0xdb
5008	BYTE  $0x0c
5009	BYTE  $0x66
5010	BYTE  $0x45
5011	BYTE  $0x0f
5012	BYTE  $0x3a
5013	BYTE  $0x0f
5014	BYTE  $0xff
5015	BYTE  $0x0c
5016	MOVO  X14, 64(BP)
5017	PADDD X3, X0
5018	PXOR  X0, X9
5019	ROL16(X9, X14)
5020	PADDD X9, X6
5021	PXOR  X6, X3
5022	MOVO  X3, X14
5023	PSLLL $0x0c, X14
5024	PSRLL $0x14, X3
5025	PXOR  X14, X3
5026	PADDD X3, X0
5027	PXOR  X0, X9
5028	ROL8(X9, X14)
5029	PADDD X9, X6
5030	PXOR  X6, X3
5031	MOVO  X3, X14
5032	PSLLL $0x07, X14
5033	PSRLL $0x19, X3
5034	PXOR  X14, X3
5035	PADDD X4, X1
5036	PXOR  X1, X10
5037	ROL16(X10, X14)
5038	PADDD X10, X7
5039	PXOR  X7, X4
5040	MOVO  X4, X14
5041	PSLLL $0x0c, X14
5042	PSRLL $0x14, X4
5043	PXOR  X14, X4
5044	PADDD X4, X1
5045	PXOR  X1, X10
5046	ROL8(X10, X14)
5047	PADDD X10, X7
5048	PXOR  X7, X4
5049	MOVO  X4, X14
5050	PSLLL $0x07, X14
5051	PSRLL $0x19, X4
5052	PXOR  X14, X4
5053	PADDD X5, X2
5054	PXOR  X2, X11
5055	ROL16(X11, X14)
5056	PADDD X11, X8
5057	PXOR  X8, X5
5058	MOVO  X5, X14
5059	PSLLL $0x0c, X14
5060	PSRLL $0x14, X5
5061	PXOR  X14, X5
5062	PADDD X5, X2
5063	PXOR  X2, X11
5064	ROL8(X11, X14)
5065	PADDD X11, X8
5066	PXOR  X8, X5
5067	MOVO  X5, X14
5068	PSLLL $0x07, X14
5069	PSRLL $0x19, X5
5070	PXOR  X14, X5
5071	MOVO  64(BP), X14
5072	MOVO  X7, 64(BP)
5073	PADDD X13, X12
5074	PXOR  X12, X15
5075	ROL16(X15, X7)
5076	PADDD X15, X14
5077	PXOR  X14, X13
5078	MOVO  X13, X7
5079	PSLLL $0x0c, X7
5080	PSRLL $0x14, X13
5081	PXOR  X7, X13
5082	PADDD X13, X12
5083	PXOR  X12, X15
5084	ROL8(X15, X7)
5085	PADDD X15, X14
5086	PXOR  X14, X13
5087	MOVO  X13, X7
5088	PSLLL $0x07, X7
5089	PSRLL $0x19, X13
5090	PXOR  X7, X13
5091	MOVO  64(BP), X7
5092	BYTE  $0x66
5093	BYTE  $0x0f
5094	BYTE  $0x3a
5095	BYTE  $0x0f
5096	BYTE  $0xdb
5097	BYTE  $0x0c
5098	BYTE  $0x66
5099	BYTE  $0x0f
5100	BYTE  $0x3a
5101	BYTE  $0x0f
5102	BYTE  $0xe4
5103	BYTE  $0x0c
5104	BYTE  $0x66
5105	BYTE  $0x0f
5106	BYTE  $0x3a
5107	BYTE  $0x0f
5108	BYTE  $0xed
5109	BYTE  $0x0c
5110	BYTE  $0x66
5111	BYTE  $0x45
5112	BYTE  $0x0f
5113	BYTE  $0x3a
5114	BYTE  $0x0f
5115	BYTE  $0xed
5116	BYTE  $0x0c
5117	BYTE  $0x66
5118	BYTE  $0x0f
5119	BYTE  $0x3a
5120	BYTE  $0x0f
5121	BYTE  $0xf6
5122	BYTE  $0x08
5123	BYTE  $0x66
5124	BYTE  $0x0f
5125	BYTE  $0x3a
5126	BYTE  $0x0f
5127	BYTE  $0xff
5128	BYTE  $0x08
5129	BYTE  $0x66
5130	BYTE  $0x45
5131	BYTE  $0x0f
5132	BYTE  $0x3a
5133	BYTE  $0x0f
5134	BYTE  $0xc0
5135	BYTE  $0x08
5136	BYTE  $0x66
5137	BYTE  $0x45
5138	BYTE  $0x0f
5139	BYTE  $0x3a
5140	BYTE  $0x0f
5141	BYTE  $0xf6
5142	BYTE  $0x08
5143	BYTE  $0x66
5144	BYTE  $0x45
5145	BYTE  $0x0f
5146	BYTE  $0x3a
5147	BYTE  $0x0f
5148	BYTE  $0xc9
5149	BYTE  $0x04
5150	BYTE  $0x66
5151	BYTE  $0x45
5152	BYTE  $0x0f
5153	BYTE  $0x3a
5154	BYTE  $0x0f
5155	BYTE  $0xd2
5156	BYTE  $0x04
5157	BYTE  $0x66
5158	BYTE  $0x45
5159	BYTE  $0x0f
5160	BYTE  $0x3a
5161	BYTE  $0x0f
5162	BYTE  $0xdb
5163	BYTE  $0x04
5164	BYTE  $0x66
5165	BYTE  $0x45
5166	BYTE  $0x0f
5167	BYTE  $0x3a
5168	BYTE  $0x0f
5169	BYTE  $0xff
5170	BYTE  $0x04
5171	DECQ  R9
5172	JNE   sealSSEIntroLoop
5173
5174	// Add in the state
5175	PADDD chacha20Constants<>+0(SB), X0
5176	PADDD chacha20Constants<>+0(SB), X1
5177	PADDD chacha20Constants<>+0(SB), X2
5178	PADDD chacha20Constants<>+0(SB), X12
5179	PADDD 32(BP), X3
5180	PADDD 32(BP), X4
5181	PADDD 32(BP), X5
5182	PADDD 32(BP), X13
5183	PADDD 48(BP), X7
5184	PADDD 48(BP), X8
5185	PADDD 48(BP), X14
5186	PADDD 96(BP), X10
5187	PADDD 112(BP), X11
5188	PADDD 128(BP), X15
5189
5190	// Clamp and store the key
5191	PAND polyClampMask<>+0(SB), X0
5192	MOVO X0, (BP)
5193	MOVO X3, 16(BP)
5194
5195	// Hash AAD
5196	MOVQ  ad_len+80(FP), R9
5197	CALL  polyHashADInternal<>(SB)
5198	MOVOU (SI), X0
5199	MOVOU 16(SI), X3
5200	MOVOU 32(SI), X6
5201	MOVOU 48(SI), X9
5202	PXOR  X0, X1
5203	PXOR  X3, X4
5204	PXOR  X6, X7
5205	PXOR  X9, X10
5206	MOVOU X1, (DI)
5207	MOVOU X4, 16(DI)
5208	MOVOU X7, 32(DI)
5209	MOVOU X10, 48(DI)
5210	MOVOU 64(SI), X0
5211	MOVOU 80(SI), X3
5212	MOVOU 96(SI), X6
5213	MOVOU 112(SI), X9
5214	PXOR  X0, X2
5215	PXOR  X3, X5
5216	PXOR  X6, X8
5217	PXOR  X9, X11
5218	MOVOU X2, 64(DI)
5219	MOVOU X5, 80(DI)
5220	MOVOU X8, 96(DI)
5221	MOVOU X11, 112(DI)
5222	MOVQ  $0x00000080, CX
5223	SUBQ  $0x80, BX
5224	LEAQ  128(SI), SI
5225	MOVO  X12, X1
5226	MOVO  X13, X4
5227	MOVO  X14, X7
5228	MOVO  X15, X10
5229	CMPQ  BX, $0x40
5230	JBE   sealSSE128SealHash
5231	MOVOU (SI), X0
5232	MOVOU 16(SI), X3
5233	MOVOU 32(SI), X6
5234	MOVOU 48(SI), X9
5235	PXOR  X0, X12
5236	PXOR  X3, X13
5237	PXOR  X6, X14
5238	PXOR  X9, X15
5239	MOVOU X12, 128(DI)
5240	MOVOU X13, 144(DI)
5241	MOVOU X14, 160(DI)
5242	MOVOU X15, 176(DI)
5243	ADDQ  $0x40, CX
5244	SUBQ  $0x40, BX
5245	LEAQ  64(SI), SI
5246	MOVQ  $0x00000002, CX
5247	MOVQ  $0x00000008, R9
5248	CMPQ  BX, $0x40
5249	JBE   sealSSETail64
5250	CMPQ  BX, $0x80
5251	JBE   sealSSETail128
5252	CMPQ  BX, $0xc0
5253	JBE   sealSSETail192
5254
5255sealSSEMainLoop:
5256	// Load state, increment counter blocks
5257	MOVO  chacha20Constants<>+0(SB), X0
5258	MOVO  32(BP), X3
5259	MOVO  48(BP), X6
5260	MOVO  128(BP), X9
5261	PADDL sseIncMask<>+0(SB), X9
5262	MOVO  X0, X1
5263	MOVO  X3, X4
5264	MOVO  X6, X7
5265	MOVO  X9, X10
5266	PADDL sseIncMask<>+0(SB), X10
5267	MOVO  X1, X2
5268	MOVO  X4, X5
5269	MOVO  X7, X8
5270	MOVO  X10, X11
5271	PADDL sseIncMask<>+0(SB), X11
5272	MOVO  X2, X12
5273	MOVO  X5, X13
5274	MOVO  X8, X14
5275	MOVO  X11, X15
5276	PADDL sseIncMask<>+0(SB), X15
5277
5278	// Store counters
5279	MOVO X9, 80(BP)
5280	MOVO X10, 96(BP)
5281	MOVO X11, 112(BP)
5282	MOVO X15, 128(BP)
5283
5284sealSSEInnerLoop:
5285	MOVO  X14, 64(BP)
5286	PADDD X3, X0
5287	PXOR  X0, X9
5288	ROL16(X9, X14)
5289	PADDD X9, X6
5290	PXOR  X6, X3
5291	MOVO  X3, X14
5292	PSLLL $0x0c, X14
5293	PSRLL $0x14, X3
5294	PXOR  X14, X3
5295	PADDD X3, X0
5296	PXOR  X0, X9
5297	ROL8(X9, X14)
5298	PADDD X9, X6
5299	PXOR  X6, X3
5300	MOVO  X3, X14
5301	PSLLL $0x07, X14
5302	PSRLL $0x19, X3
5303	PXOR  X14, X3
5304	PADDD X4, X1
5305	PXOR  X1, X10
5306	ROL16(X10, X14)
5307	PADDD X10, X7
5308	PXOR  X7, X4
5309	MOVO  X4, X14
5310	PSLLL $0x0c, X14
5311	PSRLL $0x14, X4
5312	PXOR  X14, X4
5313	PADDD X4, X1
5314	PXOR  X1, X10
5315	ROL8(X10, X14)
5316	PADDD X10, X7
5317	PXOR  X7, X4
5318	MOVO  X4, X14
5319	PSLLL $0x07, X14
5320	PSRLL $0x19, X4
5321	PXOR  X14, X4
5322	PADDD X5, X2
5323	PXOR  X2, X11
5324	ROL16(X11, X14)
5325	PADDD X11, X8
5326	PXOR  X8, X5
5327	MOVO  X5, X14
5328	PSLLL $0x0c, X14
5329	PSRLL $0x14, X5
5330	PXOR  X14, X5
5331	PADDD X5, X2
5332	PXOR  X2, X11
5333	ROL8(X11, X14)
5334	PADDD X11, X8
5335	PXOR  X8, X5
5336	MOVO  X5, X14
5337	PSLLL $0x07, X14
5338	PSRLL $0x19, X5
5339	PXOR  X14, X5
5340	MOVO  64(BP), X14
5341	MOVO  X7, 64(BP)
5342	PADDD X13, X12
5343	PXOR  X12, X15
5344	ROL16(X15, X7)
5345	PADDD X15, X14
5346	PXOR  X14, X13
5347	MOVO  X13, X7
5348	PSLLL $0x0c, X7
5349	PSRLL $0x14, X13
5350	PXOR  X7, X13
5351	PADDD X13, X12
5352	PXOR  X12, X15
5353	ROL8(X15, X7)
5354	PADDD X15, X14
5355	PXOR  X14, X13
5356	MOVO  X13, X7
5357	PSLLL $0x07, X7
5358	PSRLL $0x19, X13
5359	PXOR  X7, X13
5360	MOVO  64(BP), X7
5361	ADDQ  (DI), R10
5362	ADCQ  8(DI), R11
5363	ADCQ  $0x01, R12
5364	BYTE  $0x66
5365	BYTE  $0x0f
5366	BYTE  $0x3a
5367	BYTE  $0x0f
5368	BYTE  $0xdb
5369	BYTE  $0x04
5370	BYTE  $0x66
5371	BYTE  $0x0f
5372	BYTE  $0x3a
5373	BYTE  $0x0f
5374	BYTE  $0xe4
5375	BYTE  $0x04
5376	BYTE  $0x66
5377	BYTE  $0x0f
5378	BYTE  $0x3a
5379	BYTE  $0x0f
5380	BYTE  $0xed
5381	BYTE  $0x04
5382	BYTE  $0x66
5383	BYTE  $0x45
5384	BYTE  $0x0f
5385	BYTE  $0x3a
5386	BYTE  $0x0f
5387	BYTE  $0xed
5388	BYTE  $0x04
5389	BYTE  $0x66
5390	BYTE  $0x0f
5391	BYTE  $0x3a
5392	BYTE  $0x0f
5393	BYTE  $0xf6
5394	BYTE  $0x08
5395	BYTE  $0x66
5396	BYTE  $0x0f
5397	BYTE  $0x3a
5398	BYTE  $0x0f
5399	BYTE  $0xff
5400	BYTE  $0x08
5401	BYTE  $0x66
5402	BYTE  $0x45
5403	BYTE  $0x0f
5404	BYTE  $0x3a
5405	BYTE  $0x0f
5406	BYTE  $0xc0
5407	BYTE  $0x08
5408	BYTE  $0x66
5409	BYTE  $0x45
5410	BYTE  $0x0f
5411	BYTE  $0x3a
5412	BYTE  $0x0f
5413	BYTE  $0xf6
5414	BYTE  $0x08
5415	BYTE  $0x66
5416	BYTE  $0x45
5417	BYTE  $0x0f
5418	BYTE  $0x3a
5419	BYTE  $0x0f
5420	BYTE  $0xc9
5421	BYTE  $0x0c
5422	BYTE  $0x66
5423	BYTE  $0x45
5424	BYTE  $0x0f
5425	BYTE  $0x3a
5426	BYTE  $0x0f
5427	BYTE  $0xd2
5428	BYTE  $0x0c
5429	BYTE  $0x66
5430	BYTE  $0x45
5431	BYTE  $0x0f
5432	BYTE  $0x3a
5433	BYTE  $0x0f
5434	BYTE  $0xdb
5435	BYTE  $0x0c
5436	BYTE  $0x66
5437	BYTE  $0x45
5438	BYTE  $0x0f
5439	BYTE  $0x3a
5440	BYTE  $0x0f
5441	BYTE  $0xff
5442	BYTE  $0x0c
5443	MOVQ  (BP), AX
5444	MOVQ  AX, R15
5445	MULQ  R10
5446	MOVQ  AX, R13
5447	MOVQ  DX, R14
5448	MOVQ  (BP), AX
5449	MULQ  R11
5450	IMULQ R12, R15
5451	ADDQ  AX, R14
5452	ADCQ  DX, R15
5453	MOVQ  8(BP), AX
5454	MOVQ  AX, R8
5455	MULQ  R10
5456	ADDQ  AX, R14
5457	ADCQ  $0x00, DX
5458	MOVQ  DX, R10
5459	MOVQ  8(BP), AX
5460	MULQ  R11
5461	ADDQ  AX, R15
5462	ADCQ  $0x00, DX
5463	LEAQ  16(DI), DI
5464	MOVO  X14, 64(BP)
5465	PADDD X3, X0
5466	PXOR  X0, X9
5467	ROL16(X9, X14)
5468	PADDD X9, X6
5469	PXOR  X6, X3
5470	MOVO  X3, X14
5471	PSLLL $0x0c, X14
5472	PSRLL $0x14, X3
5473	PXOR  X14, X3
5474	PADDD X3, X0
5475	PXOR  X0, X9
5476	ROL8(X9, X14)
5477	PADDD X9, X6
5478	PXOR  X6, X3
5479	MOVO  X3, X14
5480	PSLLL $0x07, X14
5481	PSRLL $0x19, X3
5482	PXOR  X14, X3
5483	PADDD X4, X1
5484	PXOR  X1, X10
5485	ROL16(X10, X14)
5486	PADDD X10, X7
5487	PXOR  X7, X4
5488	MOVO  X4, X14
5489	PSLLL $0x0c, X14
5490	PSRLL $0x14, X4
5491	PXOR  X14, X4
5492	PADDD X4, X1
5493	PXOR  X1, X10
5494	ROL8(X10, X14)
5495	PADDD X10, X7
5496	PXOR  X7, X4
5497	MOVO  X4, X14
5498	PSLLL $0x07, X14
5499	PSRLL $0x19, X4
5500	PXOR  X14, X4
5501	PADDD X5, X2
5502	PXOR  X2, X11
5503	ROL16(X11, X14)
5504	PADDD X11, X8
5505	PXOR  X8, X5
5506	MOVO  X5, X14
5507	PSLLL $0x0c, X14
5508	PSRLL $0x14, X5
5509	PXOR  X14, X5
5510	PADDD X5, X2
5511	PXOR  X2, X11
5512	ROL8(X11, X14)
5513	PADDD X11, X8
5514	PXOR  X8, X5
5515	MOVO  X5, X14
5516	PSLLL $0x07, X14
5517	PSRLL $0x19, X5
5518	PXOR  X14, X5
5519	MOVO  64(BP), X14
5520	MOVO  X7, 64(BP)
5521	IMULQ R12, R8
5522	ADDQ  R10, R15
5523	ADCQ  DX, R8
5524	PADDD X13, X12
5525	PXOR  X12, X15
5526	ROL16(X15, X7)
5527	PADDD X15, X14
5528	PXOR  X14, X13
5529	MOVO  X13, X7
5530	PSLLL $0x0c, X7
5531	PSRLL $0x14, X13
5532	PXOR  X7, X13
5533	PADDD X13, X12
5534	PXOR  X12, X15
5535	ROL8(X15, X7)
5536	PADDD X15, X14
5537	PXOR  X14, X13
5538	MOVO  X13, X7
5539	PSLLL $0x07, X7
5540	PSRLL $0x19, X13
5541	PXOR  X7, X13
5542	MOVO  64(BP), X7
5543	MOVQ  R13, R10
5544	MOVQ  R14, R11
5545	MOVQ  R15, R12
5546	ANDQ  $0x03, R12
5547	MOVQ  R15, R13
5548	ANDQ  $-4, R13
5549	MOVQ  R8, R14
5550	SHRQ  $0x02, R8, R15
5551	SHRQ  $0x02, R8
5552	ADDQ  R13, R10
5553	ADCQ  R14, R11
5554	ADCQ  $0x00, R12
5555	ADDQ  R15, R10
5556	ADCQ  R8, R11
5557	ADCQ  $0x00, R12
5558	BYTE  $0x66
5559	BYTE  $0x0f
5560	BYTE  $0x3a
5561	BYTE  $0x0f
5562	BYTE  $0xdb
5563	BYTE  $0x0c
5564	BYTE  $0x66
5565	BYTE  $0x0f
5566	BYTE  $0x3a
5567	BYTE  $0x0f
5568	BYTE  $0xe4
5569	BYTE  $0x0c
5570	BYTE  $0x66
5571	BYTE  $0x0f
5572	BYTE  $0x3a
5573	BYTE  $0x0f
5574	BYTE  $0xed
5575	BYTE  $0x0c
5576	BYTE  $0x66
5577	BYTE  $0x45
5578	BYTE  $0x0f
5579	BYTE  $0x3a
5580	BYTE  $0x0f
5581	BYTE  $0xed
5582	BYTE  $0x0c
5583	BYTE  $0x66
5584	BYTE  $0x0f
5585	BYTE  $0x3a
5586	BYTE  $0x0f
5587	BYTE  $0xf6
5588	BYTE  $0x08
5589	BYTE  $0x66
5590	BYTE  $0x0f
5591	BYTE  $0x3a
5592	BYTE  $0x0f
5593	BYTE  $0xff
5594	BYTE  $0x08
5595	BYTE  $0x66
5596	BYTE  $0x45
5597	BYTE  $0x0f
5598	BYTE  $0x3a
5599	BYTE  $0x0f
5600	BYTE  $0xc0
5601	BYTE  $0x08
5602	BYTE  $0x66
5603	BYTE  $0x45
5604	BYTE  $0x0f
5605	BYTE  $0x3a
5606	BYTE  $0x0f
5607	BYTE  $0xf6
5608	BYTE  $0x08
5609	BYTE  $0x66
5610	BYTE  $0x45
5611	BYTE  $0x0f
5612	BYTE  $0x3a
5613	BYTE  $0x0f
5614	BYTE  $0xc9
5615	BYTE  $0x04
5616	BYTE  $0x66
5617	BYTE  $0x45
5618	BYTE  $0x0f
5619	BYTE  $0x3a
5620	BYTE  $0x0f
5621	BYTE  $0xd2
5622	BYTE  $0x04
5623	BYTE  $0x66
5624	BYTE  $0x45
5625	BYTE  $0x0f
5626	BYTE  $0x3a
5627	BYTE  $0x0f
5628	BYTE  $0xdb
5629	BYTE  $0x04
5630	BYTE  $0x66
5631	BYTE  $0x45
5632	BYTE  $0x0f
5633	BYTE  $0x3a
5634	BYTE  $0x0f
5635	BYTE  $0xff
5636	BYTE  $0x04
5637	DECQ  R9
5638	JGE   sealSSEInnerLoop
5639	ADDQ  (DI), R10
5640	ADCQ  8(DI), R11
5641	ADCQ  $0x01, R12
5642	MOVQ  (BP), AX
5643	MOVQ  AX, R15
5644	MULQ  R10
5645	MOVQ  AX, R13
5646	MOVQ  DX, R14
5647	MOVQ  (BP), AX
5648	MULQ  R11
5649	IMULQ R12, R15
5650	ADDQ  AX, R14
5651	ADCQ  DX, R15
5652	MOVQ  8(BP), AX
5653	MOVQ  AX, R8
5654	MULQ  R10
5655	ADDQ  AX, R14
5656	ADCQ  $0x00, DX
5657	MOVQ  DX, R10
5658	MOVQ  8(BP), AX
5659	MULQ  R11
5660	ADDQ  AX, R15
5661	ADCQ  $0x00, DX
5662	IMULQ R12, R8
5663	ADDQ  R10, R15
5664	ADCQ  DX, R8
5665	MOVQ  R13, R10
5666	MOVQ  R14, R11
5667	MOVQ  R15, R12
5668	ANDQ  $0x03, R12
5669	MOVQ  R15, R13
5670	ANDQ  $-4, R13
5671	MOVQ  R8, R14
5672	SHRQ  $0x02, R8, R15
5673	SHRQ  $0x02, R8
5674	ADDQ  R13, R10
5675	ADCQ  R14, R11
5676	ADCQ  $0x00, R12
5677	ADDQ  R15, R10
5678	ADCQ  R8, R11
5679	ADCQ  $0x00, R12
5680	LEAQ  16(DI), DI
5681	DECQ  CX
5682	JG    sealSSEInnerLoop
5683
5684	// Add in the state
5685	PADDD chacha20Constants<>+0(SB), X0
5686	PADDD chacha20Constants<>+0(SB), X1
5687	PADDD chacha20Constants<>+0(SB), X2
5688	PADDD chacha20Constants<>+0(SB), X12
5689	PADDD 32(BP), X3
5690	PADDD 32(BP), X4
5691	PADDD 32(BP), X5
5692	PADDD 32(BP), X13
5693	PADDD 48(BP), X6
5694	PADDD 48(BP), X7
5695	PADDD 48(BP), X8
5696	PADDD 48(BP), X14
5697	PADDD 80(BP), X9
5698	PADDD 96(BP), X10
5699	PADDD 112(BP), X11
5700	PADDD 128(BP), X15
5701	MOVO  X15, 64(BP)
5702
5703	// Load - xor - store
5704	MOVOU (SI), X15
5705	PXOR  X15, X0
5706	MOVOU 16(SI), X15
5707	PXOR  X15, X3
5708	MOVOU 32(SI), X15
5709	PXOR  X15, X6
5710	MOVOU 48(SI), X15
5711	PXOR  X15, X9
5712	MOVOU X0, (DI)
5713	MOVOU X3, 16(DI)
5714	MOVOU X6, 32(DI)
5715	MOVOU X9, 48(DI)
5716	MOVO  64(BP), X15
5717	MOVOU 64(SI), X0
5718	MOVOU 80(SI), X3
5719	MOVOU 96(SI), X6
5720	MOVOU 112(SI), X9
5721	PXOR  X0, X1
5722	PXOR  X3, X4
5723	PXOR  X6, X7
5724	PXOR  X9, X10
5725	MOVOU X1, 64(DI)
5726	MOVOU X4, 80(DI)
5727	MOVOU X7, 96(DI)
5728	MOVOU X10, 112(DI)
5729	MOVOU 128(SI), X0
5730	MOVOU 144(SI), X3
5731	MOVOU 160(SI), X6
5732	MOVOU 176(SI), X9
5733	PXOR  X0, X2
5734	PXOR  X3, X5
5735	PXOR  X6, X8
5736	PXOR  X9, X11
5737	MOVOU X2, 128(DI)
5738	MOVOU X5, 144(DI)
5739	MOVOU X8, 160(DI)
5740	MOVOU X11, 176(DI)
5741	ADDQ  $0xc0, SI
5742	MOVQ  $0x000000c0, CX
5743	SUBQ  $0xc0, BX
5744	MOVO  X12, X1
5745	MOVO  X13, X4
5746	MOVO  X14, X7
5747	MOVO  X15, X10
5748	CMPQ  BX, $0x40
5749	JBE   sealSSE128SealHash
5750	MOVOU (SI), X0
5751	MOVOU 16(SI), X3
5752	MOVOU 32(SI), X6
5753	MOVOU 48(SI), X9
5754	PXOR  X0, X12
5755	PXOR  X3, X13
5756	PXOR  X6, X14
5757	PXOR  X9, X15
5758	MOVOU X12, 192(DI)
5759	MOVOU X13, 208(DI)
5760	MOVOU X14, 224(DI)
5761	MOVOU X15, 240(DI)
5762	LEAQ  64(SI), SI
5763	SUBQ  $0x40, BX
5764	MOVQ  $0x00000006, CX
5765	MOVQ  $0x00000004, R9
5766	CMPQ  BX, $0xc0
5767	JG    sealSSEMainLoop
5768	MOVQ  BX, CX
5769	TESTQ BX, BX
5770	JE    sealSSE128SealHash
5771	MOVQ  $0x00000006, CX
5772	CMPQ  BX, $0x40
5773	JBE   sealSSETail64
5774	CMPQ  BX, $0x80
5775	JBE   sealSSETail128
5776	JMP   sealSSETail192
5777
5778sealSSETail64:
5779	MOVO  chacha20Constants<>+0(SB), X1
5780	MOVO  32(BP), X4
5781	MOVO  48(BP), X7
5782	MOVO  128(BP), X10
5783	PADDL sseIncMask<>+0(SB), X10
5784	MOVO  X10, 80(BP)
5785
5786sealSSETail64LoopA:
5787	ADDQ  (DI), R10
5788	ADCQ  8(DI), R11
5789	ADCQ  $0x01, R12
5790	MOVQ  (BP), AX
5791	MOVQ  AX, R15
5792	MULQ  R10
5793	MOVQ  AX, R13
5794	MOVQ  DX, R14
5795	MOVQ  (BP), AX
5796	MULQ  R11
5797	IMULQ R12, R15
5798	ADDQ  AX, R14
5799	ADCQ  DX, R15
5800	MOVQ  8(BP), AX
5801	MOVQ  AX, R8
5802	MULQ  R10
5803	ADDQ  AX, R14
5804	ADCQ  $0x00, DX
5805	MOVQ  DX, R10
5806	MOVQ  8(BP), AX
5807	MULQ  R11
5808	ADDQ  AX, R15
5809	ADCQ  $0x00, DX
5810	IMULQ R12, R8
5811	ADDQ  R10, R15
5812	ADCQ  DX, R8
5813	MOVQ  R13, R10
5814	MOVQ  R14, R11
5815	MOVQ  R15, R12
5816	ANDQ  $0x03, R12
5817	MOVQ  R15, R13
5818	ANDQ  $-4, R13
5819	MOVQ  R8, R14
5820	SHRQ  $0x02, R8, R15
5821	SHRQ  $0x02, R8
5822	ADDQ  R13, R10
5823	ADCQ  R14, R11
5824	ADCQ  $0x00, R12
5825	ADDQ  R15, R10
5826	ADCQ  R8, R11
5827	ADCQ  $0x00, R12
5828	LEAQ  16(DI), DI
5829
5830sealSSETail64LoopB:
5831	PADDD X4, X1
5832	PXOR  X1, X10
5833	ROL16(X10, X13)
5834	PADDD X10, X7
5835	PXOR  X7, X4
5836	MOVO  X4, X13
5837	PSLLL $0x0c, X13
5838	PSRLL $0x14, X4
5839	PXOR  X13, X4
5840	PADDD X4, X1
5841	PXOR  X1, X10
5842	ROL8(X10, X13)
5843	PADDD X10, X7
5844	PXOR  X7, X4
5845	MOVO  X4, X13
5846	PSLLL $0x07, X13
5847	PSRLL $0x19, X4
5848	PXOR  X13, X4
5849	BYTE  $0x66
5850	BYTE  $0x0f
5851	BYTE  $0x3a
5852	BYTE  $0x0f
5853	BYTE  $0xe4
5854	BYTE  $0x04
5855	BYTE  $0x66
5856	BYTE  $0x0f
5857	BYTE  $0x3a
5858	BYTE  $0x0f
5859	BYTE  $0xff
5860	BYTE  $0x08
5861	BYTE  $0x66
5862	BYTE  $0x45
5863	BYTE  $0x0f
5864	BYTE  $0x3a
5865	BYTE  $0x0f
5866	BYTE  $0xd2
5867	BYTE  $0x0c
5868	PADDD X4, X1
5869	PXOR  X1, X10
5870	ROL16(X10, X13)
5871	PADDD X10, X7
5872	PXOR  X7, X4
5873	MOVO  X4, X13
5874	PSLLL $0x0c, X13
5875	PSRLL $0x14, X4
5876	PXOR  X13, X4
5877	PADDD X4, X1
5878	PXOR  X1, X10
5879	ROL8(X10, X13)
5880	PADDD X10, X7
5881	PXOR  X7, X4
5882	MOVO  X4, X13
5883	PSLLL $0x07, X13
5884	PSRLL $0x19, X4
5885	PXOR  X13, X4
5886	BYTE  $0x66
5887	BYTE  $0x0f
5888	BYTE  $0x3a
5889	BYTE  $0x0f
5890	BYTE  $0xe4
5891	BYTE  $0x0c
5892	BYTE  $0x66
5893	BYTE  $0x0f
5894	BYTE  $0x3a
5895	BYTE  $0x0f
5896	BYTE  $0xff
5897	BYTE  $0x08
5898	BYTE  $0x66
5899	BYTE  $0x45
5900	BYTE  $0x0f
5901	BYTE  $0x3a
5902	BYTE  $0x0f
5903	BYTE  $0xd2
5904	BYTE  $0x04
5905	ADDQ  (DI), R10
5906	ADCQ  8(DI), R11
5907	ADCQ  $0x01, R12
5908	MOVQ  (BP), AX
5909	MOVQ  AX, R15
5910	MULQ  R10
5911	MOVQ  AX, R13
5912	MOVQ  DX, R14
5913	MOVQ  (BP), AX
5914	MULQ  R11
5915	IMULQ R12, R15
5916	ADDQ  AX, R14
5917	ADCQ  DX, R15
5918	MOVQ  8(BP), AX
5919	MOVQ  AX, R8
5920	MULQ  R10
5921	ADDQ  AX, R14
5922	ADCQ  $0x00, DX
5923	MOVQ  DX, R10
5924	MOVQ  8(BP), AX
5925	MULQ  R11
5926	ADDQ  AX, R15
5927	ADCQ  $0x00, DX
5928	IMULQ R12, R8
5929	ADDQ  R10, R15
5930	ADCQ  DX, R8
5931	MOVQ  R13, R10
5932	MOVQ  R14, R11
5933	MOVQ  R15, R12
5934	ANDQ  $0x03, R12
5935	MOVQ  R15, R13
5936	ANDQ  $-4, R13
5937	MOVQ  R8, R14
5938	SHRQ  $0x02, R8, R15
5939	SHRQ  $0x02, R8
5940	ADDQ  R13, R10
5941	ADCQ  R14, R11
5942	ADCQ  $0x00, R12
5943	ADDQ  R15, R10
5944	ADCQ  R8, R11
5945	ADCQ  $0x00, R12
5946	LEAQ  16(DI), DI
5947	DECQ  CX
5948	JG    sealSSETail64LoopA
5949	DECQ  R9
5950	JGE   sealSSETail64LoopB
5951	PADDL chacha20Constants<>+0(SB), X1
5952	PADDL 32(BP), X4
5953	PADDL 48(BP), X7
5954	PADDL 80(BP), X10
5955	JMP   sealSSE128Seal
5956
5957sealSSETail128:
5958	MOVO  chacha20Constants<>+0(SB), X0
5959	MOVO  32(BP), X3
5960	MOVO  48(BP), X6
5961	MOVO  128(BP), X9
5962	PADDL sseIncMask<>+0(SB), X9
5963	MOVO  X9, 80(BP)
5964	MOVO  X0, X1
5965	MOVO  X3, X4
5966	MOVO  X6, X7
5967	MOVO  X9, X10
5968	PADDL sseIncMask<>+0(SB), X10
5969	MOVO  X10, 96(BP)
5970
5971sealSSETail128LoopA:
5972	ADDQ  (DI), R10
5973	ADCQ  8(DI), R11
5974	ADCQ  $0x01, R12
5975	MOVQ  (BP), AX
5976	MOVQ  AX, R15
5977	MULQ  R10
5978	MOVQ  AX, R13
5979	MOVQ  DX, R14
5980	MOVQ  (BP), AX
5981	MULQ  R11
5982	IMULQ R12, R15
5983	ADDQ  AX, R14
5984	ADCQ  DX, R15
5985	MOVQ  8(BP), AX
5986	MOVQ  AX, R8
5987	MULQ  R10
5988	ADDQ  AX, R14
5989	ADCQ  $0x00, DX
5990	MOVQ  DX, R10
5991	MOVQ  8(BP), AX
5992	MULQ  R11
5993	ADDQ  AX, R15
5994	ADCQ  $0x00, DX
5995	IMULQ R12, R8
5996	ADDQ  R10, R15
5997	ADCQ  DX, R8
5998	MOVQ  R13, R10
5999	MOVQ  R14, R11
6000	MOVQ  R15, R12
6001	ANDQ  $0x03, R12
6002	MOVQ  R15, R13
6003	ANDQ  $-4, R13
6004	MOVQ  R8, R14
6005	SHRQ  $0x02, R8, R15
6006	SHRQ  $0x02, R8
6007	ADDQ  R13, R10
6008	ADCQ  R14, R11
6009	ADCQ  $0x00, R12
6010	ADDQ  R15, R10
6011	ADCQ  R8, R11
6012	ADCQ  $0x00, R12
6013	LEAQ  16(DI), DI
6014
6015sealSSETail128LoopB:
6016	PADDD X3, X0
6017	PXOR  X0, X9
6018	ROL16(X9, X12)
6019	PADDD X9, X6
6020	PXOR  X6, X3
6021	MOVO  X3, X12
6022	PSLLL $0x0c, X12
6023	PSRLL $0x14, X3
6024	PXOR  X12, X3
6025	PADDD X3, X0
6026	PXOR  X0, X9
6027	ROL8(X9, X12)
6028	PADDD X9, X6
6029	PXOR  X6, X3
6030	MOVO  X3, X12
6031	PSLLL $0x07, X12
6032	PSRLL $0x19, X3
6033	PXOR  X12, X3
6034	PADDD X4, X1
6035	PXOR  X1, X10
6036	ROL16(X10, X12)
6037	PADDD X10, X7
6038	PXOR  X7, X4
6039	MOVO  X4, X12
6040	PSLLL $0x0c, X12
6041	PSRLL $0x14, X4
6042	PXOR  X12, X4
6043	PADDD X4, X1
6044	PXOR  X1, X10
6045	ROL8(X10, X12)
6046	PADDD X10, X7
6047	PXOR  X7, X4
6048	MOVO  X4, X12
6049	PSLLL $0x07, X12
6050	PSRLL $0x19, X4
6051	PXOR  X12, X4
6052	BYTE  $0x66
6053	BYTE  $0x0f
6054	BYTE  $0x3a
6055	BYTE  $0x0f
6056	BYTE  $0xdb
6057	BYTE  $0x04
6058	BYTE  $0x66
6059	BYTE  $0x0f
6060	BYTE  $0x3a
6061	BYTE  $0x0f
6062	BYTE  $0xf6
6063	BYTE  $0x08
6064	BYTE  $0x66
6065	BYTE  $0x45
6066	BYTE  $0x0f
6067	BYTE  $0x3a
6068	BYTE  $0x0f
6069	BYTE  $0xc9
6070	BYTE  $0x0c
6071	BYTE  $0x66
6072	BYTE  $0x0f
6073	BYTE  $0x3a
6074	BYTE  $0x0f
6075	BYTE  $0xe4
6076	BYTE  $0x04
6077	BYTE  $0x66
6078	BYTE  $0x0f
6079	BYTE  $0x3a
6080	BYTE  $0x0f
6081	BYTE  $0xff
6082	BYTE  $0x08
6083	BYTE  $0x66
6084	BYTE  $0x45
6085	BYTE  $0x0f
6086	BYTE  $0x3a
6087	BYTE  $0x0f
6088	BYTE  $0xd2
6089	BYTE  $0x0c
6090	ADDQ  (DI), R10
6091	ADCQ  8(DI), R11
6092	ADCQ  $0x01, R12
6093	MOVQ  (BP), AX
6094	MOVQ  AX, R15
6095	MULQ  R10
6096	MOVQ  AX, R13
6097	MOVQ  DX, R14
6098	MOVQ  (BP), AX
6099	MULQ  R11
6100	IMULQ R12, R15
6101	ADDQ  AX, R14
6102	ADCQ  DX, R15
6103	MOVQ  8(BP), AX
6104	MOVQ  AX, R8
6105	MULQ  R10
6106	ADDQ  AX, R14
6107	ADCQ  $0x00, DX
6108	MOVQ  DX, R10
6109	MOVQ  8(BP), AX
6110	MULQ  R11
6111	ADDQ  AX, R15
6112	ADCQ  $0x00, DX
6113	IMULQ R12, R8
6114	ADDQ  R10, R15
6115	ADCQ  DX, R8
6116	MOVQ  R13, R10
6117	MOVQ  R14, R11
6118	MOVQ  R15, R12
6119	ANDQ  $0x03, R12
6120	MOVQ  R15, R13
6121	ANDQ  $-4, R13
6122	MOVQ  R8, R14
6123	SHRQ  $0x02, R8, R15
6124	SHRQ  $0x02, R8
6125	ADDQ  R13, R10
6126	ADCQ  R14, R11
6127	ADCQ  $0x00, R12
6128	ADDQ  R15, R10
6129	ADCQ  R8, R11
6130	ADCQ  $0x00, R12
6131	LEAQ  16(DI), DI
6132	PADDD X3, X0
6133	PXOR  X0, X9
6134	ROL16(X9, X12)
6135	PADDD X9, X6
6136	PXOR  X6, X3
6137	MOVO  X3, X12
6138	PSLLL $0x0c, X12
6139	PSRLL $0x14, X3
6140	PXOR  X12, X3
6141	PADDD X3, X0
6142	PXOR  X0, X9
6143	ROL8(X9, X12)
6144	PADDD X9, X6
6145	PXOR  X6, X3
6146	MOVO  X3, X12
6147	PSLLL $0x07, X12
6148	PSRLL $0x19, X3
6149	PXOR  X12, X3
6150	PADDD X4, X1
6151	PXOR  X1, X10
6152	ROL16(X10, X12)
6153	PADDD X10, X7
6154	PXOR  X7, X4
6155	MOVO  X4, X12
6156	PSLLL $0x0c, X12
6157	PSRLL $0x14, X4
6158	PXOR  X12, X4
6159	PADDD X4, X1
6160	PXOR  X1, X10
6161	ROL8(X10, X12)
6162	PADDD X10, X7
6163	PXOR  X7, X4
6164	MOVO  X4, X12
6165	PSLLL $0x07, X12
6166	PSRLL $0x19, X4
6167	PXOR  X12, X4
6168	BYTE  $0x66
6169	BYTE  $0x0f
6170	BYTE  $0x3a
6171	BYTE  $0x0f
6172	BYTE  $0xdb
6173	BYTE  $0x0c
6174	BYTE  $0x66
6175	BYTE  $0x0f
6176	BYTE  $0x3a
6177	BYTE  $0x0f
6178	BYTE  $0xf6
6179	BYTE  $0x08
6180	BYTE  $0x66
6181	BYTE  $0x45
6182	BYTE  $0x0f
6183	BYTE  $0x3a
6184	BYTE  $0x0f
6185	BYTE  $0xc9
6186	BYTE  $0x04
6187	BYTE  $0x66
6188	BYTE  $0x0f
6189	BYTE  $0x3a
6190	BYTE  $0x0f
6191	BYTE  $0xe4
6192	BYTE  $0x0c
6193	BYTE  $0x66
6194	BYTE  $0x0f
6195	BYTE  $0x3a
6196	BYTE  $0x0f
6197	BYTE  $0xff
6198	BYTE  $0x08
6199	BYTE  $0x66
6200	BYTE  $0x45
6201	BYTE  $0x0f
6202	BYTE  $0x3a
6203	BYTE  $0x0f
6204	BYTE  $0xd2
6205	BYTE  $0x04
6206	DECQ  CX
6207	JG    sealSSETail128LoopA
6208	DECQ  R9
6209	JGE   sealSSETail128LoopB
6210	PADDL chacha20Constants<>+0(SB), X0
6211	PADDL chacha20Constants<>+0(SB), X1
6212	PADDL 32(BP), X3
6213	PADDL 32(BP), X4
6214	PADDL 48(BP), X6
6215	PADDL 48(BP), X7
6216	PADDL 80(BP), X9
6217	PADDL 96(BP), X10
6218	MOVOU (SI), X12
6219	MOVOU 16(SI), X13
6220	MOVOU 32(SI), X14
6221	MOVOU 48(SI), X15
6222	PXOR  X12, X0
6223	PXOR  X13, X3
6224	PXOR  X14, X6
6225	PXOR  X15, X9
6226	MOVOU X0, (DI)
6227	MOVOU X3, 16(DI)
6228	MOVOU X6, 32(DI)
6229	MOVOU X9, 48(DI)
6230	MOVQ  $0x00000040, CX
6231	LEAQ  64(SI), SI
6232	SUBQ  $0x40, BX
6233	JMP   sealSSE128SealHash
6234
6235sealSSETail192:
6236	MOVO  chacha20Constants<>+0(SB), X0
6237	MOVO  32(BP), X3
6238	MOVO  48(BP), X6
6239	MOVO  128(BP), X9
6240	PADDL sseIncMask<>+0(SB), X9
6241	MOVO  X9, 80(BP)
6242	MOVO  X0, X1
6243	MOVO  X3, X4
6244	MOVO  X6, X7
6245	MOVO  X9, X10
6246	PADDL sseIncMask<>+0(SB), X10
6247	MOVO  X10, 96(BP)
6248	MOVO  X1, X2
6249	MOVO  X4, X5
6250	MOVO  X7, X8
6251	MOVO  X10, X11
6252	PADDL sseIncMask<>+0(SB), X11
6253	MOVO  X11, 112(BP)
6254
6255sealSSETail192LoopA:
6256	ADDQ  (DI), R10
6257	ADCQ  8(DI), R11
6258	ADCQ  $0x01, R12
6259	MOVQ  (BP), AX
6260	MOVQ  AX, R15
6261	MULQ  R10
6262	MOVQ  AX, R13
6263	MOVQ  DX, R14
6264	MOVQ  (BP), AX
6265	MULQ  R11
6266	IMULQ R12, R15
6267	ADDQ  AX, R14
6268	ADCQ  DX, R15
6269	MOVQ  8(BP), AX
6270	MOVQ  AX, R8
6271	MULQ  R10
6272	ADDQ  AX, R14
6273	ADCQ  $0x00, DX
6274	MOVQ  DX, R10
6275	MOVQ  8(BP), AX
6276	MULQ  R11
6277	ADDQ  AX, R15
6278	ADCQ  $0x00, DX
6279	IMULQ R12, R8
6280	ADDQ  R10, R15
6281	ADCQ  DX, R8
6282	MOVQ  R13, R10
6283	MOVQ  R14, R11
6284	MOVQ  R15, R12
6285	ANDQ  $0x03, R12
6286	MOVQ  R15, R13
6287	ANDQ  $-4, R13
6288	MOVQ  R8, R14
6289	SHRQ  $0x02, R8, R15
6290	SHRQ  $0x02, R8
6291	ADDQ  R13, R10
6292	ADCQ  R14, R11
6293	ADCQ  $0x00, R12
6294	ADDQ  R15, R10
6295	ADCQ  R8, R11
6296	ADCQ  $0x00, R12
6297	LEAQ  16(DI), DI
6298
6299sealSSETail192LoopB:
6300	PADDD X3, X0
6301	PXOR  X0, X9
6302	ROL16(X9, X12)
6303	PADDD X9, X6
6304	PXOR  X6, X3
6305	MOVO  X3, X12
6306	PSLLL $0x0c, X12
6307	PSRLL $0x14, X3
6308	PXOR  X12, X3
6309	PADDD X3, X0
6310	PXOR  X0, X9
6311	ROL8(X9, X12)
6312	PADDD X9, X6
6313	PXOR  X6, X3
6314	MOVO  X3, X12
6315	PSLLL $0x07, X12
6316	PSRLL $0x19, X3
6317	PXOR  X12, X3
6318	PADDD X4, X1
6319	PXOR  X1, X10
6320	ROL16(X10, X12)
6321	PADDD X10, X7
6322	PXOR  X7, X4
6323	MOVO  X4, X12
6324	PSLLL $0x0c, X12
6325	PSRLL $0x14, X4
6326	PXOR  X12, X4
6327	PADDD X4, X1
6328	PXOR  X1, X10
6329	ROL8(X10, X12)
6330	PADDD X10, X7
6331	PXOR  X7, X4
6332	MOVO  X4, X12
6333	PSLLL $0x07, X12
6334	PSRLL $0x19, X4
6335	PXOR  X12, X4
6336	PADDD X5, X2
6337	PXOR  X2, X11
6338	ROL16(X11, X12)
6339	PADDD X11, X8
6340	PXOR  X8, X5
6341	MOVO  X5, X12
6342	PSLLL $0x0c, X12
6343	PSRLL $0x14, X5
6344	PXOR  X12, X5
6345	PADDD X5, X2
6346	PXOR  X2, X11
6347	ROL8(X11, X12)
6348	PADDD X11, X8
6349	PXOR  X8, X5
6350	MOVO  X5, X12
6351	PSLLL $0x07, X12
6352	PSRLL $0x19, X5
6353	PXOR  X12, X5
6354	BYTE  $0x66
6355	BYTE  $0x0f
6356	BYTE  $0x3a
6357	BYTE  $0x0f
6358	BYTE  $0xdb
6359	BYTE  $0x04
6360	BYTE  $0x66
6361	BYTE  $0x0f
6362	BYTE  $0x3a
6363	BYTE  $0x0f
6364	BYTE  $0xf6
6365	BYTE  $0x08
6366	BYTE  $0x66
6367	BYTE  $0x45
6368	BYTE  $0x0f
6369	BYTE  $0x3a
6370	BYTE  $0x0f
6371	BYTE  $0xc9
6372	BYTE  $0x0c
6373	BYTE  $0x66
6374	BYTE  $0x0f
6375	BYTE  $0x3a
6376	BYTE  $0x0f
6377	BYTE  $0xe4
6378	BYTE  $0x04
6379	BYTE  $0x66
6380	BYTE  $0x0f
6381	BYTE  $0x3a
6382	BYTE  $0x0f
6383	BYTE  $0xff
6384	BYTE  $0x08
6385	BYTE  $0x66
6386	BYTE  $0x45
6387	BYTE  $0x0f
6388	BYTE  $0x3a
6389	BYTE  $0x0f
6390	BYTE  $0xd2
6391	BYTE  $0x0c
6392	BYTE  $0x66
6393	BYTE  $0x0f
6394	BYTE  $0x3a
6395	BYTE  $0x0f
6396	BYTE  $0xed
6397	BYTE  $0x04
6398	BYTE  $0x66
6399	BYTE  $0x45
6400	BYTE  $0x0f
6401	BYTE  $0x3a
6402	BYTE  $0x0f
6403	BYTE  $0xc0
6404	BYTE  $0x08
6405	BYTE  $0x66
6406	BYTE  $0x45
6407	BYTE  $0x0f
6408	BYTE  $0x3a
6409	BYTE  $0x0f
6410	BYTE  $0xdb
6411	BYTE  $0x0c
6412	ADDQ  (DI), R10
6413	ADCQ  8(DI), R11
6414	ADCQ  $0x01, R12
6415	MOVQ  (BP), AX
6416	MOVQ  AX, R15
6417	MULQ  R10
6418	MOVQ  AX, R13
6419	MOVQ  DX, R14
6420	MOVQ  (BP), AX
6421	MULQ  R11
6422	IMULQ R12, R15
6423	ADDQ  AX, R14
6424	ADCQ  DX, R15
6425	MOVQ  8(BP), AX
6426	MOVQ  AX, R8
6427	MULQ  R10
6428	ADDQ  AX, R14
6429	ADCQ  $0x00, DX
6430	MOVQ  DX, R10
6431	MOVQ  8(BP), AX
6432	MULQ  R11
6433	ADDQ  AX, R15
6434	ADCQ  $0x00, DX
6435	IMULQ R12, R8
6436	ADDQ  R10, R15
6437	ADCQ  DX, R8
6438	MOVQ  R13, R10
6439	MOVQ  R14, R11
6440	MOVQ  R15, R12
6441	ANDQ  $0x03, R12
6442	MOVQ  R15, R13
6443	ANDQ  $-4, R13
6444	MOVQ  R8, R14
6445	SHRQ  $0x02, R8, R15
6446	SHRQ  $0x02, R8
6447	ADDQ  R13, R10
6448	ADCQ  R14, R11
6449	ADCQ  $0x00, R12
6450	ADDQ  R15, R10
6451	ADCQ  R8, R11
6452	ADCQ  $0x00, R12
6453	LEAQ  16(DI), DI
6454	PADDD X3, X0
6455	PXOR  X0, X9
6456	ROL16(X9, X12)
6457	PADDD X9, X6
6458	PXOR  X6, X3
6459	MOVO  X3, X12
6460	PSLLL $0x0c, X12
6461	PSRLL $0x14, X3
6462	PXOR  X12, X3
6463	PADDD X3, X0
6464	PXOR  X0, X9
6465	ROL8(X9, X12)
6466	PADDD X9, X6
6467	PXOR  X6, X3
6468	MOVO  X3, X12
6469	PSLLL $0x07, X12
6470	PSRLL $0x19, X3
6471	PXOR  X12, X3
6472	PADDD X4, X1
6473	PXOR  X1, X10
6474	ROL16(X10, X12)
6475	PADDD X10, X7
6476	PXOR  X7, X4
6477	MOVO  X4, X12
6478	PSLLL $0x0c, X12
6479	PSRLL $0x14, X4
6480	PXOR  X12, X4
6481	PADDD X4, X1
6482	PXOR  X1, X10
6483	ROL8(X10, X12)
6484	PADDD X10, X7
6485	PXOR  X7, X4
6486	MOVO  X4, X12
6487	PSLLL $0x07, X12
6488	PSRLL $0x19, X4
6489	PXOR  X12, X4
6490	PADDD X5, X2
6491	PXOR  X2, X11
6492	ROL16(X11, X12)
6493	PADDD X11, X8
6494	PXOR  X8, X5
6495	MOVO  X5, X12
6496	PSLLL $0x0c, X12
6497	PSRLL $0x14, X5
6498	PXOR  X12, X5
6499	PADDD X5, X2
6500	PXOR  X2, X11
6501	ROL8(X11, X12)
6502	PADDD X11, X8
6503	PXOR  X8, X5
6504	MOVO  X5, X12
6505	PSLLL $0x07, X12
6506	PSRLL $0x19, X5
6507	PXOR  X12, X5
6508	BYTE  $0x66
6509	BYTE  $0x0f
6510	BYTE  $0x3a
6511	BYTE  $0x0f
6512	BYTE  $0xdb
6513	BYTE  $0x0c
6514	BYTE  $0x66
6515	BYTE  $0x0f
6516	BYTE  $0x3a
6517	BYTE  $0x0f
6518	BYTE  $0xf6
6519	BYTE  $0x08
6520	BYTE  $0x66
6521	BYTE  $0x45
6522	BYTE  $0x0f
6523	BYTE  $0x3a
6524	BYTE  $0x0f
6525	BYTE  $0xc9
6526	BYTE  $0x04
6527	BYTE  $0x66
6528	BYTE  $0x0f
6529	BYTE  $0x3a
6530	BYTE  $0x0f
6531	BYTE  $0xe4
6532	BYTE  $0x0c
6533	BYTE  $0x66
6534	BYTE  $0x0f
6535	BYTE  $0x3a
6536	BYTE  $0x0f
6537	BYTE  $0xff
6538	BYTE  $0x08
6539	BYTE  $0x66
6540	BYTE  $0x45
6541	BYTE  $0x0f
6542	BYTE  $0x3a
6543	BYTE  $0x0f
6544	BYTE  $0xd2
6545	BYTE  $0x04
6546	BYTE  $0x66
6547	BYTE  $0x0f
6548	BYTE  $0x3a
6549	BYTE  $0x0f
6550	BYTE  $0xed
6551	BYTE  $0x0c
6552	BYTE  $0x66
6553	BYTE  $0x45
6554	BYTE  $0x0f
6555	BYTE  $0x3a
6556	BYTE  $0x0f
6557	BYTE  $0xc0
6558	BYTE  $0x08
6559	BYTE  $0x66
6560	BYTE  $0x45
6561	BYTE  $0x0f
6562	BYTE  $0x3a
6563	BYTE  $0x0f
6564	BYTE  $0xdb
6565	BYTE  $0x04
6566	DECQ  CX
6567	JG    sealSSETail192LoopA
6568	DECQ  R9
6569	JGE   sealSSETail192LoopB
6570	PADDL chacha20Constants<>+0(SB), X0
6571	PADDL chacha20Constants<>+0(SB), X1
6572	PADDL chacha20Constants<>+0(SB), X2
6573	PADDL 32(BP), X3
6574	PADDL 32(BP), X4
6575	PADDL 32(BP), X5
6576	PADDL 48(BP), X6
6577	PADDL 48(BP), X7
6578	PADDL 48(BP), X8
6579	PADDL 80(BP), X9
6580	PADDL 96(BP), X10
6581	PADDL 112(BP), X11
6582	MOVOU (SI), X12
6583	MOVOU 16(SI), X13
6584	MOVOU 32(SI), X14
6585	MOVOU 48(SI), X15
6586	PXOR  X12, X0
6587	PXOR  X13, X3
6588	PXOR  X14, X6
6589	PXOR  X15, X9
6590	MOVOU X0, (DI)
6591	MOVOU X3, 16(DI)
6592	MOVOU X6, 32(DI)
6593	MOVOU X9, 48(DI)
6594	MOVOU 64(SI), X12
6595	MOVOU 80(SI), X13
6596	MOVOU 96(SI), X14
6597	MOVOU 112(SI), X15
6598	PXOR  X12, X1
6599	PXOR  X13, X4
6600	PXOR  X14, X7
6601	PXOR  X15, X10
6602	MOVOU X1, 64(DI)
6603	MOVOU X4, 80(DI)
6604	MOVOU X7, 96(DI)
6605	MOVOU X10, 112(DI)
6606	MOVO  X2, X1
6607	MOVO  X5, X4
6608	MOVO  X8, X7
6609	MOVO  X11, X10
6610	MOVQ  $0x00000080, CX
6611	LEAQ  128(SI), SI
6612	SUBQ  $0x80, BX
6613	JMP   sealSSE128SealHash
6614
6615sealSSE128:
6616	MOVOU chacha20Constants<>+0(SB), X0
6617	MOVOU 16(R8), X3
6618	MOVOU 32(R8), X6
6619	MOVOU 48(R8), X9
6620	MOVO  X0, X1
6621	MOVO  X3, X4
6622	MOVO  X6, X7
6623	MOVO  X9, X10
6624	PADDL sseIncMask<>+0(SB), X10
6625	MOVO  X1, X2
6626	MOVO  X4, X5
6627	MOVO  X7, X8
6628	MOVO  X10, X11
6629	PADDL sseIncMask<>+0(SB), X11
6630	MOVO  X3, X13
6631	MOVO  X6, X14
6632	MOVO  X10, X15
6633	MOVQ  $0x0000000a, R9
6634
6635sealSSE128InnerCipherLoop:
6636	PADDD X3, X0
6637	PXOR  X0, X9
6638	ROL16(X9, X12)
6639	PADDD X9, X6
6640	PXOR  X6, X3
6641	MOVO  X3, X12
6642	PSLLL $0x0c, X12
6643	PSRLL $0x14, X3
6644	PXOR  X12, X3
6645	PADDD X3, X0
6646	PXOR  X0, X9
6647	ROL8(X9, X12)
6648	PADDD X9, X6
6649	PXOR  X6, X3
6650	MOVO  X3, X12
6651	PSLLL $0x07, X12
6652	PSRLL $0x19, X3
6653	PXOR  X12, X3
6654	PADDD X4, X1
6655	PXOR  X1, X10
6656	ROL16(X10, X12)
6657	PADDD X10, X7
6658	PXOR  X7, X4
6659	MOVO  X4, X12
6660	PSLLL $0x0c, X12
6661	PSRLL $0x14, X4
6662	PXOR  X12, X4
6663	PADDD X4, X1
6664	PXOR  X1, X10
6665	ROL8(X10, X12)
6666	PADDD X10, X7
6667	PXOR  X7, X4
6668	MOVO  X4, X12
6669	PSLLL $0x07, X12
6670	PSRLL $0x19, X4
6671	PXOR  X12, X4
6672	PADDD X5, X2
6673	PXOR  X2, X11
6674	ROL16(X11, X12)
6675	PADDD X11, X8
6676	PXOR  X8, X5
6677	MOVO  X5, X12
6678	PSLLL $0x0c, X12
6679	PSRLL $0x14, X5
6680	PXOR  X12, X5
6681	PADDD X5, X2
6682	PXOR  X2, X11
6683	ROL8(X11, X12)
6684	PADDD X11, X8
6685	PXOR  X8, X5
6686	MOVO  X5, X12
6687	PSLLL $0x07, X12
6688	PSRLL $0x19, X5
6689	PXOR  X12, X5
6690	BYTE  $0x66
6691	BYTE  $0x0f
6692	BYTE  $0x3a
6693	BYTE  $0x0f
6694	BYTE  $0xdb
6695	BYTE  $0x04
6696	BYTE  $0x66
6697	BYTE  $0x0f
6698	BYTE  $0x3a
6699	BYTE  $0x0f
6700	BYTE  $0xe4
6701	BYTE  $0x04
6702	BYTE  $0x66
6703	BYTE  $0x0f
6704	BYTE  $0x3a
6705	BYTE  $0x0f
6706	BYTE  $0xed
6707	BYTE  $0x04
6708	BYTE  $0x66
6709	BYTE  $0x0f
6710	BYTE  $0x3a
6711	BYTE  $0x0f
6712	BYTE  $0xf6
6713	BYTE  $0x08
6714	BYTE  $0x66
6715	BYTE  $0x0f
6716	BYTE  $0x3a
6717	BYTE  $0x0f
6718	BYTE  $0xff
6719	BYTE  $0x08
6720	BYTE  $0x66
6721	BYTE  $0x45
6722	BYTE  $0x0f
6723	BYTE  $0x3a
6724	BYTE  $0x0f
6725	BYTE  $0xc0
6726	BYTE  $0x08
6727	BYTE  $0x66
6728	BYTE  $0x45
6729	BYTE  $0x0f
6730	BYTE  $0x3a
6731	BYTE  $0x0f
6732	BYTE  $0xc9
6733	BYTE  $0x0c
6734	BYTE  $0x66
6735	BYTE  $0x45
6736	BYTE  $0x0f
6737	BYTE  $0x3a
6738	BYTE  $0x0f
6739	BYTE  $0xd2
6740	BYTE  $0x0c
6741	BYTE  $0x66
6742	BYTE  $0x45
6743	BYTE  $0x0f
6744	BYTE  $0x3a
6745	BYTE  $0x0f
6746	BYTE  $0xdb
6747	BYTE  $0x0c
6748	PADDD X3, X0
6749	PXOR  X0, X9
6750	ROL16(X9, X12)
6751	PADDD X9, X6
6752	PXOR  X6, X3
6753	MOVO  X3, X12
6754	PSLLL $0x0c, X12
6755	PSRLL $0x14, X3
6756	PXOR  X12, X3
6757	PADDD X3, X0
6758	PXOR  X0, X9
6759	ROL8(X9, X12)
6760	PADDD X9, X6
6761	PXOR  X6, X3
6762	MOVO  X3, X12
6763	PSLLL $0x07, X12
6764	PSRLL $0x19, X3
6765	PXOR  X12, X3
6766	PADDD X4, X1
6767	PXOR  X1, X10
6768	ROL16(X10, X12)
6769	PADDD X10, X7
6770	PXOR  X7, X4
6771	MOVO  X4, X12
6772	PSLLL $0x0c, X12
6773	PSRLL $0x14, X4
6774	PXOR  X12, X4
6775	PADDD X4, X1
6776	PXOR  X1, X10
6777	ROL8(X10, X12)
6778	PADDD X10, X7
6779	PXOR  X7, X4
6780	MOVO  X4, X12
6781	PSLLL $0x07, X12
6782	PSRLL $0x19, X4
6783	PXOR  X12, X4
6784	PADDD X5, X2
6785	PXOR  X2, X11
6786	ROL16(X11, X12)
6787	PADDD X11, X8
6788	PXOR  X8, X5
6789	MOVO  X5, X12
6790	PSLLL $0x0c, X12
6791	PSRLL $0x14, X5
6792	PXOR  X12, X5
6793	PADDD X5, X2
6794	PXOR  X2, X11
6795	ROL8(X11, X12)
6796	PADDD X11, X8
6797	PXOR  X8, X5
6798	MOVO  X5, X12
6799	PSLLL $0x07, X12
6800	PSRLL $0x19, X5
6801	PXOR  X12, X5
6802	BYTE  $0x66
6803	BYTE  $0x0f
6804	BYTE  $0x3a
6805	BYTE  $0x0f
6806	BYTE  $0xdb
6807	BYTE  $0x0c
6808	BYTE  $0x66
6809	BYTE  $0x0f
6810	BYTE  $0x3a
6811	BYTE  $0x0f
6812	BYTE  $0xe4
6813	BYTE  $0x0c
6814	BYTE  $0x66
6815	BYTE  $0x0f
6816	BYTE  $0x3a
6817	BYTE  $0x0f
6818	BYTE  $0xed
6819	BYTE  $0x0c
6820	BYTE  $0x66
6821	BYTE  $0x0f
6822	BYTE  $0x3a
6823	BYTE  $0x0f
6824	BYTE  $0xf6
6825	BYTE  $0x08
6826	BYTE  $0x66
6827	BYTE  $0x0f
6828	BYTE  $0x3a
6829	BYTE  $0x0f
6830	BYTE  $0xff
6831	BYTE  $0x08
6832	BYTE  $0x66
6833	BYTE  $0x45
6834	BYTE  $0x0f
6835	BYTE  $0x3a
6836	BYTE  $0x0f
6837	BYTE  $0xc0
6838	BYTE  $0x08
6839	BYTE  $0x66
6840	BYTE  $0x45
6841	BYTE  $0x0f
6842	BYTE  $0x3a
6843	BYTE  $0x0f
6844	BYTE  $0xc9
6845	BYTE  $0x04
6846	BYTE  $0x66
6847	BYTE  $0x45
6848	BYTE  $0x0f
6849	BYTE  $0x3a
6850	BYTE  $0x0f
6851	BYTE  $0xd2
6852	BYTE  $0x04
6853	BYTE  $0x66
6854	BYTE  $0x45
6855	BYTE  $0x0f
6856	BYTE  $0x3a
6857	BYTE  $0x0f
6858	BYTE  $0xdb
6859	BYTE  $0x04
6860	DECQ  R9
6861	JNE   sealSSE128InnerCipherLoop
6862
6863	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
6864	PADDL chacha20Constants<>+0(SB), X0
6865	PADDL chacha20Constants<>+0(SB), X1
6866	PADDL chacha20Constants<>+0(SB), X2
6867	PADDL X13, X3
6868	PADDL X13, X4
6869	PADDL X13, X5
6870	PADDL X14, X7
6871	PADDL X14, X8
6872	PADDL X15, X10
6873	PADDL sseIncMask<>+0(SB), X15
6874	PADDL X15, X11
6875	PAND  polyClampMask<>+0(SB), X0
6876	MOVOU X0, (BP)
6877	MOVOU X3, 16(BP)
6878
6879	// Hash
6880	MOVQ ad_len+80(FP), R9
6881	CALL polyHashADInternal<>(SB)
6882	XORQ CX, CX
6883
6884sealSSE128SealHash:
6885	CMPQ  CX, $0x10
6886	JB    sealSSE128Seal
6887	ADDQ  (DI), R10
6888	ADCQ  8(DI), R11
6889	ADCQ  $0x01, R12
6890	MOVQ  (BP), AX
6891	MOVQ  AX, R15
6892	MULQ  R10
6893	MOVQ  AX, R13
6894	MOVQ  DX, R14
6895	MOVQ  (BP), AX
6896	MULQ  R11
6897	IMULQ R12, R15
6898	ADDQ  AX, R14
6899	ADCQ  DX, R15
6900	MOVQ  8(BP), AX
6901	MOVQ  AX, R8
6902	MULQ  R10
6903	ADDQ  AX, R14
6904	ADCQ  $0x00, DX
6905	MOVQ  DX, R10
6906	MOVQ  8(BP), AX
6907	MULQ  R11
6908	ADDQ  AX, R15
6909	ADCQ  $0x00, DX
6910	IMULQ R12, R8
6911	ADDQ  R10, R15
6912	ADCQ  DX, R8
6913	MOVQ  R13, R10
6914	MOVQ  R14, R11
6915	MOVQ  R15, R12
6916	ANDQ  $0x03, R12
6917	MOVQ  R15, R13
6918	ANDQ  $-4, R13
6919	MOVQ  R8, R14
6920	SHRQ  $0x02, R8, R15
6921	SHRQ  $0x02, R8
6922	ADDQ  R13, R10
6923	ADCQ  R14, R11
6924	ADCQ  $0x00, R12
6925	ADDQ  R15, R10
6926	ADCQ  R8, R11
6927	ADCQ  $0x00, R12
6928	SUBQ  $0x10, CX
6929	ADDQ  $0x10, DI
6930	JMP   sealSSE128SealHash
6931
6932sealSSE128Seal:
6933	CMPQ BX, $0x10
6934	JB   sealSSETail
6935	SUBQ $0x10, BX
6936
6937	// Load for decryption
6938	MOVOU (SI), X12
6939	PXOR  X12, X1
6940	MOVOU X1, (DI)
6941	LEAQ  16(SI), SI
6942	LEAQ  16(DI), DI
6943
6944	// Extract for hashing
6945	MOVQ   X1, R13
6946	PSRLDQ $0x08, X1
6947	MOVQ   X1, R14
6948	ADDQ   R13, R10
6949	ADCQ   R14, R11
6950	ADCQ   $0x01, R12
6951	MOVQ   (BP), AX
6952	MOVQ   AX, R15
6953	MULQ   R10
6954	MOVQ   AX, R13
6955	MOVQ   DX, R14
6956	MOVQ   (BP), AX
6957	MULQ   R11
6958	IMULQ  R12, R15
6959	ADDQ   AX, R14
6960	ADCQ   DX, R15
6961	MOVQ   8(BP), AX
6962	MOVQ   AX, R8
6963	MULQ   R10
6964	ADDQ   AX, R14
6965	ADCQ   $0x00, DX
6966	MOVQ   DX, R10
6967	MOVQ   8(BP), AX
6968	MULQ   R11
6969	ADDQ   AX, R15
6970	ADCQ   $0x00, DX
6971	IMULQ  R12, R8
6972	ADDQ   R10, R15
6973	ADCQ   DX, R8
6974	MOVQ   R13, R10
6975	MOVQ   R14, R11
6976	MOVQ   R15, R12
6977	ANDQ   $0x03, R12
6978	MOVQ   R15, R13
6979	ANDQ   $-4, R13
6980	MOVQ   R8, R14
6981	SHRQ   $0x02, R8, R15
6982	SHRQ   $0x02, R8
6983	ADDQ   R13, R10
6984	ADCQ   R14, R11
6985	ADCQ   $0x00, R12
6986	ADDQ   R15, R10
6987	ADCQ   R8, R11
6988	ADCQ   $0x00, R12
6989
6990	// Shift the stream "left"
6991	MOVO X4, X1
6992	MOVO X7, X4
6993	MOVO X10, X7
6994	MOVO X2, X10
6995	MOVO X5, X2
6996	MOVO X8, X5
6997	MOVO X11, X8
6998	JMP  sealSSE128Seal
6999
7000sealSSETail:
7001	TESTQ BX, BX
7002	JE    sealSSEFinalize
7003
7004	// We can only load the PT one byte at a time to avoid read after end of buffer
7005	MOVQ BX, R9
7006	SHLQ $0x04, R9
7007	LEAQ andMask<>+0(SB), R13
7008	MOVQ BX, CX
7009	LEAQ -1(SI)(BX*1), SI
7010	XORQ R15, R15
7011	XORQ R8, R8
7012	XORQ AX, AX
7013
7014sealSSETailLoadLoop:
7015	SHLQ   $0x08, R15, R8
7016	SHLQ   $0x08, R15
7017	MOVB   (SI), AX
7018	XORQ   AX, R15
7019	LEAQ   -1(SI), SI
7020	DECQ   CX
7021	JNE    sealSSETailLoadLoop
7022	MOVQ   R15, 64(BP)
7023	MOVQ   R8, 72(BP)
7024	PXOR   64(BP), X1
7025	MOVOU  X1, (DI)
7026	MOVOU  -16(R13)(R9*1), X12
7027	PAND   X12, X1
7028	MOVQ   X1, R13
7029	PSRLDQ $0x08, X1
7030	MOVQ   X1, R14
7031	ADDQ   R13, R10
7032	ADCQ   R14, R11
7033	ADCQ   $0x01, R12
7034	MOVQ   (BP), AX
7035	MOVQ   AX, R15
7036	MULQ   R10
7037	MOVQ   AX, R13
7038	MOVQ   DX, R14
7039	MOVQ   (BP), AX
7040	MULQ   R11
7041	IMULQ  R12, R15
7042	ADDQ   AX, R14
7043	ADCQ   DX, R15
7044	MOVQ   8(BP), AX
7045	MOVQ   AX, R8
7046	MULQ   R10
7047	ADDQ   AX, R14
7048	ADCQ   $0x00, DX
7049	MOVQ   DX, R10
7050	MOVQ   8(BP), AX
7051	MULQ   R11
7052	ADDQ   AX, R15
7053	ADCQ   $0x00, DX
7054	IMULQ  R12, R8
7055	ADDQ   R10, R15
7056	ADCQ   DX, R8
7057	MOVQ   R13, R10
7058	MOVQ   R14, R11
7059	MOVQ   R15, R12
7060	ANDQ   $0x03, R12
7061	MOVQ   R15, R13
7062	ANDQ   $-4, R13
7063	MOVQ   R8, R14
7064	SHRQ   $0x02, R8, R15
7065	SHRQ   $0x02, R8
7066	ADDQ   R13, R10
7067	ADCQ   R14, R11
7068	ADCQ   $0x00, R12
7069	ADDQ   R15, R10
7070	ADCQ   R8, R11
7071	ADCQ   $0x00, R12
7072	ADDQ   BX, DI
7073
7074sealSSEFinalize:
7075	// Hash in the buffer lengths
7076	ADDQ  ad_len+80(FP), R10
7077	ADCQ  src_len+56(FP), R11
7078	ADCQ  $0x01, R12
7079	MOVQ  (BP), AX
7080	MOVQ  AX, R15
7081	MULQ  R10
7082	MOVQ  AX, R13
7083	MOVQ  DX, R14
7084	MOVQ  (BP), AX
7085	MULQ  R11
7086	IMULQ R12, R15
7087	ADDQ  AX, R14
7088	ADCQ  DX, R15
7089	MOVQ  8(BP), AX
7090	MOVQ  AX, R8
7091	MULQ  R10
7092	ADDQ  AX, R14
7093	ADCQ  $0x00, DX
7094	MOVQ  DX, R10
7095	MOVQ  8(BP), AX
7096	MULQ  R11
7097	ADDQ  AX, R15
7098	ADCQ  $0x00, DX
7099	IMULQ R12, R8
7100	ADDQ  R10, R15
7101	ADCQ  DX, R8
7102	MOVQ  R13, R10
7103	MOVQ  R14, R11
7104	MOVQ  R15, R12
7105	ANDQ  $0x03, R12
7106	MOVQ  R15, R13
7107	ANDQ  $-4, R13
7108	MOVQ  R8, R14
7109	SHRQ  $0x02, R8, R15
7110	SHRQ  $0x02, R8
7111	ADDQ  R13, R10
7112	ADCQ  R14, R11
7113	ADCQ  $0x00, R12
7114	ADDQ  R15, R10
7115	ADCQ  R8, R11
7116	ADCQ  $0x00, R12
7117
7118	// Final reduce
7119	MOVQ    R10, R13
7120	MOVQ    R11, R14
7121	MOVQ    R12, R15
7122	SUBQ    $-5, R10
7123	SBBQ    $-1, R11
7124	SBBQ    $0x03, R12
7125	CMOVQCS R13, R10
7126	CMOVQCS R14, R11
7127	CMOVQCS R15, R12
7128
7129	// Add in the "s" part of the key
7130	ADDQ 16(BP), R10
7131	ADCQ 24(BP), R11
7132
7133	// Finally store the tag at the end of the message
7134	MOVQ R10, (DI)
7135	MOVQ R11, 8(DI)
7136	RET
7137
7138chacha20Poly1305Seal_AVX2:
7139	VZEROUPPER
7140	VMOVDQU chacha20Constants<>+0(SB), Y0
7141	BYTE    $0xc4
7142	BYTE    $0x42
7143	BYTE    $0x7d
7144	BYTE    $0x5a
7145	BYTE    $0x70
7146	BYTE    $0x10
7147	BYTE    $0xc4
7148	BYTE    $0x42
7149	BYTE    $0x7d
7150	BYTE    $0x5a
7151	BYTE    $0x60
7152	BYTE    $0x20
7153	BYTE    $0xc4
7154	BYTE    $0xc2
7155	BYTE    $0x7d
7156	BYTE    $0x5a
7157	BYTE    $0x60
7158	BYTE    $0x30
7159	VPADDD  avx2InitMask<>+0(SB), Y4, Y4
7160
7161	// Special optimizations, for very short buffers
7162	CMPQ BX, $0x000000c0
7163	JBE  seal192AVX2
7164	CMPQ BX, $0x00000140
7165	JBE  seal320AVX2
7166
7167	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
7168	VMOVDQA Y0, Y5
7169	VMOVDQA Y0, Y6
7170	VMOVDQA Y0, Y7
7171	VMOVDQA Y14, Y9
7172	VMOVDQA Y14, Y10
7173	VMOVDQA Y14, Y11
7174	VMOVDQA Y14, 32(BP)
7175	VMOVDQA Y12, Y13
7176	VMOVDQA Y12, Y8
7177	VMOVDQA Y12, Y15
7178	VMOVDQA Y12, 64(BP)
7179	VPADDD  avx2IncMask<>+0(SB), Y4, Y1
7180	VMOVDQA Y4, 96(BP)
7181	VPADDD  avx2IncMask<>+0(SB), Y1, Y2
7182	VMOVDQA Y1, 128(BP)
7183	VPADDD  avx2IncMask<>+0(SB), Y2, Y3
7184	VMOVDQA Y2, 160(BP)
7185	VMOVDQA Y3, 192(BP)
7186	MOVQ    $0x0000000a, R9
7187
7188sealAVX2IntroLoop:
7189	VMOVDQA    Y15, 224(BP)
7190	VPADDD     Y14, Y0, Y0
7191	VPXOR      Y0, Y4, Y4
7192	VPSHUFB    rol16<>+0(SB), Y4, Y4
7193	VPADDD     Y4, Y12, Y12
7194	VPXOR      Y12, Y14, Y14
7195	VPSLLD     $0x0c, Y14, Y15
7196	VPSRLD     $0x14, Y14, Y14
7197	VPXOR      Y15, Y14, Y14
7198	VPADDD     Y14, Y0, Y0
7199	VPXOR      Y0, Y4, Y4
7200	VPSHUFB    rol8<>+0(SB), Y4, Y4
7201	VPADDD     Y4, Y12, Y12
7202	VPXOR      Y12, Y14, Y14
7203	VPSLLD     $0x07, Y14, Y15
7204	VPSRLD     $0x19, Y14, Y14
7205	VPXOR      Y15, Y14, Y14
7206	VPADDD     Y9, Y5, Y5
7207	VPXOR      Y5, Y1, Y1
7208	VPSHUFB    rol16<>+0(SB), Y1, Y1
7209	VPADDD     Y1, Y13, Y13
7210	VPXOR      Y13, Y9, Y9
7211	VPSLLD     $0x0c, Y9, Y15
7212	VPSRLD     $0x14, Y9, Y9
7213	VPXOR      Y15, Y9, Y9
7214	VPADDD     Y9, Y5, Y5
7215	VPXOR      Y5, Y1, Y1
7216	VPSHUFB    rol8<>+0(SB), Y1, Y1
7217	VPADDD     Y1, Y13, Y13
7218	VPXOR      Y13, Y9, Y9
7219	VPSLLD     $0x07, Y9, Y15
7220	VPSRLD     $0x19, Y9, Y9
7221	VPXOR      Y15, Y9, Y9
7222	VPADDD     Y10, Y6, Y6
7223	VPXOR      Y6, Y2, Y2
7224	VPSHUFB    rol16<>+0(SB), Y2, Y2
7225	VPADDD     Y2, Y8, Y8
7226	VPXOR      Y8, Y10, Y10
7227	VPSLLD     $0x0c, Y10, Y15
7228	VPSRLD     $0x14, Y10, Y10
7229	VPXOR      Y15, Y10, Y10
7230	VPADDD     Y10, Y6, Y6
7231	VPXOR      Y6, Y2, Y2
7232	VPSHUFB    rol8<>+0(SB), Y2, Y2
7233	VPADDD     Y2, Y8, Y8
7234	VPXOR      Y8, Y10, Y10
7235	VPSLLD     $0x07, Y10, Y15
7236	VPSRLD     $0x19, Y10, Y10
7237	VPXOR      Y15, Y10, Y10
7238	VMOVDQA    224(BP), Y15
7239	VMOVDQA    Y13, 224(BP)
7240	VPADDD     Y11, Y7, Y7
7241	VPXOR      Y7, Y3, Y3
7242	VPSHUFB    rol16<>+0(SB), Y3, Y3
7243	VPADDD     Y3, Y15, Y15
7244	VPXOR      Y15, Y11, Y11
7245	VPSLLD     $0x0c, Y11, Y13
7246	VPSRLD     $0x14, Y11, Y11
7247	VPXOR      Y13, Y11, Y11
7248	VPADDD     Y11, Y7, Y7
7249	VPXOR      Y7, Y3, Y3
7250	VPSHUFB    rol8<>+0(SB), Y3, Y3
7251	VPADDD     Y3, Y15, Y15
7252	VPXOR      Y15, Y11, Y11
7253	VPSLLD     $0x07, Y11, Y13
7254	VPSRLD     $0x19, Y11, Y11
7255	VPXOR      Y13, Y11, Y11
7256	VMOVDQA    224(BP), Y13
7257	VPALIGNR   $0x04, Y14, Y14, Y14
7258	VPALIGNR   $0x08, Y12, Y12, Y12
7259	VPALIGNR   $0x0c, Y4, Y4, Y4
7260	VPALIGNR   $0x04, Y9, Y9, Y9
7261	VPALIGNR   $0x08, Y13, Y13, Y13
7262	VPALIGNR   $0x0c, Y1, Y1, Y1
7263	VPALIGNR   $0x04, Y10, Y10, Y10
7264	VPALIGNR   $0x08, Y8, Y8, Y8
7265	VPALIGNR   $0x0c, Y2, Y2, Y2
7266	VPALIGNR   $0x04, Y11, Y11, Y11
7267	VPALIGNR   $0x08, Y15, Y15, Y15
7268	VPALIGNR   $0x0c, Y3, Y3, Y3
7269	VMOVDQA    Y15, 224(BP)
7270	VPADDD     Y14, Y0, Y0
7271	VPXOR      Y0, Y4, Y4
7272	VPSHUFB    rol16<>+0(SB), Y4, Y4
7273	VPADDD     Y4, Y12, Y12
7274	VPXOR      Y12, Y14, Y14
7275	VPSLLD     $0x0c, Y14, Y15
7276	VPSRLD     $0x14, Y14, Y14
7277	VPXOR      Y15, Y14, Y14
7278	VPADDD     Y14, Y0, Y0
7279	VPXOR      Y0, Y4, Y4
7280	VPSHUFB    rol8<>+0(SB), Y4, Y4
7281	VPADDD     Y4, Y12, Y12
7282	VPXOR      Y12, Y14, Y14
7283	VPSLLD     $0x07, Y14, Y15
7284	VPSRLD     $0x19, Y14, Y14
7285	VPXOR      Y15, Y14, Y14
7286	VPADDD     Y9, Y5, Y5
7287	VPXOR      Y5, Y1, Y1
7288	VPSHUFB    rol16<>+0(SB), Y1, Y1
7289	VPADDD     Y1, Y13, Y13
7290	VPXOR      Y13, Y9, Y9
7291	VPSLLD     $0x0c, Y9, Y15
7292	VPSRLD     $0x14, Y9, Y9
7293	VPXOR      Y15, Y9, Y9
7294	VPADDD     Y9, Y5, Y5
7295	VPXOR      Y5, Y1, Y1
7296	VPSHUFB    rol8<>+0(SB), Y1, Y1
7297	VPADDD     Y1, Y13, Y13
7298	VPXOR      Y13, Y9, Y9
7299	VPSLLD     $0x07, Y9, Y15
7300	VPSRLD     $0x19, Y9, Y9
7301	VPXOR      Y15, Y9, Y9
7302	VPADDD     Y10, Y6, Y6
7303	VPXOR      Y6, Y2, Y2
7304	VPSHUFB    rol16<>+0(SB), Y2, Y2
7305	VPADDD     Y2, Y8, Y8
7306	VPXOR      Y8, Y10, Y10
7307	VPSLLD     $0x0c, Y10, Y15
7308	VPSRLD     $0x14, Y10, Y10
7309	VPXOR      Y15, Y10, Y10
7310	VPADDD     Y10, Y6, Y6
7311	VPXOR      Y6, Y2, Y2
7312	VPSHUFB    rol8<>+0(SB), Y2, Y2
7313	VPADDD     Y2, Y8, Y8
7314	VPXOR      Y8, Y10, Y10
7315	VPSLLD     $0x07, Y10, Y15
7316	VPSRLD     $0x19, Y10, Y10
7317	VPXOR      Y15, Y10, Y10
7318	VMOVDQA    224(BP), Y15
7319	VMOVDQA    Y13, 224(BP)
7320	VPADDD     Y11, Y7, Y7
7321	VPXOR      Y7, Y3, Y3
7322	VPSHUFB    rol16<>+0(SB), Y3, Y3
7323	VPADDD     Y3, Y15, Y15
7324	VPXOR      Y15, Y11, Y11
7325	VPSLLD     $0x0c, Y11, Y13
7326	VPSRLD     $0x14, Y11, Y11
7327	VPXOR      Y13, Y11, Y11
7328	VPADDD     Y11, Y7, Y7
7329	VPXOR      Y7, Y3, Y3
7330	VPSHUFB    rol8<>+0(SB), Y3, Y3
7331	VPADDD     Y3, Y15, Y15
7332	VPXOR      Y15, Y11, Y11
7333	VPSLLD     $0x07, Y11, Y13
7334	VPSRLD     $0x19, Y11, Y11
7335	VPXOR      Y13, Y11, Y11
7336	VMOVDQA    224(BP), Y13
7337	VPALIGNR   $0x0c, Y14, Y14, Y14
7338	VPALIGNR   $0x08, Y12, Y12, Y12
7339	VPALIGNR   $0x04, Y4, Y4, Y4
7340	VPALIGNR   $0x0c, Y9, Y9, Y9
7341	VPALIGNR   $0x08, Y13, Y13, Y13
7342	VPALIGNR   $0x04, Y1, Y1, Y1
7343	VPALIGNR   $0x0c, Y10, Y10, Y10
7344	VPALIGNR   $0x08, Y8, Y8, Y8
7345	VPALIGNR   $0x04, Y2, Y2, Y2
7346	VPALIGNR   $0x0c, Y11, Y11, Y11
7347	VPALIGNR   $0x08, Y15, Y15, Y15
7348	VPALIGNR   $0x04, Y3, Y3, Y3
7349	DECQ       R9
7350	JNE        sealAVX2IntroLoop
7351	VPADDD     chacha20Constants<>+0(SB), Y0, Y0
7352	VPADDD     chacha20Constants<>+0(SB), Y5, Y5
7353	VPADDD     chacha20Constants<>+0(SB), Y6, Y6
7354	VPADDD     chacha20Constants<>+0(SB), Y7, Y7
7355	VPADDD     32(BP), Y14, Y14
7356	VPADDD     32(BP), Y9, Y9
7357	VPADDD     32(BP), Y10, Y10
7358	VPADDD     32(BP), Y11, Y11
7359	VPADDD     64(BP), Y12, Y12
7360	VPADDD     64(BP), Y13, Y13
7361	VPADDD     64(BP), Y8, Y8
7362	VPADDD     64(BP), Y15, Y15
7363	VPADDD     96(BP), Y4, Y4
7364	VPADDD     128(BP), Y1, Y1
7365	VPADDD     160(BP), Y2, Y2
7366	VPADDD     192(BP), Y3, Y3
7367	VPERM2I128 $0x13, Y12, Y4, Y12
7368	VPERM2I128 $0x02, Y0, Y14, Y4
7369	VPERM2I128 $0x13, Y0, Y14, Y0
7370
7371	// Clamp and store poly key
7372	VPAND   polyClampMask<>+0(SB), Y4, Y4
7373	VMOVDQA Y4, (BP)
7374
7375	// Hash AD
7376	MOVQ ad_len+80(FP), R9
7377	CALL polyHashADInternal<>(SB)
7378
7379	// Can store at least 320 bytes
7380	VPXOR      (SI), Y0, Y0
7381	VPXOR      32(SI), Y12, Y12
7382	VMOVDQU    Y0, (DI)
7383	VMOVDQU    Y12, 32(DI)
7384	VPERM2I128 $0x02, Y5, Y9, Y0
7385	VPERM2I128 $0x02, Y13, Y1, Y14
7386	VPERM2I128 $0x13, Y5, Y9, Y12
7387	VPERM2I128 $0x13, Y13, Y1, Y4
7388	VPXOR      64(SI), Y0, Y0
7389	VPXOR      96(SI), Y14, Y14
7390	VPXOR      128(SI), Y12, Y12
7391	VPXOR      160(SI), Y4, Y4
7392	VMOVDQU    Y0, 64(DI)
7393	VMOVDQU    Y14, 96(DI)
7394	VMOVDQU    Y12, 128(DI)
7395	VMOVDQU    Y4, 160(DI)
7396	VPERM2I128 $0x02, Y6, Y10, Y0
7397	VPERM2I128 $0x02, Y8, Y2, Y14
7398	VPERM2I128 $0x13, Y6, Y10, Y12
7399	VPERM2I128 $0x13, Y8, Y2, Y4
7400	VPXOR      192(SI), Y0, Y0
7401	VPXOR      224(SI), Y14, Y14
7402	VPXOR      256(SI), Y12, Y12
7403	VPXOR      288(SI), Y4, Y4
7404	VMOVDQU    Y0, 192(DI)
7405	VMOVDQU    Y14, 224(DI)
7406	VMOVDQU    Y12, 256(DI)
7407	VMOVDQU    Y4, 288(DI)
7408	MOVQ       $0x00000140, CX
7409	SUBQ       $0x00000140, BX
7410	LEAQ       320(SI), SI
7411	VPERM2I128 $0x02, Y7, Y11, Y0
7412	VPERM2I128 $0x02, Y15, Y3, Y14
7413	VPERM2I128 $0x13, Y7, Y11, Y12
7414	VPERM2I128 $0x13, Y15, Y3, Y4
7415	CMPQ       BX, $0x80
7416	JBE        sealAVX2SealHash
7417	VPXOR      (SI), Y0, Y0
7418	VPXOR      32(SI), Y14, Y14
7419	VPXOR      64(SI), Y12, Y12
7420	VPXOR      96(SI), Y4, Y4
7421	VMOVDQU    Y0, 320(DI)
7422	VMOVDQU    Y14, 352(DI)
7423	VMOVDQU    Y12, 384(DI)
7424	VMOVDQU    Y4, 416(DI)
7425	SUBQ       $0x80, BX
7426	LEAQ       128(SI), SI
7427	MOVQ       $0x00000008, CX
7428	MOVQ       $0x00000002, R9
7429	CMPQ       BX, $0x80
7430	JBE        sealAVX2Tail128
7431	CMPQ       BX, $0x00000100
7432	JBE        sealAVX2Tail256
7433	CMPQ       BX, $0x00000180
7434	JBE        sealAVX2Tail384
7435	CMPQ       BX, $0x00000200
7436	JBE        sealAVX2Tail512
7437
7438	// We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
7439	VMOVDQA  chacha20Constants<>+0(SB), Y0
7440	VMOVDQA  Y0, Y5
7441	VMOVDQA  Y0, Y6
7442	VMOVDQA  Y0, Y7
7443	VMOVDQA  32(BP), Y14
7444	VMOVDQA  Y14, Y9
7445	VMOVDQA  Y14, Y10
7446	VMOVDQA  Y14, Y11
7447	VMOVDQA  64(BP), Y12
7448	VMOVDQA  Y12, Y13
7449	VMOVDQA  Y12, Y8
7450	VMOVDQA  Y12, Y15
7451	VMOVDQA  192(BP), Y4
7452	VPADDD   avx2IncMask<>+0(SB), Y4, Y4
7453	VPADDD   avx2IncMask<>+0(SB), Y4, Y1
7454	VPADDD   avx2IncMask<>+0(SB), Y1, Y2
7455	VPADDD   avx2IncMask<>+0(SB), Y2, Y3
7456	VMOVDQA  Y4, 96(BP)
7457	VMOVDQA  Y1, 128(BP)
7458	VMOVDQA  Y2, 160(BP)
7459	VMOVDQA  Y3, 192(BP)
7460	VMOVDQA  Y15, 224(BP)
7461	VPADDD   Y14, Y0, Y0
7462	VPXOR    Y0, Y4, Y4
7463	VPSHUFB  rol16<>+0(SB), Y4, Y4
7464	VPADDD   Y4, Y12, Y12
7465	VPXOR    Y12, Y14, Y14
7466	VPSLLD   $0x0c, Y14, Y15
7467	VPSRLD   $0x14, Y14, Y14
7468	VPXOR    Y15, Y14, Y14
7469	VPADDD   Y14, Y0, Y0
7470	VPXOR    Y0, Y4, Y4
7471	VPSHUFB  rol8<>+0(SB), Y4, Y4
7472	VPADDD   Y4, Y12, Y12
7473	VPXOR    Y12, Y14, Y14
7474	VPSLLD   $0x07, Y14, Y15
7475	VPSRLD   $0x19, Y14, Y14
7476	VPXOR    Y15, Y14, Y14
7477	VPADDD   Y9, Y5, Y5
7478	VPXOR    Y5, Y1, Y1
7479	VPSHUFB  rol16<>+0(SB), Y1, Y1
7480	VPADDD   Y1, Y13, Y13
7481	VPXOR    Y13, Y9, Y9
7482	VPSLLD   $0x0c, Y9, Y15
7483	VPSRLD   $0x14, Y9, Y9
7484	VPXOR    Y15, Y9, Y9
7485	VPADDD   Y9, Y5, Y5
7486	VPXOR    Y5, Y1, Y1
7487	VPSHUFB  rol8<>+0(SB), Y1, Y1
7488	VPADDD   Y1, Y13, Y13
7489	VPXOR    Y13, Y9, Y9
7490	VPSLLD   $0x07, Y9, Y15
7491	VPSRLD   $0x19, Y9, Y9
7492	VPXOR    Y15, Y9, Y9
7493	VPADDD   Y10, Y6, Y6
7494	VPXOR    Y6, Y2, Y2
7495	VPSHUFB  rol16<>+0(SB), Y2, Y2
7496	VPADDD   Y2, Y8, Y8
7497	VPXOR    Y8, Y10, Y10
7498	VPSLLD   $0x0c, Y10, Y15
7499	VPSRLD   $0x14, Y10, Y10
7500	VPXOR    Y15, Y10, Y10
7501	VPADDD   Y10, Y6, Y6
7502	VPXOR    Y6, Y2, Y2
7503	VPSHUFB  rol8<>+0(SB), Y2, Y2
7504	VPADDD   Y2, Y8, Y8
7505	VPXOR    Y8, Y10, Y10
7506	VPSLLD   $0x07, Y10, Y15
7507	VPSRLD   $0x19, Y10, Y10
7508	VPXOR    Y15, Y10, Y10
7509	VMOVDQA  224(BP), Y15
7510	VMOVDQA  Y13, 224(BP)
7511	VPADDD   Y11, Y7, Y7
7512	VPXOR    Y7, Y3, Y3
7513	VPSHUFB  rol16<>+0(SB), Y3, Y3
7514	VPADDD   Y3, Y15, Y15
7515	VPXOR    Y15, Y11, Y11
7516	VPSLLD   $0x0c, Y11, Y13
7517	VPSRLD   $0x14, Y11, Y11
7518	VPXOR    Y13, Y11, Y11
7519	VPADDD   Y11, Y7, Y7
7520	VPXOR    Y7, Y3, Y3
7521	VPSHUFB  rol8<>+0(SB), Y3, Y3
7522	VPADDD   Y3, Y15, Y15
7523	VPXOR    Y15, Y11, Y11
7524	VPSLLD   $0x07, Y11, Y13
7525	VPSRLD   $0x19, Y11, Y11
7526	VPXOR    Y13, Y11, Y11
7527	VMOVDQA  224(BP), Y13
7528	VPALIGNR $0x04, Y14, Y14, Y14
7529	VPALIGNR $0x08, Y12, Y12, Y12
7530	VPALIGNR $0x0c, Y4, Y4, Y4
7531	VPALIGNR $0x04, Y9, Y9, Y9
7532	VPALIGNR $0x08, Y13, Y13, Y13
7533	VPALIGNR $0x0c, Y1, Y1, Y1
7534	VPALIGNR $0x04, Y10, Y10, Y10
7535	VPALIGNR $0x08, Y8, Y8, Y8
7536	VPALIGNR $0x0c, Y2, Y2, Y2
7537	VPALIGNR $0x04, Y11, Y11, Y11
7538	VPALIGNR $0x08, Y15, Y15, Y15
7539	VPALIGNR $0x0c, Y3, Y3, Y3
7540	VMOVDQA  Y15, 224(BP)
7541	VPADDD   Y14, Y0, Y0
7542	VPXOR    Y0, Y4, Y4
7543	VPSHUFB  rol16<>+0(SB), Y4, Y4
7544	VPADDD   Y4, Y12, Y12
7545	VPXOR    Y12, Y14, Y14
7546	VPSLLD   $0x0c, Y14, Y15
7547	VPSRLD   $0x14, Y14, Y14
7548	VPXOR    Y15, Y14, Y14
7549	VPADDD   Y14, Y0, Y0
7550	VPXOR    Y0, Y4, Y4
7551	VPSHUFB  rol8<>+0(SB), Y4, Y4
7552	VPADDD   Y4, Y12, Y12
7553	VPXOR    Y12, Y14, Y14
7554	VPSLLD   $0x07, Y14, Y15
7555	VPSRLD   $0x19, Y14, Y14
7556	VPXOR    Y15, Y14, Y14
7557	VPADDD   Y9, Y5, Y5
7558	VPXOR    Y5, Y1, Y1
7559	VPSHUFB  rol16<>+0(SB), Y1, Y1
7560	VPADDD   Y1, Y13, Y13
7561	VPXOR    Y13, Y9, Y9
7562	VPSLLD   $0x0c, Y9, Y15
7563	VPSRLD   $0x14, Y9, Y9
7564	VPXOR    Y15, Y9, Y9
7565	VPADDD   Y9, Y5, Y5
7566	VPXOR    Y5, Y1, Y1
7567	VPSHUFB  rol8<>+0(SB), Y1, Y1
7568	VPADDD   Y1, Y13, Y13
7569	VPXOR    Y13, Y9, Y9
7570	VPSLLD   $0x07, Y9, Y15
7571	VPSRLD   $0x19, Y9, Y9
7572	VPXOR    Y15, Y9, Y9
7573	VPADDD   Y10, Y6, Y6
7574	VPXOR    Y6, Y2, Y2
7575	VPSHUFB  rol16<>+0(SB), Y2, Y2
7576	VPADDD   Y2, Y8, Y8
7577	VPXOR    Y8, Y10, Y10
7578	VPSLLD   $0x0c, Y10, Y15
7579	VPSRLD   $0x14, Y10, Y10
7580	VPXOR    Y15, Y10, Y10
7581	VPADDD   Y10, Y6, Y6
7582	VPXOR    Y6, Y2, Y2
7583	VPSHUFB  rol8<>+0(SB), Y2, Y2
7584	VPADDD   Y2, Y8, Y8
7585	VPXOR    Y8, Y10, Y10
7586	VPSLLD   $0x07, Y10, Y15
7587	VPSRLD   $0x19, Y10, Y10
7588	VPXOR    Y15, Y10, Y10
7589	VMOVDQA  224(BP), Y15
7590	VMOVDQA  Y13, 224(BP)
7591	VPADDD   Y11, Y7, Y7
7592	VPXOR    Y7, Y3, Y3
7593	VPSHUFB  rol16<>+0(SB), Y3, Y3
7594	VPADDD   Y3, Y15, Y15
7595	VPXOR    Y15, Y11, Y11
7596	VPSLLD   $0x0c, Y11, Y13
7597	VPSRLD   $0x14, Y11, Y11
7598	VPXOR    Y13, Y11, Y11
7599	VPADDD   Y11, Y7, Y7
7600	VPXOR    Y7, Y3, Y3
7601	VPSHUFB  rol8<>+0(SB), Y3, Y3
7602	VPADDD   Y3, Y15, Y15
7603	VPXOR    Y15, Y11, Y11
7604	VPSLLD   $0x07, Y11, Y13
7605	VPSRLD   $0x19, Y11, Y11
7606	VPXOR    Y13, Y11, Y11
7607	VMOVDQA  224(BP), Y13
7608	VPALIGNR $0x0c, Y14, Y14, Y14
7609	VPALIGNR $0x08, Y12, Y12, Y12
7610	VPALIGNR $0x04, Y4, Y4, Y4
7611	VPALIGNR $0x0c, Y9, Y9, Y9
7612	VPALIGNR $0x08, Y13, Y13, Y13
7613	VPALIGNR $0x04, Y1, Y1, Y1
7614	VPALIGNR $0x0c, Y10, Y10, Y10
7615	VPALIGNR $0x08, Y8, Y8, Y8
7616	VPALIGNR $0x04, Y2, Y2, Y2
7617	VPALIGNR $0x0c, Y11, Y11, Y11
7618	VPALIGNR $0x08, Y15, Y15, Y15
7619	VPALIGNR $0x04, Y3, Y3, Y3
7620	VPADDD   Y14, Y0, Y0
7621	VPADDD   Y9, Y5, Y5
7622	VPADDD   Y10, Y6, Y6
7623	VPADDD   Y11, Y7, Y7
7624	VPXOR    Y0, Y4, Y4
7625	VPXOR    Y5, Y1, Y1
7626	VPXOR    Y6, Y2, Y2
7627	VPXOR    Y7, Y3, Y3
7628	VPSHUFB  rol16<>+0(SB), Y4, Y4
7629	VPSHUFB  rol16<>+0(SB), Y1, Y1
7630	VPSHUFB  rol16<>+0(SB), Y2, Y2
7631	VPSHUFB  rol16<>+0(SB), Y3, Y3
7632	VPADDD   Y4, Y12, Y12
7633	VPADDD   Y1, Y13, Y13
7634	VPADDD   Y2, Y8, Y8
7635	VPADDD   Y3, Y15, Y15
7636	VPXOR    Y12, Y14, Y14
7637	VPXOR    Y13, Y9, Y9
7638	VPXOR    Y8, Y10, Y10
7639	VPXOR    Y15, Y11, Y11
7640	VMOVDQA  Y15, 224(BP)
7641	VPSLLD   $0x0c, Y14, Y15
7642	VPSRLD   $0x14, Y14, Y14
7643	VPXOR    Y15, Y14, Y14
7644	VPSLLD   $0x0c, Y9, Y15
7645	VPSRLD   $0x14, Y9, Y9
7646	VPXOR    Y15, Y9, Y9
7647	VPSLLD   $0x0c, Y10, Y15
7648	VPSRLD   $0x14, Y10, Y10
7649	VPXOR    Y15, Y10, Y10
7650	VPSLLD   $0x0c, Y11, Y15
7651	VPSRLD   $0x14, Y11, Y11
7652	VPXOR    Y15, Y11, Y11
7653	VMOVDQA  224(BP), Y15
7654	SUBQ     $0x10, DI
7655	MOVQ     $0x00000009, CX
7656	JMP      sealAVX2InternalLoopStart
7657
7658sealAVX2MainLoop:
7659	VMOVDQU chacha20Constants<>+0(SB), Y0
7660	VMOVDQA Y0, Y5
7661	VMOVDQA Y0, Y6
7662	VMOVDQA Y0, Y7
7663	VMOVDQA 32(BP), Y14
7664	VMOVDQA Y14, Y9
7665	VMOVDQA Y14, Y10
7666	VMOVDQA Y14, Y11
7667	VMOVDQA 64(BP), Y12
7668	VMOVDQA Y12, Y13
7669	VMOVDQA Y12, Y8
7670	VMOVDQA Y12, Y15
7671	VMOVDQA 192(BP), Y4
7672	VPADDD  avx2IncMask<>+0(SB), Y4, Y4
7673	VPADDD  avx2IncMask<>+0(SB), Y4, Y1
7674	VPADDD  avx2IncMask<>+0(SB), Y1, Y2
7675	VPADDD  avx2IncMask<>+0(SB), Y2, Y3
7676	VMOVDQA Y4, 96(BP)
7677	VMOVDQA Y1, 128(BP)
7678	VMOVDQA Y2, 160(BP)
7679	VMOVDQA Y3, 192(BP)
7680	MOVQ    $0x0000000a, CX
7681
7682sealAVX2InternalLoop:
7683	ADDQ    (DI), R10
7684	ADCQ    8(DI), R11
7685	ADCQ    $0x01, R12
7686	VPADDD  Y14, Y0, Y0
7687	VPADDD  Y9, Y5, Y5
7688	VPADDD  Y10, Y6, Y6
7689	VPADDD  Y11, Y7, Y7
7690	MOVQ    (BP), DX
7691	MOVQ    DX, R15
7692	MULXQ   R10, R13, R14
7693	IMULQ   R12, R15
7694	MULXQ   R11, AX, DX
7695	ADDQ    AX, R14
7696	ADCQ    DX, R15
7697	VPXOR   Y0, Y4, Y4
7698	VPXOR   Y5, Y1, Y1
7699	VPXOR   Y6, Y2, Y2
7700	VPXOR   Y7, Y3, Y3
7701	VPSHUFB rol16<>+0(SB), Y4, Y4
7702	VPSHUFB rol16<>+0(SB), Y1, Y1
7703	VPSHUFB rol16<>+0(SB), Y2, Y2
7704	VPSHUFB rol16<>+0(SB), Y3, Y3
7705	MOVQ    8(BP), DX
7706	MULXQ   R10, R10, AX
7707	ADDQ    R10, R14
7708	MULXQ   R11, R11, R8
7709	ADCQ    R11, R15
7710	ADCQ    $0x00, R8
7711	VPADDD  Y4, Y12, Y12
7712	VPADDD  Y1, Y13, Y13
7713	VPADDD  Y2, Y8, Y8
7714	VPADDD  Y3, Y15, Y15
7715	VPXOR   Y12, Y14, Y14
7716	VPXOR   Y13, Y9, Y9
7717	VPXOR   Y8, Y10, Y10
7718	VPXOR   Y15, Y11, Y11
7719	IMULQ   R12, DX
7720	ADDQ    AX, R15
7721	ADCQ    DX, R8
7722	VMOVDQA Y15, 224(BP)
7723	VPSLLD  $0x0c, Y14, Y15
7724	VPSRLD  $0x14, Y14, Y14
7725	VPXOR   Y15, Y14, Y14
7726	VPSLLD  $0x0c, Y9, Y15
7727	VPSRLD  $0x14, Y9, Y9
7728	VPXOR   Y15, Y9, Y9
7729	VPSLLD  $0x0c, Y10, Y15
7730	VPSRLD  $0x14, Y10, Y10
7731	VPXOR   Y15, Y10, Y10
7732	VPSLLD  $0x0c, Y11, Y15
7733	VPSRLD  $0x14, Y11, Y11
7734	VPXOR   Y15, Y11, Y11
7735	VMOVDQA 224(BP), Y15
7736	MOVQ    R13, R10
7737	MOVQ    R14, R11
7738	MOVQ    R15, R12
7739	ANDQ    $0x03, R12
7740	MOVQ    R15, R13
7741	ANDQ    $-4, R13
7742	MOVQ    R8, R14
7743	SHRQ    $0x02, R8, R15
7744	SHRQ    $0x02, R8
7745	ADDQ    R13, R10
7746	ADCQ    R14, R11
7747	ADCQ    $0x00, R12
7748	ADDQ    R15, R10
7749	ADCQ    R8, R11
7750	ADCQ    $0x00, R12
7751
7752sealAVX2InternalLoopStart:
7753	VPADDD   Y14, Y0, Y0
7754	VPADDD   Y9, Y5, Y5
7755	VPADDD   Y10, Y6, Y6
7756	VPADDD   Y11, Y7, Y7
7757	VPXOR    Y0, Y4, Y4
7758	VPXOR    Y5, Y1, Y1
7759	VPXOR    Y6, Y2, Y2
7760	VPXOR    Y7, Y3, Y3
7761	VPSHUFB  rol8<>+0(SB), Y4, Y4
7762	VPSHUFB  rol8<>+0(SB), Y1, Y1
7763	VPSHUFB  rol8<>+0(SB), Y2, Y2
7764	VPSHUFB  rol8<>+0(SB), Y3, Y3
7765	ADDQ     16(DI), R10
7766	ADCQ     24(DI), R11
7767	ADCQ     $0x01, R12
7768	VPADDD   Y4, Y12, Y12
7769	VPADDD   Y1, Y13, Y13
7770	VPADDD   Y2, Y8, Y8
7771	VPADDD   Y3, Y15, Y15
7772	MOVQ     (BP), DX
7773	MOVQ     DX, R15
7774	MULXQ    R10, R13, R14
7775	IMULQ    R12, R15
7776	MULXQ    R11, AX, DX
7777	ADDQ     AX, R14
7778	ADCQ     DX, R15
7779	VPXOR    Y12, Y14, Y14
7780	VPXOR    Y13, Y9, Y9
7781	VPXOR    Y8, Y10, Y10
7782	VPXOR    Y15, Y11, Y11
7783	VMOVDQA  Y15, 224(BP)
7784	VPSLLD   $0x07, Y14, Y15
7785	VPSRLD   $0x19, Y14, Y14
7786	VPXOR    Y15, Y14, Y14
7787	VPSLLD   $0x07, Y9, Y15
7788	VPSRLD   $0x19, Y9, Y9
7789	VPXOR    Y15, Y9, Y9
7790	VPSLLD   $0x07, Y10, Y15
7791	VPSRLD   $0x19, Y10, Y10
7792	VPXOR    Y15, Y10, Y10
7793	VPSLLD   $0x07, Y11, Y15
7794	VPSRLD   $0x19, Y11, Y11
7795	VPXOR    Y15, Y11, Y11
7796	VMOVDQA  224(BP), Y15
7797	MOVQ     8(BP), DX
7798	MULXQ    R10, R10, AX
7799	ADDQ     R10, R14
7800	MULXQ    R11, R11, R8
7801	ADCQ     R11, R15
7802	ADCQ     $0x00, R8
7803	VPALIGNR $0x04, Y14, Y14, Y14
7804	VPALIGNR $0x04, Y9, Y9, Y9
7805	VPALIGNR $0x04, Y10, Y10, Y10
7806	VPALIGNR $0x04, Y11, Y11, Y11
7807	VPALIGNR $0x08, Y12, Y12, Y12
7808	VPALIGNR $0x08, Y13, Y13, Y13
7809	VPALIGNR $0x08, Y8, Y8, Y8
7810	VPALIGNR $0x08, Y15, Y15, Y15
7811	VPALIGNR $0x0c, Y4, Y4, Y4
7812	VPALIGNR $0x0c, Y1, Y1, Y1
7813	VPALIGNR $0x0c, Y2, Y2, Y2
7814	VPALIGNR $0x0c, Y3, Y3, Y3
7815	VPADDD   Y14, Y0, Y0
7816	VPADDD   Y9, Y5, Y5
7817	VPADDD   Y10, Y6, Y6
7818	VPADDD   Y11, Y7, Y7
7819	IMULQ    R12, DX
7820	ADDQ     AX, R15
7821	ADCQ     DX, R8
7822	VPXOR    Y0, Y4, Y4
7823	VPXOR    Y5, Y1, Y1
7824	VPXOR    Y6, Y2, Y2
7825	VPXOR    Y7, Y3, Y3
7826	VPSHUFB  rol16<>+0(SB), Y4, Y4
7827	VPSHUFB  rol16<>+0(SB), Y1, Y1
7828	VPSHUFB  rol16<>+0(SB), Y2, Y2
7829	VPSHUFB  rol16<>+0(SB), Y3, Y3
7830	MOVQ     R13, R10
7831	MOVQ     R14, R11
7832	MOVQ     R15, R12
7833	ANDQ     $0x03, R12
7834	MOVQ     R15, R13
7835	ANDQ     $-4, R13
7836	MOVQ     R8, R14
7837	SHRQ     $0x02, R8, R15
7838	SHRQ     $0x02, R8
7839	ADDQ     R13, R10
7840	ADCQ     R14, R11
7841	ADCQ     $0x00, R12
7842	ADDQ     R15, R10
7843	ADCQ     R8, R11
7844	ADCQ     $0x00, R12
7845	VPADDD   Y4, Y12, Y12
7846	VPADDD   Y1, Y13, Y13
7847	VPADDD   Y2, Y8, Y8
7848	VPADDD   Y3, Y15, Y15
7849	VPXOR    Y12, Y14, Y14
7850	VPXOR    Y13, Y9, Y9
7851	VPXOR    Y8, Y10, Y10
7852	VPXOR    Y15, Y11, Y11
7853	ADDQ     32(DI), R10
7854	ADCQ     40(DI), R11
7855	ADCQ     $0x01, R12
7856	LEAQ     48(DI), DI
7857	VMOVDQA  Y15, 224(BP)
7858	VPSLLD   $0x0c, Y14, Y15
7859	VPSRLD   $0x14, Y14, Y14
7860	VPXOR    Y15, Y14, Y14
7861	VPSLLD   $0x0c, Y9, Y15
7862	VPSRLD   $0x14, Y9, Y9
7863	VPXOR    Y15, Y9, Y9
7864	VPSLLD   $0x0c, Y10, Y15
7865	VPSRLD   $0x14, Y10, Y10
7866	VPXOR    Y15, Y10, Y10
7867	VPSLLD   $0x0c, Y11, Y15
7868	VPSRLD   $0x14, Y11, Y11
7869	VPXOR    Y15, Y11, Y11
7870	VMOVDQA  224(BP), Y15
7871	MOVQ     (BP), DX
7872	MOVQ     DX, R15
7873	MULXQ    R10, R13, R14
7874	IMULQ    R12, R15
7875	MULXQ    R11, AX, DX
7876	ADDQ     AX, R14
7877	ADCQ     DX, R15
7878	VPADDD   Y14, Y0, Y0
7879	VPADDD   Y9, Y5, Y5
7880	VPADDD   Y10, Y6, Y6
7881	VPADDD   Y11, Y7, Y7
7882	VPXOR    Y0, Y4, Y4
7883	VPXOR    Y5, Y1, Y1
7884	VPXOR    Y6, Y2, Y2
7885	VPXOR    Y7, Y3, Y3
7886	MOVQ     8(BP), DX
7887	MULXQ    R10, R10, AX
7888	ADDQ     R10, R14
7889	MULXQ    R11, R11, R8
7890	ADCQ     R11, R15
7891	ADCQ     $0x00, R8
7892	VPSHUFB  rol8<>+0(SB), Y4, Y4
7893	VPSHUFB  rol8<>+0(SB), Y1, Y1
7894	VPSHUFB  rol8<>+0(SB), Y2, Y2
7895	VPSHUFB  rol8<>+0(SB), Y3, Y3
7896	VPADDD   Y4, Y12, Y12
7897	VPADDD   Y1, Y13, Y13
7898	VPADDD   Y2, Y8, Y8
7899	VPADDD   Y3, Y15, Y15
7900	IMULQ    R12, DX
7901	ADDQ     AX, R15
7902	ADCQ     DX, R8
7903	VPXOR    Y12, Y14, Y14
7904	VPXOR    Y13, Y9, Y9
7905	VPXOR    Y8, Y10, Y10
7906	VPXOR    Y15, Y11, Y11
7907	VMOVDQA  Y15, 224(BP)
7908	VPSLLD   $0x07, Y14, Y15
7909	VPSRLD   $0x19, Y14, Y14
7910	VPXOR    Y15, Y14, Y14
7911	VPSLLD   $0x07, Y9, Y15
7912	VPSRLD   $0x19, Y9, Y9
7913	VPXOR    Y15, Y9, Y9
7914	VPSLLD   $0x07, Y10, Y15
7915	VPSRLD   $0x19, Y10, Y10
7916	VPXOR    Y15, Y10, Y10
7917	VPSLLD   $0x07, Y11, Y15
7918	VPSRLD   $0x19, Y11, Y11
7919	VPXOR    Y15, Y11, Y11
7920	VMOVDQA  224(BP), Y15
7921	MOVQ     R13, R10
7922	MOVQ     R14, R11
7923	MOVQ     R15, R12
7924	ANDQ     $0x03, R12
7925	MOVQ     R15, R13
7926	ANDQ     $-4, R13
7927	MOVQ     R8, R14
7928	SHRQ     $0x02, R8, R15
7929	SHRQ     $0x02, R8
7930	ADDQ     R13, R10
7931	ADCQ     R14, R11
7932	ADCQ     $0x00, R12
7933	ADDQ     R15, R10
7934	ADCQ     R8, R11
7935	ADCQ     $0x00, R12
7936	VPALIGNR $0x0c, Y14, Y14, Y14
7937	VPALIGNR $0x0c, Y9, Y9, Y9
7938	VPALIGNR $0x0c, Y10, Y10, Y10
7939	VPALIGNR $0x0c, Y11, Y11, Y11
7940	VPALIGNR $0x08, Y12, Y12, Y12
7941	VPALIGNR $0x08, Y13, Y13, Y13
7942	VPALIGNR $0x08, Y8, Y8, Y8
7943	VPALIGNR $0x08, Y15, Y15, Y15
7944	VPALIGNR $0x04, Y4, Y4, Y4
7945	VPALIGNR $0x04, Y1, Y1, Y1
7946	VPALIGNR $0x04, Y2, Y2, Y2
7947	VPALIGNR $0x04, Y3, Y3, Y3
7948	DECQ     CX
7949	JNE      sealAVX2InternalLoop
7950	VPADDD   chacha20Constants<>+0(SB), Y0, Y0
7951	VPADDD   chacha20Constants<>+0(SB), Y5, Y5
7952	VPADDD   chacha20Constants<>+0(SB), Y6, Y6
7953	VPADDD   chacha20Constants<>+0(SB), Y7, Y7
7954	VPADDD   32(BP), Y14, Y14
7955	VPADDD   32(BP), Y9, Y9
7956	VPADDD   32(BP), Y10, Y10
7957	VPADDD   32(BP), Y11, Y11
7958	VPADDD   64(BP), Y12, Y12
7959	VPADDD   64(BP), Y13, Y13
7960	VPADDD   64(BP), Y8, Y8
7961	VPADDD   64(BP), Y15, Y15
7962	VPADDD   96(BP), Y4, Y4
7963	VPADDD   128(BP), Y1, Y1
7964	VPADDD   160(BP), Y2, Y2
7965	VPADDD   192(BP), Y3, Y3
7966	VMOVDQA  Y15, 224(BP)
7967
7968	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
7969	ADDQ       (DI), R10
7970	ADCQ       8(DI), R11
7971	ADCQ       $0x01, R12
7972	MOVQ       (BP), DX
7973	MOVQ       DX, R15
7974	MULXQ      R10, R13, R14
7975	IMULQ      R12, R15
7976	MULXQ      R11, AX, DX
7977	ADDQ       AX, R14
7978	ADCQ       DX, R15
7979	MOVQ       8(BP), DX
7980	MULXQ      R10, R10, AX
7981	ADDQ       R10, R14
7982	MULXQ      R11, R11, R8
7983	ADCQ       R11, R15
7984	ADCQ       $0x00, R8
7985	IMULQ      R12, DX
7986	ADDQ       AX, R15
7987	ADCQ       DX, R8
7988	MOVQ       R13, R10
7989	MOVQ       R14, R11
7990	MOVQ       R15, R12
7991	ANDQ       $0x03, R12
7992	MOVQ       R15, R13
7993	ANDQ       $-4, R13
7994	MOVQ       R8, R14
7995	SHRQ       $0x02, R8, R15
7996	SHRQ       $0x02, R8
7997	ADDQ       R13, R10
7998	ADCQ       R14, R11
7999	ADCQ       $0x00, R12
8000	ADDQ       R15, R10
8001	ADCQ       R8, R11
8002	ADCQ       $0x00, R12
8003	LEAQ       32(DI), DI
8004	VPERM2I128 $0x02, Y0, Y14, Y15
8005	VPERM2I128 $0x13, Y0, Y14, Y14
8006	VPERM2I128 $0x02, Y12, Y4, Y0
8007	VPERM2I128 $0x13, Y12, Y4, Y12
8008	VPXOR      (SI), Y15, Y15
8009	VPXOR      32(SI), Y0, Y0
8010	VPXOR      64(SI), Y14, Y14
8011	VPXOR      96(SI), Y12, Y12
8012	VMOVDQU    Y15, (DI)
8013	VMOVDQU    Y0, 32(DI)
8014	VMOVDQU    Y14, 64(DI)
8015	VMOVDQU    Y12, 96(DI)
8016	VPERM2I128 $0x02, Y5, Y9, Y0
8017	VPERM2I128 $0x02, Y13, Y1, Y14
8018	VPERM2I128 $0x13, Y5, Y9, Y12
8019	VPERM2I128 $0x13, Y13, Y1, Y4
8020	VPXOR      128(SI), Y0, Y0
8021	VPXOR      160(SI), Y14, Y14
8022	VPXOR      192(SI), Y12, Y12
8023	VPXOR      224(SI), Y4, Y4
8024	VMOVDQU    Y0, 128(DI)
8025	VMOVDQU    Y14, 160(DI)
8026	VMOVDQU    Y12, 192(DI)
8027	VMOVDQU    Y4, 224(DI)
8028
8029	// and here
8030	ADDQ       -16(DI), R10
8031	ADCQ       -8(DI), R11
8032	ADCQ       $0x01, R12
8033	MOVQ       (BP), DX
8034	MOVQ       DX, R15
8035	MULXQ      R10, R13, R14
8036	IMULQ      R12, R15
8037	MULXQ      R11, AX, DX
8038	ADDQ       AX, R14
8039	ADCQ       DX, R15
8040	MOVQ       8(BP), DX
8041	MULXQ      R10, R10, AX
8042	ADDQ       R10, R14
8043	MULXQ      R11, R11, R8
8044	ADCQ       R11, R15
8045	ADCQ       $0x00, R8
8046	IMULQ      R12, DX
8047	ADDQ       AX, R15
8048	ADCQ       DX, R8
8049	MOVQ       R13, R10
8050	MOVQ       R14, R11
8051	MOVQ       R15, R12
8052	ANDQ       $0x03, R12
8053	MOVQ       R15, R13
8054	ANDQ       $-4, R13
8055	MOVQ       R8, R14
8056	SHRQ       $0x02, R8, R15
8057	SHRQ       $0x02, R8
8058	ADDQ       R13, R10
8059	ADCQ       R14, R11
8060	ADCQ       $0x00, R12
8061	ADDQ       R15, R10
8062	ADCQ       R8, R11
8063	ADCQ       $0x00, R12
8064	VPERM2I128 $0x02, Y6, Y10, Y0
8065	VPERM2I128 $0x02, Y8, Y2, Y14
8066	VPERM2I128 $0x13, Y6, Y10, Y12
8067	VPERM2I128 $0x13, Y8, Y2, Y4
8068	VPXOR      256(SI), Y0, Y0
8069	VPXOR      288(SI), Y14, Y14
8070	VPXOR      320(SI), Y12, Y12
8071	VPXOR      352(SI), Y4, Y4
8072	VMOVDQU    Y0, 256(DI)
8073	VMOVDQU    Y14, 288(DI)
8074	VMOVDQU    Y12, 320(DI)
8075	VMOVDQU    Y4, 352(DI)
8076	VPERM2I128 $0x02, Y7, Y11, Y0
8077	VPERM2I128 $0x02, 224(BP), Y3, Y14
8078	VPERM2I128 $0x13, Y7, Y11, Y12
8079	VPERM2I128 $0x13, 224(BP), Y3, Y4
8080	VPXOR      384(SI), Y0, Y0
8081	VPXOR      416(SI), Y14, Y14
8082	VPXOR      448(SI), Y12, Y12
8083	VPXOR      480(SI), Y4, Y4
8084	VMOVDQU    Y0, 384(DI)
8085	VMOVDQU    Y14, 416(DI)
8086	VMOVDQU    Y12, 448(DI)
8087	VMOVDQU    Y4, 480(DI)
8088	LEAQ       512(SI), SI
8089	SUBQ       $0x00000200, BX
8090	CMPQ       BX, $0x00000200
8091	JG         sealAVX2MainLoop
8092
8093	// Tail can only hash 480 bytes
8094	ADDQ  (DI), R10
8095	ADCQ  8(DI), R11
8096	ADCQ  $0x01, R12
8097	MOVQ  (BP), DX
8098	MOVQ  DX, R15
8099	MULXQ R10, R13, R14
8100	IMULQ R12, R15
8101	MULXQ R11, AX, DX
8102	ADDQ  AX, R14
8103	ADCQ  DX, R15
8104	MOVQ  8(BP), DX
8105	MULXQ R10, R10, AX
8106	ADDQ  R10, R14
8107	MULXQ R11, R11, R8
8108	ADCQ  R11, R15
8109	ADCQ  $0x00, R8
8110	IMULQ R12, DX
8111	ADDQ  AX, R15
8112	ADCQ  DX, R8
8113	MOVQ  R13, R10
8114	MOVQ  R14, R11
8115	MOVQ  R15, R12
8116	ANDQ  $0x03, R12
8117	MOVQ  R15, R13
8118	ANDQ  $-4, R13
8119	MOVQ  R8, R14
8120	SHRQ  $0x02, R8, R15
8121	SHRQ  $0x02, R8
8122	ADDQ  R13, R10
8123	ADCQ  R14, R11
8124	ADCQ  $0x00, R12
8125	ADDQ  R15, R10
8126	ADCQ  R8, R11
8127	ADCQ  $0x00, R12
8128	ADDQ  16(DI), R10
8129	ADCQ  24(DI), R11
8130	ADCQ  $0x01, R12
8131	MOVQ  (BP), DX
8132	MOVQ  DX, R15
8133	MULXQ R10, R13, R14
8134	IMULQ R12, R15
8135	MULXQ R11, AX, DX
8136	ADDQ  AX, R14
8137	ADCQ  DX, R15
8138	MOVQ  8(BP), DX
8139	MULXQ R10, R10, AX
8140	ADDQ  R10, R14
8141	MULXQ R11, R11, R8
8142	ADCQ  R11, R15
8143	ADCQ  $0x00, R8
8144	IMULQ R12, DX
8145	ADDQ  AX, R15
8146	ADCQ  DX, R8
8147	MOVQ  R13, R10
8148	MOVQ  R14, R11
8149	MOVQ  R15, R12
8150	ANDQ  $0x03, R12
8151	MOVQ  R15, R13
8152	ANDQ  $-4, R13
8153	MOVQ  R8, R14
8154	SHRQ  $0x02, R8, R15
8155	SHRQ  $0x02, R8
8156	ADDQ  R13, R10
8157	ADCQ  R14, R11
8158	ADCQ  $0x00, R12
8159	ADDQ  R15, R10
8160	ADCQ  R8, R11
8161	ADCQ  $0x00, R12
8162	LEAQ  32(DI), DI
8163	MOVQ  $0x0000000a, CX
8164	MOVQ  $0x00000000, R9
8165	CMPQ  BX, $0x80
8166	JBE   sealAVX2Tail128
8167	CMPQ  BX, $0x00000100
8168	JBE   sealAVX2Tail256
8169	CMPQ  BX, $0x00000180
8170	JBE   sealAVX2Tail384
8171	JMP   sealAVX2Tail512
8172
8173seal192AVX2:
8174	VMOVDQA Y0, Y5
8175	VMOVDQA Y14, Y9
8176	VMOVDQA Y12, Y13
8177	VPADDD  avx2IncMask<>+0(SB), Y4, Y1
8178	VMOVDQA Y0, Y6
8179	VMOVDQA Y14, Y10
8180	VMOVDQA Y12, Y8
8181	VMOVDQA Y4, Y2
8182	VMOVDQA Y1, Y15
8183	MOVQ    $0x0000000a, R9
8184
8185sealAVX2192InnerCipherLoop:
8186	VPADDD     Y14, Y0, Y0
8187	VPXOR      Y0, Y4, Y4
8188	VPSHUFB    rol16<>+0(SB), Y4, Y4
8189	VPADDD     Y4, Y12, Y12
8190	VPXOR      Y12, Y14, Y14
8191	VPSLLD     $0x0c, Y14, Y3
8192	VPSRLD     $0x14, Y14, Y14
8193	VPXOR      Y3, Y14, Y14
8194	VPADDD     Y14, Y0, Y0
8195	VPXOR      Y0, Y4, Y4
8196	VPSHUFB    rol8<>+0(SB), Y4, Y4
8197	VPADDD     Y4, Y12, Y12
8198	VPXOR      Y12, Y14, Y14
8199	VPSLLD     $0x07, Y14, Y3
8200	VPSRLD     $0x19, Y14, Y14
8201	VPXOR      Y3, Y14, Y14
8202	VPADDD     Y9, Y5, Y5
8203	VPXOR      Y5, Y1, Y1
8204	VPSHUFB    rol16<>+0(SB), Y1, Y1
8205	VPADDD     Y1, Y13, Y13
8206	VPXOR      Y13, Y9, Y9
8207	VPSLLD     $0x0c, Y9, Y3
8208	VPSRLD     $0x14, Y9, Y9
8209	VPXOR      Y3, Y9, Y9
8210	VPADDD     Y9, Y5, Y5
8211	VPXOR      Y5, Y1, Y1
8212	VPSHUFB    rol8<>+0(SB), Y1, Y1
8213	VPADDD     Y1, Y13, Y13
8214	VPXOR      Y13, Y9, Y9
8215	VPSLLD     $0x07, Y9, Y3
8216	VPSRLD     $0x19, Y9, Y9
8217	VPXOR      Y3, Y9, Y9
8218	VPALIGNR   $0x04, Y14, Y14, Y14
8219	VPALIGNR   $0x04, Y9, Y9, Y9
8220	VPALIGNR   $0x08, Y12, Y12, Y12
8221	VPALIGNR   $0x08, Y13, Y13, Y13
8222	VPALIGNR   $0x0c, Y4, Y4, Y4
8223	VPALIGNR   $0x0c, Y1, Y1, Y1
8224	VPADDD     Y14, Y0, Y0
8225	VPXOR      Y0, Y4, Y4
8226	VPSHUFB    rol16<>+0(SB), Y4, Y4
8227	VPADDD     Y4, Y12, Y12
8228	VPXOR      Y12, Y14, Y14
8229	VPSLLD     $0x0c, Y14, Y3
8230	VPSRLD     $0x14, Y14, Y14
8231	VPXOR      Y3, Y14, Y14
8232	VPADDD     Y14, Y0, Y0
8233	VPXOR      Y0, Y4, Y4
8234	VPSHUFB    rol8<>+0(SB), Y4, Y4
8235	VPADDD     Y4, Y12, Y12
8236	VPXOR      Y12, Y14, Y14
8237	VPSLLD     $0x07, Y14, Y3
8238	VPSRLD     $0x19, Y14, Y14
8239	VPXOR      Y3, Y14, Y14
8240	VPADDD     Y9, Y5, Y5
8241	VPXOR      Y5, Y1, Y1
8242	VPSHUFB    rol16<>+0(SB), Y1, Y1
8243	VPADDD     Y1, Y13, Y13
8244	VPXOR      Y13, Y9, Y9
8245	VPSLLD     $0x0c, Y9, Y3
8246	VPSRLD     $0x14, Y9, Y9
8247	VPXOR      Y3, Y9, Y9
8248	VPADDD     Y9, Y5, Y5
8249	VPXOR      Y5, Y1, Y1
8250	VPSHUFB    rol8<>+0(SB), Y1, Y1
8251	VPADDD     Y1, Y13, Y13
8252	VPXOR      Y13, Y9, Y9
8253	VPSLLD     $0x07, Y9, Y3
8254	VPSRLD     $0x19, Y9, Y9
8255	VPXOR      Y3, Y9, Y9
8256	VPALIGNR   $0x0c, Y14, Y14, Y14
8257	VPALIGNR   $0x0c, Y9, Y9, Y9
8258	VPALIGNR   $0x08, Y12, Y12, Y12
8259	VPALIGNR   $0x08, Y13, Y13, Y13
8260	VPALIGNR   $0x04, Y4, Y4, Y4
8261	VPALIGNR   $0x04, Y1, Y1, Y1
8262	DECQ       R9
8263	JNE        sealAVX2192InnerCipherLoop
8264	VPADDD     Y6, Y0, Y0
8265	VPADDD     Y6, Y5, Y5
8266	VPADDD     Y10, Y14, Y14
8267	VPADDD     Y10, Y9, Y9
8268	VPADDD     Y8, Y12, Y12
8269	VPADDD     Y8, Y13, Y13
8270	VPADDD     Y2, Y4, Y4
8271	VPADDD     Y15, Y1, Y1
8272	VPERM2I128 $0x02, Y0, Y14, Y3
8273
8274	// Clamp and store poly key
8275	VPAND   polyClampMask<>+0(SB), Y3, Y3
8276	VMOVDQA Y3, (BP)
8277
8278	// Stream for up to 192 bytes
8279	VPERM2I128 $0x13, Y0, Y14, Y0
8280	VPERM2I128 $0x13, Y12, Y4, Y14
8281	VPERM2I128 $0x02, Y5, Y9, Y12
8282	VPERM2I128 $0x02, Y13, Y1, Y4
8283	VPERM2I128 $0x13, Y5, Y9, Y5
8284	VPERM2I128 $0x13, Y13, Y1, Y9
8285
8286sealAVX2ShortSeal:
8287	// Hash aad
8288	MOVQ ad_len+80(FP), R9
8289	CALL polyHashADInternal<>(SB)
8290	XORQ CX, CX
8291
8292sealAVX2SealHash:
8293	// itr1 holds the number of bytes encrypted but not yet hashed
8294	CMPQ  CX, $0x10
8295	JB    sealAVX2ShortSealLoop
8296	ADDQ  (DI), R10
8297	ADCQ  8(DI), R11
8298	ADCQ  $0x01, R12
8299	MOVQ  (BP), AX
8300	MOVQ  AX, R15
8301	MULQ  R10
8302	MOVQ  AX, R13
8303	MOVQ  DX, R14
8304	MOVQ  (BP), AX
8305	MULQ  R11
8306	IMULQ R12, R15
8307	ADDQ  AX, R14
8308	ADCQ  DX, R15
8309	MOVQ  8(BP), AX
8310	MOVQ  AX, R8
8311	MULQ  R10
8312	ADDQ  AX, R14
8313	ADCQ  $0x00, DX
8314	MOVQ  DX, R10
8315	MOVQ  8(BP), AX
8316	MULQ  R11
8317	ADDQ  AX, R15
8318	ADCQ  $0x00, DX
8319	IMULQ R12, R8
8320	ADDQ  R10, R15
8321	ADCQ  DX, R8
8322	MOVQ  R13, R10
8323	MOVQ  R14, R11
8324	MOVQ  R15, R12
8325	ANDQ  $0x03, R12
8326	MOVQ  R15, R13
8327	ANDQ  $-4, R13
8328	MOVQ  R8, R14
8329	SHRQ  $0x02, R8, R15
8330	SHRQ  $0x02, R8
8331	ADDQ  R13, R10
8332	ADCQ  R14, R11
8333	ADCQ  $0x00, R12
8334	ADDQ  R15, R10
8335	ADCQ  R8, R11
8336	ADCQ  $0x00, R12
8337	SUBQ  $0x10, CX
8338	ADDQ  $0x10, DI
8339	JMP   sealAVX2SealHash
8340
// sealAVX2ShortSealLoop: encrypt-then-MAC the remaining data in 32-byte
// chunks using pre-generated keystream held in YMM registers.
// Register roles (established by the surrounding code): SI = plaintext src,
// DI = ciphertext dst, BX = bytes remaining; R10:R11:R12 = Poly1305
// accumulator limbs h0:h1:h2; (BP) and 8(BP) = clamped Poly1305 key r0:r1.
// Y0 holds the next 32 bytes of keystream; after consuming it, the queued
// keystream registers are shifted down one slot ("Shift stream left").
sealAVX2ShortSealLoop:
	CMPQ BX, $0x20
	JB   sealAVX2ShortTail32
	SUBQ $0x20, BX

	// Load for encryption
	VPXOR   (SI), Y0, Y0
	VMOVDQU Y0, (DI)
	LEAQ    32(SI), SI

	// Now can hash
	// Poly1305 block 1: acc += ciphertext[0:16] (+ 2^128 pad bit), then
	// acc *= r and partially reduce mod 2^130-5 (MULX/ADCX-free variant).
	ADDQ  (DI), R10
	ADCQ  8(DI), R11
	ADCQ  $0x01, R12
	MOVQ  (BP), DX
	MOVQ  DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), DX
	MULXQ R10, R10, AX
	ADDQ  R10, R14
	MULXQ R11, R11, R8
	ADCQ  R11, R15
	ADCQ  $0x00, R8
	IMULQ R12, DX
	ADDQ  AX, R15
	ADCQ  DX, R8
	// Reduction: fold the limbs above 2^130 back in as (x>>2)*5 = x + (x>>2)*4.
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12
	// Poly1305 block 2: same multiply/reduce for ciphertext[16:32].
	ADDQ  16(DI), R10
	ADCQ  24(DI), R11
	ADCQ  $0x01, R12
	MOVQ  (BP), DX
	MOVQ  DX, R15
	MULXQ R10, R13, R14
	IMULQ R12, R15
	MULXQ R11, AX, DX
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), DX
	MULXQ R10, R10, AX
	ADDQ  R10, R14
	MULXQ R11, R11, R8
	ADCQ  R11, R15
	ADCQ  $0x00, R8
	IMULQ R12, DX
	ADDQ  AX, R15
	ADCQ  DX, R8
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12
	LEAQ  32(DI), DI

	// Shift stream left
	// Rotate the queue of remaining keystream registers so Y0 again holds
	// the next 32 bytes for the following iteration.
	VMOVDQA Y14, Y0
	VMOVDQA Y12, Y14
	VMOVDQA Y4, Y12
	VMOVDQA Y5, Y4
	VMOVDQA Y9, Y5
	VMOVDQA Y13, Y9
	VMOVDQA Y1, Y13
	VMOVDQA Y6, Y1
	VMOVDQA Y10, Y6
	JMP     sealAVX2ShortSealLoop
8433
// sealAVX2ShortTail32: fewer than 32 bytes remain. If at least 16 remain,
// encrypt one 16-byte block with the low lane of Y0 (X0) and hash it, then
// expose the high 128 bits of Y0 as the keystream for any final partial block.
sealAVX2ShortTail32:
	CMPQ    BX, $0x10
	VMOVDQA X0, X1
	JB      sealAVX2ShortDone
	SUBQ    $0x10, BX

	// Load for encryption
	VPXOR   (SI), X0, X12
	VMOVDQU X12, (DI)
	LEAQ    16(SI), SI

	// Hash
	// Poly1305: absorb the 16 ciphertext bytes just written and multiply
	// the accumulator by r, reducing mod 2^130-5.
	ADDQ       (DI), R10
	ADCQ       8(DI), R11
	ADCQ       $0x01, R12
	MOVQ       (BP), DX
	MOVQ       DX, R15
	MULXQ      R10, R13, R14
	IMULQ      R12, R15
	MULXQ      R11, AX, DX
	ADDQ       AX, R14
	ADCQ       DX, R15
	MOVQ       8(BP), DX
	MULXQ      R10, R10, AX
	ADDQ       R10, R14
	MULXQ      R11, R11, R8
	ADCQ       R11, R15
	ADCQ       $0x00, R8
	IMULQ      R12, DX
	ADDQ       AX, R15
	ADCQ       DX, R8
	MOVQ       R13, R10
	MOVQ       R14, R11
	MOVQ       R15, R12
	ANDQ       $0x03, R12
	MOVQ       R15, R13
	ANDQ       $-4, R13
	MOVQ       R8, R14
	SHRQ       $0x02, R8, R15
	SHRQ       $0x02, R8
	ADDQ       R13, R10
	ADCQ       R14, R11
	ADCQ       $0x00, R12
	ADDQ       R15, R10
	ADCQ       R8, R11
	ADCQ       $0x00, R12
	LEAQ       16(DI), DI
	// Move the upper 128 bits of Y0 into the low lane so X1 holds the next
	// (and last) 16 bytes of keystream.
	VPERM2I128 $0x11, Y0, Y0, Y0
	VMOVDQA    X0, X1
8483
// sealAVX2ShortDone: leave AVX2 state clean (VZEROUPPER avoids SSE/AVX
// transition penalties) and fall back to the SSE tail path, which handles
// the final sub-16-byte block (keystream in X1) and the tag.
sealAVX2ShortDone:
	VZEROUPPER
	JMP sealSSETail
8487
// seal320AVX2: generate up to 320 bytes of keystream by running three
// 2-block ChaCha20 states (rows in Y0/Y14/Y12/Y4, Y5/Y9/Y13/Y1, Y6/Y10/Y8/Y2)
// with consecutive counters. The first 32 bytes become the Poly1305 key.
seal320AVX2:
	VMOVDQA Y0, Y5
	VMOVDQA Y14, Y9
	VMOVDQA Y12, Y13
	VPADDD  avx2IncMask<>+0(SB), Y4, Y1
	VMOVDQA Y0, Y6
	VMOVDQA Y14, Y10
	VMOVDQA Y12, Y8
	VPADDD  avx2IncMask<>+0(SB), Y1, Y2
	// Save the pre-round second/third/fourth rows (and counter) so they can
	// be re-added after the rounds to finish the ChaCha20 block function.
	VMOVDQA Y14, Y7
	VMOVDQA Y12, Y11
	VMOVDQA Y4, Y15
	MOVQ    $0x0000000a, R9

// 10 iterations of a double round (column round + diagonal round) = 20 rounds.
sealAVX2320InnerCipherLoop:
	// Column quarter-rounds for state 0 (Y0/Y14/Y12/Y4):
	// a+=b; d^=a; d<<<=16; c+=d; b^=c; b<<<=12; a+=b; d^=a; d<<<=8; c+=d; b^=c; b<<<=7.
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  rol16<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x0c, Y14, Y3
	VPSRLD   $0x14, Y14, Y14
	VPXOR    Y3, Y14, Y14
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  rol8<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x07, Y14, Y3
	VPSRLD   $0x19, Y14, Y14
	VPXOR    Y3, Y14, Y14
	// Same quarter-rounds for state 1 (Y5/Y9/Y13/Y1).
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  rol16<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x0c, Y9, Y3
	VPSRLD   $0x14, Y9, Y9
	VPXOR    Y3, Y9, Y9
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  rol8<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x07, Y9, Y3
	VPSRLD   $0x19, Y9, Y9
	VPXOR    Y3, Y9, Y9
	// Same quarter-rounds for state 2 (Y6/Y10/Y8/Y2).
	VPADDD   Y10, Y6, Y6
	VPXOR    Y6, Y2, Y2
	VPSHUFB  rol16<>+0(SB), Y2, Y2
	VPADDD   Y2, Y8, Y8
	VPXOR    Y8, Y10, Y10
	VPSLLD   $0x0c, Y10, Y3
	VPSRLD   $0x14, Y10, Y10
	VPXOR    Y3, Y10, Y10
	VPADDD   Y10, Y6, Y6
	VPXOR    Y6, Y2, Y2
	VPSHUFB  rol8<>+0(SB), Y2, Y2
	VPADDD   Y2, Y8, Y8
	VPXOR    Y8, Y10, Y10
	VPSLLD   $0x07, Y10, Y3
	VPSRLD   $0x19, Y10, Y10
	VPXOR    Y3, Y10, Y10
	// Diagonalize: rotate rows so the next quarter-rounds act on diagonals.
	VPALIGNR $0x04, Y14, Y14, Y14
	VPALIGNR $0x04, Y9, Y9, Y9
	VPALIGNR $0x04, Y10, Y10, Y10
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x0c, Y4, Y4, Y4
	VPALIGNR $0x0c, Y1, Y1, Y1
	VPALIGNR $0x0c, Y2, Y2, Y2
	// Diagonal quarter-rounds, state 0.
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  rol16<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x0c, Y14, Y3
	VPSRLD   $0x14, Y14, Y14
	VPXOR    Y3, Y14, Y14
	VPADDD   Y14, Y0, Y0
	VPXOR    Y0, Y4, Y4
	VPSHUFB  rol8<>+0(SB), Y4, Y4
	VPADDD   Y4, Y12, Y12
	VPXOR    Y12, Y14, Y14
	VPSLLD   $0x07, Y14, Y3
	VPSRLD   $0x19, Y14, Y14
	VPXOR    Y3, Y14, Y14
	// Diagonal quarter-rounds, state 1.
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  rol16<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x0c, Y9, Y3
	VPSRLD   $0x14, Y9, Y9
	VPXOR    Y3, Y9, Y9
	VPADDD   Y9, Y5, Y5
	VPXOR    Y5, Y1, Y1
	VPSHUFB  rol8<>+0(SB), Y1, Y1
	VPADDD   Y1, Y13, Y13
	VPXOR    Y13, Y9, Y9
	VPSLLD   $0x07, Y9, Y3
	VPSRLD   $0x19, Y9, Y9
	VPXOR    Y3, Y9, Y9
	// Diagonal quarter-rounds, state 2.
	VPADDD   Y10, Y6, Y6
	VPXOR    Y6, Y2, Y2
	VPSHUFB  rol16<>+0(SB), Y2, Y2
	VPADDD   Y2, Y8, Y8
	VPXOR    Y8, Y10, Y10
	VPSLLD   $0x0c, Y10, Y3
	VPSRLD   $0x14, Y10, Y10
	VPXOR    Y3, Y10, Y10
	VPADDD   Y10, Y6, Y6
	VPXOR    Y6, Y2, Y2
	VPSHUFB  rol8<>+0(SB), Y2, Y2
	VPADDD   Y2, Y8, Y8
	VPXOR    Y8, Y10, Y10
	VPSLLD   $0x07, Y10, Y3
	VPSRLD   $0x19, Y10, Y10
	VPXOR    Y3, Y10, Y10
	// Un-diagonalize: inverse rotations restore column order.
	VPALIGNR $0x0c, Y14, Y14, Y14
	VPALIGNR $0x0c, Y9, Y9, Y9
	VPALIGNR $0x0c, Y10, Y10, Y10
	VPALIGNR $0x08, Y12, Y12, Y12
	VPALIGNR $0x08, Y13, Y13, Y13
	VPALIGNR $0x08, Y8, Y8, Y8
	VPALIGNR $0x04, Y4, Y4, Y4
	VPALIGNR $0x04, Y1, Y1, Y1
	VPALIGNR $0x04, Y2, Y2, Y2
	DECQ     R9
	JNE      sealAVX2320InnerCipherLoop
	// Feed-forward: add the initial state back into each register to obtain
	// the final keystream blocks (counters advance via Y15 + inc mask).
	VMOVDQA  chacha20Constants<>+0(SB), Y3
	VPADDD   Y3, Y0, Y0
	VPADDD   Y3, Y5, Y5
	VPADDD   Y3, Y6, Y6
	VPADDD   Y7, Y14, Y14
	VPADDD   Y7, Y9, Y9
	VPADDD   Y7, Y10, Y10
	VPADDD   Y11, Y12, Y12
	VPADDD   Y11, Y13, Y13
	VPADDD   Y11, Y8, Y8
	VMOVDQA  avx2IncMask<>+0(SB), Y3
	VPADDD   Y15, Y4, Y4
	VPADDD   Y3, Y15, Y15
	VPADDD   Y15, Y1, Y1
	VPADDD   Y3, Y15, Y15
	VPADDD   Y15, Y2, Y2

	// Clamp and store poly key
	// The first 32 bytes of keystream, clamped per Poly1305, become r‖s at (BP).
	VPERM2I128 $0x02, Y0, Y14, Y3
	VPAND      polyClampMask<>+0(SB), Y3, Y3
	VMOVDQA    Y3, (BP)

	// Stream for up to 320 bytes
	// Interleave the 128-bit lanes so Y0,Y14,Y12,Y4,... hold contiguous
	// 32-byte keystream chunks in output order.
	VPERM2I128 $0x13, Y0, Y14, Y0
	VPERM2I128 $0x13, Y12, Y4, Y14
	VPERM2I128 $0x02, Y5, Y9, Y12
	VPERM2I128 $0x02, Y13, Y1, Y4
	VPERM2I128 $0x13, Y5, Y9, Y5
	VPERM2I128 $0x13, Y13, Y1, Y9
	VPERM2I128 $0x02, Y6, Y10, Y13
	VPERM2I128 $0x02, Y8, Y2, Y1
	VPERM2I128 $0x13, Y6, Y10, Y6
	VPERM2I128 $0x13, Y8, Y2, Y10
	JMP        sealAVX2ShortSeal
8653
// sealAVX2Tail128: seal a tail of at most 128 bytes. Runs one 2-block
// ChaCha20 state (Y0/Y14/Y12/Y4, working key material at 32(BP)/64(BP),
// counter at 192(BP)) while continuing to hash already-written ciphertext.
// CX counts extra 16-byte hash blocks to absorb before the rounds; R9 is
// the remaining round-pair count — presumably set by the dispatcher before
// jumping here (TODO confirm against the jump site).
sealAVX2Tail128:
	VMOVDQA chacha20Constants<>+0(SB), Y0
	VMOVDQA 32(BP), Y14
	VMOVDQA 64(BP), Y12
	VMOVDQA 192(BP), Y4
	VPADDD  avx2IncMask<>+0(SB), Y4, Y4
	VMOVDQA Y4, Y1

// Hash-only iteration: absorb one 16-byte ciphertext block (Poly1305
// multiply by r with reduction mod 2^130-5, MULQ-based variant).
sealAVX2Tail128LoopA:
	ADDQ  (DI), R10
	ADCQ  8(DI), R11
	ADCQ  $0x01, R12
	MOVQ  (BP), AX
	MOVQ  AX, R15
	MULQ  R10
	MOVQ  AX, R13
	MOVQ  DX, R14
	MOVQ  (BP), AX
	MULQ  R11
	IMULQ R12, R15
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), AX
	MOVQ  AX, R8
	MULQ  R10
	ADDQ  AX, R14
	ADCQ  $0x00, DX
	MOVQ  DX, R10
	MOVQ  8(BP), AX
	MULQ  R11
	ADDQ  AX, R15
	ADCQ  $0x00, DX
	IMULQ R12, R8
	ADDQ  R10, R15
	ADCQ  DX, R8
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12
	LEAQ  16(DI), DI

// One ChaCha20 double round (column + diagonal) interleaved with two
// Poly1305 blocks over ciphertext at (DI) and 16(DI).
sealAVX2Tail128LoopB:
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    rol16<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x0c, Y14, Y3
	VPSRLD     $0x14, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    rol8<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x07, Y14, Y3
	VPSRLD     $0x19, Y14, Y14
	VPXOR      Y3, Y14, Y14
	// Poly1305 block over ciphertext[0:16].
	ADDQ       (DI), R10
	ADCQ       8(DI), R11
	ADCQ       $0x01, R12
	MOVQ       (BP), AX
	MOVQ       AX, R15
	MULQ       R10
	MOVQ       AX, R13
	MOVQ       DX, R14
	MOVQ       (BP), AX
	MULQ       R11
	IMULQ      R12, R15
	ADDQ       AX, R14
	ADCQ       DX, R15
	MOVQ       8(BP), AX
	MOVQ       AX, R8
	MULQ       R10
	ADDQ       AX, R14
	ADCQ       $0x00, DX
	MOVQ       DX, R10
	MOVQ       8(BP), AX
	MULQ       R11
	ADDQ       AX, R15
	ADCQ       $0x00, DX
	IMULQ      R12, R8
	ADDQ       R10, R15
	ADCQ       DX, R8
	MOVQ       R13, R10
	MOVQ       R14, R11
	MOVQ       R15, R12
	ANDQ       $0x03, R12
	MOVQ       R15, R13
	ANDQ       $-4, R13
	MOVQ       R8, R14
	SHRQ       $0x02, R8, R15
	SHRQ       $0x02, R8
	ADDQ       R13, R10
	ADCQ       R14, R11
	ADCQ       $0x00, R12
	ADDQ       R15, R10
	ADCQ       R8, R11
	ADCQ       $0x00, R12
	// Diagonalize, run the diagonal quarter-rounds, then restore.
	VPALIGNR   $0x04, Y14, Y14, Y14
	VPALIGNR   $0x08, Y12, Y12, Y12
	VPALIGNR   $0x0c, Y4, Y4, Y4
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    rol16<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x0c, Y14, Y3
	VPSRLD     $0x14, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    rol8<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x07, Y14, Y3
	VPSRLD     $0x19, Y14, Y14
	VPXOR      Y3, Y14, Y14
	// Poly1305 block over ciphertext[16:32].
	ADDQ       16(DI), R10
	ADCQ       24(DI), R11
	ADCQ       $0x01, R12
	MOVQ       (BP), AX
	MOVQ       AX, R15
	MULQ       R10
	MOVQ       AX, R13
	MOVQ       DX, R14
	MOVQ       (BP), AX
	MULQ       R11
	IMULQ      R12, R15
	ADDQ       AX, R14
	ADCQ       DX, R15
	MOVQ       8(BP), AX
	MOVQ       AX, R8
	MULQ       R10
	ADDQ       AX, R14
	ADCQ       $0x00, DX
	MOVQ       DX, R10
	MOVQ       8(BP), AX
	MULQ       R11
	ADDQ       AX, R15
	ADCQ       $0x00, DX
	IMULQ      R12, R8
	ADDQ       R10, R15
	ADCQ       DX, R8
	MOVQ       R13, R10
	MOVQ       R14, R11
	MOVQ       R15, R12
	ANDQ       $0x03, R12
	MOVQ       R15, R13
	ANDQ       $-4, R13
	MOVQ       R8, R14
	SHRQ       $0x02, R8, R15
	SHRQ       $0x02, R8
	ADDQ       R13, R10
	ADCQ       R14, R11
	ADCQ       $0x00, R12
	ADDQ       R15, R10
	ADCQ       R8, R11
	ADCQ       $0x00, R12
	LEAQ       32(DI), DI
	VPALIGNR   $0x0c, Y14, Y14, Y14
	VPALIGNR   $0x08, Y12, Y12, Y12
	VPALIGNR   $0x04, Y4, Y4, Y4
	DECQ       CX
	JG         sealAVX2Tail128LoopA
	DECQ       R9
	JGE        sealAVX2Tail128LoopB
	// Feed-forward and de-interleave lanes into output-order keystream
	// (Y0, Y14, Y12, Y4), then seal the remaining bytes via the short loop.
	VPADDD     chacha20Constants<>+0(SB), Y0, Y5
	VPADDD     32(BP), Y14, Y9
	VPADDD     64(BP), Y12, Y13
	VPADDD     Y1, Y4, Y1
	VPERM2I128 $0x02, Y5, Y9, Y0
	VPERM2I128 $0x02, Y13, Y1, Y14
	VPERM2I128 $0x13, Y5, Y9, Y12
	VPERM2I128 $0x13, Y13, Y1, Y4
	JMP        sealAVX2ShortSealLoop
8841
// sealAVX2Tail256: seal a tail of at most 256 bytes using two 2-block
// ChaCha20 states (Y0.. and Y5..) with consecutive counters (Y7/Y11 keep
// the pre-round counters for the feed-forward). Structure mirrors
// sealAVX2Tail128: LoopA absorbs extra hash blocks, LoopB interleaves
// double rounds with Poly1305 over previously written ciphertext.
sealAVX2Tail256:
	VMOVDQA chacha20Constants<>+0(SB), Y0
	VMOVDQA chacha20Constants<>+0(SB), Y5
	VMOVDQA 32(BP), Y14
	VMOVDQA 32(BP), Y9
	VMOVDQA 64(BP), Y12
	VMOVDQA 64(BP), Y13
	VMOVDQA 192(BP), Y4
	VPADDD  avx2IncMask<>+0(SB), Y4, Y4
	VPADDD  avx2IncMask<>+0(SB), Y4, Y1
	VMOVDQA Y4, Y7
	VMOVDQA Y1, Y11

// Hash-only iteration: one Poly1305 block (MULQ-based multiply by r,
// partial reduction mod 2^130-5).
sealAVX2Tail256LoopA:
	ADDQ  (DI), R10
	ADCQ  8(DI), R11
	ADCQ  $0x01, R12
	MOVQ  (BP), AX
	MOVQ  AX, R15
	MULQ  R10
	MOVQ  AX, R13
	MOVQ  DX, R14
	MOVQ  (BP), AX
	MULQ  R11
	IMULQ R12, R15
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), AX
	MOVQ  AX, R8
	MULQ  R10
	ADDQ  AX, R14
	ADCQ  $0x00, DX
	MOVQ  DX, R10
	MOVQ  8(BP), AX
	MULQ  R11
	ADDQ  AX, R15
	ADCQ  $0x00, DX
	IMULQ R12, R8
	ADDQ  R10, R15
	ADCQ  DX, R8
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12
	LEAQ  16(DI), DI

// One double round over both states, interleaved with two Poly1305 blocks.
sealAVX2Tail256LoopB:
	// Column quarter-rounds, state 0 then state 1.
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    rol16<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x0c, Y14, Y3
	VPSRLD     $0x14, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    rol8<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x07, Y14, Y3
	VPSRLD     $0x19, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    rol16<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x0c, Y9, Y3
	VPSRLD     $0x14, Y9, Y9
	VPXOR      Y3, Y9, Y9
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    rol8<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x07, Y9, Y3
	VPSRLD     $0x19, Y9, Y9
	VPXOR      Y3, Y9, Y9
	// Poly1305 block over ciphertext[0:16].
	ADDQ       (DI), R10
	ADCQ       8(DI), R11
	ADCQ       $0x01, R12
	MOVQ       (BP), AX
	MOVQ       AX, R15
	MULQ       R10
	MOVQ       AX, R13
	MOVQ       DX, R14
	MOVQ       (BP), AX
	MULQ       R11
	IMULQ      R12, R15
	ADDQ       AX, R14
	ADCQ       DX, R15
	MOVQ       8(BP), AX
	MOVQ       AX, R8
	MULQ       R10
	ADDQ       AX, R14
	ADCQ       $0x00, DX
	MOVQ       DX, R10
	MOVQ       8(BP), AX
	MULQ       R11
	ADDQ       AX, R15
	ADCQ       $0x00, DX
	IMULQ      R12, R8
	ADDQ       R10, R15
	ADCQ       DX, R8
	MOVQ       R13, R10
	MOVQ       R14, R11
	MOVQ       R15, R12
	ANDQ       $0x03, R12
	MOVQ       R15, R13
	ANDQ       $-4, R13
	MOVQ       R8, R14
	SHRQ       $0x02, R8, R15
	SHRQ       $0x02, R8
	ADDQ       R13, R10
	ADCQ       R14, R11
	ADCQ       $0x00, R12
	ADDQ       R15, R10
	ADCQ       R8, R11
	ADCQ       $0x00, R12
	// Diagonalize both states.
	VPALIGNR   $0x04, Y14, Y14, Y14
	VPALIGNR   $0x04, Y9, Y9, Y9
	VPALIGNR   $0x08, Y12, Y12, Y12
	VPALIGNR   $0x08, Y13, Y13, Y13
	VPALIGNR   $0x0c, Y4, Y4, Y4
	VPALIGNR   $0x0c, Y1, Y1, Y1
	// Diagonal quarter-rounds, state 0 then state 1.
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    rol16<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x0c, Y14, Y3
	VPSRLD     $0x14, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    rol8<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x07, Y14, Y3
	VPSRLD     $0x19, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    rol16<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x0c, Y9, Y3
	VPSRLD     $0x14, Y9, Y9
	VPXOR      Y3, Y9, Y9
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    rol8<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x07, Y9, Y3
	VPSRLD     $0x19, Y9, Y9
	VPXOR      Y3, Y9, Y9
	// Poly1305 block over ciphertext[16:32].
	ADDQ       16(DI), R10
	ADCQ       24(DI), R11
	ADCQ       $0x01, R12
	MOVQ       (BP), AX
	MOVQ       AX, R15
	MULQ       R10
	MOVQ       AX, R13
	MOVQ       DX, R14
	MOVQ       (BP), AX
	MULQ       R11
	IMULQ      R12, R15
	ADDQ       AX, R14
	ADCQ       DX, R15
	MOVQ       8(BP), AX
	MOVQ       AX, R8
	MULQ       R10
	ADDQ       AX, R14
	ADCQ       $0x00, DX
	MOVQ       DX, R10
	MOVQ       8(BP), AX
	MULQ       R11
	ADDQ       AX, R15
	ADCQ       $0x00, DX
	IMULQ      R12, R8
	ADDQ       R10, R15
	ADCQ       DX, R8
	MOVQ       R13, R10
	MOVQ       R14, R11
	MOVQ       R15, R12
	ANDQ       $0x03, R12
	MOVQ       R15, R13
	ANDQ       $-4, R13
	MOVQ       R8, R14
	SHRQ       $0x02, R8, R15
	SHRQ       $0x02, R8
	ADDQ       R13, R10
	ADCQ       R14, R11
	ADCQ       $0x00, R12
	ADDQ       R15, R10
	ADCQ       R8, R11
	ADCQ       $0x00, R12
	LEAQ       32(DI), DI
	// Restore column order in both states.
	VPALIGNR   $0x0c, Y14, Y14, Y14
	VPALIGNR   $0x0c, Y9, Y9, Y9
	VPALIGNR   $0x08, Y12, Y12, Y12
	VPALIGNR   $0x08, Y13, Y13, Y13
	VPALIGNR   $0x04, Y4, Y4, Y4
	VPALIGNR   $0x04, Y1, Y1, Y1
	DECQ       CX
	JG         sealAVX2Tail256LoopA
	DECQ       R9
	JGE        sealAVX2Tail256LoopB
	// Feed-forward both states, encrypt the first 128 bytes here, and hand
	// the remaining keystream (Y0/Y14/Y12/Y4) to sealAVX2SealHash with
	// CX = 128 bytes of fresh ciphertext left to hash.
	VPADDD     chacha20Constants<>+0(SB), Y0, Y0
	VPADDD     chacha20Constants<>+0(SB), Y5, Y5
	VPADDD     32(BP), Y14, Y14
	VPADDD     32(BP), Y9, Y9
	VPADDD     64(BP), Y12, Y12
	VPADDD     64(BP), Y13, Y13
	VPADDD     Y7, Y4, Y4
	VPADDD     Y11, Y1, Y1
	VPERM2I128 $0x02, Y0, Y14, Y3
	VPERM2I128 $0x02, Y12, Y4, Y7
	VPERM2I128 $0x13, Y0, Y14, Y11
	VPERM2I128 $0x13, Y12, Y4, Y15
	VPXOR      (SI), Y3, Y3
	VPXOR      32(SI), Y7, Y7
	VPXOR      64(SI), Y11, Y11
	VPXOR      96(SI), Y15, Y15
	VMOVDQU    Y3, (DI)
	VMOVDQU    Y7, 32(DI)
	VMOVDQU    Y11, 64(DI)
	VMOVDQU    Y15, 96(DI)
	MOVQ       $0x00000080, CX
	LEAQ       128(SI), SI
	SUBQ       $0x80, BX
	VPERM2I128 $0x02, Y5, Y9, Y0
	VPERM2I128 $0x02, Y13, Y1, Y14
	VPERM2I128 $0x13, Y5, Y9, Y12
	VPERM2I128 $0x13, Y13, Y1, Y4
	JMP        sealAVX2SealHash
9091
// sealAVX2Tail384: seal a tail of at most 384 bytes using three 2-block
// ChaCha20 states with consecutive counters (pre-round counters saved in
// Y7/Y11/Y15 for the feed-forward). Same LoopA/LoopB structure as the
// 128/256-byte tails, extended to a third state (Y6/Y10/Y8/Y2).
sealAVX2Tail384:
	VMOVDQA chacha20Constants<>+0(SB), Y0
	VMOVDQA Y0, Y5
	VMOVDQA Y0, Y6
	VMOVDQA 32(BP), Y14
	VMOVDQA Y14, Y9
	VMOVDQA Y14, Y10
	VMOVDQA 64(BP), Y12
	VMOVDQA Y12, Y13
	VMOVDQA Y12, Y8
	VMOVDQA 192(BP), Y4
	VPADDD  avx2IncMask<>+0(SB), Y4, Y4
	VPADDD  avx2IncMask<>+0(SB), Y4, Y1
	VPADDD  avx2IncMask<>+0(SB), Y1, Y2
	VMOVDQA Y4, Y7
	VMOVDQA Y1, Y11
	VMOVDQA Y2, Y15

// Hash-only iteration: one Poly1305 block (MULQ-based).
sealAVX2Tail384LoopA:
	ADDQ  (DI), R10
	ADCQ  8(DI), R11
	ADCQ  $0x01, R12
	MOVQ  (BP), AX
	MOVQ  AX, R15
	MULQ  R10
	MOVQ  AX, R13
	MOVQ  DX, R14
	MOVQ  (BP), AX
	MULQ  R11
	IMULQ R12, R15
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), AX
	MOVQ  AX, R8
	MULQ  R10
	ADDQ  AX, R14
	ADCQ  $0x00, DX
	MOVQ  DX, R10
	MOVQ  8(BP), AX
	MULQ  R11
	ADDQ  AX, R15
	ADCQ  $0x00, DX
	IMULQ R12, R8
	ADDQ  R10, R15
	ADCQ  DX, R8
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12
	LEAQ  16(DI), DI

// One double round over all three states, interleaved with two Poly1305
// blocks over previously written ciphertext.
sealAVX2Tail384LoopB:
	// Column quarter-rounds, states 0, 1, 2.
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    rol16<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x0c, Y14, Y3
	VPSRLD     $0x14, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    rol8<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x07, Y14, Y3
	VPSRLD     $0x19, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    rol16<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x0c, Y9, Y3
	VPSRLD     $0x14, Y9, Y9
	VPXOR      Y3, Y9, Y9
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    rol8<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x07, Y9, Y3
	VPSRLD     $0x19, Y9, Y9
	VPXOR      Y3, Y9, Y9
	VPADDD     Y10, Y6, Y6
	VPXOR      Y6, Y2, Y2
	VPSHUFB    rol16<>+0(SB), Y2, Y2
	VPADDD     Y2, Y8, Y8
	VPXOR      Y8, Y10, Y10
	VPSLLD     $0x0c, Y10, Y3
	VPSRLD     $0x14, Y10, Y10
	VPXOR      Y3, Y10, Y10
	VPADDD     Y10, Y6, Y6
	VPXOR      Y6, Y2, Y2
	VPSHUFB    rol8<>+0(SB), Y2, Y2
	VPADDD     Y2, Y8, Y8
	VPXOR      Y8, Y10, Y10
	VPSLLD     $0x07, Y10, Y3
	VPSRLD     $0x19, Y10, Y10
	VPXOR      Y3, Y10, Y10
	// Poly1305 block over ciphertext[0:16].
	ADDQ       (DI), R10
	ADCQ       8(DI), R11
	ADCQ       $0x01, R12
	MOVQ       (BP), AX
	MOVQ       AX, R15
	MULQ       R10
	MOVQ       AX, R13
	MOVQ       DX, R14
	MOVQ       (BP), AX
	MULQ       R11
	IMULQ      R12, R15
	ADDQ       AX, R14
	ADCQ       DX, R15
	MOVQ       8(BP), AX
	MOVQ       AX, R8
	MULQ       R10
	ADDQ       AX, R14
	ADCQ       $0x00, DX
	MOVQ       DX, R10
	MOVQ       8(BP), AX
	MULQ       R11
	ADDQ       AX, R15
	ADCQ       $0x00, DX
	IMULQ      R12, R8
	ADDQ       R10, R15
	ADCQ       DX, R8
	MOVQ       R13, R10
	MOVQ       R14, R11
	MOVQ       R15, R12
	ANDQ       $0x03, R12
	MOVQ       R15, R13
	ANDQ       $-4, R13
	MOVQ       R8, R14
	SHRQ       $0x02, R8, R15
	SHRQ       $0x02, R8
	ADDQ       R13, R10
	ADCQ       R14, R11
	ADCQ       $0x00, R12
	ADDQ       R15, R10
	ADCQ       R8, R11
	ADCQ       $0x00, R12
	// Diagonalize all three states.
	VPALIGNR   $0x04, Y14, Y14, Y14
	VPALIGNR   $0x04, Y9, Y9, Y9
	VPALIGNR   $0x04, Y10, Y10, Y10
	VPALIGNR   $0x08, Y12, Y12, Y12
	VPALIGNR   $0x08, Y13, Y13, Y13
	VPALIGNR   $0x08, Y8, Y8, Y8
	VPALIGNR   $0x0c, Y4, Y4, Y4
	VPALIGNR   $0x0c, Y1, Y1, Y1
	VPALIGNR   $0x0c, Y2, Y2, Y2
	// Diagonal quarter-rounds, states 0, 1, 2.
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    rol16<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x0c, Y14, Y3
	VPSRLD     $0x14, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y14, Y0, Y0
	VPXOR      Y0, Y4, Y4
	VPSHUFB    rol8<>+0(SB), Y4, Y4
	VPADDD     Y4, Y12, Y12
	VPXOR      Y12, Y14, Y14
	VPSLLD     $0x07, Y14, Y3
	VPSRLD     $0x19, Y14, Y14
	VPXOR      Y3, Y14, Y14
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    rol16<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x0c, Y9, Y3
	VPSRLD     $0x14, Y9, Y9
	VPXOR      Y3, Y9, Y9
	VPADDD     Y9, Y5, Y5
	VPXOR      Y5, Y1, Y1
	VPSHUFB    rol8<>+0(SB), Y1, Y1
	VPADDD     Y1, Y13, Y13
	VPXOR      Y13, Y9, Y9
	VPSLLD     $0x07, Y9, Y3
	VPSRLD     $0x19, Y9, Y9
	VPXOR      Y3, Y9, Y9
	VPADDD     Y10, Y6, Y6
	VPXOR      Y6, Y2, Y2
	VPSHUFB    rol16<>+0(SB), Y2, Y2
	VPADDD     Y2, Y8, Y8
	VPXOR      Y8, Y10, Y10
	VPSLLD     $0x0c, Y10, Y3
	VPSRLD     $0x14, Y10, Y10
	VPXOR      Y3, Y10, Y10
	VPADDD     Y10, Y6, Y6
	VPXOR      Y6, Y2, Y2
	VPSHUFB    rol8<>+0(SB), Y2, Y2
	VPADDD     Y2, Y8, Y8
	VPXOR      Y8, Y10, Y10
	VPSLLD     $0x07, Y10, Y3
	VPSRLD     $0x19, Y10, Y10
	VPXOR      Y3, Y10, Y10
	// Poly1305 block over ciphertext[16:32].
	ADDQ       16(DI), R10
	ADCQ       24(DI), R11
	ADCQ       $0x01, R12
	MOVQ       (BP), AX
	MOVQ       AX, R15
	MULQ       R10
	MOVQ       AX, R13
	MOVQ       DX, R14
	MOVQ       (BP), AX
	MULQ       R11
	IMULQ      R12, R15
	ADDQ       AX, R14
	ADCQ       DX, R15
	MOVQ       8(BP), AX
	MOVQ       AX, R8
	MULQ       R10
	ADDQ       AX, R14
	ADCQ       $0x00, DX
	MOVQ       DX, R10
	MOVQ       8(BP), AX
	MULQ       R11
	ADDQ       AX, R15
	ADCQ       $0x00, DX
	IMULQ      R12, R8
	ADDQ       R10, R15
	ADCQ       DX, R8
	MOVQ       R13, R10
	MOVQ       R14, R11
	MOVQ       R15, R12
	ANDQ       $0x03, R12
	MOVQ       R15, R13
	ANDQ       $-4, R13
	MOVQ       R8, R14
	SHRQ       $0x02, R8, R15
	SHRQ       $0x02, R8
	ADDQ       R13, R10
	ADCQ       R14, R11
	ADCQ       $0x00, R12
	ADDQ       R15, R10
	ADCQ       R8, R11
	ADCQ       $0x00, R12
	LEAQ       32(DI), DI
	// Restore column order in all three states.
	VPALIGNR   $0x0c, Y14, Y14, Y14
	VPALIGNR   $0x0c, Y9, Y9, Y9
	VPALIGNR   $0x0c, Y10, Y10, Y10
	VPALIGNR   $0x08, Y12, Y12, Y12
	VPALIGNR   $0x08, Y13, Y13, Y13
	VPALIGNR   $0x08, Y8, Y8, Y8
	VPALIGNR   $0x04, Y4, Y4, Y4
	VPALIGNR   $0x04, Y1, Y1, Y1
	VPALIGNR   $0x04, Y2, Y2, Y2
	DECQ       CX
	JG         sealAVX2Tail384LoopA
	DECQ       R9
	JGE        sealAVX2Tail384LoopB
	// Feed-forward all three states, encrypt the first 256 bytes here, and
	// hand the last keystream block (Y0/Y14/Y12/Y4) to sealAVX2SealHash
	// with CX = 256 bytes of fresh ciphertext left to hash.
	VPADDD     chacha20Constants<>+0(SB), Y0, Y0
	VPADDD     chacha20Constants<>+0(SB), Y5, Y5
	VPADDD     chacha20Constants<>+0(SB), Y6, Y6
	VPADDD     32(BP), Y14, Y14
	VPADDD     32(BP), Y9, Y9
	VPADDD     32(BP), Y10, Y10
	VPADDD     64(BP), Y12, Y12
	VPADDD     64(BP), Y13, Y13
	VPADDD     64(BP), Y8, Y8
	VPADDD     Y7, Y4, Y4
	VPADDD     Y11, Y1, Y1
	VPADDD     Y15, Y2, Y2
	VPERM2I128 $0x02, Y0, Y14, Y3
	VPERM2I128 $0x02, Y12, Y4, Y7
	VPERM2I128 $0x13, Y0, Y14, Y11
	VPERM2I128 $0x13, Y12, Y4, Y15
	VPXOR      (SI), Y3, Y3
	VPXOR      32(SI), Y7, Y7
	VPXOR      64(SI), Y11, Y11
	VPXOR      96(SI), Y15, Y15
	VMOVDQU    Y3, (DI)
	VMOVDQU    Y7, 32(DI)
	VMOVDQU    Y11, 64(DI)
	VMOVDQU    Y15, 96(DI)
	VPERM2I128 $0x02, Y5, Y9, Y3
	VPERM2I128 $0x02, Y13, Y1, Y7
	VPERM2I128 $0x13, Y5, Y9, Y11
	VPERM2I128 $0x13, Y13, Y1, Y15
	VPXOR      128(SI), Y3, Y3
	VPXOR      160(SI), Y7, Y7
	VPXOR      192(SI), Y11, Y11
	VPXOR      224(SI), Y15, Y15
	VMOVDQU    Y3, 128(DI)
	VMOVDQU    Y7, 160(DI)
	VMOVDQU    Y11, 192(DI)
	VMOVDQU    Y15, 224(DI)
	MOVQ       $0x00000100, CX
	LEAQ       256(SI), SI
	SUBQ       $0x00000100, BX
	VPERM2I128 $0x02, Y6, Y10, Y0
	VPERM2I128 $0x02, Y8, Y2, Y14
	VPERM2I128 $0x13, Y6, Y10, Y12
	VPERM2I128 $0x13, Y8, Y2, Y4
	JMP        sealAVX2SealHash
9400
// sealAVX2Tail512: seal a tail of at most 512 bytes using four 2-block
// ChaCha20 states (Y0/Y5/Y6/Y7 rows etc.). With all 16 YMM registers in
// use, the four pre-round counters are spilled to the stack frame at
// 96..192(BP); 224(BP) is used later as scratch for Y15 during rounds.
sealAVX2Tail512:
	VMOVDQA chacha20Constants<>+0(SB), Y0
	VMOVDQA Y0, Y5
	VMOVDQA Y0, Y6
	VMOVDQA Y0, Y7
	VMOVDQA 32(BP), Y14
	VMOVDQA Y14, Y9
	VMOVDQA Y14, Y10
	VMOVDQA Y14, Y11
	VMOVDQA 64(BP), Y12
	VMOVDQA Y12, Y13
	VMOVDQA Y12, Y8
	VMOVDQA Y12, Y15
	VMOVDQA 192(BP), Y4
	VPADDD  avx2IncMask<>+0(SB), Y4, Y4
	VPADDD  avx2IncMask<>+0(SB), Y4, Y1
	VPADDD  avx2IncMask<>+0(SB), Y1, Y2
	VPADDD  avx2IncMask<>+0(SB), Y2, Y3
	VMOVDQA Y4, 96(BP)
	VMOVDQA Y1, 128(BP)
	VMOVDQA Y2, 160(BP)
	VMOVDQA Y3, 192(BP)

// Hash-only iteration: one Poly1305 block (acc += ciphertext + pad bit,
// acc *= r, partial reduction mod 2^130-5; MULQ-based).
sealAVX2Tail512LoopA:
	ADDQ  (DI), R10
	ADCQ  8(DI), R11
	ADCQ  $0x01, R12
	MOVQ  (BP), AX
	MOVQ  AX, R15
	MULQ  R10
	MOVQ  AX, R13
	MOVQ  DX, R14
	MOVQ  (BP), AX
	MULQ  R11
	IMULQ R12, R15
	ADDQ  AX, R14
	ADCQ  DX, R15
	MOVQ  8(BP), AX
	MOVQ  AX, R8
	MULQ  R10
	ADDQ  AX, R14
	ADCQ  $0x00, DX
	MOVQ  DX, R10
	MOVQ  8(BP), AX
	MULQ  R11
	ADDQ  AX, R15
	ADCQ  $0x00, DX
	IMULQ R12, R8
	ADDQ  R10, R15
	ADCQ  DX, R8
	MOVQ  R13, R10
	MOVQ  R14, R11
	MOVQ  R15, R12
	ANDQ  $0x03, R12
	MOVQ  R15, R13
	ANDQ  $-4, R13
	MOVQ  R8, R14
	SHRQ  $0x02, R8, R15
	SHRQ  $0x02, R8
	ADDQ  R13, R10
	ADCQ  R14, R11
	ADCQ  $0x00, R12
	ADDQ  R15, R10
	ADCQ  R8, R11
	ADCQ  $0x00, R12
	LEAQ  16(DI), DI
9467
9468sealAVX2Tail512LoopB:
9469	VPADDD     Y14, Y0, Y0
9470	VPADDD     Y9, Y5, Y5
9471	VPADDD     Y10, Y6, Y6
9472	VPADDD     Y11, Y7, Y7
9473	VPXOR      Y0, Y4, Y4
9474	VPXOR      Y5, Y1, Y1
9475	VPXOR      Y6, Y2, Y2
9476	VPXOR      Y7, Y3, Y3
9477	VPSHUFB    rol16<>+0(SB), Y4, Y4
9478	VPSHUFB    rol16<>+0(SB), Y1, Y1
9479	VPSHUFB    rol16<>+0(SB), Y2, Y2
9480	VPSHUFB    rol16<>+0(SB), Y3, Y3
9481	VPADDD     Y4, Y12, Y12
9482	VPADDD     Y1, Y13, Y13
9483	VPADDD     Y2, Y8, Y8
9484	VPADDD     Y3, Y15, Y15
9485	VPXOR      Y12, Y14, Y14
9486	VPXOR      Y13, Y9, Y9
9487	VPXOR      Y8, Y10, Y10
9488	VPXOR      Y15, Y11, Y11
9489	VMOVDQA    Y15, 224(BP)
9490	VPSLLD     $0x0c, Y14, Y15
9491	VPSRLD     $0x14, Y14, Y14
9492	VPXOR      Y15, Y14, Y14
9493	VPSLLD     $0x0c, Y9, Y15
9494	VPSRLD     $0x14, Y9, Y9
9495	VPXOR      Y15, Y9, Y9
9496	VPSLLD     $0x0c, Y10, Y15
9497	VPSRLD     $0x14, Y10, Y10
9498	VPXOR      Y15, Y10, Y10
9499	VPSLLD     $0x0c, Y11, Y15
9500	VPSRLD     $0x14, Y11, Y11
9501	VPXOR      Y15, Y11, Y11
9502	VMOVDQA    224(BP), Y15
9503	ADDQ       (DI), R10
9504	ADCQ       8(DI), R11
9505	ADCQ       $0x01, R12
9506	MOVQ       (BP), DX
9507	MOVQ       DX, R15
9508	MULXQ      R10, R13, R14
9509	IMULQ      R12, R15
9510	MULXQ      R11, AX, DX
9511	ADDQ       AX, R14
9512	ADCQ       DX, R15
9513	MOVQ       8(BP), DX
9514	MULXQ      R10, R10, AX
9515	ADDQ       R10, R14
9516	MULXQ      R11, R11, R8
9517	ADCQ       R11, R15
9518	ADCQ       $0x00, R8
9519	IMULQ      R12, DX
9520	ADDQ       AX, R15
9521	ADCQ       DX, R8
9522	MOVQ       R13, R10
9523	MOVQ       R14, R11
9524	MOVQ       R15, R12
9525	ANDQ       $0x03, R12
9526	MOVQ       R15, R13
9527	ANDQ       $-4, R13
9528	MOVQ       R8, R14
9529	SHRQ       $0x02, R8, R15
9530	SHRQ       $0x02, R8
9531	ADDQ       R13, R10
9532	ADCQ       R14, R11
9533	ADCQ       $0x00, R12
9534	ADDQ       R15, R10
9535	ADCQ       R8, R11
9536	ADCQ       $0x00, R12
9537	VPADDD     Y14, Y0, Y0
9538	VPADDD     Y9, Y5, Y5
9539	VPADDD     Y10, Y6, Y6
9540	VPADDD     Y11, Y7, Y7
9541	VPXOR      Y0, Y4, Y4
9542	VPXOR      Y5, Y1, Y1
9543	VPXOR      Y6, Y2, Y2
9544	VPXOR      Y7, Y3, Y3
9545	VPSHUFB    rol8<>+0(SB), Y4, Y4
9546	VPSHUFB    rol8<>+0(SB), Y1, Y1
9547	VPSHUFB    rol8<>+0(SB), Y2, Y2
9548	VPSHUFB    rol8<>+0(SB), Y3, Y3
9549	VPADDD     Y4, Y12, Y12
9550	VPADDD     Y1, Y13, Y13
9551	VPADDD     Y2, Y8, Y8
9552	VPADDD     Y3, Y15, Y15
9553	VPXOR      Y12, Y14, Y14
9554	VPXOR      Y13, Y9, Y9
9555	VPXOR      Y8, Y10, Y10
9556	VPXOR      Y15, Y11, Y11
9557	VMOVDQA    Y15, 224(BP)
9558	VPSLLD     $0x07, Y14, Y15
9559	VPSRLD     $0x19, Y14, Y14
9560	VPXOR      Y15, Y14, Y14
9561	VPSLLD     $0x07, Y9, Y15
9562	VPSRLD     $0x19, Y9, Y9
9563	VPXOR      Y15, Y9, Y9
9564	VPSLLD     $0x07, Y10, Y15
9565	VPSRLD     $0x19, Y10, Y10
9566	VPXOR      Y15, Y10, Y10
9567	VPSLLD     $0x07, Y11, Y15
9568	VPSRLD     $0x19, Y11, Y11
9569	VPXOR      Y15, Y11, Y11
9570	VMOVDQA    224(BP), Y15
9571	VPALIGNR   $0x04, Y14, Y14, Y14
9572	VPALIGNR   $0x04, Y9, Y9, Y9
9573	VPALIGNR   $0x04, Y10, Y10, Y10
9574	VPALIGNR   $0x04, Y11, Y11, Y11
9575	VPALIGNR   $0x08, Y12, Y12, Y12
9576	VPALIGNR   $0x08, Y13, Y13, Y13
9577	VPALIGNR   $0x08, Y8, Y8, Y8
9578	VPALIGNR   $0x08, Y15, Y15, Y15
9579	VPALIGNR   $0x0c, Y4, Y4, Y4
9580	VPALIGNR   $0x0c, Y1, Y1, Y1
9581	VPALIGNR   $0x0c, Y2, Y2, Y2
9582	VPALIGNR   $0x0c, Y3, Y3, Y3
9583	VPADDD     Y14, Y0, Y0
9584	VPADDD     Y9, Y5, Y5
9585	VPADDD     Y10, Y6, Y6
9586	VPADDD     Y11, Y7, Y7
9587	VPXOR      Y0, Y4, Y4
9588	VPXOR      Y5, Y1, Y1
9589	VPXOR      Y6, Y2, Y2
9590	VPXOR      Y7, Y3, Y3
9591	VPSHUFB    rol16<>+0(SB), Y4, Y4
9592	VPSHUFB    rol16<>+0(SB), Y1, Y1
9593	VPSHUFB    rol16<>+0(SB), Y2, Y2
9594	VPSHUFB    rol16<>+0(SB), Y3, Y3
9595	VPADDD     Y4, Y12, Y12
9596	VPADDD     Y1, Y13, Y13
9597	VPADDD     Y2, Y8, Y8
9598	VPADDD     Y3, Y15, Y15
9599	VPXOR      Y12, Y14, Y14
9600	VPXOR      Y13, Y9, Y9
9601	VPXOR      Y8, Y10, Y10
9602	VPXOR      Y15, Y11, Y11
9603	ADDQ       16(DI), R10
9604	ADCQ       24(DI), R11
9605	ADCQ       $0x01, R12
9606	MOVQ       (BP), DX
9607	MOVQ       DX, R15
9608	MULXQ      R10, R13, R14
9609	IMULQ      R12, R15
9610	MULXQ      R11, AX, DX
9611	ADDQ       AX, R14
9612	ADCQ       DX, R15
9613	MOVQ       8(BP), DX
9614	MULXQ      R10, R10, AX
9615	ADDQ       R10, R14
9616	MULXQ      R11, R11, R8
9617	ADCQ       R11, R15
9618	ADCQ       $0x00, R8
9619	IMULQ      R12, DX
9620	ADDQ       AX, R15
9621	ADCQ       DX, R8
9622	MOVQ       R13, R10
9623	MOVQ       R14, R11
9624	MOVQ       R15, R12
9625	ANDQ       $0x03, R12
9626	MOVQ       R15, R13
9627	ANDQ       $-4, R13
9628	MOVQ       R8, R14
9629	SHRQ       $0x02, R8, R15
9630	SHRQ       $0x02, R8
9631	ADDQ       R13, R10
9632	ADCQ       R14, R11
9633	ADCQ       $0x00, R12
9634	ADDQ       R15, R10
9635	ADCQ       R8, R11
9636	ADCQ       $0x00, R12
9637	LEAQ       32(DI), DI
9638	VMOVDQA    Y15, 224(BP)
9639	VPSLLD     $0x0c, Y14, Y15
9640	VPSRLD     $0x14, Y14, Y14
9641	VPXOR      Y15, Y14, Y14
9642	VPSLLD     $0x0c, Y9, Y15
9643	VPSRLD     $0x14, Y9, Y9
9644	VPXOR      Y15, Y9, Y9
9645	VPSLLD     $0x0c, Y10, Y15
9646	VPSRLD     $0x14, Y10, Y10
9647	VPXOR      Y15, Y10, Y10
9648	VPSLLD     $0x0c, Y11, Y15
9649	VPSRLD     $0x14, Y11, Y11
9650	VPXOR      Y15, Y11, Y11
9651	VMOVDQA    224(BP), Y15
9652	VPADDD     Y14, Y0, Y0
9653	VPADDD     Y9, Y5, Y5
9654	VPADDD     Y10, Y6, Y6
9655	VPADDD     Y11, Y7, Y7
9656	VPXOR      Y0, Y4, Y4
9657	VPXOR      Y5, Y1, Y1
9658	VPXOR      Y6, Y2, Y2
9659	VPXOR      Y7, Y3, Y3
9660	VPSHUFB    rol8<>+0(SB), Y4, Y4
9661	VPSHUFB    rol8<>+0(SB), Y1, Y1
9662	VPSHUFB    rol8<>+0(SB), Y2, Y2
9663	VPSHUFB    rol8<>+0(SB), Y3, Y3
9664	VPADDD     Y4, Y12, Y12
9665	VPADDD     Y1, Y13, Y13
9666	VPADDD     Y2, Y8, Y8
9667	VPADDD     Y3, Y15, Y15
9668	VPXOR      Y12, Y14, Y14
9669	VPXOR      Y13, Y9, Y9
9670	VPXOR      Y8, Y10, Y10
9671	VPXOR      Y15, Y11, Y11
9672	VMOVDQA    Y15, 224(BP)
9673	VPSLLD     $0x07, Y14, Y15
9674	VPSRLD     $0x19, Y14, Y14
9675	VPXOR      Y15, Y14, Y14
9676	VPSLLD     $0x07, Y9, Y15
9677	VPSRLD     $0x19, Y9, Y9
9678	VPXOR      Y15, Y9, Y9
9679	VPSLLD     $0x07, Y10, Y15
9680	VPSRLD     $0x19, Y10, Y10
9681	VPXOR      Y15, Y10, Y10
9682	VPSLLD     $0x07, Y11, Y15
9683	VPSRLD     $0x19, Y11, Y11
9684	VPXOR      Y15, Y11, Y11
9685	VMOVDQA    224(BP), Y15
9686	VPALIGNR   $0x0c, Y14, Y14, Y14
9687	VPALIGNR   $0x0c, Y9, Y9, Y9
9688	VPALIGNR   $0x0c, Y10, Y10, Y10
9689	VPALIGNR   $0x0c, Y11, Y11, Y11
9690	VPALIGNR   $0x08, Y12, Y12, Y12
9691	VPALIGNR   $0x08, Y13, Y13, Y13
9692	VPALIGNR   $0x08, Y8, Y8, Y8
9693	VPALIGNR   $0x08, Y15, Y15, Y15
9694	VPALIGNR   $0x04, Y4, Y4, Y4
9695	VPALIGNR   $0x04, Y1, Y1, Y1
9696	VPALIGNR   $0x04, Y2, Y2, Y2
9697	VPALIGNR   $0x04, Y3, Y3, Y3
9698	DECQ       CX
9699	JG         sealAVX2Tail512LoopA
9700	DECQ       R9
9701	JGE        sealAVX2Tail512LoopB
9702	VPADDD     chacha20Constants<>+0(SB), Y0, Y0
9703	VPADDD     chacha20Constants<>+0(SB), Y5, Y5
9704	VPADDD     chacha20Constants<>+0(SB), Y6, Y6
9705	VPADDD     chacha20Constants<>+0(SB), Y7, Y7
9706	VPADDD     32(BP), Y14, Y14
9707	VPADDD     32(BP), Y9, Y9
9708	VPADDD     32(BP), Y10, Y10
9709	VPADDD     32(BP), Y11, Y11
9710	VPADDD     64(BP), Y12, Y12
9711	VPADDD     64(BP), Y13, Y13
9712	VPADDD     64(BP), Y8, Y8
9713	VPADDD     64(BP), Y15, Y15
9714	VPADDD     96(BP), Y4, Y4
9715	VPADDD     128(BP), Y1, Y1
9716	VPADDD     160(BP), Y2, Y2
9717	VPADDD     192(BP), Y3, Y3
9718	VMOVDQA    Y15, 224(BP)
9719	VPERM2I128 $0x02, Y0, Y14, Y15
9720	VPXOR      (SI), Y15, Y15
9721	VMOVDQU    Y15, (DI)
9722	VPERM2I128 $0x02, Y12, Y4, Y15
9723	VPXOR      32(SI), Y15, Y15
9724	VMOVDQU    Y15, 32(DI)
9725	VPERM2I128 $0x13, Y0, Y14, Y15
9726	VPXOR      64(SI), Y15, Y15
9727	VMOVDQU    Y15, 64(DI)
9728	VPERM2I128 $0x13, Y12, Y4, Y15
9729	VPXOR      96(SI), Y15, Y15
9730	VMOVDQU    Y15, 96(DI)
9731	VPERM2I128 $0x02, Y5, Y9, Y0
9732	VPERM2I128 $0x02, Y13, Y1, Y14
9733	VPERM2I128 $0x13, Y5, Y9, Y12
9734	VPERM2I128 $0x13, Y13, Y1, Y4
9735	VPXOR      128(SI), Y0, Y0
9736	VPXOR      160(SI), Y14, Y14
9737	VPXOR      192(SI), Y12, Y12
9738	VPXOR      224(SI), Y4, Y4
9739	VMOVDQU    Y0, 128(DI)
9740	VMOVDQU    Y14, 160(DI)
9741	VMOVDQU    Y12, 192(DI)
9742	VMOVDQU    Y4, 224(DI)
9743	VPERM2I128 $0x02, Y6, Y10, Y0
9744	VPERM2I128 $0x02, Y8, Y2, Y14
9745	VPERM2I128 $0x13, Y6, Y10, Y12
9746	VPERM2I128 $0x13, Y8, Y2, Y4
9747	VPXOR      256(SI), Y0, Y0
9748	VPXOR      288(SI), Y14, Y14
9749	VPXOR      320(SI), Y12, Y12
9750	VPXOR      352(SI), Y4, Y4
9751	VMOVDQU    Y0, 256(DI)
9752	VMOVDQU    Y14, 288(DI)
9753	VMOVDQU    Y12, 320(DI)
9754	VMOVDQU    Y4, 352(DI)
9755	MOVQ       $0x00000180, CX
9756	LEAQ       384(SI), SI
9757	SUBQ       $0x00000180, BX
9758	VPERM2I128 $0x02, Y7, Y11, Y0
9759	VPERM2I128 $0x02, 224(BP), Y3, Y14
9760	VPERM2I128 $0x13, Y7, Y11, Y12
9761	VPERM2I128 $0x13, 224(BP), Y3, Y4
9762	JMP        sealAVX2SealHash