chacha_ppc64x.s

// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// Code for the perl script that generates the ppc64 assembler
// can be found in the cryptogams repository at the link below. It is based on
// the original from openssl.

// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91

// The differences in this and the original implementation are
// due to the calling conventions and initialization of constants.

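// On the Go side this routine is declared with a matching prototype,
// along these lines (a sketch: the declaring file and the use of
// go:noescape are assumptions, not part of this file):
//
//	//go:noescape
//	func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
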
//go:build gc && !purego && (ppc64 || ppc64le)

#include "textflag.h"

#define OUT  R3
#define INP  R4
#define LEN  R5
#define KEY  R6
#define CNT  R7
#define TMP  R15

#define CONSTBASE  R16
#define BLOCKS R17

// for VPERMXOR
#define MASK  R18
DATA consts<>+0x00(SB)/4, $0x61707865
DATA consts<>+0x04(SB)/4, $0x3320646e
DATA consts<>+0x08(SB)/4, $0x79622d32
DATA consts<>+0x0c(SB)/4, $0x6b206574
DATA consts<>+0x10(SB)/4, $0x00000001
DATA consts<>+0x14(SB)/4, $0x00000000
DATA consts<>+0x18(SB)/4, $0x00000000
DATA consts<>+0x1c(SB)/4, $0x00000000
DATA consts<>+0x20(SB)/4, $0x00000004
DATA consts<>+0x24(SB)/4, $0x00000000
DATA consts<>+0x28(SB)/4, $0x00000000
DATA consts<>+0x2c(SB)/4, $0x00000000
DATA consts<>+0x30(SB)/4, $0x0e0f0c0d
DATA consts<>+0x34(SB)/4, $0x0a0b0809
DATA consts<>+0x38(SB)/4, $0x06070405
DATA consts<>+0x3c(SB)/4, $0x02030001
DATA consts<>+0x40(SB)/4, $0x0d0e0f0c
DATA consts<>+0x44(SB)/4, $0x090a0b08
DATA consts<>+0x48(SB)/4, $0x05060704
DATA consts<>+0x4c(SB)/4, $0x01020300
DATA consts<>+0x50(SB)/4, $0x61707865
DATA consts<>+0x54(SB)/4, $0x61707865
DATA consts<>+0x58(SB)/4, $0x61707865
DATA consts<>+0x5c(SB)/4, $0x61707865
DATA consts<>+0x60(SB)/4, $0x3320646e
DATA consts<>+0x64(SB)/4, $0x3320646e
DATA consts<>+0x68(SB)/4, $0x3320646e
DATA consts<>+0x6c(SB)/4, $0x3320646e
DATA consts<>+0x70(SB)/4, $0x79622d32
DATA consts<>+0x74(SB)/4, $0x79622d32
DATA consts<>+0x78(SB)/4, $0x79622d32
DATA consts<>+0x7c(SB)/4, $0x79622d32
DATA consts<>+0x80(SB)/4, $0x6b206574
DATA consts<>+0x84(SB)/4, $0x6b206574
DATA consts<>+0x88(SB)/4, $0x6b206574
DATA consts<>+0x8c(SB)/4, $0x6b206574
DATA consts<>+0x90(SB)/4, $0x00000000
DATA consts<>+0x94(SB)/4, $0x00000001
DATA consts<>+0x98(SB)/4, $0x00000002
DATA consts<>+0x9c(SB)/4, $0x00000003
DATA consts<>+0xa0(SB)/4, $0x11223300
DATA consts<>+0xa4(SB)/4, $0x55667744
DATA consts<>+0xa8(SB)/4, $0x99aabb88
DATA consts<>+0xac(SB)/4, $0xddeeffcc
DATA consts<>+0xb0(SB)/4, $0x22330011
DATA consts<>+0xb4(SB)/4, $0x66774455
DATA consts<>+0xb8(SB)/4, $0xaabb8899
DATA consts<>+0xbc(SB)/4, $0xeeffccdd
GLOBL consts<>(SB), RODATA, $0xc0
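
// Layout of the consts table above (offsets into consts<>):
//   0x00-0x0f  ChaCha sigma constant "expand 32-byte k" as four LE words
//   0x10-0x1f  {1, 0, 0, 0}: block counter increment of one
//   0x20-0x2f  {4, 0, 0, 0}: block counter increment of four
//   0x30-0x4f  byte-shuffle patterns (apparently VPERM-based rotates kept
//              from the original CRYPTOGAMS code)
//   0x50-0x8f  each sigma word splatted across all four lanes, for the
//              four-blocks-at-a-time state
//   0x90-0x9f  {0, 1, 2, 3}: per-lane block counter offsets
//   0xa0-0xaf  VPERMXOR mask realizing rotate-left by 8 (loaded into V20)
//   0xb0-0xbf  VPERMXOR mask realizing rotate-left by 16 (loaded into V21)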

#ifdef GOARCH_ppc64
#define BE_XXBRW_INIT() \
		LVSL (R0)(R0), V24 \
		VSPLTISB $3, V25 \
		VXOR V24, V25, V24

#define BE_XXBRW(vr) VPERM vr, vr, V24, vr
#else
#define BE_XXBRW_INIT()
#define BE_XXBRW(vr)
#endif
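
// On big-endian ppc64, LVSL/VSPLTISB/VXOR above build a permutation in
// V24 that reverses the bytes within each 32-bit word of a vector.
// BE_XXBRW is applied to the keystream before it is stored so the output
// bytes match the little-endian ChaCha20 stream. On ppc64le both macros
// expand to nothing.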

//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
	MOVD out+0(FP), OUT
	MOVD inp+8(FP), INP
	MOVD len+16(FP), LEN
	MOVD key+24(FP), KEY
	MOVD counter+32(FP), CNT

	// Addressing for constants
	MOVD $consts<>+0x00(SB), CONSTBASE
	MOVD $16, R8
	MOVD $32, R9
	MOVD $48, R10
	MOVD $64, R11
	SRD $6, LEN, BLOCKS
	// for VPERMXOR
	MOVD $consts<>+0xa0(SB), MASK
	MOVD $16, R20
	// V16
	LXVW4X (CONSTBASE)(R0), VS48
	ADD $80,CONSTBASE

	// Load key into V17,V18
	LXVW4X (KEY)(R0), VS49
	LXVW4X (KEY)(R8), VS50

	// Load CNT, NONCE into V19
	LXVW4X (CNT)(R0), VS51

	// Clear V27
	VXOR V27, V27, V27

	BE_XXBRW_INIT()

	// V28
	LXVW4X (CONSTBASE)(R11), VS60

	// Load mask constants for VPERMXOR
	LXVW4X (MASK)(R0), V20
	LXVW4X (MASK)(R20), V21

	// splat slot from V19 -> V26
	VSPLTW $0, V19, V26

	VSLDOI $4, V19, V27, V19
	VSLDOI $12, V27, V19, V19

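	// The VSLDOI pair above cleared the counter word out of V19,
	// leaving the nonce in words 1-3. V26 holds the counter splatted
	// to all four lanes; adding V28 = {0, 1, 2, 3} gives each of the
	// four parallel blocks its own counter value.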
	VADDUWM V26, V28, V26

	MOVD $10, R14
	MOVD R14, CTR
	PCALIGN $16
loop_outer_vsx:
	// V0, V1, V2, V3
	LXVW4X (R0)(CONSTBASE), VS32
	LXVW4X (R8)(CONSTBASE), VS33
	LXVW4X (R9)(CONSTBASE), VS34
	LXVW4X (R10)(CONSTBASE), VS35

	// splat values from V17, V18 into V4-V11
	VSPLTW $0, V17, V4
	VSPLTW $1, V17, V5
	VSPLTW $2, V17, V6
	VSPLTW $3, V17, V7
	VSPLTW $0, V18, V8
	VSPLTW $1, V18, V9
	VSPLTW $2, V18, V10
	VSPLTW $3, V18, V11

	// VOR
	VOR V26, V26, V12

	// splat values from V19 -> V13, V14, V15
	VSPLTW $1, V19, V13
	VSPLTW $2, V19, V14
	VSPLTW $3, V19, V15

	// splat const values
	VSPLTISW $-16, V27
	VSPLTISW $12, V28
	VSPLTISW $8, V29
	VSPLTISW $7, V30
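
	// The 16 state words of four parallel blocks live in V0-V15, one
	// word per register, four lanes per register. The loop below runs
	// CTR = 10 times, doing a column round and a diagonal round per
	// iteration (20 rounds total). Each quarter round is, in Go terms:
	//
	//	a += b; d ^= a; d = bits.RotateLeft32(d, 16)
	//	c += d; b ^= c; b = bits.RotateLeft32(b, 12)
	//	a += b; d ^= a; d = bits.RotateLeft32(d, 8)
	//	c += d; b ^= c; b = bits.RotateLeft32(b, 7)
	//
	// The 16- and 8-bit rotates are folded into the xor via VPERMXOR
	// with the V21 and V20 masks; the 12- and 7-bit rotates use VRLW
	// with the splatted shift counts in V28 and V30 (the $-16 and $8
	// splats in V27/V29 appear unused in this VPERMXOR-based variant).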
	PCALIGN $16
loop_vsx:
	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VPERMXOR V12, V0, V21, V12
	VPERMXOR V13, V1, V21, V13
	VPERMXOR V14, V2, V21, V14
	VPERMXOR V15, V3, V21, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V28, V4
	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7

	VADDUWM V0, V4, V0
	VADDUWM V1, V5, V1
	VADDUWM V2, V6, V2
	VADDUWM V3, V7, V3

	VPERMXOR V12, V0, V20, V12
	VPERMXOR V13, V1, V20, V13
	VPERMXOR V14, V2, V20, V14
	VPERMXOR V15, V3, V20, V15

	VADDUWM V8, V12, V8
	VADDUWM V9, V13, V9
	VADDUWM V10, V14, V10
	VADDUWM V11, V15, V11

	VXOR V4, V8, V4
	VXOR V5, V9, V5
	VXOR V6, V10, V6
	VXOR V7, V11, V7

	VRLW V4, V30, V4
	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7

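	// Diagonal round: the same quarter round applied to the diagonals
	// (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14) of the 4x4 state.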
	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VPERMXOR V15, V0, V21, V15
	VPERMXOR V12, V1, V21, V12
	VPERMXOR V13, V2, V21, V13
	VPERMXOR V14, V3, V21, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V28, V5
	VRLW V6, V28, V6
	VRLW V7, V28, V7
	VRLW V4, V28, V4

	VADDUWM V0, V5, V0
	VADDUWM V1, V6, V1
	VADDUWM V2, V7, V2
	VADDUWM V3, V4, V3

	VPERMXOR V15, V0, V20, V15
	VPERMXOR V12, V1, V20, V12
	VPERMXOR V13, V2, V20, V13
	VPERMXOR V14, V3, V20, V14

	VADDUWM V10, V15, V10
	VADDUWM V11, V12, V11
	VADDUWM V8, V13, V8
	VADDUWM V9, V14, V9

	VXOR V5, V10, V5
	VXOR V6, V11, V6
	VXOR V7, V8, V7
	VXOR V4, V9, V4

	VRLW V5, V30, V5
	VRLW V6, V30, V6
	VRLW V7, V30, V7
	VRLW V4, V30, V4
	BDNZ   loop_vsx

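	// Add the per-lane counters into the counter row (its share of the
	// feed-forward), then transpose: VMRGEW/VMRGOW interleave even/odd
	// words and XXPERMDI swaps doubleword halves, turning the
	// one-word-per-register layout back into four contiguous 64-byte
	// blocks held in (V0,V4,V8,V12), (V1,V5,V9,V13), (V2,V6,V10,V14)
	// and (V3,V7,V11,V15).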
	VADDUWM V12, V26, V12

	VMRGEW V0, V1, V27
	VMRGEW V2, V3, V28

	VMRGOW V0, V1, V0
	VMRGOW V2, V3, V2

	VMRGEW V4, V5, V29
	VMRGEW V6, V7, V30

	XXPERMDI VS32, VS34, $0, VS33
	XXPERMDI VS32, VS34, $3, VS35
	XXPERMDI VS59, VS60, $0, VS32
	XXPERMDI VS59, VS60, $3, VS34

	VMRGOW V4, V5, V4
	VMRGOW V6, V7, V6

	VMRGEW V8, V9, V27
	VMRGEW V10, V11, V28

	XXPERMDI VS36, VS38, $0, VS37
	XXPERMDI VS36, VS38, $3, VS39
	XXPERMDI VS61, VS62, $0, VS36
	XXPERMDI VS61, VS62, $3, VS38

	VMRGOW V8, V9, V8
	VMRGOW V10, V11, V10

	VMRGEW V12, V13, V29
	VMRGEW V14, V15, V30

	XXPERMDI VS40, VS42, $0, VS41
	XXPERMDI VS40, VS42, $3, VS43
	XXPERMDI VS59, VS60, $0, VS40
	XXPERMDI VS59, VS60, $3, VS42

	VMRGOW V12, V13, V12
	VMRGOW V14, V15, V14

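	// Advance the per-lane block counters by 4 for the next group of
	// four blocks.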
	VSPLTISW $4, V27
	VADDUWM V26, V27, V26

	XXPERMDI VS44, VS46, $0, VS45
	XXPERMDI VS44, VS46, $3, VS47
	XXPERMDI VS61, VS62, $0, VS44
	XXPERMDI VS61, VS62, $3, VS46

	VADDUWM V0, V16, V0
	VADDUWM V4, V17, V4
	VADDUWM V8, V18, V8
	VADDUWM V12, V19, V12

	BE_XXBRW(V0)
	BE_XXBRW(V4)
	BE_XXBRW(V8)
	BE_XXBRW(V12)

	CMPU LEN, $64
	BLT tail_vsx

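	// Output stage, unrolled four times: for each finished 64-byte
	// block, load 64 bytes of input, XOR with the keystream, store 64
	// bytes of output, and leave early (BEQ done_vsx / BLT tail_vsx)
	// when LEN runs out. In Go terms, each step is roughly:
	//
	//	for i := 0; i < 64; i++ {
	//		out[i] = inp[i] ^ keystream[i]
	//	}
	//	inp, out, remaining = inp[64:], out[64:], remaining-64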
	// Bottom of loop
	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

	VADDUWM V1, V16, V0
	VADDUWM V5, V17, V4
	VADDUWM V9, V18, V8
	VADDUWM V13, V19, V12

	BE_XXBRW(V0)
	BE_XXBRW(V4)
	BE_XXBRW(V8)
	BE_XXBRW(V12)

	CMPU  LEN, $64
	BLT   tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

	VADDUWM V2, V16, V0
	VADDUWM V6, V17, V4
	VADDUWM V10, V18, V8
	VADDUWM V14, V19, V12

	BE_XXBRW(V0)
	BE_XXBRW(V4)
	BE_XXBRW(V8)
	BE_XXBRW(V12)

	CMPU LEN, $64
	BLT  tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT
	BEQ     done_vsx

	VADDUWM V3, V16, V0
	VADDUWM V7, V17, V4
	VADDUWM V11, V18, V8
	VADDUWM V15, V19, V12

	BE_XXBRW(V0)
	BE_XXBRW(V4)
	BE_XXBRW(V8)
	BE_XXBRW(V12)

	CMPU  LEN, $64
	BLT   tail_vsx

	LXVW4X (INP)(R0), VS59
	LXVW4X (INP)(R8), VS60
	LXVW4X (INP)(R9), VS61
	LXVW4X (INP)(R10), VS62

	VXOR V27, V0, V27
	VXOR V28, V4, V28
	VXOR V29, V8, V29
	VXOR V30, V12, V30

	STXVW4X VS59, (OUT)(R0)
	STXVW4X VS60, (OUT)(R8)
	ADD     $64, INP
	STXVW4X VS61, (OUT)(R9)
	ADD     $-64, LEN
	STXVW4X VS62, (OUT)(R10)
	ADD     $64, OUT

	MOVD $10, R14
	MOVD R14, CTR
	BNE  loop_outer_vsx

done_vsx:
	// Increment counter by number of 64 byte blocks
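	// Equivalently: *counter += uint32(len / 64), using the value of
	// len on entry (BLOCKS was computed before LEN was modified).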
	MOVWZ (CNT), R14
	ADD  BLOCKS, R14
	MOVWZ R14, (CNT)
	RET

tail_vsx:
	ADD  $32, R1, R11
	MOVD LEN, CTR

	// Save values on stack to copy from
	STXVW4X VS32, (R11)(R0)
	STXVW4X VS36, (R11)(R8)
	STXVW4X VS40, (R11)(R9)
	STXVW4X VS44, (R11)(R10)
	ADD $-1, R11, R12
	ADD $-1, INP
	ADD $-1, OUT
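
	// R12, INP and OUT are pre-decremented so that MOVBZU/MOVBU with a
	// 1(reg) offset advance each pointer before the access, copying one
	// byte of keystream XOR input per iteration, CTR = LEN times.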
	PCALIGN $16
looptail_vsx:
	// Copying the result to OUT
	// in bytes.
	MOVBZU 1(R12), KEY
	MOVBZU 1(INP), TMP
	XOR    KEY, TMP, KEY
	MOVBU  KEY, 1(OUT)
	BDNZ   looptail_vsx

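	// Overwrite the keystream copy on the stack with VS48, which holds
	// only the public sigma constants, so no key-derived material is
	// left behind.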
	// Clear the stack values
	STXVW4X VS48, (R11)(R0)
	STXVW4X VS48, (R11)(R8)
	STXVW4X VS48, (R11)(R9)
	STXVW4X VS48, (R11)(R10)
	BR      done_vsx