This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH 3/3] aarch64: Save and restore SVE registers in ld.so


From: Richard Henderson <richard.henderson@linaro.org>

Add SVE versions of _dl_runtime_resolve and _dl_runtime_profile.
This honors the extended vector calling convention described in
ARM_100986_0000_00_en (SVEpcs 00bet1).

	* sysdeps/aarch64/dl-trampoline.S (_dl_runtime_resolve_sve): New.
	(_dl_runtime_profile_sve): New.
	* sysdeps/aarch64/dl-machine.h (elf_machine_runtime_setup): Use the
	new routines if HWCAP_SVE is set.
---
 sysdeps/aarch64/dl-machine.h    |  13 +-
 sysdeps/aarch64/dl-trampoline.S | 343 ++++++++++++++++++++++++++++++++
 2 files changed, 353 insertions(+), 3 deletions(-)

diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h
index 4935aa7c54..ea7c5c71d5 100644
--- a/sysdeps/aarch64/dl-machine.h
+++ b/sysdeps/aarch64/dl-machine.h
@@ -69,6 +69,9 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
       ElfW(Addr) *got;
       extern void _dl_runtime_resolve (ElfW(Word));
       extern void _dl_runtime_profile (ElfW(Word));
+      extern void _dl_runtime_resolve_sve (ElfW(Word));
+      extern void _dl_runtime_profile_sve (ElfW(Word));
+      unsigned has_sve = GLRO(dl_hwcap) & HWCAP_SVE;
 
       got = (ElfW(Addr) *) D_PTR (l, l_info[DT_PLTGOT]);
       if (got[1])
@@ -83,9 +86,11 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	 to intercept the calls to collect information.  In this case we
 	 don't store the address in the GOT so that all future calls also
 	 end in this function.  */
-      if ( profile)
+      if (profile)
 	{
-	   got[2] = (ElfW(Addr)) &_dl_runtime_profile;
+	  got[2] = (has_sve
+		    ? (ElfW(Addr)) &_dl_runtime_profile_sve
+		    : (ElfW(Addr)) &_dl_runtime_profile);
 
 	  if (GLRO(dl_profile) != NULL
 	      && _dl_name_match_p (GLRO(dl_profile), l))
@@ -98,7 +103,9 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	  /* This function will get called to fix up the GOT entry
 	     indicated by the offset on the stack, and then jump to
 	     the resolved address.  */
-	  got[2] = (ElfW(Addr)) &_dl_runtime_resolve;
+	  got[2] = (has_sve
+		    ? (ElfW(Addr)) &_dl_runtime_resolve_sve
+		    : (ElfW(Addr)) &_dl_runtime_resolve);
 	}
     }
 
diff --git a/sysdeps/aarch64/dl-trampoline.S b/sysdeps/aarch64/dl-trampoline.S
index 67a7c1b207..e23e5f1aad 100644
--- a/sysdeps/aarch64/dl-trampoline.S
+++ b/sysdeps/aarch64/dl-trampoline.S
@@ -280,3 +280,346 @@ _dl_runtime_profile:
 	cfi_endproc
 	.size _dl_runtime_profile, .-_dl_runtime_profile
 #endif
+
+/*
+ * For functions conforming to the procedure call standard as
+ * amended for SVE support (ARM_100986_0000_00_en (SVEpcs 00bet1)),
+ * we must save the entire contents of Z0-Z7 as well as P0-P3.
+ */
+        .arch   armv8-a+sve
+
+	.globl _dl_runtime_resolve_sve
+	.type _dl_runtime_resolve_sve, #function
+	.align 2
+_dl_runtime_resolve_sve:
+	/* Lazy-binding resolver that also preserves Z0-Z7 and P0-P3.
+	   ip0		&PLTGOT[2]
+	   ip1		temp(dl resolver entry point)
+	   [sp, #8]	lr
+	   [sp, #0]	&PLTGOT[n]
+	 */
+	cfi_startproc
+	cfi_adjust_cfa_offset(16)	/* Incorporate PLT */
+	cfi_rel_offset (lr, 8)
+
+	/* Save x29/x8 and the integer arguments in an 80-byte frame.  */
+	stp	x29, x8, [sp, #-80]!
+	cfi_adjust_cfa_offset (80)
+	cfi_rel_offset (x29, 0)
+	mov	x29, sp
+	cfi_def_cfa_register (x29)
+
+	stp	x6, x7, [sp,  #16]
+	stp	x4, x5, [sp,  #32]
+	stp	x2, x3, [sp,  #48]
+	stp	x0, x1, [sp,  #64]
+
+	/* Allocate space for, and store, Z[0-7].  */
+	addvl	sp, sp, #-8
+	str	z0, [sp, #0, mul vl]
+	str	z1, [sp, #1, mul vl]
+	str	z2, [sp, #2, mul vl]
+	str	z3, [sp, #3, mul vl]
+	str	z4, [sp, #4, mul vl]
+	str	z5, [sp, #5, mul vl]
+	str	z6, [sp, #6, mul vl]
+	str	z7, [sp, #7, mul vl]
+
+	/* Store P[0-3]; allocate 8*PL = VL to keep SP 16-byte aligned.  */
+	addpl	sp, sp, #-8		/* 4*PL is only 8 bytes at VL=16.  */
+	str	p0, [sp, #0, mul vl]
+	str	p1, [sp, #1, mul vl]
+	str	p2, [sp, #2, mul vl]
+	str	p3, [sp, #3, mul vl]
+
+	/* Get pointer to linker struct.  */
+	ldr	PTR_REG (0), [ip0, #-PTR_SIZE]
+
+	/* Prepare to call _dl_fixup().  */
+	ldr	x1, [x29, 80]		/* Recover &PLTGOT[n] */
+
+	sub     x1, x1, ip0		/* Byte offset from &PLTGOT[2].  */
+	add     x1, x1, x1, lsl #1	/* x1 *= 3.  */
+	lsl     x1, x1, #3
+	sub     x1, x1, #(RELA_SIZE<<3)
+	lsr     x1, x1, #3		/* x1 = 3 * offset - RELA_SIZE.  */
+
+	/* Call fixup routine.  */
+	bl	_dl_fixup
+
+	/* Save the return.  */
+	mov	ip0, x0
+
+	/* Get arguments and return address back.  */
+	ldr	p0, [sp, #0, mul vl]
+	ldr	p1, [sp, #1, mul vl]
+	ldr	p2, [sp, #2, mul vl]
+	ldr	p3, [sp, #3, mul vl]
+	addpl	sp, sp, #8		/* Release the 8*PL predicate area.  */
+
+	ldr	z0, [sp, #0, mul vl]
+	ldr	z1, [sp, #1, mul vl]
+	ldr	z2, [sp, #2, mul vl]
+	ldr	z3, [sp, #3, mul vl]
+	ldr	z4, [sp, #4, mul vl]
+	ldr	z5, [sp, #5, mul vl]
+	ldr	z6, [sp, #6, mul vl]
+	ldr	z7, [sp, #7, mul vl]
+	addvl	sp, sp, #8
+
+	ldr	lr, [sp, #88]		/* 80-byte frame + 8 into PLT pair.  */
+	ldp	x0, x1, [sp, #64]
+	ldp	x2, x3, [sp, #48]
+	ldp	x4, x5, [sp, #32]
+	ldp	x6, x7, [sp, #16]
+	ldp	x29, x8, [sp], #96	/* Pop frame and the PLT's 16 bytes.  */
+	cfi_def_cfa (sp, 0)
+	cfi_restore (lr)
+	cfi_restore (x29)
+
+	/* Jump to the newly found address.  */
+	br	ip0
+
+	cfi_endproc
+	.size _dl_runtime_resolve_sve, .-_dl_runtime_resolve_sve
+
+#ifndef PROF
+	.globl _dl_runtime_profile_sve
+	.type _dl_runtime_profile_sve, #function
+	.align 2
+_dl_runtime_profile_sve:
+	/* AArch64 we get called with:
+	   ip0		&PLTGOT[2]
+	   ip1		temp(dl resolver entry point)
+	   [sp, #8]	lr
+	   [sp, #0]	&PLTGOT[n]
+
+	   Stack frame layout:
+	         [x29,  #...] lr
+	         [x29,  #...] &PLTGOT[n]
+	         [x29,   #96] La_aarch64_regs
+	         [x29,   #48] La_aarch64_retval
+	         [x29,   #40] frame size return from pltenter
+	         [x29,   #32] dl_profile_call saved x1
+	         [x29,   #24] dl_profile_call saved x0
+	         [x29,   #16] t1
+	         [x29,    #0] x29, x8       <- x29
+	   [x29, #-4, mul pl] full p[0-3] (p3 lowest, p0 just below x29)
+	   [x29, #-9, mul vl] full z[0-7] (z7 lowest)   <- sp
+
+	   ??? Extending the profiling hook for full SVE register export
+	   is tricky given the variable register size.  Perhaps the new
+	   La_aarch64_regs should contain pointers to Z0 and P0, and
+	   the current VL, and one infers the addresses from there.
+
+	   This one new form could be used for all, with AdvSIMD
+	   devolving into VL=16 with no predicate registers.
+
+	   In the meantime, this function simply saves the contents of
+	   the SVE registers, but only exposes the AdvSIMD portion to
+	   the profile hooks.
+	 */
+
+	cfi_startproc
+	cfi_adjust_cfa_offset(16)	/* Incorporate PLT */
+	cfi_rel_offset (lr, 8)
+
+	stp	x29, x8, [SP, #-SF_SIZE]!
+	cfi_adjust_cfa_offset (SF_SIZE)
+	cfi_rel_offset (x29, 0)
+	mov	x29, sp
+	cfi_def_cfa_register (x29)
+
+	/* Save La_aarch64_regs.  */
+	stp	x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*0]
+	stp	x2, x3, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*1]
+	stp	x4, x5, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*2]
+	stp	x6, x7, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*3]
+	stp	d0, d1, [X29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*0]
+	stp	d2, d3, [X29, #OFFSET_RG+ DL_OFFSET_RG_D0 + 16*1]
+	stp	d4, d5, [X29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*2]
+	stp	d6, d7, [X29, #OFFSET_RG + DL_OFFSET_RG_D0 + 16*3]
+
+	/* Re-save the full contents of the vector arguments.
+
+	   Note that PL = VL/8, so we can save all 4 predicates
+	   in (less than) the space of one vector; this minimizes
+	   the number of stack adjustments required, and gives a
+	   predictable place for each register.
+
+	   Despite the unfortunate assembler mnemonics, the vector
+	   stores do not overlap the preceding predicate stores.  */
+	addvl	sp, sp, #-9		/* 8 Z slots + 1 slot for P0-P3.  */
+
+	str	p0, [x29, #-1, mul vl]	/* Predicate offsets scale by PL.  */
+	str	p1, [x29, #-2, mul vl]
+	str	p2, [x29, #-3, mul vl]
+	str	p3, [x29, #-4, mul vl]
+
+	str	z0, [x29, #-2, mul vl]	/* Vector offsets scale by VL.  */
+	str	z1, [x29, #-3, mul vl]
+	str	z2, [x29, #-4, mul vl]
+	str	z3, [x29, #-5, mul vl]
+	str	z4, [x29, #-6, mul vl]
+	str	z5, [x29, #-7, mul vl]
+	str	z6, [x29, #-8, mul vl]
+	str	z7, [x29, #-9, mul vl]
+
+	add     x0, x29, #SF_SIZE + 16	/* Address just above the PLT pair.  */
+	ldr	x1, [x29, #OFFSET_LR]
+	stp	x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_SP]
+
+	/* Get pointer to linker struct.  */
+	ldr	PTR_REG (0), [ip0, #-PTR_SIZE]
+
+	/* Prepare to call _dl_profile_fixup().  */
+	ldr	x1, [x29, OFFSET_PLTGOTN]	/* Recover &PLTGOT[n] */
+
+	sub     x1, x1, ip0		/* Byte offset from &PLTGOT[2].  */
+	add     x1, x1, x1, lsl #1	/* x1 *= 3.  */
+	lsl     x1, x1, #3
+	sub     x1, x1, #(RELA_SIZE<<3)
+	lsr     x1, x1, #3		/* x1 = 3 * offset - RELA_SIZE.  */
+
+	stp	x0, x1, [x29, #OFFSET_SAVED_CALL_X0]	/* For pltexit.  */
+
+	/* Set up extra args for _dl_profile_fixup */
+	ldr	x2, [x29, #OFFSET_LR]		/* load saved LR */
+	add	x3, x29, #OFFSET_RG		/* address of La_aarch64_reg */
+	add	x4, x29, #OFFSET_FS		/* address of framesize */
+	bl	_dl_profile_fixup
+
+	ldr	ip0l, [x29, #OFFSET_FS]		/* framesize from fixup */
+	cmp	ip0l, #0
+	bge	1f				/* >= 0: take pltexit path */
+	cfi_remember_state
+
+	/* Save the return.  */
+	mov	ip0, x0
+
+	/* Get arguments and return address back.  */
+	ldr	p0, [x29, #-1, mul vl]
+	ldr	p1, [x29, #-2, mul vl]
+	ldr	p2, [x29, #-3, mul vl]
+	ldr	p3, [x29, #-4, mul vl]
+
+	ldr	z0, [x29, #-2, mul vl]
+	ldr	z1, [x29, #-3, mul vl]
+	ldr	z2, [x29, #-4, mul vl]
+	ldr	z3, [x29, #-5, mul vl]
+	ldr	z4, [x29, #-6, mul vl]
+	ldr	z5, [x29, #-7, mul vl]
+	ldr	z6, [x29, #-8, mul vl]
+	ldr	z7, [x29, #-9, mul vl]
+
+	ldr	lr, [x29, #OFFSET_LR]
+	ldp	x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*0]
+	ldp	x2, x3, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*1]
+	ldp	x4, x5, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*2]
+	ldp	x6, x7, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*3]
+
+	mov	sp, x29			/* Drop the SVE save area.  */
+	ldp	x29, x8, [sp], SF_SIZE + 16
+	cfi_def_cfa (sp, 0)
+	cfi_restore(x29)
+	cfi_restore(lr)
+
+	/* Jump to the newly found address.  */
+	br	ip0
+
+	cfi_restore_state
+	/* The new frame size is in ip0, extended for pointer size.  */
+1:	sub	x1, sp, ip0
+	and	sp, x1, #0xfffffffffffffff0
+
+	str	x0, [x29, #OFFSET_T1]	/* Keep fixup result across memcpy.  */
+
+	mov	x0, sp
+	add	x1, x29, #SF_SIZE + 16
+	mov	x2, ip0
+	bl	memcpy
+
+	ldr	ip0, [x29, #OFFSET_T1]
+
+	/* Reload the full arguments.  */
+	ldp	x0, x1, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*0]
+	ldp	x2, x3, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*1]
+	ldp	x4, x5, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*2]
+	ldp	x6, x7, [x29, #OFFSET_RG + DL_OFFSET_RG_X0 + 16*3]
+	ldr	x8, [x29, 8]		/* x8 was saved next to x29.  */
+
+	ldr	p0, [x29, #-1, mul vl]
+	ldr	p1, [x29, #-2, mul vl]
+	ldr	p2, [x29, #-3, mul vl]
+	ldr	p3, [x29, #-4, mul vl]
+
+	ldr	z0, [x29, #-2, mul vl]
+	ldr	z1, [x29, #-3, mul vl]
+	ldr	z2, [x29, #-4, mul vl]
+	ldr	z3, [x29, #-5, mul vl]
+	ldr	z4, [x29, #-6, mul vl]
+	ldr	z5, [x29, #-7, mul vl]
+	ldr	z6, [x29, #-8, mul vl]
+	ldr	z7, [x29, #-9, mul vl]
+
+	/* Call the function.  */
+	blr	ip0
+
+	/* Store La_aarch64_retval, as if for the non-vector ABI.  */
+	stp	x0, x1, [x29, #OFFSET_RV + DL_OFFSET_RV_X0]
+	stp	d0, d1, [x29, #OFFSET_RV + DL_OFFSET_RV_D0 + 16*0]
+	stp	d2, d3, [x29, #OFFSET_RV + DL_OFFSET_RV_D0 + 16*1]
+
+	/* Store the full contents of the vector return.  */
+	str	p0, [x29, #-1, mul vl]
+	str	p1, [x29, #-2, mul vl]
+	str	p2, [x29, #-3, mul vl]
+	str	p3, [x29, #-4, mul vl]
+
+	str	z0, [x29, #-2, mul vl]
+	str	z1, [x29, #-3, mul vl]
+	str	z2, [x29, #-4, mul vl]
+	str	z3, [x29, #-5, mul vl]
+	str	z4, [x29, #-6, mul vl]
+	str	z5, [x29, #-7, mul vl]
+	str	z6, [x29, #-8, mul vl]
+	str	z7, [x29, #-9, mul vl]
+
+	/* Setup call to pltexit  */
+	ldp	x0, x1, [x29, #OFFSET_SAVED_CALL_X0]
+	add	x2, x29, #OFFSET_RG
+	add	x3, x29, #OFFSET_RV
+	bl	_dl_call_pltexit
+
+	/* Reload the full return value.  */
+	ldp	x0, x1, [x29, #OFFSET_RV + DL_OFFSET_RV_X0]
+
+	ldr	p0, [x29, #-1, mul vl]
+	ldr	p1, [x29, #-2, mul vl]
+	ldr	p2, [x29, #-3, mul vl]
+	ldr	p3, [x29, #-4, mul vl]
+
+	ldr	z0, [x29, #-2, mul vl]
+	ldr	z1, [x29, #-3, mul vl]
+	ldr	z2, [x29, #-4, mul vl]
+	ldr	z3, [x29, #-5, mul vl]
+	ldr	z4, [x29, #-6, mul vl]
+	ldr	z5, [x29, #-7, mul vl]
+	ldr	z6, [x29, #-8, mul vl]
+	ldr	z7, [x29, #-9, mul vl]
+
+	/* LR from within La_aarch64_reg */
+	ldr	lr, [x29, #OFFSET_RG + DL_OFFSET_RG_LR]
+	mov	sp, x29
+	cfi_def_cfa_register (sp)
+	ldr	x29, [x29, #0]
+	add	sp, sp, SF_SIZE + 16
+	cfi_adjust_cfa_offset (- SF_SIZE - 16)
+	cfi_restore(x29)
+	cfi_restore(lr)
+
+	br	lr
+
+	cfi_endproc
+	.size _dl_runtime_profile_sve, .-_dl_runtime_profile_sve
+#endif
-- 
2.17.1


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]