This is the mail archive of the gdb-patches@sourceware.org mailing list for the GDB project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH 3/8] Add support for fast tracepoints


From: Pierre Langlois <pierre.langlois@arm.com>

This patch adds support for fast tracepoints for aarch64-linux.  With this
implementation, a tracepoint can only be placed in a +/- 128MB range of
the jump pad.  This is due to the unconditional branch instruction
being limited to a (26 bit << 2) offset from the current PC.

Three target operations are implemented:

- target_install_fast_tracepoint_jump_pad

Building the jump pad is the biggest change of this patch.  We need to add
functions to emit all instructions needed to save and restore the
current state when the tracepoint is hit, as well as to implement a
lock and create a collecting_t object identifying the current thread.

Steps performed by the jump pad:

  * Save the current state on the stack.
  * Push a collecting_t object on the stack.  We read the special
  tpidr_el0 system register to get the thread ID.
  * Spin-lock on the shared memory location of all tracing threads.  We
  write the address of our collecting_t object there once we have the
  lock.
  * Call gdb_collect.
  * Release the lock.
  * Restore the state.

  * Execute the replaced instruction which will have been relocated.
  * Jump back to the program.

- target_get_thread_area

As implemented in ps_get_thread_area, target_get_thread_area uses ptrace
to fetch the NT_ARM_TLS register.  At the architecture level, NT_ARM_TLS
represents the tpidr_el0 system register.

So this ptrace call (if lwpid is the current thread):
~~~
ptrace (PTRACE_GETREGSET, lwpid, NT_ARM_TLS, &iovec);
~~~

Is equivalent to the following instruction:
~~~
mrs x0, tpidr_el0
~~~

This instruction is used when creating the collecting_t object that
GDBserver can read to know if a given thread is currently tracing.

So target_get_thread_area must get the same thread IDs as what the jump
pad writes into its collecting_t object.

- target_get_min_fast_tracepoint_insn_len

This just returns 4.

gdb/gdbserver/ChangeLog:

	* Makefile.in (linux-aarch64-ipa.o, aarch64-ipa.o): New rules.
	* configure.srv (aarch64*-*-linux*): Add linux-aarch64-ipa.o and
	aarch64-ipa.o.
	* linux-aarch64-ipa.c: New file.
	* linux-aarch64-low.c: Include arch/aarch64-insn.h, inttypes.h
	and endian.h.
	(aarch64_get_thread_area): New target method.
	(extract_signed_bitfield): New helper function.
	(aarch64_decode_ldr_literal): New function.
	(enum aarch64_opcodes): New enum.
	(struct aarch64_register): New struct.
	(struct aarch64_operand): New struct.
	(x0): New static global.
	(x1): Likewise.
	(x2): Likewise.
	(x3): Likewise.
	(x4): Likewise.
	(w2): Likewise.
	(ip0): Likewise.
	(sp): Likewise.
	(xzr): Likewise.
	(aarch64_register): New helper function.
	(register_operand): Likewise.
	(immediate_operand): Likewise.
	(struct aarch64_memory_operand): New struct.
	(offset_memory_operand): New helper function.
	(preindex_memory_operand): Likewise.
	(enum aarch64_system_control_registers): New enum.
	(ENCODE): New macro.
	(emit_insn): New helper function.
	(emit_b): New function.
	(emit_bcond): Likewise.
	(emit_cb): Likewise.
	(emit_tb): Likewise.
	(emit_blr): Likewise.
	(emit_stp): Likewise.
	(emit_ldp_q_offset): Likewise.
	(emit_stp_q_offset): Likewise.
	(emit_load_store): Likewise.
	(emit_ldr): Likewise.
	(emit_ldrsw): Likewise.
	(emit_str): Likewise.
	(emit_ldaxr): Likewise.
	(emit_stxr): Likewise.
	(emit_stlr): Likewise.
	(emit_data_processing_reg): Likewise.
	(emit_data_processing): Likewise.
	(emit_add): Likewise.
	(emit_sub): Likewise.
	(emit_mov): Likewise.
	(emit_movk): Likewise.
	(emit_mov_addr): Likewise.
	(emit_mrs): Likewise.
	(emit_msr): Likewise.
	(emit_sevl): Likewise.
	(emit_wfe): Likewise.
	(append_insns): Likewise.
	(can_encode_int32_in): New helper function.
	(aarch64_relocate_instruction): New function.
	(aarch64_install_fast_tracepoint_jump_pad): Likewise.
	(aarch64_get_min_fast_tracepoint_insn_len): Likewise.
	(struct linux_target_ops): Install aarch64_get_thread_area,
	aarch64_install_fast_tracepoint_jump_pad and
	aarch64_get_min_fast_tracepoint_insn_len.
---
 gdb/gdbserver/Makefile.in         |    6 +
 gdb/gdbserver/configure.srv       |    1 +
 gdb/gdbserver/linux-aarch64-ipa.c |  151 ++++
 gdb/gdbserver/linux-aarch64-low.c | 1466 ++++++++++++++++++++++++++++++++++++-
 4 files changed, 1621 insertions(+), 3 deletions(-)
 create mode 100644 gdb/gdbserver/linux-aarch64-ipa.c

diff --git a/gdb/gdbserver/Makefile.in b/gdb/gdbserver/Makefile.in
index d096663..cd146f4 100644
--- a/gdb/gdbserver/Makefile.in
+++ b/gdb/gdbserver/Makefile.in
@@ -499,6 +499,12 @@ linux-amd64-ipa.o: linux-amd64-ipa.c
 amd64-linux-ipa.o: amd64-linux.c
 	$(IPAGENT_COMPILE) $<
 	$(POSTCOMPILE)
+linux-aarch64-ipa.o: linux-aarch64-ipa.c
+	$(IPAGENT_COMPILE) $<
+	$(POSTCOMPILE)
+aarch64-ipa.o: aarch64.c
+	$(IPAGENT_COMPILE) $<
+	$(POSTCOMPILE)
 tdesc-ipa.o: tdesc.c
 	$(IPAGENT_COMPILE) $<
 	$(POSTCOMPILE)
diff --git a/gdb/gdbserver/configure.srv b/gdb/gdbserver/configure.srv
index a62df83..f187c9d 100644
--- a/gdb/gdbserver/configure.srv
+++ b/gdb/gdbserver/configure.srv
@@ -62,6 +62,7 @@ case "${target}" in
 			srv_xmlfiles="${srv_xmlfiles} arm-with-neon.xml"
 			srv_linux_regsets=yes
 			srv_linux_thread_db=yes
+			ipa_obj="linux-aarch64-ipa.o aarch64-ipa.o"
 			;;
   arm*-*-linux*)	srv_regobj="reg-arm.o arm-with-iwmmxt.o"
 			srv_regobj="${srv_regobj} arm-with-vfpv2.o"
diff --git a/gdb/gdbserver/linux-aarch64-ipa.c b/gdb/gdbserver/linux-aarch64-ipa.c
new file mode 100644
index 0000000..1aafc5f
--- /dev/null
+++ b/gdb/gdbserver/linux-aarch64-ipa.c
@@ -0,0 +1,151 @@
+/* GNU/Linux/AArch64 specific low level interface, for the in-process
+   agent library for GDB.
+
+   Copyright (C) 2015 Free Software Foundation, Inc.
+
+   This file is part of GDB.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include "server.h"
+#include "tracepoint.h"
+
+/* Defined in auto-generated file aarch64.c.  */
+void init_registers_aarch64 (void);
+extern const struct target_desc *tdesc_aarch64;
+
+/* Each register saved by the jump pad is in a 16 byte cell.  */
+#define FT_CR_SIZE 16
+
+#define FT_CR_FPCR	0
+#define FT_CR_FPSR	1
+#define FT_CR_CPSR	2
+#define FT_CR_PC	3
+#define FT_CR_SP	4
+#define FT_CR_X0	5
+#define FT_CR_GPR(n)	(FT_CR_X0 + (n))
+#define FT_CR_FPR(n)	(FT_CR_GPR (31) + (n))
+
+/* Mapping between registers collected by the jump pad and GDB's register
+   array layout used by regcache.
+
+   See linux-aarch64-low.c (aarch64_install_fast_tracepoint_jump_pad) for
+   more details.  */
+
+static const int aarch64_ft_collect_regmap[] = {
+  FT_CR_GPR (0),
+  FT_CR_GPR (1),
+  FT_CR_GPR (2),
+  FT_CR_GPR (3),
+  FT_CR_GPR (4),
+  FT_CR_GPR (5),
+  FT_CR_GPR (6),
+  FT_CR_GPR (7),
+  FT_CR_GPR (8),
+  FT_CR_GPR (9),
+  FT_CR_GPR (10),
+  FT_CR_GPR (11),
+  FT_CR_GPR (12),
+  FT_CR_GPR (13),
+  FT_CR_GPR (14),
+  FT_CR_GPR (15),
+  FT_CR_GPR (16),
+  FT_CR_GPR (17),
+  FT_CR_GPR (18),
+  FT_CR_GPR (19),
+  FT_CR_GPR (20),
+  FT_CR_GPR (21),
+  FT_CR_GPR (22),
+  FT_CR_GPR (23),
+  FT_CR_GPR (24),
+  FT_CR_GPR (25),
+  FT_CR_GPR (26),
+  FT_CR_GPR (27),
+  FT_CR_GPR (28),
+  /* FP */
+  FT_CR_GPR (29),
+  /* LR */
+  FT_CR_GPR (30),
+  FT_CR_SP,
+  FT_CR_PC,
+  FT_CR_CPSR,
+  FT_CR_FPR (0),
+  FT_CR_FPR (1),
+  FT_CR_FPR (2),
+  FT_CR_FPR (3),
+  FT_CR_FPR (4),
+  FT_CR_FPR (5),
+  FT_CR_FPR (6),
+  FT_CR_FPR (7),
+  FT_CR_FPR (8),
+  FT_CR_FPR (9),
+  FT_CR_FPR (10),
+  FT_CR_FPR (11),
+  FT_CR_FPR (12),
+  FT_CR_FPR (13),
+  FT_CR_FPR (14),
+  FT_CR_FPR (15),
+  FT_CR_FPR (16),
+  FT_CR_FPR (17),
+  FT_CR_FPR (18),
+  FT_CR_FPR (19),
+  FT_CR_FPR (20),
+  FT_CR_FPR (21),
+  FT_CR_FPR (22),
+  FT_CR_FPR (23),
+  FT_CR_FPR (24),
+  FT_CR_FPR (25),
+  FT_CR_FPR (26),
+  FT_CR_FPR (27),
+  FT_CR_FPR (28),
+  FT_CR_FPR (29),
+  FT_CR_FPR (30),
+  FT_CR_FPR (31),
+  FT_CR_FPSR,
+  FT_CR_FPCR
+};
+
+#define AARCH64_NUM_FT_COLLECT_GREGS \
+  (sizeof (aarch64_ft_collect_regmap) / sizeof(aarch64_ft_collect_regmap[0]))
+
+/* Fill in REGCACHE with registers saved by the jump pad in BUF.  */
+
+void
+supply_fast_tracepoint_registers (struct regcache *regcache,
+				  const unsigned char *buf)
+{
+  int i;
+
+  for (i = 0; i < AARCH64_NUM_FT_COLLECT_GREGS; i++)
+    supply_register (regcache, i,
+		     ((char *) buf)
+		     + (aarch64_ft_collect_regmap[i] * FT_CR_SIZE));
+}
+
+IP_AGENT_EXPORT_FUNC ULONGEST
+gdb_agent_get_raw_reg (const unsigned char *raw_regs, int regnum)
+{
+  if (regnum >= AARCH64_NUM_FT_COLLECT_GREGS)
+    return 0;
+
+  return *(ULONGEST *) (raw_regs
+			+ aarch64_ft_collect_regmap[regnum] * FT_CR_SIZE);
+}
+
+void
+initialize_low_tracepoint (void)
+{
+  init_registers_aarch64 ();
+  ipa_tdesc = tdesc_aarch64;
+}
diff --git a/gdb/gdbserver/linux-aarch64-low.c b/gdb/gdbserver/linux-aarch64-low.c
index 73b248c..e2d738b 100644
--- a/gdb/gdbserver/linux-aarch64-low.c
+++ b/gdb/gdbserver/linux-aarch64-low.c
@@ -23,6 +23,7 @@
 #include "linux-low.h"
 #include "nat/aarch64-linux.h"
 #include "nat/aarch64-linux-hw-point.h"
+#include "arch/aarch64-insn.h"
 #include "linux-aarch32-low.h"
 #include "elf/common.h"
 
@@ -31,6 +32,8 @@
 #include "nat/gdb_ptrace.h"
 #include <asm/ptrace.h>
 #include <sys/uio.h>
+#include <inttypes.h>
+#include <endian.h>
 
 #include "gdb_proc_service.h"
 
@@ -573,6 +576,1463 @@ aarch64_supports_tracepoints (void)
     }
 }
 
+/* Implementation of linux_target_ops method "get_thread_area".  */
+
+static int
+aarch64_get_thread_area (int lwpid, CORE_ADDR *addrp)
+{
+  struct iovec iovec;
+  uint64_t reg;
+
+  iovec.iov_base = &reg;
+  iovec.iov_len = sizeof (reg);
+
+  if (ptrace (PTRACE_GETREGSET, lwpid, NT_ARM_TLS, &iovec) != 0)
+    return -1;
+
+  *addrp = reg;
+
+  return 0;
+}
+
+/* Extract a signed value from a bit field within an instruction encoding.
+
+   INSN is the instruction opcode.
+   WIDTH specifies the width of the bit field to extract (in bits).
+   OFFSET specifies the least significant bit of the field where bits
+   are numbered zero counting from least to most significant.
+
+   The left shift is done on the unsigned value so that it cannot overflow
+   a signed integer; the signed right shift then sign-extends the field.  */
+
+static int32_t
+extract_signed_bitfield (uint32_t insn, unsigned width, unsigned offset)
+{
+  unsigned shift_l = sizeof (int32_t) * 8 - (offset + width);
+  unsigned shift_r = sizeof (int32_t) * 8 - width;
+
+  return (int32_t) (insn << shift_l) >> shift_r;
+}
+
+/* Decode an opcode if it represents an LDR or LDRSW instruction taking a
+   literal offset from the current PC.
+
+   ADDR specifies the address of the opcode.
+   INSN specifies the opcode to test.
+   IS_W is set if the instruction is LDRSW.
+   IS64 receives size field from the decoded instruction.
+   RT receives the 'rt' field from the decoded instruction.
+   OFFSET receives the 'imm' field from the decoded instruction.
+
+   Return 1 if the opcode matches and is decoded, otherwise 0.  */
+
+int
+aarch64_decode_ldr_literal (CORE_ADDR addr, uint32_t insn, int *is_w,
+			    int *is64, unsigned *rt, int32_t *offset)
+{
+  /* LDR    0T01 1000 iiii iiii iiii iiii iiir rrrr */
+  /* LDRSW  1001 1000 iiii iiii iiii iiii iiir rrrr */
+  if ((insn & 0x3f000000) == 0x18000000)
+    {
+      *is_w = (insn >> 31) & 0x1;
+
+      if (*is_w)
+	{
+	  /* LDRSW always takes a 64-bit destination register.  */
+	  *is64 = 1;
+	}
+      else
+	*is64 = (insn >> 30) & 0x1;
+
+      *rt = (insn >> 0) & 0x1f;
+      *offset = extract_signed_bitfield (insn, 19, 5) << 2;
+
+      if (aarch64_debug)
+	debug_printf ("decode: %s 0x%x %s %s%u, #?\n",
+		      core_addr_to_string_nz (addr), insn,
+		      *is_w ? "ldrsw" : "ldr",
+		      *is64 ? "x" : "w", *rt);
+
+      return 1;
+    }
+
+  return 0;
+}
+
+/* List of opcodes that we need for building the jump pad and relocating
+   an instruction.  */
+
+enum aarch64_opcodes
+{
+  /* B              0001 01ii iiii iiii iiii iiii iiii iiii */
+  /* BL             1001 01ii iiii iiii iiii iiii iiii iiii */
+  /* B.COND         0101 0100 iiii iiii iiii iiii iii0 cccc */
+  /* CBZ            s011 0100 iiii iiii iiii iiii iiir rrrr */
+  /* CBNZ           s011 0101 iiii iiii iiii iiii iiir rrrr */
+  /* TBZ            b011 0110 bbbb biii iiii iiii iiir rrrr */
+  /* TBNZ           b011 0111 bbbb biii iiii iiii iiir rrrr */
+  B               = 0x14000000,
+  BL              = 0x80000000 | B,
+  BCOND           = 0x40000000 | B,
+  CBZ             = 0x20000000 | B,
+  CBNZ            = 0x21000000 | B,
+  TBZ             = 0x36000000 | B,
+  TBNZ            = 0x37000000 | B,
+  /* BLR            1101 0110 0011 1111 0000 00rr rrr0 0000 */
+  BLR             = 0xd63f0000,
+  /* STP            s010 100o o0ii iiii irrr rrrr rrrr rrrr */
+  /* LDP            s010 100o o1ii iiii irrr rrrr rrrr rrrr */
+  /* STP (SIMD&VFP) ss10 110o o0ii iiii irrr rrrr rrrr rrrr */
+  /* LDP (SIMD&VFP) ss10 110o o1ii iiii irrr rrrr rrrr rrrr */
+  STP             = 0x28000000,
+  LDP             = 0x28400000,
+  STP_SIMD_VFP    = 0x04000000 | STP,
+  LDP_SIMD_VFP    = 0x04000000 | LDP,
+  /* STR            ss11 100o 00xi iiii iiii xxrr rrrr rrrr */
+  /* LDR            ss11 100o 01xi iiii iiii xxrr rrrr rrrr */
+  /* LDRSW          1011 100o 10xi iiii iiii xxrr rrrr rrrr */
+  STR             = 0x38000000,
+  LDR             = 0x00400000 | STR,
+  LDRSW           = 0x80800000 | STR,
+  /* LDAXR          ss00 1000 0101 1111 1111 11rr rrrr rrrr */
+  LDAXR           = 0x085ffc00,
+  /* STXR           ss00 1000 000r rrrr 0111 11rr rrrr rrrr */
+  STXR            = 0x08007c00,
+  /* STLR           ss00 1000 1001 1111 1111 11rr rrrr rrrr */
+  STLR            = 0x089ffc00,
+  /* MOV            s101 0010 1xxi iiii iiii iiii iiir rrrr */
+  /* MOVK           s111 0010 1xxi iiii iiii iiii iiir rrrr */
+  MOV             = 0x52800000,
+  MOVK            = 0x20000000 | MOV,
+  /* ADD            s00o ooo1 xxxx xxxx xxxx xxxx xxxx xxxx */
+  /* SUB            s10o ooo1 xxxx xxxx xxxx xxxx xxxx xxxx */
+  /* SUBS           s11o ooo1 xxxx xxxx xxxx xxxx xxxx xxxx */
+  ADD             = 0x01000000,
+  SUB             = 0x40000000 | ADD,
+  /* MSR (register) 1101 0101 0001 oooo oooo oooo ooor rrrr */
+  /* MRS            1101 0101 0011 oooo oooo oooo ooor rrrr */
+  MSR             = 0xd5100000,
+  MRS             = 0x00200000 | MSR,
+  /* HINT           1101 0101 0000 0011 0010 oooo ooo1 1111 */
+  HINT            = 0xd503201f,
+  SEVL            = (5 << 5) | HINT,
+  WFE             = (2 << 5) | HINT,
+};
+
+/* Representation of a general purpose register of the form xN or wN.
+
+   This type is used by emitting functions that take registers as operands.  */
+
+struct aarch64_register
+{
+  unsigned num;
+  int is64;
+};
+
+/* Representation of an operand.  At this time, it only supports register
+   and immediate types.  */
+
+struct aarch64_operand
+{
+  /* Type of the operand.  */
+  enum
+    {
+      OPERAND_IMMEDIATE,
+      OPERAND_REGISTER,
+    } type;
+  /* Value of the operand according to the type.  */
+  union
+    {
+      uint32_t imm;
+      struct aarch64_register reg;
+    };
+};
+
+/* List of registers that we are currently using, we can add more here as
+   we need to use them.  */
+
+/* General purpose scratch registers (64 bit).  */
+static const struct aarch64_register x0 = { 0, 1 };
+static const struct aarch64_register x1 = { 1, 1 };
+static const struct aarch64_register x2 = { 2, 1 };
+static const struct aarch64_register x3 = { 3, 1 };
+static const struct aarch64_register x4 = { 4, 1 };
+
+/* General purpose scratch registers (32 bit).  */
+static const struct aarch64_register w2 = { 2, 0 };
+
+/* Intra-procedure scratch registers.  */
+static const struct aarch64_register ip0 = { 16, 1 };
+
+/* Special purpose registers.  */
+static const struct aarch64_register sp = { 31, 1 };
+static const struct aarch64_register xzr = { 31, 1 };
+
+/* Dynamically allocate a new register.  If we know the register
+   statically, we should make it a global as above instead of using this
+   helper function.  */
+
+static struct aarch64_register
+aarch64_register (unsigned num, int is64)
+{
+  return (struct aarch64_register) { num, is64 };
+}
+
+/* Helper function to create a register operand, for instructions with
+   different types of operands.
+
+   For example:
+   p += emit_mov (p, x0, register_operand (x1));  */
+
+static struct aarch64_operand
+register_operand (struct aarch64_register reg)
+{
+  struct aarch64_operand operand;
+
+  operand.type = OPERAND_REGISTER;
+  operand.reg = reg;
+
+  return operand;
+}
+
+/* Helper function to create an immediate operand, for instructions with
+   different types of operands.
+
+   For example:
+   p += emit_mov (p, x0, immediate_operand (12));  */
+
+static struct aarch64_operand
+immediate_operand (uint32_t imm)
+{
+  struct aarch64_operand operand;
+
+  operand.type = OPERAND_IMMEDIATE;
+  operand.imm = imm;
+
+  return operand;
+}
+
+/* Representation of a memory operand, used for load and store
+   instructions.
+
+   The types correspond to the following variants:
+
+   MEMORY_OPERAND_OFFSET:   LDR rt, [rn, #offset]
+   MEMORY_OPERAND_PREINDEX: LDR rt, [rn, #index]!  */
+
+struct aarch64_memory_operand
+{
+  /* Type of the operand.  */
+  enum
+    {
+      MEMORY_OPERAND_OFFSET,
+      MEMORY_OPERAND_PREINDEX,
+    } type;
+  /* Index from the base register.  */
+  int32_t index;
+};
+
+/* Helper function to create an offset memory operand.
+
+   For example:
+   p += emit_ldr (p, x0, sp, offset_memory_operand (16));  */
+
+static struct aarch64_memory_operand
+offset_memory_operand (int32_t offset)
+{
+  return (struct aarch64_memory_operand) { MEMORY_OPERAND_OFFSET, offset };
+}
+
+/* Helper function to create a pre-index memory operand.
+
+   For example:
+   p += emit_ldr (p, x0, sp, preindex_memory_operand (16));  */
+
+static struct aarch64_memory_operand
+preindex_memory_operand (int32_t index)
+{
+  return (struct aarch64_memory_operand) { MEMORY_OPERAND_PREINDEX, index };
+}
+
+/* System control registers.  These special registers can be written and
+   read with the MRS and MSR instructions.
+
+   - NZCV: Condition flags.  GDB refers to this register under the CPSR
+	   name.
+   - FPSR: Floating-point status register.
+   - FPCR: Floating-point control register.
+   - TPIDR_EL0: Software thread ID register.  */
+
+enum aarch64_system_control_registers
+{
+  /*          op0           op1           crn          crm          op2  */
+  NZCV =      (0x1 << 14) | (0x3 << 11) | (0x4 << 7) | (0x2 << 3) | 0x0,
+  FPSR =      (0x1 << 14) | (0x3 << 11) | (0x4 << 7) | (0x4 << 3) | 0x1,
+  FPCR =      (0x1 << 14) | (0x3 << 11) | (0x4 << 7) | (0x4 << 3) | 0x0,
+  TPIDR_EL0 = (0x1 << 14) | (0x3 << 11) | (0xd << 7) | (0x0 << 3) | 0x2
+};
+
+/* Mask VAL to SIZE bits and shift it left by OFFSET into a 32-bit field.  */
+
+#define ENCODE(val, size, offset) \
+  ((uint32_t) (((val) & ((1ULL << (size)) - 1)) << (offset)))
+
+/* Write a 32-bit unsigned integer INSN into *BUF.  Return the number of
+   instructions written (aka. 1).  */
+
+static int
+emit_insn (uint32_t *buf, uint32_t insn)
+{
+  *buf = insn;
+  return 1;
+}
+
+/* Write a B or BL instruction into *BUF.
+
+     B  #offset
+     BL #offset
+
+   IS_BL specifies if the link register should be updated.
+   OFFSET is the immediate offset from the current PC.  It is
+   byte-addressed but should be 4 bytes aligned.  It has a limited range of
+   +/- 128MB (26 bits << 2).  */
+
+static int
+emit_b (uint32_t *buf, int is_bl, int32_t offset)
+{
+  uint32_t imm26 = ENCODE (offset >> 2, 26, 0);
+
+  if (is_bl)
+    return emit_insn (buf, BL | imm26);
+  else
+    return emit_insn (buf, B | imm26);
+}
+
+/* Write a BCOND instruction into *BUF.
+
+     B.COND #offset
+
+   COND specifies the condition field.
+   OFFSET is the immediate offset from the current PC.  It is
+   byte-addressed but should be 4 bytes aligned.  It has a limited range of
+   +/- 1MB (19 bits << 2).  */
+
+static int
+emit_bcond (uint32_t *buf, unsigned cond, int32_t offset)
+{
+  return emit_insn (buf, BCOND | ENCODE (offset >> 2, 19, 5)
+		    | ENCODE (cond, 4, 0));
+}
+
+/* Write a CBZ or CBNZ instruction into *BUF.
+
+     CBZ  rt, #offset
+     CBNZ rt, #offset
+
+   IS_CBNZ distinguishes between CBZ and CBNZ instructions.
+   RT is the register to test.
+   OFFSET is the immediate offset from the current PC.  It is
+   byte-addressed but should be 4 bytes aligned.  It has a limited range of
+   +/- 1MB (19 bits << 2).  */
+
+static int
+emit_cb (uint32_t *buf, int is_cbnz, struct aarch64_register rt,
+	 int32_t offset)
+{
+  uint32_t imm19 = ENCODE (offset >> 2, 19, 5);
+  uint32_t sf = ENCODE (rt.is64, 1, 31);
+
+  if (is_cbnz)
+    return emit_insn (buf, CBNZ | sf | imm19 | ENCODE (rt.num, 5, 0));
+  else
+    return emit_insn (buf, CBZ | sf | imm19 | ENCODE (rt.num, 5, 0));
+}
+
+/* Write a TBZ or TBNZ instruction into *BUF.
+
+     TBZ  rt, #bit, #offset
+     TBNZ rt, #bit, #offset
+
+   IS_TBNZ distinguishes between TBZ and TBNZ instructions.
+   RT is the register to test.
+   BIT is the index of the bit to test in register RT.
+   OFFSET is the immediate offset from the current PC.  It is
+   byte-addressed but should be 4 bytes aligned.  It has a limited range of
+   +/- 32KB (14 bits << 2).  */
+
+static int
+emit_tb (uint32_t *buf, int is_tbnz, unsigned bit,
+	 struct aarch64_register rt, int32_t offset)
+{
+  uint32_t imm14 = ENCODE (offset >> 2, 14, 5);
+  uint32_t b40 = ENCODE (bit, 5, 19);
+  uint32_t b5 = ENCODE (bit >> 5, 1, 31);
+
+  if (is_tbnz)
+    return emit_insn (buf, TBNZ | b5 | b40 | imm14 | ENCODE (rt.num, 5, 0));
+  else
+    return emit_insn (buf, TBZ | b5 | b40 | imm14 | ENCODE (rt.num, 5, 0));
+}
+
+/* Write a BLR instruction into *BUF.
+
+     BLR rn
+
+   RN is the register to branch to.  */
+
+static int
+emit_blr (uint32_t *buf, struct aarch64_register rn)
+{
+  return emit_insn (buf, BLR | ENCODE (rn.num, 5, 5));
+}
+
+/* Write a STP instruction into *BUF.
+
+     STP rt, rt2, [rn, #offset]
+     STP rt, rt2, [rn, #index]!
+
+   RT and RT2 are the registers to store.
+   RN is the base address register.
+   OFFSET is the immediate to add to the base address.  It is limited to a
+   -512 .. 504 range (7 bits << 3).  */
+
+static int
+emit_stp (uint32_t *buf, struct aarch64_register rt,
+	  struct aarch64_register rt2, struct aarch64_register rn,
+	  struct aarch64_memory_operand operand)
+{
+  uint32_t opc;
+  uint32_t pre_index;
+  uint32_t write_back;
+
+  if (rt.is64)
+    opc = ENCODE (2, 2, 30);
+  else
+    opc = ENCODE (0, 2, 30);
+
+  switch (operand.type)
+    {
+    case MEMORY_OPERAND_OFFSET:
+      {
+	pre_index = ENCODE (1, 1, 24);
+	write_back = ENCODE (0, 1, 23);
+	break;
+      }
+    case MEMORY_OPERAND_PREINDEX:
+      {
+	pre_index = ENCODE (1, 1, 24);
+	write_back = ENCODE (1, 1, 23);
+	break;
+      }
+    default:
+      return 0;
+    }
+
+  return emit_insn (buf, STP | opc | pre_index | write_back
+		    | ENCODE (operand.index >> 3, 7, 15) | ENCODE (rt2.num, 5, 10)
+		    | ENCODE (rn.num, 5, 5) | ENCODE (rt.num, 5, 0));
+}
+
+/* Write a LDP (SIMD&VFP) instruction using Q registers into *BUF.
+
+     LDP qt, qt2, [rn, #offset]
+
+   RT and RT2 are the Q registers to load.
+   RN is the base address register.
+   OFFSET is the immediate to add to the base address.  It is limited to
+   -1024 .. 1008 range (7 bits << 4).  */
+
+static int
+emit_ldp_q_offset (uint32_t *buf, unsigned rt, unsigned rt2,
+		   struct aarch64_register rn, int32_t offset)
+{
+  uint32_t opc = ENCODE (2, 2, 30);
+  uint32_t pre_index = ENCODE (1, 1, 24);
+
+  return emit_insn (buf, LDP_SIMD_VFP | opc | pre_index
+		    | ENCODE (offset >> 4, 7, 15) | ENCODE (rt2, 5, 10)
+		    | ENCODE (rn.num, 5, 5) | ENCODE (rt, 5, 0));
+}
+
+/* Write a STP (SIMD&VFP) instruction using Q registers into *BUF.
+
+     STP qt, qt2, [rn, #offset]
+
+   RT and RT2 are the Q registers to store.
+   RN is the base address register.
+   OFFSET is the immediate to add to the base address.  It is limited to
+   -1024 .. 1008 range (7 bits << 4).  */
+
+static int
+emit_stp_q_offset (uint32_t *buf, unsigned rt, unsigned rt2,
+		   struct aarch64_register rn, int32_t offset)
+{
+  uint32_t opc = ENCODE (2, 2, 30);
+  uint32_t pre_index = ENCODE (1, 1, 24);
+
+  return emit_insn (buf, STP_SIMD_VFP | opc | pre_index
+		    | ENCODE (offset >> 4, 7, 15) | ENCODE (rt2, 5, 10)
+		    | ENCODE (rn.num, 5, 5) | ENCODE (rt, 5, 0));
+}
+
+/* Helper function emitting a load or store instruction.  SIZE is the
+   two-bit size field (2 for 32-bit, 3 for 64-bit accesses); an unsigned
+   offset is scaled by the access size, i.e. shifted right by SIZE, while
+   a pre-index is an unscaled 9-bit byte offset.  Return the number of
+   instructions written, or 0 for an unsupported operand type.  */
+
+static int
+emit_load_store (uint32_t *buf, uint32_t size, enum aarch64_opcodes opcode,
+		 struct aarch64_register rt, struct aarch64_register rn,
+		 struct aarch64_memory_operand operand)
+{
+  uint32_t op;
+
+  switch (operand.type)
+    {
+    case MEMORY_OPERAND_OFFSET:
+      op = ENCODE (1, 1, 24);
+      return emit_insn (buf, opcode | ENCODE (size, 2, 30) | op
+			| ENCODE (operand.index >> size, 12, 10)
+			| ENCODE (rn.num, 5, 5) | ENCODE (rt.num, 5, 0));
+    case MEMORY_OPERAND_PREINDEX:
+      {
+	uint32_t pre_index = ENCODE (3, 2, 10);
+
+	op = ENCODE (0, 1, 24);
+	return emit_insn (buf, opcode | ENCODE (size, 2, 30) | op
+			  | pre_index | ENCODE (operand.index, 9, 12)
+			  | ENCODE (rn.num, 5, 5) | ENCODE (rt.num, 5, 0));
+      }
+    default:
+      return 0;
+    }
+}
+
+/* Write a LDR instruction into *BUF.
+
+     LDR rt, [rn, #offset]
+     LDR rt, [rn, #index]!
+
+   RT is the destination register.
+   RN is the base address register.
+   OFFSET is the immediate to add to the base address.  It is limited to
+   0 .. 32760 range (12 bits << 3).  */
+
+static int
+emit_ldr (uint32_t *buf, struct aarch64_register rt,
+	  struct aarch64_register rn, struct aarch64_memory_operand operand)
+{
+  return emit_load_store (buf, rt.is64 ? 3 : 2, LDR, rt, rn, operand);
+}
+
+/* Write a LDRSW instruction into *BUF.  The register size is 64-bit.
+
+     LDRSW xt, [rn, #offset]
+     LDRSW xt, [rn, #index]!
+
+   RT is the destination register.
+   RN is the base address register.
+   OFFSET is the immediate to add to the base address.  It is limited to
+   0 .. 16380 range (12 bits << 2).  */
+
+static int
+emit_ldrsw (uint32_t *buf, struct aarch64_register rt,
+	    struct aarch64_register rn, struct aarch64_memory_operand operand)
+{
+  /* LDRSW reads a 32-bit word, so its size field is 2, not 3.  */
+  return emit_load_store (buf, 2, LDRSW, rt, rn, operand);
+}
+
+/* Write a STR instruction into *BUF.
+
+     STR rt, [rn, #offset]
+     STR rt, [rn, #index]!
+
+   RT is the register to store.
+   RN is the base address register.
+   OFFSET is the immediate to add to the base address.  It is limited to
+   0 .. 32760 range (12 bits << 3).  */
+
+static int
+emit_str (uint32_t *buf, struct aarch64_register rt,
+	  struct aarch64_register rn,
+	  struct aarch64_memory_operand operand)
+{
+  return emit_load_store (buf, rt.is64 ? 3 : 2, STR, rt, rn, operand);
+}
+
+/* Helper function emitting an exclusive load or store instruction.  */
+
+static int
+emit_load_store_exclusive (uint32_t *buf, uint32_t size,
+			   enum aarch64_opcodes opcode,
+			   struct aarch64_register rs,
+			   struct aarch64_register rt,
+			   struct aarch64_register rt2,
+			   struct aarch64_register rn)
+{
+  return emit_insn (buf, opcode | ENCODE (size, 2, 30)
+		    | ENCODE (rs.num, 5, 16) | ENCODE (rt2.num, 5, 10)
+		    | ENCODE (rn.num, 5, 5) | ENCODE (rt.num, 5, 0));
+}
+
+/* Write a LDAXR instruction into *BUF.
+
+     LDAXR rt, [xn]
+
+   RT is the destination register.
+   RN is the base address register.  */
+
+static int
+emit_ldaxr (uint32_t *buf, struct aarch64_register rt,
+	    struct aarch64_register rn)
+{
+  return emit_load_store_exclusive (buf, rt.is64 ? 3 : 2, LDAXR, xzr, rt,
+				    xzr, rn);
+}
+
+/* Write a STXR instruction into *BUF.
+
+     STXR ws, rt, [xn]
+
+   RS is the result register, it indicates if the store succeeded or not.
+   RT is the register to store.
+   RN is the base address register.  */
+
+static int
+emit_stxr (uint32_t *buf, struct aarch64_register rs,
+	   struct aarch64_register rt, struct aarch64_register rn)
+{
+  return emit_load_store_exclusive (buf, rt.is64 ? 3 : 2, STXR, rs, rt,
+				    xzr, rn);
+}
+
+/* Write a STLR instruction into *BUF.
+
+     STLR rt, [xn]
+
+   RT is the register to store.
+   RN is the base address register.  */
+
+static int
+emit_stlr (uint32_t *buf, struct aarch64_register rt,
+	   struct aarch64_register rn)
+{
+  return emit_load_store_exclusive (buf, rt.is64 ? 3 : 2, STLR, xzr, rt,
+				    xzr, rn);
+}
+
+/* Helper function for data processing instructions with register sources.  */
+
+static int
+emit_data_processing_reg (uint32_t *buf, enum aarch64_opcodes opcode,
+			  struct aarch64_register rd,
+			  struct aarch64_register rn,
+			  struct aarch64_register rm)
+{
+  uint32_t size = ENCODE (rd.is64, 1, 31);
+
+  return emit_insn (buf, opcode | size | ENCODE (rm.num, 5, 16)
+		    | ENCODE (rn.num, 5, 5) | ENCODE (rd.num, 5, 0));
+}
+
+/* Helper function for data processing instructions taking either a register
+   or an immediate.  */
+
+static int
+emit_data_processing (uint32_t *buf, enum aarch64_opcodes opcode,
+		      struct aarch64_register rd,
+		      struct aarch64_register rn,
+		      struct aarch64_operand operand)
+{
+  uint32_t size = ENCODE (rd.is64, 1, 31);
+  /* The opcode is different for register and immediate source operands.  */
+  uint32_t operand_opcode;
+
+  if (operand.type == OPERAND_IMMEDIATE)
+    {
+      /* xxx1 000x xxxx xxxx xxxx xxxx xxxx xxxx */
+      operand_opcode = ENCODE (8, 4, 25);
+
+      return emit_insn (buf, opcode | operand_opcode | size
+			| ENCODE (operand.imm, 12, 10)
+			| ENCODE (rn.num, 5, 5) | ENCODE (rd.num, 5, 0));
+    }
+  else
+    {
+      /* xxx0 101x xxxx xxxx xxxx xxxx xxxx xxxx */
+      operand_opcode = ENCODE (5, 4, 25);
+
+      return emit_data_processing_reg (buf, opcode | operand_opcode, rd,
+				       rn, operand.reg);
+    }
+}
+
+/* Write an ADD instruction into *BUF.
+
+     ADD rd, rn, #imm
+     ADD rd, rn, rm
+
+   This function handles both an immediate and register add.
+
+   RD is the destination register.
+   RN is the input register.
+   OPERAND is the source operand, either of type OPERAND_IMMEDIATE or
+   OPERAND_REGISTER.  */
+
+static int
+emit_add (uint32_t *buf, struct aarch64_register rd,
+	  struct aarch64_register rn, struct aarch64_operand operand)
+{
+  return emit_data_processing (buf, ADD, rd, rn, operand);
+}
+
+/* Write a SUB instruction into *BUF.
+
+     SUB rd, rn, #imm
+     SUB rd, rn, rm
+
+   This function handles both an immediate and register sub.
+
+   RD is the destination register.
+   RN is the input register.
+   OPERAND is the immediate or register to subtract from RN.  */
+
+static int
+emit_sub (uint32_t *buf, struct aarch64_register rd,
+	  struct aarch64_register rn, struct aarch64_operand operand)
+{
+  return emit_data_processing (buf, SUB, rd, rn, operand);
+}
+
+/* Emit a MOV instruction at *BUF, in one of these two forms:
+
+     MOV rd, #imm
+     MOV rd, rm
+
+   An immediate source produces a wide immediate move of the low 16 bits
+   of the immediate, without any shift.  A register source is emitted as
+   "ADD rd, rm, #0".  The source register must not be xzr: xzr and the
+   stack pointer share the same encoding, and this function only
+   supports the stack pointer.
+
+   RD is the destination register.
+   OPERAND is the source operand, either of type OPERAND_IMMEDIATE or
+   OPERAND_REGISTER.  */
+
+static int
+emit_mov (uint32_t *buf, struct aarch64_register rd,
+	  struct aarch64_operand operand)
+{
+  uint32_t size_bit;
+  uint32_t imm_shift;
+
+  /* Register move: let emit_add build "ADD rd, rm, #0".  */
+  if (operand.type != OPERAND_IMMEDIATE)
+    return emit_add (buf, rd, operand.reg, immediate_operand (0));
+
+  size_bit = ENCODE (rd.is64, 1, 31);
+  /* The immediate is placed in the register unshifted.  */
+  imm_shift = ENCODE (0, 2, 21);
+
+  return emit_insn (buf, MOV | size_bit | imm_shift
+		    | ENCODE (operand.imm, 16, 5)
+		    | ENCODE (rd.num, 5, 0));
+}
+
+/* Emit a MOVK instruction at *BUF.
+
+     MOVK rd, #imm, lsl #shift
+
+   RD is the destination register.
+   IMM is the immediate; it is encoded in a 16-bit field.
+   SHIFT selects the left shift applied to IMM.  It is encoded as is in
+   a 2-bit field; emit_mov_addr passes 1, 2 and 3 for shifts of 16, 32
+   and 48 bits respectively.  */
+
+static int
+emit_movk (uint32_t *buf, struct aarch64_register rd, uint32_t imm,
+	   unsigned shift)
+{
+  uint32_t insn = MOVK;
+
+  insn |= ENCODE (rd.is64, 1, 31);
+  insn |= ENCODE (shift, 2, 21);
+  insn |= ENCODE (imm, 16, 5);
+  insn |= ENCODE (rd.num, 5, 0);
+
+  return emit_insn (buf, insn);
+}
+
+/* Emit at *BUF the instructions needed to load ADDR into register RD.
+   ADDR can be a 64-bit value.
+
+   The sequence is one MOV followed by up to three MOVK instructions:
+
+     MOV  xd, #(addr)
+     MOVK xd, #(addr >> 16), lsl #16
+     MOVK xd, #(addr >> 32), lsl #32
+     MOVK xd, #(addr >> 48), lsl #48
+
+   Emission stops as soon as all the remaining upper bits of ADDR are
+   zero, so small addresses take fewer instructions.
+
+   Return the number of instructions written.  */
+
+static int
+emit_mov_addr (uint32_t *buf, struct aarch64_register rd, CORE_ADDR addr)
+{
+  uint32_t *next = buf;
+
+  /* The MOV (wide immediate) instruction clears the top bits of the
+     register, so nothing more is needed when they are zero in ADDR.  */
+  next += emit_mov (next, rd, immediate_operand (addr & 0xffff));
+
+  if ((addr >> 16) == 0)
+    return next - buf;
+
+  next += emit_movk (next, rd, (addr >> 16) & 0xffff, 1);
+
+  if ((addr >> 32) == 0)
+    return next - buf;
+
+  next += emit_movk (next, rd, (addr >> 32) & 0xffff, 2);
+
+  if ((addr >> 48) != 0)
+    next += emit_movk (next, rd, (addr >> 48) & 0xffff, 3);
+
+  return next - buf;
+}
+
+/* Emit a MRS instruction at *BUF.  The register size is 64-bit.
+
+     MRS xt, system_reg
+
+   RT is the destination register.
+   SYSTEM_REG is the special purpose register to read; its value is
+   encoded in a 15-bit field starting at bit 5.  */
+
+static int
+emit_mrs (uint32_t *buf, struct aarch64_register rt,
+	  enum aarch64_system_control_registers system_reg)
+{
+  uint32_t insn = MRS;
+
+  insn |= ENCODE (system_reg, 15, 5);
+  insn |= ENCODE (rt.num, 5, 0);
+
+  return emit_insn (buf, insn);
+}
+
+/* Emit a MSR instruction at *BUF.  The register size is 64-bit.
+
+     MSR system_reg, xt
+
+   SYSTEM_REG is the special purpose register to write; its value is
+   encoded in a 15-bit field starting at bit 5.
+   RT is the input register.  */
+
+static int
+emit_msr (uint32_t *buf, enum aarch64_system_control_registers system_reg,
+	  struct aarch64_register rt)
+{
+  uint32_t insn = MSR;
+
+  insn |= ENCODE (system_reg, 15, 5);
+  insn |= ENCODE (rt.num, 5, 0);
+
+  return emit_insn (buf, insn);
+}
+
+/* Emit a SEVL instruction at *BUF.
+
+   SEVL is a hint instruction asking the hardware to trigger an event.
+   The jump pad issues it before its first WFE so that this first wait
+   does not block.  */
+
+static int
+emit_sevl (uint32_t *buf)
+{
+  int written = emit_insn (buf, SEVL);
+
+  return written;
+}
+
+/* Emit a WFE instruction at *BUF.
+
+   WFE is a hint instruction asking the hardware to wait for an
+   event.  */
+
+static int
+emit_wfe (uint32_t *buf)
+{
+  int written = emit_insn (buf, WFE);
+
+  return written;
+}
+
+/* Copy the LEN instructions held in BUF to the inferior memory at *TO,
+   then advance *TO by the number of bytes written.
+
+   Instructions are always little endian on AArch64, unlike data.  On a
+   big endian host each instruction is therefore converted with htole32
+   into a temporary buffer before being written.  */
+
+static void
+append_insns (CORE_ADDR *to, size_t len, const uint32_t *buf)
+{
+  const size_t nbytes = sizeof (uint32_t) * len;
+#if (__BYTE_ORDER == __BIG_ENDIAN)
+  uint32_t *swapped = xmalloc (nbytes);
+  size_t idx = 0;
+
+  while (idx < len)
+    {
+      swapped[idx] = htole32 (buf[idx]);
+      idx++;
+    }
+
+  write_inferior_memory (*to, (const unsigned char *) swapped, nbytes);
+
+  xfree (swapped);
+#else
+  write_inferior_memory (*to, (const unsigned char *) buf, nbytes);
+#endif
+
+  *to += nbytes;
+}
+
+/* Helper function.  Return 1 if VAL fits in a signed (two's complement)
+   field that is BITS bits wide, that is if
+
+     -2^(BITS - 1) <= VAL < 2^(BITS - 1)
+
+   and 0 otherwise.
+
+   BITS includes the sign bit.  For example the byte offset of a B
+   instruction is a 26-bit immediate scaled by 4, hence a 28-bit signed
+   field: only offsets in [-128MB, +128MB) are accepted.
+
+   A field of width 0 can only hold 0, and any int32_t fits as soon as
+   BITS reaches 32.  */
+
+static int
+can_encode_int32 (int32_t val, unsigned bits)
+{
+  int64_t limit;
+
+  if (bits == 0)
+    return val == 0;
+
+  if (bits >= 32)
+    return 1;
+
+  /* Compare against explicit bounds instead of shifting VAL: right
+     shifting a negative value is implementation-defined, and shifting
+     by BITS rather than BITS - 1 would accept twice the valid range.  */
+  limit = (int64_t) 1 << (bits - 1);
+
+  return val >= -limit && val < limit;
+}
+
+/* Relocate the instruction found at OLDLOC so that it can execute at
+   *TO, write the result to the inferior and advance *TO by the number
+   of bytes written.
+
+   Instructions which are not PC relative are copied unchanged.  The
+   following PC relative instructions are rewritten for the new
+   location:
+
+   - B/BL
+   - B.COND
+   - CBZ/CBNZ
+   - TBZ/TBNZ
+   - ADR/ADRP
+   - LDR/LDRSW (literal)
+
+   When a branch target is out of reach from the new location, nothing
+   is written and *TO is left untouched; callers detect the failure by
+   noticing that *TO did not move.  */
+
+static void
+aarch64_relocate_instruction (CORE_ADDR *to, CORE_ADDR oldloc)
+{
+  uint32_t insns[32];
+  uint32_t *next = insns;
+  uint32_t insn;
+
+  /* Fields filled in by the decoders below.  */
+  int is_bl, is64, is_sw, is_cbnz, is_tbnz, is_adrp;
+  unsigned rn, rt, rd, cond, bit;
+  int32_t offset;
+
+  target_read_uint32 (oldloc, &insn);
+
+  if (aarch64_decode_b (oldloc, insn, &is_bl, &offset))
+    {
+      /* Make the offset relative to the new location.  */
+      offset = (oldloc - *to + offset);
+
+      if (!can_encode_int32 (offset, 28))
+	return;
+
+      next += emit_b (next, is_bl, offset);
+    }
+  else if (aarch64_decode_bcond (oldloc, insn, &cond, &offset))
+    {
+      offset = (oldloc - *to + offset);
+
+      if (can_encode_int32 (offset, 21))
+	next += emit_bcond (next, cond, offset);
+      else
+	{
+	  if (!can_encode_int32 (offset, 28))
+	    return;
+
+	  /* The target is too far for a conditional branch but still
+	     within reach of an unconditional one.  Emit this sequence
+	     instead:
+
+	       B.COND TAKEN    ; If cond is true, then jump to TAKEN.
+	       B NOT_TAKEN     ; Else jump over TAKEN and continue.
+	     TAKEN:
+	       B #(offset - 8)
+	     NOT_TAKEN:
+
+	     */
+	  next += emit_bcond (next, cond, 8);
+	  next += emit_b (next, 0, 8);
+	  next += emit_b (next, 0, offset - 8);
+	}
+    }
+  else if (aarch64_decode_cb (oldloc, insn, &is64, &is_cbnz, &rn, &offset))
+    {
+      offset = (oldloc - *to + offset);
+
+      if (can_encode_int32 (offset, 21))
+	next += emit_cb (next, is_cbnz, aarch64_register (rn, is64), offset);
+      else
+	{
+	  if (!can_encode_int32 (offset, 28))
+	    return;
+
+	  /* The target is too far for a compare and branch instruction
+	     but still within reach of an unconditional branch.  Emit
+	     this sequence instead:
+
+	       CBZ xn, TAKEN   ; xn == 0, then jump to TAKEN.
+	       B NOT_TAKEN     ; Else jump over TAKEN and continue.
+	     TAKEN:
+	       B #(offset - 8)
+	     NOT_TAKEN:
+
+	     */
+	  next += emit_cb (next, is_cbnz, aarch64_register (rn, is64), 8);
+	  next += emit_b (next, 0, 8);
+	  next += emit_b (next, 0, offset - 8);
+	}
+    }
+  else if (aarch64_decode_tb (oldloc, insn, &is_tbnz, &bit, &rt, &offset))
+    {
+      offset = (oldloc - *to + offset);
+
+      if (can_encode_int32 (offset, 16))
+	next += emit_tb (next, is_tbnz, bit, aarch64_register (rt, 1), offset);
+      else
+	{
+	  if (!can_encode_int32 (offset, 28))
+	    return;
+
+	  /* The target is too far for a test bit and branch instruction
+	     but still within reach of an unconditional branch.  Emit
+	     this sequence instead:
+
+	       TBZ xn, #bit, TAKEN ; xn[bit] == 0, then jump to TAKEN.
+	       B NOT_TAKEN         ; Else jump over TAKEN and continue.
+	     TAKEN:
+	       B #(offset - 8)
+	     NOT_TAKEN:
+
+	     */
+	  next += emit_tb (next, is_tbnz, bit, aarch64_register (rt, 1), 8);
+	  next += emit_b (next, 0, 8);
+	  next += emit_b (next, 0, offset - 8);
+	}
+    }
+  else if (aarch64_decode_adr (oldloc, insn, &is_adrp, &rd, &offset))
+    {
+      /* The address computed by the ADR{P,} instruction is known
+	 exactly, so it can be loaded directly into the destination
+	 register.  */
+      CORE_ADDR address = oldloc + offset;
+
+      /* For ADRP, clear the lower 12 bits to get the 4K page.  */
+      if (is_adrp)
+	address &= ~0xfff;
+
+      next += emit_mov_addr (next, aarch64_register (rd, 1), address);
+    }
+  else if (aarch64_decode_ldr_literal (oldloc, insn, &is_sw, &is64, &rt,
+				       &offset))
+    {
+      /* Both the address to load from and a usable register are known.
+	 The destination register first receives the address, then is
+	 loaded through itself:
+
+	   MOV xd, #(oldloc + offset)
+	   MOVK xd, #((oldloc + offset) >> 16), lsl #16
+	   ...
+
+	   LDR xd, [xd] ; or LDRSW xd, [xd]
+
+	 */
+      CORE_ADDR address = oldloc + offset;
+
+      next += emit_mov_addr (next, aarch64_register (rt, 1), address);
+
+      if (!is_sw)
+	next += emit_ldr (next, aarch64_register (rt, is64),
+			  aarch64_register (rt, 1),
+			  offset_memory_operand (0));
+      else
+	next += emit_ldrsw (next, aarch64_register (rt, 1),
+			    aarch64_register (rt, 1),
+			    offset_memory_operand (0));
+    }
+  else
+    {
+      /* Not a PC relative instruction: copy it as is to the new
+	 location.  */
+      next += emit_insn (next, insn);
+    }
+
+  append_insns (to, next - insns, insns);
+}
+
+/* Implementation of linux_target_ops method
+   "install_fast_tracepoint_jump_pad".
+
+   Build a jump pad in the inferior, starting at *JUMP_ENTRY, for the
+   fast tracepoint located at TPADDR.
+
+   TPOINT is the address of the tracepoint object; it is stored in the
+   collecting_t object and passed as first argument to the collector.
+   TPADDR is the address of the instruction being replaced; it is saved
+   as the PC.
+   COLLECTOR is the address of the function called from the pad.
+   LOCKADDR is the address of the lock shared by all tracing threads.
+   ORIG_SIZE is the size of the replaced instruction; the pad branches
+   back to TPADDR + ORIG_SIZE.
+   *JUMP_ENTRY is updated to the end of the pad on success only.
+   TRAMPOLINE and TRAMPOLINE_SIZE are not used by this implementation.
+   JJUMP_PAD_INSN receives the branch instruction to write at TPADDR and
+   *JJUMP_PAD_INSN_SIZE its size.
+   *ADJUSTED_INSN_ADDR and *ADJUSTED_INSN_ADDR_END receive the bounds of
+   the relocated instruction inside the pad.
+
+   Return 0 on success.  On failure return 1 and write a message into
+   ERR.  */
+
+static int
+aarch64_install_fast_tracepoint_jump_pad (CORE_ADDR tpoint,
+					  CORE_ADDR tpaddr,
+					  CORE_ADDR collector,
+					  CORE_ADDR lockaddr,
+					  ULONGEST orig_size,
+					  CORE_ADDR *jump_entry,
+					  CORE_ADDR *trampoline,
+					  ULONGEST *trampoline_size,
+					  unsigned char *jjump_pad_insn,
+					  ULONGEST *jjump_pad_insn_size,
+					  CORE_ADDR *adjusted_insn_addr,
+					  CORE_ADDR *adjusted_insn_addr_end,
+					  char *err)
+{
+  uint32_t buf[256];
+  uint32_t *p = buf;
+  int32_t offset;
+  int i;
+  CORE_ADDR buildaddr = *jump_entry;
+
+  /* We need to save the current state on the stack both to restore it
+     later and to collect register values when the tracepoint is hit.
+
+     The saved registers are pushed in a layout that needs to be in sync
+     with aarch64_ft_collect_regmap (see linux-aarch64-ipa.c).  Later on
+     the supply_fast_tracepoint_registers function will fill in the
+     register cache from a pointer to saved registers on the stack we build
+     here.
+
+     For simplicity, we set the size of each cell on the stack to 16 bytes.
+     This way one cell can hold any register type, from system registers
+     to the 128 bit SIMD&FP registers.  Furthermore, the stack pointer
+     has to be 16 bytes aligned anyway.
+
+     Note that the CPSR register does not exist on AArch64.  Instead we
+     can access system bits describing the process state with the
+     MRS/MSR instructions, namely the condition flags.  We save them as
+     if they are part of a CPSR register because that's how GDB
+     interprets these system bits.  At the moment, only the condition
+     flags are saved in CPSR (NZCV).
+
+     Stack layout, each cell is 16 bytes (descending):
+
+     High *-------- SIMD&FP registers from 31 down to 0. --------*
+	  | q31                                                  |
+	  .                                                      .
+	  .                                                      . 32 cells
+	  .                                                      .
+	  | q0                                                   |
+	  *---- General purpose registers from 30 down to 0. ----*
+	  | x30                                                  |
+	  .                                                      .
+	  .                                                      . 31 cells
+	  .                                                      .
+	  | x0                                                   |
+	  *------------- Special purpose registers. -------------*
+	  | SP                                                   |
+	  | PC                                                   |
+	  | CPSR (NZCV)                                          | 5 cells
+	  | FPSR                                                 |
+	  | FPCR                                                 | <- SP + 16
+	  *------------- collecting_t object --------------------*
+	  | TPIDR_EL0               | struct tracepoint *        |
+     Low  *------------------------------------------------------*
+
+     After this stack is set up, we issue a call to the collector, passing
+     it the saved registers at (SP + 16).  */
+
+  /* Push SIMD&FP registers on the stack:
+
+       SUB sp, sp, #(32 * 16)
+
+       STP q30, q31, [sp, #(30 * 16)]
+       ...
+       STP q0, q1, [sp]
+
+     */
+  p += emit_sub (p, sp, sp, immediate_operand (32 * 16));
+  for (i = 30; i >= 0; i -= 2)
+    p += emit_stp_q_offset (p, i, i + 1, sp, i * 16);
+
+  /* Push general purpose registers on the stack.  Note that we do not need
+     to push x31 as it represents the xzr register and not the stack
+     pointer in a STR instruction.
+
+       SUB sp, sp, #(31 * 16)
+
+       STR x30, [sp, #(30 * 16)]
+       ...
+       STR x0, [sp]
+
+     */
+  p += emit_sub (p, sp, sp, immediate_operand (31 * 16));
+  for (i = 30; i >= 0; i -= 1)
+    p += emit_str (p, aarch64_register (i, 1), sp,
+		   offset_memory_operand (i * 16));
+
+  /* Make space for 5 more cells.
+
+       SUB sp, sp, #(5 * 16)
+
+     */
+  p += emit_sub (p, sp, sp, immediate_operand (5 * 16));
+
+
+  /* Save SP:
+
+       ADD x4, sp, #((32 + 31 + 5) * 16)
+       STR x4, [sp, #(4 * 16)]
+
+     */
+  p += emit_add (p, x4, sp, immediate_operand ((32 + 31 + 5) * 16));
+  p += emit_str (p, x4, sp, offset_memory_operand (4 * 16));
+
+  /* Save PC (tracepoint address):
+
+       MOV  x3, #(tpaddr)
+       ...
+
+       STR x3, [sp, #(3 * 16)]
+
+     */
+
+  p += emit_mov_addr (p, x3, tpaddr);
+  p += emit_str (p, x3, sp, offset_memory_operand (3 * 16));
+
+  /* Save CPSR (NZCV), FPSR and FPCR:
+
+       MRS x2, nzcv
+       MRS x1, fpsr
+       MRS x0, fpcr
+
+       STR x2, [sp, #(2 * 16)]
+       STR x1, [sp, #(1 * 16)]
+       STR x0, [sp, #(0 * 16)]
+
+     */
+  p += emit_mrs (p, x2, NZCV);
+  p += emit_mrs (p, x1, FPSR);
+  p += emit_mrs (p, x0, FPCR);
+  p += emit_str (p, x2, sp, offset_memory_operand (2 * 16));
+  p += emit_str (p, x1, sp, offset_memory_operand (1 * 16));
+  p += emit_str (p, x0, sp, offset_memory_operand (0 * 16));
+
+  /* Push the collecting_t object.  It consists of the address of the
+     tracepoint and an ID for the current thread.  We get the latter by
+     reading the tpidr_el0 system register.  It corresponds to the
+     NT_ARM_TLS register accessible with ptrace.
+
+       MOV x0, #(tpoint)
+       ...
+
+       MRS x1, tpidr_el0
+
+       STP x0, x1, [sp, #-16]!
+
+     */
+
+  p += emit_mov_addr (p, x0, tpoint);
+  p += emit_mrs (p, x1, TPIDR_EL0);
+  p += emit_stp (p, x0, x1, sp, preindex_memory_operand (-16));
+
+  /* Spin-lock:
+
+     The shared memory for the lock is at lockaddr.  It will hold zero
+     if no-one is holding the lock, otherwise it contains the address of
+     the collecting_t object on the stack of the thread which acquired it.
+
+     At this stage, the stack pointer points to this thread's collecting_t
+     object.
+
+     We use the following registers:
+     - x0: Address of the lock.
+     - x1: Pointer to collecting_t object.
+     - x2: Scratch register.
+
+       MOV x0, #(lockaddr)
+       ...
+       MOV x1, sp
+
+       ; Trigger an event local to this core.  So the following WFE
+       ; instruction is ignored.
+       SEVL
+     again:
+       ; Wait for an event.  The event is triggered by either the SEVL
+       ; or STLR instructions (store release).
+       WFE
+
+       ; Atomically read at lockaddr.  This marks the memory location as
+       ; exclusive.  This instruction also has memory constraints which
+       ; make sure all previous data reads and writes are done before
+       ; executing it.
+       LDAXR x2, [x0]
+
+       ; Try again if another thread holds the lock.
+       CBNZ x2, again
+
+       ; We can lock it!  Write the address of the collecting_t object.
+       ; This instruction will fail if the memory location is not marked
+       ; as exclusive anymore.  If it succeeds, it will remove the
+       ; exclusive mark on the memory location.  This way, if another
+       ; thread executes this instruction before us, we will fail and try
+       ; all over again.
+       STXR w2, x1, [x0]
+       CBNZ w2, again
+
+     */
+
+  p += emit_mov_addr (p, x0, lockaddr);
+  p += emit_mov (p, x1, register_operand (sp));
+
+  p += emit_sevl (p);
+  p += emit_wfe (p);
+  p += emit_ldaxr (p, x2, x0);
+  /* The lock holds a 64-bit pointer loaded into x2, so all 64 bits must
+     be tested: a non-zero lock value whose low 32 bits are all zero
+     would otherwise be mistaken for a free lock.  */
+  p += emit_cb (p, 1, x2, -2 * 4);
+  p += emit_stxr (p, w2, x1, x0);
+  /* The status of the exclusive store is written to w2.  */
+  p += emit_cb (p, 1, w2, -4 * 4);
+
+  /* Call collector (struct tracepoint *, unsigned char *):
+
+       MOV x0, #(tpoint)
+       ...
+
+       ; Saved registers start after the collecting_t object.
+       ADD x1, sp, #16
+
+       ; We use an intra-procedure-call scratch register.
+       MOV ip0, #(collector)
+       ...
+
+       ; And call back to C!
+       BLR ip0
+
+     */
+
+  p += emit_mov_addr (p, x0, tpoint);
+  p += emit_add (p, x1, sp, immediate_operand (16));
+
+  p += emit_mov_addr (p, ip0, collector);
+  p += emit_blr (p, ip0);
+
+  /* Release the lock.
+
+       MOV x0, #(lockaddr)
+       ...
+
+       ; This instruction is a normal store with memory ordering
+       ; constraints.  Thanks to this we do not have to put a data
+       ; barrier instruction to make sure all data reads and writes are
+       ; done before this instruction is executed.  Furthermore, this
+       ; instruction will trigger an event, letting other threads know
+       ; they can grab the lock.
+       STLR xzr, [x0]
+
+     */
+  p += emit_mov_addr (p, x0, lockaddr);
+  p += emit_stlr (p, xzr, x0);
+
+  /* Free collecting_t object:
+
+       ADD sp, sp, #16
+
+     */
+  p += emit_add (p, sp, sp, immediate_operand (16));
+
+  /* Restore CPSR (NZCV), FPSR and FPCR.  And free all special purpose
+     registers from the stack.
+
+       LDR x2, [sp, #(2 * 16)]
+       LDR x1, [sp, #(1 * 16)]
+       LDR x0, [sp, #(0 * 16)]
+
+       MSR NZCV, x2
+       MSR FPSR, x1
+       MSR FPCR, x0
+
+       ADD sp, sp, #(5 * 16)
+
+     */
+  p += emit_ldr (p, x2, sp, offset_memory_operand (2 * 16));
+  p += emit_ldr (p, x1, sp, offset_memory_operand (1 * 16));
+  p += emit_ldr (p, x0, sp, offset_memory_operand (0 * 16));
+  p += emit_msr (p, NZCV, x2);
+  p += emit_msr (p, FPSR, x1);
+  p += emit_msr (p, FPCR, x0);
+
+  p += emit_add (p, sp, sp, immediate_operand (5 * 16));
+
+  /* Pop general purpose registers:
+
+       LDR x0, [sp]
+       ...
+       LDR x30, [sp, #(30 * 16)]
+
+       ADD sp, sp, #(31 * 16)
+
+     */
+  for (i = 0; i <= 30; i += 1)
+    p += emit_ldr (p, aarch64_register (i, 1), sp,
+		   offset_memory_operand (i * 16));
+  p += emit_add (p, sp, sp, immediate_operand (31 * 16));
+
+  /* Pop SIMD&FP registers:
+
+       LDP q0, q1, [sp]
+       ...
+       LDP q30, q31, [sp, #(30 * 16)]
+
+       ADD sp, sp, #(32 * 16)
+
+     */
+  for (i = 0; i <= 30; i += 2)
+    p += emit_ldp_q_offset (p, i, i + 1, sp, i * 16);
+  p += emit_add (p, sp, sp, immediate_operand (32 * 16));
+
+  /* Write the code into the inferior memory.  */
+  append_insns (&buildaddr, p - buf, buf);
+
+  /* Now emit the relocated instruction.  */
+  *adjusted_insn_addr = buildaddr;
+  aarch64_relocate_instruction (&buildaddr, tpaddr);
+  *adjusted_insn_addr_end = buildaddr;
+
+  /* We may not have been able to relocate the instruction.  */
+  if (*adjusted_insn_addr == *adjusted_insn_addr_end)
+    {
+      sprintf (err,
+	       "E.Could not relocate instruction from %s to %s.",
+	       core_addr_to_string_nz (tpaddr),
+	       core_addr_to_string_nz (buildaddr));
+      return 1;
+    }
+
+  /* Go back to the start of the buffer.  */
+  p = buf;
+
+  /* Emit a branch back from the jump pad.  */
+  offset = (tpaddr + orig_size - buildaddr);
+  if (!can_encode_int32 (offset, 28))
+    {
+      sprintf (err,
+	       "E.Jump back from jump pad too far from tracepoint "
+	       "(offset 0x%" PRIx32 " cannot be encoded in 28 bits).",
+	       offset);
+      return 1;
+    }
+
+  p += emit_b (p, 0, offset);
+  append_insns (&buildaddr, p - buf, buf);
+
+  /* Give the caller a branch instruction into the jump pad.  */
+  offset = (*jump_entry - tpaddr);
+  if (!can_encode_int32 (offset, 28))
+    {
+      sprintf (err,
+	       "E.Jump pad too far from tracepoint "
+	       "(offset 0x%" PRIx32 " cannot be encoded in 28 bits).",
+	       offset);
+      return 1;
+    }
+
+  emit_b ((uint32_t *) jjump_pad_insn, 0, offset);
+  *jjump_pad_insn_size = 4;
+
+  /* Return the end address of our pad.  */
+  *jump_entry = buildaddr;
+
+  return 0;
+}
+
+/* Implementation of linux_target_ops method
+   "get_min_fast_tracepoint_insn_len".
+
+   Return 4, which is also the size reported for the branch handed back
+   by aarch64_install_fast_tracepoint_jump_pad.  */
+
+static int
+aarch64_get_min_fast_tracepoint_insn_len (void)
+{
+  const int min_insn_len = 4;
+
+  return min_insn_len;
+}
+
 /* Implementation of linux_target_ops method "supports_range_stepping".  */
 
 static int
@@ -609,10 +2069,10 @@ struct linux_target_ops the_low_target =
   aarch64_linux_prepare_to_resume,
   NULL, /* process_qsupported */
   aarch64_supports_tracepoints,
-  NULL, /* get_thread_area */
-  NULL, /* install_fast_tracepoint_jump_pad */
+  aarch64_get_thread_area,
+  aarch64_install_fast_tracepoint_jump_pad,
   NULL, /* emit_ops */
-  NULL, /* get_min_fast_tracepoint_insn_len */
+  aarch64_get_min_fast_tracepoint_insn_len,
   aarch64_supports_range_stepping,
 };
 
-- 
1.9.1


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]