This is the mail archive of the gdb@sourceware.org mailing list for the GDB project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: Note on choosing string hash functions


On 11/22/2017 02:10 AM, Pedro Alves wrote:
> On 11/17/2017 01:42 PM, Pedro Alves wrote:
> 
> Then, I experimented with Ada/gnat, using both Latin-1 and UTF-8 source
> files (the latter with "pragma Wide_Character_Encoding (UTF8)"), and
> what I discovered was that Ada's encoding/mangling guarantees that only
> ASCII characters end up in mangled names.  From gcc/ada/namet.ads:
> 
> ~~~
> --    Identifiers        Stored with upper case letters folded to lower case.
> --                       Upper half (16#80# bit set) and wide characters are
> --                       stored in an encoded form (Uhh for upper half char,
> --                       Whhhh for wide characters, WWhhhhhhhh as provided by
> --                       the routine Append_Encoded, where hh are hex
> --                       digits for the character code using lower case a-f).
> --                       Normally the use of U or W in other internal names is
> --                       avoided, but these letters may be used in internal
> --                       names (without this special meaning), if they appear
> --                       as the last character of the name, or they are
> --                       followed by an upper case letter (other than the WW
> --                       sequence), or an underscore.
> ~~~
> 
> Funny enough, GDB doesn't grok this Uhh/WWhhhhhhhh encoding today.
> (I wrote a quick patch to teach GDB about it, to help convince myself,
> though as is, it only works when gdb's charset/locale is UTF-8.)

For the record, here's what that patch looks like.

>From 710bde831ed78641e175046e0711a35d5061d7ee Mon Sep 17 00:00:00 2001
From: Pedro Alves <palves@redhat.com>
Date: Tue, 21 Nov 2017 20:05:42 +0000
Subject: [PATCH] Ada: Support Uhh encoding, UTF-8

An attempt at checking whether using TOLOWER for minsyms makes a
difference over tolower...

It doesn't: Ada's encoding encodes "upper half char"s as Uhh (hh being
two hex digits), so non-ASCII characters don't appear in the mangled
names...

The Ada lexer change is necessary so that it's possible to input UTF-8
in expressions.

This assumes the host encoding is UTF-8 as is...  I wonder... maybe
GDB should always use UTF-8 internally, and translate host-encoding ->
UTF-8 at the readline -> GDB boundary.

Yes, the test passes.  :-)
---
 gdb/ada-lang.c                     | 30 +++++++++++++++++++++
 gdb/ada-lex.l                      |  2 +-
 gdb/common/rsp-low.c               |  2 +-
 gdb/common/rsp-low.h               |  4 +++
 gdb/testsuite/gdb.ada/utf8.exp     | 53 ++++++++++++++++++++++++++++++++++++++
 gdb/testsuite/gdb.ada/utf8/foo.adb | 25 ++++++++++++++++++
 gdb/testsuite/gdb.ada/utf8/pck.adb | 26 +++++++++++++++++++
 gdb/testsuite/gdb.ada/utf8/pck.ads | 22 ++++++++++++++++
 8 files changed, 162 insertions(+), 2 deletions(-)
 create mode 100644 gdb/testsuite/gdb.ada/utf8.exp
 create mode 100644 gdb/testsuite/gdb.ada/utf8/foo.adb
 create mode 100644 gdb/testsuite/gdb.ada/utf8/pck.adb
 create mode 100644 gdb/testsuite/gdb.ada/utf8/pck.ads

diff --git a/gdb/ada-lang.c b/gdb/ada-lang.c
index 33c4e8e..d0fb06d 100644
--- a/gdb/ada-lang.c
+++ b/gdb/ada-lang.c
@@ -63,6 +63,7 @@
 #include "common/function-view.h"
 #include "common/byte-vector.h"
 #include <algorithm>
+#include "common/rsp-low.h"
 
 /* Define whether or not the C operator '/' truncates towards zero for
    differently signed operands (truncation direction is undefined in C).
@@ -1007,6 +1008,19 @@ ada_encode_1 (const char *decoded, bool throw_errors)
           encoding_buffer[k] = encoding_buffer[k + 1] = '_';
           k += 2;
         }
+      else if (((unsigned char) *p & 0xe0) == 0xc0)
+	{
+	  /* 2-byte UTF-8 character -> "Uhh" Ada encoding.  */
+
+	  unsigned char c1 = p[0];
+	  unsigned char c2 = p[1];
+	  unsigned char c = (c1 << 6) | (c2 & (0xff >> 2));
+	  p += 1;
+
+	  encoding_buffer[k] = 'U';
+	  pack_hex_byte (&encoding_buffer[k + 1], c);
+	  k += 3;
+	}
       else if (*p == '"')
         {
           const struct ada_opname_map *mapping;
@@ -1355,6 +1369,8 @@ ada_decode (const char *encoded)
             i++;
         }
 
+      std::pair<int, int> nibbles;
+
       if (encoded[i] == 'X' && i != 0 && isalnum (encoded[i - 1]))
         {
           /* This is a X[bn]* sequence not separated from the previous
@@ -1378,6 +1394,20 @@ ada_decode (const char *encoded)
           i += 2;
           j += 1;
         }
+      else if (len0 - i > 3
+	       && encoded[i] == 'U'
+	       && ishex (encoded[i + 1], &nibbles.first)
+	       && ishex (encoded[i + 2], &nibbles.second))
+	{
+	  /* Convert Ada upper half char encoding to a UTF-8 character
+	     (2-byte sequence).  */
+	  unsigned char c = nibbles.first << 4 | nibbles.second;
+
+	  decoded[j] = 0xc0 | c >> 6;
+	  decoded[j + 1] = 0x80 | (c & 0x03f);
+	  i += 3;
+	  j += 2;
+	}
       else
         {
           /* It's a character part of the decoded name, so just copy it
diff --git a/gdb/ada-lex.l b/gdb/ada-lex.l
index 63137bd..41b0582 100644
--- a/gdb/ada-lex.l
+++ b/gdb/ada-lex.l
@@ -29,7 +29,7 @@ NUM10	({DIG}({DIG}|_)*)
 HEXDIG	[0-9a-f]
 NUM16	({HEXDIG}({HEXDIG}|_)*)
 OCTDIG	[0-7]
-LETTER	[a-z_]
+LETTER	[a-z_\x80-\xff]
 ID	({LETTER}({LETTER}|{DIG})*|"<"{LETTER}({LETTER}|{DIG})*">")
 WHITE	[ \t\n]
 TICK	("'"{WHITE}*)
diff --git a/gdb/common/rsp-low.c b/gdb/common/rsp-low.c
index 85987f7..3209693 100644
--- a/gdb/common/rsp-low.c
+++ b/gdb/common/rsp-low.c
@@ -50,7 +50,7 @@ tohex (int nib)
 
 static const char hexchars[] = "0123456789abcdef";
 
-static int
+int
 ishex (int ch, int *val)
 {
   if ((ch >= 'a') && (ch <= 'f'))
diff --git a/gdb/common/rsp-low.h b/gdb/common/rsp-low.h
index 99dc93f..947ee20 100644
--- a/gdb/common/rsp-low.h
+++ b/gdb/common/rsp-low.h
@@ -20,6 +20,10 @@
 #ifndef COMMON_RSP_LOW_H
 #define COMMON_RSP_LOW_H
 
+/* Return nonzero if CH is a hex digit, storing its value in *VAL (FIXME: confirm wording).  */
+
+extern int ishex (int ch, int *val);
+
 /* Convert hex digit A to a number, or throw an exception.  */
 
 extern int fromhex (int a);
diff --git a/gdb/testsuite/gdb.ada/utf8.exp b/gdb/testsuite/gdb.ada/utf8.exp
new file mode 100644
index 0000000..4e5fc01
--- /dev/null
+++ b/gdb/testsuite/gdb.ada/utf8.exp
@@ -0,0 +1,53 @@
+# -*-mode: tcl; coding: utf-8;-*-
+#
+# Copyright 2017 Free Software Foundation, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# Test GDB's support for symbols with UTF-8 multi-byte symbol names.
+
+# Actually, we're only testing "Uhh" (Latin-1 page) encoded names,
+# i.e., upper half characters.  Wider characters have a different
+# Ada encoding, which we don't support yet.
+
+load_lib "ada.exp"
+
+# Enable basic use of UTF-8.  This is restored automatically for every
+# testcase.
+setenv LC_ALL C.UTF-8
+
+standard_ada_testfile foo
+
+if {[gdb_compile_ada "${srcfile}" "${binfile}" executable {debug}] != "" } {
+  return -1
+}
+
+clean_restart ${testfile}
+
+if ![runto_main] then {
+  perror "Couldn't run ${testfile}"
+  return
+}
+
+# Check printing an expression involving a UTF-8 symbol name.
+gdb_test "print &pck.funcáx" \
+         " = \\(access function \\(a1: integer\\) return integer\\) $hex <pck.funcáx>"
+
+# Check setting a breakpoint in a function with a UTF-8 symbol name.
+gdb_test "b pck.funcáx" "Breakpoint $decimal .*"
+
+# Test running to the breakpoint, confirm GDB prints the function name
+# correctly.
+gdb_test "continue" "Breakpoint $decimal, pck.funcáx \\(i=1\\).*"
+
diff --git a/gdb/testsuite/gdb.ada/utf8/foo.adb b/gdb/testsuite/gdb.ada/utf8/foo.adb
new file mode 100644
index 0000000..f49ab49
--- /dev/null
+++ b/gdb/testsuite/gdb.ada/utf8/foo.adb
@@ -0,0 +1,25 @@
+-- -*-mode: Ada; coding: utf-8;-*-
+
+--  Copyright 2017 Free Software Foundation, Inc.
+--
+--  This program is free software; you can redistribute it and/or modify
+--  it under the terms of the GNU General Public License as published by
+--  the Free Software Foundation; either version 3 of the License, or
+--  (at your option) any later version.
+--
+--  This program is distributed in the hope that it will be useful,
+--  but WITHOUT ANY WARRANTY; without even the implied warranty of
+--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+--  GNU General Public License for more details.
+--
+--  You should have received a copy of the GNU General Public License
+--  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+pragma Wide_Character_Encoding (UTF8);
+
+with Pck; use Pck;
+procedure Foo is
+   I : Integer := 1;
+begin
+   FuncÁx (I);
+end Foo;
diff --git a/gdb/testsuite/gdb.ada/utf8/pck.adb b/gdb/testsuite/gdb.ada/utf8/pck.adb
new file mode 100644
index 0000000..a4a4962
--- /dev/null
+++ b/gdb/testsuite/gdb.ada/utf8/pck.adb
@@ -0,0 +1,26 @@
+-- -*-mode: Ada; coding: utf-8;-*-
+
+--  Copyright 2017 Free Software Foundation, Inc.
+--
+--  This program is free software; you can redistribute it and/or modify
+--  it under the terms of the GNU General Public License as published by
+--  the Free Software Foundation; either version 3 of the License, or
+--  (at your option) any later version.
+--
+--  This program is distributed in the hope that it will be useful,
+--  but WITHOUT ANY WARRANTY; without even the implied warranty of
+--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+--  GNU General Public License for more details.
+--
+--  You should have received a copy of the GNU General Public License
+--  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+pragma Wide_Character_Encoding (UTF8);
+
+package body Pck is
+   procedure FuncÁx (I: in out Integer) is
+   begin
+      I := I + 1;
+   end FuncÁx;
+
+end Pck;
diff --git a/gdb/testsuite/gdb.ada/utf8/pck.ads b/gdb/testsuite/gdb.ada/utf8/pck.ads
new file mode 100644
index 0000000..3978ba4
--- /dev/null
+++ b/gdb/testsuite/gdb.ada/utf8/pck.ads
@@ -0,0 +1,22 @@
+-- -*-mode: Ada; coding: utf-8;-*-
+
+--  Copyright 2017 Free Software Foundation, Inc.
+--
+--  This program is free software; you can redistribute it and/or modify
+--  it under the terms of the GNU General Public License as published by
+--  the Free Software Foundation; either version 3 of the License, or
+--  (at your option) any later version.
+--
+--  This program is distributed in the hope that it will be useful,
+--  but WITHOUT ANY WARRANTY; without even the implied warranty of
+--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+--  GNU General Public License for more details.
+--
+--  You should have received a copy of the GNU General Public License
+--  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+pragma Wide_Character_Encoding (UTF8);
+
+package Pck is
+   procedure FuncÁx (I: in out Integer);
+end Pck;
-- 
2.5.5



Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]