root/tools/perf/util/demangle-rust.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. rust_is_mangled
  2. is_prefixed_hash
  3. looks_like_rust
  4. rust_demangle_sym
  5. unescape

   1 // SPDX-License-Identifier: GPL-2.0
   2 #include <string.h>
   3 #include "debug.h"
   4 
   5 #include "demangle-rust.h"
   6 
   7 /*
   8  * Mangled Rust symbols look like this:
   9  *
  10  *     _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a
  11  *
  12  * The original symbol is:
  13  *
  14  *     <std::sys::fd::FileDesc as core::ops::Drop>::drop
  15  *
  16  * The last component of the path is a 64-bit hash in lowercase hex, prefixed
  17  * with "h". Rust does not have a global namespace between crates, an illusion
  18  * which Rust maintains by using the hash to distinguish things that would
  19  * otherwise have the same symbol.
  20  *
  21  * Any path component not starting with a XID_Start character is prefixed with
  22  * "_".
  23  *
  24  * The following escape sequences are used:
  25  *
  26  *     ","  =>  $C$
  27  *     "@"  =>  $SP$
  28  *     "*"  =>  $BP$
  29  *     "&"  =>  $RF$
  30  *     "<"  =>  $LT$
  31  *     ">"  =>  $GT$
  32  *     "("  =>  $LP$
  33  *     ")"  =>  $RP$
  34  *     " "  =>  $u20$
  35  *     "'"  =>  $u27$
  36  *     "["  =>  $u5b$
  37  *     "]"  =>  $u5d$
  38  *     "~"  =>  $u7e$
  39  *
  40  * A double ".." means "::" and a single "." means "-".
  41  *
  42  * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$
  43  */
  44 
  45 static const char *hash_prefix = "::h";
  46 static const size_t hash_prefix_len = 3;
  47 static const size_t hash_len = 16;
  48 
  49 static bool is_prefixed_hash(const char *start);
  50 static bool looks_like_rust(const char *sym, size_t len);
  51 static bool unescape(const char **in, char **out, const char *seq, char value);
  52 
  53 /*
  54  * INPUT:
  55  *     sym: symbol that has been through BFD-demangling
  56  *
  57  * This function looks for the following indicators:
  58  *
  59  *  1. The hash must consist of "h" followed by 16 lowercase hex digits.
  60  *
  61  *  2. As a sanity check, the hash must use between 5 and 15 of the 16 possible
  62  *     hex digits. This is true of 99.9998% of hashes so once in your life you
  63  *     may see a false negative. The point is to notice path components that
  64  *     could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". In
  65  *     this case a false positive (non-Rust symbol has an important path
  66  *     component removed because it looks like a Rust hash) is worse than a
  67  *     false negative (the rare Rust symbol is not demangled) so this sets the
  68  *     balance in favor of false negatives.
  69  *
  70  *  3. There must be no characters other than a-zA-Z0-9 and _.:$
  71  *
  72  *  4. There must be no unrecognized $-sign sequences.
  73  *
  74  *  5. There must be no sequence of three or more dots in a row ("...").
  75  */
  76 bool
  77 rust_is_mangled(const char *sym)
  78 {
  79         size_t len, len_without_hash;
  80 
  81         if (!sym)
  82                 return false;
  83 
  84         len = strlen(sym);
  85         if (len <= hash_prefix_len + hash_len)
  86                 /* Not long enough to contain "::h" + hash + something else */
  87                 return false;
  88 
  89         len_without_hash = len - (hash_prefix_len + hash_len);
  90         if (!is_prefixed_hash(sym + len_without_hash))
  91                 return false;
  92 
  93         return looks_like_rust(sym, len_without_hash);
  94 }
  95 
  96 /*
  97  * A hash is the prefix "::h" followed by 16 lowercase hex digits. The hex
  98  * digits must comprise between 5 and 15 (inclusive) distinct digits.
  99  */
 100 static bool is_prefixed_hash(const char *str)
 101 {
 102         const char *end;
 103         bool seen[16];
 104         size_t i;
 105         int count;
 106 
 107         if (strncmp(str, hash_prefix, hash_prefix_len))
 108                 return false;
 109         str += hash_prefix_len;
 110 
 111         memset(seen, false, sizeof(seen));
 112         for (end = str + hash_len; str < end; str++)
 113                 if (*str >= '0' && *str <= '9')
 114                         seen[*str - '0'] = true;
 115                 else if (*str >= 'a' && *str <= 'f')
 116                         seen[*str - 'a' + 10] = true;
 117                 else
 118                         return false;
 119 
 120         /* Count how many distinct digits seen */
 121         count = 0;
 122         for (i = 0; i < 16; i++)
 123                 if (seen[i])
 124                         count++;
 125 
 126         return count >= 5 && count <= 15;
 127 }
 128 
 129 static bool looks_like_rust(const char *str, size_t len)
 130 {
 131         const char *end = str + len;
 132 
 133         while (str < end)
 134                 switch (*str) {
 135                 case '$':
 136                         if (!strncmp(str, "$C$", 3))
 137                                 str += 3;
 138                         else if (!strncmp(str, "$SP$", 4)
 139                                         || !strncmp(str, "$BP$", 4)
 140                                         || !strncmp(str, "$RF$", 4)
 141                                         || !strncmp(str, "$LT$", 4)
 142                                         || !strncmp(str, "$GT$", 4)
 143                                         || !strncmp(str, "$LP$", 4)
 144                                         || !strncmp(str, "$RP$", 4))
 145                                 str += 4;
 146                         else if (!strncmp(str, "$u20$", 5)
 147                                         || !strncmp(str, "$u27$", 5)
 148                                         || !strncmp(str, "$u5b$", 5)
 149                                         || !strncmp(str, "$u5d$", 5)
 150                                         || !strncmp(str, "$u7e$", 5))
 151                                 str += 5;
 152                         else
 153                                 return false;
 154                         break;
 155                 case '.':
 156                         /* Do not allow three or more consecutive dots */
 157                         if (!strncmp(str, "...", 3))
 158                                 return false;
 159                         /* Fall through */
 160                 case 'a' ... 'z':
 161                 case 'A' ... 'Z':
 162                 case '0' ... '9':
 163                 case '_':
 164                 case ':':
 165                         str++;
 166                         break;
 167                 default:
 168                         return false;
 169                 }
 170 
 171         return true;
 172 }
 173 
 174 /*
 175  * INPUT:
 176  *     sym: symbol for which rust_is_mangled(sym) returns true
 177  *
 178  * The input is demangled in-place because the mangled name is always longer
 179  * than the demangled one.
 180  */
 181 void
 182 rust_demangle_sym(char *sym)
 183 {
 184         const char *in;
 185         char *out;
 186         const char *end;
 187 
 188         if (!sym)
 189                 return;
 190 
 191         in = sym;
 192         out = sym;
 193         end = sym + strlen(sym) - (hash_prefix_len + hash_len);
 194 
 195         while (in < end)
 196                 switch (*in) {
 197                 case '$':
 198                         if (!(unescape(&in, &out, "$C$", ',')
 199                                         || unescape(&in, &out, "$SP$", '@')
 200                                         || unescape(&in, &out, "$BP$", '*')
 201                                         || unescape(&in, &out, "$RF$", '&')
 202                                         || unescape(&in, &out, "$LT$", '<')
 203                                         || unescape(&in, &out, "$GT$", '>')
 204                                         || unescape(&in, &out, "$LP$", '(')
 205                                         || unescape(&in, &out, "$RP$", ')')
 206                                         || unescape(&in, &out, "$u20$", ' ')
 207                                         || unescape(&in, &out, "$u27$", '\'')
 208                                         || unescape(&in, &out, "$u5b$", '[')
 209                                         || unescape(&in, &out, "$u5d$", ']')
 210                                         || unescape(&in, &out, "$u7e$", '~'))) {
 211                                 pr_err("demangle-rust: unexpected escape sequence");
 212                                 goto done;
 213                         }
 214                         break;
 215                 case '_':
 216                         /*
 217                          * If this is the start of a path component and the next
 218                          * character is an escape sequence, ignore the
 219                          * underscore. The mangler inserts an underscore to make
 220                          * sure the path component begins with a XID_Start
 221                          * character.
 222                          */
 223                         if ((in == sym || in[-1] == ':') && in[1] == '$')
 224                                 in++;
 225                         else
 226                                 *out++ = *in++;
 227                         break;
 228                 case '.':
 229                         if (in[1] == '.') {
 230                                 /* ".." becomes "::" */
 231                                 *out++ = ':';
 232                                 *out++ = ':';
 233                                 in += 2;
 234                         } else {
 235                                 /* "." becomes "-" */
 236                                 *out++ = '-';
 237                                 in++;
 238                         }
 239                         break;
 240                 case 'a' ... 'z':
 241                 case 'A' ... 'Z':
 242                 case '0' ... '9':
 243                 case ':':
 244                         *out++ = *in++;
 245                         break;
 246                 default:
 247                         pr_err("demangle-rust: unexpected character '%c' in symbol\n",
 248                                 *in);
 249                         goto done;
 250                 }
 251 
 252 done:
 253         *out = '\0';
 254 }
 255 
 256 static bool unescape(const char **in, char **out, const char *seq, char value)
 257 {
 258         size_t len = strlen(seq);
 259 
 260         if (strncmp(*in, seq, len))
 261                 return false;
 262 
 263         **out = value;
 264 
 265         *in += len;
 266         *out += 1;
 267 
 268         return true;
 269 }

/* [<][>][^][v][top][bottom][index][help] */