root/tools/perf/builtin-record.c


DEFINITIONS

This source file includes the following definitions:
  1. switch_output_signal
  2. switch_output_size
  3. switch_output_time
  4. record__write
  5. record__aio_write
  6. record__aio_complete
  7. record__aio_sync
  8. record__aio_pushfn
  9. record__aio_push
  10. record__aio_get_pos
  11. record__aio_set_pos
  12. record__aio_mmap_read_sync
  13. record__aio_parse
  14. record__aio_push
  15. record__aio_get_pos
  16. record__aio_set_pos
  17. record__aio_mmap_read_sync
  18. record__aio_enabled
  19. record__mmap_flush_parse
  20. record__parse_comp_level
  21. record__comp_enabled
  22. process_synthesized_event
  23. record__pushfn
  24. sig_handler
  25. sigsegv_handler
  26. record__sig_exit
  27. record__process_auxtrace
  28. record__auxtrace_mmap_read
  29. record__auxtrace_mmap_read_snapshot
  30. record__auxtrace_read_snapshot_all
  31. record__read_auxtrace_snapshot
  32. record__auxtrace_snapshot_exit
  33. record__auxtrace_init
  34. record__auxtrace_mmap_read
  35. record__read_auxtrace_snapshot
  36. auxtrace_record__snapshot_start
  37. record__auxtrace_snapshot_exit
  38. record__auxtrace_init
  39. record__mmap_evlist
  40. record__mmap
  41. record__open
  42. process_sample_event
  43. process_buildids
  44. perf_event__synthesize_guest_os
  45. record__adjust_affinity
  46. process_comp_header
  47. zstd_compress
  48. record__mmap_read_evlist
  49. record__mmap_read_all
  50. record__init_features
  51. record__finish_output
  52. record__synthesize_workload
  53. record__switch_output
  54. workload_exec_failed_signal
  55. perf_evlist__pick_pc
  56. record__pick_pc
  57. record__synthesize
  58. __cmd_record
  59. callchain_debug
  60. record_opts__parse_callchain
  61. record_parse_callchain_opt
  62. record_callchain_opt
  63. perf_record_config
  64. get_clockid_res
  65. parse_clockid
  66. record__parse_affinity
  67. record__parse_mmap_pages
  68. switch_output_size_warn
  69. switch_output_setup
  70. cmd_record
  71. snapshot_sig_handler
  72. alarm_sig_handler

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * builtin-record.c
   4  *
   5  * Builtin record command: Record the profile of a workload
   6  * (or a CPU, or a PID) into the perf.data output file - for
   7  * later analysis via perf report.
   8  */
   9 #include "builtin.h"
  10 
  11 #include "util/build-id.h"
  12 #include <subcmd/parse-options.h>
  13 #include "util/parse-events.h"
  14 #include "util/config.h"
  15 
  16 #include "util/callchain.h"
  17 #include "util/cgroup.h"
  18 #include "util/header.h"
  19 #include "util/event.h"
  20 #include "util/evlist.h"
  21 #include "util/evsel.h"
  22 #include "util/debug.h"
  23 #include "util/mmap.h"
  24 #include "util/target.h"
  25 #include "util/session.h"
  26 #include "util/tool.h"
  27 #include "util/symbol.h"
  28 #include "util/record.h"
  29 #include "util/cpumap.h"
  30 #include "util/thread_map.h"
  31 #include "util/data.h"
  32 #include "util/perf_regs.h"
  33 #include "util/auxtrace.h"
  34 #include "util/tsc.h"
  35 #include "util/parse-branch-options.h"
  36 #include "util/parse-regs-options.h"
  37 #include "util/llvm-utils.h"
  38 #include "util/bpf-loader.h"
  39 #include "util/trigger.h"
  40 #include "util/perf-hooks.h"
  41 #include "util/cpu-set-sched.h"
  42 #include "util/synthetic-events.h"
  43 #include "util/time-utils.h"
  44 #include "util/units.h"
  45 #include "util/bpf-event.h"
  46 #include "asm/bug.h"
  47 #include "perf.h"
  48 
  49 #include <errno.h>
  50 #include <inttypes.h>
  51 #include <locale.h>
  52 #include <poll.h>
  53 #include <unistd.h>
  54 #include <sched.h>
  55 #include <signal.h>
  56 #include <sys/mman.h>
  57 #include <sys/wait.h>
  58 #include <linux/err.h>
  59 #include <linux/string.h>
  60 #include <linux/time64.h>
  61 #include <linux/zalloc.h>
  62 
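      /*
       * State for 'perf record --switch-output': whether a switch was
       * requested by signal, by output size or by time period, plus the
       * ring of already written file names that is kept when the number
       * of output files is limited.
       */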
  63 struct switch_output {
  64         bool             enabled;
  65         bool             signal;
  66         unsigned long    size;
  67         unsigned long    time;
  68         const char      *str;
  69         bool             set;
  70         char             **filenames;
  71         int              num_files;
  72         int              cur_file;
  73 };
  74 
  75 struct record {
  76         struct perf_tool        tool;
  77         struct record_opts      opts;
  78         u64                     bytes_written;
  79         struct perf_data        data;
  80         struct auxtrace_record  *itr;
  81         struct evlist   *evlist;
  82         struct perf_session     *session;
  83         int                     realtime_prio;
  84         bool                    no_buildid;
  85         bool                    no_buildid_set;
  86         bool                    no_buildid_cache;
  87         bool                    no_buildid_cache_set;
  88         bool                    buildid_all;
  89         bool                    timestamp_filename;
  90         bool                    timestamp_boundary;
  91         struct switch_output    switch_output;
  92         unsigned long long      samples;
  93         cpu_set_t               affinity_mask;
  94 };
  95 
  96 static volatile int auxtrace_record__snapshot_started;
  97 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
  98 static DEFINE_TRIGGER(switch_output_trigger);
  99 
 100 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
 101         "SYS", "NODE", "CPU"
 102 };
 103 
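      /*
       * The switch_output_*() helpers below check whether an output file
       * switch is due: requested by signal (SIGUSR2), by the amount of
       * data already written, or by a timer, and only when the
       * switch_output trigger is ready.
       */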
 104 static bool switch_output_signal(struct record *rec)
 105 {
 106         return rec->switch_output.signal &&
 107                trigger_is_ready(&switch_output_trigger);
 108 }
 109 
 110 static bool switch_output_size(struct record *rec)
 111 {
 112         return rec->switch_output.size &&
 113                trigger_is_ready(&switch_output_trigger) &&
 114                (rec->bytes_written >= rec->switch_output.size);
 115 }
 116 
 117 static bool switch_output_time(struct record *rec)
 118 {
 119         return rec->switch_output.time &&
 120                trigger_is_ready(&switch_output_trigger);
 121 }
 122 
 123 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
 124                          void *bf, size_t size)
 125 {
 126         struct perf_data_file *file = &rec->session->data->file;
 127 
 128         if (perf_data_file__write(file, bf, size) < 0) {
 129                 pr_err("failed to write perf data, error: %m\n");
 130                 return -1;
 131         }
 132 
 133         rec->bytes_written += size;
 134 
 135         if (switch_output_size(rec))
 136                 trigger_hit(&switch_output_trigger);
 137 
 138         return 0;
 139 }
 140 
 141 static int record__aio_enabled(struct record *rec);
 142 static int record__comp_enabled(struct record *rec);
 143 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
 144                             void *src, size_t src_size);
 145 
 146 #ifdef HAVE_AIO_SUPPORT
 147 static int record__aio_write(struct aiocb *cblock, int trace_fd,
 148                 void *buf, size_t size, off_t off)
 149 {
 150         int rc;
 151 
 152         cblock->aio_fildes = trace_fd;
 153         cblock->aio_buf    = buf;
 154         cblock->aio_nbytes = size;
 155         cblock->aio_offset = off;
 156         cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
 157 
 158         do {
 159                 rc = aio_write(cblock);
 160                 if (rc == 0) {
 161                         break;
 162                 } else if (errno != EAGAIN) {
 163                         cblock->aio_fildes = -1;
 164                         pr_err("failed to queue perf data, error: %m\n");
 165                         break;
 166                 }
 167         } while (1);
 168 
 169         return rc;
 170 }
 171 
 172 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
 173 {
 174         void *rem_buf;
 175         off_t rem_off;
 176         size_t rem_size;
 177         int rc, aio_errno;
 178         ssize_t aio_ret, written;
 179 
 180         aio_errno = aio_error(cblock);
 181         if (aio_errno == EINPROGRESS)
 182                 return 0;
 183 
 184         written = aio_ret = aio_return(cblock);
 185         if (aio_ret < 0) {
 186                 if (aio_errno != EINTR)
 187                         pr_err("failed to write perf data, error: %m\n");
 188                 written = 0;
 189         }
 190 
 191         rem_size = cblock->aio_nbytes - written;
 192 
 193         if (rem_size == 0) {
 194                 cblock->aio_fildes = -1;
  195                 /*
  196                  * md->refcount is incremented in record__aio_pushfn() for
  197                  * every aio write request started in record__aio_push(),
  198                  * so decrement it now that the request is complete.
  199                  */
 200                 perf_mmap__put(md);
 201                 rc = 1;
 202         } else {
  203                 /*
  204                  * The aio write request may require a restart with the
  205                  * remainder if the kernel didn't write the whole
  206                  * chunk at once.
  207                  */
 208                 rem_off = cblock->aio_offset + written;
 209                 rem_buf = (void *)(cblock->aio_buf + written);
 210                 record__aio_write(cblock, cblock->aio_fildes,
 211                                 rem_buf, rem_size, rem_off);
 212                 rc = 0;
 213         }
 214 
 215         return rc;
 216 }
 217 
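      /*
       * Wait for in-flight aio writes on this map.  Returns the index of a
       * control block that is free for reuse or, with sync_all set, -1 once
       * all outstanding requests have completed.
       */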
 218 static int record__aio_sync(struct mmap *md, bool sync_all)
 219 {
 220         struct aiocb **aiocb = md->aio.aiocb;
 221         struct aiocb *cblocks = md->aio.cblocks;
 222         struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
 223         int i, do_suspend;
 224 
 225         do {
 226                 do_suspend = 0;
 227                 for (i = 0; i < md->aio.nr_cblocks; ++i) {
 228                         if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
 229                                 if (sync_all)
 230                                         aiocb[i] = NULL;
 231                                 else
 232                                         return i;
 233                         } else {
  234                                 /*
  235                                  * The started aio write is not complete
  236                                  * yet, so it has to be waited for before
  237                                  * the next allocation.
  238                                  */
 239                                 aiocb[i] = &cblocks[i];
 240                                 do_suspend = 1;
 241                         }
 242                 }
 243                 if (!do_suspend)
 244                         return -1;
 245 
 246                 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
 247                         if (!(errno == EAGAIN || errno == EINTR))
 248                                 pr_err("failed to sync perf data, error: %m\n");
 249                 }
 250         } while (1);
 251 }
 252 
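      /*
       * Per-push aio state: the owning record instance, the aio.data[]
       * buffer the chunk is being staged into and the number of bytes
       * staged so far.
       */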
 253 struct record_aio {
 254         struct record   *rec;
 255         void            *data;
 256         size_t          size;
 257 };
 258 
 259 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
 260 {
 261         struct record_aio *aio = to;
 262 
  263         /*
  264          * The map->core.base data pointed to by buf is copied into a free
  265          * map->aio.data[] buffer to release space in the kernel buffer as fast as
  266          * possible, via the perf_mmap__consume() call made from perf_mmap__push().
  267          *
  268          * That lets the kernel proceed with storing more profiling data into the
  269          * kernel buffer sooner, before the other per-cpu kernel buffers are handled.
  270          *
  271          * Copying is done in two steps when the chunk of profiling data crosses the
  272          * upper bound of the kernel buffer. In this case we first move the part of
  273          * the data from map->start up to the upper bound, and then the remainder
  274          * from the beginning of the kernel buffer to the end of the data chunk.
  275          */
 276 
 277         if (record__comp_enabled(aio->rec)) {
 278                 size = zstd_compress(aio->rec->session, aio->data + aio->size,
 279                                      perf_mmap__mmap_len(map) - aio->size,
 280                                      buf, size);
 281         } else {
 282                 memcpy(aio->data + aio->size, buf, size);
 283         }
 284 
 285         if (!aio->size) {
  286                 /*
  287                  * Increment map->refcount to guard the map->aio.data[] buffer
  288                  * from premature deallocation, because the map object can be
  289                  * released before the aio write request started on the
  290                  * map->aio.data[] buffer has completed.
  291                  *
  292                  * perf_mmap__put() is done in record__aio_complete() after
  293                  * the started aio request completes, or in record__aio_push()
  294                  * if the request failed to start.
  295                  */
 296                 perf_mmap__get(map);
 297         }
 298 
 299         aio->size += size;
 300 
 301         return size;
 302 }
 303 
 304 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
 305 {
 306         int ret, idx;
 307         int trace_fd = rec->session->data->file.fd;
 308         struct record_aio aio = { .rec = rec, .size = 0 };
 309 
  310         /*
  311          * Call record__aio_sync() to wait until a map->aio.data[] buffer
  312          * becomes available after the previous aio write operation.
  313          */
 314 
 315         idx = record__aio_sync(map, false);
 316         aio.data = map->aio.data[idx];
 317         ret = perf_mmap__push(map, &aio, record__aio_pushfn);
 318         if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
 319                 return ret;
 320 
 321         rec->samples++;
 322         ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
 323         if (!ret) {
 324                 *off += aio.size;
 325                 rec->bytes_written += aio.size;
 326                 if (switch_output_size(rec))
 327                         trigger_hit(&switch_output_trigger);
 328         } else {
  329                 /*
  330                  * Drop the map->refcount taken in record__aio_pushfn() if the
  331                  * record__aio_write() operation failed to start; otherwise
  332                  * map->refcount is decremented in record__aio_complete() after
  333                  * the aio write operation finishes successfully.
  334                  */
 335                 perf_mmap__put(map);
 336         }
 337 
 338         return ret;
 339 }
 340 
 341 static off_t record__aio_get_pos(int trace_fd)
 342 {
 343         return lseek(trace_fd, 0, SEEK_CUR);
 344 }
 345 
 346 static void record__aio_set_pos(int trace_fd, off_t pos)
 347 {
 348         lseek(trace_fd, pos, SEEK_SET);
 349 }
 350 
 351 static void record__aio_mmap_read_sync(struct record *rec)
 352 {
 353         int i;
 354         struct evlist *evlist = rec->evlist;
 355         struct mmap *maps = evlist->mmap;
 356 
 357         if (!record__aio_enabled(rec))
 358                 return;
 359 
 360         for (i = 0; i < evlist->core.nr_mmaps; i++) {
 361                 struct mmap *map = &maps[i];
 362 
 363                 if (map->core.base)
 364                         record__aio_sync(map, true);
 365         }
 366 }
 367 
 368 static int nr_cblocks_default = 1;
 369 static int nr_cblocks_max = 4;
 370 
 371 static int record__aio_parse(const struct option *opt,
 372                              const char *str,
 373                              int unset)
 374 {
 375         struct record_opts *opts = (struct record_opts *)opt->value;
 376 
 377         if (unset) {
 378                 opts->nr_cblocks = 0;
 379         } else {
 380                 if (str)
 381                         opts->nr_cblocks = strtol(str, NULL, 0);
 382                 if (!opts->nr_cblocks)
 383                         opts->nr_cblocks = nr_cblocks_default;
 384         }
 385 
 386         return 0;
 387 }
 388 #else /* HAVE_AIO_SUPPORT */
 389 static int nr_cblocks_max = 0;
 390 
 391 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
 392                             off_t *off __maybe_unused)
 393 {
 394         return -1;
 395 }
 396 
 397 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
 398 {
 399         return -1;
 400 }
 401 
 402 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
 403 {
 404 }
 405 
 406 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
 407 {
 408 }
 409 #endif
 410 
 411 static int record__aio_enabled(struct record *rec)
 412 {
 413         return rec->opts.nr_cblocks > 0;
 414 }
 415 
 416 #define MMAP_FLUSH_DEFAULT 1
 417 static int record__mmap_flush_parse(const struct option *opt,
 418                                     const char *str,
 419                                     int unset)
 420 {
 421         int flush_max;
 422         struct record_opts *opts = (struct record_opts *)opt->value;
 423         static struct parse_tag tags[] = {
 424                         { .tag  = 'B', .mult = 1       },
 425                         { .tag  = 'K', .mult = 1 << 10 },
 426                         { .tag  = 'M', .mult = 1 << 20 },
 427                         { .tag  = 'G', .mult = 1 << 30 },
 428                         { .tag  = 0 },
 429         };
 430 
 431         if (unset)
 432                 return 0;
 433 
 434         if (str) {
 435                 opts->mmap_flush = parse_tag_value(str, tags);
 436                 if (opts->mmap_flush == (int)-1)
 437                         opts->mmap_flush = strtol(str, NULL, 0);
 438         }
 439 
 440         if (!opts->mmap_flush)
 441                 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
 442 
 443         flush_max = evlist__mmap_size(opts->mmap_pages);
 444         flush_max /= 4;
 445         if (opts->mmap_flush > flush_max)
 446                 opts->mmap_flush = flush_max;
 447 
 448         return 0;
 449 }
 450 
 451 #ifdef HAVE_ZSTD_SUPPORT
 452 static unsigned int comp_level_default = 1;
 453 
 454 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
 455 {
 456         struct record_opts *opts = opt->value;
 457 
 458         if (unset) {
 459                 opts->comp_level = 0;
 460         } else {
 461                 if (str)
 462                         opts->comp_level = strtol(str, NULL, 0);
 463                 if (!opts->comp_level)
 464                         opts->comp_level = comp_level_default;
 465         }
 466 
 467         return 0;
 468 }
 469 #endif
 470 static unsigned int comp_level_max = 22;
 471 
 472 static int record__comp_enabled(struct record *rec)
 473 {
 474         return rec->opts.comp_level > 0;
 475 }
 476 
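      /*
       * Callback handed to the perf_event__synthesize_*() routines: each
       * synthesized event is written straight into the perf.data output.
       */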
 477 static int process_synthesized_event(struct perf_tool *tool,
 478                                      union perf_event *event,
 479                                      struct perf_sample *sample __maybe_unused,
 480                                      struct machine *machine __maybe_unused)
 481 {
 482         struct record *rec = container_of(tool, struct record, tool);
 483         return record__write(rec, NULL, event, event->header.size);
 484 }
 485 
 486 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
 487 {
 488         struct record *rec = to;
 489 
 490         if (record__comp_enabled(rec)) {
 491                 size = zstd_compress(rec->session, map->data, perf_mmap__mmap_len(map), bf, size);
 492                 bf   = map->data;
 493         }
 494 
 495         rec->samples++;
 496         return record__write(rec, map, bf, size);
 497 }
 498 
 499 static volatile int done;
 500 static volatile int signr = -1;
 501 static volatile int child_finished;
 502 
 503 static void sig_handler(int sig)
 504 {
 505         if (sig == SIGCHLD)
 506                 child_finished = 1;
 507         else
 508                 signr = sig;
 509 
 510         done = 1;
 511 }
 512 
 513 static void sigsegv_handler(int sig)
 514 {
 515         perf_hooks__recover();
 516         sighandler_dump_stack(sig);
 517 }
 518 
 519 static void record__sig_exit(void)
 520 {
 521         if (signr == -1)
 522                 return;
 523 
 524         signal(signr, SIG_DFL);
 525         raise(signr);
 526 }
 527 
 528 #ifdef HAVE_AUXTRACE_SUPPORT
 529 
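      /*
       * Write one AUX area tracing event: the event header, the data (which
       * may arrive in two chunks when it wraps around the ring buffer) and
       * padding up to an 8-byte boundary.  For regular (non-pipe,
       * non-directory) output an auxtrace index entry is recorded as well.
       */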
 530 static int record__process_auxtrace(struct perf_tool *tool,
 531                                     struct mmap *map,
 532                                     union perf_event *event, void *data1,
 533                                     size_t len1, void *data2, size_t len2)
 534 {
 535         struct record *rec = container_of(tool, struct record, tool);
 536         struct perf_data *data = &rec->data;
 537         size_t padding;
 538         u8 pad[8] = {0};
 539 
 540         if (!perf_data__is_pipe(data) && !perf_data__is_dir(data)) {
 541                 off_t file_offset;
 542                 int fd = perf_data__fd(data);
 543                 int err;
 544 
 545                 file_offset = lseek(fd, 0, SEEK_CUR);
 546                 if (file_offset == -1)
 547                         return -1;
 548                 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
 549                                                      event, file_offset);
 550                 if (err)
 551                         return err;
 552         }
 553 
 554         /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
 555         padding = (len1 + len2) & 7;
 556         if (padding)
 557                 padding = 8 - padding;
 558 
 559         record__write(rec, map, event, event->header.size);
 560         record__write(rec, map, data1, len1);
 561         if (len2)
 562                 record__write(rec, map, data2, len2);
 563         record__write(rec, map, &pad, padding);
 564 
 565         return 0;
 566 }
 567 
 568 static int record__auxtrace_mmap_read(struct record *rec,
 569                                       struct mmap *map)
 570 {
 571         int ret;
 572 
 573         ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
 574                                   record__process_auxtrace);
 575         if (ret < 0)
 576                 return ret;
 577 
 578         if (ret)
 579                 rec->samples++;
 580 
 581         return 0;
 582 }
 583 
 584 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
 585                                                struct mmap *map)
 586 {
 587         int ret;
 588 
 589         ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
 590                                            record__process_auxtrace,
 591                                            rec->opts.auxtrace_snapshot_size);
 592         if (ret < 0)
 593                 return ret;
 594 
 595         if (ret)
 596                 rec->samples++;
 597 
 598         return 0;
 599 }
 600 
 601 static int record__auxtrace_read_snapshot_all(struct record *rec)
 602 {
 603         int i;
 604         int rc = 0;
 605 
 606         for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
 607                 struct mmap *map = &rec->evlist->mmap[i];
 608 
 609                 if (!map->auxtrace_mmap.base)
 610                         continue;
 611 
 612                 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
 613                         rc = -1;
 614                         goto out;
 615                 }
 616         }
 617 out:
 618         return rc;
 619 }
 620 
 621 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
 622 {
 623         pr_debug("Recording AUX area tracing snapshot\n");
 624         if (record__auxtrace_read_snapshot_all(rec) < 0) {
 625                 trigger_error(&auxtrace_snapshot_trigger);
 626         } else {
 627                 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
 628                         trigger_error(&auxtrace_snapshot_trigger);
 629                 else
 630                         trigger_ready(&auxtrace_snapshot_trigger);
 631         }
 632 }
 633 
 634 static int record__auxtrace_snapshot_exit(struct record *rec)
 635 {
 636         if (trigger_is_error(&auxtrace_snapshot_trigger))
 637                 return 0;
 638 
 639         if (!auxtrace_record__snapshot_started &&
 640             auxtrace_record__snapshot_start(rec->itr))
 641                 return -1;
 642 
 643         record__read_auxtrace_snapshot(rec, true);
 644         if (trigger_is_error(&auxtrace_snapshot_trigger))
 645                 return -1;
 646 
 647         return 0;
 648 }
 649 
 650 static int record__auxtrace_init(struct record *rec)
 651 {
 652         int err;
 653 
 654         if (!rec->itr) {
 655                 rec->itr = auxtrace_record__init(rec->evlist, &err);
 656                 if (err)
 657                         return err;
 658         }
 659 
 660         err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
 661                                               rec->opts.auxtrace_snapshot_opts);
 662         if (err)
 663                 return err;
 664 
 665         return auxtrace_parse_filters(rec->evlist);
 666 }
 667 
 668 #else
 669 
 670 static inline
 671 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
 672                                struct mmap *map __maybe_unused)
 673 {
 674         return 0;
 675 }
 676 
 677 static inline
 678 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
 679                                     bool on_exit __maybe_unused)
 680 {
 681 }
 682 
 683 static inline
 684 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
 685 {
 686         return 0;
 687 }
 688 
 689 static inline
 690 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
 691 {
 692         return 0;
 693 }
 694 
 695 static int record__auxtrace_init(struct record *rec __maybe_unused)
 696 {
 697         return 0;
 698 }
 699 
 700 #endif
 701 
 702 static int record__mmap_evlist(struct record *rec,
 703                                struct evlist *evlist)
 704 {
 705         struct record_opts *opts = &rec->opts;
 706         char msg[512];
 707 
 708         if (opts->affinity != PERF_AFFINITY_SYS)
 709                 cpu__setup_cpunode_map();
 710 
 711         if (evlist__mmap_ex(evlist, opts->mmap_pages,
 712                                  opts->auxtrace_mmap_pages,
 713                                  opts->auxtrace_snapshot_mode,
 714                                  opts->nr_cblocks, opts->affinity,
 715                                  opts->mmap_flush, opts->comp_level) < 0) {
 716                 if (errno == EPERM) {
 717                         pr_err("Permission error mapping pages.\n"
 718                                "Consider increasing "
 719                                "/proc/sys/kernel/perf_event_mlock_kb,\n"
 720                                "or try again with a smaller value of -m/--mmap_pages.\n"
 721                                "(current value: %u,%u)\n",
 722                                opts->mmap_pages, opts->auxtrace_mmap_pages);
 723                         return -errno;
 724                 } else {
 725                         pr_err("failed to mmap with %d (%s)\n", errno,
 726                                 str_error_r(errno, msg, sizeof(msg)));
 727                         if (errno)
 728                                 return -errno;
 729                         else
 730                                 return -EINVAL;
 731                 }
 732         }
 733         return 0;
 734 }
 735 
 736 static int record__mmap(struct record *rec)
 737 {
 738         return record__mmap_evlist(rec, rec->evlist);
 739 }
 740 
 741 static int record__open(struct record *rec)
 742 {
 743         char msg[BUFSIZ];
 744         struct evsel *pos;
 745         struct evlist *evlist = rec->evlist;
 746         struct perf_session *session = rec->session;
 747         struct record_opts *opts = &rec->opts;
 748         int rc = 0;
 749 
  750         /*
  751          * For initial_delay we need to add a dummy event so that we can track
  752          * PERF_RECORD_MMAP while we wait for the initial delay to enable the
  753          * real events, the ones requested by the user.
  754          */
 755         if (opts->initial_delay) {
 756                 if (perf_evlist__add_dummy(evlist))
 757                         return -ENOMEM;
 758 
 759                 pos = evlist__first(evlist);
 760                 pos->tracking = 0;
 761                 pos = evlist__last(evlist);
 762                 pos->tracking = 1;
 763                 pos->core.attr.enable_on_exec = 1;
 764         }
 765 
 766         perf_evlist__config(evlist, opts, &callchain_param);
 767 
 768         evlist__for_each_entry(evlist, pos) {
 769 try_again:
 770                 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
 771                         if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
 772                                 if (verbose > 0)
 773                                         ui__warning("%s\n", msg);
 774                                 goto try_again;
 775                         }
 776                         if ((errno == EINVAL || errno == EBADF) &&
 777                             pos->leader != pos &&
 778                             pos->weak_group) {
 779                                 pos = perf_evlist__reset_weak_group(evlist, pos);
 780                                 goto try_again;
 781                         }
 782                         rc = -errno;
 783                         perf_evsel__open_strerror(pos, &opts->target,
 784                                                   errno, msg, sizeof(msg));
 785                         ui__error("%s\n", msg);
 786                         goto out;
 787                 }
 788 
 789                 pos->supported = true;
 790         }
 791 
 792         if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) {
 793                 pr_warning(
 794 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
 795 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
 796 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
 797 "file is not found in the buildid cache or in the vmlinux path.\n\n"
 798 "Samples in kernel modules won't be resolved at all.\n\n"
 799 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
 800 "even with a suitable vmlinux or kallsyms file.\n\n");
 801         }
 802 
 803         if (perf_evlist__apply_filters(evlist, &pos)) {
 804                 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
 805                         pos->filter, perf_evsel__name(pos), errno,
 806                         str_error_r(errno, msg, sizeof(msg)));
 807                 rc = -1;
 808                 goto out;
 809         }
 810 
 811         rc = record__mmap(rec);
 812         if (rc)
 813                 goto out;
 814 
 815         session->evlist = evlist;
 816         perf_session__set_id_hdr_size(session);
 817 out:
 818         return rc;
 819 }
 820 
 821 static int process_sample_event(struct perf_tool *tool,
 822                                 union perf_event *event,
 823                                 struct perf_sample *sample,
 824                                 struct evsel *evsel,
 825                                 struct machine *machine)
 826 {
 827         struct record *rec = container_of(tool, struct record, tool);
 828 
 829         if (rec->evlist->first_sample_time == 0)
 830                 rec->evlist->first_sample_time = sample->time;
 831 
 832         rec->evlist->last_sample_time = sample->time;
 833 
 834         if (rec->buildid_all)
 835                 return 0;
 836 
 837         rec->samples++;
 838         return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
 839 }
 840 
 841 static int process_buildids(struct record *rec)
 842 {
 843         struct perf_session *session = rec->session;
 844 
 845         if (perf_data__size(&rec->data) == 0)
 846                 return 0;
 847 
  848         /*
  849          * During this process, it'll load the kernel map and replace
  850          * dso->long_name with the real pathname it found.  In this case
  851          * we prefer a vmlinux path like
  852          *   /lib/modules/3.16.4/build/vmlinux
  853          *
  854          * rather than the build-id path (in the debug directory):
  855          *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
  856          */
 857         symbol_conf.ignore_vmlinux_buildid = true;
 858 
  859         /*
  860          * If --buildid-all is given, it marks all DSOs regardless of hits,
  861          * so there is no need to process samples. But if timestamp_boundary
  862          * is enabled, it still needs to walk all samples to get the
  863          * timestamps of the first/last samples.
  864          */
 865         if (rec->buildid_all && !rec->timestamp_boundary)
 866                 rec->tool.sample = NULL;
 867 
 868         return perf_session__process_events(session);
 869 }
 870 
 871 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
 872 {
 873         int err;
 874         struct perf_tool *tool = data;
  875         /*
  876          * As for the guest kernel, when processing the record & report
  877          * subcommands we arrange the module mmaps prior to the guest kernel
  878          * mmap and trigger a preload of the dso, because by default guest
  879          * module symbols are loaded from guest kallsyms instead of
  880          * /lib/modules/XXX/XXX. This avoids missing symbols when the first
  881          * address is in a module instead of in the guest kernel.
  882          */
 883         err = perf_event__synthesize_modules(tool, process_synthesized_event,
 884                                              machine);
 885         if (err < 0)
 886                 pr_err("Couldn't record guest kernel [%d]'s reference"
 887                        " relocation symbol.\n", machine->pid);
 888 
  889         /*
  890          * We use _stext for the guest kernel because the guest kernel's
  891          * /proc/kallsyms sometimes has no _text.
  892          */
 893         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
 894                                                  machine);
 895         if (err < 0)
 896                 pr_err("Couldn't record guest kernel [%d]'s reference"
 897                        " relocation symbol.\n", machine->pid);
 898 }
 899 
 900 static struct perf_event_header finished_round_event = {
 901         .size = sizeof(struct perf_event_header),
 902         .type = PERF_RECORD_FINISHED_ROUND,
 903 };
 904 
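      /*
       * When the configured affinity mode is not PERF_AFFINITY_SYS, migrate
       * the recording thread to the CPU mask of the map that is about to be
       * read, unless its affinity mask already matches.
       */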
 905 static void record__adjust_affinity(struct record *rec, struct mmap *map)
 906 {
 907         if (rec->opts.affinity != PERF_AFFINITY_SYS &&
 908             !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
 909                 CPU_ZERO(&rec->affinity_mask);
 910                 CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
 911                 sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
 912         }
 913 }
 914 
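      /*
       * Callback used by zstd_compress_stream_to_records(): the first call
       * initializes a PERF_RECORD_COMPRESSED header, subsequent calls grow
       * the header size by the size of the newly compressed data.
       */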
 915 static size_t process_comp_header(void *record, size_t increment)
 916 {
 917         struct perf_record_compressed *event = record;
 918         size_t size = sizeof(*event);
 919 
 920         if (increment) {
 921                 event->header.size += increment;
 922                 return increment;
 923         }
 924 
 925         event->header.type = PERF_RECORD_COMPRESSED;
 926         event->header.size = size;
 927 
 928         return size;
 929 }
 930 
 931 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
 932                             void *src, size_t src_size)
 933 {
 934         size_t compressed;
 935         size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
 936 
 937         compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
 938                                                      max_record_size, process_comp_header);
 939 
 940         session->bytes_transferred += src_size;
 941         session->bytes_compressed  += compressed;
 942 
 943         return compressed;
 944 }
 945 
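      /*
       * Drain the evlist's mmapped ring buffers (regular or overwrite ones),
       * pushing the data out either synchronously or via aio, read the AUX
       * area buffers when not in snapshot mode, and emit a
       * PERF_RECORD_FINISHED_ROUND once at least one event was written.
       */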
 946 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
 947                                     bool overwrite, bool synch)
 948 {
 949         u64 bytes_written = rec->bytes_written;
 950         int i;
 951         int rc = 0;
 952         struct mmap *maps;
 953         int trace_fd = rec->data.file.fd;
 954         off_t off = 0;
 955 
 956         if (!evlist)
 957                 return 0;
 958 
 959         maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
 960         if (!maps)
 961                 return 0;
 962 
 963         if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
 964                 return 0;
 965 
 966         if (record__aio_enabled(rec))
 967                 off = record__aio_get_pos(trace_fd);
 968 
 969         for (i = 0; i < evlist->core.nr_mmaps; i++) {
 970                 u64 flush = 0;
 971                 struct mmap *map = &maps[i];
 972 
 973                 if (map->core.base) {
 974                         record__adjust_affinity(rec, map);
 975                         if (synch) {
 976                                 flush = map->core.flush;
 977                                 map->core.flush = 1;
 978                         }
 979                         if (!record__aio_enabled(rec)) {
 980                                 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
 981                                         if (synch)
 982                                                 map->core.flush = flush;
 983                                         rc = -1;
 984                                         goto out;
 985                                 }
 986                         } else {
 987                                 if (record__aio_push(rec, map, &off) < 0) {
 988                                         record__aio_set_pos(trace_fd, off);
 989                                         if (synch)
 990                                                 map->core.flush = flush;
 991                                         rc = -1;
 992                                         goto out;
 993                                 }
 994                         }
 995                         if (synch)
 996                                 map->core.flush = flush;
 997                 }
 998 
 999                 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1000                     record__auxtrace_mmap_read(rec, map) != 0) {
1001                         rc = -1;
1002                         goto out;
1003                 }
1004         }
1005 
1006         if (record__aio_enabled(rec))
1007                 record__aio_set_pos(trace_fd, off);
1008 
 1009         /*
 1010          * Mark the round finished if we wrote at
 1011          * least one event.
 1012          */
1013         if (bytes_written != rec->bytes_written)
1014                 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1015 
1016         if (overwrite)
1017                 perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1018 out:
1019         return rc;
1020 }
1021 
1022 static int record__mmap_read_all(struct record *rec, bool synch)
1023 {
1024         int err;
1025 
1026         err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1027         if (err)
1028                 return err;
1029 
1030         return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1031 }
1032 
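      /*
       * Start with every known header feature set, then clear the ones that
       * do not apply to this record session.
       */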
1033 static void record__init_features(struct record *rec)
1034 {
1035         struct perf_session *session = rec->session;
1036         int feat;
1037 
1038         for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1039                 perf_header__set_feat(&session->header, feat);
1040 
1041         if (rec->no_buildid)
1042                 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1043 
1044         if (!have_tracepoints(&rec->evlist->core.entries))
1045                 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1046 
1047         if (!rec->opts.branch_stack)
1048                 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1049 
1050         if (!rec->opts.full_auxtrace)
1051                 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1052 
1053         if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1054                 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1055 
1056         perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1057         if (!record__comp_enabled(rec))
1058                 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1059 
1060         perf_header__clear_feat(&session->header, HEADER_STAT);
1061 }
1062 
1063 static void
1064 record__finish_output(struct record *rec)
1065 {
1066         struct perf_data *data = &rec->data;
1067         int fd = perf_data__fd(data);
1068 
1069         if (data->is_pipe)
1070                 return;
1071 
1072         rec->session->header.data_size += rec->bytes_written;
1073         data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1074 
1075         if (!rec->no_buildid) {
1076                 process_buildids(rec);
1077 
1078                 if (rec->buildid_all)
1079                         dsos__hit_all(rec->session);
1080         }
1081         perf_session__write_header(rec->session, rec->evlist, fd, true);
1082 
1083         return;
1084 }
1085 
1086 static int record__synthesize_workload(struct record *rec, bool tail)
1087 {
1088         int err;
1089         struct perf_thread_map *thread_map;
1090 
1091         if (rec->opts.tail_synthesize != tail)
1092                 return 0;
1093 
1094         thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1095         if (thread_map == NULL)
1096                 return -1;
1097 
1098         err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1099                                                  process_synthesized_event,
1100                                                  &rec->session->machines.host,
1101                                                  rec->opts.sample_address);
1102         perf_thread_map__put(thread_map);
1103         return err;
1104 }
1105 
1106 static int record__synthesize(struct record *rec, bool tail);
1107 
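      /*
       * Finish the current output file and switch to a new, timestamped
       * perf.data file.  When a maximum number of output files is configured,
       * the oldest file in the ring is removed and its slot reused for the
       * new name.
       */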
1108 static int
1109 record__switch_output(struct record *rec, bool at_exit)
1110 {
1111         struct perf_data *data = &rec->data;
1112         int fd, err;
1113         char *new_filename;
1114 
 1115         /* Same size as:  "2015122520103046" */
1116         char timestamp[] = "InvalidTimestamp";
1117 
1118         record__aio_mmap_read_sync(rec);
1119 
1120         record__synthesize(rec, true);
1121         if (target__none(&rec->opts.target))
1122                 record__synthesize_workload(rec, true);
1123 
1124         rec->samples = 0;
1125         record__finish_output(rec);
1126         err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1127         if (err) {
1128                 pr_err("Failed to get current timestamp\n");
1129                 return -EINVAL;
1130         }
1131 
1132         fd = perf_data__switch(data, timestamp,
1133                                     rec->session->header.data_offset,
1134                                     at_exit, &new_filename);
1135         if (fd >= 0 && !at_exit) {
1136                 rec->bytes_written = 0;
1137                 rec->session->header.data_size = 0;
1138         }
1139 
1140         if (!quiet)
1141                 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1142                         data->path, timestamp);
1143 
1144         if (rec->switch_output.num_files) {
1145                 int n = rec->switch_output.cur_file + 1;
1146 
1147                 if (n >= rec->switch_output.num_files)
1148                         n = 0;
1149                 rec->switch_output.cur_file = n;
1150                 if (rec->switch_output.filenames[n]) {
1151                         remove(rec->switch_output.filenames[n]);
1152                         zfree(&rec->switch_output.filenames[n]);
1153                 }
1154                 rec->switch_output.filenames[n] = new_filename;
1155         } else {
1156                 free(new_filename);
1157         }
1158 
1159         /* Output tracking events */
1160         if (!at_exit) {
1161                 record__synthesize(rec, false);
1162 
 1163                 /*
 1164                  * In 'perf record --switch-output' without -a,
 1165                  * record__synthesize() in record__switch_output() won't
 1166                  * generate tracking events because there's no thread_map
 1167                  * in the evlist. As a result, the newly created perf.data
 1168                  * wouldn't contain map and comm information.
 1169                  * Create a fake thread_map and directly call
 1170                  * perf_event__synthesize_thread_map() for those events.
 1171                  */
1172                 if (target__none(&rec->opts.target))
1173                         record__synthesize_workload(rec, false);
1174         }
1175         return fd;
1176 }
1177 
1178 static volatile int workload_exec_errno;
1179 
 1180 /*
 1181  * perf_evlist__prepare_workload will send a SIGUSR1
 1182  * if the fork fails, since we asked for it by setting
 1183  * its want_signal to true.
 1184  */
1185 static void workload_exec_failed_signal(int signo __maybe_unused,
1186                                         siginfo_t *info,
1187                                         void *ucontext __maybe_unused)
1188 {
1189         workload_exec_errno = info->si_value.sival_int;
1190         done = 1;
1191         child_finished = 1;
1192 }
1193 
1194 static void snapshot_sig_handler(int sig);
1195 static void alarm_sig_handler(int sig);
1196 
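      /*
       * Pick any mmapped perf event control page; record__pick_pc() passes
       * it to perf_event__synth_time_conv() to synthesize the time
       * conversion information for this session.
       */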
1197 static const struct perf_event_mmap_page *
1198 perf_evlist__pick_pc(struct evlist *evlist)
1199 {
1200         if (evlist) {
1201                 if (evlist->mmap && evlist->mmap[0].core.base)
1202                         return evlist->mmap[0].core.base;
1203                 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1204                         return evlist->overwrite_mmap[0].core.base;
1205         }
1206         return NULL;
1207 }
1208 
1209 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1210 {
1211         const struct perf_event_mmap_page *pc;
1212 
1213         pc = perf_evlist__pick_pc(rec->evlist);
1214         if (pc)
1215                 return pc;
1216         return NULL;
1217 }
1218 
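      /*
       * Synthesize the metadata and side-band events (attrs, features,
       * tracing data, kernel and module mmaps, thread and CPU maps, ...)
       * that describe the recorded session, either at its start or, when
       * tail synthesis is requested, at its end.
       */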
1219 static int record__synthesize(struct record *rec, bool tail)
1220 {
1221         struct perf_session *session = rec->session;
1222         struct machine *machine = &session->machines.host;
1223         struct perf_data *data = &rec->data;
1224         struct record_opts *opts = &rec->opts;
1225         struct perf_tool *tool = &rec->tool;
1226         int fd = perf_data__fd(data);
1227         int err = 0;
1228 
1229         if (rec->opts.tail_synthesize != tail)
1230                 return 0;
1231 
1232         if (data->is_pipe) {
 1233                 /*
 1234                  * We need to synthesize events first, because some
 1235                  * features work on top of them (on the report side).
 1236                  */
1237                 err = perf_event__synthesize_attrs(tool, rec->evlist,
1238                                                    process_synthesized_event);
1239                 if (err < 0) {
1240                         pr_err("Couldn't synthesize attrs.\n");
1241                         goto out;
1242                 }
1243 
1244                 err = perf_event__synthesize_features(tool, session, rec->evlist,
1245                                                       process_synthesized_event);
1246                 if (err < 0) {
1247                         pr_err("Couldn't synthesize features.\n");
1248                         return err;
1249                 }
1250 
1251                 if (have_tracepoints(&rec->evlist->core.entries)) {
 1252                         /*
 1253                          * FIXME err <= 0 here actually means that
 1254                          * there were no tracepoints so it's not really
 1255                          * an error, just that we don't need to
 1256                          * synthesize anything.  We really have to
 1257                          * report this more properly and also
 1258                          * propagate the errors that currently call die().
 1259                          */
1260                         err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
1261                                                                   process_synthesized_event);
1262                         if (err <= 0) {
1263                                 pr_err("Couldn't record tracing data.\n");
1264                                 goto out;
1265                         }
1266                         rec->bytes_written += err;
1267                 }
1268         }
1269 
1270         err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1271                                           process_synthesized_event, machine);
1272         if (err)
1273                 goto out;
1274 
1275         if (rec->opts.full_auxtrace) {
1276                 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1277                                         session, process_synthesized_event);
1278                 if (err)
1279                         goto out;
1280         }
1281 
1282         if (!perf_evlist__exclude_kernel(rec->evlist)) {
1283                 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1284                                                          machine);
1285                 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1286                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1287                                    "Check /proc/kallsyms permission or run as root.\n");
1288 
1289                 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1290                                                      machine);
1291                 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1292                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1293                                    "Check /proc/modules permission or run as root.\n");
1294         }
1295 
1296         if (perf_guest) {
1297                 machines__process_guests(&session->machines,
1298                                          perf_event__synthesize_guest_os, tool);
1299         }
1300 
1301         err = perf_event__synthesize_extra_attr(&rec->tool,
1302                                                 rec->evlist,
1303                                                 process_synthesized_event,
1304                                                 data->is_pipe);
1305         if (err)
1306                 goto out;
1307 
1308         err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1309                                                  process_synthesized_event,
1310                                                 NULL);
1311         if (err < 0) {
1312                 pr_err("Couldn't synthesize thread map.\n");
1313                 return err;
1314         }
1315 
1316         err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1317                                              process_synthesized_event, NULL);
1318         if (err < 0) {
1319                 pr_err("Couldn't synthesize cpu map.\n");
1320                 return err;
1321         }
1322 
1323         err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1324                                                 machine, opts);
1325         if (err < 0)
1326                 pr_warning("Couldn't synthesize bpf events.\n");
1327 
1328         err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1329                                             process_synthesized_event, opts->sample_address,
1330                                             1);
1331 out:
1332         return err;
1333 }
1334 
1335 static int __cmd_record(struct record *rec, int argc, const char **argv)
1336 {
1337         int err;
1338         int status = 0;
1339         unsigned long waking = 0;
1340         const bool forks = argc > 0;
1341         struct perf_tool *tool = &rec->tool;
1342         struct record_opts *opts = &rec->opts;
1343         struct perf_data *data = &rec->data;
1344         struct perf_session *session;
1345         bool disabled = false, draining = false;
1346         struct evlist *sb_evlist = NULL;
1347         int fd;
1348         float ratio = 0;
1349 
1350         atexit(record__sig_exit);
1351         signal(SIGCHLD, sig_handler);
1352         signal(SIGINT, sig_handler);
1353         signal(SIGTERM, sig_handler);
1354         signal(SIGSEGV, sigsegv_handler);
1355 
1356         if (rec->opts.record_namespaces)
1357                 tool->namespace_events = true;
1358 
1359         if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1360                 signal(SIGUSR2, snapshot_sig_handler);
1361                 if (rec->opts.auxtrace_snapshot_mode)
1362                         trigger_on(&auxtrace_snapshot_trigger);
1363                 if (rec->switch_output.enabled)
1364                         trigger_on(&switch_output_trigger);
1365         } else {
1366                 signal(SIGUSR2, SIG_IGN);
1367         }
1368 
1369         session = perf_session__new(data, false, tool);
1370         if (IS_ERR(session)) {
1371                 pr_err("Perf session creation failed.\n");
1372                 return PTR_ERR(session);
1373         }
1374 
1375         fd = perf_data__fd(data);
1376         rec->session = session;
1377 
1378         if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1379                 pr_err("Compression initialization failed.\n");
1380                 return -1;
1381         }
1382 
1383         session->header.env.comp_type  = PERF_COMP_ZSTD;
1384         session->header.env.comp_level = rec->opts.comp_level;
1385 
1386         record__init_features(rec);
1387 
1388         if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1389                 session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1390 
1391         if (forks) {
1392                 err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1393                                                     argv, data->is_pipe,
1394                                                     workload_exec_failed_signal);
1395                 if (err < 0) {
1396                         pr_err("Couldn't run the workload!\n");
1397                         status = err;
1398                         goto out_delete_session;
1399                 }
1400         }
1401 
 1402         /*
 1403          * If we have just a single event and are sending data
 1404          * through a pipe, we need to force id allocation,
 1405          * because we synthesize the event name through the pipe
 1406          * and need the id for that.
 1407          */
1408         if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1409                 rec->opts.sample_id = true;
1410 
1411         if (record__open(rec) != 0) {
1412                 err = -1;
1413                 goto out_child;
1414         }
1415         session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1416 
1417         err = bpf__apply_obj_config();
1418         if (err) {
1419                 char errbuf[BUFSIZ];
1420 
1421                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1422                 pr_err("ERROR: Apply config to BPF failed: %s\n",
1423                          errbuf);
1424                 goto out_child;
1425         }
1426 
1427         /*
1428          * Normally perf_session__new would do this, but it doesn't have the
1429          * evlist.
1430          */
1431         if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1432                 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1433                 rec->tool.ordered_events = false;
1434         }
1435 
1436         if (!rec->evlist->nr_groups)
1437                 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1438 
1439         if (data->is_pipe) {
1440                 err = perf_header__write_pipe(fd);
1441                 if (err < 0)
1442                         goto out_child;
1443         } else {
1444                 err = perf_session__write_header(session, rec->evlist, fd, false);
1445                 if (err < 0)
1446                         goto out_child;
1447         }
1448 
1449         if (!rec->no_buildid
1450             && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1451                 pr_err("Couldn't generate buildids. "
1452                        "Use --no-buildid to profile anyway.\n");
1453                 err = -1;
1454                 goto out_child;
1455         }
1456 
1457         if (!opts->no_bpf_event)
1458                 bpf_event__add_sb_event(&sb_evlist, &session->header.env);
1459 
1460         if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
1461                 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1462                 opts->no_bpf_event = true;
1463         }
1464 
1465         err = record__synthesize(rec, false);
1466         if (err < 0)
1467                 goto out_child;
1468 
1469         if (rec->realtime_prio) {
1470                 struct sched_param param;
1471 
1472                 param.sched_priority = rec->realtime_prio;
1473                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1474                         pr_err("Could not set realtime priority.\n");
1475                         err = -1;
1476                         goto out_child;
1477                 }
1478         }
1479 
1480         /*
1481          * When perf is starting the traced process, all the events
1482          * (apart from group members) have enable_on_exec=1 set,
1483          * so don't spoil it by prematurely enabling them.
1484          */
1485         if (!target__none(&opts->target) && !opts->initial_delay)
1486                 evlist__enable(rec->evlist);
1487 
1488         /*
1489          * Let the child rip
1490          */
1491         if (forks) {
1492                 struct machine *machine = &session->machines.host;
1493                 union perf_event *event;
1494                 pid_t tgid;
1495 
1496                 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1497                 if (event == NULL) {
1498                         err = -ENOMEM;
1499                         goto out_child;
1500                 }
1501 
1502                 /*
1503                  * Some H/W events are generated before the COMM event,
1504                  * which is emitted during exec(), so perf script
1505                  * cannot see the correct process name for those events.
1506                  * Synthesize a COMM event to prevent that.
1507                  */
1508                 tgid = perf_event__synthesize_comm(tool, event,
1509                                                    rec->evlist->workload.pid,
1510                                                    process_synthesized_event,
1511                                                    machine);
1512                 free(event);
1513 
1514                 if (tgid == -1)
1515                         goto out_child;
1516 
1517                 event = malloc(sizeof(event->namespaces) +
1518                                (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1519                                machine->id_hdr_size);
1520                 if (event == NULL) {
1521                         err = -ENOMEM;
1522                         goto out_child;
1523                 }
1524 
1525                 /*
1526                  * Synthesize NAMESPACES event for the command specified.
1527                  */
1528                 perf_event__synthesize_namespaces(tool, event,
1529                                                   rec->evlist->workload.pid,
1530                                                   tgid, process_synthesized_event,
1531                                                   machine);
1532                 free(event);
1533 
1534                 perf_evlist__start_workload(rec->evlist);
1535         }
1536 
1537         if (opts->initial_delay) {
1538                 usleep(opts->initial_delay * USEC_PER_MSEC);
1539                 evlist__enable(rec->evlist);
1540         }
1541 
1542         trigger_ready(&auxtrace_snapshot_trigger);
1543         trigger_ready(&switch_output_trigger);
1544         perf_hooks__invoke_record_start();
1545         for (;;) {
1546                 unsigned long long hits = rec->samples;
1547 
1548                 /*
1549                  * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
1550                  * here: when done == true and hits != rec->samples
1551                  * in the previous round.
1552                  *
1553                  * perf_evlist__toggle_bkw_mmap() ensures we never
1554                  * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1555                  */
1556                 if (trigger_is_hit(&switch_output_trigger) || done || draining)
1557                         perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1558 
1559                 if (record__mmap_read_all(rec, false) < 0) {
1560                         trigger_error(&auxtrace_snapshot_trigger);
1561                         trigger_error(&switch_output_trigger);
1562                         err = -1;
1563                         goto out_child;
1564                 }
1565 
1566                 if (auxtrace_record__snapshot_started) {
1567                         auxtrace_record__snapshot_started = 0;
1568                         if (!trigger_is_error(&auxtrace_snapshot_trigger))
1569                                 record__read_auxtrace_snapshot(rec, false);
1570                         if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1571                                 pr_err("AUX area tracing snapshot failed\n");
1572                                 err = -1;
1573                                 goto out_child;
1574                         }
1575                 }
1576 
1577                 if (trigger_is_hit(&switch_output_trigger)) {
1578                         /*
1579                          * If switch_output_trigger is hit, the data in the
1580                          * overwritable ring buffer should already have been
1581                          * collected, so bkw_mmap_state should be BKW_MMAP_EMPTY.
1582                          *
1583                          * If SIGUSR2 was raised during or after record__mmap_read_all(),
1584                          * it didn't collect data from the overwritable ring
1585                          * buffer. Read again.
1586                          */
1587                         if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1588                                 continue;
1589                         trigger_ready(&switch_output_trigger);
1590 
1591                         /*
1592                          * Re-enable events in the overwrite ring buffer after
1593                          * record__mmap_read_all(): we should have collected
1594                          * data from it.
1595                          */
1596                         perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1597 
1598                         if (!quiet)
1599                                 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1600                                         waking);
1601                         waking = 0;
1602                         fd = record__switch_output(rec, false);
1603                         if (fd < 0) {
1604                                 pr_err("Failed to switch to new file\n");
1605                                 trigger_error(&switch_output_trigger);
1606                                 err = fd;
1607                                 goto out_child;
1608                         }
1609 
1610                         /* re-arm the alarm */
1611                         if (rec->switch_output.time)
1612                                 alarm(rec->switch_output.time);
1613                 }
1614 
1615                 if (hits == rec->samples) {
1616                         if (done || draining)
1617                                 break;
1618                         err = evlist__poll(rec->evlist, -1);
1619                         /*
1620                          * Propagate the error only if there is one. Ignore a
1621                          * positive number of returned events and EINTR.
1622                          */
1623                         if (err > 0 || (err < 0 && errno == EINTR))
1624                                 err = 0;
1625                         waking++;
1626 
1627                         if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1628                                 draining = true;
1629                 }
1630 
1631                 /*
1632                  * When perf is starting the traced process, the events die
1633                  * with the process at the end and we wait for that, so there
1634                  * is no need to disable the events in this case.
1635                  */
1636                 if (done && !disabled && !target__none(&opts->target)) {
1637                         trigger_off(&auxtrace_snapshot_trigger);
1638                         evlist__disable(rec->evlist);
1639                         disabled = true;
1640                 }
1641         }
1642 
1643         trigger_off(&auxtrace_snapshot_trigger);
1644         trigger_off(&switch_output_trigger);
1645 
1646         if (opts->auxtrace_snapshot_on_exit)
1647                 record__auxtrace_snapshot_exit(rec);
1648 
1649         if (forks && workload_exec_errno) {
1650                 char msg[STRERR_BUFSIZE];
1651                 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1652                 pr_err("Workload failed: %s\n", emsg);
1653                 err = -1;
1654                 goto out_child;
1655         }
1656 
1657         if (!quiet)
1658                 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1659 
1660         if (target__none(&rec->opts.target))
1661                 record__synthesize_workload(rec, true);
1662 
1663 out_child:
1664         record__mmap_read_all(rec, true);
1665         record__aio_mmap_read_sync(rec);
1666 
1667         if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1668                 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1669                 session->header.env.comp_ratio = ratio + 0.5;
1670         }
1671 
1672         if (forks) {
1673                 int exit_status;
1674 
1675                 if (!child_finished)
1676                         kill(rec->evlist->workload.pid, SIGTERM);
1677 
1678                 wait(&exit_status);
1679 
1680                 if (err < 0)
1681                         status = err;
1682                 else if (WIFEXITED(exit_status))
1683                         status = WEXITSTATUS(exit_status);
1684                 else if (WIFSIGNALED(exit_status))
1685                         signr = WTERMSIG(exit_status);
1686         } else
1687                 status = err;
1688 
1689         record__synthesize(rec, true);
1690         /* this will be recalculated during process_buildids() */
1691         rec->samples = 0;
1692 
1693         if (!err) {
1694                 if (!rec->timestamp_filename) {
1695                         record__finish_output(rec);
1696                 } else {
1697                         fd = record__switch_output(rec, true);
1698                         if (fd < 0) {
1699                                 status = fd;
1700                                 goto out_delete_session;
1701                         }
1702                 }
1703         }
1704 
1705         perf_hooks__invoke_record_end();
1706 
1707         if (!err && !quiet) {
1708                 char samples[128];
1709                 const char *postfix = rec->timestamp_filename ?
1710                                         ".<timestamp>" : "";
1711 
1712                 if (rec->samples && !rec->opts.full_auxtrace)
1713                         scnprintf(samples, sizeof(samples),
1714                                   " (%" PRIu64 " samples)", rec->samples);
1715                 else
1716                         samples[0] = '\0';
1717 
1718                 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
1719                         perf_data__size(data) / 1024.0 / 1024.0,
1720                         data->path, postfix, samples);
1721                 if (ratio) {
1722                         fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
1723                                         rec->session->bytes_transferred / 1024.0 / 1024.0,
1724                                         ratio);
1725                 }
1726                 fprintf(stderr, " ]\n");
1727         }
1728 
1729 out_delete_session:
1730         zstd_fini(&session->zstd_data);
1731         perf_session__delete(session);
1732 
1733         if (!opts->no_bpf_event)
1734                 perf_evlist__stop_sb_thread(sb_evlist);
1735         return status;
1736 }
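/*
 * Illustrative invocations exercising the main paths of __cmd_record()
 * above (a sketch; event names and workloads are arbitrary examples):
 *
 *   # fork/exec a workload: COMM and NAMESPACES events are synthesized
 *   # for it before perf_evlist__start_workload() lets it run
 *   perf record -e cycles -- sleep 1
 *
 *   # attach to an existing pid and delay enabling events by 500 ms,
 *   # i.e. the usleep(opts->initial_delay * USEC_PER_MSEC) branch
 *   perf record -D 500 -p <pid>
 *
 *   # write zstd-compressed records; the summary line then also reports
 *   # the ratio computed from bytes_transferred / bytes_compressed
 *   perf record -z -- sleep 1
 */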
1737 
1738 static void callchain_debug(struct callchain_param *callchain)
1739 {
1740         static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1741 
1742         pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1743 
1744         if (callchain->record_mode == CALLCHAIN_DWARF)
1745                 pr_debug("callchain: stack dump size %d\n",
1746                          callchain->dump_size);
1747 }
1748 
1749 int record_opts__parse_callchain(struct record_opts *record,
1750                                  struct callchain_param *callchain,
1751                                  const char *arg, bool unset)
1752 {
1753         int ret;
1754         callchain->enabled = !unset;
1755 
1756         /* --no-call-graph */
1757         if (unset) {
1758                 callchain->record_mode = CALLCHAIN_NONE;
1759                 pr_debug("callchain: disabled\n");
1760                 return 0;
1761         }
1762 
1763         ret = parse_callchain_record_opt(arg, callchain);
1764         if (!ret) {
1765                 /* Enable data address sampling for DWARF unwind. */
1766                 if (callchain->record_mode == CALLCHAIN_DWARF)
1767                         record->sample_address = true;
1768                 callchain_debug(callchain);
1769         }
1770 
1771         return ret;
1772 }
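/*
 * Illustrative option strings handled by record_opts__parse_callchain()
 * (a sketch, not an exhaustive list of modes):
 *
 *   perf record --call-graph fp -- ./workload          # frame pointers
 *   perf record --call-graph dwarf,8192 -- ./workload  # DWARF unwind; also
 *                                                      # enables sample_address
 *   perf record --no-call-graph -- ./workload          # CALLCHAIN_NONE
 *
 * The record_mode[,record_size] split itself is done by
 * parse_callchain_record_opt().
 */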
1773 
1774 int record_parse_callchain_opt(const struct option *opt,
1775                                const char *arg,
1776                                int unset)
1777 {
1778         return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1779 }
1780 
1781 int record_callchain_opt(const struct option *opt,
1782                          const char *arg __maybe_unused,
1783                          int unset __maybe_unused)
1784 {
1785         struct callchain_param *callchain = opt->value;
1786 
1787         callchain->enabled = true;
1788 
1789         if (callchain->record_mode == CALLCHAIN_NONE)
1790                 callchain->record_mode = CALLCHAIN_FP;
1791 
1792         callchain_debug(callchain);
1793         return 0;
1794 }
1795 
1796 static int perf_record_config(const char *var, const char *value, void *cb)
1797 {
1798         struct record *rec = cb;
1799 
1800         if (!strcmp(var, "record.build-id")) {
1801                 if (!strcmp(value, "cache"))
1802                         rec->no_buildid_cache = false;
1803                 else if (!strcmp(value, "no-cache"))
1804                         rec->no_buildid_cache = true;
1805                 else if (!strcmp(value, "skip"))
1806                         rec->no_buildid = true;
1807                 else
1808                         return -1;
1809                 return 0;
1810         }
1811         if (!strcmp(var, "record.call-graph")) {
1812                 var = "call-graph.record-mode";
1813                 return perf_default_config(var, value, cb);
1814         }
1815 #ifdef HAVE_AIO_SUPPORT
1816         if (!strcmp(var, "record.aio")) {
1817                 rec->opts.nr_cblocks = strtol(value, NULL, 0);
1818                 if (!rec->opts.nr_cblocks)
1819                         rec->opts.nr_cblocks = nr_cblocks_default;
1820         }
1821 #endif
1822 
1823         return 0;
1824 }
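/*
 * Example ~/.perfconfig keys consumed by perf_record_config() above
 * (values shown are only illustrative):
 *
 *   [record]
 *           build-id = cache        # or "no-cache" / "skip"
 *           call-graph = dwarf      # forwarded as call-graph.record-mode
 *           aio = 2                 # nr_cblocks, only with HAVE_AIO_SUPPORT
 */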
1825 
1826 struct clockid_map {
1827         const char *name;
1828         int clockid;
1829 };
1830 
1831 #define CLOCKID_MAP(n, c)       \
1832         { .name = n, .clockid = (c), }
1833 
1834 #define CLOCKID_END     { .name = NULL, }
1835 
1836 
1837 /*
1838  * Add the missing ones; we need to build on many distros...
1839  */
1840 #ifndef CLOCK_MONOTONIC_RAW
1841 #define CLOCK_MONOTONIC_RAW 4
1842 #endif
1843 #ifndef CLOCK_BOOTTIME
1844 #define CLOCK_BOOTTIME 7
1845 #endif
1846 #ifndef CLOCK_TAI
1847 #define CLOCK_TAI 11
1848 #endif
1849 
1850 static const struct clockid_map clockids[] = {
1851         /* available for all events, NMI safe */
1852         CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1853         CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1854 
1855         /* available for some events */
1856         CLOCKID_MAP("realtime", CLOCK_REALTIME),
1857         CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1858         CLOCKID_MAP("tai", CLOCK_TAI),
1859 
1860         /* available for the lazy */
1861         CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1862         CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1863         CLOCKID_MAP("real", CLOCK_REALTIME),
1864         CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1865 
1866         CLOCKID_END,
1867 };
1868 
1869 static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1870 {
1871         struct timespec res;
1872 
1873         *res_ns = 0;
1874         if (!clock_getres(clk_id, &res))
1875                 *res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1876         else
1877                 pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1878 
1879         return 0;
1880 }
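/*
 * A minimal standalone sketch (not part of this file) of the same
 * clock_getres() computation that get_clockid_res() performs; it could be
 * built separately, e.g. "cc -o clockres clockres.c":
 *
 *   #include <stdio.h>
 *   #include <time.h>
 *
 *   int main(void)
 *   {
 *           struct timespec res;
 *
 *           if (clock_getres(CLOCK_MONOTONIC, &res))
 *                   return 1;
 *           printf("%lld ns\n",
 *                  (long long)res.tv_sec * 1000000000LL + res.tv_nsec);
 *           return 0;
 *   }
 */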
1881 
1882 static int parse_clockid(const struct option *opt, const char *str, int unset)
1883 {
1884         struct record_opts *opts = (struct record_opts *)opt->value;
1885         const struct clockid_map *cm;
1886         const char *ostr = str;
1887 
1888         if (unset) {
1889                 opts->use_clockid = 0;
1890                 return 0;
1891         }
1892 
1893         /* no arg passed */
1894         if (!str)
1895                 return 0;
1896 
1897         /* no setting it twice */
1898         if (opts->use_clockid)
1899                 return -1;
1900 
1901         opts->use_clockid = true;
1902 
1903         /* if it's a number, we're done */
1904         if (sscanf(str, "%d", &opts->clockid) == 1)
1905                 return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1906 
1907         /* allow a "CLOCK_" prefix to the name */
1908         if (!strncasecmp(str, "CLOCK_", 6))
1909                 str += 6;
1910 
1911         for (cm = clockids; cm->name; cm++) {
1912                 if (!strcasecmp(str, cm->name)) {
1913                         opts->clockid = cm->clockid;
1914                         return get_clockid_res(opts->clockid,
1915                                                &opts->clockid_res_ns);
1916                 }
1917         }
1918 
1919         opts->use_clockid = false;
1920         ui__warning("unknown clockid %s, check man page\n", ostr);
1921         return -1;
1922 }
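/*
 * Forms accepted by parse_clockid() above (illustrative):
 *
 *   perf record -k 4 ...                  # raw clockid number
 *   perf record -k monotonic_raw ...      # a name from the clockids[] table
 *   perf record -k CLOCK_MONOTONIC ...    # the "CLOCK_" prefix is stripped
 *
 * Passing -k twice, or an unknown name, makes the parser return -1.
 */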
1923 
1924 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
1925 {
1926         struct record_opts *opts = (struct record_opts *)opt->value;
1927 
1928         if (unset || !str)
1929                 return 0;
1930 
1931         if (!strcasecmp(str, "node"))
1932                 opts->affinity = PERF_AFFINITY_NODE;
1933         else if (!strcasecmp(str, "cpu"))
1934                 opts->affinity = PERF_AFFINITY_CPU;
1935 
1936         return 0;
1937 }
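/*
 * Usage sketch for record__parse_affinity(): "--affinity=node" selects
 * PERF_AFFINITY_NODE, "--affinity=cpu" selects PERF_AFFINITY_CPU, and
 * anything else keeps the PERF_AFFINITY_SYS default set in cmd_record().
 */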
1938 
1939 static int record__parse_mmap_pages(const struct option *opt,
1940                                     const char *str,
1941                                     int unset __maybe_unused)
1942 {
1943         struct record_opts *opts = opt->value;
1944         char *s, *p;
1945         unsigned int mmap_pages;
1946         int ret;
1947 
1948         if (!str)
1949                 return -EINVAL;
1950 
1951         s = strdup(str);
1952         if (!s)
1953                 return -ENOMEM;
1954 
1955         p = strchr(s, ',');
1956         if (p)
1957                 *p = '\0';
1958 
1959         if (*s) {
1960                 ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1961                 if (ret)
1962                         goto out_free;
1963                 opts->mmap_pages = mmap_pages;
1964         }
1965 
1966         if (!p) {
1967                 ret = 0;
1968                 goto out_free;
1969         }
1970 
1971         ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1972         if (ret)
1973                 goto out_free;
1974 
1975         opts->auxtrace_mmap_pages = mmap_pages;
1976 
1977 out_free:
1978         free(s);
1979         return ret;
1980 }
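/*
 * Examples of the "-m/--mmap-pages" syntax handled by
 * record__parse_mmap_pages() (illustrative values):
 *
 *   perf record -m 512 ...        # 512 data pages, AUX size left untouched
 *   perf record -m 512,128 ...    # 512 data pages and 128 AUX area pages
 *   perf record -m ,64 ...        # only the AUX area mmap size is set
 *
 * Each half goes through __perf_evlist__parse_mmap_pages(), so whatever
 * size suffixes that helper accepts apply to both values.
 */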
1981 
1982 static void switch_output_size_warn(struct record *rec)
1983 {
1984         u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
1985         struct switch_output *s = &rec->switch_output;
1986 
1987         wakeup_size /= 2;
1988 
1989         if (s->size < wakeup_size) {
1990                 char buf[100];
1991 
1992                 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
1993                 pr_warning("WARNING: switch-output data size lower than "
1994                            "wakeup kernel buffer size (%s), "
1995                            "expect bigger perf.data sizes\n", buf);
1996         }
1997 }
1998 
1999 static int switch_output_setup(struct record *rec)
2000 {
2001         struct switch_output *s = &rec->switch_output;
2002         static struct parse_tag tags_size[] = {
2003                 { .tag  = 'B', .mult = 1       },
2004                 { .tag  = 'K', .mult = 1 << 10 },
2005                 { .tag  = 'M', .mult = 1 << 20 },
2006                 { .tag  = 'G', .mult = 1 << 30 },
2007                 { .tag  = 0 },
2008         };
2009         static struct parse_tag tags_time[] = {
2010                 { .tag  = 's', .mult = 1        },
2011                 { .tag  = 'm', .mult = 60       },
2012                 { .tag  = 'h', .mult = 60*60    },
2013                 { .tag  = 'd', .mult = 60*60*24 },
2014                 { .tag  = 0 },
2015         };
2016         unsigned long val;
2017 
2018         if (!s->set)
2019                 return 0;
2020 
2021         if (!strcmp(s->str, "signal")) {
2022                 s->signal = true;
2023                 pr_debug("switch-output with SIGUSR2 signal\n");
2024                 goto enabled;
2025         }
2026 
2027         val = parse_tag_value(s->str, tags_size);
2028         if (val != (unsigned long) -1) {
2029                 s->size = val;
2030                 pr_debug("switch-output with %s size threshold\n", s->str);
2031                 goto enabled;
2032         }
2033 
2034         val = parse_tag_value(s->str, tags_time);
2035         if (val != (unsigned long) -1) {
2036                 s->time = val;
2037                 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2038                          s->str, s->time);
2039                 goto enabled;
2040         }
2041 
2042         return -1;
2043 
2044 enabled:
2045         rec->timestamp_filename = true;
2046         s->enabled              = true;
2047 
2048         if (s->size && !rec->opts.no_buffering)
2049                 switch_output_size_warn(rec);
2050 
2051         return 0;
2052 }
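/*
 * Example arguments matched by switch_output_setup() above (values are
 * illustrative):
 *
 *   perf record --switch-output ...        # defaults to "signal" (SIGUSR2)
 *   perf record --switch-output=100M ...   # rotate after ~100 MB of data
 *   perf record --switch-output=30s ...    # rotate every 30 seconds
 *
 * Any of these also sets rec->timestamp_filename, so each generated file
 * gets a timestamp suffix.
 */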
2053 
2054 static const char * const __record_usage[] = {
2055         "perf record [<options>] [<command>]",
2056         "perf record [<options>] -- <command> [<options>]",
2057         NULL
2058 };
2059 const char * const *record_usage = __record_usage;
2060 
2061 /*
2062  * XXX Ideally this would be local to cmd_record() and passed to a record__new,
2063  * because we need to have access to it in record__exit, which is called
2064  * after cmd_record() exits, but since record_options needs to be accessible to
2065  * builtin-script, leave it here.
2066  *
2067  * At least we don't ouch it in all the other functions here directly.
2068  *
2069  * Just say no to tons of global variables, sigh.
2070  */
2071 static struct record record = {
2072         .opts = {
2073                 .sample_time         = true,
2074                 .mmap_pages          = UINT_MAX,
2075                 .user_freq           = UINT_MAX,
2076                 .user_interval       = ULLONG_MAX,
2077                 .freq                = 4000,
2078                 .target              = {
2079                         .uses_mmap   = true,
2080                         .default_per_cpu = true,
2081                 },
2082                 .mmap_flush          = MMAP_FLUSH_DEFAULT,
2083         },
2084         .tool = {
2085                 .sample         = process_sample_event,
2086                 .fork           = perf_event__process_fork,
2087                 .exit           = perf_event__process_exit,
2088                 .comm           = perf_event__process_comm,
2089                 .namespaces     = perf_event__process_namespaces,
2090                 .mmap           = perf_event__process_mmap,
2091                 .mmap2          = perf_event__process_mmap2,
2092                 .ordered_events = true,
2093         },
2094 };
2095 
2096 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2097         "\n\t\t\t\tDefault: fp";
2098 
2099 static bool dry_run;
2100 
2101 /*
2102  * XXX Will stay a global variable until we fix builtin-script.c to stop messing
2103  * with it and switch to using the library functions in perf_evlist that came
2104  * from builtin-record.c, i.e. use record_opts,
2105  * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record'
2106  * using pipes, etc.
2107  */
2108 static struct option __record_options[] = {
2109         OPT_CALLBACK('e', "event", &record.evlist, "event",
2110                      "event selector. use 'perf list' to list available events",
2111                      parse_events_option),
2112         OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2113                      "event filter", parse_filter),
2114         OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2115                            NULL, "don't record events from perf itself",
2116                            exclude_perf),
2117         OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2118                     "record events on existing process id"),
2119         OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2120                     "record events on existing thread id"),
2121         OPT_INTEGER('r', "realtime", &record.realtime_prio,
2122                     "collect data with this RT SCHED_FIFO priority"),
2123         OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2124                     "collect data without buffering"),
2125         OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2126                     "collect raw sample records from all opened counters"),
2127         OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2128                             "system-wide collection from all CPUs"),
2129         OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2130                     "list of cpus to monitor"),
2131         OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2132         OPT_STRING('o', "output", &record.data.path, "file",
2133                     "output file name"),
2134         OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2135                         &record.opts.no_inherit_set,
2136                         "child tasks do not inherit counters"),
2137         OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2138                     "synthesize non-sample events at the end of output"),
2139         OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2140         OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2141         OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2142                     "Fail if the specified frequency can't be used"),
2143         OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2144                      "profile at this frequency",
2145                       record__parse_freq),
2146         OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2147                      "number of mmap data pages and AUX area tracing mmap pages",
2148                      record__parse_mmap_pages),
2149         OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2150                      "Minimum number of bytes that are extracted from mmap data pages (default: 1)",
2151                      record__mmap_flush_parse),
2152         OPT_BOOLEAN(0, "group", &record.opts.group,
2153                     "put the counters into a counter group"),
2154         OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2155                            NULL, "enables call-graph recording" ,
2156                            &record_callchain_opt),
2157         OPT_CALLBACK(0, "call-graph", &record.opts,
2158                      "record_mode[,record_size]", record_callchain_help,
2159                      &record_parse_callchain_opt),
2160         OPT_INCR('v', "verbose", &verbose,
2161                     "be more verbose (show counter open errors, etc)"),
2162         OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2163         OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2164                     "per thread counts"),
2165         OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2166         OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2167                     "Record the sample physical addresses"),
2168         OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2169         OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2170                         &record.opts.sample_time_set,
2171                         "Record the sample timestamps"),
2172         OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2173                         "Record the sample period"),
2174         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2175                     "don't sample"),
2176         OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2177                         &record.no_buildid_cache_set,
2178                         "do not update the buildid cache"),
2179         OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2180                         &record.no_buildid_set,
2181                         "do not collect buildids in perf.data"),
2182         OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2183                      "monitor event in cgroup name only",
2184                      parse_cgroups),
2185         OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2186                   "ms to wait before starting measurement after program start"),
2187         OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2188                    "user to profile"),
2189 
2190         OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2191                      "branch any", "sample any taken branches",
2192                      parse_branch_stack),
2193 
2194         OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2195                      "branch filter mask", "branch stack filter modes",
2196                      parse_branch_stack),
2197         OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2198                     "sample by weight (on special events only)"),
2199         OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2200                     "sample transaction flags (special events only)"),
2201         OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2202                     "use per-thread mmaps"),
2203         OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2204                     "sample selected machine registers on interrupt,"
2205                     " use '-I?' to list register names", parse_intr_regs),
2206         OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2207                     "sample selected machine registers in user space,"
2208                     " use '--user-regs=?' to list register names", parse_user_regs),
2209         OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2210                     "Record running/enabled time of read (:S) events"),
2211         OPT_CALLBACK('k', "clockid", &record.opts,
2212         "clockid", "clockid to use for events, see clock_gettime()",
2213         parse_clockid),
2214         OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2215                           "opts", "AUX area tracing Snapshot Mode", ""),
2216         OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2217                         "per thread proc mmap processing timeout in ms"),
2218         OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2219                     "Record namespaces events"),
2220         OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
2221                     "Record context switch events"),
2222         OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2223                          "Configure all used events to run in kernel space.",
2224                          PARSE_OPT_EXCLUSIVE),
2225         OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2226                          "Configure all used events to run in user space.",
2227                          PARSE_OPT_EXCLUSIVE),
2228         OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2229                     "collect kernel callchains"),
2230         OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2231                     "collect user callchains"),
2232         OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2233                    "clang binary to use for compiling BPF scriptlets"),
2234         OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2235                    "options passed to clang when compiling BPF scriptlets"),
2236         OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2237                    "file", "vmlinux pathname"),
2238         OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2239                     "Record build-id of all DSOs regardless of hits"),
2240         OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2241                     "append timestamp to output filename"),
2242         OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2243                     "Record timestamp boundary (time of first/last samples)"),
2244         OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2245                           &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2246                           "Switch output when receiving SIGUSR2 (signal) or crossing a size or time threshold",
2247                           "signal"),
2248         OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2249                    "Limit the number of files generated by switch output"),
2250         OPT_BOOLEAN(0, "dry-run", &dry_run,
2251                     "Parse options then exit"),
2252 #ifdef HAVE_AIO_SUPPORT
2253         OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2254                      &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2255                      record__aio_parse),
2256 #endif
2257         OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2258                      "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2259                      record__parse_affinity),
2260 #ifdef HAVE_ZSTD_SUPPORT
2261         OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2262                             "n", "Compress records using the specified level (default: 1 - fastest compression, 22 - greatest compression)",
2263                             record__parse_comp_level),
2264 #endif
2265         OPT_END()
2266 };
2267 
2268 struct option *record_options = __record_options;
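/*
 * An example combining several of the options defined above (the workload
 * name is arbitrary; -z requires a build with HAVE_ZSTD_SUPPORT):
 *
 *   perf record -e cycles -g -z --switch-output=1G -o perf.data -- ./my_app
 */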
2269 
2270 int cmd_record(int argc, const char **argv)
2271 {
2272         int err;
2273         struct record *rec = &record;
2274         char errbuf[BUFSIZ];
2275 
2276         setlocale(LC_ALL, "");
2277 
2278 #ifndef HAVE_LIBBPF_SUPPORT
2279 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2280         set_nobuild('\0', "clang-path", true);
2281         set_nobuild('\0', "clang-opt", true);
2282 # undef set_nobuild
2283 #endif
2284 
2285 #ifndef HAVE_BPF_PROLOGUE
2286 # if !defined (HAVE_DWARF_SUPPORT)
2287 #  define REASON  "NO_DWARF=1"
2288 # elif !defined (HAVE_LIBBPF_SUPPORT)
2289 #  define REASON  "NO_LIBBPF=1"
2290 # else
2291 #  define REASON  "this architecture doesn't support BPF prologue"
2292 # endif
2293 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2294         set_nobuild('\0', "vmlinux", true);
2295 # undef set_nobuild
2296 # undef REASON
2297 #endif
2298 
2299         CPU_ZERO(&rec->affinity_mask);
2300         rec->opts.affinity = PERF_AFFINITY_SYS;
2301 
2302         rec->evlist = evlist__new();
2303         if (rec->evlist == NULL)
2304                 return -ENOMEM;
2305 
2306         err = perf_config(perf_record_config, rec);
2307         if (err)
2308                 return err;
2309 
2310         argc = parse_options(argc, argv, record_options, record_usage,
2311                             PARSE_OPT_STOP_AT_NON_OPTION);
2312         if (quiet)
2313                 perf_quiet_option();
2314 
2315         /* Make system wide (-a) the default target. */
2316         if (!argc && target__none(&rec->opts.target))
2317                 rec->opts.target.system_wide = true;
2318 
2319         if (nr_cgroups && !rec->opts.target.system_wide) {
2320                 usage_with_options_msg(record_usage, record_options,
2321                         "cgroup monitoring only available in system-wide mode");
2322 
2323         }
2324 
2325         if (rec->opts.comp_level != 0) {
2326                 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2327                 rec->no_buildid = true;
2328         }
2329 
2330         if (rec->opts.record_switch_events &&
2331             !perf_can_record_switch_events()) {
2332                 ui__error("kernel does not support recording context switch events\n");
2333                 parse_options_usage(record_usage, record_options, "switch-events", 0);
2334                 return -EINVAL;
2335         }
2336 
2337         if (switch_output_setup(rec)) {
2338                 parse_options_usage(record_usage, record_options, "switch-output", 0);
2339                 return -EINVAL;
2340         }
2341 
2342         if (rec->switch_output.time) {
2343                 signal(SIGALRM, alarm_sig_handler);
2344                 alarm(rec->switch_output.time);
2345         }
2346 
2347         if (rec->switch_output.num_files) {
2348                 rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2349                                                       sizeof(char *));
2350                 if (!rec->switch_output.filenames)
2351                         return -EINVAL;
2352         }
2353 
2354         /*
2355          * Allow aliases to facilitate the lookup of symbols for address
2356          * filters. Refer to auxtrace_parse_filters().
2357          */
2358         symbol_conf.allow_aliases = true;
2359 
2360         symbol__init(NULL);
2361 
2362         err = record__auxtrace_init(rec);
2363         if (err)
2364                 goto out;
2365 
2366         if (dry_run)
2367                 goto out;
2368 
2369         err = bpf__setup_stdout(rec->evlist);
2370         if (err) {
2371                 bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2372                 pr_err("ERROR: Setup BPF stdout failed: %s\n",
2373                          errbuf);
2374                 goto out;
2375         }
2376 
2377         err = -ENOMEM;
2378 
2379         if (rec->no_buildid_cache || rec->no_buildid) {
2380                 disable_buildid_cache();
2381         } else if (rec->switch_output.enabled) {
2382                 /*
2383                  * In 'perf record --switch-output', disable buildid
2384                  * generation by default to reduce data file switching
2385                  * overhead. Still generate buildids if they are explicitly
2386                  * requested, using
2387                  *
2388                  *  perf record --switch-output --no-no-buildid \
2389                  *              --no-no-buildid-cache
2390                  *
2391                  * The following code is equivalent to:
2392                  *
2393                  * if ((rec->no_buildid || !rec->no_buildid_set) &&
2394                  *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2395                  *         disable_buildid_cache();
2396                  */
2397                 bool disable = true;
2398 
2399                 if (rec->no_buildid_set && !rec->no_buildid)
2400                         disable = false;
2401                 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2402                         disable = false;
2403                 if (disable) {
2404                         rec->no_buildid = true;
2405                         rec->no_buildid_cache = true;
2406                         disable_buildid_cache();
2407                 }
2408         }
2409 
2410         if (record.opts.overwrite)
2411                 record.opts.tail_synthesize = true;
2412 
2413         if (rec->evlist->core.nr_entries == 0 &&
2414             __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2415                 pr_err("Not enough memory for event selector list\n");
2416                 goto out;
2417         }
2418 
2419         if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2420                 rec->opts.no_inherit = true;
2421 
2422         err = target__validate(&rec->opts.target);
2423         if (err) {
2424                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2425                 ui__warning("%s\n", errbuf);
2426         }
2427 
2428         err = target__parse_uid(&rec->opts.target);
2429         if (err) {
2430                 int saved_errno = errno;
2431 
2432                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2433                 ui__error("%s", errbuf);
2434 
2435                 err = -saved_errno;
2436                 goto out;
2437         }
2438 
2439         /* Enable ignoring missing threads when -u/-p option is defined. */
2440         rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2441 
2442         err = -ENOMEM;
2443         if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2444                 usage_with_options(record_usage, record_options);
2445 
2446         err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2447         if (err)
2448                 goto out;
2449 
2450         /*
2451          * We take all buildids when the file contains AUX area
2452          * tracing data, because we do not decode the trace
2453          * (it would take too long).
2454          */
2455         if (rec->opts.full_auxtrace)
2456                 rec->buildid_all = true;
2457 
2458         if (record_opts__config(&rec->opts)) {
2459                 err = -EINVAL;
2460                 goto out;
2461         }
2462 
2463         if (rec->opts.nr_cblocks > nr_cblocks_max)
2464                 rec->opts.nr_cblocks = nr_cblocks_max;
2465         pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2466 
2467         pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2468         pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2469 
2470         if (rec->opts.comp_level > comp_level_max)
2471                 rec->opts.comp_level = comp_level_max;
2472         pr_debug("comp level: %d\n", rec->opts.comp_level);
2473 
2474         err = __cmd_record(&record, argc, argv);
2475 out:
2476         evlist__delete(rec->evlist);
2477         symbol__exit();
2478         auxtrace_record__free(rec->itr);
2479         return err;
2480 }
2481 
2482 static void snapshot_sig_handler(int sig __maybe_unused)
2483 {
2484         struct record *rec = &record;
2485 
2486         if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2487                 trigger_hit(&auxtrace_snapshot_trigger);
2488                 auxtrace_record__snapshot_started = 1;
2489                 if (auxtrace_record__snapshot_start(record.itr))
2490                         trigger_error(&auxtrace_snapshot_trigger);
2491         }
2492 
2493         if (switch_output_signal(rec))
2494                 trigger_hit(&switch_output_trigger);
2495 }
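/*
 * How these handlers are typically exercised (a sketch):
 *
 *   # take an AUX area snapshot, or switch output files when
 *   # --switch-output=signal is in effect
 *   kill -USR2 <pid of perf record>
 *
 * Time-based switching instead uses alarm(), re-armed in the __cmd_record()
 * main loop, which lands in alarm_sig_handler() below.
 */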
2496 
2497 static void alarm_sig_handler(int sig __maybe_unused)
2498 {
2499         struct record *rec = &record;
2500 
2501         if (switch_output_time(rec))
2502                 trigger_hit(&switch_output_trigger);
2503 }
