/**
 * @file cpu_buffer.c
 *
 * @remark Copyright 2002-2009 OProfile authors
 * @remark Read the file COPYING
 *
 * @author John Levon <levon@movementarian.org>
 * @author Barry Kasindorf <barry.kasindorf@amd.com>
 * @author Robert Richter <robert.richter@amd.com>
 *
 * Each CPU has a local buffer that stores PC value/event
 * pairs. We also log context switches when we notice them.
 * Eventually each CPU's buffer is processed into the global
 * event buffer by sync_buffer().
 *
 * We use a local buffer for two reasons: an NMI or similar
 * interrupt cannot synchronise, and high sampling rates
 * would lead to catastrophic global synchronisation if
 * a global buffer was used.
 */

#include <linux/sched.h>
#include <linux/oprofile.h>
#include <linux/errno.h>

#include "event_buffer.h"
#include "cpu_buffer.h"
#include "buffer_sync.h"
#include "oprof.h"

#define OP_BUFFER_FLAGS	0

static struct ring_buffer *op_ring_buffer;
DEFINE_PER_CPU(struct oprofile_cpu_buffer, op_cpu_buffer);

static void wq_sync_buffer(struct work_struct *work);

#define DEFAULT_TIMER_EXPIRE (HZ / 10)
static int work_enabled;

unsigned long oprofile_get_cpu_buffer_size(void)
{
	return oprofile_cpu_buffer_size;
}

void oprofile_cpu_buffer_inc_smpl_lost(void)
{
	struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);

	cpu_buf->sample_lost_overflow++;
}

void free_cpu_buffers(void)
{
	if (op_ring_buffer)
		ring_buffer_free(op_ring_buffer);
	op_ring_buffer = NULL;
}

#define RB_EVENT_HDR_SIZE 4

int alloc_cpu_buffers(void)
{
	int i;

	unsigned long buffer_size = oprofile_cpu_buffer_size;
	unsigned long byte_size = buffer_size * (sizeof(struct op_sample) +
						 RB_EVENT_HDR_SIZE);

	op_ring_buffer = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS);
	if (!op_ring_buffer)
		goto fail;

	for_each_possible_cpu(i) {
		struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

		b->last_task = NULL;
		b->last_is_kernel = -1;
		b->tracing = 0;
		b->buffer_size = buffer_size;
		b->sample_received = 0;
		b->sample_lost_overflow = 0;
		b->backtrace_aborted = 0;
		b->sample_invalid_eip = 0;
		b->cpu = i;
		INIT_DELAYED_WORK(&b->work, wq_sync_buffer);
	}
	return 0;

fail:
	free_cpu_buffers();
	return -ENOMEM;
}

void start_cpu_work(void)
{
	int i;

	work_enabled = 1;

	for_each_online_cpu(i) {
		struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

		/*
		 * Spread the work by 1 jiffy per cpu so they don't all
		 * fire at once.
		 */
		schedule_delayed_work_on(i, &b->work, DEFAULT_TIMER_EXPIRE + i);
	}
}

void end_cpu_work(void)
{
	work_enabled = 0;
}

void flush_cpu_work(void)
{
	int i;

	for_each_online_cpu(i) {
		struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);

		/* this work is per-cpu, no need for flush_sync */
		flush_delayed_work(&b->work);
	}
}
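
/*
 * A rough sketch of how the buffer management above is meant to be
 * sequenced by the oprofile core: allocate and arm the per-cpu work
 * when profiling is set up and started, then disarm, flush and free it
 * again on shutdown. The real call sites live elsewhere in the driver
 * (oprof.c and buffer_sync.c); this only illustrates the intended
 * ordering:
 *
 *	if (alloc_cpu_buffers())
 *		return -ENOMEM;
 *	start_cpu_work();
 *	...
 *	end_cpu_work();
 *	flush_cpu_work();
 *	free_cpu_buffers();
 */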

/*
 * This function prepares the cpu buffer to write a sample.
 *
 * Struct op_entry is used during operations on the ring buffer while
 * struct op_sample contains the data that is stored in the ring
 * buffer. The entry argument may be passed in uninitialized. The
 * function reserves a data array of the given size. Use
 * op_cpu_buffer_write_commit() after preparing the sample. On error a
 * NULL pointer is returned, otherwise a pointer to the sample.
 */
struct op_sample
*op_cpu_buffer_write_reserve(struct op_entry *entry, unsigned long size)
{
	entry->event = ring_buffer_lock_reserve
		(op_ring_buffer, sizeof(struct op_sample) +
		 size * sizeof(entry->sample->data[0]));
	if (!entry->event)
		return NULL;
	entry->sample = ring_buffer_event_data(entry->event);
	entry->size = size;
	entry->data = entry->sample->data;

	return entry->sample;
}

int op_cpu_buffer_write_commit(struct op_entry *entry)
{
	return ring_buffer_unlock_commit(op_ring_buffer, entry->event);
}

struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
{
	struct ring_buffer_event *e;
	e = ring_buffer_consume(op_ring_buffer, cpu, NULL, NULL);
	if (!e)
		return NULL;

	entry->event = e;
	entry->sample = ring_buffer_event_data(e);
	entry->size = (ring_buffer_event_length(e) - sizeof(struct op_sample))
		/ sizeof(entry->sample->data[0]);
	entry->data = entry->sample->data;
	return entry->sample;
}

unsigned long op_cpu_buffer_entries(int cpu)
{
	return ring_buffer_entries_cpu(op_ring_buffer, cpu);
}

static int
op_add_code(struct oprofile_cpu_buffer *cpu_buf, unsigned long backtrace,
	    int is_kernel, struct task_struct *task)
{
	struct op_entry entry;
	struct op_sample *sample;
	unsigned long flags;
	int size;

	flags = 0;

	if (backtrace)
		flags |= TRACE_BEGIN;

	/* notice a switch from user->kernel or vice versa */
	is_kernel = !!is_kernel;
	if (cpu_buf->last_is_kernel != is_kernel) {
		cpu_buf->last_is_kernel = is_kernel;
		flags |= KERNEL_CTX_SWITCH;
		if (is_kernel)
			flags |= IS_KERNEL;
	}

	/* notice a task switch */
	if (cpu_buf->last_task != task) {
		cpu_buf->last_task = task;
		flags |= USER_CTX_SWITCH;
	}

	if (!flags)
		/* nothing to do */
		return 0;

	if (flags & USER_CTX_SWITCH)
		size = 1;
	else
		size = 0;

	sample = op_cpu_buffer_write_reserve(&entry, size);
	if (!sample)
		return -ENOMEM;

	sample->eip = ESCAPE_CODE;
	sample->event = flags;

	if (size)
		op_cpu_buffer_add_data(&entry, (unsigned long)task);

	op_cpu_buffer_write_commit(&entry);

	return 0;
}

static inline int
op_add_sample(struct oprofile_cpu_buffer *cpu_buf,
	      unsigned long pc, unsigned long event)
{
	struct op_entry entry;
	struct op_sample *sample;

	sample = op_cpu_buffer_write_reserve(&entry, 0);
	if (!sample)
		return -ENOMEM;

	sample->eip = pc;
	sample->event = event;

	return op_cpu_buffer_write_commit(&entry);
}
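
/*
 * The read side of this API is used by sync_buffer() in buffer_sync.c
 * when it drains a cpu's buffer into the global event buffer. A minimal
 * consumer sketch, assuming a hypothetical local variable "cpu" naming
 * the cpu being drained (for illustration only):
 *
 *	struct op_entry entry;
 *	struct op_sample *sample;
 *	unsigned long avail = op_cpu_buffer_entries(cpu);
 *
 *	while (avail--) {
 *		sample = op_cpu_buffer_read_entry(&entry, cpu);
 *		if (!sample)
 *			break;
 *		(decode sample->eip, sample->event and entry.data[] here)
 *	}
 */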

/*
 * This must be safe from any context.
 *
 * is_kernel is needed because on some architectures you cannot
 * tell if you are in kernel or user space simply by looking at
 * pc. We tag this in the buffer by generating kernel enter/exit
 * events whenever is_kernel changes.
 */
static int
log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc,
	   unsigned long backtrace, int is_kernel, unsigned long event,
	   struct task_struct *task)
{
	struct task_struct *tsk = task ? task : current;
	cpu_buf->sample_received++;

	if (pc == ESCAPE_CODE) {
		cpu_buf->sample_invalid_eip++;
		return 0;
	}

	if (op_add_code(cpu_buf, backtrace, is_kernel, tsk))
		goto fail;

	if (op_add_sample(cpu_buf, pc, event))
		goto fail;

	return 1;

fail:
	cpu_buf->sample_lost_overflow++;
	return 0;
}

static inline void oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf)
{
	cpu_buf->tracing = 1;
}

static inline void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf)
{
	cpu_buf->tracing = 0;
}

static inline void
__oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
			  unsigned long event, int is_kernel,
			  struct task_struct *task)
{
	struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);
	unsigned long backtrace = oprofile_backtrace_depth;

	/*
	 * if log_sample() fails we can't backtrace since we lost the
	 * source of this event
	 */
	if (!log_sample(cpu_buf, pc, backtrace, is_kernel, event, task))
		/* failed */
		return;

	if (!backtrace)
		return;

	oprofile_begin_trace(cpu_buf);
	oprofile_ops.backtrace(regs, backtrace);
	oprofile_end_trace(cpu_buf);
}

void oprofile_add_ext_hw_sample(unsigned long pc, struct pt_regs * const regs,
				unsigned long event, int is_kernel,
				struct task_struct *task)
{
	__oprofile_add_ext_sample(pc, regs, event, is_kernel, task);
}

void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
			     unsigned long event, int is_kernel)
{
	__oprofile_add_ext_sample(pc, regs, event, is_kernel, NULL);
}

void oprofile_add_sample(struct pt_regs * const regs, unsigned long event)
{
	int is_kernel;
	unsigned long pc;

	if (likely(regs)) {
		is_kernel = !user_mode(regs);
		pc = profile_pc(regs);
	} else {
		is_kernel = 0;    /* This value will not be used */
		pc = ESCAPE_CODE; /* as this causes an early return. */
	}

	__oprofile_add_ext_sample(pc, regs, event, is_kernel, NULL);
}

/*
 * Add samples with data to the ring buffer.
 *
 * Use oprofile_add_data(&entry, val) to add data and
 * oprofile_write_commit(&entry) to commit the sample.
 */
void
oprofile_write_reserve(struct op_entry *entry, struct pt_regs * const regs,
		       unsigned long pc, int code, int size)
{
	struct op_sample *sample;
	int is_kernel = !user_mode(regs);
	struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);

	cpu_buf->sample_received++;

	/* no backtraces for samples with data */
	if (op_add_code(cpu_buf, 0, is_kernel, current))
		goto fail;

	sample = op_cpu_buffer_write_reserve(entry, size + 2);
	if (!sample)
		goto fail;
	sample->eip = ESCAPE_CODE;
	sample->event = 0;		/* no flags */

	op_cpu_buffer_add_data(entry, code);
	op_cpu_buffer_add_data(entry, pc);

	return;

fail:
	entry->event = NULL;
	cpu_buf->sample_lost_overflow++;
}

int oprofile_add_data(struct op_entry *entry, unsigned long val)
{
	if (!entry->event)
		return 0;
	return op_cpu_buffer_add_data(entry, val);
}

int oprofile_add_data64(struct op_entry *entry, u64 val)
{
	if (!entry->event)
		return 0;
	if (op_cpu_buffer_get_size(entry) < 2)
		/*
		 * the function returns 0 to indicate that the buffer
		 * is too small, even if there is some space left
		 */
		return 0;
	if (!op_cpu_buffer_add_data(entry, (u32)val))
		return 0;
	return op_cpu_buffer_add_data(entry, (u32)(val >> 32));
}

int oprofile_write_commit(struct op_entry *entry)
{
	if (!entry->event)
		return -EINVAL;
	return op_cpu_buffer_write_commit(entry);
}

void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
{
	struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);
	log_sample(cpu_buf, pc, 0, is_kernel, event, NULL);
}

void oprofile_add_trace(unsigned long pc)
{
	struct oprofile_cpu_buffer *cpu_buf = this_cpu_ptr(&op_cpu_buffer);

	if (!cpu_buf->tracing)
		return;

	/*
	 * a broken frame can give an eip with the same value as an
	 * escape code, abort the trace if we get it
	 */
	if (pc == ESCAPE_CODE)
		goto fail;

	if (op_add_sample(cpu_buf, pc, 0))
		goto fail;

	return;
fail:
	cpu_buf->tracing = 0;
	cpu_buf->backtrace_aborted++;
	return;
}

/*
 * This serves to avoid cpu buffer overflow, and makes sure
 * the task mortuary progresses.
 *
 * By using schedule_delayed_work_on and then schedule_delayed_work
 * we guarantee this will stay on the correct cpu.
 */
static void wq_sync_buffer(struct work_struct *work)
{
	struct oprofile_cpu_buffer *b =
		container_of(work, struct oprofile_cpu_buffer, work.work);
	if (b->cpu != smp_processor_id() && !cpu_online(b->cpu)) {
		cancel_delayed_work(&b->work);
		return;
	}
	sync_buffer(b->cpu);

	/* don't re-add the work if we're shutting down */
	if (work_enabled)
		schedule_delayed_work(&b->work, DEFAULT_TIMER_EXPIRE);
}