/*
 * Cell Broadband Engine OProfile Support
 *
 * (C) Copyright IBM Corporation 2006
 *
 * Author: Maynard Johnson <maynardj@us.ibm.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

/* The purpose of this file is to handle SPU event task switching
 * and to record SPU context information into the OProfile
 * event buffer.
 *
 * Additionally, the spu_sync_buffer function is provided as a helper
 * for recording actual SPU program counter samples to the event buffer.
 */
#include <linux/dcookies.h>
#include <linux/kref.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/numa.h>
#include <linux/oprofile.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include "pr_util.h"

#define RELEASE_ALL 9999

static DEFINE_SPINLOCK(buffer_lock);
static DEFINE_SPINLOCK(cache_lock);
static int num_spu_nodes;
int spu_prof_num_nodes;

struct spu_buffer spu_buff[MAX_NUMNODES * SPUS_PER_NODE];
struct delayed_work spu_work;
static unsigned max_spu_buff;

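/* Data flow overview for the globals above:
 *
 *   spu_buff_add()      -- producer; appends one value to a per-SPU
 *                          circular buffer (buffer_lock held).
 *   sync_spu_buff()     -- consumer; copies each per-SPU buffer into the
 *                          OProfile kernel event buffer.
 *   wq_sync_spu_buff()  -- delayed-work wrapper (spu_work) that calls
 *                          sync_spu_buff() periodically while SPU
 *                          profiling is running.
 *
 * max_spu_buff is the per-SPU buffer size in entries; it is taken from
 * the OProfile "cpu buffer size" parameter in oprofile_spu_buff_create().
 */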
static void spu_buff_add(unsigned long int value, int spu)
{
	/* spu buff is a circular buffer.  Add entries to the
	 * head.  Head is the index to store the next value.
	 * The buffer is full when there is one available entry
	 * in the queue, i.e. head and tail can't be equal.
	 * That way we can tell the difference between the
	 * buffer being full versus empty.
	 *
	 *  ASSUMPTION: the buffer_lock is held when this function
	 *              is called to lock the buffer, head and tail.
	 */
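	/* Example (max_spu_buff == 8): head == tail means the buffer is
	 * empty.  head == 7, tail == 0 (or head == 3, tail == 4) means it
	 * is full: storing one more entry would advance head onto tail,
	 * which would be indistinguishable from empty.
	 */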
	int full = 1;

	if (spu_buff[spu].head >= spu_buff[spu].tail) {
		if ((spu_buff[spu].head - spu_buff[spu].tail)
		    < (max_spu_buff - 1))
			full = 0;

	} else if (spu_buff[spu].tail > spu_buff[spu].head) {
		if ((spu_buff[spu].tail - spu_buff[spu].head)
		    > 1)
			full = 0;
	}

	if (!full) {
		spu_buff[spu].buff[spu_buff[spu].head] = value;
		spu_buff[spu].head++;

		if (spu_buff[spu].head >= max_spu_buff)
			spu_buff[spu].head = 0;
	} else {
		/* From the user's perspective make the SPU buffer
		 * size management/overflow look like we are using
		 * per cpu buffers.  The user uses the same
		 * per cpu parameter to adjust the SPU buffer size.
		 * Increment the sample_lost_overflow to inform
		 * the user the buffer size needs to be increased.
		 */
		oprofile_cpu_buffer_inc_smpl_lost();
	}
}

/* This function copies the per SPU buffers to the
 * OProfile kernel buffer.
 */
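/* The head index is sampled under buffer_lock and the copy is done with
 * the lock dropped, so producers may keep appending while the copy is in
 * progress; only the snapshot taken here is transferred, and the tail is
 * advanced to that snapshot afterwards.  This assumes oprofile_put_buff()
 * copies the range [tail, curr_head) and handles wrap-around at
 * max_spu_buff.
 */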
void sync_spu_buff(void)
{
	int spu;
	unsigned long flags;
	int curr_head;

	for (spu = 0; spu < num_spu_nodes; spu++) {
		/* In case there was an issue and the buffer didn't
		 * get created, skip it.
		 */
		if (spu_buff[spu].buff == NULL)
			continue;

		/* Hold the lock to make sure the head/tail
		 * doesn't change while spu_buff_add() is
		 * deciding if the buffer is full or not.
		 * Being a little paranoid.
		 */
		spin_lock_irqsave(&buffer_lock, flags);
		curr_head = spu_buff[spu].head;
		spin_unlock_irqrestore(&buffer_lock, flags);

		/* Transfer the current contents to the kernel buffer.
		 * Data can still be added to the head of the buffer.
		 */
		oprofile_put_buff(spu_buff[spu].buff,
				  spu_buff[spu].tail,
				  curr_head, max_spu_buff);

		spin_lock_irqsave(&buffer_lock, flags);
		spu_buff[spu].tail = curr_head;
		spin_unlock_irqrestore(&buffer_lock, flags);
	}
}

static void wq_sync_spu_buff(struct work_struct *work)
{
	/* move data from spu buffers to kernel buffer */
	sync_spu_buff();

	/* only reschedule if profiling is not done */
	if (spu_prof_running)
		schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);
}

/* Container for caching information about an active SPU task. */
struct cached_info {
	struct vma_to_fileoffset_map *map;
	struct spu *the_spu;	/* needed to access pointer to local_store */
	struct kref cache_ref;
};

static struct cached_info *spu_info[MAX_NUMNODES * SPUS_PER_NODE];
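/* Lifetime of a cached_info: prepare_cached_spu_info() allocates it with
 * an initial reference, takes a second reference, and hands the kref off
 * to SPUFS via spu_set_profile_private_kref().  OProfile drops its
 * reference in release_cached_info(); SPUFS is expected to drop the other
 * when the SPU context goes away.  Whichever kref_put() runs last invokes
 * destroy_cached_info(), which also releases the module reference taken
 * in prepare_cached_spu_info().
 */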

static void destroy_cached_info(struct kref *kref)
{
	struct cached_info *info;

	info = container_of(kref, struct cached_info, cache_ref);
	vma_map_free(info->map);
	kfree(info);
	module_put(THIS_MODULE);
}

/* Return the cached_info for the passed SPU number.
 * ATTENTION:  Callers are responsible for obtaining the
 *	       cache_lock if needed prior to invoking this function.
 */
static struct cached_info *get_cached_info(struct spu *the_spu, int spu_num)
{
	struct kref *ref;
	struct cached_info *ret_info;

	if (spu_num >= num_spu_nodes) {
		printk(KERN_ERR "SPU_PROF: "
		       "%s, line %d: Invalid index %d into spu info cache\n",
		       __func__, __LINE__, spu_num);
		ret_info = NULL;
		goto out;
	}
	if (!spu_info[spu_num] && the_spu) {
		ref = spu_get_profile_private_kref(the_spu->ctx);
		if (ref) {
			spu_info[spu_num] = container_of(ref,
					struct cached_info, cache_ref);
			kref_get(&spu_info[spu_num]->cache_ref);
		}
	}

	ret_info = spu_info[spu_num];
 out:
	return ret_info;
}

/* Looks for cached info for the passed spu.  If not found, the
 * cached info is created for the passed spu.
 * Returns 0 for success; otherwise, a negative errno value for error.
 */
static int
prepare_cached_spu_info(struct spu *spu, unsigned long objectId)
{
	unsigned long flags;
	struct vma_to_fileoffset_map *new_map;
	int retval = 0;
	struct cached_info *info;

	/* We won't bother getting cache_lock here since we
	 * don't do anything with the cached_info that's returned.
	 */
	info = get_cached_info(spu, spu->number);

	if (info) {
		pr_debug("Found cached SPU info.\n");
		goto out;
	}

	/* Create cached_info and set spu_info[spu->number] to point to it.
	 * spu->number is a system-wide value, not a per-node value.
	 */
	info = kzalloc(sizeof(struct cached_info), GFP_KERNEL);
	if (!info) {
		printk(KERN_ERR "SPU_PROF: "
		       "%s, line %d: failed to allocate cached_info\n",
		       __func__, __LINE__);
		retval = -ENOMEM;
		goto err_alloc;
	}
	new_map = create_vma_map(spu, objectId);
	if (!new_map) {
		printk(KERN_ERR "SPU_PROF: "
		       "%s, line %d: create vma_map failed\n",
		       __func__, __LINE__);
		retval = -ENOMEM;
		goto err_alloc;
	}

	pr_debug("Created vma_map\n");
	info->map = new_map;
	info->the_spu = spu;
	kref_init(&info->cache_ref);
	spin_lock_irqsave(&cache_lock, flags);
	spu_info[spu->number] = info;
	/* Increment count before passing off ref to SPUFS. */
	kref_get(&info->cache_ref);

	/* We increment the module refcount here since SPUFS is
	 * responsible for the final destruction of the cached_info,
	 * and it must be able to access the destroy_cached_info()
	 * function defined in the OProfile module.  We decrement
	 * the module refcount in destroy_cached_info.
	 */
	try_module_get(THIS_MODULE);
	spu_set_profile_private_kref(spu->ctx, &info->cache_ref,
				destroy_cached_info);
	spin_unlock_irqrestore(&cache_lock, flags);
	goto out;

err_alloc:
	kfree(info);
out:
	return retval;
}

/*
 * NOTE:  The caller is responsible for locking the
 *	  cache_lock prior to calling this function.
 */
static int release_cached_info(int spu_index)
{
	int index, end;

	if (spu_index == RELEASE_ALL) {
		end = num_spu_nodes;
		index = 0;
	} else {
		if (spu_index >= num_spu_nodes) {
			printk(KERN_ERR "SPU_PROF: "
				"%s, line %d: "
				"Invalid index %d into spu info cache\n",
				__func__, __LINE__, spu_index);
			goto out;
		}
		end = spu_index + 1;
		index = spu_index;
	}
	for (; index < end; index++) {
		if (spu_info[index]) {
			kref_put(&spu_info[index]->cache_ref,
				 destroy_cached_info);
			spu_info[index] = NULL;
		}
	}

out:
	return 0;
}

/* The source code for fast_get_dcookie was "borrowed"
 * from drivers/oprofile/buffer_sync.c.
 */

/* Optimisation. We can manage without taking the dcookie sem
 * because we cannot reach this code without at least one
 * dcookie user still being registered (namely, the reader
 * of the event buffer).
 */
static inline unsigned long fast_get_dcookie(struct path *path)
{
	unsigned long cookie;

	if (path->dentry->d_flags & DCACHE_COOKIE)
		return (unsigned long)path->dentry;
	get_dcookie(path, &cookie);
	return cookie;
}

/* Look up the dcookie for the task's mm->exe_file,
 * which corresponds loosely to "application name". Also, determine
 * the offset for the SPU ELF object.  If the computed offset is
 * non-zero, it implies an embedded SPU object; otherwise, it's a
 * separate SPU binary, in which case we retrieve its dcookie.
 * For the embedded case, we must determine if the SPU ELF is embedded
 * in the executable application or another file (i.e., a shared lib).
 * If embedded in a shared lib, we must get the dcookie and return
 * that to the caller.
 */
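/* Illustrative (hypothetical) example: if the SPU ELF image lives at
 * offset 0x10000 of a mapping that starts at 0x30000000 and spu_ref (the
 * object-id) is 0x30010000, then *offsetp is set to 0x10000 and
 * *spu_bin_dcookie identifies the file backing that mapping (the
 * application itself or a shared library).  For a standalone SPU binary
 * mapped at the start of its own file mapping, the offset works out to
 * zero and the dcookie names the SPU binary directly.
 */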
static unsigned long
get_exec_dcookie_and_offset(struct spu *spu, unsigned int *offsetp,
			    unsigned long *spu_bin_dcookie,
			    unsigned long spu_ref)
{
	unsigned long app_cookie = 0;
	unsigned int my_offset = 0;
	struct vm_area_struct *vma;
	struct file *exe_file;
	struct mm_struct *mm = spu->mm;

	if (!mm)
		goto out;

	exe_file = get_mm_exe_file(mm);
	if (exe_file) {
		app_cookie = fast_get_dcookie(&exe_file->f_path);
		pr_debug("got dcookie for %pD\n", exe_file);
		fput(exe_file);
	}

	down_read(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (vma->vm_start > spu_ref || vma->vm_end <= spu_ref)
			continue;
		my_offset = spu_ref - vma->vm_start;
		if (!vma->vm_file)
			goto fail_no_image_cookie;

		pr_debug("Found spu ELF at %X(object-id:%lx) for file %pD\n",
			 my_offset, spu_ref, vma->vm_file);
		*offsetp = my_offset;
		break;
	}

	/* If no vma covered spu_ref, vma is NULL here; bail out rather
	 * than dereference it below.
	 */
	if (!vma)
		goto fail_no_image_cookie;

	*spu_bin_dcookie = fast_get_dcookie(&vma->vm_file->f_path);
	pr_debug("got dcookie for %pD\n", vma->vm_file);

	up_read(&mm->mmap_sem);

out:
	return app_cookie;

fail_no_image_cookie:
	up_read(&mm->mmap_sem);

	printk(KERN_ERR "SPU_PROF: "
		"%s, line %d: Cannot find dcookie for SPU binary\n",
		__func__, __LINE__);
	goto out;
}

/* This function finds or creates cached context information for the
 * passed SPU and records SPU context information into the OProfile
 * event buffer.
 */
static int process_context_switch(struct spu *spu, unsigned long objectId)
{
	unsigned long flags;
	int retval;
	unsigned int offset = 0;
	unsigned long spu_cookie = 0, app_dcookie;

	retval = prepare_cached_spu_info(spu, objectId);
	if (retval)
		goto out;

	/* Get dcookie first because a mutex_lock is taken in that
	 * code path, so interrupts must not be disabled.
	 */
	app_dcookie = get_exec_dcookie_and_offset(spu, &offset, &spu_cookie,
						  objectId);
	if (!app_dcookie || !spu_cookie) {
		retval = -ENOENT;
		goto out;
	}

	/* Record context info in event buffer */
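	/* The switch record layout, as written below, is:
	 *   ESCAPE_CODE, SPU_CTX_SWITCH_CODE, spu number, pid, tgid,
	 *   application dcookie, SPU binary dcookie, offset of the SPU
	 *   ELF within that binary.
	 * The post-processor depends on seeing a context record before
	 * the PC samples that follow it (see ctx_sw_seen below).
	 */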
	spin_lock_irqsave(&buffer_lock, flags);
	spu_buff_add(ESCAPE_CODE, spu->number);
	spu_buff_add(SPU_CTX_SWITCH_CODE, spu->number);
	spu_buff_add(spu->number, spu->number);
	spu_buff_add(spu->pid, spu->number);
	spu_buff_add(spu->tgid, spu->number);
	spu_buff_add(app_dcookie, spu->number);
	spu_buff_add(spu_cookie, spu->number);
	spu_buff_add(offset, spu->number);

	/* Set flag to indicate SPU PC data can now be written out.  If
	 * the SPU program counter data is seen before an SPU context
	 * record is seen, the postprocessing will fail.
	 */
	spu_buff[spu->number].ctx_sw_seen = 1;

	spin_unlock_irqrestore(&buffer_lock, flags);
	smp_wmb();	/* ensure spu event buffer updates are written */
			/* don't want entries intermingled... */
out:
	return retval;
}

/*
 * This function is invoked on either a bind_context or unbind_context.
 * If called for an unbind_context, the val arg is 0; otherwise,
 * it is the object-id value for the spu context.
 * The data arg is of type 'struct spu *'.
 */
static int spu_active_notify(struct notifier_block *self, unsigned long val,
				void *data)
{
	int retval;
	unsigned long flags;
	struct spu *the_spu = data;

	pr_debug("SPU event notification arrived\n");
	if (!val) {
		spin_lock_irqsave(&cache_lock, flags);
		retval = release_cached_info(the_spu->number);
		spin_unlock_irqrestore(&cache_lock, flags);
	} else {
		retval = process_context_switch(the_spu, val);
	}
	return retval;
}

static struct notifier_block spu_active = {
	.notifier_call = spu_active_notify,
};

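/* Count the Cell nodes that have at least one online CPU.  This relies on
 * cbe_cpu_to_node() returning node ids that are contiguous starting at 0,
 * which matches how per-node data is indexed elsewhere in this file; with
 * a gap in the node numbering the result would not be a simple count.
 */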
static int number_of_online_nodes(void)
{
	u32 cpu;
	u32 tmp;
	int nodes = 0;

	for_each_online_cpu(cpu) {
		tmp = cbe_cpu_to_node(cpu) + 1;
		if (tmp > nodes)
			nodes++;
	}
	return nodes;
}

static int oprofile_spu_buff_create(void)
{
	int spu;

	max_spu_buff = oprofile_get_cpu_buffer_size();

	for (spu = 0; spu < num_spu_nodes; spu++) {
		/* create circular buffers to store the data in.
		 * use locks to manage accessing the buffers
		 */
		spu_buff[spu].head = 0;
		spu_buff[spu].tail = 0;

		/*
		 * Create a buffer for each SPU.  Can't reliably
		 * create a single buffer for all spus due to not
		 * enough contiguous kernel memory.
		 */

		spu_buff[spu].buff = kzalloc((max_spu_buff
					      * sizeof(unsigned long)),
					     GFP_KERNEL);

		if (!spu_buff[spu].buff) {
			printk(KERN_ERR "SPU_PROF: "
			       "%s, line %d: failed to allocate spu buffer %d.\n",
			       __func__, __LINE__, spu);

			/* release the spu buffers that have been allocated */
			while (spu >= 0) {
				kfree(spu_buff[spu].buff);
				spu_buff[spu].buff = NULL;
				spu--;
			}
			return -ENOMEM;
		}
	}
	return 0;
}

/* The main purpose of this function is to synchronize
 * OProfile with SPUFS by registering to be notified of
 * SPU task switches.
 *
 * NOTE: When profiling SPUs, we must ensure that only
 * spu_sync_start is invoked and not the generic sync_start
 * in drivers/oprofile/oprof.c.  A return value of
 * SKIP_GENERIC_SYNC or SYNC_START_ERROR will
 * accomplish this.
 */
int spu_sync_start(void)
{
	int spu;
	int ret = SKIP_GENERIC_SYNC;
	int register_ret;
	unsigned long flags = 0;

	spu_prof_num_nodes = number_of_online_nodes();
	num_spu_nodes = spu_prof_num_nodes * SPUS_PER_NODE;
	INIT_DELAYED_WORK(&spu_work, wq_sync_spu_buff);

	/* create buffer for storing the SPU data to put in
	 * the kernel buffer.
	 */
	ret = oprofile_spu_buff_create();
	if (ret)
		goto out;

	spin_lock_irqsave(&buffer_lock, flags);
	for (spu = 0; spu < num_spu_nodes; spu++) {
		spu_buff_add(ESCAPE_CODE, spu);
		spu_buff_add(SPU_PROFILING_CODE, spu);
		spu_buff_add(num_spu_nodes, spu);
	}
	spin_unlock_irqrestore(&buffer_lock, flags);

	for (spu = 0; spu < num_spu_nodes; spu++) {
		spu_buff[spu].ctx_sw_seen = 0;
		spu_buff[spu].last_guard_val = 0;
	}

	/* Register for SPU events */
	register_ret = spu_switch_event_register(&spu_active);
	if (register_ret) {
		ret = SYNC_START_ERROR;
		goto out;
	}

	pr_debug("spu_sync_start -- running.\n");
out:
	return ret;
}

/* Record SPU program counter samples to the oprofile event buffer. */
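/* Each recorded sample is the SPU program counter's file offset within
 * its binary, with the SPU number packed into the upper 32 bits
 * ((u64)spu_num << 32 | file_offset), so the post-processing tools can
 * attribute the sample to the correct SPU context.
 */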
void spu_sync_buffer(int spu_num, unsigned int *samples,
		     int num_samples)
{
	unsigned long long file_offset;
	unsigned long flags;
	int i;
	struct vma_to_fileoffset_map *map;
	struct spu *the_spu;
	unsigned long long spu_num_ll = spu_num;
	unsigned long long spu_num_shifted = spu_num_ll << 32;
	struct cached_info *c_info;

	/* We need to obtain the cache_lock here because it's
	 * possible that after getting the cached_info, the SPU job
	 * corresponding to this cached_info may end, thus resulting
	 * in the destruction of the cached_info.
	 */
	spin_lock_irqsave(&cache_lock, flags);
	c_info = get_cached_info(NULL, spu_num);
	if (!c_info) {
		/* This legitimately happens when the SPU task ends before all
		 * samples are recorded.
		 * No big deal -- so we just drop a few samples.
		 */
		pr_debug("SPU_PROF: No cached SPU context "
			 "for SPU #%d. Dropping samples.\n", spu_num);
		goto out;
	}

	map = c_info->map;
	the_spu = c_info->the_spu;
	spin_lock(&buffer_lock);
	for (i = 0; i < num_samples; i++) {
		unsigned int sample = *(samples+i);
		int grd_val = 0;
		file_offset = 0;
		if (sample == 0)
			continue;
		file_offset = vma_map_lookup(map, sample, the_spu, &grd_val);

		/* If overlays are used by this SPU application, the guard
		 * value is non-zero, indicating which overlay section is in
		 * use.  We need to discard samples taken during the time
		 * period in which an overlay transition occurs (i.e., the
		 * guard value changes).
		 */
		if (grd_val && grd_val != spu_buff[spu_num].last_guard_val) {
			spu_buff[spu_num].last_guard_val = grd_val;
			/* Drop the rest of the samples. */
			break;
		}

		/* We must ensure that the SPU context switch has been written
		 * out before samples for the SPU.  Otherwise, the SPU context
		 * information is not available and the postprocessing of the
		 * SPU PC will fail with no available anonymous map information.
		 */
		if (spu_buff[spu_num].ctx_sw_seen)
			spu_buff_add((file_offset | spu_num_shifted),
				     spu_num);
	}
	spin_unlock(&buffer_lock);
out:
	spin_unlock_irqrestore(&cache_lock, flags);
}

int spu_sync_stop(void)
{
	unsigned long flags = 0;
	int ret;
	int k;

	ret = spu_switch_event_unregister(&spu_active);

	if (ret)
		printk(KERN_ERR "SPU_PROF: "
		       "%s, line %d: spu_switch_event_unregister "
		       "returned %d\n",
		       __func__, __LINE__, ret);

	/* flush any remaining data in the per SPU buffers */
	sync_spu_buff();

	spin_lock_irqsave(&cache_lock, flags);
	ret = release_cached_info(RELEASE_ALL);
	spin_unlock_irqrestore(&cache_lock, flags);

	/* remove the scheduled work queue item rather than waiting
	 * for every queued entry to execute.  Then flush the pending
	 * system wide buffer to the event buffer.
	 */
	cancel_delayed_work(&spu_work);

	for (k = 0; k < num_spu_nodes; k++) {
		spu_buff[k].ctx_sw_seen = 0;

		/*
		 * spu_buff[k].buff will be NULL if there was a problem
		 * allocating the buffer.  kfree() handles NULL, so just
		 * free it and clear the pointer.
		 */
		kfree(spu_buff[k].buff);
		spu_buff[k].buff = NULL;
	}
	pr_debug("spu_sync_stop -- done.\n");
	return ret;
}