1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License.  See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Copyright (c) 2004-2009 Silicon Graphics, Inc.  All Rights Reserved.
7 */
8
9/*
10 * Cross Partition Communication (XPC) support - standard version.
11 *
12 *	XPC provides a message passing capability that crosses partition
13 *	boundaries. This module is made up of two parts:
14 *
15 *	    partition	This part detects the presence/absence of other
16 *			partitions. It provides a heartbeat and monitors
17 *			the heartbeats of other partitions.
18 *
19 *	    channel	This part manages the channels and sends/receives
20 *			messages across them to/from other partitions.
21 *
22 *	There are a couple of additional functions residing in XP, which
23 *	provide an interface to XPC for its users.
24 *
25 *
26 *	Caveats:
27 *
28 *	  . Currently on sn2, we have no way to determine which nasid an IRQ
29 *	    came from. Thus, xpc_send_IRQ_sn2() does a remote amo write
30 *	    followed by an IPI. The amo indicates where data is to be pulled
31 *	    from, so after the IPI arrives, the remote partition checks the amo
32 *	    word. The IPI can actually arrive before the amo however, so other
33 *	    code must periodically check for this case. Also, remote amo
34 *	    operations do not reliably time out. Thus we do a remote PIO read
35 *	    solely to know whether the remote partition is down and whether we
36 *	    should stop sending IPIs to it. This remote PIO read operation is
37 *	    set up in a special nofault region so SAL knows to ignore (and
38 *	    cleanup) any errors due to the remote amo write, PIO read, and/or
39 *	    PIO write operations.
40 *
41 *	    If/when new hardware solves this IPI problem, we should abandon
42 *	    the current approach.
43 *
44 */
45
46#include <linux/module.h>
47#include <linux/slab.h>
48#include <linux/sysctl.h>
49#include <linux/device.h>
50#include <linux/delay.h>
51#include <linux/reboot.h>
52#include <linux/kdebug.h>
53#include <linux/kthread.h>
54#include "xpc.h"
55
56#ifdef CONFIG_X86_64
57#include <asm/traps.h>
58#endif
59
60/* define two XPC debug device structures to be used with dev_dbg() et al */
61
62struct device_driver xpc_dbg_name = {
63	.name = "xpc"
64};
65
66struct device xpc_part_dbg_subname = {
67	.init_name = "",	/* set to "part" at xpc_init() time */
68	.driver = &xpc_dbg_name
69};
70
71struct device xpc_chan_dbg_subname = {
72	.init_name = "",	/* set to "chan" at xpc_init() time */
73	.driver = &xpc_dbg_name
74};
75
76struct device *xpc_part = &xpc_part_dbg_subname;
77struct device *xpc_chan = &xpc_chan_dbg_subname;
78
79static int xpc_kdebug_ignore;
80
81/* systune related variables for /proc/sys directories */
82
83static int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL;
84static int xpc_hb_min_interval = 1;
85static int xpc_hb_max_interval = 10;
86
87static int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_INTERVAL;
88static int xpc_hb_check_min_interval = 10;
89static int xpc_hb_check_max_interval = 120;
90
91int xpc_disengage_timelimit = XPC_DISENGAGE_DEFAULT_TIMELIMIT;
92static int xpc_disengage_min_timelimit;	/* = 0 */
93static int xpc_disengage_max_timelimit = 120;
94
95static struct ctl_table xpc_sys_xpc_hb_dir[] = {
96	{
97	 .procname = "hb_interval",
98	 .data = &xpc_hb_interval,
99	 .maxlen = sizeof(int),
100	 .mode = 0644,
101	 .proc_handler = proc_dointvec_minmax,
102	 .extra1 = &xpc_hb_min_interval,
103	 .extra2 = &xpc_hb_max_interval},
104	{
105	 .procname = "hb_check_interval",
106	 .data = &xpc_hb_check_interval,
107	 .maxlen = sizeof(int),
108	 .mode = 0644,
109	 .proc_handler = proc_dointvec_minmax,
110	 .extra1 = &xpc_hb_check_min_interval,
111	 .extra2 = &xpc_hb_check_max_interval},
112	{}
113};
114static struct ctl_table xpc_sys_xpc_dir[] = {
115	{
116	 .procname = "hb",
117	 .mode = 0555,
118	 .child = xpc_sys_xpc_hb_dir},
119	{
120	 .procname = "disengage_timelimit",
121	 .data = &xpc_disengage_timelimit,
122	 .maxlen = sizeof(int),
123	 .mode = 0644,
124	 .proc_handler = proc_dointvec_minmax,
125	 .extra1 = &xpc_disengage_min_timelimit,
126	 .extra2 = &xpc_disengage_max_timelimit},
127	{}
128};
129static struct ctl_table xpc_sys_dir[] = {
130	{
131	 .procname = "xpc",
132	 .mode = 0555,
133	 .child = xpc_sys_xpc_dir},
134	{}
135};
136static struct ctl_table_header *xpc_sysctl;
137
138/* non-zero if any remote partition disengage was timed out */
139int xpc_disengage_timedout;
140
141/* #of activate IRQs received and not yet processed */
142int xpc_activate_IRQ_rcvd;
143DEFINE_SPINLOCK(xpc_activate_IRQ_rcvd_lock);
144
145/* IRQ handler notifies this wait queue on receipt of an IRQ */
146DECLARE_WAIT_QUEUE_HEAD(xpc_activate_IRQ_wq);
147
148static unsigned long xpc_hb_check_timeout;
149static struct timer_list xpc_hb_timer;
150
151/* notification that the xpc_hb_checker thread has exited */
152static DECLARE_COMPLETION(xpc_hb_checker_exited);
153
154/* notification that the xpc_discovery thread has exited */
155static DECLARE_COMPLETION(xpc_discovery_exited);
156
157static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *);
158
159static int xpc_system_reboot(struct notifier_block *, unsigned long, void *);
160static struct notifier_block xpc_reboot_notifier = {
161	.notifier_call = xpc_system_reboot,
162};
163
164static int xpc_system_die(struct notifier_block *, unsigned long, void *);
165static struct notifier_block xpc_die_notifier = {
166	.notifier_call = xpc_system_die,
167};
168
169struct xpc_arch_operations xpc_arch_ops;
170
171/*
172 * Timer function to enforce the timelimit on the partition disengage.
173 */
174static void
175xpc_timeout_partition_disengage(unsigned long data)
176{
177	struct xpc_partition *part = (struct xpc_partition *)data;
178
179	DBUG_ON(time_is_after_jiffies(part->disengage_timeout));
180
181	(void)xpc_partition_disengaged(part);
182
183	DBUG_ON(part->disengage_timeout != 0);
184	DBUG_ON(xpc_arch_ops.partition_engaged(XPC_PARTID(part)));
185}
186
187/*
188 * Timer to produce the heartbeat.  The timer structures function is
189 * already set when this is initially called.  A tunable is used to
190 * specify when the next timeout should occur.
191 */
192static void
193xpc_hb_beater(unsigned long dummy)
194{
195	xpc_arch_ops.increment_heartbeat();
196
197	if (time_is_before_eq_jiffies(xpc_hb_check_timeout))
198		wake_up_interruptible(&xpc_activate_IRQ_wq);
199
200	xpc_hb_timer.expires = jiffies + (xpc_hb_interval * HZ);
201	add_timer(&xpc_hb_timer);
202}
203
204static void
205xpc_start_hb_beater(void)
206{
207	xpc_arch_ops.heartbeat_init();
208	init_timer(&xpc_hb_timer);
209	xpc_hb_timer.function = xpc_hb_beater;
210	xpc_hb_beater(0);
211}
212
213static void
214xpc_stop_hb_beater(void)
215{
216	del_timer_sync(&xpc_hb_timer);
217	xpc_arch_ops.heartbeat_exit();
218}
219
220/*
221 * At periodic intervals, scan through all active partitions and ensure
222 * their heartbeat is still active.  If not, the partition is deactivated.
223 */
224static void
225xpc_check_remote_hb(void)
226{
227	struct xpc_partition *part;
228	short partid;
229	enum xp_retval ret;
230
231	for (partid = 0; partid < xp_max_npartitions; partid++) {
232
233		if (xpc_exiting)
234			break;
235
236		if (partid == xp_partition_id)
237			continue;
238
239		part = &xpc_partitions[partid];
240
241		if (part->act_state == XPC_P_AS_INACTIVE ||
242		    part->act_state == XPC_P_AS_DEACTIVATING) {
243			continue;
244		}
245
246		ret = xpc_arch_ops.get_remote_heartbeat(part);
247		if (ret != xpSuccess)
248			XPC_DEACTIVATE_PARTITION(part, ret);
249	}
250}
251
252/*
253 * This thread is responsible for nearly all of the partition
254 * activation/deactivation.
255 */
256static int
257xpc_hb_checker(void *ignore)
258{
259	int force_IRQ = 0;
260
261	/* this thread was marked active by xpc_hb_init() */
262
263	set_cpus_allowed_ptr(current, cpumask_of(XPC_HB_CHECK_CPU));
264
265	/* set our heartbeating to other partitions into motion */
266	xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);
267	xpc_start_hb_beater();
268
269	while (!xpc_exiting) {
270
271		dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have "
272			"been received\n",
273			(int)(xpc_hb_check_timeout - jiffies),
274			xpc_activate_IRQ_rcvd);
275
276		/* checking of remote heartbeats is skewed by IRQ handling */
277		if (time_is_before_eq_jiffies(xpc_hb_check_timeout)) {
278			xpc_hb_check_timeout = jiffies +
279			    (xpc_hb_check_interval * HZ);
280
281			dev_dbg(xpc_part, "checking remote heartbeats\n");
282			xpc_check_remote_hb();
283
284			/*
285			 * On sn2 we need to periodically recheck to ensure no
286			 * IRQ/amo pairs have been missed.
287			 */
288			if (is_shub())
289				force_IRQ = 1;
290		}
291
292		/* check for outstanding IRQs */
293		if (xpc_activate_IRQ_rcvd > 0 || force_IRQ != 0) {
294			force_IRQ = 0;
295			dev_dbg(xpc_part, "processing activate IRQs "
296				"received\n");
297			xpc_arch_ops.process_activate_IRQ_rcvd();
298		}
299
300		/* wait for IRQ or timeout */
301		(void)wait_event_interruptible(xpc_activate_IRQ_wq,
302					       (time_is_before_eq_jiffies(
303						xpc_hb_check_timeout) ||
304						xpc_activate_IRQ_rcvd > 0 ||
305						xpc_exiting));
306	}
307
308	xpc_stop_hb_beater();
309
310	dev_dbg(xpc_part, "heartbeat checker is exiting\n");
311
312	/* mark this thread as having exited */
313	complete(&xpc_hb_checker_exited);
314	return 0;
315}
316
317/*
318 * This thread will attempt to discover other partitions to activate
319 * based on info provided by SAL. This new thread is short lived and
320 * will exit once discovery is complete.
321 */
322static int
323xpc_initiate_discovery(void *ignore)
324{
325	xpc_discovery();
326
327	dev_dbg(xpc_part, "discovery thread is exiting\n");
328
329	/* mark this thread as having exited */
330	complete(&xpc_discovery_exited);
331	return 0;
332}
333
334/*
335 * The first kthread assigned to a newly activated partition is the one
336 * created by XPC HB with which it calls xpc_activating(). XPC hangs on to
337 * that kthread until the partition is brought down, at which time that kthread
338 * returns back to XPC HB. (The return of that kthread will signify to XPC HB
339 * that XPC has dismantled all communication infrastructure for the associated
340 * partition.) This kthread becomes the channel manager for that partition.
341 *
342 * Each active partition has a channel manager, who, besides connecting and
343 * disconnecting channels, will ensure that each of the partition's connected
344 * channels has the required number of assigned kthreads to get the work done.
345 */
346static void
347xpc_channel_mgr(struct xpc_partition *part)
348{
349	while (part->act_state != XPC_P_AS_DEACTIVATING ||
350	       atomic_read(&part->nchannels_active) > 0 ||
351	       !xpc_partition_disengaged(part)) {
352
353		xpc_process_sent_chctl_flags(part);
354
355		/*
356		 * Wait until we've been requested to activate kthreads or
357		 * all of the channel's message queues have been torn down or
358		 * a signal is pending.
359		 *
360		 * The channel_mgr_requests is set to 1 after being awakened,
361		 * This is done to prevent the channel mgr from making one pass
362		 * through the loop for each request, since he will
363		 * be servicing all the requests in one pass. The reason it's
364		 * set to 1 instead of 0 is so that other kthreads will know
365		 * that the channel mgr is running and won't bother trying to
366		 * wake him up.
367		 */
368		atomic_dec(&part->channel_mgr_requests);
369		(void)wait_event_interruptible(part->channel_mgr_wq,
370				(atomic_read(&part->channel_mgr_requests) > 0 ||
371				 part->chctl.all_flags != 0 ||
372				 (part->act_state == XPC_P_AS_DEACTIVATING &&
373				 atomic_read(&part->nchannels_active) == 0 &&
374				 xpc_partition_disengaged(part))));
375		atomic_set(&part->channel_mgr_requests, 1);
376	}
377}
378
379/*
380 * Guarantee that the kzalloc'd memory is cacheline aligned.
381 */
382void *
383xpc_kzalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
384{
385	/* see if kzalloc will give us cachline aligned memory by default */
386	*base = kzalloc(size, flags);
387	if (*base == NULL)
388		return NULL;
389
390	if ((u64)*base == L1_CACHE_ALIGN((u64)*base))
391		return *base;
392
393	kfree(*base);
394
395	/* nope, we'll have to do it ourselves */
396	*base = kzalloc(size + L1_CACHE_BYTES, flags);
397	if (*base == NULL)
398		return NULL;
399
400	return (void *)L1_CACHE_ALIGN((u64)*base);
401}
402
403/*
404 * Setup the channel structures necessary to support XPartition Communication
405 * between the specified remote partition and the local one.
406 */
407static enum xp_retval
408xpc_setup_ch_structures(struct xpc_partition *part)
409{
410	enum xp_retval ret;
411	int ch_number;
412	struct xpc_channel *ch;
413	short partid = XPC_PARTID(part);
414
415	/*
416	 * Allocate all of the channel structures as a contiguous chunk of
417	 * memory.
418	 */
419	DBUG_ON(part->channels != NULL);
420	part->channels = kzalloc(sizeof(struct xpc_channel) * XPC_MAX_NCHANNELS,
421				 GFP_KERNEL);
422	if (part->channels == NULL) {
423		dev_err(xpc_chan, "can't get memory for channels\n");
424		return xpNoMemory;
425	}
426
427	/* allocate the remote open and close args */
428
429	part->remote_openclose_args =
430	    xpc_kzalloc_cacheline_aligned(XPC_OPENCLOSE_ARGS_SIZE,
431					  GFP_KERNEL, &part->
432					  remote_openclose_args_base);
433	if (part->remote_openclose_args == NULL) {
434		dev_err(xpc_chan, "can't get memory for remote connect args\n");
435		ret = xpNoMemory;
436		goto out_1;
437	}
438
439	part->chctl.all_flags = 0;
440	spin_lock_init(&part->chctl_lock);
441
442	atomic_set(&part->channel_mgr_requests, 1);
443	init_waitqueue_head(&part->channel_mgr_wq);
444
445	part->nchannels = XPC_MAX_NCHANNELS;
446
447	atomic_set(&part->nchannels_active, 0);
448	atomic_set(&part->nchannels_engaged, 0);
449
450	for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
451		ch = &part->channels[ch_number];
452
453		ch->partid = partid;
454		ch->number = ch_number;
455		ch->flags = XPC_C_DISCONNECTED;
456
457		atomic_set(&ch->kthreads_assigned, 0);
458		atomic_set(&ch->kthreads_idle, 0);
459		atomic_set(&ch->kthreads_active, 0);
460
461		atomic_set(&ch->references, 0);
462		atomic_set(&ch->n_to_notify, 0);
463
464		spin_lock_init(&ch->lock);
465		init_completion(&ch->wdisconnect_wait);
466
467		atomic_set(&ch->n_on_msg_allocate_wq, 0);
468		init_waitqueue_head(&ch->msg_allocate_wq);
469		init_waitqueue_head(&ch->idle_wq);
470	}
471
472	ret = xpc_arch_ops.setup_ch_structures(part);
473	if (ret != xpSuccess)
474		goto out_2;
475
476	/*
477	 * With the setting of the partition setup_state to XPC_P_SS_SETUP,
478	 * we're declaring that this partition is ready to go.
479	 */
480	part->setup_state = XPC_P_SS_SETUP;
481
482	return xpSuccess;
483
484	/* setup of ch structures failed */
485out_2:
486	kfree(part->remote_openclose_args_base);
487	part->remote_openclose_args = NULL;
488out_1:
489	kfree(part->channels);
490	part->channels = NULL;
491	return ret;
492}
493
494/*
495 * Teardown the channel structures necessary to support XPartition Communication
496 * between the specified remote partition and the local one.
497 */
498static void
499xpc_teardown_ch_structures(struct xpc_partition *part)
500{
501	DBUG_ON(atomic_read(&part->nchannels_engaged) != 0);
502	DBUG_ON(atomic_read(&part->nchannels_active) != 0);
503
504	/*
505	 * Make this partition inaccessible to local processes by marking it
506	 * as no longer setup. Then wait before proceeding with the teardown
507	 * until all existing references cease.
508	 */
509	DBUG_ON(part->setup_state != XPC_P_SS_SETUP);
510	part->setup_state = XPC_P_SS_WTEARDOWN;
511
512	wait_event(part->teardown_wq, (atomic_read(&part->references) == 0));
513
514	/* now we can begin tearing down the infrastructure */
515
516	xpc_arch_ops.teardown_ch_structures(part);
517
518	kfree(part->remote_openclose_args_base);
519	part->remote_openclose_args = NULL;
520	kfree(part->channels);
521	part->channels = NULL;
522
523	part->setup_state = XPC_P_SS_TORNDOWN;
524}
525
526/*
527 * When XPC HB determines that a partition has come up, it will create a new
528 * kthread and that kthread will call this function to attempt to set up the
529 * basic infrastructure used for Cross Partition Communication with the newly
530 * upped partition.
531 *
532 * The kthread that was created by XPC HB and which setup the XPC
533 * infrastructure will remain assigned to the partition becoming the channel
534 * manager for that partition until the partition is deactivating, at which
535 * time the kthread will teardown the XPC infrastructure and then exit.
536 */
537static int
538xpc_activating(void *__partid)
539{
540	short partid = (u64)__partid;
541	struct xpc_partition *part = &xpc_partitions[partid];
542	unsigned long irq_flags;
543
544	DBUG_ON(partid < 0 || partid >= xp_max_npartitions);
545
546	spin_lock_irqsave(&part->act_lock, irq_flags);
547
548	if (part->act_state == XPC_P_AS_DEACTIVATING) {
549		part->act_state = XPC_P_AS_INACTIVE;
550		spin_unlock_irqrestore(&part->act_lock, irq_flags);
551		part->remote_rp_pa = 0;
552		return 0;
553	}
554
555	/* indicate the thread is activating */
556	DBUG_ON(part->act_state != XPC_P_AS_ACTIVATION_REQ);
557	part->act_state = XPC_P_AS_ACTIVATING;
558
559	XPC_SET_REASON(part, 0, 0);
560	spin_unlock_irqrestore(&part->act_lock, irq_flags);
561
562	dev_dbg(xpc_part, "activating partition %d\n", partid);
563
564	xpc_arch_ops.allow_hb(partid);
565
566	if (xpc_setup_ch_structures(part) == xpSuccess) {
567		(void)xpc_part_ref(part);	/* this will always succeed */
568
569		if (xpc_arch_ops.make_first_contact(part) == xpSuccess) {
570			xpc_mark_partition_active(part);
571			xpc_channel_mgr(part);
572			/* won't return until partition is deactivating */
573		}
574
575		xpc_part_deref(part);
576		xpc_teardown_ch_structures(part);
577	}
578
579	xpc_arch_ops.disallow_hb(partid);
580	xpc_mark_partition_inactive(part);
581
582	if (part->reason == xpReactivating) {
583		/* interrupting ourselves results in activating partition */
584		xpc_arch_ops.request_partition_reactivation(part);
585	}
586
587	return 0;
588}
589
590void
591xpc_activate_partition(struct xpc_partition *part)
592{
593	short partid = XPC_PARTID(part);
594	unsigned long irq_flags;
595	struct task_struct *kthread;
596
597	spin_lock_irqsave(&part->act_lock, irq_flags);
598
599	DBUG_ON(part->act_state != XPC_P_AS_INACTIVE);
600
601	part->act_state = XPC_P_AS_ACTIVATION_REQ;
602	XPC_SET_REASON(part, xpCloneKThread, __LINE__);
603
604	spin_unlock_irqrestore(&part->act_lock, irq_flags);
605
606	kthread = kthread_run(xpc_activating, (void *)((u64)partid), "xpc%02d",
607			      partid);
608	if (IS_ERR(kthread)) {
609		spin_lock_irqsave(&part->act_lock, irq_flags);
610		part->act_state = XPC_P_AS_INACTIVE;
611		XPC_SET_REASON(part, xpCloneKThreadFailed, __LINE__);
612		spin_unlock_irqrestore(&part->act_lock, irq_flags);
613	}
614}
615
616void
617xpc_activate_kthreads(struct xpc_channel *ch, int needed)
618{
619	int idle = atomic_read(&ch->kthreads_idle);
620	int assigned = atomic_read(&ch->kthreads_assigned);
621	int wakeup;
622
623	DBUG_ON(needed <= 0);
624
625	if (idle > 0) {
626		wakeup = (needed > idle) ? idle : needed;
627		needed -= wakeup;
628
629		dev_dbg(xpc_chan, "wakeup %d idle kthreads, partid=%d, "
630			"channel=%d\n", wakeup, ch->partid, ch->number);
631
632		/* only wakeup the requested number of kthreads */
633		wake_up_nr(&ch->idle_wq, wakeup);
634	}
635
636	if (needed <= 0)
637		return;
638
639	if (needed + assigned > ch->kthreads_assigned_limit) {
640		needed = ch->kthreads_assigned_limit - assigned;
641		if (needed <= 0)
642			return;
643	}
644
645	dev_dbg(xpc_chan, "create %d new kthreads, partid=%d, channel=%d\n",
646		needed, ch->partid, ch->number);
647
648	xpc_create_kthreads(ch, needed, 0);
649}
650
651/*
652 * This function is where XPC's kthreads wait for messages to deliver.
653 */
654static void
655xpc_kthread_waitmsgs(struct xpc_partition *part, struct xpc_channel *ch)
656{
657	int (*n_of_deliverable_payloads) (struct xpc_channel *) =
658		xpc_arch_ops.n_of_deliverable_payloads;
659
660	do {
661		/* deliver messages to their intended recipients */
662
663		while (n_of_deliverable_payloads(ch) > 0 &&
664		       !(ch->flags & XPC_C_DISCONNECTING)) {
665			xpc_deliver_payload(ch);
666		}
667
668		if (atomic_inc_return(&ch->kthreads_idle) >
669		    ch->kthreads_idle_limit) {
670			/* too many idle kthreads on this channel */
671			atomic_dec(&ch->kthreads_idle);
672			break;
673		}
674
675		dev_dbg(xpc_chan, "idle kthread calling "
676			"wait_event_interruptible_exclusive()\n");
677
678		(void)wait_event_interruptible_exclusive(ch->idle_wq,
679				(n_of_deliverable_payloads(ch) > 0 ||
680				 (ch->flags & XPC_C_DISCONNECTING)));
681
682		atomic_dec(&ch->kthreads_idle);
683
684	} while (!(ch->flags & XPC_C_DISCONNECTING));
685}
686
687static int
688xpc_kthread_start(void *args)
689{
690	short partid = XPC_UNPACK_ARG1(args);
691	u16 ch_number = XPC_UNPACK_ARG2(args);
692	struct xpc_partition *part = &xpc_partitions[partid];
693	struct xpc_channel *ch;
694	int n_needed;
695	unsigned long irq_flags;
696	int (*n_of_deliverable_payloads) (struct xpc_channel *) =
697		xpc_arch_ops.n_of_deliverable_payloads;
698
699	dev_dbg(xpc_chan, "kthread starting, partid=%d, channel=%d\n",
700		partid, ch_number);
701
702	ch = &part->channels[ch_number];
703
704	if (!(ch->flags & XPC_C_DISCONNECTING)) {
705
706		/* let registerer know that connection has been established */
707
708		spin_lock_irqsave(&ch->lock, irq_flags);
709		if (!(ch->flags & XPC_C_CONNECTEDCALLOUT)) {
710			ch->flags |= XPC_C_CONNECTEDCALLOUT;
711			spin_unlock_irqrestore(&ch->lock, irq_flags);
712
713			xpc_connected_callout(ch);
714
715			spin_lock_irqsave(&ch->lock, irq_flags);
716			ch->flags |= XPC_C_CONNECTEDCALLOUT_MADE;
717			spin_unlock_irqrestore(&ch->lock, irq_flags);
718
719			/*
720			 * It is possible that while the callout was being
721			 * made that the remote partition sent some messages.
722			 * If that is the case, we may need to activate
723			 * additional kthreads to help deliver them. We only
724			 * need one less than total #of messages to deliver.
725			 */
726			n_needed = n_of_deliverable_payloads(ch) - 1;
727			if (n_needed > 0 && !(ch->flags & XPC_C_DISCONNECTING))
728				xpc_activate_kthreads(ch, n_needed);
729
730		} else {
731			spin_unlock_irqrestore(&ch->lock, irq_flags);
732		}
733
734		xpc_kthread_waitmsgs(part, ch);
735	}
736
737	/* let registerer know that connection is disconnecting */
738
739	spin_lock_irqsave(&ch->lock, irq_flags);
740	if ((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
741	    !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) {
742		ch->flags |= XPC_C_DISCONNECTINGCALLOUT;
743		spin_unlock_irqrestore(&ch->lock, irq_flags);
744
745		xpc_disconnect_callout(ch, xpDisconnecting);
746
747		spin_lock_irqsave(&ch->lock, irq_flags);
748		ch->flags |= XPC_C_DISCONNECTINGCALLOUT_MADE;
749	}
750	spin_unlock_irqrestore(&ch->lock, irq_flags);
751
752	if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
753	    atomic_dec_return(&part->nchannels_engaged) == 0) {
754		xpc_arch_ops.indicate_partition_disengaged(part);
755	}
756
757	xpc_msgqueue_deref(ch);
758
759	dev_dbg(xpc_chan, "kthread exiting, partid=%d, channel=%d\n",
760		partid, ch_number);
761
762	xpc_part_deref(part);
763	return 0;
764}
765
766/*
767 * For each partition that XPC has established communications with, there is
768 * a minimum of one kernel thread assigned to perform any operation that
769 * may potentially sleep or block (basically the callouts to the asynchronous
770 * functions registered via xpc_connect()).
771 *
772 * Additional kthreads are created and destroyed by XPC as the workload
773 * demands.
774 *
775 * A kthread is assigned to one of the active channels that exists for a given
776 * partition.
777 */
778void
779xpc_create_kthreads(struct xpc_channel *ch, int needed,
780		    int ignore_disconnecting)
781{
782	unsigned long irq_flags;
783	u64 args = XPC_PACK_ARGS(ch->partid, ch->number);
784	struct xpc_partition *part = &xpc_partitions[ch->partid];
785	struct task_struct *kthread;
786	void (*indicate_partition_disengaged) (struct xpc_partition *) =
787		xpc_arch_ops.indicate_partition_disengaged;
788
789	while (needed-- > 0) {
790
791		/*
792		 * The following is done on behalf of the newly created
793		 * kthread. That kthread is responsible for doing the
794		 * counterpart to the following before it exits.
795		 */
796		if (ignore_disconnecting) {
797			if (!atomic_inc_not_zero(&ch->kthreads_assigned)) {
798				/* kthreads assigned had gone to zero */
799				BUG_ON(!(ch->flags &
800					 XPC_C_DISCONNECTINGCALLOUT_MADE));
801				break;
802			}
803
804		} else if (ch->flags & XPC_C_DISCONNECTING) {
805			break;
806
807		} else if (atomic_inc_return(&ch->kthreads_assigned) == 1 &&
808			   atomic_inc_return(&part->nchannels_engaged) == 1) {
809			xpc_arch_ops.indicate_partition_engaged(part);
810		}
811		(void)xpc_part_ref(part);
812		xpc_msgqueue_ref(ch);
813
814		kthread = kthread_run(xpc_kthread_start, (void *)args,
815				      "xpc%02dc%d", ch->partid, ch->number);
816		if (IS_ERR(kthread)) {
817			/* the fork failed */
818
819			/*
820			 * NOTE: if (ignore_disconnecting &&
821			 * !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) is true,
822			 * then we'll deadlock if all other kthreads assigned
823			 * to this channel are blocked in the channel's
824			 * registerer, because the only thing that will unblock
825			 * them is the xpDisconnecting callout that this
826			 * failed kthread_run() would have made.
827			 */
828
829			if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
830			    atomic_dec_return(&part->nchannels_engaged) == 0) {
831				indicate_partition_disengaged(part);
832			}
833			xpc_msgqueue_deref(ch);
834			xpc_part_deref(part);
835
836			if (atomic_read(&ch->kthreads_assigned) <
837			    ch->kthreads_idle_limit) {
838				/*
839				 * Flag this as an error only if we have an
840				 * insufficient #of kthreads for the channel
841				 * to function.
842				 */
843				spin_lock_irqsave(&ch->lock, irq_flags);
844				XPC_DISCONNECT_CHANNEL(ch, xpLackOfResources,
845						       &irq_flags);
846				spin_unlock_irqrestore(&ch->lock, irq_flags);
847			}
848			break;
849		}
850	}
851}
852
853void
854xpc_disconnect_wait(int ch_number)
855{
856	unsigned long irq_flags;
857	short partid;
858	struct xpc_partition *part;
859	struct xpc_channel *ch;
860	int wakeup_channel_mgr;
861
862	/* now wait for all callouts to the caller's function to cease */
863	for (partid = 0; partid < xp_max_npartitions; partid++) {
864		part = &xpc_partitions[partid];
865
866		if (!xpc_part_ref(part))
867			continue;
868
869		ch = &part->channels[ch_number];
870
871		if (!(ch->flags & XPC_C_WDISCONNECT)) {
872			xpc_part_deref(part);
873			continue;
874		}
875
876		wait_for_completion(&ch->wdisconnect_wait);
877
878		spin_lock_irqsave(&ch->lock, irq_flags);
879		DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
880		wakeup_channel_mgr = 0;
881
882		if (ch->delayed_chctl_flags) {
883			if (part->act_state != XPC_P_AS_DEACTIVATING) {
884				spin_lock(&part->chctl_lock);
885				part->chctl.flags[ch->number] |=
886				    ch->delayed_chctl_flags;
887				spin_unlock(&part->chctl_lock);
888				wakeup_channel_mgr = 1;
889			}
890			ch->delayed_chctl_flags = 0;
891		}
892
893		ch->flags &= ~XPC_C_WDISCONNECT;
894		spin_unlock_irqrestore(&ch->lock, irq_flags);
895
896		if (wakeup_channel_mgr)
897			xpc_wakeup_channel_mgr(part);
898
899		xpc_part_deref(part);
900	}
901}
902
903static int
904xpc_setup_partitions(void)
905{
906	short partid;
907	struct xpc_partition *part;
908
909	xpc_partitions = kzalloc(sizeof(struct xpc_partition) *
910				 xp_max_npartitions, GFP_KERNEL);
911	if (xpc_partitions == NULL) {
912		dev_err(xpc_part, "can't get memory for partition structure\n");
913		return -ENOMEM;
914	}
915
916	/*
917	 * The first few fields of each entry of xpc_partitions[] need to
918	 * be initialized now so that calls to xpc_connect() and
919	 * xpc_disconnect() can be made prior to the activation of any remote
920	 * partition. NOTE THAT NONE OF THE OTHER FIELDS BELONGING TO THESE
921	 * ENTRIES ARE MEANINGFUL UNTIL AFTER AN ENTRY'S CORRESPONDING
922	 * PARTITION HAS BEEN ACTIVATED.
923	 */
924	for (partid = 0; partid < xp_max_npartitions; partid++) {
925		part = &xpc_partitions[partid];
926
927		DBUG_ON((u64)part != L1_CACHE_ALIGN((u64)part));
928
929		part->activate_IRQ_rcvd = 0;
930		spin_lock_init(&part->act_lock);
931		part->act_state = XPC_P_AS_INACTIVE;
932		XPC_SET_REASON(part, 0, 0);
933
934		init_timer(&part->disengage_timer);
935		part->disengage_timer.function =
936		    xpc_timeout_partition_disengage;
937		part->disengage_timer.data = (unsigned long)part;
938
939		part->setup_state = XPC_P_SS_UNSET;
940		init_waitqueue_head(&part->teardown_wq);
941		atomic_set(&part->references, 0);
942	}
943
944	return xpc_arch_ops.setup_partitions();
945}
946
947static void
948xpc_teardown_partitions(void)
949{
950	xpc_arch_ops.teardown_partitions();
951	kfree(xpc_partitions);
952}
953
954static void
955xpc_do_exit(enum xp_retval reason)
956{
957	short partid;
958	int active_part_count, printed_waiting_msg = 0;
959	struct xpc_partition *part;
960	unsigned long printmsg_time, disengage_timeout = 0;
961
962	/* a 'rmmod XPC' and a 'reboot' cannot both end up here together */
963	DBUG_ON(xpc_exiting == 1);
964
965	/*
966	 * Let the heartbeat checker thread and the discovery thread
967	 * (if one is running) know that they should exit. Also wake up
968	 * the heartbeat checker thread in case it's sleeping.
969	 */
970	xpc_exiting = 1;
971	wake_up_interruptible(&xpc_activate_IRQ_wq);
972
973	/* wait for the discovery thread to exit */
974	wait_for_completion(&xpc_discovery_exited);
975
976	/* wait for the heartbeat checker thread to exit */
977	wait_for_completion(&xpc_hb_checker_exited);
978
979	/* sleep for a 1/3 of a second or so */
980	(void)msleep_interruptible(300);
981
982	/* wait for all partitions to become inactive */
983
984	printmsg_time = jiffies + (XPC_DEACTIVATE_PRINTMSG_INTERVAL * HZ);
985	xpc_disengage_timedout = 0;
986
987	do {
988		active_part_count = 0;
989
990		for (partid = 0; partid < xp_max_npartitions; partid++) {
991			part = &xpc_partitions[partid];
992
993			if (xpc_partition_disengaged(part) &&
994			    part->act_state == XPC_P_AS_INACTIVE) {
995				continue;
996			}
997
998			active_part_count++;
999
1000			XPC_DEACTIVATE_PARTITION(part, reason);
1001
1002			if (part->disengage_timeout > disengage_timeout)
1003				disengage_timeout = part->disengage_timeout;
1004		}
1005
1006		if (xpc_arch_ops.any_partition_engaged()) {
1007			if (time_is_before_jiffies(printmsg_time)) {
1008				dev_info(xpc_part, "waiting for remote "
1009					 "partitions to deactivate, timeout in "
1010					 "%ld seconds\n", (disengage_timeout -
1011					 jiffies) / HZ);
1012				printmsg_time = jiffies +
1013				    (XPC_DEACTIVATE_PRINTMSG_INTERVAL * HZ);
1014				printed_waiting_msg = 1;
1015			}
1016
1017		} else if (active_part_count > 0) {
1018			if (printed_waiting_msg) {
1019				dev_info(xpc_part, "waiting for local partition"
1020					 " to deactivate\n");
1021				printed_waiting_msg = 0;
1022			}
1023
1024		} else {
1025			if (!xpc_disengage_timedout) {
1026				dev_info(xpc_part, "all partitions have "
1027					 "deactivated\n");
1028			}
1029			break;
1030		}
1031
1032		/* sleep for a 1/3 of a second or so */
1033		(void)msleep_interruptible(300);
1034
1035	} while (1);
1036
1037	DBUG_ON(xpc_arch_ops.any_partition_engaged());
1038
1039	xpc_teardown_rsvd_page();
1040
1041	if (reason == xpUnloading) {
1042		(void)unregister_die_notifier(&xpc_die_notifier);
1043		(void)unregister_reboot_notifier(&xpc_reboot_notifier);
1044	}
1045
1046	/* clear the interface to XPC's functions */
1047	xpc_clear_interface();
1048
1049	if (xpc_sysctl)
1050		unregister_sysctl_table(xpc_sysctl);
1051
1052	xpc_teardown_partitions();
1053
1054	if (is_shub())
1055		xpc_exit_sn2();
1056	else if (is_uv())
1057		xpc_exit_uv();
1058}
1059
1060/*
1061 * This function is called when the system is being rebooted.
1062 */
1063static int
1064xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused)
1065{
1066	enum xp_retval reason;
1067
1068	switch (event) {
1069	case SYS_RESTART:
1070		reason = xpSystemReboot;
1071		break;
1072	case SYS_HALT:
1073		reason = xpSystemHalt;
1074		break;
1075	case SYS_POWER_OFF:
1076		reason = xpSystemPoweroff;
1077		break;
1078	default:
1079		reason = xpSystemGoingDown;
1080	}
1081
1082	xpc_do_exit(reason);
1083	return NOTIFY_DONE;
1084}
1085
/*
 * Used to only allow one cpu to complete disconnect: xpc_die_deactivate()
 * switches this from 0 to 1 with cmpxchg(), and any caller that finds it
 * already non-zero returns without doing anything.
 */
static unsigned int xpc_die_disconnecting;
1088
1089/*
1090 * Notify other partitions to deactivate from us by first disengaging from all
1091 * references to our memory.
1092 */
1093static void
1094xpc_die_deactivate(void)
1095{
1096	struct xpc_partition *part;
1097	short partid;
1098	int any_engaged;
1099	long keep_waiting;
1100	long wait_to_print;
1101
1102	if (cmpxchg(&xpc_die_disconnecting, 0, 1))
1103		return;
1104
1105	/* keep xpc_hb_checker thread from doing anything (just in case) */
1106	xpc_exiting = 1;
1107
1108	xpc_arch_ops.disallow_all_hbs();   /*indicate we're deactivated */
1109
1110	for (partid = 0; partid < xp_max_npartitions; partid++) {
1111		part = &xpc_partitions[partid];
1112
1113		if (xpc_arch_ops.partition_engaged(partid) ||
1114		    part->act_state != XPC_P_AS_INACTIVE) {
1115			xpc_arch_ops.request_partition_deactivation(part);
1116			xpc_arch_ops.indicate_partition_disengaged(part);
1117		}
1118	}
1119
1120	/*
1121	 * Though we requested that all other partitions deactivate from us,
1122	 * we only wait until they've all disengaged or we've reached the
1123	 * defined timelimit.
1124	 *
1125	 * Given that one iteration through the following while-loop takes
1126	 * approximately 200 microseconds, calculate the #of loops to take
1127	 * before bailing and the #of loops before printing a waiting message.
1128	 */
1129	keep_waiting = xpc_disengage_timelimit * 1000 * 5;
1130	wait_to_print = XPC_DEACTIVATE_PRINTMSG_INTERVAL * 1000 * 5;
1131
1132	while (1) {
1133		any_engaged = xpc_arch_ops.any_partition_engaged();
1134		if (!any_engaged) {
1135			dev_info(xpc_part, "all partitions have deactivated\n");
1136			break;
1137		}
1138
1139		if (!keep_waiting--) {
1140			for (partid = 0; partid < xp_max_npartitions;
1141			     partid++) {
1142				if (xpc_arch_ops.partition_engaged(partid)) {
1143					dev_info(xpc_part, "deactivate from "
1144						 "remote partition %d timed "
1145						 "out\n", partid);
1146				}
1147			}
1148			break;
1149		}
1150
1151		if (!wait_to_print--) {
1152			dev_info(xpc_part, "waiting for remote partitions to "
1153				 "deactivate, timeout in %ld seconds\n",
1154				 keep_waiting / (1000 * 5));
1155			wait_to_print = XPC_DEACTIVATE_PRINTMSG_INTERVAL *
1156			    1000 * 5;
1157		}
1158
1159		udelay(200);
1160	}
1161}
1162
1163/*
1164 * This function is called when the system is being restarted or halted due
1165 * to some sort of system failure. If this is the case we need to notify the
1166 * other partitions to disengage from all references to our memory.
1167 * This function can also be called when our heartbeater could be offlined
1168 * for a time. In this case we need to notify other partitions to not worry
1169 * about the lack of a heartbeat.
1170 */
1171static int
1172xpc_system_die(struct notifier_block *nb, unsigned long event, void *_die_args)
1173{
1174#ifdef CONFIG_IA64		/* !!! temporary kludge */
1175	switch (event) {
1176	case DIE_MACHINE_RESTART:
1177	case DIE_MACHINE_HALT:
1178		xpc_die_deactivate();
1179		break;
1180
1181	case DIE_KDEBUG_ENTER:
1182		/* Should lack of heartbeat be ignored by other partitions? */
1183		if (!xpc_kdebug_ignore)
1184			break;
1185
1186		/* fall through */
1187	case DIE_MCA_MONARCH_ENTER:
1188	case DIE_INIT_MONARCH_ENTER:
1189		xpc_arch_ops.offline_heartbeat();
1190		break;
1191
1192	case DIE_KDEBUG_LEAVE:
1193		/* Is lack of heartbeat being ignored by other partitions? */
1194		if (!xpc_kdebug_ignore)
1195			break;
1196
1197		/* fall through */
1198	case DIE_MCA_MONARCH_LEAVE:
1199	case DIE_INIT_MONARCH_LEAVE:
1200		xpc_arch_ops.online_heartbeat();
1201		break;
1202	}
1203#else
1204	struct die_args *die_args = _die_args;
1205
1206	switch (event) {
1207	case DIE_TRAP:
1208		if (die_args->trapnr == X86_TRAP_DF)
1209			xpc_die_deactivate();
1210
1211		if (((die_args->trapnr == X86_TRAP_MF) ||
1212		     (die_args->trapnr == X86_TRAP_XF)) &&
1213		    !user_mode(die_args->regs))
1214			xpc_die_deactivate();
1215
1216		break;
1217	case DIE_INT3:
1218	case DIE_DEBUG:
1219		break;
1220	case DIE_OOPS:
1221	case DIE_GPF:
1222	default:
1223		xpc_die_deactivate();
1224	}
1225#endif
1226
1227	return NOTIFY_DONE;
1228}
1229
1230int __init
1231xpc_init(void)
1232{
1233	int ret;
1234	struct task_struct *kthread;
1235
1236	dev_set_name(xpc_part, "part");
1237	dev_set_name(xpc_chan, "chan");
1238
1239	if (is_shub()) {
1240		/*
1241		 * The ia64-sn2 architecture supports at most 64 partitions.
1242		 * And the inability to unregister remote amos restricts us
1243		 * further to only support exactly 64 partitions on this
1244		 * architecture, no less.
1245		 */
1246		if (xp_max_npartitions != 64) {
1247			dev_err(xpc_part, "max #of partitions not set to 64\n");
1248			ret = -EINVAL;
1249		} else {
1250			ret = xpc_init_sn2();
1251		}
1252
1253	} else if (is_uv()) {
1254		ret = xpc_init_uv();
1255
1256	} else {
1257		ret = -ENODEV;
1258	}
1259
1260	if (ret != 0)
1261		return ret;
1262
1263	ret = xpc_setup_partitions();
1264	if (ret != 0) {
1265		dev_err(xpc_part, "can't get memory for partition structure\n");
1266		goto out_1;
1267	}
1268
1269	xpc_sysctl = register_sysctl_table(xpc_sys_dir);
1270
1271	/*
1272	 * Fill the partition reserved page with the information needed by
1273	 * other partitions to discover we are alive and establish initial
1274	 * communications.
1275	 */
1276	ret = xpc_setup_rsvd_page();
1277	if (ret != 0) {
1278		dev_err(xpc_part, "can't setup our reserved page\n");
1279		goto out_2;
1280	}
1281
1282	/* add ourselves to the reboot_notifier_list */
1283	ret = register_reboot_notifier(&xpc_reboot_notifier);
1284	if (ret != 0)
1285		dev_warn(xpc_part, "can't register reboot notifier\n");
1286
1287	/* add ourselves to the die_notifier list */
1288	ret = register_die_notifier(&xpc_die_notifier);
1289	if (ret != 0)
1290		dev_warn(xpc_part, "can't register die notifier\n");
1291
1292	/*
1293	 * The real work-horse behind xpc.  This processes incoming
1294	 * interrupts and monitors remote heartbeats.
1295	 */
1296	kthread = kthread_run(xpc_hb_checker, NULL, XPC_HB_CHECK_THREAD_NAME);
1297	if (IS_ERR(kthread)) {
1298		dev_err(xpc_part, "failed while forking hb check thread\n");
1299		ret = -EBUSY;
1300		goto out_3;
1301	}
1302
1303	/*
1304	 * Startup a thread that will attempt to discover other partitions to
1305	 * activate based on info provided by SAL. This new thread is short
1306	 * lived and will exit once discovery is complete.
1307	 */
1308	kthread = kthread_run(xpc_initiate_discovery, NULL,
1309			      XPC_DISCOVERY_THREAD_NAME);
1310	if (IS_ERR(kthread)) {
1311		dev_err(xpc_part, "failed while forking discovery thread\n");
1312
1313		/* mark this new thread as a non-starter */
1314		complete(&xpc_discovery_exited);
1315
1316		xpc_do_exit(xpUnloading);
1317		return -EBUSY;
1318	}
1319
1320	/* set the interface to point at XPC's functions */
1321	xpc_set_interface(xpc_initiate_connect, xpc_initiate_disconnect,
1322			  xpc_initiate_send, xpc_initiate_send_notify,
1323			  xpc_initiate_received, xpc_initiate_partid_to_nasids);
1324
1325	return 0;
1326
1327	/* initialization was not successful */
1328out_3:
1329	xpc_teardown_rsvd_page();
1330
1331	(void)unregister_die_notifier(&xpc_die_notifier);
1332	(void)unregister_reboot_notifier(&xpc_reboot_notifier);
1333out_2:
1334	if (xpc_sysctl)
1335		unregister_sysctl_table(xpc_sysctl);
1336
1337	xpc_teardown_partitions();
1338out_1:
1339	if (is_shub())
1340		xpc_exit_sn2();
1341	else if (is_uv())
1342		xpc_exit_uv();
1343	return ret;
1344}
1345
/* register xpc_init() as this module's initialization entry point */
module_init(xpc_init);
1347
/*
 * Module exit routine: shuts XPC down by calling xpc_do_exit() with
 * xpUnloading as the reason.
 */
void __exit
xpc_exit(void)
{
	xpc_do_exit(xpUnloading);
}
1353
1354module_exit(xpc_exit);
1355
1356MODULE_AUTHOR("Silicon Graphics, Inc.");
1357MODULE_DESCRIPTION("Cross Partition Communication (XPC) support");
1358MODULE_LICENSE("GPL");
1359
1360module_param(xpc_hb_interval, int, 0);
1361MODULE_PARM_DESC(xpc_hb_interval, "Number of seconds between "
1362		 "heartbeat increments.");
1363
1364module_param(xpc_hb_check_interval, int, 0);
1365MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between "
1366		 "heartbeat checks.");
1367
1368module_param(xpc_disengage_timelimit, int, 0);
1369MODULE_PARM_DESC(xpc_disengage_timelimit, "Number of seconds to wait "
1370		 "for disengage to complete.");
1371
1372module_param(xpc_kdebug_ignore, int, 0);
1373MODULE_PARM_DESC(xpc_kdebug_ignore, "Should lack of heartbeat be ignored by "
1374		 "other partitions when dropping into kdebug.");
1375