1 /*
2  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
3  *
4  * Copyright (c) 2011, 2012, Intel Corporation.
5  *
6  *   This file is part of Portals
7  *   http://sourceforge.net/projects/sandiaportals/
8  *
9  *   Portals is free software; you can redistribute it and/or
10  *   modify it under the terms of version 2 of the GNU General Public
11  *   License as published by the Free Software Foundation.
12  *
13  *   Portals is distributed in the hope that it will be useful,
14  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
15  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  *   GNU General Public License for more details.
17  *
18  *   You should have received a copy of the GNU General Public License
19  *   along with Portals; if not, write to the Free Software
20  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21  *
22  */
23 
24 #define DEBUG_SUBSYSTEM S_LNET
25 #include "../../include/linux/lnet/lib-lnet.h"
26 
27 #if  defined(LNET_ROUTER)
28 
29 #define LNET_NRB_TINY_MIN	512	/* min value for each CPT */
30 #define LNET_NRB_TINY		(LNET_NRB_TINY_MIN * 4)
31 #define LNET_NRB_SMALL_MIN	4096	/* min value for each CPT */
32 #define LNET_NRB_SMALL		(LNET_NRB_SMALL_MIN * 4)
33 #define LNET_NRB_LARGE_MIN	256	/* min value for each CPT */
34 #define LNET_NRB_LARGE		(LNET_NRB_LARGE_MIN * 4)
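/*
 * Worked example (illustrative, assuming the defaults above and a node
 * split into 4 CPTs): with tiny_router_buffers left unset the pool gets
 * LNET_NRB_TINY = 512 * 4 = 2048 tiny buffers in total, i.e. 512 per CPT,
 * and lnet_nrb_tiny_calculate() never hands out fewer than
 * LNET_NRB_TINY_MIN = 512 per CPT.  The small and large pools scale the
 * same way from their own minima.
 */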
35 
36 static char *forwarding = "";
37 module_param(forwarding, charp, 0444);
38 MODULE_PARM_DESC(forwarding, "Explicitly enable/disable forwarding between networks");
39 
40 static int tiny_router_buffers;
41 module_param(tiny_router_buffers, int, 0444);
42 MODULE_PARM_DESC(tiny_router_buffers, "# of 0 payload messages to buffer in the router");
43 static int small_router_buffers;
44 module_param(small_router_buffers, int, 0444);
45 MODULE_PARM_DESC(small_router_buffers, "# of small (1 page) messages to buffer in the router");
46 static int large_router_buffers;
47 module_param(large_router_buffers, int, 0444);
48 MODULE_PARM_DESC(large_router_buffers, "# of large messages to buffer in the router");
49 static int peer_buffer_credits;
50 module_param(peer_buffer_credits, int, 0444);
51 MODULE_PARM_DESC(peer_buffer_credits, "# router buffer credits per peer");
52 
53 static int auto_down = 1;
54 module_param(auto_down, int, 0444);
55 MODULE_PARM_DESC(auto_down, "Automatically mark peers down on comms error");
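/*
 * Illustrative usage only: these tunables are normally given as module
 * options at load time; the values below are examples, not recommendations:
 *
 *   modprobe lnet forwarding=enabled tiny_router_buffers=2048 \
 *           small_router_buffers=16384 large_router_buffers=1024 \
 *           peer_buffer_credits=64 auto_down=1
 */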
56 
57 int
58 lnet_peer_buffer_credits(lnet_ni_t *ni)
59 {
60 	/* NI option overrides LNet default */
61 	if (ni->ni_peerrtrcredits > 0)
62 		return ni->ni_peerrtrcredits;
63 	if (peer_buffer_credits > 0)
64 		return peer_buffer_credits;
65 
66 	/* As an approximation, allow this peer the same number of router
67 	 * buffers as it is allowed outstanding sends */
68 	return ni->ni_peertxcredits;
69 }
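/* Worked example (illustrative values): with ni_peerrtrcredits == 0 and
 * peer_buffer_credits=32 given as a module option, each peer gets 32 router
 * buffer credits; with both unset, the peer falls back to its send credit
 * limit ni_peertxcredits. */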
70 
71 /* forward ref's */
72 static int lnet_router_checker(void *);
73 #else
74 
75 int
76 lnet_peer_buffer_credits(lnet_ni_t *ni)
77 {
78 	return 0;
79 }
80 
81 #endif
82 
83 static int check_routers_before_use;
84 module_param(check_routers_before_use, int, 0444);
85 MODULE_PARM_DESC(check_routers_before_use, "Assume routers are down and ping them before use");
86 
87 int avoid_asym_router_failure = 1;
88 module_param(avoid_asym_router_failure, int, 0644);
89 MODULE_PARM_DESC(avoid_asym_router_failure, "Avoid asymmetrical router failures (0 to disable)");
90 
91 static int dead_router_check_interval = 60;
92 module_param(dead_router_check_interval, int, 0644);
93 MODULE_PARM_DESC(dead_router_check_interval, "Seconds between dead router health checks (<= 0 to disable)");
94 
95 static int live_router_check_interval = 60;
96 module_param(live_router_check_interval, int, 0644);
97 MODULE_PARM_DESC(live_router_check_interval, "Seconds between live router health checks (<= 0 to disable)");
98 
99 static int router_ping_timeout = 50;
100 module_param(router_ping_timeout, int, 0644);
101 MODULE_PARM_DESC(router_ping_timeout, "Seconds to wait for the reply to a router health query");
102 
103 int
104 lnet_peers_start_down(void)
105 {
106 	return check_routers_before_use;
107 }
108 
109 void
110 lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive,
111 		   unsigned long when)
112 {
113 	if (time_before(when, lp->lp_timestamp)) { /* out of date information */
114 		CDEBUG(D_NET, "Out of date\n");
115 		return;
116 	}
117 
118 	lp->lp_timestamp = when;		/* update timestamp */
119 	lp->lp_ping_deadline = 0;	       /* disable ping timeout */
120 
121 	if (lp->lp_alive_count != 0 &&	  /* got old news */
122 	    (!lp->lp_alive) == (!alive)) {      /* new date for old news */
123 		CDEBUG(D_NET, "Old news\n");
124 		return;
125 	}
126 
127 	/* Flag that notification is outstanding */
128 
129 	lp->lp_alive_count++;
130 	lp->lp_alive = !(!alive);	       /* 1 bit! */
131 	lp->lp_notify = 1;
132 	lp->lp_notifylnd |= notifylnd;
133 	if (lp->lp_alive)
134 		lp->lp_ping_feats = LNET_PING_FEAT_INVAL; /* reset */
135 
136 	CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lp_nid), alive);
137 }
138 
139 static void
140 lnet_ni_notify_locked(lnet_ni_t *ni, lnet_peer_t *lp)
141 {
142 	int	alive;
143 	int	notifylnd;
144 
145 	/* Notify only in 1 thread at any time to ensure ordered notification.
146 	 * NB individual events can be missed; the only guarantee is that you
147 	 * always get the most recent news */
148 
149 	if (lp->lp_notifying || ni == NULL)
150 		return;
151 
152 	lp->lp_notifying = 1;
153 
154 	while (lp->lp_notify) {
155 		alive     = lp->lp_alive;
156 		notifylnd = lp->lp_notifylnd;
157 
158 		lp->lp_notifylnd = 0;
159 		lp->lp_notify    = 0;
160 
161 		if (notifylnd && ni->ni_lnd->lnd_notify != NULL) {
162 			lnet_net_unlock(lp->lp_cpt);
163 
164 			/* A new notification could happen now; I'll handle it
165 			 * when control returns to me */
166 
167 			(ni->ni_lnd->lnd_notify)(ni, lp->lp_nid, alive);
168 
169 			lnet_net_lock(lp->lp_cpt);
170 		}
171 	}
172 
173 	lp->lp_notifying = 0;
174 }
175 
176 
177 static void
178 lnet_rtr_addref_locked(lnet_peer_t *lp)
179 {
180 	LASSERT(lp->lp_refcount > 0);
181 	LASSERT(lp->lp_rtr_refcount >= 0);
182 
183 	/* lnet_net_lock must be exclusively locked */
184 	lp->lp_rtr_refcount++;
185 	if (lp->lp_rtr_refcount == 1) {
186 		struct list_head *pos;
187 
188 		/* a simple insertion sort */
189 		list_for_each_prev(pos, &the_lnet.ln_routers) {
190 			lnet_peer_t *rtr = list_entry(pos, lnet_peer_t,
191 							  lp_rtr_list);
192 
193 			if (rtr->lp_nid < lp->lp_nid)
194 				break;
195 		}
196 
197 		list_add(&lp->lp_rtr_list, pos);
198 		/* addref for the_lnet.ln_routers */
199 		lnet_peer_addref_locked(lp);
200 		the_lnet.ln_routers_version++;
201 	}
202 }
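/*
 * Example (illustrative NIDs): if ln_routers already holds gateways
 * 192.168.0.1@tcp and 192.168.0.5@tcp, a new gateway 192.168.0.3@tcp is
 * linked between them, keeping the list sorted by ascending lp_nid.
 */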
203 
204 static void
205 lnet_rtr_decref_locked(lnet_peer_t *lp)
206 {
207 	LASSERT(lp->lp_refcount > 0);
208 	LASSERT(lp->lp_rtr_refcount > 0);
209 
210 	/* lnet_net_lock must be exclusively locked */
211 	lp->lp_rtr_refcount--;
212 	if (lp->lp_rtr_refcount == 0) {
213 		LASSERT(list_empty(&lp->lp_routes));
214 
215 		if (lp->lp_rcd != NULL) {
216 			list_add(&lp->lp_rcd->rcd_list,
217 				     &the_lnet.ln_rcd_deathrow);
218 			lp->lp_rcd = NULL;
219 		}
220 
221 		list_del(&lp->lp_rtr_list);
222 		/* decref for the_lnet.ln_routers */
223 		lnet_peer_decref_locked(lp);
224 		the_lnet.ln_routers_version++;
225 	}
226 }
227 
228 lnet_remotenet_t *
229 lnet_find_net_locked(__u32 net)
230 {
231 	lnet_remotenet_t	*rnet;
232 	struct list_head		*tmp;
233 	struct list_head		*rn_list;
234 
235 	LASSERT(!the_lnet.ln_shutdown);
236 
237 	rn_list = lnet_net2rnethash(net);
238 	list_for_each(tmp, rn_list) {
239 		rnet = list_entry(tmp, lnet_remotenet_t, lrn_list);
240 
241 		if (rnet->lrn_net == net)
242 			return rnet;
243 	}
244 	return NULL;
245 }
246 
247 static void lnet_shuffle_seed(void)
248 {
249 	static int seeded;
250 	int lnd_type, seed[2];
251 	struct timeval tv;
252 	lnet_ni_t *ni;
253 	struct list_head *tmp;
254 
255 	if (seeded)
256 		return;
257 
258 	cfs_get_random_bytes(seed, sizeof(seed));
259 
260 	/* Nodes with small feet have little entropy
261 	 * the NID for this node gives the most entropy in the low bits */
262 	list_for_each(tmp, &the_lnet.ln_nis) {
263 		ni = list_entry(tmp, lnet_ni_t, ni_list);
264 		lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid));
265 
266 		if (lnd_type != LOLND)
267 			seed[0] ^= (LNET_NIDADDR(ni->ni_nid) | lnd_type);
268 	}
269 
270 	do_gettimeofday(&tv);
271 	cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]);
272 	seeded = 1;
273 }
274 
275 /* NB expects LNET_LOCK held */
276 static void
277 lnet_add_route_to_rnet(lnet_remotenet_t *rnet, lnet_route_t *route)
278 {
279 	unsigned int      len = 0;
280 	unsigned int      offset = 0;
281 	struct list_head       *e;
282 
283 	lnet_shuffle_seed();
284 
285 	list_for_each(e, &rnet->lrn_routes) {
286 		len++;
287 	}
288 
289 	/* len+1 positions to add a new entry, also prevents division by 0 */
290 	offset = cfs_rand() % (len + 1);
291 	list_for_each(e, &rnet->lrn_routes) {
292 		if (offset == 0)
293 			break;
294 		offset--;
295 	}
296 	list_add(&route->lr_list, e);
297 	list_add(&route->lr_gwlist, &route->lr_gateway->lp_routes);
298 
299 	the_lnet.ln_remote_nets_version++;
300 	lnet_rtr_addref_locked(route->lr_gateway);
301 }
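/*
 * Worked example: with len == 3 existing routes, offset is uniform in
 * [0, 3].  offset == 3 walks past every entry and leaves 'e' at the list
 * head, so the new route goes to the front; offset == k < 3 links it
 * right after the (k + 1)-th existing route.  Each of the len + 1
 * positions is therefore equally likely.
 */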
302 
303 int
304 lnet_add_route(__u32 net, unsigned int hops, lnet_nid_t gateway,
305 	       unsigned int priority)
306 {
307 	struct list_head	  *e;
308 	lnet_remotenet_t    *rnet;
309 	lnet_remotenet_t    *rnet2;
310 	lnet_route_t	*route;
311 	lnet_ni_t	   *ni;
312 	int		  add_route;
313 	int		  rc;
314 
315 	CDEBUG(D_NET, "Add route: net %s hops %u priority %u gw %s\n",
316 	       libcfs_net2str(net), hops, priority, libcfs_nid2str(gateway));
317 
318 	if (gateway == LNET_NID_ANY ||
319 	    LNET_NETTYP(LNET_NIDNET(gateway)) == LOLND ||
320 	    net == LNET_NIDNET(LNET_NID_ANY) ||
321 	    LNET_NETTYP(net) == LOLND ||
322 	    LNET_NIDNET(gateway) == net ||
323 	    hops < 1 || hops > 255)
324 		return -EINVAL;
325 
326 	if (lnet_islocalnet(net))	       /* it's a local network */
327 		return 0;		       /* ignore the route entry */
328 
329 	/* Assume net, route, all new */
330 	LIBCFS_ALLOC(route, sizeof(*route));
331 	LIBCFS_ALLOC(rnet, sizeof(*rnet));
332 	if (route == NULL || rnet == NULL) {
333 		CERROR("Out of memory creating route %s %d %s\n",
334 		       libcfs_net2str(net), hops, libcfs_nid2str(gateway));
335 		if (route != NULL)
336 			LIBCFS_FREE(route, sizeof(*route));
337 		if (rnet != NULL)
338 			LIBCFS_FREE(rnet, sizeof(*rnet));
339 		return -ENOMEM;
340 	}
341 
342 	INIT_LIST_HEAD(&rnet->lrn_routes);
343 	rnet->lrn_net = net;
344 	route->lr_hops = hops;
345 	route->lr_net = net;
346 	route->lr_priority = priority;
347 
348 	lnet_net_lock(LNET_LOCK_EX);
349 
350 	rc = lnet_nid2peer_locked(&route->lr_gateway, gateway, LNET_LOCK_EX);
351 	if (rc != 0) {
352 		lnet_net_unlock(LNET_LOCK_EX);
353 
354 		LIBCFS_FREE(route, sizeof(*route));
355 		LIBCFS_FREE(rnet, sizeof(*rnet));
356 
357 		if (rc == -EHOSTUNREACH) /* gateway is not on a local net */
358 			return 0;	/* ignore the route entry */
359 		CERROR("Error %d creating route %s %d %s\n", rc,
360 		       libcfs_net2str(net), hops,
361 		       libcfs_nid2str(gateway));
362 
363 		return rc;
364 	}
365 
366 	LASSERT(!the_lnet.ln_shutdown);
367 
368 	rnet2 = lnet_find_net_locked(net);
369 	if (rnet2 == NULL) {
370 		/* new network */
371 		list_add_tail(&rnet->lrn_list, lnet_net2rnethash(net));
372 		rnet2 = rnet;
373 	}
374 
375 	/* Search for a duplicate route (it's a NOOP if it is) */
376 	add_route = 1;
377 	list_for_each(e, &rnet2->lrn_routes) {
378 		lnet_route_t *route2 = list_entry(e, lnet_route_t, lr_list);
379 
380 		if (route2->lr_gateway == route->lr_gateway) {
381 			add_route = 0;
382 			break;
383 		}
384 
385 		/* our lookups must be true */
386 		LASSERT(route2->lr_gateway->lp_nid != gateway);
387 	}
388 
389 	if (add_route) {
390 		lnet_peer_addref_locked(route->lr_gateway); /* +1 for notify */
391 		lnet_add_route_to_rnet(rnet2, route);
392 
393 		ni = route->lr_gateway->lp_ni;
394 		lnet_net_unlock(LNET_LOCK_EX);
395 
396 		/* XXX Assume alive */
397 		if (ni->ni_lnd->lnd_notify != NULL)
398 			(ni->ni_lnd->lnd_notify)(ni, gateway, 1);
399 
400 		lnet_net_lock(LNET_LOCK_EX);
401 	}
402 
403 	/* -1 for notify or !add_route */
404 	lnet_peer_decref_locked(route->lr_gateway);
405 	lnet_net_unlock(LNET_LOCK_EX);
406 
407 	if (!add_route)
408 		LIBCFS_FREE(route, sizeof(*route));
409 
410 	if (rnet != rnet2)
411 		LIBCFS_FREE(rnet, sizeof(*rnet));
412 
413 	return 0;
414 }
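/*
 * Illustrative usage (NIDs and nets are examples only): routes normally
 * reach this function from the "routes" module option, e.g.
 *
 *   options lnet routes="o2ib 192.168.0.10@tcp"
 *
 * which the config parser turns into roughly
 *
 *   lnet_add_route(LNET_MKNET(O2IBLND, 0), 1,
 *                  libcfs_str2nid("192.168.0.10@tcp"), 0);
 */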
415 
416 int
417 lnet_check_routes(void)
418 {
419 	lnet_remotenet_t	*rnet;
420 	lnet_route_t		*route;
421 	lnet_route_t		*route2;
422 	struct list_head		*e1;
423 	struct list_head		*e2;
424 	int			cpt;
425 	struct list_head		*rn_list;
426 	int			i;
427 
428 	cpt = lnet_net_lock_current();
429 
430 	for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
431 		rn_list = &the_lnet.ln_remote_nets_hash[i];
432 		list_for_each(e1, rn_list) {
433 			rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
434 
435 			route2 = NULL;
436 			list_for_each(e2, &rnet->lrn_routes) {
437 				lnet_nid_t	nid1;
438 				lnet_nid_t	nid2;
439 				int		net;
440 
441 				route = list_entry(e2, lnet_route_t,
442 						       lr_list);
443 
444 				if (route2 == NULL) {
445 					route2 = route;
446 					continue;
447 				}
448 
449 				if (route->lr_gateway->lp_ni ==
450 				    route2->lr_gateway->lp_ni)
451 					continue;
452 
453 				nid1 = route->lr_gateway->lp_nid;
454 				nid2 = route2->lr_gateway->lp_nid;
455 				net = rnet->lrn_net;
456 
457 				lnet_net_unlock(cpt);
458 
459 				CERROR("Routes to %s via %s and %s not supported\n",
460 				       libcfs_net2str(net),
461 				       libcfs_nid2str(nid1),
462 				       libcfs_nid2str(nid2));
463 				return -EINVAL;
464 			}
465 		}
466 	}
467 
468 	lnet_net_unlock(cpt);
469 	return 0;
470 }
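/*
 * Example of the configuration rejected above (illustrative nets): two
 * routes to the same remote net, say o2ib, where one gateway is reached
 * through a tcp0 NI and the other through a tcp1 NI, i.e. the gateways
 * sit on different local interfaces.
 */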
471 
472 int
473 lnet_del_route(__u32 net, lnet_nid_t gw_nid)
474 {
475 	struct lnet_peer	*gateway;
476 	lnet_remotenet_t	*rnet;
477 	lnet_route_t		*route;
478 	struct list_head		*e1;
479 	struct list_head		*e2;
480 	int			rc = -ENOENT;
481 	struct list_head		*rn_list;
482 	int			idx = 0;
483 
484 	CDEBUG(D_NET, "Del route: net %s : gw %s\n",
485 	       libcfs_net2str(net), libcfs_nid2str(gw_nid));
486 
487 	/* NB Caller may specify either all routes via the given gateway
488 	 * or a specific route entry (actual NIDs) */
489 
490 	lnet_net_lock(LNET_LOCK_EX);
491 	if (net == LNET_NIDNET(LNET_NID_ANY))
492 		rn_list = &the_lnet.ln_remote_nets_hash[0];
493 	else
494 		rn_list = lnet_net2rnethash(net);
495 
496  again:
497 	list_for_each(e1, rn_list) {
498 		rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
499 
500 		if (!(net == LNET_NIDNET(LNET_NID_ANY) ||
501 			net == rnet->lrn_net))
502 			continue;
503 
504 		list_for_each(e2, &rnet->lrn_routes) {
505 			route = list_entry(e2, lnet_route_t, lr_list);
506 
507 			gateway = route->lr_gateway;
508 			if (!(gw_nid == LNET_NID_ANY ||
509 			      gw_nid == gateway->lp_nid))
510 				continue;
511 
512 			list_del(&route->lr_list);
513 			list_del(&route->lr_gwlist);
514 			the_lnet.ln_remote_nets_version++;
515 
516 			if (list_empty(&rnet->lrn_routes))
517 				list_del(&rnet->lrn_list);
518 			else
519 				rnet = NULL;
520 
521 			lnet_rtr_decref_locked(gateway);
522 			lnet_peer_decref_locked(gateway);
523 
524 			lnet_net_unlock(LNET_LOCK_EX);
525 
526 			LIBCFS_FREE(route, sizeof(*route));
527 
528 			if (rnet != NULL)
529 				LIBCFS_FREE(rnet, sizeof(*rnet));
530 
531 			rc = 0;
532 			lnet_net_lock(LNET_LOCK_EX);
533 			goto again;
534 		}
535 	}
536 
537 	if (net == LNET_NIDNET(LNET_NID_ANY) &&
538 	    ++idx < LNET_REMOTE_NETS_HASH_SIZE) {
539 		rn_list = &the_lnet.ln_remote_nets_hash[idx];
540 		goto again;
541 	}
542 	lnet_net_unlock(LNET_LOCK_EX);
543 
544 	return rc;
545 }
546 
547 void
548 lnet_destroy_routes(void)
549 {
550 	lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY);
551 }
552 
553 int
554 lnet_get_route(int idx, __u32 *net, __u32 *hops,
555 	       lnet_nid_t *gateway, __u32 *alive, __u32 *priority)
556 {
557 	struct list_head		*e1;
558 	struct list_head		*e2;
559 	lnet_remotenet_t	*rnet;
560 	lnet_route_t		*route;
561 	int			cpt;
562 	int			i;
563 	struct list_head		*rn_list;
564 
565 	cpt = lnet_net_lock_current();
566 
567 	for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
568 		rn_list = &the_lnet.ln_remote_nets_hash[i];
569 		list_for_each(e1, rn_list) {
570 			rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
571 
572 			list_for_each(e2, &rnet->lrn_routes) {
573 				route = list_entry(e2, lnet_route_t,
574 						       lr_list);
575 
576 				if (idx-- == 0) {
577 					*net	  = rnet->lrn_net;
578 					*hops	  = route->lr_hops;
579 					*priority = route->lr_priority;
580 					*gateway  = route->lr_gateway->lp_nid;
581 					*alive	  = route->lr_gateway->lp_alive;
582 					lnet_net_unlock(cpt);
583 					return 0;
584 				}
585 			}
586 		}
587 	}
588 
589 	lnet_net_unlock(cpt);
590 	return -ENOENT;
591 }
592 
593 void
594 lnet_swap_pinginfo(lnet_ping_info_t *info)
595 {
596 	int	       i;
597 	lnet_ni_status_t *stat;
598 
599 	__swab32s(&info->pi_magic);
600 	__swab32s(&info->pi_features);
601 	__swab32s(&info->pi_pid);
602 	__swab32s(&info->pi_nnis);
603 	for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
604 		stat = &info->pi_ni[i];
605 		__swab64s(&stat->ns_nid);
606 		__swab32s(&stat->ns_status);
607 	}
608 }
609 
610 /**
611  * parse router-checker pinginfo, record number of down NIs for remote
612  * networks on that router.
613  */
614 static void
615 lnet_parse_rc_info(lnet_rc_data_t *rcd)
616 {
617 	lnet_ping_info_t	*info = rcd->rcd_pinginfo;
618 	struct lnet_peer	*gw   = rcd->rcd_gateway;
619 	lnet_route_t		*rtr;
620 
621 	if (!gw->lp_alive)
622 		return;
623 
624 	if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC))
625 		lnet_swap_pinginfo(info);
626 
627 	/* NB always racing with network! */
628 	if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
629 		CDEBUG(D_NET, "%s: Unexpected magic %08x\n",
630 		       libcfs_nid2str(gw->lp_nid), info->pi_magic);
631 		gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
632 		return;
633 	}
634 
635 	gw->lp_ping_feats = info->pi_features;
636 	if ((gw->lp_ping_feats & LNET_PING_FEAT_MASK) == 0) {
637 		CDEBUG(D_NET, "%s: Unexpected features 0x%x\n",
638 		       libcfs_nid2str(gw->lp_nid), gw->lp_ping_feats);
639 		return; /* nothing I can understand */
640 	}
641 
642 	if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0)
643 		return; /* can't carry NI status info */
644 
645 	list_for_each_entry(rtr, &gw->lp_routes, lr_gwlist) {
646 		int	ptl_status = LNET_NI_STATUS_INVALID;
647 		int	down = 0;
648 		int	up = 0;
649 		int	i;
650 
651 		for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
652 			lnet_ni_status_t *stat = &info->pi_ni[i];
653 			lnet_nid_t	 nid = stat->ns_nid;
654 
655 			if (nid == LNET_NID_ANY) {
656 				CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n",
657 				       libcfs_nid2str(gw->lp_nid));
658 				gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
659 				return;
660 			}
661 
662 			if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND)
663 				continue;
664 
665 			if (stat->ns_status == LNET_NI_STATUS_DOWN) {
666 				if (LNET_NETTYP(LNET_NIDNET(nid)) != PTLLND)
667 					down++;
668 				else if (ptl_status != LNET_NI_STATUS_UP)
669 					ptl_status = LNET_NI_STATUS_DOWN;
670 				continue;
671 			}
672 
673 			if (stat->ns_status == LNET_NI_STATUS_UP) {
674 				if (LNET_NIDNET(nid) == rtr->lr_net) {
675 					up = 1;
676 					break;
677 				}
678 				/* ptl NIs are considered down only when
679 				 * they're all down */
680 				if (LNET_NETTYP(LNET_NIDNET(nid)) == PTLLND)
681 					ptl_status = LNET_NI_STATUS_UP;
682 				continue;
683 			}
684 
685 			CDEBUG(D_NET, "%s: Unexpected status 0x%x\n",
686 			       libcfs_nid2str(gw->lp_nid), stat->ns_status);
687 			gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
688 			return;
689 		}
690 
691 		if (up) { /* ignore downed NIs if NI for dest network is up */
692 			rtr->lr_downis = 0;
693 			continue;
694 		}
695 		rtr->lr_downis = down + (ptl_status == LNET_NI_STATUS_DOWN);
696 	}
697 }
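/*
 * Worked example (illustrative reply): suppose a gateway's ping reply
 * lists three NIs - one tcp NI up and two o2ib NIs down.  For a route
 * whose destination is the tcp net the up NI is on that net, so
 * lr_downis is reset to 0; for a route to any other net no matching up
 * NI is found and lr_downis becomes 2.
 */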
698 
699 static void
700 lnet_router_checker_event(lnet_event_t *event)
701 {
702 	lnet_rc_data_t		*rcd = event->md.user_ptr;
703 	struct lnet_peer	*lp;
704 
705 	LASSERT(rcd != NULL);
706 
707 	if (event->unlinked) {
708 		LNetInvalidateHandle(&rcd->rcd_mdh);
709 		return;
710 	}
711 
712 	LASSERT(event->type == LNET_EVENT_SEND ||
713 		event->type == LNET_EVENT_REPLY);
714 
715 	lp = rcd->rcd_gateway;
716 	LASSERT(lp != NULL);
717 
718 	 /* NB: this is called with lnet_res_lock held; a few places need
719 	  * to hold both locks at the same time, so take care with lock
720 	  * ordering */
721 	lnet_net_lock(lp->lp_cpt);
722 	if (!lnet_isrouter(lp) || lp->lp_rcd != rcd) {
723 		/* ignore if no longer a router or rcd is replaced */
724 		goto out;
725 	}
726 
727 	if (event->type == LNET_EVENT_SEND) {
728 		lp->lp_ping_notsent = 0;
729 		if (event->status == 0)
730 			goto out;
731 	}
732 
733 	/* LNET_EVENT_REPLY */
734 	/* A successful REPLY means the router is up.  If _any_ comms
735 	 * to the router fail I assume it's down (this will happen if
736 	 * we ping alive routers to try to detect router death before
737 	 * apps get burned). */
738 
739 	lnet_notify_locked(lp, 1, (event->status == 0), cfs_time_current());
740 	/* The router checker will wake up very shortly and do the
741 	 * actual notification.
742 	 * XXX If 'lp' stops being a router before then, it will still
743 	 * have the notification pending!!! */
744 
745 	if (avoid_asym_router_failure && event->status == 0)
746 		lnet_parse_rc_info(rcd);
747 
748  out:
749 	lnet_net_unlock(lp->lp_cpt);
750 }
751 
752 static void
753 lnet_wait_known_routerstate(void)
754 {
755 	lnet_peer_t	 *rtr;
756 	struct list_head	  *entry;
757 	int		  all_known;
758 
759 	LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
760 
761 	for (;;) {
762 		int	cpt = lnet_net_lock_current();
763 
764 		all_known = 1;
765 		list_for_each(entry, &the_lnet.ln_routers) {
766 			rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
767 
768 			if (rtr->lp_alive_count == 0) {
769 				all_known = 0;
770 				break;
771 			}
772 		}
773 
774 		lnet_net_unlock(cpt);
775 
776 		if (all_known)
777 			return;
778 
779 		set_current_state(TASK_UNINTERRUPTIBLE);
780 		schedule_timeout(cfs_time_seconds(1));
781 	}
782 }
783 
784 void
785 lnet_router_ni_update_locked(lnet_peer_t *gw, __u32 net)
786 {
787 	lnet_route_t *rte;
788 
789 	if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0) {
790 		list_for_each_entry(rte, &gw->lp_routes, lr_gwlist) {
791 			if (rte->lr_net == net) {
792 				rte->lr_downis = 0;
793 				break;
794 			}
795 		}
796 	}
797 }
798 
799 static void
800 lnet_update_ni_status_locked(void)
801 {
802 	lnet_ni_t	*ni;
803 	long		now;
804 	int		timeout;
805 
806 	LASSERT(the_lnet.ln_routing);
807 
808 	timeout = router_ping_timeout +
809 		  max(live_router_check_interval, dead_router_check_interval);
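	/* With the default tunables this is 50 + max(60, 60) = 110 seconds:
	 * an NI whose ni_last_alive is older than that gets marked down
	 * below. */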
810 
811 	now = get_seconds();
812 	list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) {
813 		if (ni->ni_lnd->lnd_type == LOLND)
814 			continue;
815 
816 		if (now < ni->ni_last_alive + timeout)
817 			continue;
818 
819 		lnet_ni_lock(ni);
820 		/* re-check with lock */
821 		if (now < ni->ni_last_alive + timeout) {
822 			lnet_ni_unlock(ni);
823 			continue;
824 		}
825 
826 		LASSERT(ni->ni_status != NULL);
827 
828 		if (ni->ni_status->ns_status != LNET_NI_STATUS_DOWN) {
829 			CDEBUG(D_NET, "NI(%s:%d) status changed to down\n",
830 			       libcfs_nid2str(ni->ni_nid), timeout);
831 			/* NB: so far, this is the only place to set
832 			 * NI status to "down" */
833 			ni->ni_status->ns_status = LNET_NI_STATUS_DOWN;
834 		}
835 		lnet_ni_unlock(ni);
836 	}
837 }
838 
839 static void
840 lnet_destroy_rc_data(lnet_rc_data_t *rcd)
841 {
842 	LASSERT(list_empty(&rcd->rcd_list));
843 	/* detached from network */
844 	LASSERT(LNetHandleIsInvalid(rcd->rcd_mdh));
845 
846 	if (rcd->rcd_gateway != NULL) {
847 		int cpt = rcd->rcd_gateway->lp_cpt;
848 
849 		lnet_net_lock(cpt);
850 		lnet_peer_decref_locked(rcd->rcd_gateway);
851 		lnet_net_unlock(cpt);
852 	}
853 
854 	if (rcd->rcd_pinginfo != NULL)
855 		LIBCFS_FREE(rcd->rcd_pinginfo, LNET_PINGINFO_SIZE);
856 
857 	LIBCFS_FREE(rcd, sizeof(*rcd));
858 }
859 
860 static lnet_rc_data_t *
861 lnet_create_rc_data_locked(lnet_peer_t *gateway)
862 {
863 	lnet_rc_data_t		*rcd = NULL;
864 	lnet_ping_info_t	*pi;
865 	int			rc;
866 	int			i;
867 
868 	lnet_net_unlock(gateway->lp_cpt);
869 
870 	LIBCFS_ALLOC(rcd, sizeof(*rcd));
871 	if (rcd == NULL)
872 		goto out;
873 
874 	LNetInvalidateHandle(&rcd->rcd_mdh);
875 	INIT_LIST_HEAD(&rcd->rcd_list);
876 
877 	LIBCFS_ALLOC(pi, LNET_PINGINFO_SIZE);
878 	if (pi == NULL)
879 		goto out;
880 
881 	for (i = 0; i < LNET_MAX_RTR_NIS; i++) {
882 		pi->pi_ni[i].ns_nid = LNET_NID_ANY;
883 		pi->pi_ni[i].ns_status = LNET_NI_STATUS_INVALID;
884 	}
885 	rcd->rcd_pinginfo = pi;
886 
887 	LASSERT(!LNetHandleIsInvalid(the_lnet.ln_rc_eqh));
888 	rc = LNetMDBind((lnet_md_t){.start     = pi,
889 				    .user_ptr  = rcd,
890 				    .length    = LNET_PINGINFO_SIZE,
891 				    .threshold = LNET_MD_THRESH_INF,
892 				    .options   = LNET_MD_TRUNCATE,
893 				    .eq_handle = the_lnet.ln_rc_eqh},
894 			LNET_UNLINK,
895 			&rcd->rcd_mdh);
896 	if (rc < 0) {
897 		CERROR("Can't bind MD: %d\n", rc);
898 		goto out;
899 	}
900 	LASSERT(rc == 0);
901 
902 	lnet_net_lock(gateway->lp_cpt);
903 	/* router table changed or someone has created rcd for this gateway */
904 	if (!lnet_isrouter(gateway) || gateway->lp_rcd != NULL) {
905 		lnet_net_unlock(gateway->lp_cpt);
906 		goto out;
907 	}
908 
909 	lnet_peer_addref_locked(gateway);
910 	rcd->rcd_gateway = gateway;
911 	gateway->lp_rcd = rcd;
912 	gateway->lp_ping_notsent = 0;
913 
914 	return rcd;
915 
916  out:
917 	if (rcd != NULL) {
918 		if (!LNetHandleIsInvalid(rcd->rcd_mdh)) {
919 			rc = LNetMDUnlink(rcd->rcd_mdh);
920 			LASSERT(rc == 0);
921 		}
922 		lnet_destroy_rc_data(rcd);
923 	}
924 
925 	lnet_net_lock(gateway->lp_cpt);
926 	return gateway->lp_rcd;
927 }
928 
929 static int
930 lnet_router_check_interval(lnet_peer_t *rtr)
931 {
932 	int secs;
933 
934 	secs = rtr->lp_alive ? live_router_check_interval :
935 			       dead_router_check_interval;
936 	if (secs < 0)
937 		secs = 0;
938 
939 	return secs;
940 }
941 
942 static void
943 lnet_ping_router_locked(lnet_peer_t *rtr)
944 {
945 	lnet_rc_data_t *rcd = NULL;
946 	unsigned long      now = cfs_time_current();
947 	int	     secs;
948 
949 	lnet_peer_addref_locked(rtr);
950 
951 	if (rtr->lp_ping_deadline != 0 && /* ping timed out? */
952 	    cfs_time_after(now, rtr->lp_ping_deadline))
953 		lnet_notify_locked(rtr, 1, 0, now);
954 
955 	/* Run any outstanding notifications */
956 	lnet_ni_notify_locked(rtr->lp_ni, rtr);
957 
958 	if (!lnet_isrouter(rtr) ||
959 	    the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) {
960 		/* router table changed or router checker is shutting down */
961 		lnet_peer_decref_locked(rtr);
962 		return;
963 	}
964 
965 	rcd = rtr->lp_rcd != NULL ?
966 	      rtr->lp_rcd : lnet_create_rc_data_locked(rtr);
967 
968 	if (rcd == NULL)
969 		return;
970 
971 	secs = lnet_router_check_interval(rtr);
972 
973 	CDEBUG(D_NET,
974 	       "rtr %s %d: deadline %lu ping_notsent %d alive %d alive_count %d lp_ping_timestamp %lu\n",
975 	       libcfs_nid2str(rtr->lp_nid), secs,
976 	       rtr->lp_ping_deadline, rtr->lp_ping_notsent,
977 	       rtr->lp_alive, rtr->lp_alive_count, rtr->lp_ping_timestamp);
978 
979 	if (secs != 0 && !rtr->lp_ping_notsent &&
980 	    cfs_time_after(now, cfs_time_add(rtr->lp_ping_timestamp,
981 					     cfs_time_seconds(secs)))) {
982 		int	       rc;
983 		lnet_process_id_t id;
984 		lnet_handle_md_t  mdh;
985 
986 		id.nid = rtr->lp_nid;
987 		id.pid = LUSTRE_SRV_LNET_PID;
988 		CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id));
989 
990 		rtr->lp_ping_notsent   = 1;
991 		rtr->lp_ping_timestamp = now;
992 
993 		mdh = rcd->rcd_mdh;
994 
995 		if (rtr->lp_ping_deadline == 0) {
996 			rtr->lp_ping_deadline =
997 				cfs_time_shift(router_ping_timeout);
998 		}
999 
1000 		lnet_net_unlock(rtr->lp_cpt);
1001 
1002 		rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL,
1003 			     LNET_PROTO_PING_MATCHBITS, 0);
1004 
1005 		lnet_net_lock(rtr->lp_cpt);
1006 		if (rc != 0)
1007 			rtr->lp_ping_notsent = 0; /* no event pending */
1008 	}
1009 
1010 	lnet_peer_decref_locked(rtr);
1011 }
1012 
1013 int
1014 lnet_router_checker_start(void)
1015 {
1016 	int	  rc;
1017 	int	  eqsz;
1018 
1019 	LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
1020 
1021 	if (check_routers_before_use &&
1022 	    dead_router_check_interval <= 0) {
1023 		LCONSOLE_ERROR_MSG(0x10a, "'dead_router_check_interval' must be set if 'check_routers_before_use' is set\n");
1024 		return -EINVAL;
1025 	}
1026 
1027 	if (!the_lnet.ln_routing &&
1028 	    live_router_check_interval <= 0 &&
1029 	    dead_router_check_interval <= 0)
1030 		return 0;
1031 
1032 	sema_init(&the_lnet.ln_rc_signal, 0);
1033 	/* EQ size doesn't matter; the callback is guaranteed to get every
1034 	 * event */
1035 	eqsz = 0;
1036 	rc = LNetEQAlloc(eqsz, lnet_router_checker_event,
1037 			 &the_lnet.ln_rc_eqh);
1038 	if (rc != 0) {
1039 		CERROR("Can't allocate EQ(%d): %d\n", eqsz, rc);
1040 		return -ENOMEM;
1041 	}
1042 
1043 	the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING;
1044 	rc = PTR_ERR(kthread_run(lnet_router_checker,
1045 				 NULL, "router_checker"));
1046 	if (IS_ERR_VALUE(rc)) {
1047 		CERROR("Can't start router checker thread: %d\n", rc);
1048 		/* block until event callback signals exit */
1049 		down(&the_lnet.ln_rc_signal);
1050 		rc = LNetEQFree(the_lnet.ln_rc_eqh);
1051 		LASSERT(rc == 0);
1052 		the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
1053 		return -ENOMEM;
1054 	}
1055 
1056 	if (check_routers_before_use) {
1057 		/* Note that a helpful side-effect of pinging all known routers
1058 		 * at startup is that it makes them drop stale connections they
1059 		 * may have to a previous instance of me. */
1060 		lnet_wait_known_routerstate();
1061 	}
1062 
1063 	return 0;
1064 }
1065 
1066 void
1067 lnet_router_checker_stop(void)
1068 {
1069 	int rc;
1070 
1071 	if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN)
1072 		return;
1073 
1074 	LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
1075 	the_lnet.ln_rc_state = LNET_RC_STATE_STOPPING;
1076 
1077 	/* block until event callback signals exit */
1078 	down(&the_lnet.ln_rc_signal);
1079 	LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
1080 
1081 	rc = LNetEQFree(the_lnet.ln_rc_eqh);
1082 	LASSERT(rc == 0);
1083 }
1084 
1085 static void
1086 lnet_prune_rc_data(int wait_unlink)
1087 {
1088 	lnet_rc_data_t		*rcd;
1089 	lnet_rc_data_t		*tmp;
1090 	lnet_peer_t		*lp;
1091 	struct list_head		head;
1092 	int			i = 2;
1093 
1094 	if (likely(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING &&
1095 		   list_empty(&the_lnet.ln_rcd_deathrow) &&
1096 		   list_empty(&the_lnet.ln_rcd_zombie)))
1097 		return;
1098 
1099 	INIT_LIST_HEAD(&head);
1100 
1101 	lnet_net_lock(LNET_LOCK_EX);
1102 
1103 	if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) {
1104 		/* router checker is stopping, prune all */
1105 		list_for_each_entry(lp, &the_lnet.ln_routers,
1106 					lp_rtr_list) {
1107 			if (lp->lp_rcd == NULL)
1108 				continue;
1109 
1110 			LASSERT(list_empty(&lp->lp_rcd->rcd_list));
1111 			list_add(&lp->lp_rcd->rcd_list,
1112 				     &the_lnet.ln_rcd_deathrow);
1113 			lp->lp_rcd = NULL;
1114 		}
1115 	}
1116 
1117 	/* unlink all RCDs on deathrow list */
1118 	list_splice_init(&the_lnet.ln_rcd_deathrow, &head);
1119 
1120 	if (!list_empty(&head)) {
1121 		lnet_net_unlock(LNET_LOCK_EX);
1122 
1123 		list_for_each_entry(rcd, &head, rcd_list)
1124 			LNetMDUnlink(rcd->rcd_mdh);
1125 
1126 		lnet_net_lock(LNET_LOCK_EX);
1127 	}
1128 
1129 	list_splice_init(&head, &the_lnet.ln_rcd_zombie);
1130 
1131 	/* release all zombie RCDs */
1132 	while (!list_empty(&the_lnet.ln_rcd_zombie)) {
1133 		list_for_each_entry_safe(rcd, tmp, &the_lnet.ln_rcd_zombie,
1134 					     rcd_list) {
1135 			if (LNetHandleIsInvalid(rcd->rcd_mdh))
1136 				list_move(&rcd->rcd_list, &head);
1137 		}
1138 
1139 		wait_unlink = wait_unlink &&
1140 			      !list_empty(&the_lnet.ln_rcd_zombie);
1141 
1142 		lnet_net_unlock(LNET_LOCK_EX);
1143 
1144 		while (!list_empty(&head)) {
1145 			rcd = list_entry(head.next,
1146 					     lnet_rc_data_t, rcd_list);
1147 			list_del_init(&rcd->rcd_list);
1148 			lnet_destroy_rc_data(rcd);
1149 		}
1150 
1151 		if (!wait_unlink)
1152 			return;
1153 
1154 		i++;
1155 		CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
1156 		       "Waiting for rc buffers to unlink\n");
1157 		set_current_state(TASK_UNINTERRUPTIBLE);
1158 		schedule_timeout(cfs_time_seconds(1) / 4);
1159 
1160 		lnet_net_lock(LNET_LOCK_EX);
1161 	}
1162 
1163 	lnet_net_unlock(LNET_LOCK_EX);
1164 }
1165 
1166 
1167 #if  defined(LNET_ROUTER)
1168 
1169 static int
1170 lnet_router_checker(void *arg)
1171 {
1172 	lnet_peer_t       *rtr;
1173 	struct list_head	*entry;
1174 
1175 	cfs_block_allsigs();
1176 
1177 	LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
1178 
1179 	while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) {
1180 		__u64	version;
1181 		int	cpt;
1182 		int	cpt2;
1183 
1184 		cpt = lnet_net_lock_current();
1185 rescan:
1186 		version = the_lnet.ln_routers_version;
1187 
1188 		list_for_each(entry, &the_lnet.ln_routers) {
1189 			rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
1190 
1191 			cpt2 = lnet_cpt_of_nid_locked(rtr->lp_nid);
1192 			if (cpt != cpt2) {
1193 				lnet_net_unlock(cpt);
1194 				cpt = cpt2;
1195 				lnet_net_lock(cpt);
1196 				/* the routers list has changed */
1197 				if (version != the_lnet.ln_routers_version)
1198 					goto rescan;
1199 			}
1200 
1201 			lnet_ping_router_locked(rtr);
1202 
1203 			/* NB dropped lock */
1204 			if (version != the_lnet.ln_routers_version) {
1205 				/* the routers list has changed */
1206 				goto rescan;
1207 			}
1208 		}
1209 
1210 		if (the_lnet.ln_routing)
1211 			lnet_update_ni_status_locked();
1212 
1213 		lnet_net_unlock(cpt);
1214 
1215 		lnet_prune_rc_data(0); /* don't wait for UNLINK */
1216 
1217 		/* An uninterruptible schedule_timeout() here would always
1218 		 * add 1 to the load average, because the kernel counts
1219 		 * active tasks as nr_running + nr_uninterruptible. */
1220 		set_current_state(TASK_INTERRUPTIBLE);
1221 		schedule_timeout(cfs_time_seconds(1));
1222 	}
1223 
1224 	LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING);
1225 
1226 	lnet_prune_rc_data(1); /* wait for UNLINK */
1227 
1228 	the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
1229 	up(&the_lnet.ln_rc_signal);
1230 	/* The unlink event callback will signal final completion */
1231 	return 0;
1232 }
1233 
1234 static void
1235 lnet_destroy_rtrbuf(lnet_rtrbuf_t *rb, int npages)
1236 {
1237 	int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]);
1238 
1239 	while (--npages >= 0)
1240 		__free_page(rb->rb_kiov[npages].kiov_page);
1241 
1242 	LIBCFS_FREE(rb, sz);
1243 }
1244 
1245 static lnet_rtrbuf_t *
1246 lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp, int cpt)
1247 {
1248 	int	    npages = rbp->rbp_npages;
1249 	int	    sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]);
1250 	struct page   *page;
1251 	lnet_rtrbuf_t *rb;
1252 	int	    i;
1253 
1254 	LIBCFS_CPT_ALLOC(rb, lnet_cpt_table(), cpt, sz);
1255 	if (rb == NULL)
1256 		return NULL;
1257 
1258 	rb->rb_pool = rbp;
1259 
1260 	for (i = 0; i < npages; i++) {
1261 		page = alloc_pages_node(
1262 				cfs_cpt_spread_node(lnet_cpt_table(), cpt),
1263 				__GFP_ZERO | GFP_IOFS, 0);
1264 		if (page == NULL) {
1265 			while (--i >= 0)
1266 				__free_page(rb->rb_kiov[i].kiov_page);
1267 
1268 			LIBCFS_FREE(rb, sz);
1269 			return NULL;
1270 		}
1271 
1272 		rb->rb_kiov[i].kiov_len = PAGE_CACHE_SIZE;
1273 		rb->rb_kiov[i].kiov_offset = 0;
1274 		rb->rb_kiov[i].kiov_page = page;
1275 	}
1276 
1277 	return rb;
1278 }
1279 
1280 static void
1281 lnet_rtrpool_free_bufs(lnet_rtrbufpool_t *rbp)
1282 {
1283 	int		npages = rbp->rbp_npages;
1284 	int		nbuffers = 0;
1285 	lnet_rtrbuf_t	*rb;
1286 
1287 	if (rbp->rbp_nbuffers == 0) /* not initialized or already freed */
1288 		return;
1289 
1290 	LASSERT(list_empty(&rbp->rbp_msgs));
1291 	LASSERT(rbp->rbp_credits == rbp->rbp_nbuffers);
1292 
1293 	while (!list_empty(&rbp->rbp_bufs)) {
1294 		LASSERT(rbp->rbp_credits > 0);
1295 
1296 		rb = list_entry(rbp->rbp_bufs.next,
1297 				    lnet_rtrbuf_t, rb_list);
1298 		list_del(&rb->rb_list);
1299 		lnet_destroy_rtrbuf(rb, npages);
1300 		nbuffers++;
1301 	}
1302 
1303 	LASSERT(rbp->rbp_nbuffers == nbuffers);
1304 	LASSERT(rbp->rbp_credits == nbuffers);
1305 
1306 	rbp->rbp_nbuffers = rbp->rbp_credits = 0;
1307 }
1308 
1309 static int
1310 lnet_rtrpool_alloc_bufs(lnet_rtrbufpool_t *rbp, int nbufs, int cpt)
1311 {
1312 	lnet_rtrbuf_t *rb;
1313 	int	    i;
1314 
1315 	if (rbp->rbp_nbuffers != 0) {
1316 		LASSERT(rbp->rbp_nbuffers == nbufs);
1317 		return 0;
1318 	}
1319 
1320 	for (i = 0; i < nbufs; i++) {
1321 		rb = lnet_new_rtrbuf(rbp, cpt);
1322 
1323 		if (rb == NULL) {
1324 			CERROR("Failed to allocate %d router bufs of %d pages\n",
1325 			       nbufs, rbp->rbp_npages);
1326 			return -ENOMEM;
1327 		}
1328 
1329 		rbp->rbp_nbuffers++;
1330 		rbp->rbp_credits++;
1331 		rbp->rbp_mincredits++;
1332 		list_add(&rb->rb_list, &rbp->rbp_bufs);
1333 
1334 		/* No allocation "under fire" */
1335 		/* Otherwise we'd need code to schedule blocked msgs etc */
1336 		LASSERT(!the_lnet.ln_routing);
1337 	}
1338 
1339 	LASSERT(rbp->rbp_credits == nbufs);
1340 	return 0;
1341 }
1342 
1343 static void
1344 lnet_rtrpool_init(lnet_rtrbufpool_t *rbp, int npages)
1345 {
1346 	INIT_LIST_HEAD(&rbp->rbp_msgs);
1347 	INIT_LIST_HEAD(&rbp->rbp_bufs);
1348 
1349 	rbp->rbp_npages = npages;
1350 	rbp->rbp_credits = 0;
1351 	rbp->rbp_mincredits = 0;
1352 }
1353 
1354 void
1355 lnet_rtrpools_free(void)
1356 {
1357 	lnet_rtrbufpool_t *rtrp;
1358 	int		  i;
1359 
1360 	if (the_lnet.ln_rtrpools == NULL) /* uninitialized or freed */
1361 		return;
1362 
1363 	cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) {
1364 		lnet_rtrpool_free_bufs(&rtrp[0]);
1365 		lnet_rtrpool_free_bufs(&rtrp[1]);
1366 		lnet_rtrpool_free_bufs(&rtrp[2]);
1367 	}
1368 
1369 	cfs_percpt_free(the_lnet.ln_rtrpools);
1370 	the_lnet.ln_rtrpools = NULL;
1371 }
1372 
1373 static int
1374 lnet_nrb_tiny_calculate(int npages)
1375 {
1376 	int	nrbs = LNET_NRB_TINY;
1377 
1378 	if (tiny_router_buffers < 0) {
1379 		LCONSOLE_ERROR_MSG(0x10c,
1380 				   "tiny_router_buffers=%d invalid when routing enabled\n",
1381 				   tiny_router_buffers);
1382 		return -1;
1383 	}
1384 
1385 	if (tiny_router_buffers > 0)
1386 		nrbs = tiny_router_buffers;
1387 
1388 	nrbs /= LNET_CPT_NUMBER;
1389 	return max(nrbs, LNET_NRB_TINY_MIN);
1390 }
1391 
1392 static int
1393 lnet_nrb_small_calculate(int npages)
1394 {
1395 	int	nrbs = LNET_NRB_SMALL;
1396 
1397 	if (small_router_buffers < 0) {
1398 		LCONSOLE_ERROR_MSG(0x10c,
1399 				   "small_router_buffers=%d invalid when routing enabled\n",
1400 				   small_router_buffers);
1401 		return -1;
1402 	}
1403 
1404 	if (small_router_buffers > 0)
1405 		nrbs = small_router_buffers;
1406 
1407 	nrbs /= LNET_CPT_NUMBER;
1408 	return max(nrbs, LNET_NRB_SMALL_MIN);
1409 }
1410 
1411 static int
1412 lnet_nrb_large_calculate(int npages)
1413 {
1414 	int	nrbs = LNET_NRB_LARGE;
1415 
1416 	if (large_router_buffers < 0) {
1417 		LCONSOLE_ERROR_MSG(0x10c,
1418 				   "large_router_buffers=%d invalid when routing enabled\n",
1419 				   large_router_buffers);
1420 		return -1;
1421 	}
1422 
1423 	if (large_router_buffers > 0)
1424 		nrbs = large_router_buffers;
1425 
1426 	nrbs /= LNET_CPT_NUMBER;
1427 	return max(nrbs, LNET_NRB_LARGE_MIN);
1428 }
1429 
1430 int
1431 lnet_rtrpools_alloc(int im_a_router)
1432 {
1433 	lnet_rtrbufpool_t *rtrp;
1434 	int	large_pages;
1435 	int	small_pages = 1;
1436 	int	nrb_tiny;
1437 	int	nrb_small;
1438 	int	nrb_large;
1439 	int	rc;
1440 	int	i;
1441 
1442 	large_pages = (LNET_MTU + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
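	/* e.g. with LNET_MTU of 1MB and 4KB pages this gives 256 pages per
	 * large buffer; small buffers use a single page and tiny buffers
	 * carry no payload pages at all. */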
1443 
1444 	if (!strcmp(forwarding, "")) {
1445 		/* not set either way */
1446 		if (!im_a_router)
1447 			return 0;
1448 	} else if (!strcmp(forwarding, "disabled")) {
1449 		/* explicitly disabled */
1450 		return 0;
1451 	} else if (!strcmp(forwarding, "enabled")) {
1452 		/* explicitly enabled */
1453 	} else {
1454 		LCONSOLE_ERROR_MSG(0x10b, "'forwarding' not set to either 'enabled' or 'disabled'\n");
1455 		return -EINVAL;
1456 	}
1457 
1458 	nrb_tiny = lnet_nrb_tiny_calculate(0);
1459 	if (nrb_tiny < 0)
1460 		return -EINVAL;
1461 
1462 	nrb_small = lnet_nrb_small_calculate(small_pages);
1463 	if (nrb_small < 0)
1464 		return -EINVAL;
1465 
1466 	nrb_large = lnet_nrb_large_calculate(large_pages);
1467 	if (nrb_large < 0)
1468 		return -EINVAL;
1469 
1470 	the_lnet.ln_rtrpools = cfs_percpt_alloc(lnet_cpt_table(),
1471 						LNET_NRBPOOLS *
1472 						sizeof(lnet_rtrbufpool_t));
1473 	if (the_lnet.ln_rtrpools == NULL) {
1474 		LCONSOLE_ERROR_MSG(0x10c,
1475 				   "Failed to initialize router buffer pools\n");
1476 		return -ENOMEM;
1477 	}
1478 
1479 	cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) {
1480 		lnet_rtrpool_init(&rtrp[0], 0);
1481 		rc = lnet_rtrpool_alloc_bufs(&rtrp[0], nrb_tiny, i);
1482 		if (rc != 0)
1483 			goto failed;
1484 
1485 		lnet_rtrpool_init(&rtrp[1], small_pages);
1486 		rc = lnet_rtrpool_alloc_bufs(&rtrp[1], nrb_small, i);
1487 		if (rc != 0)
1488 			goto failed;
1489 
1490 		lnet_rtrpool_init(&rtrp[2], large_pages);
1491 		rc = lnet_rtrpool_alloc_bufs(&rtrp[2], nrb_large, i);
1492 		if (rc != 0)
1493 			goto failed;
1494 	}
1495 
1496 	lnet_net_lock(LNET_LOCK_EX);
1497 	the_lnet.ln_routing = 1;
1498 	lnet_net_unlock(LNET_LOCK_EX);
1499 
1500 	return 0;
1501 
1502  failed:
1503 	lnet_rtrpools_free();
1504 	return rc;
1505 }
1506 
1507 int
1508 lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, unsigned long when)
1509 {
1510 	struct lnet_peer	*lp = NULL;
1511 	unsigned long		now = cfs_time_current();
1512 	int			cpt = lnet_cpt_of_nid(nid);
1513 
1514 	LASSERT(!in_interrupt ());
1515 
1516 	CDEBUG(D_NET, "%s notifying %s: %s\n",
1517 		(ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
1518 		libcfs_nid2str(nid),
1519 		alive ? "up" : "down");
1520 
1521 	if (ni != NULL &&
1522 	    LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) {
1523 		CWARN("Ignoring notification of %s %s by %s (different net)\n",
1524 			libcfs_nid2str(nid), alive ? "birth" : "death",
1525 			libcfs_nid2str(ni->ni_nid));
1526 		return -EINVAL;
1527 	}
1528 
1529 	/* can't do predictions... */
1530 	if (cfs_time_after(when, now)) {
1531 		CWARN("Ignoring prediction from %s of %s %s %ld seconds in the future\n",
1532 		      (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
1533 		      libcfs_nid2str(nid), alive ? "up" : "down",
1534 		      cfs_duration_sec(cfs_time_sub(when, now)));
1535 		return -EINVAL;
1536 	}
1537 
1538 	if (ni != NULL && !alive &&	     /* LND telling me she's down */
1539 	    !auto_down) {		       /* auto-down disabled */
1540 		CDEBUG(D_NET, "Auto-down disabled\n");
1541 		return 0;
1542 	}
1543 
1544 	lnet_net_lock(cpt);
1545 
1546 	if (the_lnet.ln_shutdown) {
1547 		lnet_net_unlock(cpt);
1548 		return -ESHUTDOWN;
1549 	}
1550 
1551 	lp = lnet_find_peer_locked(the_lnet.ln_peer_tables[cpt], nid);
1552 	if (lp == NULL) {
1553 		/* nid not found */
1554 		lnet_net_unlock(cpt);
1555 		CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid));
1556 		return 0;
1557 	}
1558 
1559 	/* We can't fully trust the LND to report an exact peer last_alive
1560 	 * when it notifies us about a dead peer. For example ksocklnd can
1561 	 * call us with when == _time_when_the_node_was_booted_ if
1562 	 * no connections were successfully established */
1563 	if (ni != NULL && !alive && when < lp->lp_last_alive)
1564 		when = lp->lp_last_alive;
1565 
1566 	lnet_notify_locked(lp, ni == NULL, alive, when);
1567 
1568 	lnet_ni_notify_locked(ni, lp);
1569 
1570 	lnet_peer_decref_locked(lp);
1571 
1572 	lnet_net_unlock(cpt);
1573 	return 0;
1574 }
1575 EXPORT_SYMBOL(lnet_notify);
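/*
 * Typical caller (illustrative, not taken from a specific LND): an LND
 * that loses its connection to a peer reports it with something like
 *
 *	lnet_notify(ni, peer_nid, 0, cfs_time_current());
 *
 * and the state change is propagated by lnet_notify_locked() /
 * lnet_ni_notify_locked() above.
 */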
1576 
1577 void
1578 lnet_get_tunables(void)
1579 {
1580 }
1581 
1582 #else
1583 
1584 int
1585 lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, unsigned long when)
1586 {
1587 	return -EOPNOTSUPP;
1588 }
1589 
1590 void
1591 lnet_router_checker(void)
1592 {
1593 	static time_t last;
1594 	static int    running;
1595 
1596 	time_t	    now = get_seconds();
1597 	int	       interval = now - last;
1598 	int	       rc;
1599 	__u64	     version;
1600 	lnet_peer_t      *rtr;
1601 
1602 	/* It's no use to call me again within a sec - all intervals and
1603 	 * timeouts are measured in seconds */
1604 	if (last != 0 && interval < 2)
1605 		return;
1606 
1607 	if (last != 0 &&
1608 	    interval > max(live_router_check_interval,
1609 			   dead_router_check_interval))
1610 		CNETERR("Checker(%d/%d) not called for %d seconds\n",
1611 			live_router_check_interval, dead_router_check_interval,
1612 			interval);
1613 
1614 	LASSERT(LNET_CPT_NUMBER == 1);
1615 
1616 	lnet_net_lock(0);
1617 	LASSERT(!running); /* recursion check */
1618 	running = 1;
1619 	lnet_net_unlock(0);
1620 
1621 	last = now;
1622 
1623 	if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING)
1624 		lnet_prune_rc_data(0); /* unlink all rcd and nowait */
1625 
1626 	/* consume all pending events */
1627 	while (1) {
1628 		int	  i;
1629 		lnet_event_t ev;
1630 
1631 		/* NB ln_rc_eqh must be the 1st in 'eventqs' otherwise the
1632 		 * recursion breaker in LNetEQPoll would fail */
1633 		rc = LNetEQPoll(&the_lnet.ln_rc_eqh, 1, 0, &ev, &i);
1634 		if (rc == 0)   /* no event pending */
1635 			break;
1636 
1637 		/* NB a lost SENT prevents me from pinging a router again */
1638 		if (rc == -EOVERFLOW) {
1639 			CERROR("Dropped an event!!!\n");
1640 			abort();
1641 		}
1642 
1643 		LASSERT(rc == 1);
1644 
1645 		lnet_router_checker_event(&ev);
1646 	}
1647 
1648 	if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING) {
1649 		lnet_prune_rc_data(1); /* release rcd */
1650 		the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
1651 		running = 0;
1652 		return;
1653 	}
1654 
1655 	LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
1656 
1657 	lnet_net_lock(0);
1658 
1659 	version = the_lnet.ln_routers_version;
1660 	list_for_each_entry(rtr, &the_lnet.ln_routers, lp_rtr_list) {
1661 		lnet_ping_router_locked(rtr);
1662 		LASSERT(version == the_lnet.ln_routers_version);
1663 	}
1664 
1665 	lnet_net_unlock(0);
1666 
1667 	running = 0; /* lock only needed for the recursion check */
1668 }
1669 
1670 /* NB lnet_peers_start_down depends on me,
1671  * so must be called before any peer creation */
1672 void
1673 lnet_get_tunables(void)
1674 {
1675 	char *s;
1676 
1677 	s = getenv("LNET_ROUTER_PING_TIMEOUT");
1678 	if (s != NULL)
1679 		router_ping_timeout = atoi(s);
1680 
1681 	s = getenv("LNET_LIVE_ROUTER_CHECK_INTERVAL");
1682 	if (s != NULL)
1683 		live_router_check_interval = atoi(s);
1684 
1685 	s = getenv("LNET_DEAD_ROUTER_CHECK_INTERVAL");
1686 	if (s != NULL)
1687 		dead_router_check_interval = atoi(s);
1688 
1689 	/* This replaces old lnd_notify mechanism */
1690 	check_routers_before_use = 1;
1691 	if (dead_router_check_interval <= 0)
1692 		dead_router_check_interval = 30;
1693 }
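/*
 * Userspace example (illustrative values): the tunables above can be
 * overridden from the environment before LNet is initialized, e.g.
 *
 *	export LNET_ROUTER_PING_TIMEOUT=30
 *	export LNET_LIVE_ROUTER_CHECK_INTERVAL=120
 *	export LNET_DEAD_ROUTER_CHECK_INTERVAL=120
 */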
1694 
1695 void
1696 lnet_rtrpools_free(void)
1697 {
1698 }
1699 
1700 int
1701 lnet_rtrpools_alloc(int im_a_router)
1702 {
1703 	return 0;
1704 }
1705 
1706 #endif
1707