1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; if not, write to the
18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19  * Boston, MA 021110-1307, USA
20  *
21  * GPL HEADER END
22  */
23 /*
24  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
25  * Copyright (c) 2012, Intel Corporation.
26  */
27 /*
28  * This file is part of Lustre, http://www.lustre.org/
29  * Lustre is a trademark of Sun Microsystems, Inc.
30  *
31  * Author: liang@whamcloud.com
32  */
33 
34 #define DEBUG_SUBSYSTEM S_LNET
35 
36 #include <linux/cpu.h>
37 #include <linux/sched.h>
38 #include "../../../include/linux/libcfs/libcfs.h"
39 
40 #ifdef CONFIG_SMP
41 
42 /**
43  * modparam for setting number of partitions
44  *
45  *  0 : estimate best value based on cores or NUMA nodes
46  *  1 : disable multiple partitions
47  * >1 : specify number of partitions
48  */
49 static int	cpu_npartitions;
50 module_param(cpu_npartitions, int, 0444);
51 MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions");
52 
53 /**
54  * modparam for setting CPU partitions patterns:
55  *
56  * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID,
57  *      number in bracket is processor ID (core or HT)
58  *
59  * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket
60  *       are NUMA node ID, number before bracket is CPU partition ID.
61  *
62  * NB: If user specified cpu_pattern, cpu_npartitions will be ignored
63  */
64 static char	*cpu_pattern = "";
65 module_param(cpu_pattern, charp, 0444);
66 MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern");
67 
/* module-global state shared by all CPT tables and the hotplug notifier */
struct cfs_cpt_data {
	/* serialize hotplug etc */
	spinlock_t		cpt_lock;
	/* reserved for hotplug: bumped on each CPU online/offline event */
	unsigned long		cpt_version;
	/* mutex to protect cpt_cpumask */
	struct mutex		cpt_mutex;
	/* scratch buffer for set/unset_node */
	cpumask_t		*cpt_cpumask;
};

static struct cfs_cpt_data	cpt_data;
80 
/* fill *mask with every CPU on the same socket as @cpu */
static void cfs_cpu_core_siblings(int cpu, cpumask_t *mask)
{
	cpumask_copy(mask, topology_core_cpumask(cpu));
}
86 
87 /* return cpumask of HTs in the same core */
cfs_cpu_ht_siblings(int cpu,cpumask_t * mask)88 static void cfs_cpu_ht_siblings(int cpu, cpumask_t *mask)
89 {
90 	cpumask_copy(mask, topology_sibling_cpumask(cpu));
91 }
92 
static void cfs_node_to_cpumask(int node, cpumask_t *mask)
{
	/* fill *mask with all CPUs belonging to NUMA node @node */
	cpumask_copy(mask, cpumask_of_node(node));
}
97 
/**
 * Release all memory owned by \a cptab: the cpu->cpt reverse map,
 * each partition's cpumask/nodemask, the partition array, the
 * table-wide masks, and finally the table itself.
 *
 * Every pointer is checked before freeing, so this is safe on a
 * partially initialized table — cfs_cpt_table_alloc() relies on that
 * for its failure path.
 */
void
cfs_cpt_table_free(struct cfs_cpt_table *cptab)
{
	int	i;

	if (cptab->ctb_cpu2cpt != NULL) {
		LIBCFS_FREE(cptab->ctb_cpu2cpt,
			    num_possible_cpus() *
			    sizeof(cptab->ctb_cpu2cpt[0]));
	}

	/* ctb_parts may be NULL if allocation failed early */
	for (i = 0; cptab->ctb_parts != NULL && i < cptab->ctb_nparts; i++) {
		struct cfs_cpu_partition *part = &cptab->ctb_parts[i];

		if (part->cpt_nodemask != NULL) {
			LIBCFS_FREE(part->cpt_nodemask,
				    sizeof(*part->cpt_nodemask));
		}

		if (part->cpt_cpumask != NULL)
			LIBCFS_FREE(part->cpt_cpumask, cpumask_size());
	}

	if (cptab->ctb_parts != NULL) {
		LIBCFS_FREE(cptab->ctb_parts,
			    cptab->ctb_nparts * sizeof(cptab->ctb_parts[0]));
	}

	if (cptab->ctb_nodemask != NULL)
		LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
	if (cptab->ctb_cpumask != NULL)
		LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size());

	LIBCFS_FREE(cptab, sizeof(*cptab));
}
EXPORT_SYMBOL(cfs_cpt_table_free);
134 
/**
 * Allocate a CPU partition table with \a ncpt empty partitions.
 *
 * Allocates the table-wide cpumask/nodemask, the cpu->cpt reverse map
 * (every slot initialized to -1, i.e. unassigned), and one
 * cpumask/nodemask pair per partition.  On any allocation failure the
 * partially built table is released via cfs_cpt_table_free() and NULL
 * is returned.
 */
struct cfs_cpt_table *
cfs_cpt_table_alloc(unsigned int ncpt)
{
	struct cfs_cpt_table *cptab;
	int	i;

	LIBCFS_ALLOC(cptab, sizeof(*cptab));
	if (cptab == NULL)
		return NULL;

	cptab->ctb_nparts = ncpt;

	LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size());
	LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));

	if (cptab->ctb_cpumask == NULL || cptab->ctb_nodemask == NULL)
		goto failed;

	LIBCFS_ALLOC(cptab->ctb_cpu2cpt,
		     num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
	if (cptab->ctb_cpu2cpt == NULL)
		goto failed;

	/* -1 in every slot: no CPU belongs to any partition yet */
	memset(cptab->ctb_cpu2cpt, -1,
	       num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));

	LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0]));
	if (cptab->ctb_parts == NULL)
		goto failed;

	for (i = 0; i < ncpt; i++) {
		struct cfs_cpu_partition *part = &cptab->ctb_parts[i];

		LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size());
		LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask));
		if (part->cpt_cpumask == NULL || part->cpt_nodemask == NULL)
			goto failed;
	}

	/* snapshot the hotplug generation so cfs_cpu_init() can detect
	 * CPU online/offline events that raced with table setup */
	spin_lock(&cpt_data.cpt_lock);
	/* Reserved for hotplug */
	cptab->ctb_version = cpt_data.cpt_version;
	spin_unlock(&cpt_data.cpt_lock);

	return cptab;

 failed:
	cfs_cpt_table_free(cptab);
	return NULL;
}
EXPORT_SYMBOL(cfs_cpt_table_alloc);
186 
/**
 * Print \a cptab into \a buf (capacity \a len bytes), one line per
 * partition in the form "<cpt>\t: <cpu> <cpu> ...\n".
 *
 * Returns the number of bytes written, or -EFBIG if \a buf is too
 * small to hold the full description.
 */
int
cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
{
	char	*tmp = buf;
	int	rc = 0;
	int	i;
	int	j;

	for (i = 0; i < cptab->ctb_nparts; i++) {
		if (len > 0) {
			rc = snprintf(tmp, len, "%d\t: ", i);
			len -= rc;
		}

		/* len <= 0 here means the last snprintf() truncated */
		if (len <= 0) {
			rc = -EFBIG;
			goto out;
		}

		tmp += rc;
		for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) {
			rc = snprintf(tmp, len, "%d ", j);
			len -= rc;
			if (len <= 0) {
				rc = -EFBIG;
				goto out;
			}
			tmp += rc;
		}

		/* the checks above guarantee len >= 1 at this point */
		*tmp = '\n';
		tmp++;
		len--;
	}

 out:
	if (rc < 0)
		return rc;

	return tmp - buf;
}
EXPORT_SYMBOL(cfs_cpt_table_print);
229 
230 int
cfs_cpt_number(struct cfs_cpt_table * cptab)231 cfs_cpt_number(struct cfs_cpt_table *cptab)
232 {
233 	return cptab->ctb_nparts;
234 }
235 EXPORT_SYMBOL(cfs_cpt_number);
236 
237 int
cfs_cpt_weight(struct cfs_cpt_table * cptab,int cpt)238 cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
239 {
240 	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
241 
242 	return cpt == CFS_CPT_ANY ?
243 	       cpumask_weight(cptab->ctb_cpumask) :
244 	       cpumask_weight(cptab->ctb_parts[cpt].cpt_cpumask);
245 }
246 EXPORT_SYMBOL(cfs_cpt_weight);
247 
248 int
cfs_cpt_online(struct cfs_cpt_table * cptab,int cpt)249 cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
250 {
251 	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
252 
253 	return cpt == CFS_CPT_ANY ?
254 	       cpumask_any_and(cptab->ctb_cpumask,
255 			       cpu_online_mask) < nr_cpu_ids :
256 	       cpumask_any_and(cptab->ctb_parts[cpt].cpt_cpumask,
257 			       cpu_online_mask) < nr_cpu_ids;
258 }
259 EXPORT_SYMBOL(cfs_cpt_online);
260 
261 cpumask_t *
cfs_cpt_cpumask(struct cfs_cpt_table * cptab,int cpt)262 cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt)
263 {
264 	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
265 
266 	return cpt == CFS_CPT_ANY ?
267 	       cptab->ctb_cpumask : cptab->ctb_parts[cpt].cpt_cpumask;
268 }
269 EXPORT_SYMBOL(cfs_cpt_cpumask);
270 
271 nodemask_t *
cfs_cpt_nodemask(struct cfs_cpt_table * cptab,int cpt)272 cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt)
273 {
274 	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
275 
276 	return cpt == CFS_CPT_ANY ?
277 	       cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask;
278 }
279 EXPORT_SYMBOL(cfs_cpt_nodemask);
280 
/**
 * Add online CPU \a cpu to partition \a cpt of \a cptab.
 *
 * Returns 1 on success; 0 if \a cpu is out of range, offline, or
 * already assigned to a partition of this table.  On success the
 * CPU's NUMA node is also set in both the table-wide and the
 * per-partition nodemasks.
 */
int
cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
{
	int	node;

	LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts);

	if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) {
		CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu);
		return 0;
	}

	if (cptab->ctb_cpu2cpt[cpu] != -1) {
		CDEBUG(D_INFO, "CPU %d is already in partition %d\n",
		       cpu, cptab->ctb_cpu2cpt[cpu]);
		return 0;
	}

	/* record the cpu -> partition reverse mapping */
	cptab->ctb_cpu2cpt[cpu] = cpt;

	LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_cpumask));
	LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask));

	cpumask_set_cpu(cpu, cptab->ctb_cpumask);
	cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);

	node = cpu_to_node(cpu);

	/* first CPU of @node in this CPT table */
	if (!node_isset(node, *cptab->ctb_nodemask))
		node_set(node, *cptab->ctb_nodemask);

	/* first CPU of @node in this partition */
	if (!node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask))
		node_set(node, *cptab->ctb_parts[cpt].cpt_nodemask);

	return 1;
}
EXPORT_SYMBOL(cfs_cpt_set_cpu);
320 
321 void
cfs_cpt_unset_cpu(struct cfs_cpt_table * cptab,int cpt,int cpu)322 cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
323 {
324 	int	node;
325 	int	i;
326 
327 	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
328 
329 	if (cpu < 0 || cpu >= nr_cpu_ids) {
330 		CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu);
331 		return;
332 	}
333 
334 	if (cpt == CFS_CPT_ANY) {
335 		/* caller doesn't know the partition ID */
336 		cpt = cptab->ctb_cpu2cpt[cpu];
337 		if (cpt < 0) { /* not set in this CPT-table */
338 			CDEBUG(D_INFO, "Try to unset cpu %d which is not in CPT-table %p\n",
339 			       cpt, cptab);
340 			return;
341 		}
342 
343 	} else if (cpt != cptab->ctb_cpu2cpt[cpu]) {
344 		CDEBUG(D_INFO,
345 		       "CPU %d is not in cpu-partition %d\n", cpu, cpt);
346 		return;
347 	}
348 
349 	LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask));
350 	LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask));
351 
352 	cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
353 	cpumask_clear_cpu(cpu, cptab->ctb_cpumask);
354 	cptab->ctb_cpu2cpt[cpu] = -1;
355 
356 	node = cpu_to_node(cpu);
357 
358 	LASSERT(node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask));
359 	LASSERT(node_isset(node, *cptab->ctb_nodemask));
360 
361 	for_each_cpu(i, cptab->ctb_parts[cpt].cpt_cpumask) {
362 		/* this CPT has other CPU belonging to this node? */
363 		if (cpu_to_node(i) == node)
364 			break;
365 	}
366 
367 	if (i >= nr_cpu_ids)
368 		node_clear(node, *cptab->ctb_parts[cpt].cpt_nodemask);
369 
370 	for_each_cpu(i, cptab->ctb_cpumask) {
371 		/* this CPT-table has other CPU belonging to this node? */
372 		if (cpu_to_node(i) == node)
373 			break;
374 	}
375 
376 	if (i >= nr_cpu_ids)
377 		node_clear(node, *cptab->ctb_nodemask);
378 
379 	return;
380 }
381 EXPORT_SYMBOL(cfs_cpt_unset_cpu);
382 
383 int
cfs_cpt_set_cpumask(struct cfs_cpt_table * cptab,int cpt,cpumask_t * mask)384 cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
385 {
386 	int	i;
387 
388 	if (cpumask_weight(mask) == 0 ||
389 	    cpumask_any_and(mask, cpu_online_mask) >= nr_cpu_ids) {
390 		CDEBUG(D_INFO, "No online CPU is found in the CPU mask for CPU partition %d\n",
391 		       cpt);
392 		return 0;
393 	}
394 
395 	for_each_cpu(i, mask) {
396 		if (!cfs_cpt_set_cpu(cptab, cpt, i))
397 			return 0;
398 	}
399 
400 	return 1;
401 }
402 EXPORT_SYMBOL(cfs_cpt_set_cpumask);
403 
404 void
cfs_cpt_unset_cpumask(struct cfs_cpt_table * cptab,int cpt,cpumask_t * mask)405 cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
406 {
407 	int	i;
408 
409 	for_each_cpu(i, mask)
410 		cfs_cpt_unset_cpu(cptab, cpt, i);
411 }
412 EXPORT_SYMBOL(cfs_cpt_unset_cpumask);
413 
414 int
cfs_cpt_set_node(struct cfs_cpt_table * cptab,int cpt,int node)415 cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
416 {
417 	cpumask_t	*mask;
418 	int		rc;
419 
420 	if (node < 0 || node >= MAX_NUMNODES) {
421 		CDEBUG(D_INFO,
422 		       "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
423 		return 0;
424 	}
425 
426 	mutex_lock(&cpt_data.cpt_mutex);
427 
428 	mask = cpt_data.cpt_cpumask;
429 	cfs_node_to_cpumask(node, mask);
430 
431 	rc = cfs_cpt_set_cpumask(cptab, cpt, mask);
432 
433 	mutex_unlock(&cpt_data.cpt_mutex);
434 
435 	return rc;
436 }
437 EXPORT_SYMBOL(cfs_cpt_set_node);
438 
439 void
cfs_cpt_unset_node(struct cfs_cpt_table * cptab,int cpt,int node)440 cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
441 {
442 	cpumask_t *mask;
443 
444 	if (node < 0 || node >= MAX_NUMNODES) {
445 		CDEBUG(D_INFO,
446 		       "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
447 		return;
448 	}
449 
450 	mutex_lock(&cpt_data.cpt_mutex);
451 
452 	mask = cpt_data.cpt_cpumask;
453 	cfs_node_to_cpumask(node, mask);
454 
455 	cfs_cpt_unset_cpumask(cptab, cpt, mask);
456 
457 	mutex_unlock(&cpt_data.cpt_mutex);
458 }
459 EXPORT_SYMBOL(cfs_cpt_unset_node);
460 
461 int
cfs_cpt_set_nodemask(struct cfs_cpt_table * cptab,int cpt,nodemask_t * mask)462 cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
463 {
464 	int	i;
465 
466 	for_each_node_mask(i, *mask) {
467 		if (!cfs_cpt_set_node(cptab, cpt, i))
468 			return 0;
469 	}
470 
471 	return 1;
472 }
473 EXPORT_SYMBOL(cfs_cpt_set_nodemask);
474 
475 void
cfs_cpt_unset_nodemask(struct cfs_cpt_table * cptab,int cpt,nodemask_t * mask)476 cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
477 {
478 	int	i;
479 
480 	for_each_node_mask(i, *mask)
481 		cfs_cpt_unset_node(cptab, cpt, i);
482 }
483 EXPORT_SYMBOL(cfs_cpt_unset_nodemask);
484 
485 void
cfs_cpt_clear(struct cfs_cpt_table * cptab,int cpt)486 cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt)
487 {
488 	int	last;
489 	int	i;
490 
491 	if (cpt == CFS_CPT_ANY) {
492 		last = cptab->ctb_nparts - 1;
493 		cpt = 0;
494 	} else {
495 		last = cpt;
496 	}
497 
498 	for (; cpt <= last; cpt++) {
499 		for_each_cpu(i, cptab->ctb_parts[cpt].cpt_cpumask)
500 			cfs_cpt_unset_cpu(cptab, cpt, i);
501 	}
502 }
503 EXPORT_SYMBOL(cfs_cpt_clear);
504 
505 int
cfs_cpt_spread_node(struct cfs_cpt_table * cptab,int cpt)506 cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
507 {
508 	nodemask_t	*mask;
509 	int		weight;
510 	int		rotor;
511 	int		node;
512 
513 	/* convert CPU partition ID to HW node id */
514 
515 	if (cpt < 0 || cpt >= cptab->ctb_nparts) {
516 		mask = cptab->ctb_nodemask;
517 		rotor = cptab->ctb_spread_rotor++;
518 	} else {
519 		mask = cptab->ctb_parts[cpt].cpt_nodemask;
520 		rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++;
521 	}
522 
523 	weight = nodes_weight(*mask);
524 	LASSERT(weight > 0);
525 
526 	rotor %= weight;
527 
528 	for_each_node_mask(node, *mask) {
529 		if (rotor-- == 0)
530 			return node;
531 	}
532 
533 	LBUG();
534 	return 0;
535 }
536 EXPORT_SYMBOL(cfs_cpt_spread_node);
537 
538 int
cfs_cpt_current(struct cfs_cpt_table * cptab,int remap)539 cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
540 {
541 	int	cpu = smp_processor_id();
542 	int	cpt = cptab->ctb_cpu2cpt[cpu];
543 
544 	if (cpt < 0) {
545 		if (!remap)
546 			return cpt;
547 
548 		/* don't return negative value for safety of upper layer,
549 		 * instead we shadow the unknown cpu to a valid partition ID */
550 		cpt = cpu % cptab->ctb_nparts;
551 	}
552 
553 	return cpt;
554 }
555 EXPORT_SYMBOL(cfs_cpt_current);
556 
557 int
cfs_cpt_of_cpu(struct cfs_cpt_table * cptab,int cpu)558 cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
559 {
560 	LASSERT(cpu >= 0 && cpu < nr_cpu_ids);
561 
562 	return cptab->ctb_cpu2cpt[cpu];
563 }
564 EXPORT_SYMBOL(cfs_cpt_of_cpu);
565 
/**
 * Bind the current thread to CPU partition \a cpt (or to the whole
 * table for CFS_CPT_ANY) by narrowing its CPU affinity and allowed
 * memory nodes.
 *
 * Returns 0 on success (including the no-op case where every online
 * CPU is already in the target mask), -EINVAL if the mask contains no
 * online CPU, or the error from set_cpus_allowed_ptr().
 */
int
cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
{
	cpumask_t	*cpumask;
	nodemask_t	*nodemask;
	int		rc;
	int		i;

	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));

	if (cpt == CFS_CPT_ANY) {
		cpumask = cptab->ctb_cpumask;
		nodemask = cptab->ctb_nodemask;
	} else {
		cpumask = cptab->ctb_parts[cpt].cpt_cpumask;
		nodemask = cptab->ctb_parts[cpt].cpt_nodemask;
	}

	if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids) {
		CERROR("No online CPU found in CPU partition %d, did someone do CPU hotplug on system? You might need to reload Lustre modules to keep system working well.\n",
		       cpt);
		return -EINVAL;
	}

	/* set affinity only if some online CPU lies outside the target
	 * mask; the loop body always returns on its first iteration */
	for_each_online_cpu(i) {
		if (cpumask_test_cpu(i, cpumask))
			continue;

		rc = set_cpus_allowed_ptr(current, cpumask);
		set_mems_allowed(*nodemask);
		if (rc == 0)
			schedule(); /* switch to allowed CPU */

		return rc;
	}

	/* don't need to set affinity because all online CPUs are covered */
	return 0;
}
EXPORT_SYMBOL(cfs_cpt_bind);
606 
/**
 * Choose max to \a number CPUs from \a node and set them in \a cpt.
 * We always prefer to choose CPU in the same core/socket.
 *
 * Chosen CPUs are removed from \a node.  Returns 0 on success,
 * -EINVAL if a CPU cannot be added to the partition, or -ENOMEM on
 * scratch-mask allocation failure.
 */
static int
cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt,
		     cpumask_t *node, int number)
{
	cpumask_t	*socket = NULL;
	cpumask_t	*core = NULL;
	int		rc = 0;
	int		cpu;

	LASSERT(number > 0);

	/* fast path: we need at least as many CPUs as the node has,
	 * so take all of them without caring about topology */
	if (number >= cpumask_weight(node)) {
		while (!cpumask_empty(node)) {
			cpu = cpumask_first(node);

			rc = cfs_cpt_set_cpu(cptab, cpt, cpu);
			if (!rc)
				return -EINVAL;
			cpumask_clear_cpu(cpu, node);
		}
		return 0;
	}

	/* allocate scratch buffer */
	LIBCFS_ALLOC(socket, cpumask_size());
	LIBCFS_ALLOC(core, cpumask_size());
	if (socket == NULL || core == NULL) {
		rc = -ENOMEM;
		goto out;
	}

	while (!cpumask_empty(node)) {
		cpu = cpumask_first(node);

		/* get cpumask for cores in the same socket */
		cfs_cpu_core_siblings(cpu, socket);
		cpumask_and(socket, socket, node);

		LASSERT(!cpumask_empty(socket));

		while (!cpumask_empty(socket)) {
			int     i;

			/* get cpumask for hts in the same core */
			cfs_cpu_ht_siblings(cpu, core);
			cpumask_and(core, core, node);

			LASSERT(!cpumask_empty(core));

			/* consume one whole core (all HT siblings) at a time */
			for_each_cpu(i, core) {
				cpumask_clear_cpu(i, socket);
				cpumask_clear_cpu(i, node);

				rc = cfs_cpt_set_cpu(cptab, cpt, i);
				if (!rc) {
					rc = -EINVAL;
					goto out;
				}

				if (--number == 0)
					goto out;
			}
			cpu = cpumask_first(socket);
		}
	}

 out:
	if (socket != NULL)
		LIBCFS_FREE(socket, cpumask_size());
	if (core != NULL)
		LIBCFS_FREE(core, cpumask_size());
	return rc;
}
684 
685 #define CPT_WEIGHT_MIN  4u
686 
/**
 * Estimate a reasonable number of CPU partitions from the online CPU
 * and NUMA node counts.  The result always divides the online CPU
 * count evenly.
 */
static unsigned int
cfs_cpt_num_estimate(void)
{
	unsigned nnode = num_online_nodes();
	unsigned ncpu  = num_online_cpus();
	unsigned ncpt;

	/* tiny systems get a single partition */
	if (ncpu <= CPT_WEIGHT_MIN) {
		ncpt = 1;
		goto out;
	}

	/* generate reasonable number of CPU partitions based on total number
	 * of CPUs, Preferred N should be power2 and match this condition:
	 * 2 * (N - 1)^2 < NCPUS <= 2 * N^2 */
	for (ncpt = 2; ncpu > 2 * ncpt * ncpt; ncpt <<= 1)
		;

	if (ncpt <= nnode) { /* fat numa system */
		while (nnode > ncpt)
			nnode >>= 1;

	} else { /* ncpt > nnode */
		while ((nnode << 1) <= ncpt)
			nnode <<= 1;
	}

	/* use the power-of-two-scaled node count as the partition count */
	ncpt = nnode;

 out:
#if (BITS_PER_LONG == 32)
	/* config many CPU partitions on 32-bit system could consume
	 * too much memory */
	ncpt = min(2U, ncpt);
#endif
	/* shrink until ncpt divides ncpu evenly */
	while (ncpu % ncpt != 0)
		ncpt--; /* worst case is 1 */

	return ncpt;
}
727 
/**
 * Create a CPU partition table with \a ncpt partitions (or an
 * estimated count when \a ncpt <= 0), distributing online CPUs evenly
 * across partitions, walking NUMA nodes in order.
 *
 * Returns NULL on failure: ncpt not dividing the online CPU count,
 * allocation failure, or CPU hotplug racing with setup.
 */
static struct cfs_cpt_table *
cfs_cpt_table_create(int ncpt)
{
	struct cfs_cpt_table *cptab = NULL;
	cpumask_t	*mask = NULL;
	int		cpt = 0;
	int		num;
	int		rc;
	int		i;

	rc = cfs_cpt_num_estimate();
	if (ncpt <= 0)
		ncpt = rc;

	if (ncpt > num_online_cpus() || ncpt > 4 * rc) {
		CWARN("CPU partition number %d is larger than suggested value (%d), your system may have performance issue or run out of memory while under pressure\n",
		      ncpt, rc);
	}

	if (num_online_cpus() % ncpt != 0) {
		CERROR("CPU number %d is not multiple of cpu_npartition %d, please try different cpu_npartitions value or set pattern string by cpu_pattern=STRING\n",
		       (int)num_online_cpus(), ncpt);
		goto failed;
	}

	cptab = cfs_cpt_table_alloc(ncpt);
	if (cptab == NULL) {
		CERROR("Failed to allocate CPU map(%d)\n", ncpt);
		goto failed;
	}

	/* num = target CPUs per partition */
	num = num_online_cpus() / ncpt;
	if (num == 0) {
		CERROR("CPU changed while setting CPU partition\n");
		goto failed;
	}

	LIBCFS_ALLOC(mask, cpumask_size());
	if (mask == NULL) {
		CERROR("Failed to allocate scratch cpumask\n");
		goto failed;
	}

	/* fill partitions one after another, NUMA node by NUMA node */
	for_each_online_node(i) {
		cfs_node_to_cpumask(i, mask);

		while (!cpumask_empty(mask)) {
			struct cfs_cpu_partition *part;
			int    n;

			if (cpt >= ncpt)
				goto failed;

			part = &cptab->ctb_parts[cpt];

			/* n = CPUs still missing from the current partition */
			n = num - cpumask_weight(part->cpt_cpumask);
			LASSERT(n > 0);

			rc = cfs_cpt_choose_ncpus(cptab, cpt, mask, n);
			if (rc < 0)
				goto failed;

			LASSERT(num >= cpumask_weight(part->cpt_cpumask));
			if (num == cpumask_weight(part->cpt_cpumask))
				cpt++;
		}
	}

	/* every partition must end up exactly full, otherwise a CPU
	 * went on/offline while we were distributing them */
	if (cpt != ncpt ||
	    num != cpumask_weight(cptab->ctb_parts[ncpt - 1].cpt_cpumask)) {
		CERROR("Expect %d(%d) CPU partitions but got %d(%d), CPU hotplug/unplug while setting?\n",
		       cptab->ctb_nparts, num, cpt,
		       cpumask_weight(cptab->ctb_parts[ncpt - 1].cpt_cpumask));
		goto failed;
	}

	LIBCFS_FREE(mask, cpumask_size());

	return cptab;

 failed:
	CERROR("Failed to setup CPU-partition-table with %d CPU-partitions, online HW nodes: %d, HW cpus: %d.\n",
	       ncpt, num_online_nodes(), num_online_cpus());

	if (mask != NULL)
		LIBCFS_FREE(mask, cpumask_size());

	if (cptab != NULL)
		cfs_cpt_table_free(cptab);

	return NULL;
}
820 
/**
 * Create a CPU partition table from a user pattern string such as
 * "0[0,1] 1[2,3]" (processor IDs) or "N 0[0] 1[1]" (leading 'N'/'n'
 * makes bracketed numbers NUMA node IDs); see the cpu_pattern
 * modparam description.  Returns NULL on a malformed pattern or
 * allocation failure.
 */
static struct cfs_cpt_table *
cfs_cpt_table_create_pattern(char *pattern)
{
	struct cfs_cpt_table	*cptab;
	char			*str	= pattern;
	int			node	= 0;
	int			high;
	int			ncpt;
	int			c;

	/* number of '[' == number of partitions in the pattern */
	for (ncpt = 0;; ncpt++) { /* quick scan bracket */
		str = strchr(str, '[');
		if (str == NULL)
			break;
		str++;
	}

	str = cfs_trimwhite(pattern);
	if (*str == 'n' || *str == 'N') {
		pattern = str + 1;
		node = 1; /* bracketed numbers are NUMA node IDs */
	}

	if (ncpt == 0 ||
	    (node && ncpt > num_online_nodes()) ||
	    (!node && ncpt > num_online_cpus())) {
		CERROR("Invalid pattern %s, or too many partitions %d\n",
		       pattern, ncpt);
		return NULL;
	}

	/* upper bound for IDs inside brackets */
	high = node ? MAX_NUMNODES - 1 : nr_cpu_ids - 1;

	cptab = cfs_cpt_table_alloc(ncpt);
	if (cptab == NULL) {
		CERROR("Failed to allocate cpu partition table\n");
		return NULL;
	}

	/* parse one "<cpt>[<range-expr>]" group per iteration */
	for (str = cfs_trimwhite(pattern), c = 0;; c++) {
		struct cfs_range_expr	*range;
		struct cfs_expr_list	*el;
		char			*bracket = strchr(str, '[');
		int			cpt;
		int			rc;
		int			i;
		int			n;

		if (bracket == NULL) {
			if (*str != 0) {
				CERROR("Invalid pattern %s\n", str);
				goto failed;
			} else if (c != ncpt) {
				CERROR("expect %d partitions but found %d\n",
				       ncpt, c);
				goto failed;
			}
			break;
		}

		if (sscanf(str, "%d%n", &cpt, &n) < 1) {
			CERROR("Invalid cpu pattern %s\n", str);
			goto failed;
		}

		if (cpt < 0 || cpt >= ncpt) {
			CERROR("Invalid partition id %d, total partitions %d\n",
			       cpt, ncpt);
			goto failed;
		}

		/* each partition may appear only once in the pattern */
		if (cfs_cpt_weight(cptab, cpt) != 0) {
			CERROR("Partition %d has already been set.\n", cpt);
			goto failed;
		}

		str = cfs_trimwhite(str + n);
		if (str != bracket) {
			CERROR("Invalid pattern %s\n", str);
			goto failed;
		}

		bracket = strchr(str, ']');
		if (bracket == NULL) {
			CERROR("missing right bracket for cpt %d, %s\n",
			       cpt, str);
			goto failed;
		}

		if (cfs_expr_list_parse(str, (bracket - str) + 1,
					0, high, &el) != 0) {
			CERROR("Can't parse number range: %s\n", str);
			goto failed;
		}

		list_for_each_entry(range, &el->el_exprs, re_link) {
			for (i = range->re_lo; i <= range->re_hi; i++) {
				if ((i - range->re_lo) % range->re_stride != 0)
					continue;

				rc = node ? cfs_cpt_set_node(cptab, cpt, i) :
					    cfs_cpt_set_cpu(cptab, cpt, i);
				if (!rc) {
					cfs_expr_list_free(el);
					goto failed;
				}
			}
		}

		cfs_expr_list_free(el);

		if (!cfs_cpt_online(cptab, cpt)) {
			CERROR("No online CPU is found on partition %d\n", cpt);
			goto failed;
		}

		str = cfs_trimwhite(bracket + 1);
	}

	return cptab;

 failed:
	cfs_cpt_table_free(cptab);
	return NULL;
}
946 
947 #ifdef CONFIG_HOTPLUG_CPU
/**
 * CPU hotplug notifier: bump cpt_data.cpt_version on CPU online/dead
 * events (so a table created across a hotplug can be detected through
 * its ctb_version snapshot), and warn on CPU removal when all HT
 * siblings of the dead CPU's core are offline, since plug-out can
 * break existing CPT affinity.
 */
static int
cfs_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int  cpu = (unsigned long)hcpu;
	bool	     warn;

	switch (action) {
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		spin_lock(&cpt_data.cpt_lock);
		cpt_data.cpt_version++;
		spin_unlock(&cpt_data.cpt_lock);
		/* fall through: DEAD events also get the warning below */
	default:
		if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) {
			CDEBUG(D_INFO, "CPU changed [cpu %u action %lx]\n",
			       cpu, action);
			break;
		}

		mutex_lock(&cpt_data.cpt_mutex);
		/* if all HTs in a core are offline, it may break affinity */
		cfs_cpu_ht_siblings(cpu, cpt_data.cpt_cpumask);
		warn = cpumask_any_and(cpt_data.cpt_cpumask,
				       cpu_online_mask) >= nr_cpu_ids;
		mutex_unlock(&cpt_data.cpt_mutex);
		CDEBUG(warn ? D_WARNING : D_INFO,
		       "Lustre: can't support CPU plug-out well now, performance and stability could be impacted [CPU %u action: %lx]\n",
		       cpu, action);
	}

	return NOTIFY_OK;
}

/* registered in cfs_cpu_init() when CONFIG_HOTPLUG_CPU is enabled */
static struct notifier_block cfs_cpu_notifier = {
	.notifier_call	= cfs_cpu_notify,
	.priority	= 0
};
987 
988 #endif
989 
/**
 * Module teardown: free the global CPT table, unregister the hotplug
 * notifier, and release the scratch cpumask.  Also used to unwind a
 * partially failed cfs_cpu_init(), so every pointer is checked.
 */
void
cfs_cpu_fini(void)
{
	if (cfs_cpt_table != NULL)
		cfs_cpt_table_free(cfs_cpt_table);

#ifdef CONFIG_HOTPLUG_CPU
	unregister_hotcpu_notifier(&cfs_cpu_notifier);
#endif
	if (cpt_data.cpt_cpumask != NULL)
		LIBCFS_FREE(cpt_data.cpt_cpumask, cpumask_size());
}
1002 
/**
 * Module init: set up the global cpt_data state, register the hotplug
 * notifier, and build the global cfs_cpt_table — from the cpu_pattern
 * string if set, otherwise from cpu_npartitions.
 *
 * Returns 0 on success, -1 on failure; everything allocated so far is
 * released via cfs_cpu_fini() on the failure path.
 */
int
cfs_cpu_init(void)
{
	LASSERT(cfs_cpt_table == NULL);

	memset(&cpt_data, 0, sizeof(cpt_data));

	LIBCFS_ALLOC(cpt_data.cpt_cpumask, cpumask_size());
	if (cpt_data.cpt_cpumask == NULL) {
		CERROR("Failed to allocate scratch buffer\n");
		return -1;
	}

	spin_lock_init(&cpt_data.cpt_lock);
	mutex_init(&cpt_data.cpt_mutex);

#ifdef CONFIG_HOTPLUG_CPU
	register_hotcpu_notifier(&cfs_cpu_notifier);
#endif

	if (*cpu_pattern != 0) {
		/* cpu_pattern overrides cpu_npartitions */
		cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern);
		if (cfs_cpt_table == NULL) {
			CERROR("Failed to create cptab from pattern %s\n",
			       cpu_pattern);
			goto failed;
		}

	} else {
		cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions);
		if (cfs_cpt_table == NULL) {
			CERROR("Failed to create ptable with npartitions %d\n",
			       cpu_npartitions);
			goto failed;
		}
	}

	/* a hotplug event during setup bumped cpt_version past the
	 * snapshot taken at table allocation — the table is stale */
	spin_lock(&cpt_data.cpt_lock);
	if (cfs_cpt_table->ctb_version != cpt_data.cpt_version) {
		spin_unlock(&cpt_data.cpt_lock);
		CERROR("CPU hotplug/unplug during setup\n");
		goto failed;
	}
	spin_unlock(&cpt_data.cpt_lock);

	LCONSOLE(0, "HW CPU cores: %d, npartitions: %d\n",
		 num_online_cpus(), cfs_cpt_number(cfs_cpt_table));
	return 0;

 failed:
	cfs_cpu_fini();
	return -1;
}
1056 
1057 #endif
1058