1 /*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA
20 *
21 * GPL HEADER END
22 */
23 /*
24 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright (c) 2012, Intel Corporation.
26 */
27 /*
28 * This file is part of Lustre, http://www.lustre.org/
29 * Lustre is a trademark of Sun Microsystems, Inc.
30 *
31 * Author: liang@whamcloud.com
32 */
33
34 #define DEBUG_SUBSYSTEM S_LNET
35
36 #include <linux/cpu.h>
37 #include <linux/sched.h>
38 #include "../../../include/linux/libcfs/libcfs.h"
39
40 #ifdef CONFIG_SMP
41
/**
 * modparam for setting number of partitions
 *
 *  0 : estimate best value based on cores or NUMA nodes
 *  1 : disable multiple partitions
 * >1 : specify number of partitions
 */
static int cpu_npartitions;
module_param(cpu_npartitions, int, 0444);
MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions");
52
/**
 * modparam for setting CPU partitions patterns:
 *
 * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID,
 *      number in bracket is processor ID (core or HT)
 *
 * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket
 *      are NUMA node ID, number before bracket is CPU partition ID.
 *
 * NB: If user specified cpu_pattern, cpu_npartitions will be ignored
 */
static char *cpu_pattern = "";
module_param(cpu_pattern, charp, 0444);
MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern");
67
/* module-global state shared by all CPT tables */
struct cfs_cpt_data {
	/* serialize hotplug etc */
	spinlock_t cpt_lock;
	/* reserved for hotplug: bumped by the CPU notifier so table
	 * builders can detect topology changes during setup */
	unsigned long cpt_version;
	/* mutex to protect cpt_cpumask */
	struct mutex cpt_mutex;
	/* scratch buffer for set/unset_node */
	cpumask_t *cpt_cpumask;
};

static struct cfs_cpt_data cpt_data;
80
/* fill *mask with every core on the same socket as @cpu */
static void cfs_cpu_core_siblings(int cpu, cpumask_t *mask)
{
	const cpumask_t *socket_mask = topology_core_cpumask(cpu);

	cpumask_copy(mask, socket_mask);
}
86
87 /* return cpumask of HTs in the same core */
cfs_cpu_ht_siblings(int cpu,cpumask_t * mask)88 static void cfs_cpu_ht_siblings(int cpu, cpumask_t *mask)
89 {
90 cpumask_copy(mask, topology_thread_cpumask(cpu));
91 }
92
/* fill *mask with all CPUs attached to NUMA node @node */
static void cfs_node_to_cpumask(int node, cpumask_t *mask)
{
	const cpumask_t *node_mask = cpumask_of_node(node);

	cpumask_copy(mask, node_mask);
}
97
/**
 * Free a CPU partition table and everything it owns: the cpu->cpt
 * reverse map, each partition's cpumask/nodemask, the partition
 * array, the table-wide masks, and finally the table itself.
 *
 * Also used by the cfs_cpt_table_alloc() failure path on a partially
 * constructed table, hence every member is NULL-checked first.
 */
void
cfs_cpt_table_free(struct cfs_cpt_table *cptab)
{
	int i;

	/* reverse map is sized by possible (not online) CPUs */
	if (cptab->ctb_cpu2cpt != NULL) {
		LIBCFS_FREE(cptab->ctb_cpu2cpt,
			    num_possible_cpus() *
			    sizeof(cptab->ctb_cpu2cpt[0]));
	}

	/* ctb_parts may be NULL if allocation failed early */
	for (i = 0; cptab->ctb_parts != NULL && i < cptab->ctb_nparts; i++) {
		struct cfs_cpu_partition *part = &cptab->ctb_parts[i];

		if (part->cpt_nodemask != NULL) {
			LIBCFS_FREE(part->cpt_nodemask,
				    sizeof(*part->cpt_nodemask));
		}

		if (part->cpt_cpumask != NULL)
			LIBCFS_FREE(part->cpt_cpumask, cpumask_size());
	}

	if (cptab->ctb_parts != NULL) {
		LIBCFS_FREE(cptab->ctb_parts,
			    cptab->ctb_nparts * sizeof(cptab->ctb_parts[0]));
	}

	if (cptab->ctb_nodemask != NULL)
		LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
	if (cptab->ctb_cpumask != NULL)
		LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size());

	LIBCFS_FREE(cptab, sizeof(*cptab));
}
EXPORT_SYMBOL(cfs_cpt_table_free);
134
/**
 * Allocate an empty CPU partition table with \a ncpt partitions.
 *
 * All masks start zeroed and the cpu->cpt reverse map is filled with
 * -1 ("not assigned").  On any allocation failure the partially built
 * table is torn down with cfs_cpt_table_free() and NULL is returned.
 */
struct cfs_cpt_table *
cfs_cpt_table_alloc(unsigned int ncpt)
{
	struct cfs_cpt_table *cptab;
	int i;

	LIBCFS_ALLOC(cptab, sizeof(*cptab));
	if (cptab == NULL)
		return NULL;

	cptab->ctb_nparts = ncpt;

	LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size());
	LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));

	if (cptab->ctb_cpumask == NULL || cptab->ctb_nodemask == NULL)
		goto failed;

	LIBCFS_ALLOC(cptab->ctb_cpu2cpt,
		     num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
	if (cptab->ctb_cpu2cpt == NULL)
		goto failed;

	/* -1 in every byte yields -1 in every entry: no CPU assigned yet */
	memset(cptab->ctb_cpu2cpt, -1,
	       num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));

	LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0]));
	if (cptab->ctb_parts == NULL)
		goto failed;

	for (i = 0; i < ncpt; i++) {
		struct cfs_cpu_partition *part = &cptab->ctb_parts[i];

		LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size());
		LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask));
		if (part->cpt_cpumask == NULL || part->cpt_nodemask == NULL)
			goto failed;
	}

	spin_lock(&cpt_data.cpt_lock);
	/* Reserved for hotplug: remember the topology generation this
	 * table was built against */
	cptab->ctb_version = cpt_data.cpt_version;
	spin_unlock(&cpt_data.cpt_lock);

	return cptab;

 failed:
	cfs_cpt_table_free(cptab);
	return NULL;
}
EXPORT_SYMBOL(cfs_cpt_table_alloc);
186
/**
 * Print the partition -> CPU assignment of \a cptab into \a buf, one
 * line per partition: "<cpt>\t: <cpu> <cpu> ...".
 *
 * \retval number of bytes written on success
 * \retval -EFBIG if \a len is too small for the whole table
 */
int
cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
{
	char *tmp = buf;
	int rc = 0;
	int i;
	int j;

	for (i = 0; i < cptab->ctb_nparts; i++) {
		if (len > 0) {
			rc = snprintf(tmp, len, "%d\t: ", i);
			len -= rc;
		}

		if (len <= 0) {
			rc = -EFBIG;
			goto out;
		}

		tmp += rc;
		for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) {
			rc = snprintf(tmp, len, "%d ", j);
			len -= rc;
			if (len <= 0) {
				rc = -EFBIG;
				goto out;
			}
			tmp += rc;
		}

		/* the len checks above guarantee one byte remains here */
		*tmp = '\n';
		tmp++;
		len--;
	}

 out:
	if (rc < 0)
		return rc;

	return tmp - buf;
}
EXPORT_SYMBOL(cfs_cpt_table_print);
229
230 int
cfs_cpt_number(struct cfs_cpt_table * cptab)231 cfs_cpt_number(struct cfs_cpt_table *cptab)
232 {
233 return cptab->ctb_nparts;
234 }
235 EXPORT_SYMBOL(cfs_cpt_number);
236
237 int
cfs_cpt_weight(struct cfs_cpt_table * cptab,int cpt)238 cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
239 {
240 LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
241
242 return cpt == CFS_CPT_ANY ?
243 cpumask_weight(cptab->ctb_cpumask) :
244 cpumask_weight(cptab->ctb_parts[cpt].cpt_cpumask);
245 }
246 EXPORT_SYMBOL(cfs_cpt_weight);
247
248 int
cfs_cpt_online(struct cfs_cpt_table * cptab,int cpt)249 cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
250 {
251 LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
252
253 return cpt == CFS_CPT_ANY ?
254 cpumask_any_and(cptab->ctb_cpumask,
255 cpu_online_mask) < nr_cpu_ids :
256 cpumask_any_and(cptab->ctb_parts[cpt].cpt_cpumask,
257 cpu_online_mask) < nr_cpu_ids;
258 }
259 EXPORT_SYMBOL(cfs_cpt_online);
260
261 cpumask_t *
cfs_cpt_cpumask(struct cfs_cpt_table * cptab,int cpt)262 cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt)
263 {
264 LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
265
266 return cpt == CFS_CPT_ANY ?
267 cptab->ctb_cpumask : cptab->ctb_parts[cpt].cpt_cpumask;
268 }
269 EXPORT_SYMBOL(cfs_cpt_cpumask);
270
271 nodemask_t *
cfs_cpt_nodemask(struct cfs_cpt_table * cptab,int cpt)272 cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt)
273 {
274 LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
275
276 return cpt == CFS_CPT_ANY ?
277 cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask;
278 }
279 EXPORT_SYMBOL(cfs_cpt_nodemask);
280
/**
 * Add online CPU \a cpu to partition \a cpt of \a cptab, updating the
 * per-partition and table-wide cpu/node masks.
 *
 * \retval 1 on success
 * \retval 0 if \a cpu is invalid, offline, or already assigned to a
 *	     partition of this table
 */
int
cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
{
	int node;

	LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts);

	if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) {
		CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu);
		return 0;
	}

	/* each CPU may belong to at most one partition per table */
	if (cptab->ctb_cpu2cpt[cpu] != -1) {
		CDEBUG(D_INFO, "CPU %d is already in partition %d\n",
		       cpu, cptab->ctb_cpu2cpt[cpu]);
		return 0;
	}

	cptab->ctb_cpu2cpt[cpu] = cpt;

	LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_cpumask));
	LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask));

	cpumask_set_cpu(cpu, cptab->ctb_cpumask);
	cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);

	node = cpu_to_node(cpu);

	/* first CPU of @node in this CPT table */
	if (!node_isset(node, *cptab->ctb_nodemask))
		node_set(node, *cptab->ctb_nodemask);

	/* first CPU of @node in this partition */
	if (!node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask))
		node_set(node, *cptab->ctb_parts[cpt].cpt_nodemask);

	return 1;
}
EXPORT_SYMBOL(cfs_cpt_set_cpu);
320
321 void
cfs_cpt_unset_cpu(struct cfs_cpt_table * cptab,int cpt,int cpu)322 cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
323 {
324 int node;
325 int i;
326
327 LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
328
329 if (cpu < 0 || cpu >= nr_cpu_ids) {
330 CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu);
331 return;
332 }
333
334 if (cpt == CFS_CPT_ANY) {
335 /* caller doesn't know the partition ID */
336 cpt = cptab->ctb_cpu2cpt[cpu];
337 if (cpt < 0) { /* not set in this CPT-table */
338 CDEBUG(D_INFO, "Try to unset cpu %d which is not in CPT-table %p\n",
339 cpt, cptab);
340 return;
341 }
342
343 } else if (cpt != cptab->ctb_cpu2cpt[cpu]) {
344 CDEBUG(D_INFO,
345 "CPU %d is not in cpu-partition %d\n", cpu, cpt);
346 return;
347 }
348
349 LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask));
350 LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask));
351
352 cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
353 cpumask_clear_cpu(cpu, cptab->ctb_cpumask);
354 cptab->ctb_cpu2cpt[cpu] = -1;
355
356 node = cpu_to_node(cpu);
357
358 LASSERT(node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask));
359 LASSERT(node_isset(node, *cptab->ctb_nodemask));
360
361 for_each_cpu(i, cptab->ctb_parts[cpt].cpt_cpumask) {
362 /* this CPT has other CPU belonging to this node? */
363 if (cpu_to_node(i) == node)
364 break;
365 }
366
367 if (i >= nr_cpu_ids)
368 node_clear(node, *cptab->ctb_parts[cpt].cpt_nodemask);
369
370 for_each_cpu(i, cptab->ctb_cpumask) {
371 /* this CPT-table has other CPU belonging to this node? */
372 if (cpu_to_node(i) == node)
373 break;
374 }
375
376 if (i >= nr_cpu_ids)
377 node_clear(node, *cptab->ctb_nodemask);
378
379 return;
380 }
381 EXPORT_SYMBOL(cfs_cpt_unset_cpu);
382
383 int
cfs_cpt_set_cpumask(struct cfs_cpt_table * cptab,int cpt,cpumask_t * mask)384 cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
385 {
386 int i;
387
388 if (cpumask_weight(mask) == 0 ||
389 cpumask_any_and(mask, cpu_online_mask) >= nr_cpu_ids) {
390 CDEBUG(D_INFO, "No online CPU is found in the CPU mask for CPU partition %d\n",
391 cpt);
392 return 0;
393 }
394
395 for_each_cpu(i, mask) {
396 if (!cfs_cpt_set_cpu(cptab, cpt, i))
397 return 0;
398 }
399
400 return 1;
401 }
402 EXPORT_SYMBOL(cfs_cpt_set_cpumask);
403
404 void
cfs_cpt_unset_cpumask(struct cfs_cpt_table * cptab,int cpt,cpumask_t * mask)405 cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
406 {
407 int i;
408
409 for_each_cpu(i, mask)
410 cfs_cpt_unset_cpu(cptab, cpt, i);
411 }
412 EXPORT_SYMBOL(cfs_cpt_unset_cpumask);
413
414 int
cfs_cpt_set_node(struct cfs_cpt_table * cptab,int cpt,int node)415 cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
416 {
417 cpumask_t *mask;
418 int rc;
419
420 if (node < 0 || node >= MAX_NUMNODES) {
421 CDEBUG(D_INFO,
422 "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
423 return 0;
424 }
425
426 mutex_lock(&cpt_data.cpt_mutex);
427
428 mask = cpt_data.cpt_cpumask;
429 cfs_node_to_cpumask(node, mask);
430
431 rc = cfs_cpt_set_cpumask(cptab, cpt, mask);
432
433 mutex_unlock(&cpt_data.cpt_mutex);
434
435 return rc;
436 }
437 EXPORT_SYMBOL(cfs_cpt_set_node);
438
439 void
cfs_cpt_unset_node(struct cfs_cpt_table * cptab,int cpt,int node)440 cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
441 {
442 cpumask_t *mask;
443
444 if (node < 0 || node >= MAX_NUMNODES) {
445 CDEBUG(D_INFO,
446 "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
447 return;
448 }
449
450 mutex_lock(&cpt_data.cpt_mutex);
451
452 mask = cpt_data.cpt_cpumask;
453 cfs_node_to_cpumask(node, mask);
454
455 cfs_cpt_unset_cpumask(cptab, cpt, mask);
456
457 mutex_unlock(&cpt_data.cpt_mutex);
458 }
459 EXPORT_SYMBOL(cfs_cpt_unset_node);
460
461 int
cfs_cpt_set_nodemask(struct cfs_cpt_table * cptab,int cpt,nodemask_t * mask)462 cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
463 {
464 int i;
465
466 for_each_node_mask(i, *mask) {
467 if (!cfs_cpt_set_node(cptab, cpt, i))
468 return 0;
469 }
470
471 return 1;
472 }
473 EXPORT_SYMBOL(cfs_cpt_set_nodemask);
474
475 void
cfs_cpt_unset_nodemask(struct cfs_cpt_table * cptab,int cpt,nodemask_t * mask)476 cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
477 {
478 int i;
479
480 for_each_node_mask(i, *mask)
481 cfs_cpt_unset_node(cptab, cpt, i);
482 }
483 EXPORT_SYMBOL(cfs_cpt_unset_nodemask);
484
485 void
cfs_cpt_clear(struct cfs_cpt_table * cptab,int cpt)486 cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt)
487 {
488 int last;
489 int i;
490
491 if (cpt == CFS_CPT_ANY) {
492 last = cptab->ctb_nparts - 1;
493 cpt = 0;
494 } else {
495 last = cpt;
496 }
497
498 for (; cpt <= last; cpt++) {
499 for_each_cpu(i, cptab->ctb_parts[cpt].cpt_cpumask)
500 cfs_cpt_unset_cpu(cptab, cpt, i);
501 }
502 }
503 EXPORT_SYMBOL(cfs_cpt_clear);
504
/**
 * Pick a NUMA node round-robin from partition \a cpt's nodemask (or
 * from the whole table when \a cpt is out of range, e.g. CFS_CPT_ANY)
 * so allocations can be spread across the partition's nodes.
 *
 * NOTE(review): the rotor increment is not locked — presumably
 * approximate fairness is acceptable; confirm if callers need strict
 * round-robin.
 */
int
cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
{
	nodemask_t *mask;
	int weight;
	int rotor;
	int node;

	/* convert CPU partition ID to HW node id */

	if (cpt < 0 || cpt >= cptab->ctb_nparts) {
		mask = cptab->ctb_nodemask;
		rotor = cptab->ctb_spread_rotor++;
	} else {
		mask = cptab->ctb_parts[cpt].cpt_nodemask;
		rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++;
	}

	weight = nodes_weight(*mask);
	LASSERT(weight > 0);

	rotor %= weight;

	/* return the rotor-th set node of the mask */
	for_each_node_mask(node, *mask) {
		if (rotor-- == 0)
			return node;
	}

	LBUG();
	return 0;
}
EXPORT_SYMBOL(cfs_cpt_spread_node);
537
/**
 * Return the partition ID the calling CPU belongs to.
 *
 * If the current CPU is not assigned in \a cptab: return -1 when
 * \a remap is zero, otherwise map it to (cpu % nparts) so upper
 * layers never see a negative partition ID.
 *
 * NOTE(review): smp_processor_id() is used without pinning, so in a
 * preemptible context the thread may migrate right after the lookup —
 * presumably callers only need a hint; confirm before relying on it.
 */
int
cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
{
	int cpu = smp_processor_id();
	int cpt = cptab->ctb_cpu2cpt[cpu];

	if (cpt < 0) {
		if (!remap)
			return cpt;

		/* don't return negative value for safety of upper layer,
		 * instead we shadow the unknown cpu to a valid partition ID */
		cpt = cpu % cptab->ctb_nparts;
	}

	return cpt;
}
EXPORT_SYMBOL(cfs_cpt_current);
556
557 int
cfs_cpt_of_cpu(struct cfs_cpt_table * cptab,int cpu)558 cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
559 {
560 LASSERT(cpu >= 0 && cpu < nr_cpu_ids);
561
562 return cptab->ctb_cpu2cpt[cpu];
563 }
564 EXPORT_SYMBOL(cfs_cpt_of_cpu);
565
/**
 * Migrate the current thread onto the CPUs/nodes of partition \a cpt
 * (or of the whole table for CFS_CPT_ANY) by restricting its CPU and
 * memory affinity.
 *
 * \retval 0 on success, or if the partition already covers every
 *	     online CPU (binding would be a no-op)
 * \retval -EINVAL if the partition has no online CPU
 */
int
cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
{
	cpumask_t *cpumask;
	nodemask_t *nodemask;
	int rc;
	int i;

	LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));

	if (cpt == CFS_CPT_ANY) {
		cpumask = cptab->ctb_cpumask;
		nodemask = cptab->ctb_nodemask;
	} else {
		cpumask = cptab->ctb_parts[cpt].cpt_cpumask;
		nodemask = cptab->ctb_parts[cpt].cpt_nodemask;
	}

	if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids) {
		CERROR("No online CPU found in CPU partition %d, did someone do CPU hotplug on system? You might need to reload Lustre modules to keep system working well.\n",
		       cpt);
		return -EINVAL;
	}

	/* only restrict affinity if some online CPU lies outside the
	 * partition; the loop body runs at most once */
	for_each_online_cpu(i) {
		if (cpumask_test_cpu(i, cpumask))
			continue;

		rc = set_cpus_allowed_ptr(current, cpumask);
		set_mems_allowed(*nodemask);
		if (rc == 0)
			schedule(); /* switch to allowed CPU */

		return rc;
	}

	/* don't need to set affinity because all online CPUs are covered */
	return 0;
}
EXPORT_SYMBOL(cfs_cpt_bind);
606
/**
 * Choose max to \a number CPUs from \a node and set them in \a cpt.
 * We always prefer to choose CPU in the same core/socket.
 *
 * Selected CPUs are removed from \a node as they are taken.
 *
 * \retval 0 on success
 * \retval -EINVAL if a CPU could not be added to the partition
 * \retval -ENOMEM if scratch cpumask allocation fails
 */
static int
cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt,
		     cpumask_t *node, int number)
{
	cpumask_t *socket = NULL;
	cpumask_t *core = NULL;
	int rc = 0;
	int cpu;

	LASSERT(number > 0);

	/* fast path: take every remaining CPU of the node */
	if (number >= cpumask_weight(node)) {
		while (!cpumask_empty(node)) {
			cpu = cpumask_first(node);

			rc = cfs_cpt_set_cpu(cptab, cpt, cpu);
			if (!rc)
				return -EINVAL;
			cpumask_clear_cpu(cpu, node);
		}
		return 0;
	}

	/* allocate scratch buffer */
	LIBCFS_ALLOC(socket, cpumask_size());
	LIBCFS_ALLOC(core, cpumask_size());
	if (socket == NULL || core == NULL) {
		rc = -ENOMEM;
		goto out;
	}

	while (!cpumask_empty(node)) {
		cpu = cpumask_first(node);

		/* get cpumask for cores in the same socket */
		cfs_cpu_core_siblings(cpu, socket);
		cpumask_and(socket, socket, node);

		LASSERT(!cpumask_empty(socket));

		while (!cpumask_empty(socket)) {
			int i;

			/* get cpumask for hts in the same core */
			cfs_cpu_ht_siblings(cpu, core);
			cpumask_and(core, core, node);

			LASSERT(!cpumask_empty(core));

			/* take whole cores at a time so sibling HTs end up
			 * in the same partition */
			for_each_cpu(i, core) {
				cpumask_clear_cpu(i, socket);
				cpumask_clear_cpu(i, node);

				rc = cfs_cpt_set_cpu(cptab, cpt, i);
				if (!rc) {
					rc = -EINVAL;
					goto out;
				}

				if (--number == 0)
					goto out;
			}
			cpu = cpumask_first(socket);
		}
	}

 out:
	if (socket != NULL)
		LIBCFS_FREE(socket, cpumask_size());
	if (core != NULL)
		LIBCFS_FREE(core, cpumask_size());
	return rc;
}
684
/* systems with this many CPUs or fewer get a single partition */
#define CPT_WEIGHT_MIN 4u

/**
 * Estimate a reasonable number of CPU partitions from the online CPU
 * and NUMA node counts.  The result always divides num_online_cpus().
 */
static unsigned int
cfs_cpt_num_estimate(void)
{
	unsigned nnode = num_online_nodes();
	unsigned ncpu = num_online_cpus();
	unsigned ncpt;

	if (ncpu <= CPT_WEIGHT_MIN) {
		ncpt = 1;
		goto out;
	}

	/* generate reasonable number of CPU partitions based on total number
	 * of CPUs, Preferred N should be power2 and match this condition:
	 * 2 * (N - 1)^2 < NCPUS <= 2 * N^2 */
	for (ncpt = 2; ncpu > 2 * ncpt * ncpt; ncpt <<= 1) {}

	if (ncpt <= nnode) { /* fat numa system */
		while (nnode > ncpt)
			nnode >>= 1;

	} else { /* ncpt > nnode */
		while ((nnode << 1) <= ncpt)
			nnode <<= 1;
	}

	/* snap the estimate to a power-of-2 multiple of the node count */
	ncpt = nnode;

 out:
#if (BITS_PER_LONG == 32)
	/* config many CPU partitions on 32-bit system could consume
	 * too much memory */
	ncpt = min(2U, ncpt);
#endif
	/* shrink until ncpt evenly divides the online CPU count */
	while (ncpu % ncpt != 0)
		ncpt--; /* worst case is 1 */

	return ncpt;
}
726
/**
 * Build a CPU partition table with \a ncpt partitions (<= 0 means
 * "use cfs_cpt_num_estimate()"), distributing the online CPUs evenly
 * across partitions one NUMA node at a time.
 *
 * Returns the new table, or NULL on error (uneven CPU count,
 * allocation failure, or concurrent CPU hotplug).
 */
static struct cfs_cpt_table *
cfs_cpt_table_create(int ncpt)
{
	struct cfs_cpt_table *cptab = NULL;
	cpumask_t *mask = NULL;
	int cpt = 0;
	int num;
	int rc;
	int i;

	rc = cfs_cpt_num_estimate();
	if (ncpt <= 0)
		ncpt = rc;

	if (ncpt > num_online_cpus() || ncpt > 4 * rc) {
		CWARN("CPU partition number %d is larger than suggested value (%d), your system may have performance issue or run out of memory while under pressure\n",
		      ncpt, rc);
	}

	if (num_online_cpus() % ncpt != 0) {
		CERROR("CPU number %d is not multiple of cpu_npartition %d, please try different cpu_npartitions value or set pattern string by cpu_pattern=STRING\n",
		       (int)num_online_cpus(), ncpt);
		goto failed;
	}

	cptab = cfs_cpt_table_alloc(ncpt);
	if (cptab == NULL) {
		CERROR("Failed to allocate CPU map(%d)\n", ncpt);
		goto failed;
	}

	/* target CPU count of each partition */
	num = num_online_cpus() / ncpt;
	if (num == 0) {
		CERROR("CPU changed while setting CPU partition\n");
		goto failed;
	}

	LIBCFS_ALLOC(mask, cpumask_size());
	if (mask == NULL) {
		CERROR("Failed to allocate scratch cpumask\n");
		goto failed;
	}

	/* fill partitions in order, draining one node at a time so each
	 * partition stays as NUMA-local as possible */
	for_each_online_node(i) {
		cfs_node_to_cpumask(i, mask);

		while (!cpumask_empty(mask)) {
			struct cfs_cpu_partition *part;
			int n;

			if (cpt >= ncpt)
				goto failed;

			part = &cptab->ctb_parts[cpt];

			/* CPUs still needed to fill this partition */
			n = num - cpumask_weight(part->cpt_cpumask);
			LASSERT(n > 0);

			rc = cfs_cpt_choose_ncpus(cptab, cpt, mask, n);
			if (rc < 0)
				goto failed;

			LASSERT(num >= cpumask_weight(part->cpt_cpumask));
			if (num == cpumask_weight(part->cpt_cpumask))
				cpt++;
		}
	}

	/* sanity check against concurrent CPU hotplug */
	if (cpt != ncpt ||
	    num != cpumask_weight(cptab->ctb_parts[ncpt - 1].cpt_cpumask)) {
		CERROR("Expect %d(%d) CPU partitions but got %d(%d), CPU hotplug/unplug while setting?\n",
		       cptab->ctb_nparts, num, cpt,
		       cpumask_weight(cptab->ctb_parts[ncpt - 1].cpt_cpumask));
		goto failed;
	}

	LIBCFS_FREE(mask, cpumask_size());

	return cptab;

 failed:
	CERROR("Failed to setup CPU-partition-table with %d CPU-partitions, online HW nodes: %d, HW cpus: %d.\n",
	       ncpt, num_online_nodes(), num_online_cpus());

	if (mask != NULL)
		LIBCFS_FREE(mask, cpumask_size());

	if (cptab != NULL)
		cfs_cpt_table_free(cptab);

	return NULL;
}
819
/**
 * Build a CPU partition table from a pattern string such as
 * "0[0,1] 1[2,3]" (CPU IDs in brackets) or "N 0[0] 1[1]" (leading
 * 'N'/'n': NUMA node IDs in brackets).
 *
 * Returns the new table, or NULL on any parse or setup error.
 */
static struct cfs_cpt_table *
cfs_cpt_table_create_pattern(char *pattern)
{
	struct cfs_cpt_table *cptab;
	char *str = pattern;
	int node = 0;
	int high;
	int ncpt;
	int c;

	/* number of '[' == number of partitions in the pattern */
	for (ncpt = 0;; ncpt++) { /* quick scan bracket */
		str = strchr(str, '[');
		if (str == NULL)
			break;
		str++;
	}

	/* leading 'n'/'N' selects NUMA-node mode */
	str = cfs_trimwhite(pattern);
	if (*str == 'n' || *str == 'N') {
		pattern = str + 1;
		node = 1;
	}

	if (ncpt == 0 ||
	    (node && ncpt > num_online_nodes()) ||
	    (!node && ncpt > num_online_cpus())) {
		CERROR("Invalid pattern %s, or too many partitions %d\n",
		       pattern, ncpt);
		return NULL;
	}

	/* largest ID allowed inside brackets */
	high = node ? MAX_NUMNODES - 1 : nr_cpu_ids - 1;

	cptab = cfs_cpt_table_alloc(ncpt);
	if (cptab == NULL) {
		CERROR("Failed to allocate cpu partition table\n");
		return NULL;
	}

	/* parse "<cpt>[<range-expr>]" groups one at a time */
	for (str = cfs_trimwhite(pattern), c = 0;; c++) {
		struct cfs_range_expr *range;
		struct cfs_expr_list *el;
		char *bracket = strchr(str, '[');
		int cpt;
		int rc;
		int i;
		int n;

		if (bracket == NULL) {
			if (*str != 0) {
				CERROR("Invalid pattern %s\n", str);
				goto failed;
			} else if (c != ncpt) {
				CERROR("expect %d partitions but found %d\n",
				       ncpt, c);
				goto failed;
			}
			break;
		}

		if (sscanf(str, "%d%n", &cpt, &n) < 1) {
			CERROR("Invalid cpu pattern %s\n", str);
			goto failed;
		}

		if (cpt < 0 || cpt >= ncpt) {
			CERROR("Invalid partition id %d, total partitions %d\n",
			       cpt, ncpt);
			goto failed;
		}

		/* each partition may appear only once in the pattern */
		if (cfs_cpt_weight(cptab, cpt) != 0) {
			CERROR("Partition %d has already been set.\n", cpt);
			goto failed;
		}

		str = cfs_trimwhite(str + n);
		if (str != bracket) {
			CERROR("Invalid pattern %s\n", str);
			goto failed;
		}

		bracket = strchr(str, ']');
		if (bracket == NULL) {
			CERROR("missing right bracket for cpt %d, %s\n",
			       cpt, str);
			goto failed;
		}

		if (cfs_expr_list_parse(str, (bracket - str) + 1,
					0, high, &el) != 0) {
			CERROR("Can't parse number range: %s\n", str);
			goto failed;
		}

		/* expand each lo-hi/stride expression into node/CPU sets */
		list_for_each_entry(range, &el->el_exprs, re_link) {
			for (i = range->re_lo; i <= range->re_hi; i++) {
				if ((i - range->re_lo) % range->re_stride != 0)
					continue;

				rc = node ? cfs_cpt_set_node(cptab, cpt, i) :
					    cfs_cpt_set_cpu(cptab, cpt, i);
				if (!rc) {
					cfs_expr_list_free(el);
					goto failed;
				}
			}
		}

		cfs_expr_list_free(el);

		if (!cfs_cpt_online(cptab, cpt)) {
			CERROR("No online CPU is found on partition %d\n", cpt);
			goto failed;
		}

		str = cfs_trimwhite(bracket + 1);
	}

	return cptab;

 failed:
	cfs_cpt_table_free(cptab);
	return NULL;
}
945
946 #ifdef CONFIG_HOTPLUG_CPU
/**
 * CPU hotplug callback: bump cpt_version on online/offline events so
 * in-flight table setups can detect the change, and warn when a whole
 * core (all its HTs) goes offline, since existing affinity settings
 * may then be broken.
 */
static int
cfs_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	bool warn;

	switch (action) {
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		spin_lock(&cpt_data.cpt_lock);
		cpt_data.cpt_version++;
		spin_unlock(&cpt_data.cpt_lock);
		/* fallthrough: DEAD events also take the default path */
	default:
		if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) {
			CDEBUG(D_INFO, "CPU changed [cpu %u action %lx]\n",
			       cpu, action);
			break;
		}

		mutex_lock(&cpt_data.cpt_mutex);
		/* if all HTs in a core are offline, it may break affinity */
		cfs_cpu_ht_siblings(cpu, cpt_data.cpt_cpumask);
		warn = cpumask_any_and(cpt_data.cpt_cpumask,
				       cpu_online_mask) >= nr_cpu_ids;
		mutex_unlock(&cpt_data.cpt_mutex);
		CDEBUG(warn ? D_WARNING : D_INFO,
		       "Lustre: can't support CPU plug-out well now, performance and stability could be impacted [CPU %u action: %lx]\n",
		       cpu, action);
	}

	return NOTIFY_OK;
}

static struct notifier_block cfs_cpu_notifier = {
	.notifier_call = cfs_cpu_notify,
	.priority = 0
};
986
987 #endif
988
/**
 * Module teardown: free the global partition table, unregister the
 * hotplug notifier, and release the scratch cpumask.  Also used as
 * the cfs_cpu_init() failure path, hence the NULL checks.
 */
void
cfs_cpu_fini(void)
{
	if (cfs_cpt_table != NULL)
		cfs_cpt_table_free(cfs_cpt_table);

#ifdef CONFIG_HOTPLUG_CPU
	unregister_hotcpu_notifier(&cfs_cpu_notifier);
#endif
	if (cpt_data.cpt_cpumask != NULL)
		LIBCFS_FREE(cpt_data.cpt_cpumask, cpumask_size());
}
1001
/**
 * Module init: set up global state, register the CPU hotplug
 * notifier, then build cfs_cpt_table either from the cpu_pattern
 * string (takes precedence) or from cpu_npartitions.
 *
 * \retval 0 on success
 * \retval -1 on failure (cfs_cpu_fini() undoes partial setup)
 */
int
cfs_cpu_init(void)
{
	LASSERT(cfs_cpt_table == NULL);

	memset(&cpt_data, 0, sizeof(cpt_data));

	LIBCFS_ALLOC(cpt_data.cpt_cpumask, cpumask_size());
	if (cpt_data.cpt_cpumask == NULL) {
		CERROR("Failed to allocate scratch buffer\n");
		return -1;
	}

	spin_lock_init(&cpt_data.cpt_lock);
	mutex_init(&cpt_data.cpt_mutex);

#ifdef CONFIG_HOTPLUG_CPU
	register_hotcpu_notifier(&cfs_cpu_notifier);
#endif

	/* a non-empty pattern string overrides cpu_npartitions */
	if (*cpu_pattern != 0) {
		cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern);
		if (cfs_cpt_table == NULL) {
			CERROR("Failed to create cptab from pattern %s\n",
			       cpu_pattern);
			goto failed;
		}

	} else {
		cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions);
		if (cfs_cpt_table == NULL) {
			CERROR("Failed to create ptable with npartitions %d\n",
			       cpu_npartitions);
			goto failed;
		}
	}

	/* fail if CPU topology changed while the table was being built */
	spin_lock(&cpt_data.cpt_lock);
	if (cfs_cpt_table->ctb_version != cpt_data.cpt_version) {
		spin_unlock(&cpt_data.cpt_lock);
		CERROR("CPU hotplug/unplug during setup\n");
		goto failed;
	}
	spin_unlock(&cpt_data.cpt_lock);

	LCONSOLE(0, "HW CPU cores: %d, npartitions: %d\n",
		 num_online_cpus(), cfs_cpt_number(cfs_cpt_table));
	return 0;

 failed:
	cfs_cpu_fini();
	return -1;
}
1055
1056 #endif
1057