This source file includes following definitions.
- cpu_map_alloc
- get_cpu_map_entry
- cpu_map_kthread_stop
- cpu_map_build_skb
- __cpu_map_ring_cleanup
- put_cpu_map_entry
- cpu_map_kthread_run
- __cpu_map_entry_alloc
- __cpu_map_entry_free
- __cpu_map_entry_replace
- cpu_map_delete_elem
- cpu_map_update_elem
- cpu_map_free
- __cpu_map_lookup_elem
- cpu_map_lookup_elem
- cpu_map_get_next_key
- bq_flush_to_queue
- bq_enqueue
- cpu_map_enqueue
- __cpu_map_flush
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 #include <linux/bpf.h>
20 #include <linux/filter.h>
21 #include <linux/ptr_ring.h>
22 #include <net/xdp.h>
23
24 #include <linux/sched.h>
25 #include <linux/workqueue.h>
26 #include <linux/kthread.h>
27 #include <linux/capability.h>
28 #include <trace/events/xdp.h>
29
30 #include <linux/netdevice.h>
31 #include <linux/etherdevice.h>
32
33
34
35
36
37
38
39
/* General idea: XDP packets getting XDP-redirected to another CPU will
 * be stored/queued for at most one driver ->poll() call.  Enqueue and
 * flush happen on the same CPU, so the flush operation can locate the
 * pending bulk queues via this_cpu_ptr().
 */
#define CPU_MAP_BULK_SIZE 8

struct bpf_cpu_map_entry;
struct bpf_cpu_map;

/* Per-CPU staging area: frames are bulked here before being flushed
 * into the destination CPU's ptr_ring (amortizes producer locking).
 */
struct xdp_bulk_queue {
	void *q[CPU_MAP_BULK_SIZE];	/* staged xdp_frame pointers */
	struct list_head flush_node;	/* linkage on per-CPU flush_list */
	struct bpf_cpu_map_entry *obj;	/* back-pointer to owning entry */
	unsigned int count;		/* number of valid entries in q[] */
};

/* Struct for every remote "destination" CPU in the map */
struct bpf_cpu_map_entry {
	u32 cpu;	/* kthread CPU and map index */
	int map_id;	/* back reference to map (used by tracepoints) */
	u32 qsize;	/* queue size, also the value returned on lookup */

	/* XDP can run on multiple RX-ring queues, need __percpu enqueue store */
	struct xdp_bulk_queue __percpu *bulkq;

	struct bpf_cpu_map *cmap;

	/* Queue with potential multi-producers, and single-consumer kthread */
	struct ptr_ring *queue;
	struct task_struct *kthread;
	struct work_struct kthread_stop_wq;

	atomic_t refcnt;	/* controls when this struct can be freed */
	struct rcu_head rcu;
};

struct bpf_cpu_map {
	struct bpf_map map;
	/* Below members specific for this map type */
	struct bpf_cpu_map_entry **cpu_map;
	struct list_head __percpu *flush_list;
};

static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx);
79
/* Allocate and initialize a cpumap.  Key/value are both u32; entries
 * are created later via cpu_map_update_elem().  Returns the embedded
 * bpf_map on success, ERR_PTR() on failure.
 */
static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
	struct bpf_cpu_map *cmap;
	int err = -ENOMEM;
	int ret, cpu;
	u64 cost;

	if (!capable(CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
		return ERR_PTR(-EINVAL);

	cmap = kzalloc(sizeof(*cmap), GFP_USER);
	if (!cmap)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&cmap->map, attr);

	/* Pre-limit array size based on NR_CPUS, not the final CPU check */
	if (cmap->map.max_entries > NR_CPUS) {
		err = -E2BIG;
		goto free_cmap;
	}

	/* Cost accounting: entry pointer array plus per-CPU flush lists */
	cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
	cost += sizeof(struct list_head) * num_possible_cpus();

	/* Charges against the memlock limit; fails if cost exceeds it */
	ret = bpf_map_charge_init(&cmap->map.memory, cost);
	if (ret) {
		err = ret;
		goto free_cmap;
	}

	cmap->flush_list = alloc_percpu(struct list_head);
	if (!cmap->flush_list)
		goto free_charge;

	for_each_possible_cpu(cpu)
		INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu));

	/* Alloc array for possible remote "destination" CPUs */
	cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
					   sizeof(struct bpf_cpu_map_entry *),
					   cmap->map.numa_node);
	if (!cmap->cpu_map)
		goto free_percpu;

	return &cmap->map;
free_percpu:
	free_percpu(cmap->flush_list);
free_charge:
	bpf_map_charge_finish(&cmap->map.memory);
free_cmap:
	kfree(cmap);
	return ERR_PTR(err);
}
141
/* Take a reference on @rcpu; paired with put_cpu_map_entry(). */
static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
{
	atomic_inc(&rcpu->refcnt);
}
146
147
/* Runs from a workqueue (see __cpu_map_entry_replace()), because the
 * map-update path cannot block while stopping the kthread.
 */
static void cpu_map_kthread_stop(struct work_struct *work)
{
	struct bpf_cpu_map_entry *rcpu;

	rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);

	/* Wait for the flush done in __cpu_map_entry_free() via a full
	 * RCU barrier: it waits until all in-flight call_rcu() callbacks
	 * have completed, so the queue cannot gain new frames afterwards.
	 */
	rcu_barrier();

	/* kthread_stop() will wake_up_process() and wait for it to exit */
	kthread_stop(rcpu->kthread);
}
162
/* Build an SKB around the memory backing @xdpf, using the pre-allocated
 * skb head @skb (from the bulk allocation in cpu_map_kthread_run()).
 * Returns the initialized skb, or NULL on failure (caller must then
 * release the frame itself).
 */
static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
					 struct xdp_frame *xdpf,
					 struct sk_buff *skb)
{
	unsigned int hard_start_headroom;
	unsigned int frame_size;
	void *pkt_data_start;

	/* Part of the headroom is occupied by struct xdp_frame itself */
	hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom;

	/* Size of the buffer handed to build_skb_around(): packet data
	 * plus headroom, with aligned tailroom for skb_shared_info.
	 * NOTE(review): this assumes the memory backing the xdp_frame
	 * already reserved room for the shared_info tailroom — confirm
	 * against the driver-side frame layout.
	 */
	frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) +
		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	pkt_data_start = xdpf->data - hard_start_headroom;
	skb = build_skb_around(skb, pkt_data_start, frame_size);
	if (unlikely(!skb))
		return NULL;

	skb_reserve(skb, hard_start_headroom);
	__skb_put(skb, xdpf->len);
	if (xdpf->metasize)
		skb_metadata_set(skb, xdpf->metasize);

	/* Essential SKB info: protocol and skb->dev */
	skb->protocol = eth_type_trans(skb, xdpf->dev_rx);

	/* Optional SKB info not carried over from the XDP frame here:
	 * - HW checksum info (skb->ip_summed)
	 * - HW RX hash (skb_set_hash)
	 * - RX ring dev queue index (skb_record_rx_queue)
	 */

	/* Release any driver-held resources tied to the frame */
	xdp_release_frame(xdpf);

	/* Allow SKB to reuse the area used by the xdp_frame struct */
	xdp_scrub_frame(xdpf);

	return skb;
}
221
/* Drain any frames still sitting in @ring and return them.
 *
 * The tear-down procedure (entry replace + kthread stop) should have
 * left the queue empty by the time this runs; finding a frame here is
 * a bug, hence the WARN_ON_ONCE — but handle it gracefully by freeing
 * the frame anyway rather than leaking it.
 */
static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
{
	struct xdp_frame *xdpf;

	while ((xdpf = ptr_ring_consume(ring)))
		if (WARN_ON_ONCE(xdpf))
			xdp_return_frame(xdpf);
}
235
/* Drop a reference on @rcpu; on the final put, drain and destroy the
 * ptr_ring and free the entry itself.
 */
static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
{
	if (atomic_dec_and_test(&rcpu->refcnt)) {
		/* Queue should be empty here; cleanup warns if not */
		__cpu_map_ring_cleanup(rcpu->queue);
		ptr_ring_cleanup(rcpu->queue, NULL);
		kfree(rcpu->queue);
		kfree(rcpu);
	}
}
246
#define CPUMAP_BATCH 8

/* Main loop of the per-entry kthread, pinned to the destination CPU.
 * Consumes xdp_frames from the entry's ptr_ring in batches, converts
 * them to SKBs and injects them into the network stack.  Holds one
 * refcnt on @rcpu (taken in __cpu_map_entry_alloc()), dropped on exit.
 */
static int cpu_map_kthread_run(void *data)
{
	struct bpf_cpu_map_entry *rcpu = data;

	set_current_state(TASK_INTERRUPTIBLE);

	/* Once a stop order is given, the rcpu has been disconnected from
	 * the map, so no new packets can be enqueued.  Keep running until
	 * both the stop order arrives AND the queue has been drained, so
	 * remaining in-flight frames are not lost.
	 */
	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
		unsigned int drops = 0, sched = 0;
		void *frames[CPUMAP_BATCH];
		void *skbs[CPUMAP_BATCH];
		gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
		int i, n, m;

		/* Release CPU: sleep when the queue is empty */
		if (__ptr_ring_empty(rcpu->queue)) {
			set_current_state(TASK_INTERRUPTIBLE);
			/* Recheck after setting state to avoid a lost
			 * wake-up race with the producer's wake_up_process().
			 */
			if (__ptr_ring_empty(rcpu->queue)) {
				schedule();
				sched = 1;
			} else {
				__set_current_state(TASK_RUNNING);
			}
		} else {
			sched = cond_resched();
		}

		/* The bpf_cpu_map_entry is single-consumer, with this
		 * kthread pinned to one CPU.  Lockless access to the
		 * ptr_ring consume side is valid (no resize allowed).
		 */
		n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH);

		for (i = 0; i < n; i++) {
			void *f = frames[i];
			struct page *page = virt_to_page(f);

			/* Bring struct page memory area to current CPU
			 * before the build/free paths touch it.
			 */
			prefetchw(page);
		}

		m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs);
		if (unlikely(m == 0)) {
			/* Bulk alloc failed: NULL skbs make the loop
			 * below return each frame (counted as drops).
			 */
			for (i = 0; i < n; i++)
				skbs[i] = NULL;
			drops = n;
		}

		local_bh_disable();
		for (i = 0; i < n; i++) {
			struct xdp_frame *xdpf = frames[i];
			struct sk_buff *skb = skbs[i];
			int ret;

			skb = cpu_map_build_skb(rcpu, xdpf, skb);
			if (!skb) {
				xdp_return_frame(xdpf);
				continue;
			}

			/* Inject into the network stack */
			ret = netif_receive_skb_core(skb);
			if (ret == NET_RX_DROP)
				drops++;
		}
		/* Feedback loop via tracepoint */
		trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched);

		local_bh_enable(); /* resched point, may run softirqs */
	}
	__set_current_state(TASK_RUNNING);

	put_cpu_map_entry(rcpu);
	return 0;
}
333
/* Allocate a cpumap entry targeting @cpu with a ptr_ring of @qsize
 * slots, and spawn its consumer kthread pinned to that CPU.
 * Returns NULL on any allocation/creation failure (all partial
 * allocations are unwound via the goto chain).
 */
static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
						       int map_id)
{
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	struct bpf_cpu_map_entry *rcpu;
	struct xdp_bulk_queue *bq;
	int numa, err, i;

	/* Have map->numa_node, but prefer the node of the target CPU */
	numa = cpu_to_node(cpu);

	rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa);
	if (!rcpu)
		return NULL;

	/* Alloc percpu bulkq (one staging queue per enqueueing CPU) */
	rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq),
					 sizeof(void *), gfp);
	if (!rcpu->bulkq)
		goto free_rcu;

	for_each_possible_cpu(i) {
		bq = per_cpu_ptr(rcpu->bulkq, i);
		bq->obj = rcpu;
	}

	/* Alloc queue */
	rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
	if (!rcpu->queue)
		goto free_bulkq;

	err = ptr_ring_init(rcpu->queue, qsize, gfp);
	if (err)
		goto free_queue;

	rcpu->cpu = cpu;
	rcpu->map_id = map_id;
	rcpu->qsize = qsize;

	/* Setup kthread */
	rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
					       "cpumap/%d/map:%d", cpu, map_id);
	if (IS_ERR(rcpu->kthread))
		goto free_ptr_ring;

	get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
	get_cpu_map_entry(rcpu); /* 1-refcnt held by the kthread */

	/* Make sure the kthread runs on a single CPU */
	kthread_bind(rcpu->kthread, cpu);
	wake_up_process(rcpu->kthread);

	return rcpu;

free_ptr_ring:
	ptr_ring_cleanup(rcpu->queue, NULL);
free_queue:
	kfree(rcpu->queue);
free_bulkq:
	free_percpu(rcpu->bulkq);
free_rcu:
	kfree(rcpu);
	return NULL;
}
398
/* RCU callback scheduled from __cpu_map_entry_replace().
 *
 * At this point the entry has been disconnected from the map and one
 * RCU grace period has elapsed, so XDP cannot enqueue any new packets
 * into the per-CPU bulk queues.
 */
static void __cpu_map_entry_free(struct rcu_head *rcu)
{
	struct bpf_cpu_map_entry *rcpu;
	int cpu;

	rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);

	/* Flush remaining packets from the percpu bulkq into the ring */
	for_each_online_cpu(cpu) {
		struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);

		/* No concurrent bq_enqueue can run at this point */
		bq_flush_to_queue(bq, false);
	}
	free_percpu(rcpu->bulkq);
	/* Cannot kthread_stop() here; last put frees rcpu resources */
	put_cpu_map_entry(rcpu);
}
422
/* Swap the entry at @key_cpu for @rcpu (possibly NULL) and tear the old
 * entry down.
 *
 * After the xchg, call_rcu() ensures any driver-side RCU read sections
 * have completed before __cpu_map_entry_free() flushes the remaining
 * per-CPU bulk queues into the ring.  The old entry's kthread may still
 * be consuming pending frames, so its final stop is deferred to a
 * workqueue (cpu_map_kthread_stop), which waits for the RCU callbacks
 * (rcu_barrier) before calling kthread_stop().  The refcnt taken for
 * the kthread makes sure the struct outlives all pending frames.
 *
 * A workqueue is used because callers (e.g. map_delete_elem) may run
 * with preemption disabled and therefore cannot call kthread_stop()
 * directly.
 */
static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
				    u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
{
	struct bpf_cpu_map_entry *old_rcpu;

	old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu);
	if (old_rcpu) {
		call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
		INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
		schedule_work(&old_rcpu->kthread_stop_wq);
	}
}
454
455 static int cpu_map_delete_elem(struct bpf_map *map, void *key)
456 {
457 struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
458 u32 key_cpu = *(u32 *)key;
459
460 if (key_cpu >= map->max_entries)
461 return -EINVAL;
462
463
464 __cpu_map_entry_replace(cmap, key_cpu, NULL);
465 return 0;
466 }
467
/* Create (or delete, when qsize == 0) an entry for destination CPU
 * *key with ring size *value.  BPF_NOEXIST is rejected because every
 * slot conceptually always "exists" for this array-like map.
 */
static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
			       u64 map_flags)
{
	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
	struct bpf_cpu_map_entry *rcpu;

	/* Array index key corresponds to a CPU number */
	u32 key_cpu = *(u32 *)key;
	/* Value is the queue size */
	u32 qsize = *(u32 *)value;

	if (unlikely(map_flags > BPF_EXIST))
		return -EINVAL;
	if (unlikely(key_cpu >= cmap->map.max_entries))
		return -E2BIG;
	if (unlikely(map_flags == BPF_NOEXIST))
		return -EEXIST;
	if (unlikely(qsize > 16384)) /* sanity limit on queue size */
		return -EOVERFLOW;

	/* Make sure the CPU is a valid possible cpu */
	if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu))
		return -ENODEV;

	if (qsize == 0) {
		rcpu = NULL; /* same as deleting the entry */
	} else {
		/* Updating qsize causes re-allocation of the entry */
		rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
		if (!rcpu)
			return -ENOMEM;
		rcpu->cmap = cmap;
	}
	rcu_read_lock();
	__cpu_map_entry_replace(cmap, key_cpu, rcpu);
	rcu_read_unlock();
	return 0;
}
506
/* Free the whole map.  Runs when both the programs using the map and
 * the map's own refcount have dropped to zero, so no new XDP-side
 * accesses can start; the code below only has to wait out the ones
 * already in flight.
 */
static void cpu_map_free(struct bpf_map *map)
{
	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
	int cpu;
	u32 i;

	/* Disconnect from the redirect path, then wait for outstanding
	 * RCU read-side critical sections.  This guarantees no further
	 * XDP-side reads of cmap->cpu_map, but does NOT guarantee that
	 * pending flush operations (per-CPU bulk queues) are complete.
	 */
	bpf_clear_redirect_map(map);
	synchronize_rcu();

	/* Wait until the flush list is empty on every CPU; after the
	 * synchronize_rcu() above no new items can be added to it.
	 */
	for_each_online_cpu(cpu) {
		struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu);

		while (!list_empty(flush_list))
			cond_resched();
	}

	/* The remote CPUs' kthreads can still be using the entries, so
	 * tear each one down through the RCU-deferred replace path.
	 */
	for (i = 0; i < cmap->map.max_entries; i++) {
		struct bpf_cpu_map_entry *rcpu;

		rcpu = READ_ONCE(cmap->cpu_map[i]);
		if (!rcpu)
			continue;

		/* bq flush and cleanup happen after an RCU grace period */
		__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
	}
	free_percpu(cmap->flush_list);
	bpf_map_area_free(cmap->cpu_map);
	kfree(cmap);
}
554
555 struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
556 {
557 struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
558 struct bpf_cpu_map_entry *rcpu;
559
560 if (key >= map->max_entries)
561 return NULL;
562
563 rcpu = READ_ONCE(cmap->cpu_map[key]);
564 return rcpu;
565 }
566
567 static void *cpu_map_lookup_elem(struct bpf_map *map, void *key)
568 {
569 struct bpf_cpu_map_entry *rcpu =
570 __cpu_map_lookup_elem(map, *(u32 *)key);
571
572 return rcpu ? &rcpu->qsize : NULL;
573 }
574
575 static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
576 {
577 struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
578 u32 index = key ? *(u32 *)key : U32_MAX;
579 u32 *next = next_key;
580
581 if (index >= cmap->map.max_entries) {
582 *next = 0;
583 return 0;
584 }
585
586 if (index == cmap->map.max_entries - 1)
587 return -ENOENT;
588 *next = index + 1;
589 return 0;
590 }
591
/* cpumap map-type operations, wired into the generic bpf map syscalls */
const struct bpf_map_ops cpu_map_ops = {
	.map_alloc = cpu_map_alloc,
	.map_free = cpu_map_free,
	.map_delete_elem = cpu_map_delete_elem,
	.map_update_elem = cpu_map_update_elem,
	.map_lookup_elem = cpu_map_lookup_elem,
	.map_get_next_key = cpu_map_get_next_key,
	.map_check_btf = map_check_no_btf,
};
601
/* Flush the per-CPU bulk queue @bq into the destination CPU's ptr_ring.
 * Frames that do not fit in the ring are dropped and returned via the
 * NAPI path when @in_napi_ctx, otherwise via the regular return path.
 * Always resets bq->count and unlinks bq from the flush list.
 */
static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
{
	struct bpf_cpu_map_entry *rcpu = bq->obj;
	unsigned int processed = 0, drops = 0;
	const int to_cpu = rcpu->cpu;
	struct ptr_ring *q;
	int i;

	if (unlikely(!bq->count))
		return 0;

	q = rcpu->queue;
	/* Multiple producers can reach the same ring; serialize them */
	spin_lock(&q->producer_lock);

	for (i = 0; i < bq->count; i++) {
		struct xdp_frame *xdpf = bq->q[i];
		int err;

		err = __ptr_ring_produce(q, xdpf);
		if (err) {
			drops++;
			if (likely(in_napi_ctx))
				xdp_return_frame_rx_napi(xdpf);
			else
				xdp_return_frame(xdpf);
		}
		processed++;
	}
	bq->count = 0;
	spin_unlock(&q->producer_lock);

	__list_del_clearprev(&bq->flush_node);

	/* Feedback loop via tracepoints */
	trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
	return 0;
}
639
/* Runs under RCU read-side protection, in softirq under NAPI.
 * this_cpu_ptr() access is therefore safe.
 */
static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
	struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list);
	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);

	/* Bulk queue full: flush to make room before staging this frame */
	if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
		bq_flush_to_queue(bq, true);

	/* The incoming frame is always queued here rather than handed
	 * over immediately.  NOTE(review): presumably this is required
	 * because some drivers recycle pages based on page refcounts
	 * until their ->poll() invocation completes, and the queue time
	 * is short since the driver flushes when finishing napi->poll()
	 * — confirm against the driver-side redirect contract.
	 */
	bq->q[bq->count++] = xdpf;

	/* Link this bulkq onto the per-CPU flush list (once) so that
	 * __cpu_map_flush() finds it at the end of the NAPI cycle.
	 */
	if (!bq->flush_node.prev)
		list_add(&bq->flush_node, flush_list);

	return 0;
}
667
668 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
669 struct net_device *dev_rx)
670 {
671 struct xdp_frame *xdpf;
672
673 xdpf = convert_to_xdp_frame(xdp);
674 if (unlikely(!xdpf))
675 return -EOVERFLOW;
676
677
678 xdpf->dev_rx = dev_rx;
679
680 bq_enqueue(rcpu, xdpf);
681 return 0;
682 }
683
/* Called at the end of the NAPI cycle: push every staged bulk queue on
 * this CPU into its destination ring and wake the consumer kthreads.
 */
void __cpu_map_flush(struct bpf_map *map)
{
	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
	struct list_head *flush_list = this_cpu_ptr(cmap->flush_list);
	struct xdp_bulk_queue *bq, *tmp;

	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
		bq_flush_to_queue(bq, true);

		/* Wake the kthread consuming this entry's ring; cheap
		 * (a lock plus a memory barrier) if already running.
		 */
		wake_up_process(bq->obj->kthread);
	}
}