This source file includes the following definitions:
- siw_try_1seg
- siw_qp_prepare_tx
- siw_tx_ctrl
- siw_tcp_sendpages
- siw_0copy_tx
- siw_unmap_pages
- siw_tx_hdt
- siw_update_tcpseg
- siw_prepare_fpdu
- siw_check_sgl_tx
- siw_qp_sq_proc_tx
- siw_fastreg_mr
- siw_qp_sq_proc_local
- siw_qp_sq_process
- siw_sq_resume
- siw_stop_tx_thread
- siw_run_sq
- siw_sq_start
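
For orientation, a rough call-flow sketch derived from the code below (the
verbs entry points that post work requests live outside this file):

    siw_sq_start()                   /* hand the QP to a per-CPU TX kthread */
      -> llist_add() + wake_up()
    siw_run_sq()                     /* per-CPU kthread main loop */
      -> siw_sq_resume()
        -> siw_qp_sq_process()       /* per-WQE processing, state read-locked */
          -> siw_qp_sq_proc_tx()     /* RDMA ops: siw_qp_prepare_tx(),
                                        siw_prepare_fpdu(), siw_tx_ctrl(),
                                        siw_tx_hdt() */
          -> siw_qp_sq_proc_local()  /* local ops: REG_MR, INVAL_STAG */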
1
2
3
4
5
6 #include <linux/errno.h>
7 #include <linux/types.h>
8 #include <linux/net.h>
9 #include <linux/scatterlist.h>
10 #include <linux/highmem.h>
11 #include <net/tcp.h>
12
13 #include <rdma/iw_cm.h>
14 #include <rdma/ib_verbs.h>
15 #include <rdma/ib_user_verbs.h>
16
17 #include "siw.h"
18 #include "siw_verbs.h"
19 #include "siw_mem.h"
20
21 #define MAX_HDR_INLINE \
22 (((uint32_t)(sizeof(struct siw_rreq_pkt) - \
23 sizeof(struct iwarp_send))) & 0xF8)
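/*
 * Reading of the macro above: the inline space is whatever room the header
 * buffer (sized for the largest template, struct siw_rreq_pkt) leaves beyond
 * a plain SEND header, rounded down to a multiple of 8 by the & 0xF8 mask.
 * Purely for illustration, if that difference were 44 bytes (an assumed
 * number, not taken from the real struct sizes), MAX_HDR_INLINE would be 40.
 */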
24
25 static struct page *siw_get_pblpage(struct siw_mem *mem, u64 addr, int *idx)
26 {
27 struct siw_pbl *pbl = mem->pbl;
28 u64 offset = addr - mem->va;
29 dma_addr_t paddr = siw_pbl_get_buffer(pbl, offset, NULL, idx);
30
31 if (paddr)
32 return virt_to_page(paddr);
33
34 return NULL;
35 }
36
37 /* Copy a short, single-SGE payload to @paddr, directly behind the
38 * iWARP header. Returns the byte count, MAX_HDR_INLINE + 1 if the
39 * payload does not qualify for inlining, or -EFAULT on failure. */
40 static int siw_try_1seg(struct siw_iwarp_tx *c_tx, void *paddr)
41 {
42 struct siw_wqe *wqe = &c_tx->wqe_active;
43 struct siw_sge *sge = &wqe->sqe.sge[0];
44 u32 bytes = sge->length;
45
46 if (bytes > MAX_HDR_INLINE || wqe->sqe.num_sge != 1)
47 return MAX_HDR_INLINE + 1;
48
49 if (!bytes)
50 return 0;
51
52 if (tx_flags(wqe) & SIW_WQE_INLINE) {
53 memcpy(paddr, &wqe->sqe.sge[1], bytes);
54 } else {
55 struct siw_mem *mem = wqe->mem[0];
56
57 if (!mem->mem_obj) {
58 /* kernel client: laddr is a kernel virtual address */
59 memcpy(paddr,
60 (const void *)(uintptr_t)sge->laddr, bytes);
61 } else if (c_tx->in_syscall) {
62 if (copy_from_user(paddr, u64_to_user_ptr(sge->laddr),
63 bytes))
64 return -EFAULT;
65 } else {
66 unsigned int off = sge->laddr & ~PAGE_MASK;
67 struct page *p;
68 char *buffer;
69 int pbl_idx = 0;
70
71 if (!mem->is_pbl)
72 p = siw_get_upage(mem->umem, sge->laddr);
73 else
74 p = siw_get_pblpage(mem, sge->laddr, &pbl_idx);
75
76 if (unlikely(!p))
77 return -EFAULT;
78
79 buffer = kmap(p);
80
81 if (likely(PAGE_SIZE - off >= bytes)) {
82 memcpy(paddr, buffer + off, bytes);
83 } else {
84 unsigned long part = bytes - (PAGE_SIZE - off);
85
86 memcpy(paddr, buffer + off, part);
87 kunmap(p);
88
89 if (!mem->is_pbl)
90 p = siw_get_upage(mem->umem,
91 sge->laddr + part);
92 else
93 p = siw_get_pblpage(mem,
94 sge->laddr + part,
95 &pbl_idx);
96 if (unlikely(!p))
97 return -EFAULT;
98
99 buffer = kmap(p);
100 memcpy(paddr + part, buffer, bytes - part);
101 }
102 kunmap(p);
103 }
104 }
105 return (int)bytes;
106 }
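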
107
108 #define PKT_FRAGMENTED 1
109 #define PKT_COMPLETE 0
110
111 /*
112 * siw_qp_prepare_tx()
113 *
114 * Set up the iWARP header for the WQE currently being transmitted.
115 * A short, single-SGE payload is placed directly behind the header.
116 * Returns PKT_COMPLETE if the resulting FPDU is already complete,
117 * PKT_FRAGMENTED if payload still must be appended, or a negative error.
118 */
119 static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx)
120 {
121 struct siw_wqe *wqe = &c_tx->wqe_active;
122 char *crc = NULL;
123 int data = 0;
124
125 switch (tx_type(wqe)) {
126 case SIW_OP_READ:
127 case SIW_OP_READ_LOCAL_INV:
128 memcpy(&c_tx->pkt.ctrl,
129 &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
130 sizeof(struct iwarp_ctrl));
131
132 c_tx->pkt.rreq.rsvd = 0;
133 c_tx->pkt.rreq.ddp_qn = htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
134 c_tx->pkt.rreq.ddp_msn =
135 htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]);
136 c_tx->pkt.rreq.ddp_mo = 0;
137 c_tx->pkt.rreq.sink_stag = htonl(wqe->sqe.sge[0].lkey);
138 c_tx->pkt.rreq.sink_to =
139 cpu_to_be64(wqe->sqe.sge[0].laddr);
140 c_tx->pkt.rreq.source_stag = htonl(wqe->sqe.rkey);
141 c_tx->pkt.rreq.source_to = cpu_to_be64(wqe->sqe.raddr);
142 c_tx->pkt.rreq.read_size = htonl(wqe->sqe.sge[0].length);
143
144 c_tx->ctrl_len = sizeof(struct iwarp_rdma_rreq);
145 crc = (char *)&c_tx->pkt.rreq_pkt.crc;
146 break;
147
148 case SIW_OP_SEND:
149 if (tx_flags(wqe) & SIW_WQE_SOLICITED)
150 memcpy(&c_tx->pkt.ctrl,
151 &iwarp_pktinfo[RDMAP_SEND_SE].ctrl,
152 sizeof(struct iwarp_ctrl));
153 else
154 memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_SEND].ctrl,
155 sizeof(struct iwarp_ctrl));
156
157 c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND;
158 c_tx->pkt.send.ddp_msn =
159 htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
160 c_tx->pkt.send.ddp_mo = 0;
161
162 c_tx->pkt.send_inv.inval_stag = 0;
163
164 c_tx->ctrl_len = sizeof(struct iwarp_send);
165
166 crc = (char *)&c_tx->pkt.send_pkt.crc;
167 data = siw_try_1seg(c_tx, crc);
168 break;
169
170 case SIW_OP_SEND_REMOTE_INV:
171 if (tx_flags(wqe) & SIW_WQE_SOLICITED)
172 memcpy(&c_tx->pkt.ctrl,
173 &iwarp_pktinfo[RDMAP_SEND_SE_INVAL].ctrl,
174 sizeof(struct iwarp_ctrl));
175 else
176 memcpy(&c_tx->pkt.ctrl,
177 &iwarp_pktinfo[RDMAP_SEND_INVAL].ctrl,
178 sizeof(struct iwarp_ctrl));
179
180 c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND;
181 c_tx->pkt.send.ddp_msn =
182 htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
183 c_tx->pkt.send.ddp_mo = 0;
184
185 c_tx->pkt.send_inv.inval_stag = cpu_to_be32(wqe->sqe.rkey);
186
187 c_tx->ctrl_len = sizeof(struct iwarp_send_inv);
188
189 crc = (char *)&c_tx->pkt.send_pkt.crc;
190 data = siw_try_1seg(c_tx, crc);
191 break;
192
193 case SIW_OP_WRITE:
194 memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_RDMA_WRITE].ctrl,
195 sizeof(struct iwarp_ctrl));
196
197 c_tx->pkt.rwrite.sink_stag = htonl(wqe->sqe.rkey);
198 c_tx->pkt.rwrite.sink_to = cpu_to_be64(wqe->sqe.raddr);
199 c_tx->ctrl_len = sizeof(struct iwarp_rdma_write);
200
201 crc = (char *)&c_tx->pkt.write_pkt.crc;
202 data = siw_try_1seg(c_tx, crc);
203 break;
204
205 case SIW_OP_READ_RESPONSE:
206 memcpy(&c_tx->pkt.ctrl,
207 &iwarp_pktinfo[RDMAP_RDMA_READ_RESP].ctrl,
208 sizeof(struct iwarp_ctrl));
209
210
211 c_tx->pkt.rresp.sink_stag = cpu_to_be32(wqe->sqe.rkey);
212 c_tx->pkt.rresp.sink_to = cpu_to_be64(wqe->sqe.raddr);
213
214 c_tx->ctrl_len = sizeof(struct iwarp_rdma_rresp);
215
216 crc = (char *)&c_tx->pkt.write_pkt.crc;
217 data = siw_try_1seg(c_tx, crc);
218 break;
219
220 default:
221 siw_dbg_qp(tx_qp(c_tx), "stale wqe type %d\n", tx_type(wqe));
222 return -EOPNOTSUPP;
223 }
224 if (unlikely(data < 0))
225 return data;
226
227 c_tx->ctrl_sent = 0;
228
229 if (data <= MAX_HDR_INLINE) {
230 if (data) {
231 wqe->processed = data;
232
233 c_tx->pkt.ctrl.mpa_len =
234 htons(c_tx->ctrl_len + data - MPA_HDR_SIZE);
235
236
237 data += -(int)data & 0x3;
238
239 crc += data;
240 c_tx->ctrl_len += data;
241
242 if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED))
243 c_tx->pkt.c_untagged.ddp_mo = 0;
244 else
245 c_tx->pkt.c_tagged.ddp_to =
246 cpu_to_be64(wqe->sqe.raddr);
247 }
248
249 *(u32 *)crc = 0;
250
251 /* compute the MPA CRC over the header and any inlined payload;
252 * the CRC field itself was zeroed above */
253 if (c_tx->mpa_crc_hd) {
254 crypto_shash_init(c_tx->mpa_crc_hd);
255 if (crypto_shash_update(c_tx->mpa_crc_hd,
256 (u8 *)&c_tx->pkt,
257 c_tx->ctrl_len))
258 return -EINVAL;
259 crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)crc);
260 }
261 c_tx->ctrl_len += MPA_CRC_SIZE;
262
263 return PKT_COMPLETE;
264 }
265 c_tx->ctrl_len += MPA_CRC_SIZE;
266 c_tx->sge_idx = 0;
267 c_tx->sge_off = 0;
268 c_tx->pbl_idx = 0;
269
270
271 /*
272 * Use zero-copy sendpage only for large, non signalled work requests:
273 * with sendpage, TCP may keep referencing the application pages for
274 * retransmission after the work request locally completed, so the
275 * buffer must not be modified by the application; changing unsent
276 * data would also invalidate an already computed CRC.
277 */
278
279 if (c_tx->zcopy_tx && wqe->bytes >= SENDPAGE_THRESH &&
280 !(tx_flags(wqe) & SIW_WQE_SIGNALLED))
281 c_tx->use_sendpage = 1;
282 else
283 c_tx->use_sendpage = 0;
284
285 return PKT_FRAGMENTED;
286 }
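/*
 * Two illustrative outcomes of siw_qp_prepare_tx() (the payload sizes are
 * assumed examples): a SEND carrying a single 16-byte SGE gets its payload
 * copied right behind the header, padded to a 4-byte multiple and CRC'd,
 * and PKT_COMPLETE is returned so siw_tx_ctrl() can push the short FPDU in
 * one piece. A 64 KB WRITE cannot be inlined, so only the header is built
 * here and PKT_FRAGMENTED is returned; siw_prepare_fpdu() and siw_tx_hdt()
 * then segment and transmit the payload.
 */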
287
288 /*
289 * Send out as much of the control part (packet header, or a complete
290 * short FPDU including CRC) as the socket accepts. Returns 0 once all
291 * control bytes are out, -EAGAIN after a partial send, or a send error.
292 */
293 static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s,
294 int flags)
295 {
296 struct msghdr msg = { .msg_flags = flags };
297 struct kvec iov = { .iov_base =
298 (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent,
299 .iov_len = c_tx->ctrl_len - c_tx->ctrl_sent };
300
301 int rv = kernel_sendmsg(s, &msg, &iov, 1,
302 c_tx->ctrl_len - c_tx->ctrl_sent);
303
304 if (rv >= 0) {
305 c_tx->ctrl_sent += rv;
306
307 if (c_tx->ctrl_sent == c_tx->ctrl_len)
308 rv = 0;
309 else
310 rv = -EAGAIN;
311 }
312 return rv;
313 }
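/*
 * Note on the return contract above: a partial send advances ctrl_sent and
 * yields -EAGAIN, so a later call resumes at the first unsent byte, while 0
 * means the complete control chunk is out. The retry is not a busy loop in
 * this driver; it is driven by socket write space becoming available, which
 * eventually reschedules SQ processing (outside this file).
 */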
314
315
316 /*
317 * 0copy TCP transmit interface: push a consecutive range of pages to
318 * the socket via do_tcp_sendpages(), taking the socket lock for each
319 * chunk. MSG_SENDPAGE_NOTLAST is set for all but the last chunk, so
320 * TCP may defer pushing the segment. Returns the number of bytes
321 * sent, which may be less than @size if the socket would block, or a
322 * negative send error.
323 */
324 static int siw_tcp_sendpages(struct socket *s, struct page **page, int offset,
325 size_t size)
326 {
327 struct sock *sk = s->sk;
328 int i = 0, rv = 0, sent = 0,
329 flags = MSG_MORE | MSG_DONTWAIT | MSG_SENDPAGE_NOTLAST;
330
331 while (size) {
332 size_t bytes = min_t(size_t, PAGE_SIZE - offset, size);
333
334 if (size + offset <= PAGE_SIZE)
335 flags = MSG_MORE | MSG_DONTWAIT;
336
337 tcp_rate_check_app_limited(sk);
338 try_page_again:
339 lock_sock(sk);
340 rv = do_tcp_sendpages(sk, page[i], offset, bytes, flags);
341 release_sock(sk);
342
343 if (rv > 0) {
344 size -= rv;
345 sent += rv;
346 if (rv != bytes) {
347 offset += rv;
348 bytes -= rv;
349 goto try_page_again;
350 }
351 offset = 0;
352 } else {
353 if (rv == -EAGAIN || rv == 0)
354 break;
355 return rv;
356 }
357 i++;
358 }
359 return sent;
360 }
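/*
 * Return semantics of siw_tcp_sendpages(), with an assumed example: asked
 * to send three 4 KB pages but TCP accepts only 6 KB before returning
 * -EAGAIN, the function reports 6144 bytes sent and the caller resumes
 * later at the remaining offset. A 0 or -EAGAIN from do_tcp_sendpages()
 * only ends the loop early; any other negative value is returned as an
 * error.
 */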
361
362
363 /*
364 * Zero-copy transmit of @size payload bytes, starting at @offset
365 * within the first SGE. The pages backing the data are pushed to the
366 * socket via siw_tcp_sendpages(). Returns the number of bytes sent or
367 * a negative send error.
368 */
369 static int siw_0copy_tx(struct socket *s, struct page **page,
370 struct siw_sge *sge, unsigned int offset,
371 unsigned int size)
372 {
373 int i = 0, sent = 0, rv;
374 int sge_bytes = min(sge->length - offset, size);
375
376 offset = (sge->laddr + offset) & ~PAGE_MASK;
377
378 while (sent != size) {
379 rv = siw_tcp_sendpages(s, &page[i], offset, sge_bytes);
380 if (rv >= 0) {
381 sent += rv;
382 if (size == sent || sge_bytes > rv)
383 break;
384
385 i += PAGE_ALIGN(sge_bytes + offset) >> PAGE_SHIFT;
386 sge++;
387 sge_bytes = min(sge->length, size - sent);
388 offset = sge->laddr & ~PAGE_MASK;
389 } else {
390 sent = rv;
391 break;
392 }
393 }
394 return sent;
395 }
396
397 #define MAX_TRAILER (MPA_CRC_SIZE + 4)
398
399 static void siw_unmap_pages(struct page **pp, unsigned long kmap_mask)
400 {
401 while (kmap_mask) {
402 if (kmap_mask & BIT(0))
403 kunmap(*pp);
404 pp++;
405 kmap_mask >>= 1;
406 }
407 }
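/*
 * Worked example for siw_unmap_pages(): kmap_mask is a bitmap indexed like
 * page_array[], with a bit set in siw_tx_hdt() for every page that was
 * kmap()'ed into the iovec. With kmap_mask == 0b0101, the loop above
 * kunmap()s page_array[0] and page_array[2] and skips the other entries,
 * e.g. pages that are only referenced for sendpage.
 */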
408
409 /*
410 * Upper bound for the number of iovec / page array entries needed to
411 * describe one FPDU: the MPA length field limits the payload to
412 * 0xffff bytes, every SGE may additionally be unaligned to page
413 * boundaries at both ends, and two more slots are used for the iWARP
414 * header and the trailer.
415 */
416
417
418 #define MAX_ARRAY ((0xffff / PAGE_SIZE) + 1 + (2 * (SIW_MAX_SGE - 1) + 2))
419
420 /* Write out header, payload and trailer of the current FPDU, either
421 * as one sendmsg() iovec or via zero-copy sendpage for the payload,
422 * and update the transmit state according to how much TCP accepted.
423 */
424 static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
425 {
426 struct siw_wqe *wqe = &c_tx->wqe_active;
427 struct siw_sge *sge = &wqe->sqe.sge[c_tx->sge_idx];
428 struct kvec iov[MAX_ARRAY];
429 struct page *page_array[MAX_ARRAY];
430 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
431
432 int seg = 0, do_crc = c_tx->do_crc, is_kva = 0, rv;
433 unsigned int data_len = c_tx->bytes_unsent, hdr_len = 0, trl_len = 0,
434 sge_off = c_tx->sge_off, sge_idx = c_tx->sge_idx,
435 pbl_idx = c_tx->pbl_idx;
436 unsigned long kmap_mask = 0L;
437
438 if (c_tx->state == SIW_SEND_HDR) {
439 if (c_tx->use_sendpage) {
440 rv = siw_tx_ctrl(c_tx, s, MSG_DONTWAIT | MSG_MORE);
441 if (rv)
442 goto done;
443
444 c_tx->state = SIW_SEND_DATA;
445 } else {
446 iov[0].iov_base =
447 (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent;
448 iov[0].iov_len = hdr_len =
449 c_tx->ctrl_len - c_tx->ctrl_sent;
450 seg = 1;
451 }
452 }
453
454 wqe->processed += data_len;
455
456 while (data_len) {
457 unsigned int sge_len = min(sge->length - sge_off, data_len);
458 unsigned int fp_off = (sge->laddr + sge_off) & ~PAGE_MASK;
459 struct siw_mem *mem;
460
461 if (!(tx_flags(wqe) & SIW_WQE_INLINE)) {
462 mem = wqe->mem[sge_idx];
463 is_kva = mem->mem_obj == NULL ? 1 : 0;
464 } else {
465 is_kva = 1;
466 }
467 if (is_kva && !c_tx->use_sendpage) {
468
469 /* transmit directly from a kernel virtual address:
470 * inline data or an MR backed by a kernel buffer
471 */
472 iov[seg].iov_base =
473 (void *)(uintptr_t)(sge->laddr + sge_off);
474 iov[seg].iov_len = sge_len;
475
476 if (do_crc)
477 crypto_shash_update(c_tx->mpa_crc_hd,
478 iov[seg].iov_base,
479 sge_len);
480 sge_off += sge_len;
481 data_len -= sge_len;
482 seg++;
483 goto sge_done;
484 }
485
486 while (sge_len) {
487 size_t plen = min((int)PAGE_SIZE - fp_off, sge_len);
488
489 if (!is_kva) {
490 struct page *p;
491
492 if (mem->is_pbl)
493 p = siw_get_pblpage(
494 mem, sge->laddr + sge_off,
495 &pbl_idx);
496 else
497 p = siw_get_upage(mem->umem,
498 sge->laddr + sge_off);
499 if (unlikely(!p)) {
500 siw_unmap_pages(page_array, kmap_mask);
501 wqe->processed -= c_tx->bytes_unsent;
502 rv = -EFAULT;
503 goto done_crc;
504 }
505 page_array[seg] = p;
506
507 if (!c_tx->use_sendpage) {
508 iov[seg].iov_base = kmap(p) + fp_off;
509 iov[seg].iov_len = plen;
510
511 /* remember the mapping for a later kunmap() */
512 kmap_mask |= BIT(seg);
513
514 if (do_crc)
515 crypto_shash_update(
516 c_tx->mpa_crc_hd,
517 iov[seg].iov_base,
518 plen);
519 } else if (do_crc) {
520 crypto_shash_update(c_tx->mpa_crc_hd,
521 kmap(p) + fp_off,
522 plen);
523 kunmap(p);
524 }
525 } else {
526 u64 va = sge->laddr + sge_off;
527
528 page_array[seg] = virt_to_page(va & PAGE_MASK);
529 if (do_crc)
530 crypto_shash_update(
531 c_tx->mpa_crc_hd,
532 (void *)(uintptr_t)va,
533 plen);
534 }
535
536 sge_len -= plen;
537 sge_off += plen;
538 data_len -= plen;
539 fp_off = 0;
540
541 if (++seg >= (int)MAX_ARRAY) {
542 siw_dbg_qp(tx_qp(c_tx), "too many fragments\n");
543 siw_unmap_pages(page_array, kmap_mask);
544 wqe->processed -= c_tx->bytes_unsent;
545 rv = -EMSGSIZE;
546 goto done_crc;
547 }
548 }
549 sge_done:
550
551 if (sge_off == sge->length &&
552 (data_len != 0 || wqe->processed < wqe->bytes)) {
553 sge_idx++;
554 sge++;
555 sge_off = 0;
556 }
557 }
558
559 if (likely(c_tx->state != SIW_SEND_TRAILER)) {
560 iov[seg].iov_base = &c_tx->trailer.pad[4 - c_tx->pad];
561 iov[seg].iov_len = trl_len = MAX_TRAILER - (4 - c_tx->pad);
562 } else {
563 iov[seg].iov_base = &c_tx->trailer.pad[c_tx->ctrl_sent];
564 iov[seg].iov_len = trl_len = MAX_TRAILER - c_tx->ctrl_sent;
565 }
566
567 if (c_tx->pad) {
568 *(u32 *)c_tx->trailer.pad = 0;
569 if (do_crc)
570 crypto_shash_update(c_tx->mpa_crc_hd,
571 (u8 *)&c_tx->trailer.crc - c_tx->pad,
572 c_tx->pad);
573 }
574 if (!c_tx->mpa_crc_hd)
575 c_tx->trailer.crc = 0;
576 else if (do_crc)
577 crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)&c_tx->trailer.crc);
578
579 data_len = c_tx->bytes_unsent;
580
581 if (c_tx->use_sendpage) {
582 rv = siw_0copy_tx(s, page_array, &wqe->sqe.sge[c_tx->sge_idx],
583 c_tx->sge_off, data_len);
584 if (rv == data_len) {
585 rv = kernel_sendmsg(s, &msg, &iov[seg], 1, trl_len);
586 if (rv > 0)
587 rv += data_len;
588 else
589 rv = data_len;
590 }
591 } else {
592 rv = kernel_sendmsg(s, &msg, iov, seg + 1,
593 hdr_len + data_len + trl_len);
594 siw_unmap_pages(page_array, kmap_mask);
595 }
596 if (rv < (int)hdr_len) {
597 /* not even the complete header was pushed, or a send error occurred */
598 wqe->processed -= data_len;
599 if (rv >= 0) {
600 c_tx->ctrl_sent += rv;
601 rv = -EAGAIN;
602 }
603 goto done_crc;
604 }
605 rv -= hdr_len;
606
607 if (rv >= (int)data_len) {
608 /* all payload of this FPDU (if any) was accepted by TCP */
609 if (data_len > 0 && wqe->processed < wqe->bytes) {
610 /* save the SGE cursor for the next FPDU of this WQE */
611 c_tx->sge_idx = sge_idx;
612 c_tx->sge_off = sge_off;
613 c_tx->pbl_idx = pbl_idx;
614 }
615 rv -= data_len;
616
617 if (rv == trl_len)
618 rv = 0;
619 else {
620 c_tx->state = SIW_SEND_TRAILER;
621 c_tx->ctrl_len = MAX_TRAILER;
622 c_tx->ctrl_sent = rv + 4 - c_tx->pad;
623 c_tx->bytes_unsent = 0;
624 rv = -EAGAIN;
625 }
626
627 } else if (data_len > 0) {
628 /* only part of the payload was accepted by TCP */
629 c_tx->state = SIW_SEND_DATA;
630 wqe->processed -= data_len - rv;
631
632 if (rv) {
633
634 /* some payload bytes went out: advance the SGE cursor
635 * to match the number of bytes TCP accepted
636 */
637 unsigned int sge_unsent;
638
639 c_tx->bytes_unsent -= rv;
640 sge = &wqe->sqe.sge[c_tx->sge_idx];
641 sge_unsent = sge->length - c_tx->sge_off;
642
643 while (sge_unsent <= rv) {
644 rv -= sge_unsent;
645 c_tx->sge_idx++;
646 c_tx->sge_off = 0;
647 sge++;
648 sge_unsent = sge->length;
649 }
650 c_tx->sge_off += rv;
651 }
652 rv = -EAGAIN;
653 }
654 done_crc:
655 c_tx->do_crc = 0;
656 done:
657 return rv;
658 }
659
660 static void siw_update_tcpseg(struct siw_iwarp_tx *c_tx,
661 struct socket *s)
662 {
663 struct tcp_sock *tp = tcp_sk(s->sk);
664
665 if (tp->gso_segs) {
666 if (c_tx->gso_seg_limit == 0)
667 c_tx->tcp_seglen = tp->mss_cache * tp->gso_segs;
668 else
669 c_tx->tcp_seglen =
670 tp->mss_cache *
671 min_t(u16, c_tx->gso_seg_limit, tp->gso_segs);
672 } else {
673 c_tx->tcp_seglen = tp->mss_cache;
674 }
675
676 c_tx->tcp_seglen &= 0xfffffff8;
677 }
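/*
 * Worked example for siw_update_tcpseg(), with assumed socket values:
 * mss_cache == 1460 and gso_segs == 0 gives tcp_seglen = 1460 & ~7 = 1456;
 * with gso_segs == 4 and no gso_seg_limit it becomes 4 * 1460 = 5840
 * (already a multiple of 8); with gso_seg_limit == 2 it is capped at
 * 2 * 1460 = 2920. The final mask keeps the per-FPDU byte budget 8-byte
 * aligned.
 */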
678
679
680
681 /*
682 * siw_prepare_fpdu()
683 *
684 * Prepare the transmit context for the next FPDU of the current WQE:
685 * reset the header state, advance the DDP MO or tagged offset by the
686 * bytes already sent, and limit the FPDU payload to the current TCP
687 * segment size. DDP_FLAG_LAST and the MPA pad are set for the final
688 * FPDU only. If CRC is enabled, the CRC state is seeded with the new
689 * header.
690 */
691
692
693 static void siw_prepare_fpdu(struct siw_qp *qp, struct siw_wqe *wqe)
694 {
695 struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
696 int data_len;
697
698 c_tx->ctrl_len =
699 iwarp_pktinfo[__rdmap_get_opcode(&c_tx->pkt.ctrl)].hdr_len;
700 c_tx->ctrl_sent = 0;
701
702
703 /* advance the DDP MO (untagged) or the tagged target offset
704 * by the number of bytes already sent for this WQE */
705 if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED))
706
707 c_tx->pkt.c_untagged.ddp_mo = cpu_to_be32(wqe->processed);
708 else
709 c_tx->pkt.c_tagged.ddp_to =
710 cpu_to_be64(wqe->sqe.raddr + wqe->processed);
711
712 data_len = wqe->bytes - wqe->processed;
713 if (data_len + c_tx->ctrl_len + MPA_CRC_SIZE > c_tx->tcp_seglen) {
714 /* trim the FPDU payload to the current TCP segment budget */
715 data_len = c_tx->tcp_seglen - (c_tx->ctrl_len + MPA_CRC_SIZE);
716 c_tx->pkt.ctrl.ddp_rdmap_ctrl &= ~DDP_FLAG_LAST;
717 c_tx->pad = 0;
718 } else {
719 c_tx->pkt.ctrl.ddp_rdmap_ctrl |= DDP_FLAG_LAST;
720 c_tx->pad = -data_len & 0x3;
721 }
722 c_tx->bytes_unsent = data_len;
723
724 c_tx->pkt.ctrl.mpa_len =
725 htons(c_tx->ctrl_len + data_len - MPA_HDR_SIZE);
726
727
728 /* if CRC is in use, seed it with the just-built header; payload
729 * bytes are added while the iovec is built in siw_tx_hdt() */
730 if (c_tx->mpa_crc_hd) {
731 crypto_shash_init(c_tx->mpa_crc_hd);
732 crypto_shash_update(c_tx->mpa_crc_hd, (u8 *)&c_tx->pkt,
733 c_tx->ctrl_len);
734 c_tx->do_crc = 1;
735 }
736 }
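/*
 * Segmentation example for siw_prepare_fpdu(), with assumed numbers:
 * tcp_seglen == 1456, a 28-byte header and MPA_CRC_SIZE == 4 leave 1424
 * payload bytes per FPDU. A 3000-byte WRITE is then sent as 1424 + 1424 +
 * 152 bytes; only the last FPDU carries DDP_FLAG_LAST and gets the MPA pad
 * (152 is already a multiple of 4, so pad == 0 here; a 151-byte tail would
 * get pad == 1).
 */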
737
738
739 /*
740 * siw_check_sgl_tx()
741 *
742 * Check permissions for the SGL of a work request and resolve the
743 * memory behind each non zero length SGE, referencing it in wqe->mem[].
744 *
745 * @pd:    protection domain the SGL must belong to
746 * @wqe:   WQE holding the SGL to be checked
747 * @perms: requested access permissions
748 *
749 * Returns the total byte length of the SGL, or a negative error.
750 */
751 static int siw_check_sgl_tx(struct ib_pd *pd, struct siw_wqe *wqe,
752 enum ib_access_flags perms)
753 {
754 struct siw_sge *sge = &wqe->sqe.sge[0];
755 int i, len, num_sge = wqe->sqe.num_sge;
756
757 if (unlikely(num_sge > SIW_MAX_SGE))
758 return -EINVAL;
759
760 for (i = 0, len = 0; num_sge; num_sge--, i++, sge++) {
761
762 /* a zero length SGE is valid and is not checked
763 * against any memory region */
764 if (sge->length) {
765 int rv = siw_check_sge(pd, sge, &wqe->mem[i], perms, 0,
766 sge->length);
767
768 if (unlikely(rv != E_ACCESS_OK))
769 return rv;
770 }
771 len += sge->length;
772 }
773 return len;
774 }
775
776 /*
777 * Process one WQE from the QP's send queue: set up the transmit
778 * context on first invocation, then push FPDUs to the TCP socket
779 * until the WQE completes, the burst budget is used up, or TCP
780 * would block. */
781 static int siw_qp_sq_proc_tx(struct siw_qp *qp, struct siw_wqe *wqe)
782 {
783 struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
784 struct socket *s = qp->attrs.sk;
785 int rv = 0, burst_len = qp->tx_ctx.burst;
786 enum rdmap_ecode ecode = RDMAP_ECODE_CATASTROPHIC_STREAM;
787
788 if (unlikely(wqe->wr_status == SIW_WR_IDLE))
789 return 0;
790
791 if (!burst_len)
792 burst_len = SQ_USER_MAXBURST;
793
794 if (wqe->wr_status == SIW_WR_QUEUED) {
795 if (!(wqe->sqe.flags & SIW_WQE_INLINE)) {
796 if (tx_type(wqe) == SIW_OP_READ_RESPONSE)
797 wqe->sqe.num_sge = 1;
798
799 if (tx_type(wqe) != SIW_OP_READ &&
800 tx_type(wqe) != SIW_OP_READ_LOCAL_INV) {
801
802 /* Resolve and reference the memory behind the SGL.
803 * No particular access right is required for sourcing
804 * transmit data, hence perms == 0 is passed below.
805 */
806 rv = siw_check_sgl_tx(qp->pd, wqe, 0);
807 if (rv < 0) {
808 if (tx_type(wqe) ==
809 SIW_OP_READ_RESPONSE)
810 ecode = siw_rdmap_error(-rv);
811 rv = -EINVAL;
812 goto tx_error;
813 }
814 wqe->bytes = rv;
815 } else {
816 wqe->bytes = 0;
817 }
818 } else {
819 wqe->bytes = wqe->sqe.sge[0].length;
820 if (!qp->kernel_verbs) {
821 if (wqe->bytes > SIW_MAX_INLINE) {
822 rv = -EINVAL;
823 goto tx_error;
824 }
825 wqe->sqe.sge[0].laddr =
826 (u64)(uintptr_t)&wqe->sqe.sge[1];
827 }
828 }
829 wqe->wr_status = SIW_WR_INPROGRESS;
830 wqe->processed = 0;
831
832 siw_update_tcpseg(c_tx, s);
833
834 rv = siw_qp_prepare_tx(c_tx);
835 if (rv == PKT_FRAGMENTED) {
836 c_tx->state = SIW_SEND_HDR;
837 siw_prepare_fpdu(qp, wqe);
838 } else if (rv == PKT_COMPLETE) {
839 c_tx->state = SIW_SEND_SHORT_FPDU;
840 } else {
841 goto tx_error;
842 }
843 }
844
845 next_segment:
846 siw_dbg_qp(qp, "wr type %d, state %d, data %u, sent %u, id %llx\n",
847 tx_type(wqe), wqe->wr_status, wqe->bytes, wqe->processed,
848 wqe->sqe.id);
849
850 if (--burst_len == 0) {
851 rv = -EINPROGRESS;
852 goto tx_done;
853 }
854 if (c_tx->state == SIW_SEND_SHORT_FPDU) {
855 enum siw_opcode tx_type = tx_type(wqe);
856 unsigned int msg_flags;
857
858 if (siw_sq_empty(qp) || !siw_tcp_nagle || burst_len == 1)
859 /*
860 * Push the TCP segment right away if the SQ runs empty,
861 * Nagle emulation is disabled, or the burst budget ends
862 * after this WQE; otherwise set MSG_MORE.
863 */
864 msg_flags = MSG_DONTWAIT;
865 else
866 msg_flags = MSG_DONTWAIT | MSG_MORE;
867
868 rv = siw_tx_ctrl(c_tx, s, msg_flags);
869
870 if (!rv && tx_type != SIW_OP_READ &&
871 tx_type != SIW_OP_READ_LOCAL_INV)
872 wqe->processed = wqe->bytes;
873
874 goto tx_done;
875
876 } else {
877 rv = siw_tx_hdt(c_tx, s);
878 }
879 if (!rv) {
880
881 /* One FPDU segment went out completely: the WQE is done
882 * if this was the last DDP segment; otherwise prepare
883 * and transmit the next segment. */
884 if (unlikely(c_tx->tx_suspend)) {
885
886 /* The QP was moved out of RTS while this WQE was being
887 * transmitted; stop here and report the connection as
888 * aborted instead of starting the next segment.
889 */
890 rv = -ECONNABORTED;
891 goto tx_done;
892 }
893 if (c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_LAST) {
894 siw_dbg_qp(qp, "WQE completed\n");
895 goto tx_done;
896 }
897 c_tx->state = SIW_SEND_HDR;
898
899 siw_update_tcpseg(c_tx, s);
900
901 siw_prepare_fpdu(qp, wqe);
902 goto next_segment;
903 }
904 tx_done:
905 qp->tx_ctx.burst = burst_len;
906 return rv;
907
908 tx_error:
909 if (ecode != RDMAP_ECODE_CATASTROPHIC_STREAM)
910 siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
911 RDMAP_ETYPE_REMOTE_PROTECTION, ecode, 1);
912 else
913 siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
914 RDMAP_ETYPE_CATASTROPHIC,
915 RDMAP_ECODE_UNSPECIFIED, 1);
916 return rv;
917 }
918
919 static int siw_fastreg_mr(struct ib_pd *pd, struct siw_sqe *sqe)
920 {
921 struct ib_mr *base_mr = (struct ib_mr *)(uintptr_t)sqe->base_mr;
922 struct siw_device *sdev = to_siw_dev(pd->device);
923 struct siw_mem *mem;
924 int rv = 0;
925
926 siw_dbg_pd(pd, "STag 0x%08x\n", sqe->rkey);
927
928 if (unlikely(!base_mr)) {
929 pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe->rkey);
930 return -EINVAL;
931 }
932
933 if (unlikely(base_mr->rkey >> 8 != sqe->rkey >> 8)) {
934 pr_warn("siw: fastreg: STag 0x%08x: bad MR\n", sqe->rkey);
935 return -EINVAL;
936 }
937
938 mem = siw_mem_id2obj(sdev, sqe->rkey >> 8);
939 if (unlikely(!mem)) {
940 pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe->rkey);
941 return -EINVAL;
942 }
943
944 if (unlikely(mem->pd != pd)) {
945 pr_warn("siw: fastreg: PD mismatch\n");
946 rv = -EINVAL;
947 goto out;
948 }
949 if (unlikely(mem->stag_valid)) {
950 pr_warn("siw: fastreg: STag 0x%08x already valid\n", sqe->rkey);
951 rv = -EINVAL;
952 goto out;
953 }
954
955 mem->stag = sqe->rkey;
956 mem->perms = sqe->access;
957
958 siw_dbg_mem(mem, "STag 0x%08x now valid\n", sqe->rkey);
959 mem->va = base_mr->iova;
960 mem->stag_valid = 1;
961 out:
962 siw_mem_put(mem);
963 return rv;
964 }
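/*
 * For context, a minimal sketch of how a kernel ULP might trigger the
 * fast-register path above through the generic RDMA core API; "sgl" and
 * "nents" are placeholders, not part of this driver:
 *
 *	struct ib_reg_wr reg_wr = {};
 *
 *	ib_map_mr_sg(mr, sgl, nents, NULL, PAGE_SIZE);
 *	reg_wr.wr.opcode = IB_WR_REG_MR;
 *	reg_wr.wr.send_flags = IB_SEND_SIGNALED;
 *	reg_wr.mr = mr;
 *	reg_wr.key = mr->rkey;
 *	reg_wr.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ;
 *	ib_post_send(qp, &reg_wr.wr, NULL);
 *
 * siw's post_send path turns such a WR into a SIW_OP_REG_MR SQE, which
 * reaches siw_fastreg_mr() via siw_qp_sq_proc_local().
 */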
965
966 static int siw_qp_sq_proc_local(struct siw_qp *qp, struct siw_wqe *wqe)
967 {
968 int rv;
969
970 switch (tx_type(wqe)) {
971 case SIW_OP_REG_MR:
972 rv = siw_fastreg_mr(qp->pd, &wqe->sqe);
973 break;
974
975 case SIW_OP_INVAL_STAG:
976 rv = siw_invalidate_stag(qp->pd, wqe->sqe.rkey);
977 break;
978
979 default:
980 rv = -EINVAL;
981 }
982 return rv;
983 }
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000 /*
1001 * siw_qp_sq_process()
1002 *
1003 * Core TX path of siw: processes the current send queue element of
1004 * the QP, pushing FPDUs to the TCP socket until the WQE completes,
1005 * the socket would block (-EAGAIN), the burst budget is exhausted,
1006 * or an error occurs. On WQE completion the next SQ element is
1007 * activated and processing continues; errors complete the WQE in
1008 * error, raise a QP event and drop the RDMA connection.
1009 *
1010 * Must be called with the QP state read-locked.
1011 */
1012 int siw_qp_sq_process(struct siw_qp *qp)
1013 {
1014 struct siw_wqe *wqe = tx_wqe(qp);
1015 enum siw_opcode tx_type;
1016 unsigned long flags;
1017 int rv = 0;
1018
1019 siw_dbg_qp(qp, "enter for type %d\n", tx_type(wqe));
1020
1021 next_wqe:
1022
1023 /* stop processing if the TX path got suspended,
1024 * e.g. because the QP left RTS state */
1025 if (unlikely(qp->tx_ctx.tx_suspend)) {
1026 siw_dbg_qp(qp, "tx suspended\n");
1027 goto done;
1028 }
1029 tx_type = tx_type(wqe);
1030
1031 if (tx_type <= SIW_OP_READ_RESPONSE)
1032 rv = siw_qp_sq_proc_tx(qp, wqe);
1033 else
1034 rv = siw_qp_sq_proc_local(qp, wqe);
1035
1036 if (!rv) {
1037
1038 /* WQE processing completed */
1039
1040 switch (tx_type) {
1041 case SIW_OP_SEND:
1042 case SIW_OP_SEND_REMOTE_INV:
1043 case SIW_OP_WRITE:
1044 siw_wqe_put_mem(wqe, tx_type);
1045
1046 /* fall through */
1047 case SIW_OP_INVAL_STAG:
1048 case SIW_OP_REG_MR:
1049 if (tx_flags(wqe) & SIW_WQE_SIGNALLED)
1050 siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
1051 SIW_WC_SUCCESS);
1052 break;
1053
1054 case SIW_OP_READ:
1055 case SIW_OP_READ_LOCAL_INV:
1056
1057 /* completion of an outbound READ is deferred until
1058 * the Read Response has been fully received */
1059 break;
1060
1061 case SIW_OP_READ_RESPONSE:
1062 siw_wqe_put_mem(wqe, tx_type);
1063 break;
1064
1065 default:
1066 WARN(1, "undefined WQE type %d\n", tx_type);
1067 rv = -EINVAL;
1068 goto done;
1069 }
1070
1071 spin_lock_irqsave(&qp->sq_lock, flags);
1072 wqe->wr_status = SIW_WR_IDLE;
1073 rv = siw_activate_tx(qp);
1074 spin_unlock_irqrestore(&qp->sq_lock, flags);
1075
1076 if (rv <= 0)
1077 goto done;
1078
1079 goto next_wqe;
1080
1081 } else if (rv == -EAGAIN) {
1082 siw_dbg_qp(qp, "sq paused: hd/tr %d of %d, data %d\n",
1083 qp->tx_ctx.ctrl_sent, qp->tx_ctx.ctrl_len,
1084 qp->tx_ctx.bytes_unsent);
1085 rv = 0;
1086 goto done;
1087 } else if (rv == -EINPROGRESS) {
1088 rv = siw_sq_start(qp);
1089 goto done;
1090 } else {
1091
1092
1093
1094
1095
1096 /*
1097 * WQE processing failed: the WQE is completed in error (as if it
1098 * had been signalled), a QP event is generated and, unless the TX
1099 * path is already suspended, the RDMA connection is dropped. For a
1100 * failed outbound READ the reserved ORQ slot is released again.
1101 */
1102
1103 siw_dbg_qp(qp, "wqe type %d processing failed: %d\n",
1104 tx_type(wqe), rv);
1105
1106 spin_lock_irqsave(&qp->sq_lock, flags);
1107
1108 /* a failed outbound READ gives back the ORQ slot
1109 * that had been reserved for it */
1110 if (tx_type == SIW_OP_READ ||
1111 tx_type == SIW_OP_READ_LOCAL_INV) {
1112
1113 qp->orq_put--;
1114 qp->orq[qp->orq_put % qp->attrs.orq_size].flags = 0;
1115 }
1116 spin_unlock_irqrestore(&qp->sq_lock, flags);
1117
1118 /* drop the RDMA connection, unless teardown is
1119 * already in progress (TX path suspended) */
1120 if (!qp->tx_ctx.tx_suspend)
1121 siw_qp_cm_drop(qp, 0);
1122
1123 switch (tx_type) {
1124 case SIW_OP_SEND:
1125 case SIW_OP_SEND_REMOTE_INV:
1126 case SIW_OP_SEND_WITH_IMM:
1127 case SIW_OP_WRITE:
1128 case SIW_OP_READ:
1129 case SIW_OP_READ_LOCAL_INV:
1130 siw_wqe_put_mem(wqe, tx_type);
1131
1132 /* fall through */
1133 case SIW_OP_INVAL_STAG:
1134 case SIW_OP_REG_MR:
1135 siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
1136 SIW_WC_LOC_QP_OP_ERR);
1137
1138 siw_qp_event(qp, IB_EVENT_QP_FATAL);
1139
1140 break;
1141
1142 case SIW_OP_READ_RESPONSE:
1143 siw_dbg_qp(qp, "proc. read.response failed: %d\n", rv);
1144
1145 siw_qp_event(qp, IB_EVENT_QP_REQ_ERR);
1146
1147 siw_wqe_put_mem(wqe, SIW_OP_READ_RESPONSE);
1148
1149 break;
1150
1151 default:
1152 WARN(1, "undefined WQE type %d\n", tx_type);
1153 rv = -EINVAL;
1154 }
1155 wqe->wr_status = SIW_WR_IDLE;
1156 }
1157 done:
1158 return rv;
1159 }
1160
1161 static void siw_sq_resume(struct siw_qp *qp)
1162 {
1163 if (down_read_trylock(&qp->state_lock)) {
1164 if (likely(qp->attrs.state == SIW_QP_STATE_RTS &&
1165 !qp->tx_ctx.tx_suspend)) {
1166 int rv = siw_qp_sq_process(qp);
1167
1168 up_read(&qp->state_lock);
1169
1170 if (unlikely(rv < 0)) {
1171 siw_dbg_qp(qp, "SQ task failed: err %d\n", rv);
1172
1173 if (!qp->tx_ctx.tx_suspend)
1174 siw_qp_cm_drop(qp, 0);
1175 }
1176 } else {
1177 up_read(&qp->state_lock);
1178 }
1179 } else {
1180 siw_dbg_qp(qp, "Resume SQ while QP locked\n");
1181 }
1182 siw_qp_put(qp);
1183 }
1184
1185 struct tx_task_t {
1186 struct llist_head active;
1187 wait_queue_head_t waiting;
1188 };
1189
1190 static DEFINE_PER_CPU(struct tx_task_t, siw_tx_task_g);
1191
1192 void siw_stop_tx_thread(int nr_cpu)
1193 {
1194 kthread_stop(siw_tx_thread[nr_cpu]);
1195 wake_up(&per_cpu(siw_tx_task_g, nr_cpu).waiting);
1196 }
1197
1198 int siw_run_sq(void *data)
1199 {
1200 const int nr_cpu = (unsigned int)(long)data;
1201 struct llist_node *active;
1202 struct siw_qp *qp;
1203 struct tx_task_t *tx_task = &per_cpu(siw_tx_task_g, nr_cpu);
1204
1205 init_llist_head(&tx_task->active);
1206 init_waitqueue_head(&tx_task->waiting);
1207
1208 while (1) {
1209 struct llist_node *fifo_list = NULL;
1210
1211 wait_event_interruptible(tx_task->waiting,
1212 !llist_empty(&tx_task->active) ||
1213 kthread_should_stop());
1214
1215 if (kthread_should_stop())
1216 break;
1217
1218 active = llist_del_all(&tx_task->active);
1219
1220 /* llist_del_all() returns the queued QPs newest-first;
1221 * reverse the list so they are resumed in FIFO order.
1222 */
1223 while (active) {
1224 struct llist_node *tmp = active;
1225
1226 active = llist_next(active);
1227 tmp->next = fifo_list;
1228 fifo_list = tmp;
1229 }
1230 while (fifo_list) {
1231 qp = container_of(fifo_list, struct siw_qp, tx_list);
1232 fifo_list = llist_next(fifo_list);
1233 qp->tx_list.next = NULL;
1234
1235 siw_sq_resume(qp);
1236 }
1237 }
1238 active = llist_del_all(&tx_task->active);
1239 if (active) {
1240 llist_for_each_entry(qp, active, tx_list) {
1241 qp->tx_list.next = NULL;
1242 siw_sq_resume(qp);
1243 }
1244 }
1245 return 0;
1246 }
1247
1248 int siw_sq_start(struct siw_qp *qp)
1249 {
1250 if (tx_wqe(qp)->wr_status == SIW_WR_IDLE)
1251 return 0;
1252
1253 if (unlikely(!cpu_online(qp->tx_cpu))) {
1254 siw_put_tx_cpu(qp->tx_cpu);
1255 qp->tx_cpu = siw_get_tx_cpu(qp->sdev);
1256 if (qp->tx_cpu < 0) {
1257 pr_warn("siw: no tx cpu available\n");
1258
1259 return -EIO;
1260 }
1261 }
1262 siw_qp_get(qp);
1263
1264 llist_add(&qp->tx_list, &per_cpu(siw_tx_task_g, qp->tx_cpu).active);
1265
1266 wake_up(&per_cpu(siw_tx_task_g, qp->tx_cpu).waiting);
1267
1268 return 0;
1269 }
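
/*
 * Usage sketch for the machinery above (a summary, not additional driver
 * code): a producer that just queued new work on the SQ either processes
 * it directly or defers to the per-CPU TX thread,
 *
 *	if (process_in_caller_context)
 *		rv = siw_qp_sq_process(qp);	(QP state read lock held)
 *	else
 *		rv = siw_sq_start(qp);		(wake the tx_cpu kthread)
 *
 * where "process_in_caller_context" stands for the policy applied by the
 * verbs layer. siw_sq_start() takes a QP reference that siw_sq_resume()
 * drops again after processing.
 */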