This source file includes the following definitions:
- tcp_enter_memory_pressure
- tcp_leave_memory_pressure
- secs_to_retrans
- retrans_to_secs
- tcp_compute_delivery_rate
- tcp_init_sock
- tcp_tx_timestamp
- tcp_stream_is_readable
- tcp_poll
- tcp_ioctl
- tcp_mark_push
- forced_push
- skb_entail
- tcp_mark_urg
- tcp_should_autocork
- tcp_push
- tcp_splice_data_recv
- __tcp_splice_read
- tcp_splice_read
- sk_stream_alloc_skb
- tcp_xmit_size_goal
- tcp_send_mss
- tcp_remove_empty_skb
- do_tcp_sendpages
- tcp_sendpage_locked
- tcp_sendpage
- tcp_free_fastopen_req
- tcp_sendmsg_fastopen
- tcp_sendmsg_locked
- tcp_sendmsg
- tcp_recv_urg
- tcp_peek_sndq
- tcp_cleanup_rbuf
- tcp_recv_skb
- tcp_read_sock
- tcp_peek_len
- tcp_set_rcvlowat
- tcp_mmap
- tcp_zerocopy_receive
- tcp_update_recv_tstamps
- tcp_recv_timestamp
- tcp_inq_hint
- tcp_recvmsg
- tcp_set_state
- tcp_close_state
- tcp_shutdown
- tcp_check_oom
- tcp_close
- tcp_need_reset
- tcp_rtx_queue_purge
- tcp_write_queue_purge
- tcp_disconnect
- tcp_can_repair_sock
- tcp_repair_set_window
- tcp_repair_options_est
- tcp_enable_tx_delay
- do_tcp_setsockopt
- tcp_setsockopt
- compat_tcp_setsockopt
- tcp_get_info_chrono_stats
- tcp_get_info
- tcp_opt_stats_get_size
- tcp_get_timestamping_opt_stats
- do_tcp_getsockopt
- tcp_getsockopt
- compat_tcp_getsockopt
- __tcp_alloc_md5sig_pool
- tcp_alloc_md5sig_pool
- tcp_get_md5sig_pool
- tcp_md5_hash_skb_data
- tcp_md5_hash_key
- tcp_done
- tcp_abort
- set_thash_entries
- tcp_init_mem
- tcp_init
244 #define pr_fmt(fmt) "TCP: " fmt
245
246 #include <crypto/hash.h>
247 #include <linux/kernel.h>
248 #include <linux/module.h>
249 #include <linux/types.h>
250 #include <linux/fcntl.h>
251 #include <linux/poll.h>
252 #include <linux/inet_diag.h>
253 #include <linux/init.h>
254 #include <linux/fs.h>
255 #include <linux/skbuff.h>
256 #include <linux/scatterlist.h>
257 #include <linux/splice.h>
258 #include <linux/net.h>
259 #include <linux/socket.h>
260 #include <linux/random.h>
261 #include <linux/memblock.h>
262 #include <linux/highmem.h>
263 #include <linux/swap.h>
264 #include <linux/cache.h>
265 #include <linux/err.h>
266 #include <linux/time.h>
267 #include <linux/slab.h>
268 #include <linux/errqueue.h>
269 #include <linux/static_key.h>
270
271 #include <net/icmp.h>
272 #include <net/inet_common.h>
273 #include <net/tcp.h>
274 #include <net/xfrm.h>
275 #include <net/ip.h>
276 #include <net/sock.h>
277
278 #include <linux/uaccess.h>
279 #include <asm/ioctls.h>
280 #include <net/busy_poll.h>
281
282 struct percpu_counter tcp_orphan_count;
283 EXPORT_SYMBOL_GPL(tcp_orphan_count);
284
285 long sysctl_tcp_mem[3] __read_mostly;
286 EXPORT_SYMBOL(sysctl_tcp_mem);
287
288 atomic_long_t tcp_memory_allocated;
289 EXPORT_SYMBOL(tcp_memory_allocated);
290
291 #if IS_ENABLED(CONFIG_SMC)
292 DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
293 EXPORT_SYMBOL(tcp_have_smc);
294 #endif
295
296
297
298
299 struct percpu_counter tcp_sockets_allocated;
300 EXPORT_SYMBOL(tcp_sockets_allocated);
301
302
303
304
305 struct tcp_splice_state {
306 struct pipe_inode_info *pipe;
307 size_t len;
308 unsigned int flags;
309 };
310
311
312
313
314
315
316
317 unsigned long tcp_memory_pressure __read_mostly;
318 EXPORT_SYMBOL_GPL(tcp_memory_pressure);
319
320 DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
321 EXPORT_SYMBOL(tcp_rx_skb_cache_key);
322
323 DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
324
325 void tcp_enter_memory_pressure(struct sock *sk)
326 {
327 unsigned long val;
328
329 if (READ_ONCE(tcp_memory_pressure))
330 return;
331 val = jiffies;
332
333 if (!val)
334 val--;
335 if (!cmpxchg(&tcp_memory_pressure, 0, val))
336 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
337 }
338 EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);
339
340 void tcp_leave_memory_pressure(struct sock *sk)
341 {
342 unsigned long val;
343
344 if (!READ_ONCE(tcp_memory_pressure))
345 return;
346 val = xchg(&tcp_memory_pressure, 0);
347 if (val)
348 NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
349 jiffies_to_msecs(jiffies - val));
350 }
351 EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);
352
353
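/* Convert a user timeout given in seconds into an equivalent number of
 * retransmissions, assuming exponential backoff that starts at @timeout
 * and is capped at @rto_max.
 */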
354 static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
355 {
356 u8 res = 0;
357
358 if (seconds > 0) {
359 int period = timeout;
360
361 res = 1;
362 while (seconds > period && res < 255) {
363 res++;
364 timeout <<= 1;
365 if (timeout > rto_max)
366 timeout = rto_max;
367 period += timeout;
368 }
369 }
370 return res;
371 }
372
373
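/* Inverse of secs_to_retrans(): total time covered by @retrans backoff
 * rounds, in the same units as @timeout.
 */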
374 static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
375 {
376 int period = 0;
377
378 if (retrans > 0) {
379 period = timeout;
380 while (--retrans) {
381 timeout <<= 1;
382 if (timeout > rto_max)
383 timeout = rto_max;
384 period += timeout;
385 }
386 }
387 return period;
388 }
389
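/* Most recent delivery rate sample, converted to bytes per second. */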
390 static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
391 {
392 u32 rate = READ_ONCE(tp->rate_delivered);
393 u32 intv = READ_ONCE(tp->rate_interval_us);
394 u64 rate64 = 0;
395
396 if (rate && intv) {
397 rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
398 do_div(rate64, intv);
399 }
400 return rate64;
401 }
402
403
404
405
406
407
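/* Address-family independent initialization for a new TCP socket,
 * shared by the IPv4 and IPv6 socket creation paths.
 */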
408 void tcp_init_sock(struct sock *sk)
409 {
410 struct inet_connection_sock *icsk = inet_csk(sk);
411 struct tcp_sock *tp = tcp_sk(sk);
412
413 tp->out_of_order_queue = RB_ROOT;
414 sk->tcp_rtx_queue = RB_ROOT;
415 tcp_init_xmit_timers(sk);
416 INIT_LIST_HEAD(&tp->tsq_node);
417 INIT_LIST_HEAD(&tp->tsorted_sent_queue);
418
419 icsk->icsk_rto = TCP_TIMEOUT_INIT;
420 tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
421 minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
422
423
424
425
426
427
428 tp->snd_cwnd = TCP_INIT_CWND;
429
430
431 tp->app_limited = ~0U;
432
433
434
435
436 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
437 tp->snd_cwnd_clamp = ~0;
438 tp->mss_cache = TCP_MSS_DEFAULT;
439
440 tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
441 tcp_assign_congestion_control(sk);
442
443 tp->tsoffset = 0;
444 tp->rack.reo_wnd_steps = 1;
445
446 sk->sk_state = TCP_CLOSE;
447
448 sk->sk_write_space = sk_stream_write_space;
449 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
450
451 icsk->icsk_sync_mss = tcp_sync_mss;
452
453 WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]);
454 WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);
455
456 sk_sockets_allocated_inc(sk);
457 sk->sk_route_forced_caps = NETIF_F_GSO;
458 }
459 EXPORT_SYMBOL(tcp_init_sock);
460
461 static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
462 {
463 struct sk_buff *skb = tcp_write_queue_tail(sk);
464
465 if (tsflags && skb) {
466 struct skb_shared_info *shinfo = skb_shinfo(skb);
467 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
468
469 sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
470 if (tsflags & SOF_TIMESTAMPING_TX_ACK)
471 tcb->txstamp_ack = 1;
472 if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
473 shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
474 }
475 }
476
477 static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
478 int target, struct sock *sk)
479 {
480 int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq);
481
482 if (avail > 0) {
483 if (avail >= target)
484 return true;
485 if (tcp_rmem_pressure(sk))
486 return true;
487 }
488 if (sk->sk_prot->stream_memory_read)
489 return sk->sk_prot->stream_memory_read(sk);
490 return false;
491 }
492
493
494
495
496
497
498
499
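/* Compute the poll()/epoll event mask for a TCP socket. Runs without
 * the socket lock, so socket state is sampled with lockless accessors.
 */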
500 __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
501 {
502 __poll_t mask;
503 struct sock *sk = sock->sk;
504 const struct tcp_sock *tp = tcp_sk(sk);
505 int state;
506
507 sock_poll_wait(file, sock, wait);
508
509 state = inet_sk_state_load(sk);
510 if (state == TCP_LISTEN)
511 return inet_csk_listen_poll(sk);
512
513
514
515
516
517
518 mask = 0;
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547 if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
548 mask |= EPOLLHUP;
549 if (sk->sk_shutdown & RCV_SHUTDOWN)
550 mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
551
552
553 if (state != TCP_SYN_SENT &&
554 (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
555 int target = sock_rcvlowat(sk, 0, INT_MAX);
556
557 if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
558 !sock_flag(sk, SOCK_URGINLINE) &&
559 tp->urg_data)
560 target++;
561
562 if (tcp_stream_is_readable(tp, target, sk))
563 mask |= EPOLLIN | EPOLLRDNORM;
564
565 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
566 if (sk_stream_is_writeable(sk)) {
567 mask |= EPOLLOUT | EPOLLWRNORM;
568 } else {
569 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
570 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
571
572
573
574
575
576
577 smp_mb__after_atomic();
578 if (sk_stream_is_writeable(sk))
579 mask |= EPOLLOUT | EPOLLWRNORM;
580 }
581 } else
582 mask |= EPOLLOUT | EPOLLWRNORM;
583
584 if (tp->urg_data & TCP_URG_VALID)
585 mask |= EPOLLPRI;
586 } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
587
588
589
590
591 mask |= EPOLLOUT | EPOLLWRNORM;
592 }
593
594 smp_rmb();
595 if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
596 mask |= EPOLLERR;
597
598 return mask;
599 }
600 EXPORT_SYMBOL(tcp_poll);
601
602 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
603 {
604 struct tcp_sock *tp = tcp_sk(sk);
605 int answ;
606 bool slow;
607
608 switch (cmd) {
609 case SIOCINQ:
610 if (sk->sk_state == TCP_LISTEN)
611 return -EINVAL;
612
613 slow = lock_sock_fast(sk);
614 answ = tcp_inq(sk);
615 unlock_sock_fast(sk, slow);
616 break;
617 case SIOCATMARK:
618 answ = tp->urg_data &&
619 READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
620 break;
621 case SIOCOUTQ:
622 if (sk->sk_state == TCP_LISTEN)
623 return -EINVAL;
624
625 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
626 answ = 0;
627 else
628 answ = READ_ONCE(tp->write_seq) - tp->snd_una;
629 break;
630 case SIOCOUTQNSD:
631 if (sk->sk_state == TCP_LISTEN)
632 return -EINVAL;
633
634 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
635 answ = 0;
636 else
637 answ = READ_ONCE(tp->write_seq) -
638 READ_ONCE(tp->snd_nxt);
639 break;
640 default:
641 return -ENOIOCTLCMD;
642 }
643
644 return put_user(answ, (int __user *)arg);
645 }
646 EXPORT_SYMBOL(tcp_ioctl);
647
648 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
649 {
650 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
651 tp->pushed_seq = tp->write_seq;
652 }
653
654 static inline bool forced_push(const struct tcp_sock *tp)
655 {
656 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
657 }
658
659 static void skb_entail(struct sock *sk, struct sk_buff *skb)
660 {
661 struct tcp_sock *tp = tcp_sk(sk);
662 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
663
664 skb->csum = 0;
665 tcb->seq = tcb->end_seq = tp->write_seq;
666 tcb->tcp_flags = TCPHDR_ACK;
667 tcb->sacked = 0;
668 __skb_header_release(skb);
669 tcp_add_write_queue_tail(sk, skb);
670 sk_wmem_queued_add(sk, skb->truesize);
671 sk_mem_charge(sk, skb->truesize);
672 if (tp->nonagle & TCP_NAGLE_PUSH)
673 tp->nonagle &= ~TCP_NAGLE_PUSH;
674
675 tcp_slow_start_after_idle_check(sk);
676 }
677
678 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
679 {
680 if (flags & MSG_OOB)
681 tp->snd_up = tp->write_seq;
682 }
683
684
685
686
687
688
689
690
691
692
693
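/* Decide whether to hold back (autocork) a small skb in the hope that
 * more data will be appended: only when the skb is below size_goal,
 * autocorking is enabled, packets are already in flight and bytes are
 * still queued below the socket.
 */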
694 static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
695 int size_goal)
696 {
697 return skb->len < size_goal &&
698 sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
699 !tcp_rtx_queue_empty(sk) &&
700 refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
701 }
702
703 static void tcp_push(struct sock *sk, int flags, int mss_now,
704 int nonagle, int size_goal)
705 {
706 struct tcp_sock *tp = tcp_sk(sk);
707 struct sk_buff *skb;
708
709 skb = tcp_write_queue_tail(sk);
710 if (!skb)
711 return;
712 if (!(flags & MSG_MORE) || forced_push(tp))
713 tcp_mark_push(tp, skb);
714
715 tcp_mark_urg(tp, flags);
716
717 if (tcp_should_autocork(sk, skb, size_goal)) {
718
719
720 if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
721 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
722 set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
723 }
724
725
726
727 if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
728 return;
729 }
730
731 if (flags & MSG_MORE)
732 nonagle = TCP_NAGLE_CORK;
733
734 __tcp_push_pending_frames(sk, mss_now, nonagle);
735 }
736
737 static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
738 unsigned int offset, size_t len)
739 {
740 struct tcp_splice_state *tss = rd_desc->arg.data;
741 int ret;
742
743 ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe,
744 min(rd_desc->count, len), tss->flags);
745 if (ret > 0)
746 rd_desc->count -= ret;
747 return ret;
748 }
749
750 static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
751 {
752
753 read_descriptor_t rd_desc = {
754 .arg.data = tss,
755 .count = tss->len,
756 };
757
758 return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
759 }
760
761
762
763
764
765
766
767
768
769
770
771
772
773 ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
774 struct pipe_inode_info *pipe, size_t len,
775 unsigned int flags)
776 {
777 struct sock *sk = sock->sk;
778 struct tcp_splice_state tss = {
779 .pipe = pipe,
780 .len = len,
781 .flags = flags,
782 };
783 long timeo;
784 ssize_t spliced;
785 int ret;
786
787 sock_rps_record_flow(sk);
788
789
790
791 if (unlikely(*ppos))
792 return -ESPIPE;
793
794 ret = spliced = 0;
795
796 lock_sock(sk);
797
798 timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
799 while (tss.len) {
800 ret = __tcp_splice_read(sk, &tss);
801 if (ret < 0)
802 break;
803 else if (!ret) {
804 if (spliced)
805 break;
806 if (sock_flag(sk, SOCK_DONE))
807 break;
808 if (sk->sk_err) {
809 ret = sock_error(sk);
810 break;
811 }
812 if (sk->sk_shutdown & RCV_SHUTDOWN)
813 break;
814 if (sk->sk_state == TCP_CLOSE) {
815
816
817
818
819 ret = -ENOTCONN;
820 break;
821 }
822 if (!timeo) {
823 ret = -EAGAIN;
824 break;
825 }
826
827
828
829
830 if (!skb_queue_empty(&sk->sk_receive_queue))
831 break;
832 sk_wait_data(sk, &timeo, NULL);
833 if (signal_pending(current)) {
834 ret = sock_intr_errno(timeo);
835 break;
836 }
837 continue;
838 }
839 tss.len -= ret;
840 spliced += ret;
841
842 if (!timeo)
843 break;
844 release_sock(sk);
845 lock_sock(sk);
846
847 if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
848 (sk->sk_shutdown & RCV_SHUTDOWN) ||
849 signal_pending(current))
850 break;
851 }
852
853 release_sock(sk);
854
855 if (spliced)
856 return spliced;
857
858 return ret;
859 }
860 EXPORT_SYMBOL(tcp_splice_read);
861
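/* Allocate an skb for the write queue, reserving headroom for TCP/IP
 * headers. @force_schedule charges the memory unconditionally instead
 * of going through the normal send-buffer accounting check.
 */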
862 struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
863 bool force_schedule)
864 {
865 struct sk_buff *skb;
866
867 if (likely(!size)) {
868 skb = sk->sk_tx_skb_cache;
869 if (skb) {
870 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
871 sk->sk_tx_skb_cache = NULL;
872 pskb_trim(skb, 0);
873 INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
874 skb_shinfo(skb)->tx_flags = 0;
875 memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb));
876 return skb;
877 }
878 }
879
880 size = ALIGN(size, 4);
881
882 if (unlikely(tcp_under_memory_pressure(sk)))
883 sk_mem_reclaim_partial(sk);
884
885 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
886 if (likely(skb)) {
887 bool mem_scheduled;
888
889 if (force_schedule) {
890 mem_scheduled = true;
891 sk_forced_mem_schedule(sk, skb->truesize);
892 } else {
893 mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
894 }
895 if (likely(mem_scheduled)) {
896 skb_reserve(skb, sk->sk_prot->max_header);
897
898
899
900
901 skb->reserved_tailroom = skb->end - skb->tail - size;
902 INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
903 return skb;
904 }
905 __kfree_skb(skb);
906 } else {
907 sk->sk_prot->enter_memory_pressure(sk);
908 sk_stream_moderate_sndbuf(sk);
909 }
910 return NULL;
911 }
912
913 static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
914 int large_allowed)
915 {
916 struct tcp_sock *tp = tcp_sk(sk);
917 u32 new_size_goal, size_goal;
918
919 if (!large_allowed)
920 return mss_now;
921
922
923 new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
924 new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
925
926
927 size_goal = tp->gso_segs * mss_now;
928 if (unlikely(new_size_goal < size_goal ||
929 new_size_goal >= size_goal + mss_now)) {
930 tp->gso_segs = min_t(u16, new_size_goal / mss_now,
931 sk->sk_gso_max_segs);
932 size_goal = tp->gso_segs * mss_now;
933 }
934
935 return max(size_goal, mss_now);
936 }
937
938 static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
939 {
940 int mss_now;
941
942 mss_now = tcp_current_mss(sk);
943 *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
944
945 return mss_now;
946 }
947
948
949
950
951
952
953
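/* Drop the tail skb if nothing was copied into it, so an aborted send
 * does not leave an empty buffer on the write queue.
 */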
954 static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
955 {
956 if (skb && !skb->len) {
957 tcp_unlink_write_queue(skb, sk);
958 if (tcp_write_queue_empty(sk))
959 tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
960 sk_wmem_free_skb(sk, skb);
961 }
962 }
963
964 ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
965 size_t size, int flags)
966 {
967 struct tcp_sock *tp = tcp_sk(sk);
968 int mss_now, size_goal;
969 int err;
970 ssize_t copied;
971 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
972
973 if (IS_ENABLED(CONFIG_DEBUG_VM) &&
974 WARN_ONCE(PageSlab(page), "page must not be a Slab one"))
975 return -EINVAL;
976
977
978
979
980
981 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
982 !tcp_passive_fastopen(sk)) {
983 err = sk_stream_wait_connect(sk, &timeo);
984 if (err != 0)
985 goto out_err;
986 }
987
988 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
989
990 mss_now = tcp_send_mss(sk, &size_goal, flags);
991 copied = 0;
992
993 err = -EPIPE;
994 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
995 goto out_err;
996
997 while (size > 0) {
998 struct sk_buff *skb = tcp_write_queue_tail(sk);
999 int copy, i;
1000 bool can_coalesce;
1001
1002 if (!skb || (copy = size_goal - skb->len) <= 0 ||
1003 !tcp_skb_can_collapse_to(skb)) {
1004 new_segment:
1005 if (!sk_stream_memory_free(sk))
1006 goto wait_for_sndbuf;
1007
1008 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
1009 tcp_rtx_and_write_queues_empty(sk));
1010 if (!skb)
1011 goto wait_for_memory;
1012
1013 #ifdef CONFIG_TLS_DEVICE
1014 skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
1015 #endif
1016 skb_entail(sk, skb);
1017 copy = size_goal;
1018 }
1019
1020 if (copy > size)
1021 copy = size;
1022
1023 i = skb_shinfo(skb)->nr_frags;
1024 can_coalesce = skb_can_coalesce(skb, i, page, offset);
1025 if (!can_coalesce && i >= sysctl_max_skb_frags) {
1026 tcp_mark_push(tp, skb);
1027 goto new_segment;
1028 }
1029 if (!sk_wmem_schedule(sk, copy))
1030 goto wait_for_memory;
1031
1032 if (can_coalesce) {
1033 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1034 } else {
1035 get_page(page);
1036 skb_fill_page_desc(skb, i, page, offset, copy);
1037 }
1038
1039 if (!(flags & MSG_NO_SHARED_FRAGS))
1040 skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
1041
1042 skb->len += copy;
1043 skb->data_len += copy;
1044 skb->truesize += copy;
1045 sk_wmem_queued_add(sk, copy);
1046 sk_mem_charge(sk, copy);
1047 skb->ip_summed = CHECKSUM_PARTIAL;
1048 WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
1049 TCP_SKB_CB(skb)->end_seq += copy;
1050 tcp_skb_pcount_set(skb, 0);
1051
1052 if (!copied)
1053 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1054
1055 copied += copy;
1056 offset += copy;
1057 size -= copy;
1058 if (!size)
1059 goto out;
1060
1061 if (skb->len < size_goal || (flags & MSG_OOB))
1062 continue;
1063
1064 if (forced_push(tp)) {
1065 tcp_mark_push(tp, skb);
1066 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1067 } else if (skb == tcp_send_head(sk))
1068 tcp_push_one(sk, mss_now);
1069 continue;
1070
1071 wait_for_sndbuf:
1072 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1073 wait_for_memory:
1074 tcp_push(sk, flags & ~MSG_MORE, mss_now,
1075 TCP_NAGLE_PUSH, size_goal);
1076
1077 err = sk_stream_wait_memory(sk, &timeo);
1078 if (err != 0)
1079 goto do_error;
1080
1081 mss_now = tcp_send_mss(sk, &size_goal, flags);
1082 }
1083
1084 out:
1085 if (copied) {
1086 tcp_tx_timestamp(sk, sk->sk_tsflags);
1087 if (!(flags & MSG_SENDPAGE_NOTLAST))
1088 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1089 }
1090 return copied;
1091
1092 do_error:
1093 tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk));
1094 if (copied)
1095 goto out;
1096 out_err:
1097
1098 if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
1099 sk->sk_write_space(sk);
1100 tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
1101 }
1102 return sk_stream_error(sk, flags, err);
1103 }
1104 EXPORT_SYMBOL_GPL(do_tcp_sendpages);
1105
1106 int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
1107 size_t size, int flags)
1108 {
1109 if (!(sk->sk_route_caps & NETIF_F_SG))
1110 return sock_no_sendpage_locked(sk, page, offset, size, flags);
1111
1112 tcp_rate_check_app_limited(sk);
1113
1114 return do_tcp_sendpages(sk, page, offset, size, flags);
1115 }
1116 EXPORT_SYMBOL_GPL(tcp_sendpage_locked);
1117
1118 int tcp_sendpage(struct sock *sk, struct page *page, int offset,
1119 size_t size, int flags)
1120 {
1121 int ret;
1122
1123 lock_sock(sk);
1124 ret = tcp_sendpage_locked(sk, page, offset, size, flags);
1125 release_sock(sk);
1126
1127 return ret;
1128 }
1129 EXPORT_SYMBOL(tcp_sendpage);
1130
1131 void tcp_free_fastopen_req(struct tcp_sock *tp)
1132 {
1133 if (tp->fastopen_req) {
1134 kfree(tp->fastopen_req);
1135 tp->fastopen_req = NULL;
1136 }
1137 }
1138
1139 static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
1140 int *copied, size_t size,
1141 struct ubuf_info *uarg)
1142 {
1143 struct tcp_sock *tp = tcp_sk(sk);
1144 struct inet_sock *inet = inet_sk(sk);
1145 struct sockaddr *uaddr = msg->msg_name;
1146 int err, flags;
1147
1148 if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
1149 (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
1150 uaddr->sa_family == AF_UNSPEC))
1151 return -EOPNOTSUPP;
1152 if (tp->fastopen_req)
1153 return -EALREADY;
1154
1155 tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1156 sk->sk_allocation);
1157 if (unlikely(!tp->fastopen_req))
1158 return -ENOBUFS;
1159 tp->fastopen_req->data = msg;
1160 tp->fastopen_req->size = size;
1161 tp->fastopen_req->uarg = uarg;
1162
1163 if (inet->defer_connect) {
1164 err = tcp_connect(sk);
1165
1166 if (err) {
1167 tcp_set_state(sk, TCP_CLOSE);
1168 inet->inet_dport = 0;
1169 sk->sk_route_caps = 0;
1170 }
1171 }
1172 flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
1173 err = __inet_stream_connect(sk->sk_socket, uaddr,
1174 msg->msg_namelen, flags, 1);
1175
1176
1177
1178 if (tp->fastopen_req) {
1179 *copied = tp->fastopen_req->copied;
1180 tcp_free_fastopen_req(tp);
1181 inet->defer_connect = 0;
1182 }
1183 return err;
1184 }
1185
1186 int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
1187 {
1188 struct tcp_sock *tp = tcp_sk(sk);
1189 struct ubuf_info *uarg = NULL;
1190 struct sk_buff *skb;
1191 struct sockcm_cookie sockc;
1192 int flags, err, copied = 0;
1193 int mss_now = 0, size_goal, copied_syn = 0;
1194 int process_backlog = 0;
1195 bool zc = false;
1196 long timeo;
1197
1198 flags = msg->msg_flags;
1199
1200 if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
1201 skb = tcp_write_queue_tail(sk);
1202 uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
1203 if (!uarg) {
1204 err = -ENOBUFS;
1205 goto out_err;
1206 }
1207
1208 zc = sk->sk_route_caps & NETIF_F_SG;
1209 if (!zc)
1210 uarg->zerocopy = 0;
1211 }
1212
1213 if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
1214 !tp->repair) {
1215 err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
1216 if (err == -EINPROGRESS && copied_syn > 0)
1217 goto out;
1218 else if (err)
1219 goto out_err;
1220 }
1221
1222 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1223
1224 tcp_rate_check_app_limited(sk);
1225
1226
1227
1228
1229
1230 if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
1231 !tcp_passive_fastopen(sk)) {
1232 err = sk_stream_wait_connect(sk, &timeo);
1233 if (err != 0)
1234 goto do_error;
1235 }
1236
1237 if (unlikely(tp->repair)) {
1238 if (tp->repair_queue == TCP_RECV_QUEUE) {
1239 copied = tcp_send_rcvq(sk, msg, size);
1240 goto out_nopush;
1241 }
1242
1243 err = -EINVAL;
1244 if (tp->repair_queue == TCP_NO_QUEUE)
1245 goto out_err;
1246
1247
1248 }
1249
1250 sockcm_init(&sockc, sk);
1251 if (msg->msg_controllen) {
1252 err = sock_cmsg_send(sk, msg, &sockc);
1253 if (unlikely(err)) {
1254 err = -EINVAL;
1255 goto out_err;
1256 }
1257 }
1258
1259
1260 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1261
1262
1263 copied = 0;
1264
1265 restart:
1266 mss_now = tcp_send_mss(sk, &size_goal, flags);
1267
1268 err = -EPIPE;
1269 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1270 goto do_error;
1271
1272 while (msg_data_left(msg)) {
1273 int copy = 0;
1274
1275 skb = tcp_write_queue_tail(sk);
1276 if (skb)
1277 copy = size_goal - skb->len;
1278
1279 if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
1280 bool first_skb;
1281
1282 new_segment:
1283 if (!sk_stream_memory_free(sk))
1284 goto wait_for_sndbuf;
1285
1286 if (unlikely(process_backlog >= 16)) {
1287 process_backlog = 0;
1288 if (sk_flush_backlog(sk))
1289 goto restart;
1290 }
1291 first_skb = tcp_rtx_and_write_queues_empty(sk);
1292 skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
1293 first_skb);
1294 if (!skb)
1295 goto wait_for_memory;
1296
1297 process_backlog++;
1298 skb->ip_summed = CHECKSUM_PARTIAL;
1299
1300 skb_entail(sk, skb);
1301 copy = size_goal;
1302
1303
1304
1305
1306
1307 if (tp->repair)
1308 TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
1309 }
1310
1311
1312 if (copy > msg_data_left(msg))
1313 copy = msg_data_left(msg);
1314
1315
1316 if (skb_availroom(skb) > 0 && !zc) {
1317
1318 copy = min_t(int, copy, skb_availroom(skb));
1319 err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
1320 if (err)
1321 goto do_fault;
1322 } else if (!zc) {
1323 bool merge = true;
1324 int i = skb_shinfo(skb)->nr_frags;
1325 struct page_frag *pfrag = sk_page_frag(sk);
1326
1327 if (!sk_page_frag_refill(sk, pfrag))
1328 goto wait_for_memory;
1329
1330 if (!skb_can_coalesce(skb, i, pfrag->page,
1331 pfrag->offset)) {
1332 if (i >= sysctl_max_skb_frags) {
1333 tcp_mark_push(tp, skb);
1334 goto new_segment;
1335 }
1336 merge = false;
1337 }
1338
1339 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1340
1341 if (!sk_wmem_schedule(sk, copy))
1342 goto wait_for_memory;
1343
1344 err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
1345 pfrag->page,
1346 pfrag->offset,
1347 copy);
1348 if (err)
1349 goto do_error;
1350
1351
1352 if (merge) {
1353 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1354 } else {
1355 skb_fill_page_desc(skb, i, pfrag->page,
1356 pfrag->offset, copy);
1357 page_ref_inc(pfrag->page);
1358 }
1359 pfrag->offset += copy;
1360 } else {
1361 err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
1362 if (err == -EMSGSIZE || err == -EEXIST) {
1363 tcp_mark_push(tp, skb);
1364 goto new_segment;
1365 }
1366 if (err < 0)
1367 goto do_error;
1368 copy = err;
1369 }
1370
1371 if (!copied)
1372 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1373
1374 WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
1375 TCP_SKB_CB(skb)->end_seq += copy;
1376 tcp_skb_pcount_set(skb, 0);
1377
1378 copied += copy;
1379 if (!msg_data_left(msg)) {
1380 if (unlikely(flags & MSG_EOR))
1381 TCP_SKB_CB(skb)->eor = 1;
1382 goto out;
1383 }
1384
1385 if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
1386 continue;
1387
1388 if (forced_push(tp)) {
1389 tcp_mark_push(tp, skb);
1390 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1391 } else if (skb == tcp_send_head(sk))
1392 tcp_push_one(sk, mss_now);
1393 continue;
1394
1395 wait_for_sndbuf:
1396 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1397 wait_for_memory:
1398 if (copied)
1399 tcp_push(sk, flags & ~MSG_MORE, mss_now,
1400 TCP_NAGLE_PUSH, size_goal);
1401
1402 err = sk_stream_wait_memory(sk, &timeo);
1403 if (err != 0)
1404 goto do_error;
1405
1406 mss_now = tcp_send_mss(sk, &size_goal, flags);
1407 }
1408
1409 out:
1410 if (copied) {
1411 tcp_tx_timestamp(sk, sockc.tsflags);
1412 tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
1413 }
1414 out_nopush:
1415 sock_zerocopy_put(uarg);
1416 return copied + copied_syn;
1417
1418 do_error:
1419 skb = tcp_write_queue_tail(sk);
1420 do_fault:
1421 tcp_remove_empty_skb(sk, skb);
1422
1423 if (copied + copied_syn)
1424 goto out;
1425 out_err:
1426 sock_zerocopy_put_abort(uarg, true);
1427 err = sk_stream_error(sk, flags, err);
1428
1429 if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
1430 sk->sk_write_space(sk);
1431 tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
1432 }
1433 return err;
1434 }
1435 EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
1436
1437 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
1438 {
1439 int ret;
1440
1441 lock_sock(sk);
1442 ret = tcp_sendmsg_locked(sk, msg, size);
1443 release_sock(sk);
1444
1445 return ret;
1446 }
1447 EXPORT_SYMBOL(tcp_sendmsg);
1448
1449
1450
1451
1452
1453
1454 static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
1455 {
1456 struct tcp_sock *tp = tcp_sk(sk);
1457
1458
1459 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1460 tp->urg_data == TCP_URG_READ)
1461 return -EINVAL;
1462
1463 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1464 return -ENOTCONN;
1465
1466 if (tp->urg_data & TCP_URG_VALID) {
1467 int err = 0;
1468 char c = tp->urg_data;
1469
1470 if (!(flags & MSG_PEEK))
1471 tp->urg_data = TCP_URG_READ;
1472
1473
1474 msg->msg_flags |= MSG_OOB;
1475
1476 if (len > 0) {
1477 if (!(flags & MSG_TRUNC))
1478 err = memcpy_to_msg(msg, &c, 1);
1479 len = 1;
1480 } else
1481 msg->msg_flags |= MSG_TRUNC;
1482
1483 return err ? -EFAULT : len;
1484 }
1485
1486 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1487 return 0;
1488
1489
1490
1491
1492
1493
1494
1495 return -EAGAIN;
1496 }
1497
1498 static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
1499 {
1500 struct sk_buff *skb;
1501 int copied = 0, err = 0;
1502
1503
1504
1505 skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
1506 err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1507 if (err)
1508 return err;
1509 copied += skb->len;
1510 }
1511
1512 skb_queue_walk(&sk->sk_write_queue, skb) {
1513 err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
1514 if (err)
1515 break;
1516
1517 copied += skb->len;
1518 }
1519
1520 return err ?: copied;
1521 }
1522
1523
1524
1525
1526
1527
1528
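/* Called after data has been copied to user space: decide whether the
 * freed receive space or a pending delayed ACK makes it worth sending
 * an ACK right away to re-open the receive window.
 */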
1529 static void tcp_cleanup_rbuf(struct sock *sk, int copied)
1530 {
1531 struct tcp_sock *tp = tcp_sk(sk);
1532 bool time_to_ack = false;
1533
1534 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1535
1536 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1537 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1538 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1539
1540 if (inet_csk_ack_scheduled(sk)) {
1541 const struct inet_connection_sock *icsk = inet_csk(sk);
1542
1543
1544 if (icsk->icsk_ack.blocked ||
1545
1546 tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1547
1548
1549
1550
1551
1552
1553 (copied > 0 &&
1554 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1555 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1556 !inet_csk_in_pingpong_mode(sk))) &&
1557 !atomic_read(&sk->sk_rmem_alloc)))
1558 time_to_ack = true;
1559 }
1560
1561
1562
1563
1564
1565
1566
1567 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1568 __u32 rcv_window_now = tcp_receive_window(tp);
1569
1570
1571 if (2*rcv_window_now <= tp->window_clamp) {
1572 __u32 new_window = __tcp_select_window(sk);
1573
1574
1575
1576
1577
1578
1579 if (new_window && new_window >= 2 * rcv_window_now)
1580 time_to_ack = true;
1581 }
1582 }
1583 if (time_to_ack)
1584 tcp_send_ack(sk);
1585 }
1586
1587 static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1588 {
1589 struct sk_buff *skb;
1590 u32 offset;
1591
1592 while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
1593 offset = seq - TCP_SKB_CB(skb)->seq;
1594 if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
1595 pr_err_once("%s: found a SYN, please report !\n", __func__);
1596 offset--;
1597 }
1598 if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
1599 *off = offset;
1600 return skb;
1601 }
1602
1603
1604
1605
1606 sk_eat_skb(sk, skb);
1607 }
1608 return NULL;
1609 }
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1623 sk_read_actor_t recv_actor)
1624 {
1625 struct sk_buff *skb;
1626 struct tcp_sock *tp = tcp_sk(sk);
1627 u32 seq = tp->copied_seq;
1628 u32 offset;
1629 int copied = 0;
1630
1631 if (sk->sk_state == TCP_LISTEN)
1632 return -ENOTCONN;
1633 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1634 if (offset < skb->len) {
1635 int used;
1636 size_t len;
1637
1638 len = skb->len - offset;
1639
1640 if (tp->urg_data) {
1641 u32 urg_offset = tp->urg_seq - seq;
1642 if (urg_offset < len)
1643 len = urg_offset;
1644 if (!len)
1645 break;
1646 }
1647 used = recv_actor(desc, skb, offset, len);
1648 if (used <= 0) {
1649 if (!copied)
1650 copied = used;
1651 break;
1652 } else if (used <= len) {
1653 seq += used;
1654 copied += used;
1655 offset += used;
1656 }
1657
1658
1659
1660
1661
1662 skb = tcp_recv_skb(sk, seq - 1, &offset);
1663 if (!skb)
1664 break;
1665
1666
1667
1668 if (offset + 1 != skb->len)
1669 continue;
1670 }
1671 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
1672 sk_eat_skb(sk, skb);
1673 ++seq;
1674 break;
1675 }
1676 sk_eat_skb(sk, skb);
1677 if (!desc->count)
1678 break;
1679 WRITE_ONCE(tp->copied_seq, seq);
1680 }
1681 WRITE_ONCE(tp->copied_seq, seq);
1682
1683 tcp_rcv_space_adjust(sk);
1684
1685
1686 if (copied > 0) {
1687 tcp_recv_skb(sk, seq, &offset);
1688 tcp_cleanup_rbuf(sk, copied);
1689 }
1690 return copied;
1691 }
1692 EXPORT_SYMBOL(tcp_read_sock);
1693
1694 int tcp_peek_len(struct socket *sock)
1695 {
1696 return tcp_inq(sock->sk);
1697 }
1698 EXPORT_SYMBOL(tcp_peek_len);
1699
1700
1701 int tcp_set_rcvlowat(struct sock *sk, int val)
1702 {
1703 int cap;
1704
1705 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1706 cap = sk->sk_rcvbuf >> 1;
1707 else
1708 cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1;
1709 val = min(val, cap);
1710 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1711
1712
1713 tcp_data_ready(sk);
1714
1715 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1716 return 0;
1717
1718 val <<= 1;
1719 if (val > sk->sk_rcvbuf) {
1720 WRITE_ONCE(sk->sk_rcvbuf, val);
1721 tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
1722 }
1723 return 0;
1724 }
1725 EXPORT_SYMBOL(tcp_set_rcvlowat);
1726
1727 #ifdef CONFIG_MMU
1728 static const struct vm_operations_struct tcp_vm_ops = {
1729 };
1730
1731 int tcp_mmap(struct file *file, struct socket *sock,
1732 struct vm_area_struct *vma)
1733 {
1734 if (vma->vm_flags & (VM_WRITE | VM_EXEC))
1735 return -EPERM;
1736 vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
1737
1738
1739 vma->vm_flags |= VM_MIXEDMAP;
1740
1741 vma->vm_ops = &tcp_vm_ops;
1742 return 0;
1743 }
1744 EXPORT_SYMBOL(tcp_mmap);
1745
1746 static int tcp_zerocopy_receive(struct sock *sk,
1747 struct tcp_zerocopy_receive *zc)
1748 {
1749 unsigned long address = (unsigned long)zc->address;
1750 const skb_frag_t *frags = NULL;
1751 u32 length = 0, seq, offset;
1752 struct vm_area_struct *vma;
1753 struct sk_buff *skb = NULL;
1754 struct tcp_sock *tp;
1755 int inq;
1756 int ret;
1757
1758 if (address & (PAGE_SIZE - 1) || address != zc->address)
1759 return -EINVAL;
1760
1761 if (sk->sk_state == TCP_LISTEN)
1762 return -ENOTCONN;
1763
1764 sock_rps_record_flow(sk);
1765
1766 down_read(&current->mm->mmap_sem);
1767
1768 vma = find_vma(current->mm, address);
1769 if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) {
1770 up_read(&current->mm->mmap_sem);
1771 return -EINVAL;
1772 }
1773 zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
1774
1775 tp = tcp_sk(sk);
1776 seq = tp->copied_seq;
1777 inq = tcp_inq(sk);
1778 zc->length = min_t(u32, zc->length, inq);
1779 zc->length &= ~(PAGE_SIZE - 1);
1780 if (zc->length) {
1781 zap_page_range(vma, address, zc->length);
1782 zc->recv_skip_hint = 0;
1783 } else {
1784 zc->recv_skip_hint = inq;
1785 }
1786 ret = 0;
1787 while (length + PAGE_SIZE <= zc->length) {
1788 if (zc->recv_skip_hint < PAGE_SIZE) {
1789 if (skb) {
1790 skb = skb->next;
1791 offset = seq - TCP_SKB_CB(skb)->seq;
1792 } else {
1793 skb = tcp_recv_skb(sk, seq, &offset);
1794 }
1795
1796 zc->recv_skip_hint = skb->len - offset;
1797 offset -= skb_headlen(skb);
1798 if ((int)offset < 0 || skb_has_frag_list(skb))
1799 break;
1800 frags = skb_shinfo(skb)->frags;
1801 while (offset) {
1802 if (skb_frag_size(frags) > offset)
1803 goto out;
1804 offset -= skb_frag_size(frags);
1805 frags++;
1806 }
1807 }
1808 if (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags)) {
1809 int remaining = zc->recv_skip_hint;
1810
1811 while (remaining && (skb_frag_size(frags) != PAGE_SIZE ||
1812 skb_frag_off(frags))) {
1813 remaining -= skb_frag_size(frags);
1814 frags++;
1815 }
1816 zc->recv_skip_hint -= remaining;
1817 break;
1818 }
1819 ret = vm_insert_page(vma, address + length,
1820 skb_frag_page(frags));
1821 if (ret)
1822 break;
1823 length += PAGE_SIZE;
1824 seq += PAGE_SIZE;
1825 zc->recv_skip_hint -= PAGE_SIZE;
1826 frags++;
1827 }
1828 out:
1829 up_read(&current->mm->mmap_sem);
1830 if (length) {
1831 WRITE_ONCE(tp->copied_seq, seq);
1832 tcp_rcv_space_adjust(sk);
1833
1834
1835 tcp_recv_skb(sk, seq, &offset);
1836 tcp_cleanup_rbuf(sk, length);
1837 ret = 0;
1838 if (length == zc->length)
1839 zc->recv_skip_hint = 0;
1840 } else {
1841 if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
1842 ret = -EIO;
1843 }
1844 zc->length = length;
1845 return ret;
1846 }
1847 #endif
1848
1849 static void tcp_update_recv_tstamps(struct sk_buff *skb,
1850 struct scm_timestamping_internal *tss)
1851 {
1852 if (skb->tstamp)
1853 tss->ts[0] = ktime_to_timespec64(skb->tstamp);
1854 else
1855 tss->ts[0] = (struct timespec64) {0};
1856
1857 if (skb_hwtstamps(skb)->hwtstamp)
1858 tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
1859 else
1860 tss->ts[2] = (struct timespec64) {0};
1861 }
1862
1863
1864 static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
1865 struct scm_timestamping_internal *tss)
1866 {
1867 int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
1868 bool has_timestamping = false;
1869
1870 if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
1871 if (sock_flag(sk, SOCK_RCVTSTAMP)) {
1872 if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
1873 if (new_tstamp) {
1874 struct __kernel_timespec kts = {tss->ts[0].tv_sec, tss->ts[0].tv_nsec};
1875
1876 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
1877 sizeof(kts), &kts);
1878 } else {
1879 struct timespec ts_old = timespec64_to_timespec(tss->ts[0]);
1880
1881 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
1882 sizeof(ts_old), &ts_old);
1883 }
1884 } else {
1885 if (new_tstamp) {
1886 struct __kernel_sock_timeval stv;
1887
1888 stv.tv_sec = tss->ts[0].tv_sec;
1889 stv.tv_usec = tss->ts[0].tv_nsec / 1000;
1890 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
1891 sizeof(stv), &stv);
1892 } else {
1893 struct __kernel_old_timeval tv;
1894
1895 tv.tv_sec = tss->ts[0].tv_sec;
1896 tv.tv_usec = tss->ts[0].tv_nsec / 1000;
1897 put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
1898 sizeof(tv), &tv);
1899 }
1900 }
1901 }
1902
1903 if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
1904 has_timestamping = true;
1905 else
1906 tss->ts[0] = (struct timespec64) {0};
1907 }
1908
1909 if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
1910 if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
1911 has_timestamping = true;
1912 else
1913 tss->ts[2] = (struct timespec64) {0};
1914 }
1915
1916 if (has_timestamping) {
1917 tss->ts[1] = (struct timespec64) {0};
1918 if (sock_flag(sk, SOCK_TSTAMP_NEW))
1919 put_cmsg_scm_timestamping64(msg, tss);
1920 else
1921 put_cmsg_scm_timestamping(msg, tss);
1922 }
1923 }
1924
1925 static int tcp_inq_hint(struct sock *sk)
1926 {
1927 const struct tcp_sock *tp = tcp_sk(sk);
1928 u32 copied_seq = READ_ONCE(tp->copied_seq);
1929 u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
1930 int inq;
1931
1932 inq = rcv_nxt - copied_seq;
1933 if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
1934 lock_sock(sk);
1935 inq = tp->rcv_nxt - tp->copied_seq;
1936 release_sock(sk);
1937 }
1938
1939
1940
1941 if (inq == 0 && sock_flag(sk, SOCK_DONE))
1942 inq = 1;
1943 return inq;
1944 }
1945
1946
1947
1948
1949
1950
1951
1952
1953
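/* Copy data from the receive queue to user space, sleeping as allowed
 * by @nonblock/@flags and handling urgent data, MSG_PEEK, repair mode
 * and receive timestamps.
 */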
1954 int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
1955 int flags, int *addr_len)
1956 {
1957 struct tcp_sock *tp = tcp_sk(sk);
1958 int copied = 0;
1959 u32 peek_seq;
1960 u32 *seq;
1961 unsigned long used;
1962 int err, inq;
1963 int target;
1964 long timeo;
1965 struct sk_buff *skb, *last;
1966 u32 urg_hole = 0;
1967 struct scm_timestamping_internal tss;
1968 int cmsg_flags;
1969
1970 if (unlikely(flags & MSG_ERRQUEUE))
1971 return inet_recv_error(sk, msg, len, addr_len);
1972
1973 if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) &&
1974 (sk->sk_state == TCP_ESTABLISHED))
1975 sk_busy_loop(sk, nonblock);
1976
1977 lock_sock(sk);
1978
1979 err = -ENOTCONN;
1980 if (sk->sk_state == TCP_LISTEN)
1981 goto out;
1982
1983 cmsg_flags = tp->recvmsg_inq ? 1 : 0;
1984 timeo = sock_rcvtimeo(sk, nonblock);
1985
1986
1987 if (flags & MSG_OOB)
1988 goto recv_urg;
1989
1990 if (unlikely(tp->repair)) {
1991 err = -EPERM;
1992 if (!(flags & MSG_PEEK))
1993 goto out;
1994
1995 if (tp->repair_queue == TCP_SEND_QUEUE)
1996 goto recv_sndq;
1997
1998 err = -EINVAL;
1999 if (tp->repair_queue == TCP_NO_QUEUE)
2000 goto out;
2001
2002
2003 }
2004
2005 seq = &tp->copied_seq;
2006 if (flags & MSG_PEEK) {
2007 peek_seq = tp->copied_seq;
2008 seq = &peek_seq;
2009 }
2010
2011 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
2012
2013 do {
2014 u32 offset;
2015
2016
2017 if (tp->urg_data && tp->urg_seq == *seq) {
2018 if (copied)
2019 break;
2020 if (signal_pending(current)) {
2021 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
2022 break;
2023 }
2024 }
2025
2026
2027
2028 last = skb_peek_tail(&sk->sk_receive_queue);
2029 skb_queue_walk(&sk->sk_receive_queue, skb) {
2030 last = skb;
2031
2032
2033
2034 if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
2035 "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
2036 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
2037 flags))
2038 break;
2039
2040 offset = *seq - TCP_SKB_CB(skb)->seq;
2041 if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
2042 pr_err_once("%s: found a SYN, please report !\n", __func__);
2043 offset--;
2044 }
2045 if (offset < skb->len)
2046 goto found_ok_skb;
2047 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2048 goto found_fin_ok;
2049 WARN(!(flags & MSG_PEEK),
2050 "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
2051 *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
2052 }
2053
2054
2055
2056 if (copied >= target && !sk->sk_backlog.tail)
2057 break;
2058
2059 if (copied) {
2060 if (sk->sk_err ||
2061 sk->sk_state == TCP_CLOSE ||
2062 (sk->sk_shutdown & RCV_SHUTDOWN) ||
2063 !timeo ||
2064 signal_pending(current))
2065 break;
2066 } else {
2067 if (sock_flag(sk, SOCK_DONE))
2068 break;
2069
2070 if (sk->sk_err) {
2071 copied = sock_error(sk);
2072 break;
2073 }
2074
2075 if (sk->sk_shutdown & RCV_SHUTDOWN)
2076 break;
2077
2078 if (sk->sk_state == TCP_CLOSE) {
2079
2080
2081
2082 copied = -ENOTCONN;
2083 break;
2084 }
2085
2086 if (!timeo) {
2087 copied = -EAGAIN;
2088 break;
2089 }
2090
2091 if (signal_pending(current)) {
2092 copied = sock_intr_errno(timeo);
2093 break;
2094 }
2095 }
2096
2097 tcp_cleanup_rbuf(sk, copied);
2098
2099 if (copied >= target) {
2100
2101 release_sock(sk);
2102 lock_sock(sk);
2103 } else {
2104 sk_wait_data(sk, &timeo, last);
2105 }
2106
2107 if ((flags & MSG_PEEK) &&
2108 (peek_seq - copied - urg_hole != tp->copied_seq)) {
2109 net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
2110 current->comm,
2111 task_pid_nr(current));
2112 peek_seq = tp->copied_seq;
2113 }
2114 continue;
2115
2116 found_ok_skb:
2117
2118 used = skb->len - offset;
2119 if (len < used)
2120 used = len;
2121
2122
2123 if (tp->urg_data) {
2124 u32 urg_offset = tp->urg_seq - *seq;
2125 if (urg_offset < used) {
2126 if (!urg_offset) {
2127 if (!sock_flag(sk, SOCK_URGINLINE)) {
2128 WRITE_ONCE(*seq, *seq + 1);
2129 urg_hole++;
2130 offset++;
2131 used--;
2132 if (!used)
2133 goto skip_copy;
2134 }
2135 } else
2136 used = urg_offset;
2137 }
2138 }
2139
2140 if (!(flags & MSG_TRUNC)) {
2141 err = skb_copy_datagram_msg(skb, offset, msg, used);
2142 if (err) {
2143
2144 if (!copied)
2145 copied = -EFAULT;
2146 break;
2147 }
2148 }
2149
2150 WRITE_ONCE(*seq, *seq + used);
2151 copied += used;
2152 len -= used;
2153
2154 tcp_rcv_space_adjust(sk);
2155
2156 skip_copy:
2157 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
2158 tp->urg_data = 0;
2159 tcp_fast_path_check(sk);
2160 }
2161
2162 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2163 tcp_update_recv_tstamps(skb, &tss);
2164 cmsg_flags |= 2;
2165 }
2166
2167 if (used + offset < skb->len)
2168 continue;
2169
2170 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2171 goto found_fin_ok;
2172 if (!(flags & MSG_PEEK))
2173 sk_eat_skb(sk, skb);
2174 continue;
2175
2176 found_fin_ok:
2177
2178 WRITE_ONCE(*seq, *seq + 1);
2179 if (!(flags & MSG_PEEK))
2180 sk_eat_skb(sk, skb);
2181 break;
2182 } while (len > 0);
2183
2184
2185
2186
2187
2188
2189 tcp_cleanup_rbuf(sk, copied);
2190
2191 release_sock(sk);
2192
2193 if (cmsg_flags) {
2194 if (cmsg_flags & 2)
2195 tcp_recv_timestamp(msg, sk, &tss);
2196 if (cmsg_flags & 1) {
2197 inq = tcp_inq_hint(sk);
2198 put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
2199 }
2200 }
2201
2202 return copied;
2203
2204 out:
2205 release_sock(sk);
2206 return err;
2207
2208 recv_urg:
2209 err = tcp_recv_urg(sk, msg, len, flags);
2210 goto out;
2211
2212 recv_sndq:
2213 err = tcp_peek_sndq(sk, msg, len);
2214 goto out;
2215 }
2216 EXPORT_SYMBOL(tcp_recvmsg);
2217
2218 void tcp_set_state(struct sock *sk, int state)
2219 {
2220 int oldstate = sk->sk_state;
2221
2222
2223
2224
2225
2226
2227
2228
2229 BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
2230 BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
2231 BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
2232 BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
2233 BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
2234 BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
2235 BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
2236 BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
2237 BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
2238 BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
2239 BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
2240 BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
2241 BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
2242
2243 if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
2244 tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
2245
2246 switch (state) {
2247 case TCP_ESTABLISHED:
2248 if (oldstate != TCP_ESTABLISHED)
2249 TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2250 break;
2251
2252 case TCP_CLOSE:
2253 if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
2254 TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
2255
2256 sk->sk_prot->unhash(sk);
2257 if (inet_csk(sk)->icsk_bind_hash &&
2258 !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
2259 inet_put_port(sk);
2260
2261 default:
2262 if (oldstate == TCP_ESTABLISHED)
2263 TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2264 }
2265
2266
2267
2268
2269 inet_sk_state_store(sk, state);
2270 }
2271 EXPORT_SYMBOL_GPL(tcp_set_state);
2272
2273
2274
2275
2276
2277
2278
2279
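/* State an endpoint moves to when close() is called in each TCP state;
 * the TCP_ACTION_FIN bit indicates that a FIN must also be sent.
 */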
2280 static const unsigned char new_state[16] = {
2281
2282 [0] = TCP_CLOSE,
2283 [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2284 [TCP_SYN_SENT] = TCP_CLOSE,
2285 [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN,
2286 [TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
2287 [TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
2288 [TCP_TIME_WAIT] = TCP_CLOSE,
2289 [TCP_CLOSE] = TCP_CLOSE,
2290 [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN,
2291 [TCP_LAST_ACK] = TCP_LAST_ACK,
2292 [TCP_LISTEN] = TCP_CLOSE,
2293 [TCP_CLOSING] = TCP_CLOSING,
2294 [TCP_NEW_SYN_RECV] = TCP_CLOSE,
2295 };
2296
2297 static int tcp_close_state(struct sock *sk)
2298 {
2299 int next = (int)new_state[sk->sk_state];
2300 int ns = next & TCP_STATE_MASK;
2301
2302 tcp_set_state(sk, ns);
2303
2304 return next & TCP_ACTION_FIN;
2305 }
2306
2307
2308
2309
2310
2311
2312 void tcp_shutdown(struct sock *sk, int how)
2313 {
2314
2315
2316
2317
2318 if (!(how & SEND_SHUTDOWN))
2319 return;
2320
2321
2322 if ((1 << sk->sk_state) &
2323 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
2324 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
2325
2326 if (tcp_close_state(sk))
2327 tcp_send_fin(sk);
2328 }
2329 }
2330 EXPORT_SYMBOL(tcp_shutdown);
2331
2332 bool tcp_check_oom(struct sock *sk, int shift)
2333 {
2334 bool too_many_orphans, out_of_socket_memory;
2335
2336 too_many_orphans = tcp_too_many_orphans(sk, shift);
2337 out_of_socket_memory = tcp_out_of_memory(sk);
2338
2339 if (too_many_orphans)
2340 net_info_ratelimited("too many orphaned sockets\n");
2341 if (out_of_socket_memory)
2342 net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
2343 return too_many_orphans || out_of_socket_memory;
2344 }
2345
2346 void tcp_close(struct sock *sk, long timeout)
2347 {
2348 struct sk_buff *skb;
2349 int data_was_unread = 0;
2350 int state;
2351
2352 lock_sock(sk);
2353 sk->sk_shutdown = SHUTDOWN_MASK;
2354
2355 if (sk->sk_state == TCP_LISTEN) {
2356 tcp_set_state(sk, TCP_CLOSE);
2357
2358
2359 inet_csk_listen_stop(sk);
2360
2361 goto adjudge_to_death;
2362 }
2363
2364
2365
2366
2367
2368 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
2369 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
2370
2371 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2372 len--;
2373 data_was_unread += len;
2374 __kfree_skb(skb);
2375 }
2376
2377 sk_mem_reclaim(sk);
2378
2379
2380 if (sk->sk_state == TCP_CLOSE)
2381 goto adjudge_to_death;
2382
2383
2384
2385
2386
2387
2388
2389
2390 if (unlikely(tcp_sk(sk)->repair)) {
2391 sk->sk_prot->disconnect(sk, 0);
2392 } else if (data_was_unread) {
2393
2394 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
2395 tcp_set_state(sk, TCP_CLOSE);
2396 tcp_send_active_reset(sk, sk->sk_allocation);
2397 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
2398
2399 sk->sk_prot->disconnect(sk, 0);
2400 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
2401 } else if (tcp_close_state(sk)) {
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431 tcp_send_fin(sk);
2432 }
2433
2434 sk_stream_wait_close(sk, timeout);
2435
2436 adjudge_to_death:
2437 state = sk->sk_state;
2438 sock_hold(sk);
2439 sock_orphan(sk);
2440
2441 local_bh_disable();
2442 bh_lock_sock(sk);
2443
2444 __release_sock(sk);
2445
2446 percpu_counter_inc(sk->sk_prot->orphan_count);
2447
2448
2449 if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
2450 goto out;
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466 if (sk->sk_state == TCP_FIN_WAIT2) {
2467 struct tcp_sock *tp = tcp_sk(sk);
2468 if (tp->linger2 < 0) {
2469 tcp_set_state(sk, TCP_CLOSE);
2470 tcp_send_active_reset(sk, GFP_ATOMIC);
2471 __NET_INC_STATS(sock_net(sk),
2472 LINUX_MIB_TCPABORTONLINGER);
2473 } else {
2474 const int tmo = tcp_fin_time(sk);
2475
2476 if (tmo > TCP_TIMEWAIT_LEN) {
2477 inet_csk_reset_keepalive_timer(sk,
2478 tmo - TCP_TIMEWAIT_LEN);
2479 } else {
2480 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2481 goto out;
2482 }
2483 }
2484 }
2485 if (sk->sk_state != TCP_CLOSE) {
2486 sk_mem_reclaim(sk);
2487 if (tcp_check_oom(sk, 0)) {
2488 tcp_set_state(sk, TCP_CLOSE);
2489 tcp_send_active_reset(sk, GFP_ATOMIC);
2490 __NET_INC_STATS(sock_net(sk),
2491 LINUX_MIB_TCPABORTONMEMORY);
2492 } else if (!check_net(sock_net(sk))) {
2493
2494 tcp_set_state(sk, TCP_CLOSE);
2495 }
2496 }
2497
2498 if (sk->sk_state == TCP_CLOSE) {
2499 struct request_sock *req;
2500
2501 req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
2502 lockdep_sock_is_held(sk));
2503
2504
2505
2506
2507 if (req)
2508 reqsk_fastopen_remove(sk, req, false);
2509 inet_csk_destroy_sock(sk);
2510 }
2511
2512
2513 out:
2514 bh_unlock_sock(sk);
2515 local_bh_enable();
2516 release_sock(sk);
2517 sock_put(sk);
2518 }
2519 EXPORT_SYMBOL(tcp_close);
2520
2521
2522
2523 static inline bool tcp_need_reset(int state)
2524 {
2525 return (1 << state) &
2526 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2527 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2528 }
2529
2530 static void tcp_rtx_queue_purge(struct sock *sk)
2531 {
2532 struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
2533
2534 tcp_sk(sk)->highest_sack = NULL;
2535 while (p) {
2536 struct sk_buff *skb = rb_to_skb(p);
2537
2538 p = rb_next(p);
2539
2540
2541
2542 tcp_rtx_queue_unlink(skb, sk);
2543 sk_wmem_free_skb(sk, skb);
2544 }
2545 }
2546
2547 void tcp_write_queue_purge(struct sock *sk)
2548 {
2549 struct sk_buff *skb;
2550
2551 tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
2552 while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
2553 tcp_skb_tsorted_anchor_cleanup(skb);
2554 sk_wmem_free_skb(sk, skb);
2555 }
2556 tcp_rtx_queue_purge(sk);
2557 skb = sk->sk_tx_skb_cache;
2558 if (skb) {
2559 __kfree_skb(skb);
2560 sk->sk_tx_skb_cache = NULL;
2561 }
2562 INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
2563 sk_mem_reclaim(sk);
2564 tcp_clear_all_retrans_hints(tcp_sk(sk));
2565 tcp_sk(sk)->packets_out = 0;
2566 inet_csk(sk)->icsk_backoff = 0;
2567 }
2568
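/* Return the socket to an unconnected state so it can be reused,
 * aborting any established connection and clearing the queues and all
 * per-connection state.
 */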
2569 int tcp_disconnect(struct sock *sk, int flags)
2570 {
2571 struct inet_sock *inet = inet_sk(sk);
2572 struct inet_connection_sock *icsk = inet_csk(sk);
2573 struct tcp_sock *tp = tcp_sk(sk);
2574 int old_state = sk->sk_state;
2575 u32 seq;
2576
2577 if (old_state != TCP_CLOSE)
2578 tcp_set_state(sk, TCP_CLOSE);
2579
2580
2581 if (old_state == TCP_LISTEN) {
2582 inet_csk_listen_stop(sk);
2583 } else if (unlikely(tp->repair)) {
2584 sk->sk_err = ECONNABORTED;
2585 } else if (tcp_need_reset(old_state) ||
2586 (tp->snd_nxt != tp->write_seq &&
2587 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2588
2589
2590
2591 tcp_send_active_reset(sk, gfp_any());
2592 sk->sk_err = ECONNRESET;
2593 } else if (old_state == TCP_SYN_SENT)
2594 sk->sk_err = ECONNRESET;
2595
2596 tcp_clear_xmit_timers(sk);
2597 __skb_queue_purge(&sk->sk_receive_queue);
2598 if (sk->sk_rx_skb_cache) {
2599 __kfree_skb(sk->sk_rx_skb_cache);
2600 sk->sk_rx_skb_cache = NULL;
2601 }
2602 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
2603 tp->urg_data = 0;
2604 tcp_write_queue_purge(sk);
2605 tcp_fastopen_active_disable_ofo_check(sk);
2606 skb_rbtree_purge(&tp->out_of_order_queue);
2607
2608 inet->inet_dport = 0;
2609
2610 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2611 inet_reset_saddr(sk);
2612
2613 sk->sk_shutdown = 0;
2614 sock_reset_flag(sk, SOCK_DONE);
2615 tp->srtt_us = 0;
2616 tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
2617 tp->rcv_rtt_last_tsecr = 0;
2618
2619 seq = tp->write_seq + tp->max_window + 2;
2620 if (!seq)
2621 seq = 1;
2622 WRITE_ONCE(tp->write_seq, seq);
2623
2624 icsk->icsk_backoff = 0;
2625 tp->snd_cwnd = 2;
2626 icsk->icsk_probes_out = 0;
2627 icsk->icsk_rto = TCP_TIMEOUT_INIT;
2628 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
2629 tp->snd_cwnd = TCP_INIT_CWND;
2630 tp->snd_cwnd_cnt = 0;
2631 tp->window_clamp = 0;
2632 tp->delivered = 0;
2633 tp->delivered_ce = 0;
2634 tcp_set_ca_state(sk, TCP_CA_Open);
2635 tp->is_sack_reneg = 0;
2636 tcp_clear_retrans(tp);
2637 tp->total_retrans = 0;
2638 inet_csk_delack_init(sk);
2639 /* Initialize rcv_mss to TCP_MIN_MSS to avoid a division by zero
2640  * in __tcp_select_window()
2641  */
2642 icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
2643 memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2644 __sk_dst_reset(sk);
2645 dst_release(sk->sk_rx_dst);
2646 sk->sk_rx_dst = NULL;
2647 tcp_saved_syn_free(tp);
2648 tp->compressed_ack = 0;
2649 tp->segs_in = 0;
2650 tp->segs_out = 0;
2651 tp->bytes_sent = 0;
2652 tp->bytes_acked = 0;
2653 tp->bytes_received = 0;
2654 tp->bytes_retrans = 0;
2655 tp->data_segs_in = 0;
2656 tp->data_segs_out = 0;
2657 tp->duplicate_sack[0].start_seq = 0;
2658 tp->duplicate_sack[0].end_seq = 0;
2659 tp->dsack_dups = 0;
2660 tp->reord_seen = 0;
2661 tp->retrans_out = 0;
2662 tp->sacked_out = 0;
2663 tp->tlp_high_seq = 0;
2664 tp->last_oow_ack_time = 0;
2665
2666 tp->app_limited = ~0U;
2667 tp->rack.mstamp = 0;
2668 tp->rack.advanced = 0;
2669 tp->rack.reo_wnd_steps = 1;
2670 tp->rack.last_delivered = 0;
2671 tp->rack.reo_wnd_persist = 0;
2672 tp->rack.dsack_seen = 0;
2673 tp->syn_data_acked = 0;
2674 tp->rx_opt.saw_tstamp = 0;
2675 tp->rx_opt.dsack = 0;
2676 tp->rx_opt.num_sacks = 0;
2677 tp->rcv_ooopack = 0;
2678
2679
2680 /* Clean up fastopen related fields */
2681 tcp_free_fastopen_req(tp);
2682 inet->defer_connect = 0;
2683
2684 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
2685
2686 if (sk->sk_frag.page) {
2687 put_page(sk->sk_frag.page);
2688 sk->sk_frag.page = NULL;
2689 sk->sk_frag.offset = 0;
2690 }
2691
2692 sk->sk_error_report(sk);
2693 return 0;
2694 }
2695 EXPORT_SYMBOL(tcp_disconnect);
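/* Usage sketch (illustrative, not part of this source): tcp_disconnect() is
 * what runs when an application "disconnects" a connected TCP socket by
 * calling connect() with an AF_UNSPEC address, so the descriptor can be
 * reused for a new bind()/connect(). A minimal userspace sketch, assuming a
 * connected SOCK_STREAM descriptor fd:
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int tcp_dissolve(int fd)
 *	{
 *		struct sockaddr sa;
 *
 *		memset(&sa, 0, sizeof(sa));
 *		sa.sa_family = AF_UNSPEC;	// triggers sk->sk_prot->disconnect()
 *		return connect(fd, &sa, sizeof(sa));
 *	}
 */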
2696
2697 static inline bool tcp_can_repair_sock(const struct sock *sk)
2698 {
2699 return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
2700 (sk->sk_state != TCP_LISTEN);
2701 }
2702
2703 static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len)
2704 {
2705 struct tcp_repair_window opt;
2706
2707 if (!tp->repair)
2708 return -EPERM;
2709
2710 if (len != sizeof(opt))
2711 return -EINVAL;
2712
2713 if (copy_from_user(&opt, optbuf, sizeof(opt)))
2714 return -EFAULT;
2715
2716 if (opt.max_window < opt.snd_wnd)
2717 return -EINVAL;
2718
2719 if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
2720 return -EINVAL;
2721
2722 if (after(opt.rcv_wup, tp->rcv_nxt))
2723 return -EINVAL;
2724
2725 tp->snd_wl1 = opt.snd_wl1;
2726 tp->snd_wnd = opt.snd_wnd;
2727 tp->max_window = opt.max_window;
2728
2729 tp->rcv_wnd = opt.rcv_wnd;
2730 tp->rcv_wup = opt.rcv_wup;
2731
2732 return 0;
2733 }
2734
2735 static int tcp_repair_options_est(struct sock *sk,
2736 struct tcp_repair_opt __user *optbuf, unsigned int len)
2737 {
2738 struct tcp_sock *tp = tcp_sk(sk);
2739 struct tcp_repair_opt opt;
2740
2741 while (len >= sizeof(opt)) {
2742 if (copy_from_user(&opt, optbuf, sizeof(opt)))
2743 return -EFAULT;
2744
2745 optbuf++;
2746 len -= sizeof(opt);
2747
2748 switch (opt.opt_code) {
2749 case TCPOPT_MSS:
2750 tp->rx_opt.mss_clamp = opt.opt_val;
2751 tcp_mtup_init(sk);
2752 break;
2753 case TCPOPT_WINDOW:
2754 {
2755 u16 snd_wscale = opt.opt_val & 0xFFFF;
2756 u16 rcv_wscale = opt.opt_val >> 16;
2757
2758 if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE)
2759 return -EFBIG;
2760
2761 tp->rx_opt.snd_wscale = snd_wscale;
2762 tp->rx_opt.rcv_wscale = rcv_wscale;
2763 tp->rx_opt.wscale_ok = 1;
2764 }
2765 break;
2766 case TCPOPT_SACK_PERM:
2767 if (opt.opt_val != 0)
2768 return -EINVAL;
2769
2770 tp->rx_opt.sack_ok |= TCP_SACK_SEEN;
2771 break;
2772 case TCPOPT_TIMESTAMP:
2773 if (opt.opt_val != 0)
2774 return -EINVAL;
2775
2776 tp->rx_opt.tstamp_ok = 1;
2777 break;
2778 }
2779 }
2780
2781 return 0;
2782 }
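/* Usage sketch (illustrative, not kernel code): the TCP_REPAIR machinery above
 * is driven from userspace by checkpoint/restore tools such as CRIU. A restore
 * roughly follows the ordering the code enforces (TCP_QUEUE_SEQ only while the
 * socket is CLOSE, TCP_REPAIR_OPTIONS only while ESTABLISHED); snd_seq, rcv_seq,
 * opts and peer below are placeholders for previously saved state, and error
 * handling is omitted:
 *
 *	int on = TCP_REPAIR_ON, off = TCP_REPAIR_OFF, q;
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_REPAIR, &on, sizeof(on));
 *	q = TCP_SEND_QUEUE;
 *	setsockopt(fd, IPPROTO_TCP, TCP_REPAIR_QUEUE, &q, sizeof(q));
 *	setsockopt(fd, IPPROTO_TCP, TCP_QUEUE_SEQ, &snd_seq, sizeof(snd_seq));
 *	q = TCP_RECV_QUEUE;
 *	setsockopt(fd, IPPROTO_TCP, TCP_REPAIR_QUEUE, &q, sizeof(q));
 *	setsockopt(fd, IPPROTO_TCP, TCP_QUEUE_SEQ, &rcv_seq, sizeof(rcv_seq));
 *	connect(fd, (struct sockaddr *)&peer, sizeof(peer)); // no SYN in repair mode
 *	setsockopt(fd, IPPROTO_TCP, TCP_REPAIR_OPTIONS, opts, sizeof(opts));
 *	setsockopt(fd, IPPROTO_TCP, TCP_REPAIR, &off, sizeof(off));
 */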
2783
2784 DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
2785 EXPORT_SYMBOL(tcp_tx_delay_enabled);
2786
2787 static void tcp_enable_tx_delay(void)
2788 {
2789 if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
2790 static int __tcp_tx_delay_enabled = 0;
2791
2792 if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
2793 static_branch_enable(&tcp_tx_delay_enabled);
2794 pr_info("TCP_TX_DELAY enabled\n");
2795 }
2796 }
2797 }
2798
2799 /*
2800  *	Socket option code for TCP.
2801  */
2802 static int do_tcp_setsockopt(struct sock *sk, int level,
2803 int optname, char __user *optval, unsigned int optlen)
2804 {
2805 struct tcp_sock *tp = tcp_sk(sk);
2806 struct inet_connection_sock *icsk = inet_csk(sk);
2807 struct net *net = sock_net(sk);
2808 int val;
2809 int err = 0;
2810
2811 /* These are data/string values, all the others are ints */
2812 switch (optname) {
2813 case TCP_CONGESTION: {
2814 char name[TCP_CA_NAME_MAX];
2815
2816 if (optlen < 1)
2817 return -EINVAL;
2818
2819 val = strncpy_from_user(name, optval,
2820 min_t(long, TCP_CA_NAME_MAX-1, optlen));
2821 if (val < 0)
2822 return -EFAULT;
2823 name[val] = 0;
2824
2825 lock_sock(sk);
2826 err = tcp_set_congestion_control(sk, name, true, true,
2827 ns_capable(sock_net(sk)->user_ns,
2828 CAP_NET_ADMIN));
2829 release_sock(sk);
2830 return err;
2831 }
2832 case TCP_ULP: {
2833 char name[TCP_ULP_NAME_MAX];
2834
2835 if (optlen < 1)
2836 return -EINVAL;
2837
2838 val = strncpy_from_user(name, optval,
2839 min_t(long, TCP_ULP_NAME_MAX - 1,
2840 optlen));
2841 if (val < 0)
2842 return -EFAULT;
2843 name[val] = 0;
2844
2845 lock_sock(sk);
2846 err = tcp_set_ulp(sk, name);
2847 release_sock(sk);
2848 return err;
2849 }
2850 case TCP_FASTOPEN_KEY: {
2851 __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
2852 __u8 *backup_key = NULL;
2853
2854 /* Allow a backup key as well to facilitate key rotation:
2855  * the first key is the active one, the optional second key
2856  * is the backup one. */
2857 if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
2858 optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
2859 return -EINVAL;
2860
2861 if (copy_from_user(key, optval, optlen))
2862 return -EFAULT;
2863
2864 if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
2865 backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
2866
2867 return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
2868 }
2869 default:
2870 /* fall through to the integer-valued options handled below */
2871 break;
2872 }
2873
2874 if (optlen < sizeof(int))
2875 return -EINVAL;
2876
2877 if (get_user(val, (int __user *)optval))
2878 return -EFAULT;
2879
2880 lock_sock(sk);
2881
2882 switch (optname) {
2883 case TCP_MAXSEG:
2884 /* Values greater than the interface MTU won't take effect. However,
2885  * at the point when this call is done we typically don't yet know
2886  * which interface is going to be used.
2887  */
2888 if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
2889 err = -EINVAL;
2890 break;
2891 }
2892 tp->rx_opt.user_mss = val;
2893 break;
2894
2895 case TCP_NODELAY:
2896 if (val) {
2897 /* TCP_NODELAY is weaker than TCP_CORK, so that
2898  * this option on a corked socket is remembered, but
2899  * it is not activated until the cork is cleared.
2900  *
2901  * However, when TCP_NODELAY is set we make
2902  * an explicit push, which overrides even TCP_CORK
2903  * for currently queued segments.
2904  */
2905 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2906 tcp_push_pending_frames(sk);
2907 } else {
2908 tp->nonagle &= ~TCP_NAGLE_OFF;
2909 }
2910 break;
2911
2912 case TCP_THIN_LINEAR_TIMEOUTS:
2913 if (val < 0 || val > 1)
2914 err = -EINVAL;
2915 else
2916 tp->thin_lto = val;
2917 break;
2918
2919 case TCP_THIN_DUPACK:
2920 if (val < 0 || val > 1)
2921 err = -EINVAL;
2922 break;
2923
2924 case TCP_REPAIR:
2925 if (!tcp_can_repair_sock(sk))
2926 err = -EPERM;
2927 else if (val == TCP_REPAIR_ON) {
2928 tp->repair = 1;
2929 sk->sk_reuse = SK_FORCE_REUSE;
2930 tp->repair_queue = TCP_NO_QUEUE;
2931 } else if (val == TCP_REPAIR_OFF) {
2932 tp->repair = 0;
2933 sk->sk_reuse = SK_NO_REUSE;
2934 tcp_send_window_probe(sk);
2935 } else if (val == TCP_REPAIR_OFF_NO_WP) {
2936 tp->repair = 0;
2937 sk->sk_reuse = SK_NO_REUSE;
2938 } else
2939 err = -EINVAL;
2940
2941 break;
2942
2943 case TCP_REPAIR_QUEUE:
2944 if (!tp->repair)
2945 err = -EPERM;
2946 else if ((unsigned int)val < TCP_QUEUES_NR)
2947 tp->repair_queue = val;
2948 else
2949 err = -EINVAL;
2950 break;
2951
2952 case TCP_QUEUE_SEQ:
2953 if (sk->sk_state != TCP_CLOSE)
2954 err = -EPERM;
2955 else if (tp->repair_queue == TCP_SEND_QUEUE)
2956 WRITE_ONCE(tp->write_seq, val);
2957 else if (tp->repair_queue == TCP_RECV_QUEUE) {
2958 WRITE_ONCE(tp->rcv_nxt, val);
2959 WRITE_ONCE(tp->copied_seq, val);
2960 }
2961 else
2962 err = -EINVAL;
2963 break;
2964
2965 case TCP_REPAIR_OPTIONS:
2966 if (!tp->repair)
2967 err = -EINVAL;
2968 else if (sk->sk_state == TCP_ESTABLISHED)
2969 err = tcp_repair_options_est(sk,
2970 (struct tcp_repair_opt __user *)optval,
2971 optlen);
2972 else
2973 err = -EPERM;
2974 break;
2975
2976 case TCP_CORK:
2977 /* When set, indicates to always queue non-full frames.
2978  * Later the user clears this option and we transmit
2979  * any pending partial frames in the queue.  This is
2980  * meant to be used alongside sendfile() to get properly
2981  * filled frames when the user (for example) must write
2982  * out headers with a write() call first and then use
2983  * sendfile to send out the data parts.
2984  *
2985  * TCP_CORK can be set together with TCP_NODELAY and it is
2986  * stronger than TCP_NODELAY.
2987  */
2988 if (val) {
2989 tp->nonagle |= TCP_NAGLE_CORK;
2990 } else {
2991 tp->nonagle &= ~TCP_NAGLE_CORK;
2992 if (tp->nonagle&TCP_NAGLE_OFF)
2993 tp->nonagle |= TCP_NAGLE_PUSH;
2994 tcp_push_pending_frames(sk);
2995 }
2996 break;
2997
2998 case TCP_KEEPIDLE:
2999 if (val < 1 || val > MAX_TCP_KEEPIDLE)
3000 err = -EINVAL;
3001 else {
3002 tp->keepalive_time = val * HZ;
3003 if (sock_flag(sk, SOCK_KEEPOPEN) &&
3004 !((1 << sk->sk_state) &
3005 (TCPF_CLOSE | TCPF_LISTEN))) {
3006 u32 elapsed = keepalive_time_elapsed(tp);
3007 if (tp->keepalive_time > elapsed)
3008 elapsed = tp->keepalive_time - elapsed;
3009 else
3010 elapsed = 0;
3011 inet_csk_reset_keepalive_timer(sk, elapsed);
3012 }
3013 }
3014 break;
3015 case TCP_KEEPINTVL:
3016 if (val < 1 || val > MAX_TCP_KEEPINTVL)
3017 err = -EINVAL;
3018 else
3019 tp->keepalive_intvl = val * HZ;
3020 break;
3021 case TCP_KEEPCNT:
3022 if (val < 1 || val > MAX_TCP_KEEPCNT)
3023 err = -EINVAL;
3024 else
3025 tp->keepalive_probes = val;
3026 break;
3027 case TCP_SYNCNT:
3028 if (val < 1 || val > MAX_TCP_SYNCNT)
3029 err = -EINVAL;
3030 else
3031 icsk->icsk_syn_retries = val;
3032 break;
3033
3034 case TCP_SAVE_SYN:
3035 if (val < 0 || val > 1)
3036 err = -EINVAL;
3037 else
3038 tp->save_syn = val;
3039 break;
3040
3041 case TCP_LINGER2:
3042 if (val < 0)
3043 tp->linger2 = -1;
3044 else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ)
3045 tp->linger2 = 0;
3046 else
3047 tp->linger2 = val * HZ;
3048 break;
3049
3050 case TCP_DEFER_ACCEPT:
3051 /* Translate value in seconds into number of retransmits */
3052 icsk->icsk_accept_queue.rskq_defer_accept =
3053 secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
3054 TCP_RTO_MAX / HZ);
3055 break;
3056
3057 case TCP_WINDOW_CLAMP:
3058 if (!val) {
3059 if (sk->sk_state != TCP_CLOSE) {
3060 err = -EINVAL;
3061 break;
3062 }
3063 tp->window_clamp = 0;
3064 } else
3065 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
3066 SOCK_MIN_RCVBUF / 2 : val;
3067 break;
3068
3069 case TCP_QUICKACK:
3070 if (!val) {
3071 inet_csk_enter_pingpong_mode(sk);
3072 } else {
3073 inet_csk_exit_pingpong_mode(sk);
3074 if ((1 << sk->sk_state) &
3075 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
3076 inet_csk_ack_scheduled(sk)) {
3077 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
3078 tcp_cleanup_rbuf(sk, 1);
3079 if (!(val & 1))
3080 inet_csk_enter_pingpong_mode(sk);
3081 }
3082 }
3083 break;
3084
3085 #ifdef CONFIG_TCP_MD5SIG
3086 case TCP_MD5SIG:
3087 case TCP_MD5SIG_EXT:
3088 if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
3089 err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
3090 else
3091 err = -EINVAL;
3092 break;
3093 #endif
3094 case TCP_USER_TIMEOUT:
3095 /* Cap the max time in ms TCP will retry or probe the window
3096  * before giving up and aborting (ETIMEDOUT) a connection.
3097  */
3098 if (val < 0)
3099 err = -EINVAL;
3100 else
3101 icsk->icsk_user_timeout = val;
3102 break;
3103
3104 case TCP_FASTOPEN:
3105 if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
3106 TCPF_LISTEN))) {
3107 tcp_fastopen_init_key_once(net);
3108
3109 fastopen_queue_tune(sk, val);
3110 } else {
3111 err = -EINVAL;
3112 }
3113 break;
3114 case TCP_FASTOPEN_CONNECT:
3115 if (val > 1 || val < 0) {
3116 err = -EINVAL;
3117 } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
3118 if (sk->sk_state == TCP_CLOSE)
3119 tp->fastopen_connect = val;
3120 else
3121 err = -EINVAL;
3122 } else {
3123 err = -EOPNOTSUPP;
3124 }
3125 break;
3126 case TCP_FASTOPEN_NO_COOKIE:
3127 if (val > 1 || val < 0)
3128 err = -EINVAL;
3129 else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3130 err = -EINVAL;
3131 else
3132 tp->fastopen_no_cookie = val;
3133 break;
3134 case TCP_TIMESTAMP:
3135 if (!tp->repair)
3136 err = -EPERM;
3137 else
3138 tp->tsoffset = val - tcp_time_stamp_raw();
3139 break;
3140 case TCP_REPAIR_WINDOW:
3141 err = tcp_repair_set_window(tp, optval, optlen);
3142 break;
3143 case TCP_NOTSENT_LOWAT:
3144 tp->notsent_lowat = val;
3145 sk->sk_write_space(sk);
3146 break;
3147 case TCP_INQ:
3148 if (val > 1 || val < 0)
3149 err = -EINVAL;
3150 else
3151 tp->recvmsg_inq = val;
3152 break;
3153 case TCP_TX_DELAY:
3154 if (val)
3155 tcp_enable_tx_delay();
3156 tp->tcp_tx_delay = val;
3157 break;
3158 default:
3159 err = -ENOPROTOOPT;
3160 break;
3161 }
3162
3163 release_sock(sk);
3164 return err;
3165 }
3166
3167 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
3168 unsigned int optlen)
3169 {
3170 const struct inet_connection_sock *icsk = inet_csk(sk);
3171
3172 if (level != SOL_TCP)
3173 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
3174 optval, optlen);
3175 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
3176 }
3177 EXPORT_SYMBOL(tcp_setsockopt);
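/* Usage sketch (illustrative, not kernel code): the options handled by
 * do_tcp_setsockopt() above are set from userspace with setsockopt() at the
 * IPPROTO_TCP level, e.g. disabling Nagle, picking a congestion control
 * module and tuning keepalive:
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	int one = 1, idle = 60, intvl = 10, cnt = 5;
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
 *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "cubic", strlen("cubic"));
 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &one, sizeof(one));
 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
 */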
3178
3179 #ifdef CONFIG_COMPAT
3180 int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
3181 char __user *optval, unsigned int optlen)
3182 {
3183 if (level != SOL_TCP)
3184 return inet_csk_compat_setsockopt(sk, level, optname,
3185 optval, optlen);
3186 return do_tcp_setsockopt(sk, level, optname, optval, optlen);
3187 }
3188 EXPORT_SYMBOL(compat_tcp_setsockopt);
3189 #endif
3190
3191 static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
3192 struct tcp_info *info)
3193 {
3194 u64 stats[__TCP_CHRONO_MAX], total = 0;
3195 enum tcp_chrono i;
3196
3197 for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
3198 stats[i] = tp->chrono_stat[i - 1];
3199 if (i == tp->chrono_type)
3200 stats[i] += tcp_jiffies32 - tp->chrono_start;
3201 stats[i] *= USEC_PER_SEC / HZ;
3202 total += stats[i];
3203 }
3204
3205 info->tcpi_busy_time = total;
3206 info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
3207 info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
3208 }
3209
3210 /* Return information about the state of a tcp endpoint in API format. */
3211 void tcp_get_info(struct sock *sk, struct tcp_info *info)
3212 {
3213 const struct tcp_sock *tp = tcp_sk(sk);
3214 const struct inet_connection_sock *icsk = inet_csk(sk);
3215 unsigned long rate;
3216 u32 now;
3217 u64 rate64;
3218 bool slow;
3219
3220 memset(info, 0, sizeof(*info));
3221 if (sk->sk_type != SOCK_STREAM)
3222 return;
3223
3224 info->tcpi_state = inet_sk_state_load(sk);
3225
3226 /* Report meaningful fields for all TCP states, including listeners */
3227 rate = READ_ONCE(sk->sk_pacing_rate);
3228 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3229 info->tcpi_pacing_rate = rate64;
3230
3231 rate = READ_ONCE(sk->sk_max_pacing_rate);
3232 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3233 info->tcpi_max_pacing_rate = rate64;
3234
3235 info->tcpi_reordering = tp->reordering;
3236 info->tcpi_snd_cwnd = tp->snd_cwnd;
3237
3238 if (info->tcpi_state == TCP_LISTEN) {
3239 /* listeners aliased fields :
3240  * tcpi_unacked -> Number of children ready for accept()
3241  * tcpi_sacked  -> max backlog
3242  */
3243 info->tcpi_unacked = sk->sk_ack_backlog;
3244 info->tcpi_sacked = sk->sk_max_ack_backlog;
3245 return;
3246 }
3247
3248 slow = lock_sock_fast(sk);
3249
3250 info->tcpi_ca_state = icsk->icsk_ca_state;
3251 info->tcpi_retransmits = icsk->icsk_retransmits;
3252 info->tcpi_probes = icsk->icsk_probes_out;
3253 info->tcpi_backoff = icsk->icsk_backoff;
3254
3255 if (tp->rx_opt.tstamp_ok)
3256 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
3257 if (tcp_is_sack(tp))
3258 info->tcpi_options |= TCPI_OPT_SACK;
3259 if (tp->rx_opt.wscale_ok) {
3260 info->tcpi_options |= TCPI_OPT_WSCALE;
3261 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
3262 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
3263 }
3264
3265 if (tp->ecn_flags & TCP_ECN_OK)
3266 info->tcpi_options |= TCPI_OPT_ECN;
3267 if (tp->ecn_flags & TCP_ECN_SEEN)
3268 info->tcpi_options |= TCPI_OPT_ECN_SEEN;
3269 if (tp->syn_data_acked)
3270 info->tcpi_options |= TCPI_OPT_SYN_DATA;
3271
3272 info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
3273 info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
3274 info->tcpi_snd_mss = tp->mss_cache;
3275 info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
3276
3277 info->tcpi_unacked = tp->packets_out;
3278 info->tcpi_sacked = tp->sacked_out;
3279
3280 info->tcpi_lost = tp->lost_out;
3281 info->tcpi_retrans = tp->retrans_out;
3282
3283 now = tcp_jiffies32;
3284 info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
3285 info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
3286 info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
3287
3288 info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
3289 info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
3290 info->tcpi_rtt = tp->srtt_us >> 3;
3291 info->tcpi_rttvar = tp->mdev_us >> 2;
3292 info->tcpi_snd_ssthresh = tp->snd_ssthresh;
3293 info->tcpi_advmss = tp->advmss;
3294
3295 info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3;
3296 info->tcpi_rcv_space = tp->rcvq_space.space;
3297
3298 info->tcpi_total_retrans = tp->total_retrans;
3299
3300 info->tcpi_bytes_acked = tp->bytes_acked;
3301 info->tcpi_bytes_received = tp->bytes_received;
3302 info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
3303 tcp_get_info_chrono_stats(tp, info);
3304
3305 info->tcpi_segs_out = tp->segs_out;
3306 info->tcpi_segs_in = tp->segs_in;
3307
3308 info->tcpi_min_rtt = tcp_min_rtt(tp);
3309 info->tcpi_data_segs_in = tp->data_segs_in;
3310 info->tcpi_data_segs_out = tp->data_segs_out;
3311
3312 info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
3313 rate64 = tcp_compute_delivery_rate(tp);
3314 if (rate64)
3315 info->tcpi_delivery_rate = rate64;
3316 info->tcpi_delivered = tp->delivered;
3317 info->tcpi_delivered_ce = tp->delivered_ce;
3318 info->tcpi_bytes_sent = tp->bytes_sent;
3319 info->tcpi_bytes_retrans = tp->bytes_retrans;
3320 info->tcpi_dsack_dups = tp->dsack_dups;
3321 info->tcpi_reord_seen = tp->reord_seen;
3322 info->tcpi_rcv_ooopack = tp->rcv_ooopack;
3323 info->tcpi_snd_wnd = tp->snd_wnd;
3324 unlock_sock_fast(sk, slow);
3325 }
3326 EXPORT_SYMBOL_GPL(tcp_get_info);
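/* Usage sketch (illustrative, not kernel code): tcp_get_info() backs the
 * TCP_INFO getsockopt() (and the inet_diag interface used by "ss -i").
 * From userspace:
 *
 *	#include <netinet/tcp.h>
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *		printf("rtt %uus cwnd %u retrans %u\n",
 *		       ti.tcpi_rtt, ti.tcpi_snd_cwnd, ti.tcpi_total_retrans);
 *
 * The kernel copies at most len bytes back, which is how older binaries keep
 * working as new fields are appended to struct tcp_info.
 */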
3327
3328 static size_t tcp_opt_stats_get_size(void)
3329 {
3330 return
3331 nla_total_size_64bit(sizeof(u64)) +	/* TCP_NLA_BUSY */
3332 nla_total_size_64bit(sizeof(u64)) +	/* TCP_NLA_RWND_LIMITED */
3333 nla_total_size_64bit(sizeof(u64)) +	/* TCP_NLA_SNDBUF_LIMITED */
3334 nla_total_size_64bit(sizeof(u64)) +	/* TCP_NLA_DATA_SEGS_OUT */
3335 nla_total_size_64bit(sizeof(u64)) +	/* TCP_NLA_TOTAL_RETRANS */
3336 nla_total_size_64bit(sizeof(u64)) +	/* TCP_NLA_PACING_RATE */
3337 nla_total_size_64bit(sizeof(u64)) +	/* TCP_NLA_DELIVERY_RATE */
3338 nla_total_size(sizeof(u32)) +	/* TCP_NLA_SND_CWND */
3339 nla_total_size(sizeof(u32)) +	/* TCP_NLA_REORDERING */
3340 nla_total_size(sizeof(u32)) +	/* TCP_NLA_MIN_RTT */
3341 nla_total_size(sizeof(u8)) +	/* TCP_NLA_RECUR_RETRANS */
3342 nla_total_size(sizeof(u8)) +	/* TCP_NLA_DELIVERY_RATE_APP_LMT */
3343 nla_total_size(sizeof(u32)) +	/* TCP_NLA_SNDQ_SIZE */
3344 nla_total_size(sizeof(u8)) +	/* TCP_NLA_CA_STATE */
3345 nla_total_size(sizeof(u32)) +	/* TCP_NLA_SND_SSTHRESH */
3346 nla_total_size(sizeof(u32)) +	/* TCP_NLA_DELIVERED */
3347 nla_total_size(sizeof(u32)) +	/* TCP_NLA_DELIVERED_CE */
3348 nla_total_size_64bit(sizeof(u64)) +	/* TCP_NLA_BYTES_SENT */
3349 nla_total_size_64bit(sizeof(u64)) +	/* TCP_NLA_BYTES_RETRANS */
3350 nla_total_size(sizeof(u32)) +	/* TCP_NLA_DSACK_DUPS */
3351 nla_total_size(sizeof(u32)) +	/* TCP_NLA_REORD_SEEN */
3352 nla_total_size(sizeof(u32)) +	/* TCP_NLA_SRTT */
3353 0;
3354 }
3355
3356 struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
3357 {
3358 const struct tcp_sock *tp = tcp_sk(sk);
3359 struct sk_buff *stats;
3360 struct tcp_info info;
3361 unsigned long rate;
3362 u64 rate64;
3363
3364 stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
3365 if (!stats)
3366 return NULL;
3367
3368 tcp_get_info_chrono_stats(tp, &info);
3369 nla_put_u64_64bit(stats, TCP_NLA_BUSY,
3370 info.tcpi_busy_time, TCP_NLA_PAD);
3371 nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
3372 info.tcpi_rwnd_limited, TCP_NLA_PAD);
3373 nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
3374 info.tcpi_sndbuf_limited, TCP_NLA_PAD);
3375 nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT,
3376 tp->data_segs_out, TCP_NLA_PAD);
3377 nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS,
3378 tp->total_retrans, TCP_NLA_PAD);
3379
3380 rate = READ_ONCE(sk->sk_pacing_rate);
3381 rate64 = (rate != ~0UL) ? rate : ~0ULL;
3382 nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
3383
3384 rate64 = tcp_compute_delivery_rate(tp);
3385 nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
3386
3387 nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
3388 nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
3389 nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
3390
3391 nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
3392 nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
3393 nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
3394 nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
3395 nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
3396
3397 nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
3398 nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
3399
3400 nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
3401 TCP_NLA_PAD);
3402 nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
3403 TCP_NLA_PAD);
3404 nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
3405 nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
3406 nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
3407
3408 return stats;
3409 }
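/* Note (illustrative, not kernel code): the skb built above is attached to
 * transmit timestamps when SOF_TIMESTAMPING_OPT_STATS is enabled together with
 * TX timestamping. Userspace reads the socket error queue and receives the
 * statistics as a SCM_TIMESTAMPING_OPT_STATS control message whose payload is
 * the list of TCP_NLA_* netlink attributes emitted above, roughly:
 *
 *	recvmsg(fd, &msg, MSG_ERRQUEUE);
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *		if (cmsg->cmsg_level == SOL_SOCKET &&
 *		    cmsg->cmsg_type == SCM_TIMESTAMPING_OPT_STATS)
 *			parse_nlattrs(CMSG_DATA(cmsg),
 *				      cmsg->cmsg_len - CMSG_LEN(0));
 *
 * parse_nlattrs() is a placeholder for whatever nlattr walker the application
 * uses (libnl's nla_for_each_attr() or a hand-rolled loop); msg setup is
 * omitted here for brevity.
 */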
3410
3411 static int do_tcp_getsockopt(struct sock *sk, int level,
3412 int optname, char __user *optval, int __user *optlen)
3413 {
3414 struct inet_connection_sock *icsk = inet_csk(sk);
3415 struct tcp_sock *tp = tcp_sk(sk);
3416 struct net *net = sock_net(sk);
3417 int val, len;
3418
3419 if (get_user(len, optlen))
3420 return -EFAULT;
3421
3422 len = min_t(unsigned int, len, sizeof(int));
3423
3424 if (len < 0)
3425 return -EINVAL;
3426
3427 switch (optname) {
3428 case TCP_MAXSEG:
3429 val = tp->mss_cache;
3430 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
3431 val = tp->rx_opt.user_mss;
3432 if (tp->repair)
3433 val = tp->rx_opt.mss_clamp;
3434 break;
3435 case TCP_NODELAY:
3436 val = !!(tp->nonagle&TCP_NAGLE_OFF);
3437 break;
3438 case TCP_CORK:
3439 val = !!(tp->nonagle&TCP_NAGLE_CORK);
3440 break;
3441 case TCP_KEEPIDLE:
3442 val = keepalive_time_when(tp) / HZ;
3443 break;
3444 case TCP_KEEPINTVL:
3445 val = keepalive_intvl_when(tp) / HZ;
3446 break;
3447 case TCP_KEEPCNT:
3448 val = keepalive_probes(tp);
3449 break;
3450 case TCP_SYNCNT:
3451 val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
3452 break;
3453 case TCP_LINGER2:
3454 val = tp->linger2;
3455 if (val >= 0)
3456 val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
3457 break;
3458 case TCP_DEFER_ACCEPT:
3459 val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
3460 TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
3461 break;
3462 case TCP_WINDOW_CLAMP:
3463 val = tp->window_clamp;
3464 break;
3465 case TCP_INFO: {
3466 struct tcp_info info;
3467
3468 if (get_user(len, optlen))
3469 return -EFAULT;
3470
3471 tcp_get_info(sk, &info);
3472
3473 len = min_t(unsigned int, len, sizeof(info));
3474 if (put_user(len, optlen))
3475 return -EFAULT;
3476 if (copy_to_user(optval, &info, len))
3477 return -EFAULT;
3478 return 0;
3479 }
3480 case TCP_CC_INFO: {
3481 const struct tcp_congestion_ops *ca_ops;
3482 union tcp_cc_info info;
3483 size_t sz = 0;
3484 int attr;
3485
3486 if (get_user(len, optlen))
3487 return -EFAULT;
3488
3489 ca_ops = icsk->icsk_ca_ops;
3490 if (ca_ops && ca_ops->get_info)
3491 sz = ca_ops->get_info(sk, ~0U, &attr, &info);
3492
3493 len = min_t(unsigned int, len, sz);
3494 if (put_user(len, optlen))
3495 return -EFAULT;
3496 if (copy_to_user(optval, &info, len))
3497 return -EFAULT;
3498 return 0;
3499 }
3500 case TCP_QUICKACK:
3501 val = !inet_csk_in_pingpong_mode(sk);
3502 break;
3503
3504 case TCP_CONGESTION:
3505 if (get_user(len, optlen))
3506 return -EFAULT;
3507 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
3508 if (put_user(len, optlen))
3509 return -EFAULT;
3510 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
3511 return -EFAULT;
3512 return 0;
3513
3514 case TCP_ULP:
3515 if (get_user(len, optlen))
3516 return -EFAULT;
3517 len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
3518 if (!icsk->icsk_ulp_ops) {
3519 if (put_user(0, optlen))
3520 return -EFAULT;
3521 return 0;
3522 }
3523 if (put_user(len, optlen))
3524 return -EFAULT;
3525 if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len))
3526 return -EFAULT;
3527 return 0;
3528
3529 case TCP_FASTOPEN_KEY: {
3530 __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
3531 struct tcp_fastopen_context *ctx;
3532 unsigned int key_len = 0;
3533
3534 if (get_user(len, optlen))
3535 return -EFAULT;
3536
3537 rcu_read_lock();
3538 ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx);
3539 if (ctx) {
3540 key_len = tcp_fastopen_context_len(ctx) *
3541 TCP_FASTOPEN_KEY_LENGTH;
3542 memcpy(&key[0], &ctx->key[0], key_len);
3543 }
3544 rcu_read_unlock();
3545
3546 len = min_t(unsigned int, len, key_len);
3547 if (put_user(len, optlen))
3548 return -EFAULT;
3549 if (copy_to_user(optval, key, len))
3550 return -EFAULT;
3551 return 0;
3552 }
3553 case TCP_THIN_LINEAR_TIMEOUTS:
3554 val = tp->thin_lto;
3555 break;
3556
3557 case TCP_THIN_DUPACK:
3558 val = 0;
3559 break;
3560
3561 case TCP_REPAIR:
3562 val = tp->repair;
3563 break;
3564
3565 case TCP_REPAIR_QUEUE:
3566 if (tp->repair)
3567 val = tp->repair_queue;
3568 else
3569 return -EINVAL;
3570 break;
3571
3572 case TCP_REPAIR_WINDOW: {
3573 struct tcp_repair_window opt;
3574
3575 if (get_user(len, optlen))
3576 return -EFAULT;
3577
3578 if (len != sizeof(opt))
3579 return -EINVAL;
3580
3581 if (!tp->repair)
3582 return -EPERM;
3583
3584 opt.snd_wl1 = tp->snd_wl1;
3585 opt.snd_wnd = tp->snd_wnd;
3586 opt.max_window = tp->max_window;
3587 opt.rcv_wnd = tp->rcv_wnd;
3588 opt.rcv_wup = tp->rcv_wup;
3589
3590 if (copy_to_user(optval, &opt, len))
3591 return -EFAULT;
3592 return 0;
3593 }
3594 case TCP_QUEUE_SEQ:
3595 if (tp->repair_queue == TCP_SEND_QUEUE)
3596 val = tp->write_seq;
3597 else if (tp->repair_queue == TCP_RECV_QUEUE)
3598 val = tp->rcv_nxt;
3599 else
3600 return -EINVAL;
3601 break;
3602
3603 case TCP_USER_TIMEOUT:
3604 val = icsk->icsk_user_timeout;
3605 break;
3606
3607 case TCP_FASTOPEN:
3608 val = icsk->icsk_accept_queue.fastopenq.max_qlen;
3609 break;
3610
3611 case TCP_FASTOPEN_CONNECT:
3612 val = tp->fastopen_connect;
3613 break;
3614
3615 case TCP_FASTOPEN_NO_COOKIE:
3616 val = tp->fastopen_no_cookie;
3617 break;
3618
3619 case TCP_TX_DELAY:
3620 val = tp->tcp_tx_delay;
3621 break;
3622
3623 case TCP_TIMESTAMP:
3624 val = tcp_time_stamp_raw() + tp->tsoffset;
3625 break;
3626 case TCP_NOTSENT_LOWAT:
3627 val = tp->notsent_lowat;
3628 break;
3629 case TCP_INQ:
3630 val = tp->recvmsg_inq;
3631 break;
3632 case TCP_SAVE_SYN:
3633 val = tp->save_syn;
3634 break;
3635 case TCP_SAVED_SYN: {
3636 if (get_user(len, optlen))
3637 return -EFAULT;
3638
3639 lock_sock(sk);
3640 if (tp->saved_syn) {
3641 if (len < tp->saved_syn[0]) {
3642 if (put_user(tp->saved_syn[0], optlen)) {
3643 release_sock(sk);
3644 return -EFAULT;
3645 }
3646 release_sock(sk);
3647 return -EINVAL;
3648 }
3649 len = tp->saved_syn[0];
3650 if (put_user(len, optlen)) {
3651 release_sock(sk);
3652 return -EFAULT;
3653 }
3654 if (copy_to_user(optval, tp->saved_syn + 1, len)) {
3655 release_sock(sk);
3656 return -EFAULT;
3657 }
3658 tcp_saved_syn_free(tp);
3659 release_sock(sk);
3660 } else {
3661 release_sock(sk);
3662 len = 0;
3663 if (put_user(len, optlen))
3664 return -EFAULT;
3665 }
3666 return 0;
3667 }
3668 #ifdef CONFIG_MMU
3669 case TCP_ZEROCOPY_RECEIVE: {
3670 struct tcp_zerocopy_receive zc;
3671 int err;
3672
3673 if (get_user(len, optlen))
3674 return -EFAULT;
3675 if (len != sizeof(zc))
3676 return -EINVAL;
3677 if (copy_from_user(&zc, optval, len))
3678 return -EFAULT;
3679 lock_sock(sk);
3680 err = tcp_zerocopy_receive(sk, &zc);
3681 release_sock(sk);
3682 if (!err && copy_to_user(optval, &zc, len))
3683 err = -EFAULT;
3684 return err;
3685 }
3686 #endif
3687 default:
3688 return -ENOPROTOOPT;
3689 }
3690
3691 if (put_user(len, optlen))
3692 return -EFAULT;
3693 if (copy_to_user(optval, &val, len))
3694 return -EFAULT;
3695 return 0;
3696 }
3697
3698 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
3699 int __user *optlen)
3700 {
3701 struct inet_connection_sock *icsk = inet_csk(sk);
3702
3703 if (level != SOL_TCP)
3704 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
3705 optval, optlen);
3706 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
3707 }
3708 EXPORT_SYMBOL(tcp_getsockopt);
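/* Usage sketch (illustrative, not kernel code): reading options back follows
 * the same pattern, e.g. querying the current congestion control module and
 * the cached MSS handled in do_tcp_getsockopt() above:
 *
 *	char ca[16];			// TCP_CA_NAME_MAX bytes
 *	int mss;
 *	socklen_t calen = sizeof(ca), msslen = sizeof(mss);
 *
 *	getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, ca, &calen);
 *	getsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, &msslen);
 */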
3709
3710 #ifdef CONFIG_COMPAT
3711 int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
3712 char __user *optval, int __user *optlen)
3713 {
3714 if (level != SOL_TCP)
3715 return inet_csk_compat_getsockopt(sk, level, optname,
3716 optval, optlen);
3717 return do_tcp_getsockopt(sk, level, optname, optval, optlen);
3718 }
3719 EXPORT_SYMBOL(compat_tcp_getsockopt);
3720 #endif
3721
3722 #ifdef CONFIG_TCP_MD5SIG
3723 static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
3724 static DEFINE_MUTEX(tcp_md5sig_mutex);
3725 static bool tcp_md5sig_pool_populated = false;
3726
3727 static void __tcp_alloc_md5sig_pool(void)
3728 {
3729 struct crypto_ahash *hash;
3730 int cpu;
3731
3732 hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
3733 if (IS_ERR(hash))
3734 return;
3735
3736 for_each_possible_cpu(cpu) {
3737 void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
3738 struct ahash_request *req;
3739
3740 if (!scratch) {
3741 scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
3742 sizeof(struct tcphdr),
3743 GFP_KERNEL,
3744 cpu_to_node(cpu));
3745 if (!scratch)
3746 return;
3747 per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
3748 }
3749 if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
3750 continue;
3751
3752 req = ahash_request_alloc(hash, GFP_KERNEL);
3753 if (!req)
3754 return;
3755
3756 ahash_request_set_callback(req, 0, NULL, NULL);
3757
3758 per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
3759 }
3760
3761 /* Before setting tcp_md5sig_pool_populated, we must commit all writes
3762  * to memory. See smp_rmb() in tcp_get_md5sig_pool(). */
3763 smp_wmb();
3764 tcp_md5sig_pool_populated = true;
3765 }
3766
3767 bool tcp_alloc_md5sig_pool(void)
3768 {
3769 if (unlikely(!tcp_md5sig_pool_populated)) {
3770 mutex_lock(&tcp_md5sig_mutex);
3771
3772 if (!tcp_md5sig_pool_populated) {
3773 __tcp_alloc_md5sig_pool();
3774 if (tcp_md5sig_pool_populated)
3775 static_branch_inc(&tcp_md5_needed);
3776 }
3777
3778 mutex_unlock(&tcp_md5sig_mutex);
3779 }
3780 return tcp_md5sig_pool_populated;
3781 }
3782 EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
3783
3784
3785 /**
3786  * tcp_get_md5sig_pool - get md5sig_pool for this user
3787  *
3788  * We use a percpu structure, so if we succeed, we exit with preemption
3789  * and BH disabled, to make sure another thread or softirq handling
3790  * will find the pool and proceed with smp_rmb().
3791  */
3792 struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
3793 {
3794 local_bh_disable();
3795
3796 if (tcp_md5sig_pool_populated) {
3797 /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
3798 smp_rmb();
3799 return this_cpu_ptr(&tcp_md5sig_pool);
3800 }
3801 local_bh_enable();
3802 return NULL;
3803 }
3804 EXPORT_SYMBOL(tcp_get_md5sig_pool);
3805
3806 int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3807 const struct sk_buff *skb, unsigned int header_len)
3808 {
3809 struct scatterlist sg;
3810 const struct tcphdr *tp = tcp_hdr(skb);
3811 struct ahash_request *req = hp->md5_req;
3812 unsigned int i;
3813 const unsigned int head_data_len = skb_headlen(skb) > header_len ?
3814 skb_headlen(skb) - header_len : 0;
3815 const struct skb_shared_info *shi = skb_shinfo(skb);
3816 struct sk_buff *frag_iter;
3817
3818 sg_init_table(&sg, 1);
3819
3820 sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
3821 ahash_request_set_crypt(req, &sg, NULL, head_data_len);
3822 if (crypto_ahash_update(req))
3823 return 1;
3824
3825 for (i = 0; i < shi->nr_frags; ++i) {
3826 const skb_frag_t *f = &shi->frags[i];
3827 unsigned int offset = skb_frag_off(f);
3828 struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
3829
3830 sg_set_page(&sg, page, skb_frag_size(f),
3831 offset_in_page(offset));
3832 ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
3833 if (crypto_ahash_update(req))
3834 return 1;
3835 }
3836
3837 skb_walk_frags(skb, frag_iter)
3838 if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
3839 return 1;
3840
3841 return 0;
3842 }
3843 EXPORT_SYMBOL(tcp_md5_hash_skb_data);
3844
3845 int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
3846 {
3847 struct scatterlist sg;
3848
3849 sg_init_one(&sg, key->key, key->keylen);
3850 ahash_request_set_crypt(hp->md5_req, &sg, NULL, key->keylen);
3851 return crypto_ahash_update(hp->md5_req);
3852 }
3853 EXPORT_SYMBOL(tcp_md5_hash_key);
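/* Usage sketch (illustrative, not kernel code): the md5 pool above serves the
 * RFC 2385 TCP MD5 signature option, configured per peer with TCP_MD5SIG
 * (struct tcp_md5sig from <linux/tcp.h>); 192.0.2.1 and "secret" are
 * placeholder values:
 *
 *	struct tcp_md5sig md5 = {};
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	peer->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	md5.tcpm_keylen = strlen("secret");
 *	memcpy(md5.tcpm_key, "secret", md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */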
3854
3855 #endif
3856
3857 void tcp_done(struct sock *sk)
3858 {
3859 struct request_sock *req;
3860
3861
3862 /* We might be called with a new socket, after inet_csk_prepare_forced_close()
3863  * has been called, so we cannot rely on lockdep_sock_is_held(sk) here.
3864  */
3865 req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);
3866
3867 if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3868 TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3869
3870 tcp_set_state(sk, TCP_CLOSE);
3871 tcp_clear_xmit_timers(sk);
3872 if (req)
3873 reqsk_fastopen_remove(sk, req, false);
3874
3875 sk->sk_shutdown = SHUTDOWN_MASK;
3876
3877 if (!sock_flag(sk, SOCK_DEAD))
3878 sk->sk_state_change(sk);
3879 else
3880 inet_csk_destroy_sock(sk);
3881 }
3882 EXPORT_SYMBOL_GPL(tcp_done);
3883
3884 int tcp_abort(struct sock *sk, int err)
3885 {
3886 if (!sk_fullsock(sk)) {
3887 if (sk->sk_state == TCP_NEW_SYN_RECV) {
3888 struct request_sock *req = inet_reqsk(sk);
3889
3890 local_bh_disable();
3891 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
3892 local_bh_enable();
3893 return 0;
3894 }
3895 return -EOPNOTSUPP;
3896 }
3897
3898 /* Don't race with userspace socket closes such as tcp_close. */
3899 lock_sock(sk);
3900
3901 if (sk->sk_state == TCP_LISTEN) {
3902 tcp_set_state(sk, TCP_CLOSE);
3903 inet_csk_listen_stop(sk);
3904 }
3905
3906 /* Don't race with BH socket closes such as inet_csk_listen_stop. */
3907 local_bh_disable();
3908 bh_lock_sock(sk);
3909
3910 if (!sock_flag(sk, SOCK_DEAD)) {
3911 sk->sk_err = err;
3912 /* This barrier is coupled with smp_rmb() in tcp_poll() */
3913 smp_wmb();
3914 sk->sk_error_report(sk);
3915 if (tcp_need_reset(sk->sk_state))
3916 tcp_send_active_reset(sk, GFP_ATOMIC);
3917 tcp_done(sk);
3918 }
3919
3920 bh_unlock_sock(sk);
3921 local_bh_enable();
3922 tcp_write_queue_purge(sk);
3923 release_sock(sk);
3924 return 0;
3925 }
3926 EXPORT_SYMBOL_GPL(tcp_abort);
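/* Note (illustrative): tcp_abort() is not reachable through the regular socket
 * API; it is invoked via the inet_diag SOCK_DESTROY command, i.e. what
 * "ss -K dst 192.0.2.1" uses to forcibly reset matching connections when the
 * kernel is built with CONFIG_INET_DIAG_DESTROY.
 */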
3927
3928 extern struct tcp_congestion_ops tcp_reno;
3929
3930 static __initdata unsigned long thash_entries;
3931 static int __init set_thash_entries(char *str)
3932 {
3933 ssize_t ret;
3934
3935 if (!str)
3936 return 0;
3937
3938 ret = kstrtoul(str, 0, &thash_entries);
3939 if (ret)
3940 return 0;
3941
3942 return 1;
3943 }
3944 __setup("thash_entries=", set_thash_entries);
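/* Note (illustrative): thash_entries is a boot-time parameter; booting with
 * "thash_entries=131072" on the kernel command line fixes the size of the TCP
 * established hash table instead of letting tcp_init() scale it with the
 * amount of available memory.
 */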
3945
3946 static void __init tcp_init_mem(void)
3947 {
3948 unsigned long limit = nr_free_buffer_pages() / 16;
3949
3950 limit = max(limit, 128UL);
3951 sysctl_tcp_mem[0] = limit / 4 * 3;
3952 sysctl_tcp_mem[1] = limit;
3953 sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
3954 }
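/* Worked example (illustrative): with roughly 4 GiB of usable low memory and
 * 4 KiB pages, nr_free_buffer_pages() is about 1048576, so limit = 65536
 * pages and sysctl_tcp_mem becomes approximately { 49152, 65536, 98304 }
 * pages: memory pressure starts around 256 MiB of TCP buffer memory and the
 * hard limit sits around 384 MiB.
 */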
3955
3956 void __init tcp_init(void)
3957 {
3958 int max_rshare, max_wshare, cnt;
3959 unsigned long limit;
3960 unsigned int i;
3961
3962 BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
3963 BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
3964 FIELD_SIZEOF(struct sk_buff, cb));
3965
3966 percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
3967 percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
3968 inet_hashinfo_init(&tcp_hashinfo);
3969 inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
3970 thash_entries, 21,
3971 0, 64 * 1024);
3972 tcp_hashinfo.bind_bucket_cachep =
3973 kmem_cache_create("tcp_bind_bucket",
3974 sizeof(struct inet_bind_bucket), 0,
3975 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3976
3977 /* Size and allocate the main established and bind bucket
3978  * hash tables.
3979  *
3980  * The methodology is similar to that of the buffer cache.
3981  */
3982 tcp_hashinfo.ehash =
3983 alloc_large_system_hash("TCP established",
3984 sizeof(struct inet_ehash_bucket),
3985 thash_entries,
3986 17,
3987 0,
3988 NULL,
3989 &tcp_hashinfo.ehash_mask,
3990 0,
3991 thash_entries ? 0 : 512 * 1024);
3992 for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
3993 INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
3994
3995 if (inet_ehash_locks_alloc(&tcp_hashinfo))
3996 panic("TCP: failed to alloc ehash_locks");
3997 tcp_hashinfo.bhash =
3998 alloc_large_system_hash("TCP bind",
3999 sizeof(struct inet_bind_hashbucket),
4000 tcp_hashinfo.ehash_mask + 1,
4001 17,
4002 0,
4003 &tcp_hashinfo.bhash_size,
4004 NULL,
4005 0,
4006 64 * 1024);
4007 tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
4008 for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
4009 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
4010 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
4011 }
4012
4013
4014 cnt = tcp_hashinfo.ehash_mask + 1;
4015 sysctl_tcp_max_orphans = cnt / 2;
4016
4017 tcp_init_mem();
4018 /* Set per-socket limits to no more than 1/128 the pressure threshold */
4019 limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
4020 max_wshare = min(4UL*1024*1024, limit);
4021 max_rshare = min(6UL*1024*1024, limit);
4022
4023 init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
4024 init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
4025 init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
4026
4027 init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
4028 init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
4029 init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
4030
4031 pr_info("Hash tables configured (established %u bind %u)\n",
4032 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
4033
4034 tcp_v4_init();
4035 tcp_metrics_init();
4036 BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
4037 tcp_tasklet_init();
4038 }